!--------------------------------------------------------------------------------------------------! ! CP2K: A general program to perform molecular dynamics simulations ! ! Copyright (C) 2000 - 2019 CP2K developers group ! !--------------------------------------------------------------------------------------------------! ! ************************************************************************************************** !> \brief Calculate the plane wave density by collocating the primitive Gaussian !> functions (pgf). !> \par History !> - rewrote collocate for increased accuracy and speed !> - introduced the PGI hack for increased speed with that compiler !> (22.02.02) !> - Added Multiple Grid feature !> - new way to get over the grid (01.03.02) !> - removed timing calls since they were getting expensive !> - Updated with the new QS data structures (09.04.02,MK) !> - introduction of the real space grid type ( prelim. version JVdV 05.02) !> - parallel FFT (JGH 22.05.02) !> - multigrid arrays independent from density (JGH 30.08.02) !> - old density stored in g space (JGH 30.08.02) !> - distributed real space code (JGH 17.07.03) !> - refactoring and new loop ordering (JGH 23.11.03) !> - OpenMP parallelization (JGH 03.12.03) !> - Modified to compute tau (Joost 12.03) !> - removed the incremental density rebuild (Joost 01.04) !> - introduced realspace multigridding (Joost 02.04) !> - introduced map_consistent (Joost 02.04) !> - Addition of the subroutine calculate_atomic_charge_density (TdK, 08.05) !> - rewrite of the collocate/integrate kernels (Joost VandeVondele, 03.07) !> \author Matthias Krack (03.04.2001) !> 1) Joost VandeVondele (01.2002) !> Thomas D. Kuehne (04.08.2005) ! ************************************************************************************************** MODULE qs_collocate_density USE ao_util, ONLY: exp_radius_very_extended USE atomic_kind_types, ONLY: atomic_kind_type,& get_atomic_kind,& get_atomic_kind_set USE basis_set_types, ONLY: get_gto_basis_set,& gto_basis_set_type USE cell_types, ONLY: cell_type,& pbc USE cp_control_types, ONLY: dft_control_type USE cp_dbcsr_operations, ONLY: dbcsr_allocate_matrix_set,& dbcsr_deallocate_matrix_set USE cp_fm_types, ONLY: cp_fm_get_element,& cp_fm_get_info,& cp_fm_type USE cube_utils, ONLY: compute_cube_center,& cube_info_type,& return_cube,& return_cube_nonortho USE d3_poly, ONLY: poly_cp2k2d3 USE dbcsr_api, ONLY: dbcsr_copy,& dbcsr_get_block_p,& dbcsr_p_type,& dbcsr_type USE external_potential_types, ONLY: get_potential,& gth_potential_type USE gauss_colloc, ONLY: collocGauss USE gaussian_gridlevels, ONLY: gaussian_gridlevel,& gridlevel_info_type USE kinds, ONLY: default_string_length,& dp,& int_8 USE lgrid_types, ONLY: lgrid_allocate_grid,& lgrid_type USE lri_environment_types, ONLY: lri_kind_type USE mathconstants, ONLY: fac,& pi,& twopi USE memory_utilities, ONLY: reallocate USE orbital_pointers, ONLY: coset,& indco,& ncoset USE particle_types, ONLY: particle_type USE pw_env_types, ONLY: pw_env_get,& pw_env_type USE pw_methods, ONLY: pw_axpy,& pw_integrate_function,& pw_transfer,& pw_zero USE pw_pool_types, ONLY: pw_pool_create_pw,& pw_pool_give_back_pw,& pw_pool_p_type,& pw_pool_type,& pw_pools_create_pws,& pw_pools_give_back_pws USE pw_types, ONLY: COMPLEXDATA1D,& REALDATA3D,& REALSPACE,& RECIPROCALSPACE,& pw_p_type,& pw_type USE qs_environment_types, ONLY: get_qs_env,& qs_environment_type USE qs_kind_types, ONLY: get_qs_kind,& get_qs_kind_set,& qs_kind_type USE qs_ks_types, ONLY: get_ks_env,& qs_ks_env_type USE 
qs_modify_pab_block, ONLY: & FUNC_AB, FUNC_ADBmDAB, FUNC_ARB, FUNC_ARDBmDARB, FUNC_DABpADB, FUNC_DADB, FUNC_DER, & FUNC_DX, FUNC_DXDX, FUNC_DXDY, FUNC_DY, FUNC_DYDY, FUNC_DYDZ, FUNC_DZ, FUNC_DZDX, & FUNC_DZDZ, prepare_adb_m_dab, prepare_arb, prepare_ardb_m_darb, prepare_dab_p_adb, & prepare_dadb, prepare_diadib, prepare_diiadiib, prepare_dijadijb USE qs_neighbor_list_types, ONLY: neighbor_list_set_p_type USE realspace_grid_types, ONLY: realspace_grid_desc_p_type,& realspace_grid_p_type,& realspace_grid_type,& rs2pw,& rs_grid_release,& rs_grid_retain,& rs_grid_zero,& rs_pw_transfer USE rs_pw_interface, ONLY: density_rs2pw,& density_rs2pw_basic USE task_list_methods, ONLY: int2pair,& rs_distribute_matrix USE task_list_types, ONLY: task_list_type !$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads #include "./base/base_uses.f90" IMPLICIT NONE PRIVATE CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'qs_collocate_density' ! *** Public subroutines *** PUBLIC :: calculate_ppl_grid, & calculate_rho_core, & calculate_lri_rho_elec, & calculate_rho_single_gaussian, & calculate_rho_metal, & calculate_rho_resp_single, & calculate_rho_resp_all, & calculate_rho_elec, & calculate_drho_elec, & calculate_wavefunction, & collocate_pgf_product_gspace, & collocate_pgf_product_rspace, & calculate_rho_nlcc INTEGER :: debug_count = 0 CONTAINS ! ************************************************************************************************** !> \brief computes the density of the non-linear core correction on the grid !> \param rho_nlcc ... !> \param qs_env ... ! ************************************************************************************************** SUBROUTINE calculate_rho_nlcc(rho_nlcc, qs_env) TYPE(pw_p_type), INTENT(INOUT) :: rho_nlcc TYPE(qs_environment_type), POINTER :: qs_env CHARACTER(len=*), PARAMETER :: routineN = 'calculate_rho_nlcc', & routineP = moduleN//':'//routineN INTEGER :: atom_a, handle, iatom, iexp_nlcc, ikind, & ithread, j, lmax_global, n, natom, nc, & nexp_nlcc, ni, npme, nthread INTEGER(KIND=int_8) :: subpatch_pattern INTEGER, DIMENSION(:), POINTER :: atom_list, cores, mylmax, nct_nlcc LOGICAL :: nlcc REAL(KIND=dp) :: alpha, eps_rho_rspace REAL(KIND=dp), DIMENSION(3) :: ra REAL(KIND=dp), DIMENSION(:), POINTER :: alpha_nlcc REAL(KIND=dp), DIMENSION(:, :), POINTER :: cval_nlcc, pab TYPE(atomic_kind_type), DIMENSION(:), POINTER :: atomic_kind_set TYPE(cell_type), POINTER :: cell TYPE(cube_info_type) :: cube_info TYPE(dft_control_type), POINTER :: dft_control TYPE(gth_potential_type), POINTER :: gth_potential TYPE(particle_type), DIMENSION(:), POINTER :: particle_set TYPE(pw_env_type), POINTER :: pw_env TYPE(pw_pool_type), POINTER :: auxbas_pw_pool TYPE(qs_kind_type), DIMENSION(:), POINTER :: qs_kind_set TYPE(realspace_grid_type), POINTER :: rs_rho CALL timeset(routineN, handle) NULLIFY (cell, dft_control, pab, particle_set, atomic_kind_set, qs_kind_set, & atom_list, pw_env, rs_rho, auxbas_pw_pool, cores, mylmax) CALL get_qs_env(qs_env=qs_env, & atomic_kind_set=atomic_kind_set, & qs_kind_set=qs_kind_set, & cell=cell, & dft_control=dft_control, & particle_set=particle_set, & pw_env=pw_env) CALL pw_env_get(pw_env, auxbas_rs_grid=rs_rho, & auxbas_pw_pool=auxbas_pw_pool) cube_info = pw_env%cube_info(1) ! be careful in parallel nsmax is choosen with multigrid in mind! 
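      ! Descriptive note (summary of the code below, no change in functionality):
      ! for each NLCC exponent the collocated core density has the radial form
      !   rho_nlcc(r) = SUM_{j=1..nc} c_j * (r/alpha)**(2*(j-1)) * EXP(-r**2/(2*alpha**2)),
      ! with r = |r - R_atom|, assuming the usual EXP(-zeta*r**2) convention of
      ! collocate_pgf_product_rspace and zeta = 1/(2*alpha**2) as passed below.
      ! The polynomial prefactor is expanded into Cartesian monomials in pab
      ! (SELECT CASE (j) below, multinomial weights 1, 2, 3, 6), scaled by 0.5
      ! for spin-polarized calculations, and each atom is collocated with
      ! FUNC_AB after a round-robin distribution of the atoms over the
      ! processor group on replicated grids, followed by the rs2pw transfer.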
CALL rs_grid_retain(rs_rho) CALL rs_grid_zero(rs_rho) eps_rho_rspace = dft_control%qs_control%eps_rho_rspace !Find the maximum la_max over all of the atomic_kind_set !We are using FUNC_AB so no need to cater for different ga_gb_functions lmax_global = 0 DO ikind = 1, SIZE(atomic_kind_set) CALL get_qs_kind(qs_kind_set(ikind), gth_potential=gth_potential) IF (.NOT. ASSOCIATED(gth_potential)) CYCLE CALL get_potential(potential=gth_potential, nct_nlcc=mylmax, nlcc_present=nlcc) IF (.NOT. nlcc) CYCLE lmax_global = MAX(MAXVAL(mylmax), lmax_global) END DO !lmax_global set according to ni = 2*nc-2 below lmax_global = 2*lmax_global - 2 DO ikind = 1, SIZE(atomic_kind_set) CALL get_atomic_kind(atomic_kind_set(ikind), natom=natom, atom_list=atom_list) CALL get_qs_kind(qs_kind_set(ikind), gth_potential=gth_potential) IF (.NOT. ASSOCIATED(gth_potential)) CYCLE CALL get_potential(potential=gth_potential, nlcc_present=nlcc, nexp_nlcc=nexp_nlcc, & alpha_nlcc=alpha_nlcc, nct_nlcc=nct_nlcc, cval_nlcc=cval_nlcc) IF (.NOT. nlcc) CYCLE DO iexp_nlcc = 1, nexp_nlcc alpha = alpha_nlcc(iexp_nlcc) nc = nct_nlcc(iexp_nlcc) ni = ncoset(2*nc - 2) ALLOCATE (pab(ni, 1)) pab = 0._dp nthread = 1 ithread = 0 CALL reallocate(cores, 1, natom) npme = 0 cores = 0 ! prepare core function DO j = 1, nc SELECT CASE (j) CASE (1) pab(1, 1) = cval_nlcc(1, iexp_nlcc) CASE (2) n = coset(2, 0, 0) pab(n, 1) = cval_nlcc(2, iexp_nlcc)/alpha**2 n = coset(0, 2, 0) pab(n, 1) = cval_nlcc(2, iexp_nlcc)/alpha**2 n = coset(0, 0, 2) pab(n, 1) = cval_nlcc(2, iexp_nlcc)/alpha**2 CASE (3) n = coset(4, 0, 0) pab(n, 1) = cval_nlcc(3, iexp_nlcc)/alpha**4 n = coset(0, 4, 0) pab(n, 1) = cval_nlcc(3, iexp_nlcc)/alpha**4 n = coset(0, 0, 4) pab(n, 1) = cval_nlcc(3, iexp_nlcc)/alpha**4 n = coset(2, 2, 0) pab(n, 1) = 2._dp*cval_nlcc(3, iexp_nlcc)/alpha**4 n = coset(2, 0, 2) pab(n, 1) = 2._dp*cval_nlcc(3, iexp_nlcc)/alpha**4 n = coset(0, 2, 2) pab(n, 1) = 2._dp*cval_nlcc(3, iexp_nlcc)/alpha**4 CASE (4) n = coset(6, 0, 0) pab(n, 1) = cval_nlcc(4, iexp_nlcc)/alpha**6 n = coset(0, 6, 0) pab(n, 1) = cval_nlcc(4, iexp_nlcc)/alpha**6 n = coset(0, 0, 6) pab(n, 1) = cval_nlcc(4, iexp_nlcc)/alpha**6 n = coset(4, 2, 0) pab(n, 1) = 3._dp*cval_nlcc(4, iexp_nlcc)/alpha**6 n = coset(4, 0, 2) pab(n, 1) = 3._dp*cval_nlcc(4, iexp_nlcc)/alpha**6 n = coset(2, 4, 0) pab(n, 1) = 3._dp*cval_nlcc(4, iexp_nlcc)/alpha**6 n = coset(2, 0, 4) pab(n, 1) = 3._dp*cval_nlcc(4, iexp_nlcc)/alpha**6 n = coset(0, 4, 2) pab(n, 1) = 3._dp*cval_nlcc(4, iexp_nlcc)/alpha**6 n = coset(0, 2, 4) pab(n, 1) = 3._dp*cval_nlcc(4, iexp_nlcc)/alpha**6 n = coset(2, 2, 2) pab(n, 1) = 6._dp*cval_nlcc(4, iexp_nlcc)/alpha**6 CASE DEFAULT CPABORT("") END SELECT END DO IF (dft_control%nspins == 2) pab = pab*0.5_dp DO iatom = 1, natom atom_a = atom_list(iatom) ra(:) = pbc(particle_set(atom_a)%r, cell) IF (rs_rho%desc%parallel .AND. .NOT. rs_rho%desc%distributed) THEN ! 
replicated realspace grid, split the atoms up between procs IF (MODULO(iatom, rs_rho%desc%group_size) == rs_rho%desc%my_pos) THEN npme = npme + 1 cores(npme) = iatom ENDIF ELSE npme = npme + 1 cores(npme) = iatom ENDIF END DO DO j = 1, npme iatom = cores(j) atom_a = atom_list(iatom) ra(:) = pbc(particle_set(atom_a)%r, cell) subpatch_pattern = 0 ni = 2*nc - 2 CALL collocate_pgf_product_rspace(ni, 1/(2*alpha**2), 0, 0, 0.0_dp, 0, ra, & (/0.0_dp, 0.0_dp, 0.0_dp/), 0.0_dp, 1.0_dp, pab, 0, 0, rs_rho, & cell, cube_info, eps_rho_rspace, ga_gb_function=FUNC_AB, & ithread=ithread, use_subpatch=.TRUE., subpatch_pattern=subpatch_pattern, & lmax_global=lmax_global) END DO DEALLOCATE (pab) END DO END DO IF (ASSOCIATED(cores)) THEN DEALLOCATE (cores) END IF CALL rs_pw_transfer(rs_rho, rho_nlcc%pw, rs2pw) CALL rs_grid_release(rs_rho) CALL timestop(handle) END SUBROUTINE calculate_rho_nlcc ! ************************************************************************************************** !> \brief computes the local pseudopotential (without erf term) on the grid !> \param vppl ... !> \param qs_env ... ! ************************************************************************************************** SUBROUTINE calculate_ppl_grid(vppl, qs_env) TYPE(pw_p_type), INTENT(INOUT) :: vppl TYPE(qs_environment_type), POINTER :: qs_env CHARACTER(len=*), PARAMETER :: routineN = 'calculate_ppl_grid', & routineP = moduleN//':'//routineN INTEGER :: atom_a, handle, iatom, ikind, ithread, & j, lmax_global, lppl, n, natom, ni, & npme, nthread INTEGER(KIND=int_8) :: subpatch_pattern INTEGER, DIMENSION(:), POINTER :: atom_list, cores REAL(KIND=dp) :: alpha, eps_rho_rspace REAL(KIND=dp), DIMENSION(3) :: ra REAL(KIND=dp), DIMENSION(:), POINTER :: cexp_ppl REAL(KIND=dp), DIMENSION(:, :), POINTER :: pab TYPE(atomic_kind_type), DIMENSION(:), POINTER :: atomic_kind_set TYPE(cell_type), POINTER :: cell TYPE(cube_info_type) :: cube_info TYPE(dft_control_type), POINTER :: dft_control TYPE(gth_potential_type), POINTER :: gth_potential TYPE(particle_type), DIMENSION(:), POINTER :: particle_set TYPE(pw_env_type), POINTER :: pw_env TYPE(pw_pool_type), POINTER :: auxbas_pw_pool TYPE(qs_kind_type), DIMENSION(:), POINTER :: qs_kind_set TYPE(realspace_grid_type), POINTER :: rs_rho CALL timeset(routineN, handle) NULLIFY (cell, dft_control, pab, atomic_kind_set, qs_kind_set, particle_set, & atom_list, pw_env, rs_rho, auxbas_pw_pool, cores) CALL get_qs_env(qs_env=qs_env, & atomic_kind_set=atomic_kind_set, & qs_kind_set=qs_kind_set, & cell=cell, & dft_control=dft_control, & particle_set=particle_set, & pw_env=pw_env) CALL pw_env_get(pw_env, auxbas_rs_grid=rs_rho, & auxbas_pw_pool=auxbas_pw_pool) cube_info = pw_env%cube_info(1) ! be careful in parallel nsmax is choosen with multigrid in mind! CALL rs_grid_retain(rs_rho) CALL rs_grid_zero(rs_rho) eps_rho_rspace = dft_control%qs_control%eps_rho_rspace !Find maximum la_max over all of the atomic kind set where la_max=ni & ni=2*lppl-2 !Thus need to find the max of lppl over atomic_kind_set and use to define lmax_global !We are using FUNC_AB so no need to cater for different ga_gb_functions lmax_global = 0 DO ikind = 1, SIZE(atomic_kind_set) CALL get_qs_kind(qs_kind_set(ikind), gth_potential=gth_potential) IF (.NOT. 
ASSOCIATED(gth_potential)) CYCLE CALL get_potential(potential=gth_potential, nexp_ppl=lppl) lmax_global = MAX(lppl, lmax_global) END DO lmax_global = 2*lmax_global - 2 DO ikind = 1, SIZE(atomic_kind_set) CALL get_atomic_kind(atomic_kind_set(ikind), natom=natom, atom_list=atom_list) CALL get_qs_kind(qs_kind_set(ikind), gth_potential=gth_potential) IF (.NOT. ASSOCIATED(gth_potential)) CYCLE CALL get_potential(potential=gth_potential, alpha_ppl=alpha, nexp_ppl=lppl, cexp_ppl=cexp_ppl) IF (lppl <= 0) CYCLE ni = ncoset(2*lppl - 2) ALLOCATE (pab(ni, 1)) pab = 0._dp nthread = 1 ithread = 0 CALL reallocate(cores, 1, natom) npme = 0 cores = 0 ! prepare core function DO j = 1, lppl SELECT CASE (j) CASE (1) pab(1, 1) = cexp_ppl(1) CASE (2) n = coset(2, 0, 0) pab(n, 1) = cexp_ppl(2) n = coset(0, 2, 0) pab(n, 1) = cexp_ppl(2) n = coset(0, 0, 2) pab(n, 1) = cexp_ppl(2) CASE (3) n = coset(4, 0, 0) pab(n, 1) = cexp_ppl(3) n = coset(0, 4, 0) pab(n, 1) = cexp_ppl(3) n = coset(0, 0, 4) pab(n, 1) = cexp_ppl(3) n = coset(2, 2, 0) pab(n, 1) = 2._dp*cexp_ppl(3) n = coset(2, 0, 2) pab(n, 1) = 2._dp*cexp_ppl(3) n = coset(0, 2, 2) pab(n, 1) = 2._dp*cexp_ppl(3) CASE (4) n = coset(6, 0, 0) pab(n, 1) = cexp_ppl(4) n = coset(0, 6, 0) pab(n, 1) = cexp_ppl(4) n = coset(0, 0, 6) pab(n, 1) = cexp_ppl(4) n = coset(4, 2, 0) pab(n, 1) = 3._dp*cexp_ppl(4) n = coset(4, 0, 2) pab(n, 1) = 3._dp*cexp_ppl(4) n = coset(2, 4, 0) pab(n, 1) = 3._dp*cexp_ppl(4) n = coset(2, 0, 4) pab(n, 1) = 3._dp*cexp_ppl(4) n = coset(0, 4, 2) pab(n, 1) = 3._dp*cexp_ppl(4) n = coset(0, 2, 4) pab(n, 1) = 3._dp*cexp_ppl(4) n = coset(2, 2, 2) pab(n, 1) = 6._dp*cexp_ppl(4) CASE DEFAULT CPABORT("") END SELECT END DO DO iatom = 1, natom atom_a = atom_list(iatom) ra(:) = pbc(particle_set(atom_a)%r, cell) IF (rs_rho%desc%parallel .AND. .NOT. rs_rho%desc%distributed) THEN ! replicated realspace grid, split the atoms up between procs IF (MODULO(iatom, rs_rho%desc%group_size) == rs_rho%desc%my_pos) THEN npme = npme + 1 cores(npme) = iatom ENDIF ELSE npme = npme + 1 cores(npme) = iatom ENDIF END DO IF (npme .GT. 0) THEN DO j = 1, npme iatom = cores(j) atom_a = atom_list(iatom) ra(:) = pbc(particle_set(atom_a)%r, cell) subpatch_pattern = 0 ni = 2*lppl - 2 CALL collocate_pgf_product_rspace(ni, alpha, 0, 0, 0.0_dp, 0, ra, & (/0.0_dp, 0.0_dp, 0.0_dp/), 0.0_dp, 1.0_dp, pab, 0, 0, rs_rho, & cell, cube_info, eps_rho_rspace, ga_gb_function=FUNC_AB, & ithread=ithread, use_subpatch=.TRUE., subpatch_pattern=subpatch_pattern, & lmax_global=lmax_global) END DO ENDIF DEALLOCATE (pab) END DO IF (ASSOCIATED(cores)) THEN DEALLOCATE (cores) END IF CALL rs_pw_transfer(rs_rho, vppl%pw, rs2pw) CALL rs_grid_release(rs_rho) CALL timestop(handle) END SUBROUTINE calculate_ppl_grid ! ************************************************************************************************** !> \brief Collocates the fitted lri density on a grid. !> \param lri_rho_g ... !> \param lri_rho_r ... !> \param qs_env ... !> \param lri_coef ... !> \param total_rho ... !> \param basis_type ... !> \param exact_1c_terms ... !> \param pmat replicated block diagonal density matrix (optional) !> \param atomlist list of atoms to be included (optional) !> \par History !> 04.2013 !> \author Dorothea Golze ! 
************************************************************************************************** SUBROUTINE calculate_lri_rho_elec(lri_rho_g, lri_rho_r, qs_env, & lri_coef, total_rho, basis_type, exact_1c_terms, pmat, atomlist) TYPE(pw_p_type), INTENT(INOUT) :: lri_rho_g, lri_rho_r TYPE(qs_environment_type), POINTER :: qs_env TYPE(lri_kind_type), DIMENSION(:), POINTER :: lri_coef REAL(KIND=dp), INTENT(OUT) :: total_rho CHARACTER(len=*), INTENT(IN) :: basis_type LOGICAL, INTENT(IN) :: exact_1c_terms TYPE(dbcsr_type), OPTIONAL :: pmat INTEGER, DIMENSION(:), OPTIONAL :: atomlist CHARACTER(len=*), PARAMETER :: routineN = 'calculate_lri_rho_elec', & routineP = moduleN//':'//routineN INTEGER :: atom_a, dir, group_size, handle, iatom, igrid_level, ikind, ipgf, iset, jpgf, & jset, lmax_global, m1, maxco, maxpgf, maxset, maxsgf, maxsgf_set, my_pos, na1, natom, & nb1, ncoa, ncob, nseta, offset, sgfa, sgfb INTEGER, DIMENSION(3) :: lb, location, tp, ub INTEGER, DIMENSION(:), POINTER :: atom_list, la_max, la_min, mylmax, & npgfa, nsgfa INTEGER, DIMENSION(:, :), POINTER :: first_sgfa LOGICAL :: found, map_consistent LOGICAL, ALLOCATABLE, DIMENSION(:) :: map_it LOGICAL, ALLOCATABLE, DIMENSION(:, :) :: map_it2 REAL(KIND=dp) :: eps_rho_rspace, rab2, zetp REAL(KIND=dp), DIMENSION(3) :: ra, rab REAL(KIND=dp), DIMENSION(:), POINTER :: aci REAL(KIND=dp), DIMENSION(:, :), POINTER :: p_block, pab, sphi_a, work, zeta TYPE(atomic_kind_type), DIMENSION(:), POINTER :: atomic_kind_set TYPE(cell_type), POINTER :: cell TYPE(cube_info_type), DIMENSION(:), POINTER :: cube_info TYPE(dft_control_type), POINTER :: dft_control TYPE(gridlevel_info_type), POINTER :: gridlevel_info TYPE(gto_basis_set_type), POINTER :: lri_basis_set, orb_basis_set TYPE(particle_type), DIMENSION(:), POINTER :: particle_set TYPE(pw_env_type), POINTER :: pw_env TYPE(pw_p_type), DIMENSION(:), POINTER :: mgrid_gspace, mgrid_rspace TYPE(pw_pool_p_type), DIMENSION(:), POINTER :: pw_pools TYPE(qs_kind_type), DIMENSION(:), POINTER :: qs_kind_set TYPE(qs_kind_type), POINTER :: qs_kind TYPE(realspace_grid_p_type), DIMENSION(:), POINTER :: rs_rho TYPE(realspace_grid_type), POINTER :: rs_grid NULLIFY (aci, atomic_kind_set, qs_kind_set, atom_list, cell, cube_info, & dft_control, first_sgfa, gridlevel_info, la_max, la_min, lri_basis_set, & mgrid_gspace, mgrid_rspace, npgfa, nsgfa, pab, & particle_set, pw_env, pw_pools, rs_grid, rs_rho, sphi_a, & work, zeta, mylmax) CALL timeset(routineN, handle) IF (exact_1c_terms) THEN CPASSERT(PRESENT(pmat)) END IF CALL get_qs_env(qs_env=qs_env, qs_kind_set=qs_kind_set, & atomic_kind_set=atomic_kind_set, & cell=cell, particle_set=particle_set, & pw_env=pw_env, & dft_control=dft_control) cube_info => pw_env%cube_info eps_rho_rspace = dft_control%qs_control%eps_rho_rspace map_consistent = dft_control%qs_control%map_consistent gridlevel_info => pw_env%gridlevel_info ! *** set up the pw multi-grids *** ! CPASSERT(ASSOCIATED(pw_env)) CALL pw_env_get(pw_env=pw_env, rs_grids=rs_rho, pw_pools=pw_pools) CALL pw_pools_create_pws(pw_pools, mgrid_rspace, & use_data=REALDATA3D, & in_space=REALSPACE) CALL pw_pools_create_pws(pw_pools, mgrid_gspace, & use_data=COMPLEXDATA1D, & in_space=RECIPROCALSPACE) ! *** set up the rs multi-grids *** ! DO igrid_level = 1, gridlevel_info%ngrid_levels CALL rs_grid_retain(rs_rho(igrid_level)%rs_grid) CALL rs_grid_zero(rs_rho(igrid_level)%rs_grid) END DO !take maxco from the LRI basis set! 
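      ! Descriptive note (summary of the collocation loop below): the fitted LRI
      ! density is a sum of one-center contributions,
      !   rho_lri(r) = SUM_atoms SUM_a acoef(iatom,a) * g_a(r - R_atom),
      ! with g_a the functions of the LRI basis (basis_type). Per set, the
      ! coefficients aci are contracted with lri_basis_set%sphi (DGEMM) into the
      ! Cartesian primitive coefficients pab, and each primitive is collocated
      ! with FUNC_AB and zetb = 0 (one-center case) on the grid level returned
      ! by gaussian_gridlevel for its exponent. Distributed grids map a
      ! primitive only on the rank that owns the atom's grid point; replicated
      ! grids distribute the sets round-robin over the group (offset vs. my_pos).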
CALL get_qs_kind_set(qs_kind_set=qs_kind_set, & maxco=maxco, basis_type=basis_type) ALLOCATE (pab(maxco, 1)) offset = 0 my_pos = mgrid_rspace(1)%pw%pw_grid%para%my_pos group_size = mgrid_rspace(1)%pw%pw_grid%para%group_size !Find lmax_global across atomic_kind_set lmax_global = 0 DO ikind = 1, SIZE(atomic_kind_set) CALL get_qs_kind(qs_kind_set(ikind), basis_set=lri_basis_set, basis_type=basis_type) CALL get_gto_basis_set(gto_basis_set=lri_basis_set, lmax=mylmax) lmax_global = MAX(MAXVAL(mylmax), lmax_global) END DO DO ikind = 1, SIZE(atomic_kind_set) CALL get_atomic_kind(atomic_kind_set(ikind), natom=natom, atom_list=atom_list) CALL get_qs_kind(qs_kind_set(ikind), basis_set=lri_basis_set, basis_type=basis_type) !Take the lri basis set here! CALL get_gto_basis_set(gto_basis_set=lri_basis_set, lmax=la_max, & lmin=la_min, zet=zeta, nset=nseta, npgf=npgfa, & sphi=sphi_a, first_sgf=first_sgfa, nsgf_set=nsgfa) DO iatom = 1, natom atom_a = atom_list(iatom) IF (PRESENT(ATOMLIST)) THEN IF (atomlist(atom_a) == 0) CYCLE END IF ra(:) = pbc(particle_set(atom_a)%r, cell) aci => lri_coef(ikind)%acoef(iatom, :) m1 = MAXVAL(npgfa(1:nseta)) ALLOCATE (map_it(m1)) DO iset = 1, nseta ! collocate this set locally? map_it = .FALSE. DO ipgf = 1, npgfa(iset) igrid_level = gaussian_gridlevel(gridlevel_info, zeta(ipgf, iset)) rs_grid => rs_rho(igrid_level)%rs_grid IF (.NOT. ALL(rs_grid%desc%perd == 1)) THEN DO dir = 1, 3 ! bounds of local grid (i.e. removing the 'wings'), if periodic tp(dir) = FLOOR(DOT_PRODUCT(cell%h_inv(dir, :), ra)*rs_grid%desc%npts(dir)) tp(dir) = MODULO(tp(dir), rs_grid%desc%npts(dir)) IF (rs_grid%desc%perd(dir) .NE. 1) THEN lb(dir) = rs_grid%lb_local(dir) + rs_grid%desc%border ub(dir) = rs_grid%ub_local(dir) - rs_grid%desc%border ELSE lb(dir) = rs_grid%lb_local(dir) ub(dir) = rs_grid%ub_local(dir) ENDIF ! distributed grid, only map if it is local to the grid location(dir) = tp(dir) + rs_grid%desc%lb(dir) ENDDO IF (lb(1) <= location(1) .AND. location(1) <= ub(1) .AND. & lb(2) <= location(2) .AND. location(2) <= ub(2) .AND. & lb(3) <= location(3) .AND. location(3) <= ub(3)) THEN map_it(ipgf) = .TRUE. ENDIF ELSE ! not distributed, just a round-robin distribution over the full set of CPUs IF (MODULO(offset, group_size) == my_pos) map_it(ipgf) = .TRUE. ENDIF END DO offset = offset + 1 IF (ANY(map_it(1:npgfa(iset)))) THEN sgfa = first_sgfa(1, iset) ncoa = npgfa(iset)*ncoset(la_max(iset)) m1 = sgfa + nsgfa(iset) - 1 ALLOCATE (work(nsgfa(iset), 1)) work(1:nsgfa(iset), 1) = aci(sgfa:m1) pab = 0._dp CALL dgemm("N", "N", ncoa, 1, nsgfa(iset), 1.0_dp, lri_basis_set%sphi(1, sgfa), & SIZE(lri_basis_set%sphi, 1), work(1, 1), SIZE(work, 1), 0.0_dp, pab(1, 1), & SIZE(pab, 1)) DO ipgf = 1, npgfa(iset) na1 = (ipgf - 1)*ncoset(la_max(iset)) igrid_level = gaussian_gridlevel(gridlevel_info, zeta(ipgf, iset)) rs_grid => rs_rho(igrid_level)%rs_grid IF (map_it(ipgf)) THEN CALL collocate_pgf_product_rspace(la_max=la_max(iset), & zeta=zeta(ipgf, iset), & la_min=la_min(iset), & lb_max=0, zetb=0.0_dp, lb_min=0, & ra=ra, rab=(/0.0_dp, 0.0_dp, 0.0_dp/), & rab2=0.0_dp, scale=1._dp, & pab=pab, o1=na1, o2=0, & rsgrid=rs_grid, cell=cell, & cube_info=cube_info(igrid_level), & eps_rho_rspace=eps_rho_rspace, & ga_gb_function=FUNC_AB, & map_consistent=map_consistent, & lmax_global=lmax_global) ENDIF END DO DEALLOCATE (work) END IF END DO DEALLOCATE (map_it) END DO END DO DEALLOCATE (pab) ! process the one-center terms IF (exact_1c_terms) THEN ! 
find maximum numbers maxset = 0 maxpgf = 0 lmax_global = 0 offset = 0 DO ikind = 1, SIZE(qs_kind_set) qs_kind => qs_kind_set(ikind) CALL get_qs_kind(qs_kind=qs_kind, & basis_set=orb_basis_set, basis_type="ORB") IF (.NOT. ASSOCIATED(orb_basis_set)) CYCLE CALL get_gto_basis_set(gto_basis_set=orb_basis_set, & npgf=npgfa, nset=nseta, lmax=mylmax) maxset = MAX(nseta, maxset) maxpgf = MAX(MAXVAL(npgfa), maxpgf) lmax_global = MAX(MAXVAL(mylmax), lmax_global) END DO CALL get_qs_kind_set(qs_kind_set=qs_kind_set, & maxco=maxco, & maxsgf=maxsgf, & maxsgf_set=maxsgf_set, & basis_type="ORB") ALLOCATE (pab(maxco, maxco), work(maxco, maxsgf_set)) DO ikind = 1, SIZE(atomic_kind_set) CALL get_atomic_kind(atomic_kind_set(ikind), natom=natom, atom_list=atom_list) CALL get_qs_kind(qs_kind_set(ikind), basis_set=orb_basis_set, basis_type="ORB") CALL get_gto_basis_set(gto_basis_set=orb_basis_set, lmax=la_max, & lmin=la_min, zet=zeta, nset=nseta, npgf=npgfa, & sphi=sphi_a, first_sgf=first_sgfa, nsgf_set=nsgfa) DO iatom = 1, natom atom_a = atom_list(iatom) ra(:) = pbc(particle_set(atom_a)%r, cell) rab(:) = 0.0_dp rab2 = 0.0_dp CALL dbcsr_get_block_p(matrix=pmat, row=atom_a, col=atom_a, BLOCK=p_block, found=found) m1 = MAXVAL(npgfa(1:nseta)) ALLOCATE (map_it2(m1, m1)) DO iset = 1, nseta DO jset = 1, nseta ! processor mappint map_it2 = .FALSE. DO ipgf = 1, npgfa(iset) DO jpgf = 1, npgfa(jset) zetp = zeta(ipgf, iset) + zeta(jpgf, jset) igrid_level = gaussian_gridlevel(gridlevel_info, zetp) rs_grid => rs_rho(igrid_level)%rs_grid IF (.NOT. ALL(rs_grid%desc%perd == 1)) THEN DO dir = 1, 3 ! bounds of local grid (i.e. removing the 'wings'), if periodic tp(dir) = FLOOR(DOT_PRODUCT(cell%h_inv(dir, :), ra)*rs_grid%desc%npts(dir)) tp(dir) = MODULO(tp(dir), rs_grid%desc%npts(dir)) IF (rs_grid%desc%perd(dir) .NE. 1) THEN lb(dir) = rs_grid%lb_local(dir) + rs_grid%desc%border ub(dir) = rs_grid%ub_local(dir) - rs_grid%desc%border ELSE lb(dir) = rs_grid%lb_local(dir) ub(dir) = rs_grid%ub_local(dir) ENDIF ! distributed grid, only map if it is local to the grid location(dir) = tp(dir) + rs_grid%desc%lb(dir) ENDDO IF (lb(1) <= location(1) .AND. location(1) <= ub(1) .AND. & lb(2) <= location(2) .AND. location(2) <= ub(2) .AND. & lb(3) <= location(3) .AND. location(3) <= ub(3)) THEN map_it2(ipgf, jpgf) = .TRUE. ENDIF ELSE ! not distributed, just a round-robin distribution over the full set of CPUs IF (MODULO(offset, group_size) == my_pos) map_it2(ipgf, jpgf) = .TRUE. ENDIF END DO END DO offset = offset + 1 ! IF (ANY(map_it2(1:npgfa(iset), 1:npgfa(jset)))) THEN ncoa = npgfa(iset)*ncoset(la_max(iset)) sgfa = first_sgfa(1, iset) ncob = npgfa(jset)*ncoset(la_max(jset)) sgfb = first_sgfa(1, jset) ! 
decontract density block CALL dgemm("N", "N", ncoa, nsgfa(jset), nsgfa(iset), & 1.0_dp, sphi_a(1, sgfa), SIZE(sphi_a, 1), & p_block(sgfa, sgfb), SIZE(p_block, 1), & 0.0_dp, work(1, 1), maxco) CALL dgemm("N", "T", ncoa, ncob, nsgfa(jset), & 1.0_dp, work(1, 1), maxco, & sphi_a(1, sgfb), SIZE(sphi_a, 1), & 0.0_dp, pab(1, 1), maxco) DO ipgf = 1, npgfa(iset) DO jpgf = 1, npgfa(jset) zetp = zeta(ipgf, iset) + zeta(jpgf, jset) igrid_level = gaussian_gridlevel(gridlevel_info, zetp) rs_grid => rs_rho(igrid_level)%rs_grid na1 = (ipgf - 1)*ncoset(la_max(iset)) nb1 = (jpgf - 1)*ncoset(la_max(jset)) IF (map_it2(ipgf, jpgf)) THEN CALL collocate_pgf_product_rspace( & la_max(iset), zeta(ipgf, iset), la_min(iset), & la_max(jset), zeta(jpgf, jset), la_min(jset), & ra, rab, rab2, 1.0_dp, pab, na1, nb1, & rs_grid, cell, cube_info(igrid_level), & eps_rho_rspace, ga_gb_function=FUNC_AB, & map_consistent=map_consistent, lmax_global=lmax_global) END IF END DO END DO END IF END DO END DO DEALLOCATE (map_it2) ! END DO END DO DEALLOCATE (pab, work) END IF CALL pw_zero(lri_rho_g%pw) CALL pw_zero(lri_rho_r%pw) DO igrid_level = 1, gridlevel_info%ngrid_levels CALL pw_zero(mgrid_rspace(igrid_level)%pw) CALL rs_pw_transfer(rs=rs_rho(igrid_level)%rs_grid, & pw=mgrid_rspace(igrid_level)%pw, dir=rs2pw) CALL rs_grid_release(rs_rho(igrid_level)%rs_grid) END DO DO igrid_level = 1, gridlevel_info%ngrid_levels CALL pw_zero(mgrid_gspace(igrid_level)%pw) CALL pw_transfer(mgrid_rspace(igrid_level)%pw, & mgrid_gspace(igrid_level)%pw) CALL pw_axpy(mgrid_gspace(igrid_level)%pw, lri_rho_g%pw) END DO CALL pw_transfer(lri_rho_g%pw, lri_rho_r%pw) total_rho = pw_integrate_function(lri_rho_r%pw, isign=-1) ! *** give back the multi-grids *** ! CALL pw_pools_give_back_pws(pw_pools, mgrid_gspace) CALL pw_pools_give_back_pws(pw_pools, mgrid_rspace) CALL timestop(handle) END SUBROUTINE calculate_lri_rho_elec ! ************************************************************************************************** !> \brief computes the density of the core charges on the grid !> \param rho_core ... !> \param total_rho ... !> \param qs_env ... !> \param only_nopaw ... ! 
************************************************************************************************** SUBROUTINE calculate_rho_core(rho_core, total_rho, qs_env, only_nopaw) TYPE(pw_p_type), INTENT(INOUT) :: rho_core REAL(KIND=dp), INTENT(OUT) :: total_rho TYPE(qs_environment_type), POINTER :: qs_env LOGICAL, INTENT(IN), OPTIONAL :: only_nopaw CHARACTER(len=*), PARAMETER :: routineN = 'calculate_rho_core', & routineP = moduleN//':'//routineN INTEGER :: atom_a, handle, iatom, ikind, ithread, & j, natom, npme, nthread INTEGER(KIND=int_8) :: subpatch_pattern INTEGER, DIMENSION(:), POINTER :: atom_list, cores LOGICAL :: my_only_nopaw, paw_atom REAL(KIND=dp) :: alpha, eps_rho_rspace REAL(KIND=dp), DIMENSION(3) :: ra REAL(KIND=dp), DIMENSION(:, :), POINTER :: pab TYPE(atomic_kind_type), DIMENSION(:), POINTER :: atomic_kind_set TYPE(cell_type), POINTER :: cell TYPE(cube_info_type) :: cube_info TYPE(dft_control_type), POINTER :: dft_control TYPE(particle_type), DIMENSION(:), POINTER :: particle_set TYPE(pw_env_type), POINTER :: pw_env TYPE(pw_p_type) :: rhoc_r TYPE(pw_pool_type), POINTER :: auxbas_pw_pool TYPE(qs_kind_type), DIMENSION(:), POINTER :: qs_kind_set TYPE(realspace_grid_type), POINTER :: rs_rho CALL timeset(routineN, handle) NULLIFY (cell, dft_control, pab, atomic_kind_set, qs_kind_set, particle_set, & atom_list, pw_env, rs_rho, auxbas_pw_pool, cores) ALLOCATE (pab(1, 1)) my_only_nopaw = .FALSE. IF (PRESENT(only_nopaw)) my_only_nopaw = only_nopaw CALL get_qs_env(qs_env=qs_env, & atomic_kind_set=atomic_kind_set, & qs_kind_set=qs_kind_set, & cell=cell, & dft_control=dft_control, & particle_set=particle_set, & pw_env=pw_env) CALL pw_env_get(pw_env, auxbas_rs_grid=rs_rho, & auxbas_pw_pool=auxbas_pw_pool) cube_info = pw_env%cube_info(1) ! be careful in parallel nsmax is choosen with multigrid in mind! CALL rs_grid_retain(rs_rho) CALL rs_grid_zero(rs_rho) eps_rho_rspace = dft_control%qs_control%eps_rho_rspace DO ikind = 1, SIZE(atomic_kind_set) CALL get_atomic_kind(atomic_kind_set(ikind), natom=natom, atom_list=atom_list) CALL get_qs_kind(qs_kind_set(ikind), paw_atom=paw_atom, & alpha_core_charge=alpha, ccore_charge=pab(1, 1)) IF (my_only_nopaw .AND. paw_atom) CYCLE IF (alpha == 0.0_dp .OR. pab(1, 1) == 0.0_dp) CYCLE nthread = 1 ithread = 0 CALL reallocate(cores, 1, natom) npme = 0 cores = 0 DO iatom = 1, natom IF (rs_rho%desc%parallel .AND. .NOT. rs_rho%desc%distributed) THEN ! replicated realspace grid, split the atoms up between procs IF (MODULO(iatom, rs_rho%desc%group_size) == rs_rho%desc%my_pos) THEN npme = npme + 1 cores(npme) = iatom ENDIF ELSE npme = npme + 1 cores(npme) = iatom ENDIF END DO IF (npme .GT. 
0) THEN DO j = 1, npme iatom = cores(j) atom_a = atom_list(iatom) ra(:) = pbc(particle_set(atom_a)%r, cell) subpatch_pattern = 0 !lmax == 0 so set lmax_global to 0 CALL collocate_pgf_product_rspace(0, alpha, 0, 0, 0.0_dp, 0, ra, & (/0.0_dp, 0.0_dp, 0.0_dp/), 0.0_dp, -1.0_dp, pab, 0, 0, rs_rho, & cell, cube_info, eps_rho_rspace, ga_gb_function=FUNC_AB, & ithread=ithread, use_subpatch=.TRUE., subpatch_pattern=subpatch_pattern, & lmax_global=0) END DO ENDIF END DO IF (ASSOCIATED(cores)) THEN DEALLOCATE (cores) END IF DEALLOCATE (pab) CALL pw_pool_create_pw(auxbas_pw_pool, rhoc_r%pw, & use_data=REALDATA3D, in_space=REALSPACE) CALL rs_pw_transfer(rs_rho, rhoc_r%pw, rs2pw) CALL rs_grid_release(rs_rho) total_rho = pw_integrate_function(rhoc_r%pw, isign=-1) CALL pw_transfer(rhoc_r%pw, rho_core%pw) CALL pw_pool_give_back_pw(auxbas_pw_pool, rhoc_r%pw) CALL timestop(handle) END SUBROUTINE calculate_rho_core ! ************************************************************************************************** !> \brief collocate a single Gaussian on the grid !> \param rho_gb charge density generated by a single gaussian !> \param qs_env qs environment !> \param iatom_in atom index !> \par History !> 12.2011 created !> \author Dorothea Golze ! ************************************************************************************************** SUBROUTINE calculate_rho_single_gaussian(rho_gb, qs_env, iatom_in) TYPE(pw_p_type), INTENT(INOUT) :: rho_gb TYPE(qs_environment_type), POINTER :: qs_env INTEGER, INTENT(IN) :: iatom_in CHARACTER(len=*), PARAMETER :: routineN = 'calculate_rho_single_gaussian', & routineP = moduleN//':'//routineN INTEGER :: atom_a, handle, iatom, npme INTEGER(KIND=int_8) :: subpatch_pattern REAL(KIND=dp) :: eps_rho_rspace REAL(KIND=dp), DIMENSION(3) :: ra REAL(KIND=dp), DIMENSION(:, :), POINTER :: pab TYPE(cell_type), POINTER :: cell TYPE(dft_control_type), POINTER :: dft_control TYPE(pw_env_type), POINTER :: pw_env TYPE(pw_p_type) :: rhoc_r TYPE(pw_pool_type), POINTER :: auxbas_pw_pool TYPE(realspace_grid_type), POINTER :: rs_rho CALL timeset(routineN, handle) NULLIFY (cell, dft_control, pab, pw_env, rs_rho, auxbas_pw_pool) ALLOCATE (pab(1, 1)) CALL get_qs_env(qs_env=qs_env, & cell=cell, & dft_control=dft_control, & pw_env=pw_env) CALL pw_env_get(pw_env, auxbas_rs_grid=rs_rho, & auxbas_pw_pool=auxbas_pw_pool) CALL rs_grid_retain(rs_rho) CALL rs_grid_zero(rs_rho) eps_rho_rspace = dft_control%qs_control%eps_rho_rspace pab(1, 1) = 1.0_dp iatom = iatom_in npme = 0 IF (rs_rho%desc%parallel .AND. .NOT. rs_rho%desc%distributed) THEN IF (MODULO(iatom, rs_rho%desc%group_size) == rs_rho%desc%my_pos) THEN npme = npme + 1 ENDIF ELSE npme = npme + 1 ENDIF IF (npme .GT. 
0) THEN atom_a = qs_env%qmmm_env_qm%image_charge_pot%image_mm_list(iatom) ra(:) = pbc(qs_env%qmmm_env_qm%image_charge_pot%particles_all(atom_a)%r, cell) subpatch_pattern = 0 !lmax == 0 so set lmax_global to 0 CALL collocate_pgf_product_rspace(0, qs_env%qmmm_env_qm%image_charge_pot%eta, & 0, 0, 0.0_dp, 0, ra, (/0.0_dp, 0.0_dp, 0.0_dp/), 0.0_dp, 1.0_dp, pab, 0, 0, rs_rho, & cell, pw_env%cube_info(1), eps_rho_rspace, ga_gb_function=FUNC_AB, & use_subpatch=.TRUE., subpatch_pattern=subpatch_pattern, & lmax_global=0) ENDIF DEALLOCATE (pab) CALL pw_pool_create_pw(auxbas_pw_pool, rhoc_r%pw, & use_data=REALDATA3D, in_space=REALSPACE) CALL rs_pw_transfer(rs_rho, rhoc_r%pw, rs2pw) CALL rs_grid_release(rs_rho) CALL pw_transfer(rhoc_r%pw, rho_gb%pw) CALL pw_pool_give_back_pw(auxbas_pw_pool, rhoc_r%pw) CALL timestop(handle) END SUBROUTINE calculate_rho_single_gaussian ! ************************************************************************************************** !> \brief computes the image charge density on the grid (including coeffcients) !> \param rho_metal image charge density !> \param coeff expansion coefficients of the image charge density, i.e. !> rho_metal=sum_a c_a*g_a !> \param total_rho_metal total induced image charge density !> \param qs_env qs environment !> \par History !> 01.2012 created !> \author Dorothea Golze ! ************************************************************************************************** SUBROUTINE calculate_rho_metal(rho_metal, coeff, total_rho_metal, qs_env) TYPE(pw_p_type), INTENT(INOUT) :: rho_metal REAL(KIND=dp), DIMENSION(:), POINTER :: coeff REAL(KIND=dp), INTENT(OUT), OPTIONAL :: total_rho_metal TYPE(qs_environment_type), POINTER :: qs_env CHARACTER(len=*), PARAMETER :: routineN = 'calculate_rho_metal', & routineP = moduleN//':'//routineN INTEGER :: atom_a, handle, iatom, j, natom, npme INTEGER(KIND=int_8) :: subpatch_pattern INTEGER, DIMENSION(:), POINTER :: cores REAL(KIND=dp) :: eps_rho_rspace REAL(KIND=dp), DIMENSION(3) :: ra REAL(KIND=dp), DIMENSION(:, :), POINTER :: pab TYPE(cell_type), POINTER :: cell TYPE(dft_control_type), POINTER :: dft_control TYPE(pw_env_type), POINTER :: pw_env TYPE(pw_p_type) :: rhoc_r TYPE(pw_pool_type), POINTER :: auxbas_pw_pool TYPE(realspace_grid_type), POINTER :: rs_rho CALL timeset(routineN, handle) NULLIFY (cell, dft_control, pab, pw_env, rs_rho, auxbas_pw_pool, cores) ALLOCATE (pab(1, 1)) CALL get_qs_env(qs_env=qs_env, & cell=cell, & dft_control=dft_control, & pw_env=pw_env) CALL pw_env_get(pw_env, auxbas_rs_grid=rs_rho, & auxbas_pw_pool=auxbas_pw_pool) CALL rs_grid_retain(rs_rho) CALL rs_grid_zero(rs_rho) eps_rho_rspace = dft_control%qs_control%eps_rho_rspace pab(1, 1) = 1.0_dp natom = SIZE(qs_env%qmmm_env_qm%image_charge_pot%image_mm_list) CALL reallocate(cores, 1, natom) npme = 0 cores = 0 DO iatom = 1, natom IF (rs_rho%desc%parallel .AND. .NOT. rs_rho%desc%distributed) THEN IF (MODULO(iatom, rs_rho%desc%group_size) == rs_rho%desc%my_pos) THEN npme = npme + 1 cores(npme) = iatom ENDIF ELSE npme = npme + 1 cores(npme) = iatom ENDIF ENDDO IF (npme .GT. 
0) THEN DO j = 1, npme iatom = cores(j) atom_a = qs_env%qmmm_env_qm%image_charge_pot%image_mm_list(iatom) ra(:) = pbc(qs_env%qmmm_env_qm%image_charge_pot%particles_all(atom_a)%r, cell) subpatch_pattern = 0 !lmax == 0 so set lmax_global to 0 CALL collocate_pgf_product_rspace( & 0, qs_env%qmmm_env_qm%image_charge_pot%eta, & 0, 0, 0.0_dp, 0, ra, (/0.0_dp, 0.0_dp, 0.0_dp/), 0.0_dp, coeff(iatom), pab, 0, 0, rs_rho, & cell, pw_env%cube_info(1), eps_rho_rspace, ga_gb_function=FUNC_AB, & use_subpatch=.TRUE., subpatch_pattern=subpatch_pattern, lmax_global=0) ENDDO ENDIF DEALLOCATE (pab, cores) CALL pw_pool_create_pw(auxbas_pw_pool, rhoc_r%pw, & use_data=REALDATA3D, in_space=REALSPACE) CALL rs_pw_transfer(rs_rho, rhoc_r%pw, rs2pw) CALL rs_grid_release(rs_rho) IF (PRESENT(total_rho_metal)) & !minus sign: account for the fact that rho_metal has opposite sign total_rho_metal = pw_integrate_function(rhoc_r%pw, isign=-1) CALL pw_transfer(rhoc_r%pw, rho_metal%pw) CALL pw_pool_give_back_pw(auxbas_pw_pool, rhoc_r%pw) CALL timestop(handle) END SUBROUTINE calculate_rho_metal ! ************************************************************************************************** !> \brief collocate a single Gaussian on the grid for periodic RESP fitting !> \param rho_gb charge density generated by a single gaussian !> \param qs_env qs environment !> \param eta width of single Gaussian !> \param iatom_in atom index !> \par History !> 06.2012 created !> \author Dorothea Golze ! ************************************************************************************************** SUBROUTINE calculate_rho_resp_single(rho_gb, qs_env, eta, iatom_in) TYPE(pw_p_type), INTENT(INOUT) :: rho_gb TYPE(qs_environment_type), POINTER :: qs_env REAL(KIND=dp), INTENT(IN) :: eta INTEGER, INTENT(IN) :: iatom_in CHARACTER(len=*), PARAMETER :: routineN = 'calculate_rho_resp_single', & routineP = moduleN//':'//routineN INTEGER :: handle, iatom, npme INTEGER(KIND=int_8) :: subpatch_pattern REAL(KIND=dp) :: eps_rho_rspace REAL(KIND=dp), DIMENSION(3) :: ra REAL(KIND=dp), DIMENSION(:, :), POINTER :: pab TYPE(cell_type), POINTER :: cell TYPE(dft_control_type), POINTER :: dft_control TYPE(particle_type), DIMENSION(:), POINTER :: particle_set TYPE(pw_env_type), POINTER :: pw_env TYPE(pw_p_type) :: rhoc_r TYPE(pw_pool_type), POINTER :: auxbas_pw_pool TYPE(realspace_grid_type), POINTER :: rs_rho CALL timeset(routineN, handle) NULLIFY (cell, dft_control, pab, pw_env, rs_rho, auxbas_pw_pool, & particle_set) ALLOCATE (pab(1, 1)) CALL get_qs_env(qs_env=qs_env, & cell=cell, & dft_control=dft_control, & particle_set=particle_set, & pw_env=pw_env) CALL pw_env_get(pw_env, auxbas_rs_grid=rs_rho, & auxbas_pw_pool=auxbas_pw_pool) CALL rs_grid_retain(rs_rho) CALL rs_grid_zero(rs_rho) eps_rho_rspace = dft_control%qs_control%eps_rho_rspace pab(1, 1) = 1.0_dp iatom = iatom_in npme = 0 IF (rs_rho%desc%parallel .AND. .NOT. rs_rho%desc%distributed) THEN IF (MODULO(iatom, rs_rho%desc%group_size) == rs_rho%desc%my_pos) THEN npme = npme + 1 ENDIF ELSE npme = npme + 1 ENDIF IF (npme .GT. 0) THEN ra(:) = pbc(particle_set(iatom)%r, cell) subpatch_pattern = 0 ! 
lmax == 0 so set lmax_global to 0 CALL collocate_pgf_product_rspace(0, eta, 0, 0, 0.0_dp, 0, ra, & (/0.0_dp, 0.0_dp, 0.0_dp/), 0.0_dp, 1.0_dp, pab, 0, 0, rs_rho, & cell, pw_env%cube_info(1), eps_rho_rspace, ga_gb_function=FUNC_AB, & use_subpatch=.TRUE., subpatch_pattern=subpatch_pattern, & lmax_global=0) ENDIF DEALLOCATE (pab) CALL pw_pool_create_pw(auxbas_pw_pool, rhoc_r%pw, & use_data=REALDATA3D, in_space=REALSPACE) CALL rs_pw_transfer(rs_rho, rhoc_r%pw, rs2pw) CALL rs_grid_release(rs_rho) CALL pw_transfer(rhoc_r%pw, rho_gb%pw) CALL pw_pool_give_back_pw(auxbas_pw_pool, rhoc_r%pw) CALL timestop(handle) END SUBROUTINE calculate_rho_resp_single ! ************************************************************************************************** !> \brief computes the RESP charge density on a grid based on the RESP charges !> \param rho_resp RESP charge density !> \param coeff RESP charges, take care of normalization factor !> (eta/pi)**1.5 later !> \param natom number of atoms !> \param eta width of single Gaussian !> \param qs_env qs environment !> \par History !> 01.2012 created !> \author Dorothea Golze ! ************************************************************************************************** SUBROUTINE calculate_rho_resp_all(rho_resp, coeff, natom, eta, qs_env) TYPE(pw_p_type), INTENT(INOUT) :: rho_resp REAL(KIND=dp), DIMENSION(:), POINTER :: coeff INTEGER, INTENT(IN) :: natom REAL(KIND=dp), INTENT(IN) :: eta TYPE(qs_environment_type), POINTER :: qs_env CHARACTER(len=*), PARAMETER :: routineN = 'calculate_rho_resp_all', & routineP = moduleN//':'//routineN INTEGER :: handle, iatom, j, npme INTEGER(KIND=int_8) :: subpatch_pattern INTEGER, DIMENSION(:), POINTER :: cores REAL(KIND=dp) :: eps_rho_rspace REAL(KIND=dp), DIMENSION(3) :: ra REAL(KIND=dp), DIMENSION(:, :), POINTER :: pab TYPE(cell_type), POINTER :: cell TYPE(dft_control_type), POINTER :: dft_control TYPE(particle_type), DIMENSION(:), POINTER :: particle_set TYPE(pw_env_type), POINTER :: pw_env TYPE(pw_p_type) :: rhoc_r TYPE(pw_pool_type), POINTER :: auxbas_pw_pool TYPE(realspace_grid_type), POINTER :: rs_rho CALL timeset(routineN, handle) NULLIFY (cell, cores, dft_control, pab, pw_env, rs_rho, auxbas_pw_pool, & particle_set) ALLOCATE (pab(1, 1)) CALL get_qs_env(qs_env=qs_env, & cell=cell, & dft_control=dft_control, & particle_set=particle_set, & pw_env=pw_env) CALL pw_env_get(pw_env, auxbas_rs_grid=rs_rho, & auxbas_pw_pool=auxbas_pw_pool) CALL rs_grid_retain(rs_rho) CALL rs_grid_zero(rs_rho) eps_rho_rspace = dft_control%qs_control%eps_rho_rspace pab(1, 1) = 1.0_dp CALL reallocate(cores, 1, natom) npme = 0 cores = 0 DO iatom = 1, natom IF (rs_rho%desc%parallel .AND. .NOT. rs_rho%desc%distributed) THEN IF (MODULO(iatom, rs_rho%desc%group_size) == rs_rho%desc%my_pos) THEN npme = npme + 1 cores(npme) = iatom ENDIF ELSE npme = npme + 1 cores(npme) = iatom ENDIF ENDDO IF (npme .GT. 0) THEN DO j = 1, npme iatom = cores(j) ra(:) = pbc(particle_set(iatom)%r, cell) subpatch_pattern = 0 ! 
la_max==0 so set lmax_global to 0 CALL collocate_pgf_product_rspace( & 0, eta, & 0, 0, 0.0_dp, 0, ra, (/0.0_dp, 0.0_dp, 0.0_dp/), 0.0_dp, coeff(iatom), pab, 0, 0, rs_rho, & cell, pw_env%cube_info(1), eps_rho_rspace, ga_gb_function=FUNC_AB, & use_subpatch=.TRUE., subpatch_pattern=subpatch_pattern, lmax_global=0) ENDDO ENDIF DEALLOCATE (pab, cores) CALL pw_pool_create_pw(auxbas_pw_pool, rhoc_r%pw, & use_data=REALDATA3D, in_space=REALSPACE) CALL rs_pw_transfer(rs_rho, rhoc_r%pw, rs2pw) CALL rs_grid_release(rs_rho) CALL pw_transfer(rhoc_r%pw, rho_resp%pw) CALL pw_pool_give_back_pw(auxbas_pw_pool, rhoc_r%pw) CALL timestop(handle) END SUBROUTINE calculate_rho_resp_all ! ************************************************************************************************** !> \brief computes the density corresponding to a given density matrix on the grid !> \param matrix_p ... !> \param matrix_p_kp ... !> \param rho ... !> \param rho_gspace ... !> \param total_rho ... !> \param ks_env ... !> \param soft_valid ... !> \param compute_tau ... !> \param compute_grad ... !> \param basis_type ... !> \param der_type ... !> \param idir ... !> \param task_list_external ... !> \param pw_env_external ... !> \par History !> IAB (15-Feb-2010): Added OpenMP parallelisation to task loop !> (c) The Numerical Algorithms Group (NAG) Ltd, 2010 on behalf of the HECToR project !> \note !> both rho and rho_gspace contain the new rho !> (in real and g-space respectively) ! ************************************************************************************************** SUBROUTINE calculate_rho_elec(matrix_p, matrix_p_kp, rho, rho_gspace, total_rho, & ks_env, soft_valid, compute_tau, compute_grad, & basis_type, der_type, idir, task_list_external, pw_env_external) TYPE(dbcsr_type), OPTIONAL, POINTER :: matrix_p TYPE(dbcsr_p_type), DIMENSION(:), OPTIONAL, & POINTER :: matrix_p_kp TYPE(pw_p_type), INTENT(INOUT) :: rho, rho_gspace REAL(KIND=dp), INTENT(OUT) :: total_rho TYPE(qs_ks_env_type), POINTER :: ks_env LOGICAL, INTENT(IN), OPTIONAL :: soft_valid, compute_tau, compute_grad CHARACTER(LEN=*), INTENT(IN), OPTIONAL :: basis_type INTEGER, INTENT(IN), OPTIONAL :: der_type, idir TYPE(task_list_type), OPTIONAL, POINTER :: task_list_external TYPE(pw_env_type), OPTIONAL, POINTER :: pw_env_external CHARACTER(len=*), PARAMETER :: routineN = 'calculate_rho_elec', & routineP = moduleN//':'//routineN CHARACTER(LEN=default_string_length) :: my_basis_type INTEGER :: bcol, brow, ga_gb_function, handle, iatom, iatom_old, igrid_level, & igrid_level_dummy, ikind, ikind_old, img, img_old, ipair, ipgf, iset, iset_old, itask, & ithread, jatom, jatom_old, jkind, jkind_old, jpgf, jset, jset_old, lb, lbr, lbw, & lmax_global, maxco, maxpgf, maxset, maxsgf, maxsgf_set, my_der_type, my_idir, n, na1, & na2, natoms, nb1, nb2, nblock, ncoa, ncob, nimages, nr, nrlevel, nseta, nsetb, ntasks, & nthread, nw, nxy, nz, nzsize, sgfa, sgfb, ub INTEGER(kind=int_8), DIMENSION(:), POINTER :: atom_pair_recv, atom_pair_send INTEGER(kind=int_8), DIMENSION(:, :), POINTER :: tasks INTEGER, DIMENSION(:), POINTER :: la_max, la_min, lb_max, lb_min, mylmax, & npgfa, npgfb, nsgfa, nsgfb INTEGER, DIMENSION(:, :), POINTER :: first_sgfa, first_sgfb LOGICAL :: atom_pair_changed, distributed_rs_grids, do_kp, found, map_consistent, & my_compute_grad, my_compute_tau, my_soft, use_subpatch REAL(KIND=dp) :: eps_rho_rspace, rab2, scale, zetp REAL(KIND=dp), DIMENSION(3) :: ra, rab, rab_inv, rb REAL(KIND=dp), DIMENSION(:, :), POINTER :: dist_ab, p_block, pab, sphi_a, sphi_b, & work, 
zeta, zetb REAL(KIND=dp), DIMENSION(:, :, :), POINTER :: pabt, workt TYPE(cell_type), POINTER :: cell TYPE(cube_info_type), DIMENSION(:), POINTER :: cube_info TYPE(dbcsr_p_type), DIMENSION(:), POINTER :: deltap TYPE(dft_control_type), POINTER :: dft_control TYPE(gridlevel_info_type), POINTER :: gridlevel_info TYPE(gto_basis_set_type), POINTER :: orb_basis_set TYPE(lgrid_type), POINTER :: lgrid TYPE(particle_type), DIMENSION(:), POINTER :: particle_set TYPE(pw_env_type), POINTER :: pw_env TYPE(qs_kind_type), DIMENSION(:), POINTER :: qs_kind_set TYPE(qs_kind_type), POINTER :: qs_kind TYPE(realspace_grid_desc_p_type), DIMENSION(:), & POINTER :: rs_descs TYPE(realspace_grid_p_type), DIMENSION(:), POINTER :: rs_rho TYPE(task_list_type), POINTER :: task_list, task_list_soft CALL timeset(routineN, handle) CPASSERT(PRESENT(matrix_p) .OR. PRESENT(matrix_p_kp)) do_kp = PRESENT(matrix_p_kp) NULLIFY (qs_kind, cell, dft_control, orb_basis_set, deltap, & qs_kind_set, particle_set, rs_rho, pw_env, rs_descs, & dist_ab, la_max, la_min, & lb_max, lb_min, npgfa, npgfb, nsgfa, nsgfb, p_block, & sphi_a, sphi_b, zeta, zetb, first_sgfa, first_sgfb, tasks, pabt, & workt, lgrid, mylmax) debug_count = debug_count + 1 ! by default, the full density is calculated my_soft = .FALSE. IF (PRESENT(soft_valid)) my_soft = soft_valid ! by default, do not compute the kinetic energy density (tau) ! if compute_tau, all grids referencing to rho are actually tau IF (PRESENT(compute_tau)) THEN my_compute_tau = compute_tau ELSE my_compute_tau = .FALSE. ENDIF IF (PRESENT(compute_grad)) THEN my_compute_grad = compute_grad ELSE my_compute_grad = .FALSE. ENDIF my_der_type = 0 my_idir = 0 IF (PRESENT(der_type)) THEN my_der_type = der_type ga_gb_function = FUNC_DER(my_der_type) ELSE IF (my_compute_tau) THEN ga_gb_function = FUNC_DADB ELSE IF (my_compute_grad) THEN ga_gb_function = FUNC_DABpADB ELSE ga_gb_function = FUNC_AB ENDIF IF (PRESENT(idir)) my_idir = idir IF (PRESENT(basis_type)) THEN my_basis_type = basis_type ELSE my_basis_type = "ORB" END IF CALL get_ks_env(ks_env, & qs_kind_set=qs_kind_set, & cell=cell, & dft_control=dft_control, & particle_set=particle_set, & pw_env=pw_env) SELECT CASE (my_basis_type) CASE ("ORB") CALL get_ks_env(ks_env, & task_list=task_list, & task_list_soft=task_list_soft) CASE ("AUX_FIT") CALL get_ks_env(ks_env, & task_list_aux_fit=task_list, & task_list_soft=task_list_soft) END SELECT nimages = dft_control%nimages CPASSERT(nimages == 1 .OR. do_kp) IF (PRESENT(pw_env_external)) pw_env => pw_env_external IF (PRESENT(task_list_external)) task_list => task_list_external ! *** assign from pw_env gridlevel_info => pw_env%gridlevel_info cube_info => pw_env%cube_info ! *** Allocate work storage *** nthread = 1 !$ nthread = omp_get_max_threads() CALL get_qs_kind_set(qs_kind_set=qs_kind_set, & maxco=maxco, & maxsgf=maxsgf, & maxsgf_set=maxsgf_set, & basis_type=my_basis_type) CALL reallocate(pabt, 1, maxco, 1, maxco, 0, nthread - 1) CALL reallocate(workt, 1, maxco, 1, maxsgf_set, 0, nthread - 1) ! find maximum numbers natoms = SIZE(particle_set) maxset = 0 maxpgf = 0 lmax_global = 0 DO ikind = 1, SIZE(qs_kind_set) qs_kind => qs_kind_set(ikind) CALL get_qs_kind(qs_kind=qs_kind, & softb=my_soft, & basis_set=orb_basis_set, basis_type=my_basis_type) IF (.NOT. ASSOCIATED(orb_basis_set)) CYCLE CALL get_gto_basis_set(gto_basis_set=orb_basis_set, & npgf=npgfa, nset=nseta, lmax=mylmax) maxset = MAX(nseta, maxset) maxpgf = MAX(MAXVAL(npgfa), maxpgf) lmax_global = MAX(MAXVAL(mylmax), lmax_global) END DO ! 
Optionally increment lmax_global depending on which ga_gb_function ! will be used for collocation SELECT CASE (ga_gb_function) CASE (FUNC_DADB) lmax_global = lmax_global + 1 CASE (FUNC_ADBmDAB) lmax_global = lmax_global + 1 CASE (FUNC_DABpADB) lmax_global = lmax_global + 1 CASE (FUNC_DX, FUNC_DY, FUNC_DZ) lmax_global = lmax_global + 1 CASE (FUNC_DXDY, FUNC_DYDZ, FUNC_DZDX) lmax_global = lmax_global + 2 CASE (FUNC_DXDX, FUNC_DYDY, FUNC_DZDZ) lmax_global = lmax_global + 2 CASE (FUNC_ARDBmDARB) lmax_global = lmax_global + 2 CASE (FUNC_ARB) lmax_global = lmax_global + 1 CASE (FUNC_AB) lmax_global = lmax_global CASE DEFAULT CPABORT("") END SELECT ! get the task lists IF (my_soft) task_list => task_list_soft CPASSERT(ASSOCIATED(task_list)) tasks => task_list%tasks dist_ab => task_list%dist_ab atom_pair_send => task_list%atom_pair_send atom_pair_recv => task_list%atom_pair_recv ntasks = task_list%ntasks ! *** set up the pw multi-grids CPASSERT(ASSOCIATED(pw_env)) CALL pw_env_get(pw_env, rs_descs=rs_descs, rs_grids=rs_rho, lgrid=lgrid) ! *** set up the rs multi-grids CPASSERT(ASSOCIATED(rs_rho)) CPASSERT(ASSOCIATED(rs_descs)) CPASSERT(ASSOCIATED(lgrid)) distributed_rs_grids = .FALSE. DO igrid_level = 1, gridlevel_info%ngrid_levels CALL rs_grid_retain(rs_rho(igrid_level)%rs_grid) CALL rs_grid_zero(rs_rho(igrid_level)%rs_grid) IF (rs_rho(igrid_level)%rs_grid%desc%distributed) THEN distributed_rs_grids = .TRUE. ENDIF END DO IF (nthread > 1) THEN IF (.NOT. ASSOCIATED(lgrid%r)) THEN CALL lgrid_allocate_grid(lgrid, nthread) ENDIF END IF eps_rho_rspace = dft_control%qs_control%eps_rho_rspace map_consistent = dft_control%qs_control%map_consistent ! *** Initialize working density matrix *** ! distributed rs grids require a matrix that will be changed ! whereas this is not the case for replicated grids NULLIFY (deltap) CALL dbcsr_allocate_matrix_set(deltap, nimages) IF (distributed_rs_grids) THEN DO img = 1, nimages ALLOCATE (deltap(img)%matrix) END DO ! this matrix has no strict sparsity pattern in parallel ! deltap%sparsity_id=-1 IF (do_kp) THEN DO img = 1, nimages CALL dbcsr_copy(deltap(img)%matrix, matrix_p_kp(img)%matrix, & name="DeltaP") END DO ELSE CALL dbcsr_copy(deltap(1)%matrix, matrix_p, name="DeltaP") END IF ELSE IF (do_kp) THEN DO img = 1, nimages deltap(img)%matrix => matrix_p_kp(img)%matrix END DO ELSE deltap(1)%matrix => matrix_p END IF ENDIF ! distribute the matrix IF (distributed_rs_grids) THEN CALL rs_distribute_matrix(rs_descs, deltap, atom_pair_send, atom_pair_recv, & natoms, nimages, scatter=.TRUE.) ENDIF ! 
map all tasks on the grids !$OMP PARALLEL DEFAULT(NONE), & !$OMP SHARED(ntasks,tasks,nimages,natoms,maxset,maxpgf,particle_set,pabt,workt), & !$OMP SHARED(my_basis_type,my_soft,deltap,maxco,dist_ab,ncoset,nthread), & !$OMP SHARED(cell,cube_info,eps_rho_rspace,ga_gb_function, my_idir,map_consistent), & !$OMP SHARED(rs_rho,lgrid,gridlevel_info,task_list,qs_kind_set,lmax_global), & !$OMP PRIVATE(igrid_level,iatom,jatom,iset,jset,ipgf,jpgf,ikind,jkind,pab,work), & !$OMP PRIVATE(img,img_old,iatom_old,jatom_old,iset_old,jset_old,ikind_old,jkind_old), & !$OMP PRIVATE(brow,bcol,qs_kind,orb_basis_set,first_sgfa,la_max,la_min), & !$OMP PRIVATE(npgfa,nseta,nsgfa,sphi_a,zeta,first_sgfb,lb_max,lb_min,npgfb), & !$OMP PRIVATE(nsetb,nsgfb,sphi_b,zetb,p_block,found), & !$OMP PRIVATE(atom_pair_changed,ncoa,sgfa,ncob,sgfb,rab,rab2,ra,rb,zetp), & !$OMP PRIVATE(na1,na2,nb1,nb2,scale,use_subpatch,rab_inv,ithread,lb,ub,n,nw), & !$OMP PRIVATE(itask,nz,nxy,nzsize,nrlevel,nblock,lbw,lbr,nr,igrid_level_dummy) ithread = 0 !$ ithread = omp_get_thread_num() pab => pabt(:, :, ithread) work => workt(:, :, ithread) iatom_old = -1; jatom_old = -1; iset_old = -1; jset_old = -1 ikind_old = -1; jkind_old = -1; img_old = -1 ! Loop over each gridlevel first, then loop and load balance over atom pairs ! We only need a single lgrid, which we sum back onto the rsgrid after each ! grid level is completed loop_gridlevels: DO igrid_level = 1, gridlevel_info%ngrid_levels ! Only zero the region of the lgrid required for this grid level IF (nthread > 1) THEN lgrid%r(1:rs_rho(igrid_level)%rs_grid%ngpts_local, ithread) = 0._dp END IF !$OMP DO schedule(dynamic, MAX(1,task_list%npairs(igrid_level)/(nthread*50))) loop_pairs: DO ipair = 1, task_list%npairs(igrid_level) loop_tasks: DO itask = task_list%taskstart(ipair, igrid_level), task_list%taskstop(ipair, igrid_level) !decode the atom pair and basis info (igrid_level_dummy equals do loop variable by construction). CALL int2pair(tasks(3, itask), igrid_level_dummy, img, iatom, jatom, iset, jset, ipgf, jpgf, & nimages, natoms, maxset, maxpgf) ikind = particle_set(iatom)%atomic_kind%kind_number jkind = particle_set(jatom)%atomic_kind%kind_number IF (iatom .NE. iatom_old .OR. jatom .NE. jatom_old .OR. img .NE. img_old) THEN IF (iatom .NE. iatom_old) ra(:) = pbc(particle_set(iatom)%r, cell) IF (iatom <= jatom) THEN brow = iatom bcol = jatom ELSE brow = jatom bcol = iatom END IF IF (ikind .NE. ikind_old) THEN CALL get_qs_kind(qs_kind_set(ikind), softb=my_soft, & basis_set=orb_basis_set, basis_type=my_basis_type) CALL get_gto_basis_set(gto_basis_set=orb_basis_set, & first_sgf=first_sgfa, & lmax=la_max, & lmin=la_min, & npgf=npgfa, & nset=nseta, & nsgf_set=nsgfa, & sphi=sphi_a, & zet=zeta) ENDIF IF (jkind .NE. jkind_old) THEN CALL get_qs_kind(qs_kind_set(jkind), softb=my_soft, & basis_set=orb_basis_set, basis_type=my_basis_type) CALL get_gto_basis_set(gto_basis_set=orb_basis_set, & first_sgf=first_sgfb, & lmax=lb_max, & lmin=lb_min, & npgf=npgfb, & nset=nsetb, & nsgf_set=nsgfb, & sphi=sphi_b, & zet=zetb) ENDIF CALL dbcsr_get_block_p(matrix=deltap(img)%matrix, & row=brow, col=bcol, BLOCK=p_block, found=found) CPASSERT(found) iatom_old = iatom jatom_old = jatom ikind_old = ikind jkind_old = jkind img_old = img atom_pair_changed = .TRUE. ELSE atom_pair_changed = .FALSE. ENDIF IF (atom_pair_changed .OR. iset_old .NE. iset .OR. jset_old .NE. 
jset) THEN ncoa = npgfa(iset)*ncoset(la_max(iset)) sgfa = first_sgfa(1, iset) ncob = npgfb(jset)*ncoset(lb_max(jset)) sgfb = first_sgfb(1, jset) IF (iatom <= jatom) THEN CALL dgemm("N", "N", ncoa, nsgfb(jset), nsgfa(iset), & 1.0_dp, sphi_a(1, sgfa), SIZE(sphi_a, 1), & p_block(sgfa, sgfb), SIZE(p_block, 1), & 0.0_dp, work(1, 1), maxco) CALL dgemm("N", "T", ncoa, ncob, nsgfb(jset), & 1.0_dp, work(1, 1), maxco, & sphi_b(1, sgfb), SIZE(sphi_b, 1), & 0.0_dp, pab(1, 1), maxco) ELSE CALL dgemm("N", "N", ncob, nsgfa(iset), nsgfb(jset), & 1.0_dp, sphi_b(1, sgfb), SIZE(sphi_b, 1), & p_block(sgfb, sgfa), SIZE(p_block, 1), & 0.0_dp, work(1, 1), maxco) CALL dgemm("N", "T", ncob, ncoa, nsgfa(iset), & 1.0_dp, work(1, 1), maxco, & sphi_a(1, sgfa), SIZE(sphi_a, 1), & 0.0_dp, pab(1, 1), maxco) END IF iset_old = iset jset_old = jset ENDIF rab(:) = dist_ab(:, itask) rab2 = rab(1)*rab(1) + rab(2)*rab(2) + rab(3)*rab(3) rb(:) = ra(:) + rab(:) zetp = zeta(ipgf, iset) + zetb(jpgf, jset) na1 = (ipgf - 1)*ncoset(la_max(iset)) + 1 na2 = ipgf*ncoset(la_max(iset)) nb1 = (jpgf - 1)*ncoset(lb_max(jset)) + 1 nb2 = jpgf*ncoset(lb_max(jset)) ! takes the density matrix symmetry in account, i.e. off-diagonal blocks need to be mapped 'twice' IF (iatom == jatom) THEN scale = 1.0_dp ELSE scale = 2.0_dp END IF ! check whether we need to use fawzi's generalised collocation scheme IF (rs_rho(igrid_level)%rs_grid%desc%distributed) THEN !tasks(4,:) is 0 for replicated, 1 for distributed 2 for exceptional distributed tasks IF (tasks(4, itask) .EQ. 2) THEN use_subpatch = .TRUE. ELSE use_subpatch = .FALSE. ENDIF ELSE use_subpatch = .FALSE. ENDIF IF (nthread > 1) THEN IF (iatom <= jatom) THEN CALL collocate_pgf_product_rspace( & la_max(iset), zeta(ipgf, iset), la_min(iset), & lb_max(jset), zetb(jpgf, jset), lb_min(jset), & ra, rab, rab2, scale, pab, na1 - 1, nb1 - 1, & rs_rho(igrid_level)%rs_grid, cell, cube_info(igrid_level), & eps_rho_rspace, & ga_gb_function=ga_gb_function, & idir=my_idir, & lgrid=lgrid, ithread=ithread, & map_consistent=map_consistent, use_subpatch=use_subpatch, & subpatch_pattern=tasks(6, itask), lmax_global=lmax_global) ELSE rab_inv = -rab CALL collocate_pgf_product_rspace( & lb_max(jset), zetb(jpgf, jset), lb_min(jset), & la_max(iset), zeta(ipgf, iset), la_min(iset), & rb, rab_inv, rab2, scale, pab, nb1 - 1, na1 - 1, & rs_rho(igrid_level)%rs_grid, cell, cube_info(igrid_level), & eps_rho_rspace, & ga_gb_function=ga_gb_function, & idir=my_idir, & lgrid=lgrid, ithread=ithread, & map_consistent=map_consistent, use_subpatch=use_subpatch, & subpatch_pattern=tasks(6, itask), lmax_global=lmax_global) END IF ELSE IF (iatom <= jatom) THEN CALL collocate_pgf_product_rspace( & la_max(iset), zeta(ipgf, iset), la_min(iset), & lb_max(jset), zetb(jpgf, jset), lb_min(jset), & ra, rab, rab2, scale, pab, na1 - 1, nb1 - 1, & rs_rho(igrid_level)%rs_grid, cell, cube_info(igrid_level), & eps_rho_rspace, & ga_gb_function=ga_gb_function, & idir=my_idir, & map_consistent=map_consistent, use_subpatch=use_subpatch, & subpatch_pattern=tasks(6, itask), lmax_global=lmax_global) ELSE rab_inv = -rab CALL collocate_pgf_product_rspace( & lb_max(jset), zetb(jpgf, jset), lb_min(jset), & la_max(iset), zeta(ipgf, iset), la_min(iset), & rb, rab_inv, rab2, scale, pab, nb1 - 1, na1 - 1, & rs_rho(igrid_level)%rs_grid, cell, cube_info(igrid_level), & eps_rho_rspace, & ga_gb_function=ga_gb_function, & idir=my_idir, & map_consistent=map_consistent, use_subpatch=use_subpatch, & subpatch_pattern=tasks(6, itask), lmax_global=lmax_global) END IF END IF END DO 
loop_tasks END DO loop_pairs !$OMP END DO ! Now sum the thread-local grids back into the rs_grid ! (in parallel, each thread writes to a section of the rs_grid at a time) IF (nthread > 1) THEN nz = (1 + rs_rho(igrid_level)%rs_grid%ub_local(3) & - rs_rho(igrid_level)%rs_grid%lb_local(3)) nxy = (1 + rs_rho(igrid_level)%rs_grid%ub_local(1) & - rs_rho(igrid_level)%rs_grid%lb_local(1)) & *(1 + rs_rho(igrid_level)%rs_grid%ub_local(2) & - rs_rho(igrid_level)%rs_grid%lb_local(2)) ! Work out the number of tree levels, and start the reduction nrlevel = CEILING(LOG10(1.0_dp*nthread)/LOG10(2.0_dp)) DO nr = 1, nrlevel nblock = 2**nr nzsize = MIN(nblock, nthread - (ithread/nblock)*nblock) itask = MODULO(ithread, nblock) lb = (nz*itask)/nzsize ub = (nz*(itask + 1))/nzsize - 1 nw = (ithread/nblock)*nblock lbw = 1 + nxy*lb n = nw + nblock/2 IF (n < nthread) THEN lbr = 1 + nxy*lb CALL daxpy(nxy*(1 + ub - lb), 1.0_dp, lgrid%r(lbr, n), 1, lgrid%r(lbw, nw), 1) END IF !$OMP BARRIER END DO ! Copy final result from first local grid to rs_grid lb = (nz*ithread)/nthread ub = (nz*(ithread + 1))/nthread - 1 nzsize = 1 + (ub - lb) lb = lb + rs_rho(igrid_level)%rs_grid%lb_local(3) ub = ub + rs_rho(igrid_level)%rs_grid%lb_local(3) lbr = 1 + nxy*(nz*ithread/nthread) CALL daxpy(nxy*nzsize, 1.0_dp, lgrid%r(lbr, 0), 1, & rs_rho(igrid_level)%rs_grid%r(:, :, lb:ub), 1) !$OMP BARRIER END IF END DO loop_gridlevels !$OMP END PARALLEL ! *** Release work storage *** IF (distributed_rs_grids) THEN CALL dbcsr_deallocate_matrix_set(deltap) ELSE DO img = 1, nimages NULLIFY (deltap(img)%matrix) END DO DEALLOCATE (deltap) ENDIF DEALLOCATE (pabt, workt) CALL density_rs2pw(pw_env, rs_rho, rho, rho_gspace) total_rho = pw_integrate_function(rho%pw, isign=-1) CALL timestop(handle) END SUBROUTINE calculate_rho_elec ! ************************************************************************************************** !> \brief computes the gradient of the density corresponding to a given !> density matrix on the grid !> \param matrix_p ... !> \param matrix_p_kp ... !> \param drho ... !> \param drho_gspace ... !> \param qs_env ... !> \param soft_valid ... !> \param basis_type ... !> \note this is an alternative to calculate the gradient through FFTs ! 
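!>       instead of differentiating the collocated density in g-space, each Cartesian
!>       component idir = 1, 2, 3 is obtained by collocating the product-rule combination
!>       d(g_a g_b)/dx_idir = (d g_a/dx_idir) g_b + g_a (d g_b/dx_idir)
!>       (ga_gb_function = FUNC_DABpADB), followed by one rs->pw transfer per component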
************************************************************************************************** SUBROUTINE calculate_drho_elec(matrix_p, matrix_p_kp, drho, drho_gspace, qs_env, & soft_valid, basis_type) TYPE(dbcsr_type), OPTIONAL, POINTER :: matrix_p TYPE(dbcsr_p_type), DIMENSION(:), OPTIONAL, & POINTER :: matrix_p_kp TYPE(pw_p_type), DIMENSION(:), INTENT(INOUT) :: drho, drho_gspace TYPE(qs_environment_type), POINTER :: qs_env LOGICAL, INTENT(IN), OPTIONAL :: soft_valid CHARACTER(LEN=*), INTENT(IN), OPTIONAL :: basis_type CHARACTER(len=*), PARAMETER :: routineN = 'calculate_drho_elec', & routineP = moduleN//':'//routineN CHARACTER(LEN=default_string_length) :: my_basis_type INTEGER :: bcol, brow, handle, i, iatom, iatom_old, idir, igrid_level, ikind, ikind_old, & img, img_old, ipgf, iset, iset_old, itask, ithread, jatom, jatom_old, jkind, jkind_old, & jpgf, jset, jset_old, lmax_global, maxco, maxpgf, maxset, maxsgf, maxsgf_set, na1, na2, & natoms, nb1, nb2, ncoa, ncob, nimages, nseta, nsetb, ntasks, nthread, sgfa, sgfb INTEGER(kind=int_8), DIMENSION(:), POINTER :: atom_pair_recv, atom_pair_send INTEGER(kind=int_8), DIMENSION(:, :), POINTER :: tasks INTEGER, DIMENSION(:), POINTER :: la_max, la_min, lb_max, lb_min, mylmax, & npgfa, npgfb, nsgfa, nsgfb INTEGER, DIMENSION(:, :), POINTER :: first_sgfa, first_sgfb LOGICAL :: atom_pair_changed, distributed_rs_grids, & do_kp, found, map_consistent, my_soft, & use_subpatch REAL(KIND=dp) :: eps_rho_rspace, rab2, scale, zetp REAL(KIND=dp), DIMENSION(3) :: ra, rab, rab_inv, rb REAL(KIND=dp), DIMENSION(:, :), POINTER :: dist_ab, p_block, pab, sphi_a, sphi_b, & work, zeta, zetb REAL(KIND=dp), DIMENSION(:, :, :), POINTER :: pabt, workt TYPE(cell_type), POINTER :: cell TYPE(cube_info_type), DIMENSION(:), POINTER :: cube_info TYPE(dbcsr_p_type), DIMENSION(:), POINTER :: deltap TYPE(dft_control_type), POINTER :: dft_control TYPE(gridlevel_info_type), POINTER :: gridlevel_info TYPE(gto_basis_set_type), POINTER :: orb_basis_set TYPE(neighbor_list_set_p_type), DIMENSION(:), & POINTER :: sab_orb TYPE(particle_type), DIMENSION(:), POINTER :: particle_set TYPE(pw_env_type), POINTER :: pw_env TYPE(qs_kind_type), DIMENSION(:), POINTER :: qs_kind_set TYPE(qs_kind_type), POINTER :: qs_kind TYPE(realspace_grid_desc_p_type), DIMENSION(:), & POINTER :: rs_descs TYPE(realspace_grid_p_type), DIMENSION(:), POINTER :: rs_rho TYPE(task_list_type), POINTER :: task_list, task_list_soft CALL timeset(routineN, handle) CPASSERT(PRESENT(matrix_p) .OR. PRESENT(matrix_p_kp)) do_kp = PRESENT(matrix_p_kp) NULLIFY (qs_kind, cell, dft_control, orb_basis_set, deltap, & qs_kind_set, sab_orb, particle_set, rs_rho, pw_env, rs_descs, & dist_ab, la_max, la_min, & lb_max, lb_min, npgfa, npgfb, nsgfa, nsgfb, p_block, & sphi_a, sphi_b, zeta, zetb, first_sgfa, first_sgfb, tasks, pabt, & workt, mylmax) debug_count = debug_count + 1 ! by default, the full density is calculated my_soft = .FALSE. IF (PRESENT(soft_valid)) my_soft = soft_valid IF (PRESENT(basis_type)) THEN my_basis_type = basis_type ELSE my_basis_type = "ORB" END IF CALL get_qs_env(qs_env=qs_env, & qs_kind_set=qs_kind_set, & cell=cell, & dft_control=dft_control, & particle_set=particle_set, & sab_orb=sab_orb, & pw_env=pw_env) SELECT CASE (my_basis_type) CASE ("ORB") CALL get_qs_env(qs_env=qs_env, & task_list=task_list, & task_list_soft=task_list_soft) CASE ("AUX_FIT") CALL get_qs_env(qs_env=qs_env, & task_list_aux_fit=task_list, & task_list_soft=task_list_soft) END SELECT ! 
*** assign from pw_env gridlevel_info => pw_env%gridlevel_info cube_info => pw_env%cube_info ! *** Allocate work storage *** nthread = 1 CALL get_qs_kind_set(qs_kind_set=qs_kind_set, & maxco=maxco, & maxsgf=maxsgf, & maxsgf_set=maxsgf_set, & basis_type=my_basis_type) CALL reallocate(pabt, 1, maxco, 1, maxco, 0, nthread - 1) CALL reallocate(workt, 1, maxco, 1, maxsgf_set, 0, nthread - 1) ! find maximum numbers nimages = dft_control%nimages CPASSERT(nimages == 1 .OR. do_kp) natoms = SIZE(particle_set) maxset = 0 maxpgf = 0 lmax_global = 0 DO ikind = 1, SIZE(qs_kind_set) qs_kind => qs_kind_set(ikind) CALL get_qs_kind(qs_kind=qs_kind, & softb=my_soft, & basis_set=orb_basis_set, basis_type=my_basis_type) IF (.NOT. ASSOCIATED(orb_basis_set)) CYCLE CALL get_gto_basis_set(gto_basis_set=orb_basis_set, & npgf=npgfa, nset=nseta, lmax=mylmax) maxset = MAX(nseta, maxset) maxpgf = MAX(MAXVAL(npgfa), maxpgf) lmax_global = MAX(MAXVAL(mylmax), lmax_global) END DO !update lmax_global since ga_gb_function=FUNC_DABpADB lmax_global = lmax_global + 1 ! get the task lists IF (my_soft) task_list => task_list_soft CPASSERT(ASSOCIATED(task_list)) tasks => task_list%tasks dist_ab => task_list%dist_ab atom_pair_send => task_list%atom_pair_send atom_pair_recv => task_list%atom_pair_recv ntasks = task_list%ntasks ! *** set up the rs multi-grids CPASSERT(ASSOCIATED(pw_env)) CALL pw_env_get(pw_env, rs_descs=rs_descs, rs_grids=rs_rho) DO igrid_level = 1, gridlevel_info%ngrid_levels CALL rs_grid_retain(rs_rho(igrid_level)%rs_grid) distributed_rs_grids = rs_rho(igrid_level)%rs_grid%desc%distributed END DO eps_rho_rspace = dft_control%qs_control%eps_rho_rspace map_consistent = dft_control%qs_control%map_consistent ! *** Initialize working density matrix *** ! distributed rs grids require a matrix that will be changed ! whereas this is not the case for replicated grids ALLOCATE (deltap(nimages)) IF (distributed_rs_grids) THEN DO img = 1, nimages END DO ! this matrix has no strict sparsity pattern in parallel ! deltap%sparsity_id=-1 IF (do_kp) THEN DO img = 1, nimages CALL dbcsr_copy(deltap(img)%matrix, matrix_p_kp(img)%matrix, & name="DeltaP") END DO ELSE CALL dbcsr_copy(deltap(1)%matrix, matrix_p, name="DeltaP") END IF ELSE IF (do_kp) THEN DO img = 1, nimages deltap(img)%matrix => matrix_p_kp(img)%matrix END DO ELSE deltap(1)%matrix => matrix_p END IF ENDIF ! distribute the matrix IF (distributed_rs_grids) THEN CALL rs_distribute_matrix(rs_descs, deltap, atom_pair_send, atom_pair_recv, & natoms, nimages, scatter=.TRUE.) ENDIF ! map all tasks on the grids ithread = 0 pab => pabt(:, :, ithread) work => workt(:, :, ithread) loop_xyz: DO idir = 1, 3 DO igrid_level = 1, gridlevel_info%ngrid_levels CALL rs_grid_zero(rs_rho(igrid_level)%rs_grid) END DO iatom_old = -1; jatom_old = -1; iset_old = -1; jset_old = -1 ikind_old = -1; jkind_old = -1; img_old = -1 loop_tasks: DO itask = 1, ntasks !decode the atom pair and basis info CALL int2pair(tasks(3, itask), igrid_level, img, iatom, jatom, iset, jset, ipgf, jpgf, & nimages, natoms, maxset, maxpgf) ikind = particle_set(iatom)%atomic_kind%kind_number jkind = particle_set(jatom)%atomic_kind%kind_number IF (iatom .NE. iatom_old .OR. jatom .NE. jatom_old .OR. img .NE. img_old) THEN IF (iatom .NE. iatom_old) ra(:) = pbc(particle_set(iatom)%r, cell) IF (iatom <= jatom) THEN brow = iatom bcol = jatom ELSE brow = jatom bcol = iatom END IF IF (ikind .NE. 
ikind_old) THEN CALL get_qs_kind(qs_kind_set(ikind), softb=my_soft, & basis_set=orb_basis_set, basis_type=my_basis_type) CALL get_gto_basis_set(gto_basis_set=orb_basis_set, & first_sgf=first_sgfa, & lmax=la_max, & lmin=la_min, & npgf=npgfa, & nset=nseta, & nsgf_set=nsgfa, & sphi=sphi_a, & zet=zeta) ENDIF IF (jkind .NE. jkind_old) THEN CALL get_qs_kind(qs_kind_set(jkind), softb=my_soft, & basis_set=orb_basis_set, basis_type=my_basis_type) CALL get_gto_basis_set(gto_basis_set=orb_basis_set, & first_sgf=first_sgfb, & lmax=lb_max, & lmin=lb_min, & npgf=npgfb, & nset=nsetb, & nsgf_set=nsgfb, & sphi=sphi_b, & zet=zetb) ENDIF CALL dbcsr_get_block_p(matrix=deltap(img)%matrix, & row=brow, col=bcol, BLOCK=p_block, found=found) CPASSERT(found) iatom_old = iatom jatom_old = jatom ikind_old = ikind jkind_old = jkind img_old = img atom_pair_changed = .TRUE. ELSE atom_pair_changed = .FALSE. ENDIF IF (atom_pair_changed .OR. iset_old .NE. iset .OR. jset_old .NE. jset) THEN ncoa = npgfa(iset)*ncoset(la_max(iset)) sgfa = first_sgfa(1, iset) ncob = npgfb(jset)*ncoset(lb_max(jset)) sgfb = first_sgfb(1, jset) IF (iatom <= jatom) THEN CALL dgemm("N", "N", ncoa, nsgfb(jset), nsgfa(iset), & 1.0_dp, sphi_a(1, sgfa), SIZE(sphi_a, 1), & p_block(sgfa, sgfb), SIZE(p_block, 1), & 0.0_dp, work(1, 1), maxco) CALL dgemm("N", "T", ncoa, ncob, nsgfb(jset), & 1.0_dp, work(1, 1), maxco, & sphi_b(1, sgfb), SIZE(sphi_b, 1), & 0.0_dp, pab(1, 1), maxco) ELSE CALL dgemm("N", "N", ncob, nsgfa(iset), nsgfb(jset), & 1.0_dp, sphi_b(1, sgfb), SIZE(sphi_b, 1), & p_block(sgfb, sgfa), SIZE(p_block, 1), & 0.0_dp, work(1, 1), maxco) CALL dgemm("N", "T", ncob, ncoa, nsgfa(iset), & 1.0_dp, work(1, 1), maxco, & sphi_a(1, sgfa), SIZE(sphi_a, 1), & 0.0_dp, pab(1, 1), maxco) END IF iset_old = iset jset_old = jset ENDIF rab(:) = dist_ab(:, itask) rab2 = rab(1)*rab(1) + rab(2)*rab(2) + rab(3)*rab(3) rb(:) = ra(:) + rab(:) zetp = zeta(ipgf, iset) + zetb(jpgf, jset) na1 = (ipgf - 1)*ncoset(la_max(iset)) + 1 na2 = ipgf*ncoset(la_max(iset)) nb1 = (jpgf - 1)*ncoset(lb_max(jset)) + 1 nb2 = jpgf*ncoset(lb_max(jset)) ! takes the density matrix symmetry in account, i.e. off-diagonal blocks need to be mapped 'twice' IF (iatom == jatom .AND. img == 1) THEN scale = 1.0_dp ELSE scale = 2.0_dp END IF ! check whether we need to use fawzi's generalised collocation scheme IF (rs_rho(igrid_level)%rs_grid%desc%distributed) THEN !tasks(4,:) is 0 for replicated, 1 for distributed 2 for exceptional distributed tasks IF (tasks(4, itask) .EQ. 2) THEN use_subpatch = .TRUE. ELSE use_subpatch = .FALSE. ENDIF ELSE use_subpatch = .FALSE. 
            ENDIF
            IF (iatom <= jatom) THEN
               CALL collocate_pgf_product_rspace( &
                  la_max(iset), zeta(ipgf, iset), la_min(iset), &
                  lb_max(jset), zetb(jpgf, jset), lb_min(jset), &
                  ra, rab, rab2, scale, pab, na1 - 1, nb1 - 1, &
                  rs_rho(igrid_level)%rs_grid, cell, cube_info(igrid_level), &
                  eps_rho_rspace, &
                  ga_gb_function=FUNC_DABpADB, &
                  idir=idir, &
                  map_consistent=map_consistent, use_subpatch=use_subpatch, subpatch_pattern=tasks(6, itask), &
                  lmax_global=lmax_global)
            ELSE
               rab_inv = -rab
               CALL collocate_pgf_product_rspace( &
                  lb_max(jset), zetb(jpgf, jset), lb_min(jset), &
                  la_max(iset), zeta(ipgf, iset), la_min(iset), &
                  rb, rab_inv, rab2, scale, pab, nb1 - 1, na1 - 1, &
                  rs_rho(igrid_level)%rs_grid, cell, cube_info(igrid_level), &
                  eps_rho_rspace, &
                  ga_gb_function=FUNC_DABpADB, &
                  idir=idir, &
                  map_consistent=map_consistent, use_subpatch=use_subpatch, subpatch_pattern=tasks(6, itask), &
                  lmax_global=lmax_global)
            END IF

         END DO loop_tasks

         CALL density_rs2pw_basic(pw_env, rs_rho, drho(idir), drho_gspace(idir))

      END DO loop_xyz

      ! *** Release work storage ***
      IF (ASSOCIATED(rs_rho)) THEN
         DO i = 1, SIZE(rs_rho)
            CALL rs_grid_release(rs_rho(i)%rs_grid)
         END DO
      END IF

      IF (distributed_rs_grids) THEN
         CALL dbcsr_deallocate_matrix_set(deltap)
      ELSE
         DO img = 1, nimages
            NULLIFY (deltap(img)%matrix)
         END DO
         DEALLOCATE (deltap)
      ENDIF

      DEALLOCATE (pabt, workt)

      CALL timestop(handle)

   END SUBROUTINE calculate_drho_elec

! **************************************************************************************************
!> \brief maps a given wavefunction on the grid
!> \param mo_vectors ...
!> \param ivector ...
!> \param rho ...
!> \param rho_gspace ...
!> \param atomic_kind_set ...
!> \param qs_kind_set ...
!> \param cell ...
!> \param dft_control ...
!> \param particle_set ...
!> \param pw_env ...
!> \param basis_type ...
!> \param external_vector ...
!> \par History
!>      08.2002 created [Joost VandeVondele]
!>      03.2006 made independent of qs_env [Joost VandeVondele]
!> \note
!>      modified version of calculate_rho_elec: it maps the wavefunction given by its
!>      expansion coefficients (mo_vectors/ivector or external_vector) onto the grid;
!>      the cost is presumably dominated by the FFT and the rs->pw and back routines
!>
!>      BUGS ???
!>      does it correctly take the periodic images of the basis functions into account,
!>      i.e. is it only correct if the basis functions are localised enough to fit into a single cell?
!
************************************************************************************************** SUBROUTINE calculate_wavefunction(mo_vectors, ivector, rho, rho_gspace, & atomic_kind_set, qs_kind_set, cell, dft_control, particle_set, & pw_env, basis_type, external_vector) TYPE(cp_fm_type), POINTER :: mo_vectors INTEGER :: ivector TYPE(pw_p_type), INTENT(INOUT) :: rho, rho_gspace TYPE(atomic_kind_type), DIMENSION(:), POINTER :: atomic_kind_set TYPE(qs_kind_type), DIMENSION(:), POINTER :: qs_kind_set TYPE(cell_type), POINTER :: cell TYPE(dft_control_type), POINTER :: dft_control TYPE(particle_type), DIMENSION(:), POINTER :: particle_set TYPE(pw_env_type), POINTER :: pw_env CHARACTER(LEN=*), INTENT(IN), OPTIONAL :: basis_type REAL(KIND=dp), DIMENSION(:), OPTIONAL :: external_vector CHARACTER(LEN=*), PARAMETER :: routineN = 'calculate_wavefunction', & routineP = moduleN//':'//routineN CHARACTER(LEN=default_string_length) :: my_basis_type INTEGER :: dir, group, group_size, handle, i, iatom, igrid_level, ikind, ipgf, iset, & lmax_global, maxco, maxsgf_set, my_pos, na1, na2, nao, natom, ncoa, nseta, offset, sgfa INTEGER, ALLOCATABLE, DIMENSION(:) :: where_is_the_point INTEGER, DIMENSION(3) :: lb, location, tp, ub INTEGER, DIMENSION(:), POINTER :: la_max, la_min, mylmax, npgfa, nsgfa INTEGER, DIMENSION(:, :), POINTER :: first_sgfa LOGICAL :: local, map_it_here REAL(KIND=dp) :: dab, eps_rho_rspace, rab2, scale, zetp REAL(KIND=dp), DIMENSION(3) :: ra, rab REAL(KIND=dp), DIMENSION(:), POINTER :: eigenvector REAL(KIND=dp), DIMENSION(:, :), POINTER :: pab, sphi_a, work, zeta TYPE(cube_info_type), DIMENSION(:), POINTER :: cube_info TYPE(gridlevel_info_type), POINTER :: gridlevel_info TYPE(gto_basis_set_type), POINTER :: orb_basis_set TYPE(pw_p_type), DIMENSION(:), POINTER :: mgrid_gspace, mgrid_rspace TYPE(pw_pool_p_type), DIMENSION(:), POINTER :: pw_pools TYPE(REALSPACE_GRID_P_TYPE), DIMENSION(:), POINTER :: rs_rho IF (PRESENT(basis_type)) THEN my_basis_type = basis_type ELSE my_basis_type = "ORB" END IF CALL timeset(routineN, handle) NULLIFY (eigenvector, orb_basis_set, & pab, work, la_max, la_min, & npgfa, nsgfa, & sphi_a, zeta, first_sgfa, & rs_rho, pw_pools, mgrid_rspace, mgrid_gspace, mylmax) IF (PRESENT(external_vector)) THEN nao = SIZE(external_vector) ALLOCATE (eigenvector(nao)) eigenvector = external_vector ELSE CALL cp_fm_get_info(matrix=mo_vectors, nrow_global=nao) ALLOCATE (eigenvector(nao)) DO i = 1, nao CALL cp_fm_get_element(mo_vectors, i, ivector, eigenvector(i), local) ENDDO ENDIF ! *** set up the pw multi-grids CPASSERT(ASSOCIATED(pw_env)) CALL pw_env_get(pw_env, rs_grids=rs_rho, pw_pools=pw_pools, & cube_info=cube_info, gridlevel_info=gridlevel_info) CALL pw_pools_create_pws(pw_pools, mgrid_gspace, & use_data=COMPLEXDATA1D, & in_space=RECIPROCALSPACE) CALL pw_pools_create_pws(pw_pools, mgrid_rspace, & use_data=REALDATA3D, & in_space=REALSPACE) ! *** set up rs multi-grids DO igrid_level = 1, gridlevel_info%ngrid_levels CALL rs_grid_retain(rs_rho(igrid_level)%rs_grid) CALL rs_grid_zero(rs_rho(igrid_level)%rs_grid) END DO eps_rho_rspace = dft_control%qs_control%eps_rho_rspace ! 
*** Allocate work storage *** CALL get_atomic_kind_set(atomic_kind_set, natom=natom) CALL get_qs_kind_set(qs_kind_set, & maxco=maxco, & maxsgf_set=maxsgf_set, & basis_type=my_basis_type) ALLOCATE (pab(maxco, 1)) ALLOCATE (work(maxco, 1)) offset = 0 group = mgrid_rspace(1)%pw%pw_grid%para%group my_pos = mgrid_rspace(1)%pw%pw_grid%para%my_pos group_size = mgrid_rspace(1)%pw%pw_grid%para%group_size ALLOCATE (where_is_the_point(0:group_size - 1)) !loop over natom to find maximum value of la_max lmax_global = 0 DO iatom = 1, natom ikind = particle_set(iatom)%atomic_kind%kind_number CALL get_qs_kind(qs_kind_set(ikind), basis_set=orb_basis_set, basis_type=my_basis_type) CALL get_gto_basis_set(gto_basis_set=orb_basis_set, lmax=mylmax) lmax_global = MAX(MAXVAL(mylmax), lmax_global) END DO DO iatom = 1, natom ikind = particle_set(iatom)%atomic_kind%kind_number CALL get_qs_kind(qs_kind_set(ikind), basis_set=orb_basis_set, basis_type=my_basis_type) CALL get_gto_basis_set(gto_basis_set=orb_basis_set, & first_sgf=first_sgfa, & lmax=la_max, & lmin=la_min, & npgf=npgfa, & nset=nseta, & nsgf_set=nsgfa, & sphi=sphi_a, & zet=zeta) ra(:) = pbc(particle_set(iatom)%r, cell) rab(:) = 0.0_dp rab2 = 0.0_dp dab = 0.0_dp DO iset = 1, nseta ncoa = npgfa(iset)*ncoset(la_max(iset)) sgfa = first_sgfa(1, iset) DO i = 1, nsgfa(iset) work(i, 1) = eigenvector(offset + i) ENDDO CALL dgemm("N", "N", ncoa, 1, nsgfa(iset), & 1.0_dp, sphi_a(1, sgfa), SIZE(sphi_a, 1), & work(1, 1), SIZE(work, 1), & 0.0_dp, pab(1, 1), SIZE(pab, 1)) DO ipgf = 1, npgfa(iset) na1 = (ipgf - 1)*ncoset(la_max(iset)) + 1 na2 = ipgf*ncoset(la_max(iset)) scale = 1.0_dp zetp = zeta(ipgf, iset) igrid_level = gaussian_gridlevel(gridlevel_info, zetp) map_it_here = .FALSE. IF (.NOT. ALL(rs_rho(igrid_level)%rs_grid%desc%perd == 1)) THEN DO dir = 1, 3 ! bounds of local grid (i.e. removing the 'wings'), if periodic tp(dir) = FLOOR(DOT_PRODUCT(cell%h_inv(dir, :), ra)*rs_rho(igrid_level)%rs_grid%desc%npts(dir)) tp(dir) = MODULO(tp(dir), rs_rho(igrid_level)%rs_grid%desc%npts(dir)) IF (rs_rho(igrid_level)%rs_grid%desc%perd(dir) .NE. 1) THEN lb(dir) = rs_rho(igrid_level)%rs_grid%lb_local(dir) + rs_rho(igrid_level)%rs_grid%desc%border ub(dir) = rs_rho(igrid_level)%rs_grid%ub_local(dir) - rs_rho(igrid_level)%rs_grid%desc%border ELSE lb(dir) = rs_rho(igrid_level)%rs_grid%lb_local(dir) ub(dir) = rs_rho(igrid_level)%rs_grid%ub_local(dir) ENDIF ! distributed grid, only map if it is local to the grid location(dir) = tp(dir) + rs_rho(igrid_level)%rs_grid%desc%lb(dir) ENDDO IF (lb(1) <= location(1) .AND. location(1) <= ub(1) .AND. & lb(2) <= location(2) .AND. location(2) <= ub(2) .AND. & lb(3) <= location(3) .AND. location(3) <= ub(3)) THEN map_it_here = .TRUE. ENDIF ELSE ! not distributed, just a round-robin distribution over the full set of CPUs IF (MODULO(offset, group_size) == my_pos) map_it_here = .TRUE. 
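                     ! offset is the number of contracted spherical functions already processed;
                     ! taking it modulo group_size assigns each basis-set shell to exactly one
                     ! rank, i.e. a simple static distribution of the collocation work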
ENDIF IF (map_it_here) CALL collocate_pgf_product_rspace( & la_max(iset), zeta(ipgf, iset), la_min(iset), & 0, 0.0_dp, 0, & ra, rab, rab2, scale, pab, na1 - 1, 0, & rs_rho(igrid_level)%rs_grid, cell, cube_info(igrid_level), & eps_rho_rspace, map_consistent=.TRUE., ga_gb_function=FUNC_AB, & lmax_global=lmax_global) END DO offset = offset + nsgfa(iset) END DO END DO DO igrid_level = 1, gridlevel_info%ngrid_levels CALL rs_pw_transfer(rs_rho(igrid_level)%rs_grid, & mgrid_rspace(igrid_level)%pw, rs2pw) CALL rs_grid_release(rs_rho(igrid_level)%rs_grid) ENDDO CALL pw_zero(rho_gspace%pw) DO igrid_level = 1, gridlevel_info%ngrid_levels CALL pw_transfer(mgrid_rspace(igrid_level)%pw, & mgrid_gspace(igrid_level)%pw) CALL pw_axpy(mgrid_gspace(igrid_level)%pw, rho_gspace%pw) END DO CALL pw_transfer(rho_gspace%pw, rho%pw) ! Release work storage DEALLOCATE (eigenvector) DEALLOCATE (pab) DEALLOCATE (work) ! give back the pw multi-grids CALL pw_pools_give_back_pws(pw_pools, mgrid_gspace) CALL pw_pools_give_back_pws(pw_pools, mgrid_rspace) CALL timestop(handle) END SUBROUTINE calculate_wavefunction ! ************************************************************************************************** !> \brief low level collocation of primitive gaussian functions !> \param la_max ... !> \param zeta ... !> \param la_min ... !> \param lb_max ... !> \param zetb ... !> \param lb_min ... !> \param ra ... !> \param rab ... !> \param rab2 ... !> \param scale ... !> \param pab ... !> \param o1 ... !> \param o2 ... !> \param rsgrid ... !> \param cell ... !> \param cube_info ... !> \param eps_rho_rspace ... !> \param ga_gb_function ... !> \param lgrid ... !> \param ithread ... !> \param map_consistent ... !> \param collocate_rho0 ... !> \param rpgf0_s ... !> \param idir ... !> \param ir ... !> \param rsgauge ... !> \param rsbuf ... !> \param use_subpatch ... !> \param subpatch_pattern ... !> \param lmax_global Maximum possible value of lmax used to dimension arrays ! 
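!> \note  the primitive product is expanded about the combined Gaussian center
!>        rp = ra + zetb/(zeta+zetb)*rab; pab is first transformed according to
!>        ga_gb_function, and only grid points within the computed radius of rp are
!>        updated (orthorhombic, general and subpatch code paths are selected below)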
************************************************************************************************** SUBROUTINE collocate_pgf_product_rspace(la_max, zeta, la_min, & lb_max, zetb, lb_min, & ra, rab, rab2, scale, pab, o1, o2, & rsgrid, cell, cube_info, & eps_rho_rspace, ga_gb_function, & lgrid, ithread, & map_consistent, & collocate_rho0, & rpgf0_s, idir, ir, rsgauge, rsbuf, & use_subpatch, subpatch_pattern, & lmax_global) INTEGER, INTENT(IN) :: la_max REAL(KIND=dp), INTENT(IN) :: zeta INTEGER, INTENT(IN) :: la_min, lb_max REAL(KIND=dp), INTENT(IN) :: zetb INTEGER, INTENT(IN) :: lb_min REAL(KIND=dp), DIMENSION(3), INTENT(IN) :: ra, rab REAL(KIND=dp), INTENT(IN) :: rab2, scale REAL(KIND=dp), DIMENSION(:, :), POINTER :: pab INTEGER, INTENT(IN) :: o1, o2 TYPE(realspace_grid_type), POINTER :: rsgrid TYPE(cell_type), POINTER :: cell TYPE(cube_info_type), INTENT(IN) :: cube_info REAL(KIND=dp), INTENT(IN) :: eps_rho_rspace INTEGER, INTENT(IN) :: ga_gb_function TYPE(lgrid_type), OPTIONAL :: lgrid INTEGER, INTENT(IN), OPTIONAL :: ithread LOGICAL, INTENT(IN), OPTIONAL :: map_consistent, collocate_rho0 REAL(dp), INTENT(IN), OPTIONAL :: rpgf0_s INTEGER, INTENT(IN), OPTIONAL :: idir, ir TYPE(realspace_grid_type), POINTER, OPTIONAL :: rsgauge, rsbuf LOGICAL, OPTIONAL :: use_subpatch INTEGER(KIND=int_8), OPTIONAL, INTENT(IN):: subpatch_pattern INTEGER, INTENT(IN) :: lmax_global CHARACTER(len=*), PARAMETER :: routineN = 'collocate_pgf_product_rspace', & routineP = moduleN//':'//routineN INTEGER :: cmax, gridbounds(2, 3), i, ico, icoef, ider1, ider2, ig, ithread_l, & jco, k, l, la_max_local, la_min_local, lb_max_local, lb_min_local, & length, lxa, lxb, lxy, lxyz, lya, lyb, & lza, lzb, o1_local, o2_local, offset, start INTEGER, DIMENSION(3) :: cubecenter, lb_cube, ng, & ub_cube INTEGER, DIMENSION(:), POINTER :: sphere_bounds LOGICAL :: my_collocate_rho0, & my_map_consistent, subpatch_collocate REAL(KIND=dp) :: a, b, binomial_k_lxa, binomial_l_lxb, cutoff, f, pg, & prefactor, radius, rpg, zetp REAL(KIND=dp), DIMENSION(3) :: dr, rap, rb, rbp, roffset, rp REAL(KIND=dp), DIMENSION(:, :), POINTER :: pab_local REAL(KIND=dp), DIMENSION(:, :, :), & POINTER :: grid INTEGER :: lxp, lyp, lzp, lp, lxpm, iaxis INTEGER, ALLOCATABLE, DIMENSION(:, :) :: map REAL(kind=dp) :: p_ele REAL(kind=dp), DIMENSION(0:lmax_global*2, 0:lmax_global, 0:lmax_global, 3) :: alpha REAL(kind=dp), DIMENSION(((lmax_global*2 + 1)*(lmax_global*2 + 2)*(lmax_global*2 + 3))/6) :: coef_xyz REAL(kind=dp), DIMENSION(((lmax_global*2 + 1)*(lmax_global*2 + 2))/2) :: coef_xyt REAL(kind=dp), DIMENSION(0:lmax_global*2) :: coef_xtt REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :, :) :: pol_z REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :, :) :: pol_y REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :) :: pol_x REAL(KIND=dp) :: t_exp_1, t_exp_2, t_exp_min_1, t_exp_min_2, t_exp_plus_1, t_exp_plus_2 REAL(kind=dp), POINTER, DIMENSION(:, :, :) :: grid_tmp, gauge subpatch_collocate = .FALSE. IF (PRESENT(use_subpatch)) THEN IF (use_subpatch) THEN subpatch_collocate = .TRUE. CPASSERT(PRESENT(subpatch_pattern)) ENDIF ENDIF IF (PRESENT(ithread)) THEN ithread_l = ithread ELSE ithread_l = 0 ENDIF ! use identical radii for integrate and collocate ? IF (PRESENT(map_consistent)) THEN my_map_consistent = map_consistent ELSE my_map_consistent = .FALSE. ENDIF IF (PRESENT(collocate_rho0) .AND. PRESENT(rpgf0_s)) THEN my_collocate_rho0 = collocate_rho0 ELSE my_collocate_rho0 = .FALSE. 
END IF zetp = zeta + zetb f = zetb/zetp rap(:) = f*rab(:) rbp(:) = rap(:) - rab(:) rp(:) = ra(:) + rap(:) rb(:) = ra(:) + rab(:) ! check to avoid overflows a = MAXVAL(ABS(rsgrid%desc%dh)) a = 300._dp/(a*a) ! CPASSERT(zetp < a) IF (my_map_consistent) THEN cutoff = 1.0_dp prefactor = EXP(-zeta*f*rab2) radius = exp_radius_very_extended(la_min, la_max, lb_min, lb_max, ra=ra, rb=rb, rp=rp, & zetp=zetp, eps=eps_rho_rspace, & prefactor=prefactor, cutoff=cutoff) prefactor = scale*EXP(-zeta*f*rab2) ELSE IF (my_collocate_rho0) THEN cutoff = 0.0_dp prefactor = 1.0_dp radius = rpgf0_s ELSE cutoff = 0.0_dp prefactor = scale*EXP(-zeta*f*rab2) radius = exp_radius_very_extended(la_min, la_max, lb_min, lb_max, pab, o1, o2, ra, rb, rp, & zetp, eps_rho_rspace, prefactor, cutoff) ENDIF a = MAXVAL(ABS(rsgrid%desc%dh)) IF (radius .LT. a/2.0_dp) THEN RETURN END IF ! it's a choice to compute lX_min/max, pab here, ! this way we get the same radius as we use for the corresponding density SELECT CASE (ga_gb_function) CASE (FUNC_DADB) la_max_local = la_max + 1 la_min_local = MAX(la_min - 1, 0) lb_max_local = lb_max + 1 lb_min_local = MAX(lb_min - 1, 0) ! create a new pab_local so that mapping pab_local with pgf_a pgf_b ! is equivalent to mapping pab with 0.5 * (nabla pgf_a) . (nabla pgf_b) ! (ddx pgf_a ) (ddx pgf_b) = (lax pgf_{a-1x} - 2*zeta*pgf_{a+1x})*(lbx pgf_{b-1x} - 2*zetb*pgf_{b+1x}) ! cleaner would possibly be to touch pzyx directly (avoiding the following allocate) ALLOCATE (pab_local(ncoset(la_max_local), ncoset(lb_max_local))) pab_local = 0.0_dp DO lxa = 0, la_max DO lxb = 0, lb_max DO lya = 0, la_max - lxa DO lyb = 0, lb_max - lxb DO lza = MAX(la_min - lxa - lya, 0), la_max - lxa - lya DO lzb = MAX(lb_min - lxb - lyb, 0), lb_max - lxb - lyb ! this element of pab results in 12 elements of pab_local CALL prepare_dadb(pab_local, pab, lxa, lya, lza, lxb, lyb, lzb, o1, o2, zeta, zetb) ENDDO ENDDO ENDDO ENDDO ENDDO ENDDO o1_local = 0 o2_local = 0 pab_local = pab_local*0.5_dp CASE (FUNC_ADBmDAB) CPASSERT(PRESENT(idir)) la_max_local = la_max + 1 la_min_local = MAX(la_min - 1, 0) lb_max_local = lb_max + 1 lb_min_local = MAX(lb_min - 1, 0) ! create a new pab_local so that mapping pab_local with pgf_a pgf_b ! is equivalent to mapping pab with ! pgf_a (nabla_{idir} pgf_b) - (nabla_{idir} pgf_a) pgf_b ! ( pgf_a ) (ddx pgf_b) - (ddx pgf_a)( pgf_b ) = ! pgf_a *(lbx pgf_{b-1x} - 2*zetb*pgf_{b+1x}) - ! (lax pgf_{a-1x} - 2*zeta*pgf_{a+1x}) pgf_b ALLOCATE (pab_local(ncoset(la_max_local), ncoset(lb_max_local))) pab_local = 0.0_dp DO lxa = 0, la_max DO lxb = 0, lb_max DO lya = 0, la_max - lxa DO lyb = 0, lb_max - lxb DO lza = MAX(la_min - lxa - lya, 0), la_max - lxa - lya DO lzb = MAX(lb_min - lxb - lyb, 0), lb_max - lxb - lyb ! this element of pab results in 4 elements of pab_local CALL prepare_adb_m_dab(pab_local, pab, idir, & lxa, lya, lza, lxb, lyb, lzb, o1, o2, zeta, zetb) END DO END DO END DO END DO END DO END DO o1_local = 0 o2_local = 0 CASE (FUNC_DABpADB) CPASSERT(PRESENT(idir)) la_max_local = la_max + 1 la_min_local = MAX(la_min - 1, 0) lb_max_local = lb_max + 1 lb_min_local = MAX(lb_min - 1, 0) ! create a new pab_local so that mapping pab_local with pgf_a pgf_b ! is equivalent to mapping pab with ! pgf_a (nabla_{idir} pgf_b) + (nabla_{idir} pgf_a) pgf_b ! ( pgf_a ) (ddx pgf_b) + (ddx pgf_a)( pgf_b ) = ! pgf_a *(lbx pgf_{b-1x} + 2*zetb*pgf_{b+1x}) + ! 
(lax pgf_{a-1x} + 2*zeta*pgf_{a+1x}) pgf_b ALLOCATE (pab_local(ncoset(la_max_local), ncoset(lb_max_local))) pab_local = 0.0_dp DO lxa = 0, la_max DO lxb = 0, lb_max DO lya = 0, la_max - lxa DO lyb = 0, lb_max - lxb DO lza = MAX(la_min - lxa - lya, 0), la_max - lxa - lya DO lzb = MAX(lb_min - lxb - lyb, 0), lb_max - lxb - lyb ! this element of pab results in 4 elements of pab_local CALL prepare_dab_p_adb(pab_local, pab, idir, & lxa, lya, lza, lxb, lyb, lzb, o1, o2, zeta, zetb) END DO END DO END DO END DO END DO END DO o1_local = 0 o2_local = 0 CASE (FUNC_DX, FUNC_DY, FUNC_DZ) ider1 = ga_gb_function - 500 la_max_local = la_max + 1 la_min_local = MAX(la_min - 1, 0) lb_max_local = lb_max + 1 lb_min_local = MAX(lb_min - 1, 0) ! create a new pab_local so that mapping pab_local with pgf_a pgf_b ! is equivalent to mapping pab with ! d_{ider1} pgf_a d_{ider1} pgf_b ! dx pgf_a dx pgf_b = ! (lax pgf_{a-1x})*(lbx pgf_{b-1x}) - 2*zetb*lax*pgf_{a-1x}*pgf_{b+1x} - ! lbx pgf_{b-1x}*2*zeta*pgf_{a+1x}+ 4*zeta*zetab*pgf_{a+1x}pgf_{b+1x} ALLOCATE (pab_local(ncoset(la_max_local), ncoset(lb_max_local))) pab_local = 0.0_dp DO lxa = 0, la_max DO lxb = 0, lb_max DO lya = 0, la_max - lxa DO lyb = 0, lb_max - lxb DO lza = MAX(la_min - lxa - lya, 0), la_max - lxa - lya DO lzb = MAX(lb_min - lxb - lyb, 0), lb_max - lxb - lyb ! this element of pab results in 4 elements of pab_local CALL prepare_dIadIb(pab_local, pab, ider1, & lxa, lya, lza, lxb, lyb, lzb, o1, o2, zeta, zetb) END DO END DO END DO END DO END DO END DO o1_local = 0 o2_local = 0 CASE (FUNC_DXDY, FUNC_DYDZ, FUNC_DZDX) ider1 = ga_gb_function - 600 ider2 = ga_gb_function - 600 + 1 IF (ider2 > 3) ider2 = ider1 - 2 la_max_local = la_max + 2 la_min_local = MAX(la_min - 2, 0) lb_max_local = lb_max + 2 lb_min_local = MAX(lb_min - 2, 0) ! create a new pab_local so that mapping pab_local with pgf_a pgf_b ! is equivalent to mapping pab with ! d_{ider1} pgf_a d_{ider1} pgf_b ALLOCATE (pab_local(ncoset(la_max_local), ncoset(lb_max_local))) pab_local = 0.0_dp DO lxa = 0, la_max DO lxb = 0, lb_max DO lya = 0, la_max - lxa DO lyb = 0, lb_max - lxb DO lza = MAX(la_min - lxa - lya, 0), la_max - lxa - lya DO lzb = MAX(lb_min - lxb - lyb, 0), lb_max - lxb - lyb ! this element of pab results in 16 elements of pab_local CALL prepare_dijadijb(pab_local, pab, ider1, ider2, & lxa, lya, lza, lxb, lyb, lzb, o1, o2, zeta, zetb) END DO END DO END DO END DO END DO END DO o1_local = 0 o2_local = 0 CASE (FUNC_DXDX, FUNC_DYDY, FUNC_DZDZ) ider1 = ga_gb_function - 603 la_max_local = la_max + 2 la_min_local = MAX(la_min - 2, 0) lb_max_local = lb_max + 2 lb_min_local = MAX(lb_min - 2, 0) ! create a new pab_local so that mapping pab_local with pgf_a pgf_b ! is equivalent to mapping pab with ! dd_{ider1} pgf_a dd_{ider1} pgf_b ALLOCATE (pab_local(ncoset(la_max_local), ncoset(lb_max_local))) pab_local = 0.0_dp DO lxa = 0, la_max DO lxb = 0, lb_max DO lya = 0, la_max - lxa DO lyb = 0, lb_max - lxb DO lza = MAX(la_min - lxa - lya, 0), la_max - lxa - lya DO lzb = MAX(lb_min - lxb - lyb, 0), lb_max - lxb - lyb ! this element of pab results in 9 elements of pab_local CALL prepare_diiadiib(pab_local, pab, ider1, & lxa, lya, lza, lxb, lyb, lzb, o1, o2, zeta, zetb) END DO END DO END DO END DO END DO END DO o1_local = 0 o2_local = 0 CASE (FUNC_ARDBmDARB) CPASSERT(PRESENT(idir)) CPASSERT(PRESENT(ir)) la_max_local = la_max + 1 la_min_local = MAX(la_min - 1, 0) lb_max_local = lb_max + 2 lb_min_local = MAX(lb_min - 1, 0) ! create a new pab_local so that mapping pab_local with pgf_a pgf_b ! 
is equivalent to mapping pab with ! pgf_a (r-Rb)_{ir} (nabla_{idir} pgf_b) - (nabla_{idir} pgf_a) (r-Rb)_{ir} pgf_b ! ( pgf_a )(r-Rb)_{ir} (ddx pgf_b) - (ddx pgf_a) (r-Rb)_{ir} ( pgf_b ) = ! pgf_a *(lbx pgf_{b-1x+1ir} - 2*zetb*pgf_{b+1x+1ir}) - ! (lax pgf_{a-1x} - 2*zeta*pgf_{a+1x}) pgf_{b+1ir} ALLOCATE (pab_local(ncoset(la_max_local), ncoset(lb_max_local))) pab_local = 0.0_dp DO lxa = 0, la_max DO lxb = 0, lb_max DO lya = 0, la_max - lxa DO lyb = 0, lb_max - lxb DO lza = MAX(la_min - lxa - lya, 0), la_max - lxa - lya DO lzb = MAX(lb_min - lxb - lyb, 0), lb_max - lxb - lyb ! this element of pab results in 4 elements of pab_local CALL prepare_ardb_m_darb(pab_local, pab, idir, ir, & lxa, lya, lza, lxb, lyb, lzb, o1, o2, zeta, zetb) END DO END DO END DO END DO END DO END DO o1_local = 0 o2_local = 0 CASE (FUNC_ARB) CPASSERT(PRESENT(ir)) la_max_local = la_max la_min_local = la_min lb_max_local = lb_max + 1 lb_min_local = lb_min ! create a new pab_local so that mapping pab_local with pgf_a pgf_b ! is equivalent to mapping pab with ! pgf_a (r-Rb)_{ir} pgf_b = pgf_a * pgf_{b+1ir} ALLOCATE (pab_local(ncoset(la_max_local), ncoset(lb_max_local))) pab_local = 0.0_dp DO lxa = 0, la_max DO lxb = 0, lb_max DO lya = 0, la_max - lxa DO lyb = 0, lb_max - lxb DO lza = MAX(la_min - lxa - lya, 0), la_max - lxa - lya DO lzb = MAX(lb_min - lxb - lyb, 0), lb_max - lxb - lyb ! this element of pab results in 4 elements of pab_local CALL prepare_arb(pab_local, pab, ir, lxa, lya, lza, lxb, lyb, lzb, o1, o2) END DO END DO END DO END DO END DO END DO o1_local = 0 o2_local = 0 CASE (FUNC_AB) la_max_local = la_max la_min_local = la_min lb_max_local = lb_max lb_min_local = lb_min pab_local => pab o1_local = o1 o2_local = o2 CASE DEFAULT CPASSERT(.FALSE.) END SELECT ng(:) = rsgrid%desc%npts(:) grid => rsgrid%r(:, :, :) gridbounds(1, 1) = LBOUND(GRID, 1) gridbounds(2, 1) = UBOUND(GRID, 1) gridbounds(1, 2) = LBOUND(GRID, 2) gridbounds(2, 2) = UBOUND(GRID, 2) gridbounds(1, 3) = LBOUND(GRID, 3) gridbounds(2, 3) = UBOUND(GRID, 3) IF (PRESENT(ir) .AND. PRESENT(rsgauge) .AND. PRESENT(rsbuf)) THEN grid => rsbuf%r(:, :, :) grid_tmp => rsgrid%r(:, :, :) gauge => rsgauge%r(:, :, :) ENDIF ! *** initialise the coefficient matrix, we transform the sum ! ! sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} * ! (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb (y-a_y)**lya (z-a_z)**lza ! ! into ! ! sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp (z-p_z)**lzp ! ! where p is center of the product gaussian, and lp = la_max + lb_max ! (current implementation is l**7) ! lp = la_max_local + lb_max_local ! ! compute polynomial expansion coefs -> (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls,lxa,lxb,1)*(x-p)**ls ! ! *** make the alpha matrix *** alpha(:, :, :, :) = 0.0_dp DO iaxis = 1, 3 DO lxa = 0, la_max_local DO lxb = 0, lb_max_local binomial_k_lxa = 1.0_dp a = 1.0_dp DO k = 0, lxa binomial_l_lxb = 1.0_dp b = 1.0_dp DO l = 0, lxb alpha(lxa - l + lxb - k, lxa, lxb, iaxis) = alpha(lxa - l + lxb - k, lxa, lxb, iaxis) + & binomial_k_lxa*binomial_l_lxb*a*b binomial_l_lxb = binomial_l_lxb*REAL(lxb - l, dp)/REAL(l + 1, dp) b = b*(rp(iaxis) - (ra(iaxis) + rab(iaxis))) ENDDO binomial_k_lxa = binomial_k_lxa*REAL(lxa - k, dp)/REAL(k + 1, dp) a = a*(-ra(iaxis) + rp(iaxis)) ENDDO ENDDO ENDDO ENDDO ! ! compute P_{lxp,lyp,lzp} given P_{lxa,lya,lza,lxb,lyb,lzb} and alpha(ls,lxa,lxb,1) ! use a three step procedure ! we don't store zeros, so counting is done using lxyz,lxy in order to have contiguous memory access in collocate_fast.F ! 
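      !
      ! in one dimension the alpha matrix built above encodes the shift to the product center p:
      !
      !    (x-a_x)**lxa * (x-b_x)**lxb = sum_{ls} alpha(ls,lxa,lxb,1) * (x-p_x)**ls
      !
      ! with  alpha(ls,lxa,lxb,1) = sum_{k,l; (lxa-k)+(lxb-l)=ls} binom(lxa,k)*binom(lxb,l)
      !                             *(p_x-a_x)**k * (p_x-b_x)**l
      !
      ! the three contractions below apply this separately along x, y and z to build coef_xyz
      ! from pab_local
      !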
lxyz = 0 DO lzp = 0, lp DO lyp = 0, lp - lzp DO lxp = 0, lp - lzp - lyp lxyz = lxyz + 1 coef_xyz(lxyz) = 0.0_dp ENDDO ENDDO ENDDO DO lzb = 0, lb_max_local DO lza = 0, la_max_local lxy = 0 DO lyp = 0, lp - lza - lzb DO lxp = 0, lp - lza - lzb - lyp lxy = lxy + 1 coef_xyt(lxy) = 0.0_dp ENDDO lxy = lxy + lza + lzb ENDDO DO lyb = 0, lb_max_local - lzb DO lya = 0, la_max_local - lza lxpm = (lb_max_local - lzb - lyb) + (la_max_local - lza - lya) coef_xtt(0:lxpm) = 0.0_dp DO lxb = MAX(lb_min_local - lzb - lyb, 0), lb_max_local - lzb - lyb DO lxa = MAX(la_min_local - lza - lya, 0), la_max_local - lza - lya ico = coset(lxa, lya, lza) jco = coset(lxb, lyb, lzb) p_ele = prefactor*pab_local(o1_local + ico, o2_local + jco) DO lxp = 0, lxa + lxb coef_xtt(lxp) = coef_xtt(lxp) + p_ele*alpha(lxp, lxa, lxb, 1) ENDDO ENDDO ENDDO lxy = 0 DO lyp = 0, lya + lyb DO lxp = 0, lp - lza - lzb - lya - lyb lxy = lxy + 1 coef_xyt(lxy) = coef_xyt(lxy) + alpha(lyp, lya, lyb, 2)*coef_xtt(lxp) ENDDO lxy = lxy + lza + lzb + lya + lyb - lyp ENDDO ENDDO ENDDO lxyz = 0 DO lzp = 0, lza + lzb lxy = 0 DO lyp = 0, lp - lza - lzb DO lxp = 0, lp - lza - lzb - lyp lxy = lxy + 1; lxyz = lxyz + 1 coef_xyz(lxyz) = coef_xyz(lxyz) + alpha(lzp, lza, lzb, 3)*coef_xyt(lxy) ENDDO lxy = lxy + lza + lzb; lxyz = lxyz + lza + lzb - lzp ENDDO DO lyp = lp - lza - lzb + 1, lp - lzp DO lxp = 0, lp - lyp - lzp lxyz = lxyz + 1 ENDDO ENDDO ENDDO ENDDO ENDDO IF (subpatch_collocate) THEN CALL collocate_general_subpatch() ELSE IF (rsgrid%desc%orthorhombic) THEN CALL collocate_ortho() ! CALL collocate_general() ELSE CALL collocate_general_wings() !CALL collocate_general_opt() END IF END IF IF (ga_gb_function /= FUNC_AB) THEN DEALLOCATE (pab_local) ENDIF CONTAINS ! ! this treats efficiently the orthogonal case ! ! ************************************************************************************************** !> \brief ... ! ************************************************************************************************** SUBROUTINE collocate_ortho() ! *** properties of the grid *** ! notice we're in the ortho case dr(1) = rsgrid%desc%dh(1, 1) dr(2) = rsgrid%desc%dh(2, 2) dr(3) = rsgrid%desc%dh(3, 3) ! *** get the sub grid properties for the given radius *** CALL return_cube(cube_info, radius, lb_cube, ub_cube, sphere_bounds) cmax = MAXVAL(ub_cube) ! *** position of the gaussian product ! ! this is the actual definition of the position on the grid ! i.e. a point rp(:) gets here grid coordinates ! MODULO(rp(:)/dr(:),ng(:))+1 ! hence (0.0,0.0,0.0) in real space is rsgrid%lb on the rsgrid ((1,1,1) on grid) ! ALLOCATE (map(-cmax:cmax, 3)) CALL compute_cube_center(cubecenter, rsgrid%desc, zeta, zetb, ra, rab) roffset(:) = rp(:) - REAL(cubecenter(:), dp)*dr(:) ! *** a mapping so that the ig corresponds to the right grid point DO i = 1, 3 IF (rsgrid%desc%perd(i) == 1) THEN start = lb_cube(i) DO offset = MODULO(cubecenter(i) + start, ng(i)) + 1 - start length = MIN(ub_cube(i), ng(i) - offset) - start DO ig = start, start + length map(ig, i) = ig + offset END DO IF (start + length .GE. ub_cube(i)) EXIT start = start + length + 1 END DO ELSE ! this takes partial grid + border regions into account offset = MODULO(cubecenter(i) + lb_cube(i) + rsgrid%desc%lb(i) - rsgrid%lb_local(i), ng(i)) + 1 - lb_cube(i) ! check for out of bounds IF (ub_cube(i) + offset > UBOUND(grid, i) .OR. lb_cube(i) + offset < LBOUND(grid, i)) THEN CPASSERT(.FALSE.) 
ENDIF DO ig = lb_cube(i), ub_cube(i) map(ig, i) = ig + offset END DO END IF ENDDO ALLOCATE (pol_z(1:2, 0:lp, -cmax:0)) ALLOCATE (pol_y(1:2, 0:lp, -cmax:0)) ALLOCATE (pol_x(0:lp, -cmax:cmax)) IF (PRESENT(ir) .AND. PRESENT(rsgauge)) CALL collocate_ortho_set_to_0() #include "prep.f90" IF (PRESENT(lgrid)) THEN #include "call_collocate_omp.f90" ELSE #include "call_collocate.f90" END IF IF (PRESENT(ir) .AND. PRESENT(rsgauge)) CALL collocate_gauge_ortho() ! deallocation needed to pass around a pgi bug.. DEALLOCATE (pol_z) DEALLOCATE (pol_y) DEALLOCATE (pol_x) DEALLOCATE (map) END SUBROUTINE collocate_ortho ! ************************************************************************************************** !> \brief ... ! ************************************************************************************************** SUBROUTINE collocate_gauge_ortho() INTEGER :: i, igmax, igmin, j, j2, jg, jg2, jgmin, & k, k2, kg, kg2, kgmin, sci REAL(KIND=dp) :: point(3, 4), res(4), x, y, y2, z, z2 ! notice we're in the ortho case dr(1) = rsgrid%desc%dh(1, 1) dr(2) = rsgrid%desc%dh(2, 2) dr(3) = rsgrid%desc%dh(3, 3) ! sci = 1 kgmin = sphere_bounds(sci) sci = sci + 1 DO kg = kgmin, 0 kg2 = 1 - kg k = map(kg, 3) k2 = map(kg2, 3) jgmin = sphere_bounds(sci) sci = sci + 1 z = (REAL(kg, dp) + REAL(cubecenter(3), dp))*dr(3) z2 = (REAL(kg2, dp) + REAL(cubecenter(3), dp))*dr(3) DO jg = jgmin, 0 jg2 = 1 - jg j = map(jg, 2) j2 = map(jg2, 2) igmin = sphere_bounds(sci) sci = sci + 1 igmax = 1 - igmin y = (REAL(jg, dp) + REAL(cubecenter(2), dp))*dr(2) y2 = (REAL(jg2, dp) + REAL(cubecenter(2), dp))*dr(2) DO ig = igmin, igmax i = map(ig, 1) x = (REAL(ig, dp) + REAL(cubecenter(1), dp))*dr(1) point(1, 1) = x; point(2, 1) = y; point(3, 1) = z point(1, 2) = x; point(2, 2) = y2; point(3, 2) = z point(1, 3) = x; point(2, 3) = y; point(3, 3) = z2 point(1, 4) = x; point(2, 4) = y2; point(3, 4) = z2 ! res(1) = (point(ir, 1) - rb(ir)) - gauge(i, j, k) res(2) = (point(ir, 2) - rb(ir)) - gauge(i, j2, k) res(3) = (point(ir, 3) - rb(ir)) - gauge(i, j, k2) res(4) = (point(ir, 4) - rb(ir)) - gauge(i, j2, k2) ! grid_tmp(i, j, k) = grid_tmp(i, j, k) + grid(i, j, k)*res(1) grid_tmp(i, j2, k) = grid_tmp(i, j2, k) + grid(i, j2, k)*res(2) grid_tmp(i, j, k2) = grid_tmp(i, j, k2) + grid(i, j, k2)*res(3) grid_tmp(i, j2, k2) = grid_tmp(i, j2, k2) + grid(i, j2, k2)*res(4) ENDDO ENDDO ENDDO END SUBROUTINE collocate_gauge_ortho ! ************************************************************************************************** !> \brief ... ! ************************************************************************************************** SUBROUTINE collocate_ortho_set_to_0() INTEGER :: i, igmax, igmin, j, j2, jg, jg2, jgmin, & k, k2, kg, kg2, kgmin, sci ! dr(1) = rsgrid%desc%dh(1, 1) dr(2) = rsgrid%desc%dh(2, 2) dr(3) = rsgrid%desc%dh(3, 3) ! sci = 1 kgmin = sphere_bounds(sci) sci = sci + 1 DO kg = kgmin, 0 kg2 = 1 - kg k = map(kg, 3) k2 = map(kg2, 3) jgmin = sphere_bounds(sci) sci = sci + 1 DO jg = jgmin, 0 jg2 = 1 - jg j = map(jg, 2) j2 = map(jg2, 2) igmin = sphere_bounds(sci) sci = sci + 1 igmax = 1 - igmin DO ig = igmin, igmax i = map(ig, 1) grid(i, j, k) = 0.0_dp grid(i, j2, k) = 0.0_dp grid(i, j, k2) = 0.0_dp grid(i, j2, k2) = 0.0_dp ENDDO ENDDO ENDDO END SUBROUTINE collocate_ortho_set_to_0 ! ! this is a general 'optimized' routine to do the collocation ! ! ************************************************************************************************** !> \brief ... ! 
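!>        non-orthorhombic case: the polynomial P_{lxp,lyp,lzp} is re-expressed in
!>        grid-index coordinates (i,j,k) by a multinomial expansion of the cell vectors
!>        (hmatgridp); the bounds of the innermost i loop follow from a quadratic
!>        equation in i for |r-rp| <= radius, and the Gaussian is evaluated with a
!>        two-term exponential recursion along i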
************************************************************************************************** SUBROUTINE collocate_general_opt() INTEGER :: i, i_index, il, ilx, ily, ilz, index_max(3), index_min(3), ismax, ismin, j, & j_index, jl, jlx, jly, jlz, k, k_index, kl, klx, kly, klz, lpx, lpy, lpz, lx, ly, lz, & offset(3) INTEGER, ALLOCATABLE, DIMENSION(:) :: grid_map INTEGER, ALLOCATABLE, DIMENSION(:, :, :) :: coef_map REAL(KIND=dp) :: a, b, c, d, di, dip, dj, djp, dk, dkp, & exp0i, exp1i, exp2i, gp(3), & hmatgrid(3, 3), pointj(3), pointk(3), & res, v(3) REAL(KIND=dp), ALLOCATABLE, DIMENSION(:) :: coef_ijk REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :) :: hmatgridp ! ! transform P_{lxp,lyp,lzp} into a P_{lip,ljp,lkp} such that ! sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-x_p)**lxp (y-y_p)**lyp (z-z_p)**lzp = ! sum_{lip,ljp,lkp} P_{lip,ljp,lkp} (i-i_p)**lip (j-j_p)**ljp (k-k_p)**lkp ! ALLOCATE (coef_ijk(((lp + 1)*(lp + 2)*(lp + 3))/6)) ! aux mapping array to simplify life ALLOCATE (coef_map(0:lp, 0:lp, 0:lp)) coef_map = HUGE(coef_map) lxyz = 0 DO lzp = 0, lp DO lyp = 0, lp - lzp DO lxp = 0, lp - lzp - lyp lxyz = lxyz + 1 coef_ijk(lxyz) = 0.0_dp coef_map(lxp, lyp, lzp) = lxyz ENDDO ENDDO ENDDO ! cell hmat in grid points hmatgrid = rsgrid%desc%dh ! center in grid coords gp = MATMUL(rsgrid%desc%dh_inv, rp) cubecenter(:) = FLOOR(gp) ! transform using multinomials ALLOCATE (hmatgridp(3, 3, 0:lp)) hmatgridp(:, :, 0) = 1.0_dp DO k = 1, lp hmatgridp(:, :, k) = hmatgridp(:, :, k - 1)*hmatgrid(:, :) ENDDO lpx = lp DO klx = 0, lpx DO jlx = 0, lpx - klx DO ilx = 0, lpx - klx - jlx lx = ilx + jlx + klx lpy = lp - lx DO kly = 0, lpy DO jly = 0, lpy - kly DO ily = 0, lpy - kly - jly ly = ily + jly + kly lpz = lp - lx - ly DO klz = 0, lpz DO jlz = 0, lpz - klz DO ilz = 0, lpz - klz - jlz lz = ilz + jlz + klz il = ilx + ily + ilz jl = jlx + jly + jlz kl = klx + kly + klz coef_ijk(coef_map(il, jl, kl)) = & coef_ijk(coef_map(il, jl, kl)) + coef_xyz(coef_map(lx, ly, lz))* & hmatgridp(1, 1, ilx)*hmatgridp(1, 2, jlx)*hmatgridp(1, 3, klx)* & hmatgridp(2, 1, ily)*hmatgridp(2, 2, jly)*hmatgridp(2, 3, kly)* & hmatgridp(3, 1, ilz)*hmatgridp(3, 2, jlz)*hmatgridp(3, 3, klz)* & fac(lx)*fac(ly)*fac(lz)/ & (fac(ilx)*fac(ily)*fac(ilz)*fac(jlx)*fac(jly)*fac(jlz)*fac(klx)*fac(kly)*fac(klz)) ENDDO ENDDO ENDDO ENDDO ENDDO ENDDO ENDDO ENDDO ENDDO CALL return_cube_nonortho(cube_info, radius, index_min, index_max, rp) offset(:) = MODULO(index_min(:) + rsgrid%desc%lb(:) - rsgrid%lb_local(:), ng(:)) + 1 ALLOCATE (grid_map(index_min(1):index_max(1))) DO i = index_min(1), index_max(1) grid_map(i) = MODULO(i, ng(1)) + 1 IF (rsgrid%desc%perd(1) == 1) THEN grid_map(i) = MODULO(i, ng(1)) + 1 ELSE grid_map(i) = i - index_min(1) + offset(1) ENDIF ENDDO ! 
go over the grid, but cycle if the point is not within the radius DO k = index_min(3), index_max(3) dk = k - gp(3) pointk = hmatgrid(:, 3)*dk IF (rsgrid%desc%perd(3) == 1) THEN k_index = MODULO(k, ng(3)) + 1 ELSE k_index = k - index_min(3) + offset(3) ENDIF coef_xyt = 0.0_dp lxyz = 0 dkp = 1.0_dp DO kl = 0, lp lxy = 0 DO jl = 0, lp - kl DO il = 0, lp - kl - jl lxyz = lxyz + 1; lxy = lxy + 1 coef_xyt(lxy) = coef_xyt(lxy) + coef_ijk(lxyz)*dkp ENDDO lxy = lxy + kl ENDDO dkp = dkp*dk ENDDO DO j = index_min(2), index_max(2) dj = j - gp(2) pointj = pointk + hmatgrid(:, 2)*dj IF (rsgrid%desc%perd(2) == 1) THEN j_index = MODULO(j, ng(2)) + 1 ELSE j_index = j - index_min(2) + offset(2) ENDIF coef_xtt = 0.0_dp lxy = 0 djp = 1.0_dp DO jl = 0, lp DO il = 0, lp - jl lxy = lxy + 1 coef_xtt(il) = coef_xtt(il) + coef_xyt(lxy)*djp ENDDO djp = djp*dj ENDDO ! find bounds for the inner loop ! based on a quadratic equation in i ! a*i**2+b*i+c=radius**2 v = pointj - gp(1)*hmatgrid(:, 1) a = DOT_PRODUCT(hmatgrid(:, 1), hmatgrid(:, 1)) b = 2*DOT_PRODUCT(v, hmatgrid(:, 1)) c = DOT_PRODUCT(v, v) d = b*b - 4*a*(c - radius**2) IF (d < 0) THEN CYCLE ELSE d = SQRT(d) ismin = CEILING((-b - d)/(2*a)) ismax = FLOOR((-b + d)/(2*a)) ENDIF ! prepare for computing -zetp*rsq a = -zetp*a b = -zetp*b c = -zetp*c i = ismin - 1 ! the recursion relation might have to be done ! from the center of the gaussian (in both directions) ! instead as the current implementation from an edge exp2i = EXP((a*i + b)*i + c) exp1i = EXP(2*a*i + a + b) exp0i = EXP(2*a) DO i = ismin, ismax di = i - gp(1) ! polynomial terms res = 0.0_dp dip = 1.0_dp DO il = 0, lp res = res + coef_xtt(il)*dip dip = dip*di ENDDO ! the exponential recursion exp2i = exp2i*exp1i exp1i = exp1i*exp0i res = res*exp2i i_index = grid_map(i) IF (PRESENT(lgrid)) THEN ig = (k_index - 1)*ng(2)*ng(1) + (j_index - 1)*ng(1) + (i_index - 1) + 1 lgrid%r(ig, ithread_l) = lgrid%r(ig, ithread_l) + res ELSE grid(i_index, j_index, k_index) = grid(i_index, j_index, k_index) + res ENDIF ENDDO ENDDO ENDDO !t2=nanotime_ia32() !write(*,*) t2-t1 ! deallocation needed to pass around a pgi bug.. DEALLOCATE (coef_ijk) DEALLOCATE (coef_map) DEALLOCATE (hmatgridp) DEALLOCATE (grid_map) END SUBROUTINE collocate_general_opt ! ************************************************************************************************** !> \brief ... ! ************************************************************************************************** SUBROUTINE collocate_general_subpatch() INTEGER, DIMENSION(2, 3) :: local_b INTEGER, DIMENSION(3) :: local_s, periodic REAL(dp), DIMENSION((lp+1)*(lp+2)*(lp+3)/6) :: poly_d3 periodic = 1 ! 
cell%perd CALL poly_cp2k2d3(coef_xyz, lp, poly_d3) local_b(1, :) = rsgrid%lb_real - rsgrid%desc%lb local_b(2, :) = rsgrid%ub_real - rsgrid%desc%lb local_s = rsgrid%lb_real - rsgrid%lb_local IF (BTEST(subpatch_pattern, 0)) local_b(1, 1) = local_b(1, 1) - rsgrid%desc%border IF (BTEST(subpatch_pattern, 1)) local_b(2, 1) = local_b(2, 1) + rsgrid%desc%border IF (BTEST(subpatch_pattern, 2)) local_b(1, 2) = local_b(1, 2) - rsgrid%desc%border IF (BTEST(subpatch_pattern, 3)) local_b(2, 2) = local_b(2, 2) + rsgrid%desc%border IF (BTEST(subpatch_pattern, 4)) local_b(1, 3) = local_b(1, 3) - rsgrid%desc%border IF (BTEST(subpatch_pattern, 5)) local_b(2, 3) = local_b(2, 3) + rsgrid%desc%border IF (BTEST(subpatch_pattern, 0)) local_s(1) = local_s(1) - rsgrid%desc%border IF (BTEST(subpatch_pattern, 2)) local_s(2) = local_s(2) - rsgrid%desc%border IF (BTEST(subpatch_pattern, 4)) local_s(3) = local_s(3) - rsgrid%desc%border IF (PRESENT(lgrid)) THEN CALL collocGauss(h=cell%hmat, h_inv=cell%h_inv, & grid=grid, poly=poly_d3, alphai=zetp, posi=rp, max_r2=radius*radius, & periodic=periodic, gdim=ng, local_bounds=local_b, local_shift=local_s, & lgrid=lgrid) ELSE CALL collocGauss(h=cell%hmat, h_inv=cell%h_inv, & grid=grid, poly=poly_d3, alphai=zetp, posi=rp, max_r2=radius*radius, & periodic=periodic, gdim=ng, local_bounds=local_b, local_shift=local_s) END IF ! defaults: local_shift=(/0,0,0/),poly_shift=(/0.0_dp,0.0_dp,0.0_dp/),scale=1.0_dp, END SUBROUTINE ! ************************************************************************************************** !> \brief ... ! ************************************************************************************************** SUBROUTINE collocate_general_wings() INTEGER, DIMENSION(2, 3) :: local_b INTEGER, DIMENSION(3) :: periodic REAL(dp), DIMENSION((lp+1)*(lp+2)*(lp+3)/6) :: poly_d3 REAL(dp), DIMENSION(3) :: local_shift, rShifted periodic = 1 ! cell%perd CALL poly_cp2k2d3(coef_xyz, lp, poly_d3) local_b(1, :) = 0 local_b(2, :) = MIN(rsgrid%desc%npts - 1, rsgrid%ub_local - rsgrid%lb_local) local_shift = REAL(rsgrid%desc%lb - rsgrid%lb_local, dp)/REAL(rsgrid%desc%npts, dp) rShifted(1) = rp(1) + cell%hmat(1, 1)*local_shift(1) & + cell%hmat(1, 2)*local_shift(2) & + cell%hmat(1, 3)*local_shift(3) rShifted(2) = rp(2) + cell%hmat(2, 1)*local_shift(1) & + cell%hmat(2, 2)*local_shift(2) & + cell%hmat(2, 3)*local_shift(3) rShifted(3) = rp(3) + cell%hmat(3, 1)*local_shift(1) & + cell%hmat(3, 2)*local_shift(2) & + cell%hmat(3, 3)*local_shift(3) IF (PRESENT(lgrid)) THEN CALL collocGauss(h=cell%hmat, h_inv=cell%h_inv, & grid=grid, poly=poly_d3, alphai=zetp, posi=rShifted, max_r2=radius*radius, & periodic=periodic, gdim=ng, local_bounds=local_b, & lgrid=lgrid) ELSE CALL collocGauss(h=cell%hmat, h_inv=cell%h_inv, & grid=grid, poly=poly_d3, alphai=zetp, posi=rShifted, max_r2=radius*radius, & periodic=periodic, gdim=ng, local_bounds=local_b) END IF ! defaults: local_shift=(/0,0,0/),poly_shift=(/0.0_dp,0.0_dp,0.0_dp/),scale=1.0_dp, END SUBROUTINE ! ! this is a general 'reference' routine to do the collocation ! ! ************************************************************************************************** !> \brief ... ! ************************************************************************************************** SUBROUTINE collocate_general() INTEGER :: i, index_max(3), index_min(3), & ipoint(3), j, k REAL(KIND=dp) :: inv_ng(3), point(3), primpt ! 
still hard-wired (see MODULO) CPASSERT(ALL(rsgrid%desc%perd == 1)) CALL return_cube_nonortho(cube_info, radius, index_min, index_max, rp) inv_ng = 1.0_dp/ng ! go over the grid, but cycle if the point is not within the radius DO k = index_min(3), index_max(3) DO j = index_min(2), index_max(2) DO i = index_min(1), index_max(1) ! point in real space point = MATMUL(cell%hmat, REAL((/i, j, k/), KIND=dp)*inv_ng) ! primitive_value of point primpt = primitive_value(point) ! skip if outside of the sphere IF (SUM((point - rp)**2) > radius**2) CYCLE ! point on the grid (including pbc) ipoint = MODULO((/i, j, k/), ng) + 1 ! add to grid IF (PRESENT(lgrid)) THEN ig = ipoint(3)*ng(2)*ng(1) + ipoint(2)*ng(1) + ipoint(1) + 1 lgrid%r(ig, ithread_l) = lgrid%r(ig, ithread_l) + primpt ELSE grid(ipoint(1), ipoint(2), ipoint(3)) = grid(ipoint(1), ipoint(2), ipoint(3)) + primpt ENDIF ENDDO ENDDO ENDDO END SUBROUTINE collocate_general ! ************************************************************************************************** !> \brief ... !> \param point ... !> \return ... ! ************************************************************************************************** FUNCTION primitive_value(point) RESULT(res) REAL(KIND=dp) :: point(3), res REAL(KIND=dp) :: dra(3), drap(3), drb(3), drbp(3), myexp, & pdrap res = 0.0_dp myexp = EXP(-zetp*SUM((point - rp)**2))*prefactor dra = point - ra drb = point - rb drap(1) = 1.0_dp DO lxa = 0, la_max_local drbp(1) = 1.0_dp DO lxb = 0, lb_max_local drap(2) = 1.0_dp DO lya = 0, la_max_local - lxa drbp(2) = 1.0_dp DO lyb = 0, lb_max_local - lxb drap(3) = 1.0_dp DO lza = 1, MAX(la_min_local - lxa - lya, 0) drap(3) = drap(3)*dra(3) ENDDO DO lza = MAX(la_min_local - lxa - lya, 0), la_max_local - lxa - lya drbp(3) = 1.0_dp DO lzb = 1, MAX(lb_min_local - lxb - lyb, 0) drbp(3) = drbp(3)*drb(3) ENDDO ico = coset(lxa, lya, lza) pdrap = PRODUCT(drap) DO lzb = MAX(lb_min_local - lxb - lyb, 0), lb_max_local - lxb - lyb jco = coset(lxb, lyb, lzb) res = res + pab_local(ico + o1_local, jco + o2_local)*myexp*pdrap*PRODUCT(drbp) drbp(3) = drbp(3)*drb(3) ENDDO drap(3) = drap(3)*dra(3) ENDDO drbp(2) = drbp(2)*drb(2) ENDDO drap(2) = drap(2)*dra(2) ENDDO drbp(1) = drbp(1)*drb(1) ENDDO drap(1) = drap(1)*dra(1) ENDDO END FUNCTION primitive_value END SUBROUTINE collocate_pgf_product_rspace ! ************************************************************************************************** !> \brief low level collocation of primitive gaussian functions in g-space !> \param la_max ... !> \param zeta ... !> \param la_min ... !> \param lb_max ... !> \param zetb ... !> \param lb_min ... !> \param ra ... !> \param rab ... !> \param rab2 ... !> \param scale ... !> \param pab ... !> \param na ... !> \param nb ... !> \param eps_rho_gspace ... !> \param gsq_max ... !> \param pw ... ! ************************************************************************************************** SUBROUTINE collocate_pgf_product_gspace(la_max, zeta, la_min, & lb_max, zetb, lb_min, & ra, rab, rab2, scale, pab, na, nb, & eps_rho_gspace, gsq_max, pw) ! 
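      ! adds the g-space representation of the (pab-weighted) primitive Gaussian product
      ! directly to the plane wave coefficients pw%cc, using separable 1D recursions
      ! (cubeaxis) along the three reciprocal axes and skipping components with
      ! pw%pw_grid%gsq > gsq_max or negligible pab elements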
NOTE: this routine is much slower than collocate_pgf_product_rspace INTEGER, INTENT(IN) :: la_max REAL(dp), INTENT(IN) :: zeta INTEGER, INTENT(IN) :: la_min, lb_max REAL(dp), INTENT(IN) :: zetb INTEGER, INTENT(IN) :: lb_min REAL(dp), DIMENSION(3), INTENT(IN) :: ra, rab REAL(dp), INTENT(IN) :: rab2, scale REAL(dp), DIMENSION(:, :), POINTER :: pab INTEGER, INTENT(IN) :: na, nb REAL(dp), INTENT(IN) :: eps_rho_gspace, gsq_max TYPE(pw_type), POINTER :: pw CHARACTER(LEN=*), PARAMETER :: routineN = 'collocate_pgf_product_gspace', & routineP = moduleN//':'//routineN COMPLEX(dp), DIMENSION(3) :: phasefactor COMPLEX(dp), DIMENSION(:), POINTER :: rag, rbg COMPLEX(dp), DIMENSION(:, :, :, :), POINTER :: cubeaxis INTEGER :: ax, ay, az, bx, by, bz, handle, i, ico, & ig, ig2, jco, jg, kg, la, lb, & lb_cube_min, lb_grid, ub_cube_max, & ub_grid INTEGER, DIMENSION(3) :: lb_cube, ub_cube REAL(dp) :: f, fa, fb, pij, prefactor, rzetp, & twozetp, zetp REAL(dp), DIMENSION(3) :: dg, expfactor, fap, fbp, rap, rbp, rp REAL(dp), DIMENSION(:), POINTER :: g CALL timeset(routineN, handle) dg(:) = twopi/(pw%pw_grid%npts(:)*pw%pw_grid%dr(:)) zetp = zeta + zetb rzetp = 1.0_dp/zetp f = zetb*rzetp rap(:) = f*rab(:) rbp(:) = rap(:) - rab(:) rp(:) = ra(:) + rap(:) twozetp = 2.0_dp*zetp fap(:) = twozetp*rap(:) fbp(:) = twozetp*rbp(:) prefactor = scale*SQRT((pi*rzetp)**3)*EXP(-zeta*f*rab2) phasefactor(:) = EXP(CMPLX(0.0_dp, -rp(:)*dg(:), KIND=dp)) expfactor(:) = EXP(-0.25*rzetp*dg(:)*dg(:)) lb_cube(:) = pw%pw_grid%bounds(1, :) ub_cube(:) = pw%pw_grid%bounds(2, :) lb_cube_min = MINVAL(lb_cube(:)) ub_cube_max = MAXVAL(ub_cube(:)) NULLIFY (cubeaxis, g, rag, rbg) CALL reallocate(cubeaxis, lb_cube_min, ub_cube_max, 1, 3, 0, la_max, 0, lb_max) CALL reallocate(g, lb_cube_min, ub_cube_max) CALL reallocate(rag, lb_cube_min, ub_cube_max) CALL reallocate(rbg, lb_cube_min, ub_cube_max) lb_grid = LBOUND(pw%cc, 1) ub_grid = UBOUND(pw%cc, 1) DO i = 1, 3 DO ig = lb_cube(i), ub_cube(i) ig2 = ig*ig cubeaxis(ig, i, 0, 0) = expfactor(i)**ig2*phasefactor(i)**ig END DO IF (la_max > 0) THEN DO ig = lb_cube(i), ub_cube(i) g(ig) = REAL(ig, dp)*dg(i) rag(ig) = CMPLX(fap(i), -g(ig), KIND=dp) cubeaxis(ig, i, 1, 0) = rag(ig)*cubeaxis(ig, i, 0, 0) END DO DO la = 2, la_max fa = REAL(la - 1, dp)*twozetp DO ig = lb_cube(i), ub_cube(i) cubeaxis(ig, i, la, 0) = rag(ig)*cubeaxis(ig, i, la - 1, 0) + & fa*cubeaxis(ig, i, la - 2, 0) END DO END DO IF (lb_max > 0) THEN fa = twozetp DO ig = lb_cube(i), ub_cube(i) rbg(ig) = CMPLX(fbp(i), -g(ig), KIND=dp) cubeaxis(ig, i, 0, 1) = rbg(ig)*cubeaxis(ig, i, 0, 0) cubeaxis(ig, i, 1, 1) = rbg(ig)*cubeaxis(ig, i, 1, 0) + & fa*cubeaxis(ig, i, 0, 0) END DO DO lb = 2, lb_max fb = REAL(lb - 1, dp)*twozetp DO ig = lb_cube(i), ub_cube(i) cubeaxis(ig, i, 0, lb) = rbg(ig)*cubeaxis(ig, i, 0, lb - 1) + & fb*cubeaxis(ig, i, 0, lb - 2) cubeaxis(ig, i, 1, lb) = rbg(ig)*cubeaxis(ig, i, 1, lb - 1) + & fb*cubeaxis(ig, i, 1, lb - 2) + & fa*cubeaxis(ig, i, 0, lb - 1) END DO END DO DO la = 2, la_max fa = REAL(la, dp)*twozetp DO ig = lb_cube(i), ub_cube(i) cubeaxis(ig, i, la, 1) = rbg(ig)*cubeaxis(ig, i, la, 0) + & fa*cubeaxis(ig, i, la - 1, 0) END DO DO lb = 2, lb_max fb = REAL(lb - 1, dp)*twozetp DO ig = lb_cube(i), ub_cube(i) cubeaxis(ig, i, la, lb) = rbg(ig)*cubeaxis(ig, i, la, lb - 1) + & fb*cubeaxis(ig, i, la, lb - 2) + & fa*cubeaxis(ig, i, la - 1, lb - 1) END DO END DO END DO END IF ELSE IF (lb_max > 0) THEN DO ig = lb_cube(i), ub_cube(i) g(ig) = REAL(ig, dp)*dg(i) rbg(ig) = CMPLX(fbp(i), -g(ig), KIND=dp) cubeaxis(ig, i, 0, 1) = 
rbg(ig)*cubeaxis(ig, i, 0, 0) END DO DO lb = 2, lb_max fb = REAL(lb - 1, dp)*twozetp DO ig = lb_cube(i), ub_cube(i) cubeaxis(ig, i, 0, lb) = rbg(ig)*cubeaxis(ig, i, 0, lb - 1) + & fb*cubeaxis(ig, i, 0, lb - 2) END DO END DO END IF END IF END DO DO la = 0, la_max DO lb = 0, lb_max IF (la + lb == 0) CYCLE fa = (1.0_dp/twozetp)**(la + lb) DO i = 1, 3 DO ig = lb_cube(i), ub_cube(i) cubeaxis(ig, i, la, lb) = fa*cubeaxis(ig, i, la, lb) END DO END DO END DO END DO ! Add the current primitive Gaussian function product to grid DO ico = ncoset(la_min - 1) + 1, ncoset(la_max) ax = indco(1, ico) ay = indco(2, ico) az = indco(3, ico) DO jco = ncoset(lb_min - 1) + 1, ncoset(lb_max) pij = prefactor*pab(na + ico, nb + jco) IF (ABS(pij) < eps_rho_gspace) CYCLE bx = indco(1, jco) by = indco(2, jco) bz = indco(3, jco) DO i = lb_grid, ub_grid IF (pw%pw_grid%gsq(i) > gsq_max) CYCLE ig = pw%pw_grid%g_hat(1, i) jg = pw%pw_grid%g_hat(2, i) kg = pw%pw_grid%g_hat(3, i) pw%cc(i) = pw%cc(i) + pij*cubeaxis(ig, 1, ax, bx)* & cubeaxis(jg, 2, ay, by)* & cubeaxis(kg, 3, az, bz) END DO END DO END DO DEALLOCATE (cubeaxis) DEALLOCATE (g) DEALLOCATE (rag) DEALLOCATE (rbg) CALL timestop(handle) END SUBROUTINE collocate_pgf_product_gspace END MODULE qs_collocate_density