1!--------------------------------------------------------------------------------------------------!
2!   CP2K: A general program to perform molecular dynamics simulations                              !
3!   Copyright (C) 2000 - 2019  CP2K developers group                                               !
4!--------------------------------------------------------------------------------------------------!
5
6! **************************************************************************************************
7!> \brief generate the tasks lists used by collocate and integrate routines
8!> \par History
9!>      01.2008 [Joost VandeVondele] refactered out of qs_collocate / qs_integrate
10!> \author Joost VandeVondele
11! **************************************************************************************************
12MODULE task_list_methods
13   USE ao_util,                         ONLY: exp_radius_very_extended
14   USE basis_set_types,                 ONLY: get_gto_basis_set,&
15                                              gto_basis_set_p_type,&
16                                              gto_basis_set_type
17   USE cell_types,                      ONLY: cell_type,&
18                                              pbc
19   USE cp_control_types,                ONLY: dft_control_type
20   USE cube_utils,                      ONLY: compute_cube_center,&
21                                              cube_info_type,&
22                                              return_cube,&
23                                              return_cube_nonortho
24   USE dbcsr_api,                       ONLY: dbcsr_convert_sizes_to_offsets,&
25                                              dbcsr_finalize,&
26                                              dbcsr_get_block_p,&
27                                              dbcsr_get_info,&
28                                              dbcsr_p_type,&
29                                              dbcsr_put_block,&
30                                              dbcsr_type,&
31                                              dbcsr_work_create
32   USE gaussian_gridlevels,             ONLY: gaussian_gridlevel,&
33                                              gridlevel_info_type
34   USE kinds,                           ONLY: default_string_length,&
35                                              dp,&
36                                              int_8
37   USE kpoint_types,                    ONLY: get_kpoint_info,&
38                                              kpoint_type
39   USE memory_utilities,                ONLY: reallocate
40   USE message_passing,                 ONLY: mp_allgather,&
41                                              mp_alltoall,&
42                                              mp_bcast,&
43                                              mp_sum,&
44                                              mp_sum_partial
45   USE particle_types,                  ONLY: particle_type
46   USE pw_env_types,                    ONLY: pw_env_get,&
47                                              pw_env_type
48   USE qs_kind_types,                   ONLY: get_qs_kind,&
49                                              qs_kind_type
50   USE qs_ks_types,                     ONLY: get_ks_env,&
51                                              qs_ks_env_type
52   USE qs_neighbor_list_types,          ONLY: get_iterator_info,&
53                                              neighbor_list_iterate,&
54                                              neighbor_list_iterator_create,&
55                                              neighbor_list_iterator_p_type,&
56                                              neighbor_list_iterator_release,&
57                                              neighbor_list_set_p_type
58   USE realspace_grid_types,            ONLY: realspace_grid_desc_p_type,&
59                                              realspace_grid_desc_type,&
60                                              realspace_grid_p_type,&
61                                              rs_grid_create,&
62                                              rs_grid_locate_rank,&
63                                              rs_grid_release,&
64                                              rs_grid_reorder_ranks
65   USE task_list_types,                 ONLY: task_list_type
66   USE util,                            ONLY: sort
67
68!$ USE OMP_LIB, ONLY: omp_destroy_lock, omp_get_num_threads, omp_init_lock, &
69!$                    omp_lock_kind, omp_set_lock, omp_unset_lock
70#include "./base/base_uses.f90"
71
72   IMPLICIT NONE
73
74   LOGICAL, PRIVATE, PARAMETER :: debug_this_module = .FALSE.
75
76   PRIVATE
77
78   CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'task_list_methods'
79
80   PUBLIC :: generate_qs_task_list, &
81             task_list_inner_loop
82   PUBLIC :: distribute_tasks, &
83             int2pair, &
84             rs_distribute_matrix
85
86CONTAINS
87
88! **************************************************************************************************
89!> \brief ...
90!> \param ks_env ...
91!> \param task_list ...
92!> \param reorder_rs_grid_ranks Flag that indicates if this routine should
93!>        or should not overwrite the rs descriptor (see comment below)
94!> \param skip_load_balance_distributed ...
95!> \param soft_valid ...
96!> \param basis_type ...
97!> \param pw_env_external ...
98!> \param sab_orb_external ...
99!> \par History
100!>      01.2008 factored out of calculate_rho_elec [Joost VandeVondele]
101!>      04.2010 divides tasks into grid levels and atom pairs for integrate/collocate [Iain Bethune]
102!>              (c) The Numerical Algorithms Group (NAG) Ltd, 2010 on behalf of the HECToR project
103!>      06.2015 adjusted to be used with multiple images (k-points) [JGH]
104!> \note  If this routine is called several times with different task lists,
105!>        the default behaviour is to re-optimize the grid ranks and overwrite
106!>        the rs descriptor and grids. reorder_rs_grid_ranks = .FALSE. prevents the code
107!>        of performing a new optimization by leaving the rank order in
108!>        its current state.
109! **************************************************************************************************
110   SUBROUTINE generate_qs_task_list(ks_env, task_list, &
111                                    reorder_rs_grid_ranks, skip_load_balance_distributed, &
112                                    soft_valid, basis_type, pw_env_external, sab_orb_external)
113
114      TYPE(qs_ks_env_type), POINTER                      :: ks_env
115      TYPE(task_list_type), POINTER                      :: task_list
116      LOGICAL, INTENT(IN)                                :: reorder_rs_grid_ranks, &
117                                                            skip_load_balance_distributed
118      LOGICAL, INTENT(IN), OPTIONAL                      :: soft_valid
119      CHARACTER(LEN=*), INTENT(IN), OPTIONAL             :: basis_type
120      TYPE(pw_env_type), OPTIONAL, POINTER               :: pw_env_external
121      TYPE(neighbor_list_set_p_type), DIMENSION(:), &
122         OPTIONAL, POINTER                               :: sab_orb_external
123
124      CHARACTER(LEN=*), PARAMETER :: routineN = 'generate_qs_task_list', &
125         routineP = moduleN//':'//routineN
126      INTEGER, PARAMETER                                 :: max_tasks = 2000
127
128      CHARACTER(LEN=default_string_length)               :: my_basis_type
129      INTEGER :: cindex, curr_tasks, handle, i, iatom, iatom_old, igrid_level, igrid_level_old, &
130         ikind, ilevel, img, img_old, inode, ipair, ipgf, iset, itask, jatom, jatom_old, jkind, &
131         jpgf, jset, maxpgf, maxset, natoms, nimages, nkind, nseta, nsetb
132      INTEGER(kind=int_8), DIMENSION(:, :), POINTER      :: tasks
133      INTEGER, ALLOCATABLE, DIMENSION(:, :, :)           :: blocks
134      INTEGER, DIMENSION(3)                              :: cellind
135      INTEGER, DIMENSION(:), POINTER                     :: la_max, la_min, lb_max, lb_min, npgfa, &
136                                                            npgfb
137      INTEGER, DIMENSION(:, :, :), POINTER               :: cell_to_index
138      LOGICAL                                            :: dokp, my_soft
139      REAL(KIND=dp)                                      :: kind_radius_a, kind_radius_b
140      REAL(KIND=dp), DIMENSION(3)                        :: ra, rab
141      REAL(KIND=dp), DIMENSION(:), POINTER               :: set_radius_a, set_radius_b
142      REAL(KIND=dp), DIMENSION(:, :), POINTER            :: rpgfa, rpgfb, zeta, zetb
143      TYPE(cell_type), POINTER                           :: cell
144      TYPE(cube_info_type), DIMENSION(:), POINTER        :: cube_info
145      TYPE(dft_control_type), POINTER                    :: dft_control
146      TYPE(gridlevel_info_type), POINTER                 :: gridlevel_info
147      TYPE(gto_basis_set_p_type), DIMENSION(:), POINTER  :: basis_set_list
148      TYPE(gto_basis_set_type), POINTER                  :: basis_set_a, basis_set_b, orb_basis_set
149      TYPE(kpoint_type), POINTER                         :: kpoints
150      TYPE(neighbor_list_iterator_p_type), &
151         DIMENSION(:), POINTER                           :: nl_iterator
152      TYPE(neighbor_list_set_p_type), DIMENSION(:), &
153         POINTER                                         :: sab_orb
154      TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
155      TYPE(pw_env_type), POINTER                         :: pw_env
156      TYPE(qs_kind_type), DIMENSION(:), POINTER          :: qs_kind_set
157      TYPE(qs_kind_type), POINTER                        :: qs_kind
158      TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
159         POINTER                                         :: rs_descs
160      TYPE(realspace_grid_p_type), DIMENSION(:), POINTER :: rs_grids
161
162      CALL timeset(routineN, handle)
163
164      ! by default, the full density is calculated
165      my_soft = .FALSE.
166      IF (PRESENT(soft_valid)) my_soft = soft_valid
167
168      CALL get_ks_env(ks_env, &
169                      qs_kind_set=qs_kind_set, &
170                      cell=cell, &
171                      particle_set=particle_set, &
172                      dft_control=dft_control)
173
174      IF (PRESENT(basis_type)) THEN
175         my_basis_type = basis_type
176      ELSE
177         my_basis_type = "ORB"
178      END IF
179
180      CALL get_ks_env(ks_env, sab_orb=sab_orb)
181      IF (PRESENT(sab_orb_external)) sab_orb => sab_orb_external
182
183      CALL get_ks_env(ks_env, pw_env=pw_env)
184      IF (PRESENT(pw_env_external)) pw_env => pw_env_external
185      CALL pw_env_get(pw_env, rs_descs=rs_descs, rs_grids=rs_grids)
186
187      ! *** assign from pw_env
188      gridlevel_info => pw_env%gridlevel_info
189      cube_info => pw_env%cube_info
190
191      ! find maximum numbers
192      nkind = SIZE(qs_kind_set)
193      natoms = SIZE(particle_set)
194      maxset = 0
195      maxpgf = 0
196      DO ikind = 1, nkind
197         qs_kind => qs_kind_set(ikind)
198         CALL get_qs_kind(qs_kind=qs_kind, &
199                          softb=my_soft, &
200                          basis_set=orb_basis_set, basis_type=my_basis_type)
201
202         IF (.NOT. ASSOCIATED(orb_basis_set)) CYCLE
203         CALL get_gto_basis_set(gto_basis_set=orb_basis_set, npgf=npgfa, nset=nseta)
204
205         maxset = MAX(nseta, maxset)
206         maxpgf = MAX(MAXVAL(npgfa), maxpgf)
207      END DO
208
209      ! kpoint related
210      nimages = dft_control%nimages
211      IF (nimages > 1) THEN
212         dokp = .TRUE.
213         NULLIFY (kpoints)
214         CALL get_ks_env(ks_env=ks_env, kpoints=kpoints)
215         CALL get_kpoint_info(kpoint=kpoints, cell_to_index=cell_to_index)
216      ELSE
217         dokp = .FALSE.
218         NULLIFY (cell_to_index)
219      END IF
220
221      ! free the atom_pair lists if allocated
222      IF (ASSOCIATED(task_list%atom_pair_send)) DEALLOCATE (task_list%atom_pair_send)
223      IF (ASSOCIATED(task_list%atom_pair_recv)) DEALLOCATE (task_list%atom_pair_recv)
224
225      ! construct a list of all tasks
226      IF (.NOT. ASSOCIATED(task_list%tasks)) THEN
227         CALL reallocate(task_list%tasks, 1, 6, 1, max_tasks)
228         CALL reallocate(task_list%dist_ab, 1, 3, 1, max_tasks)
229      ENDIF
230      task_list%ntasks = 0
231      curr_tasks = SIZE(task_list%tasks, 2)
232
233      ALLOCATE (basis_set_list(nkind))
234      DO ikind = 1, nkind
235         qs_kind => qs_kind_set(ikind)
236         CALL get_qs_kind(qs_kind=qs_kind, softb=my_soft, basis_set=basis_set_a, &
237                          basis_type=my_basis_type)
238         IF (ASSOCIATED(basis_set_a)) THEN
239            basis_set_list(ikind)%gto_basis_set => basis_set_a
240         ELSE
241            NULLIFY (basis_set_list(ikind)%gto_basis_set)
242         END IF
243      END DO
244      CALL neighbor_list_iterator_create(nl_iterator, sab_orb)
245      DO WHILE (neighbor_list_iterate(nl_iterator) == 0)
246         CALL get_iterator_info(nl_iterator, ikind=ikind, jkind=jkind, inode=inode, &
247                                iatom=iatom, jatom=jatom, r=rab, cell=cellind)
248         basis_set_a => basis_set_list(ikind)%gto_basis_set
249         IF (.NOT. ASSOCIATED(basis_set_a)) CYCLE
250         basis_set_b => basis_set_list(jkind)%gto_basis_set
251         IF (.NOT. ASSOCIATED(basis_set_b)) CYCLE
252         ra(:) = pbc(particle_set(iatom)%r, cell)
253         ! basis ikind
254         la_max => basis_set_a%lmax
255         la_min => basis_set_a%lmin
256         npgfa => basis_set_a%npgf
257         nseta = basis_set_a%nset
258         rpgfa => basis_set_a%pgf_radius
259         set_radius_a => basis_set_a%set_radius
260         kind_radius_a = basis_set_a%kind_radius
261         zeta => basis_set_a%zet
262         ! basis jkind
263         lb_max => basis_set_b%lmax
264         lb_min => basis_set_b%lmin
265         npgfb => basis_set_b%npgf
266         nsetb = basis_set_b%nset
267         rpgfb => basis_set_b%pgf_radius
268         set_radius_b => basis_set_b%set_radius
269         kind_radius_b = basis_set_b%kind_radius
270         zetb => basis_set_b%zet
271
272         IF (dokp) THEN
273            cindex = cell_to_index(cellind(1), cellind(2), cellind(3))
274         ELSE
275            cindex = 1
276         END IF
277
278         CALL task_list_inner_loop(task_list%tasks, task_list%dist_ab, task_list%ntasks, curr_tasks, &
279                                   rs_descs, dft_control, cube_info, gridlevel_info, cindex, &
280                                   iatom, jatom, rpgfa, rpgfb, zeta, zetb, kind_radius_b, set_radius_a, set_radius_b, ra, rab, &
281                                   la_max, la_min, lb_max, lb_min, npgfa, npgfb, maxpgf, maxset, natoms, nimages, nseta, nsetb)
282
283      END DO
284      CALL neighbor_list_iterator_release(nl_iterator)
285
286      DEALLOCATE (basis_set_list)
287
288      ! redistribute the task list so that all tasks map on the local rs grids
289      CALL distribute_tasks( &
290         rs_descs, task_list%ntasks, natoms, maxset, maxpgf, nimages, &
291         task_list%tasks, rval=task_list%dist_ab, atom_pair_send=task_list%atom_pair_send, &
292         atom_pair_recv=task_list%atom_pair_recv, symmetric=.TRUE., &
293         reorder_rs_grid_ranks=reorder_rs_grid_ranks, skip_load_balance_distributed=skip_load_balance_distributed)
294
295      ! If the rank order has changed, reallocate any of the distributed rs_grids
296
297      IF (reorder_rs_grid_ranks) THEN
298         DO i = 1, gridlevel_info%ngrid_levels
299            IF (rs_descs(i)%rs_desc%distributed) THEN
300               CALL rs_grid_release(rs_grids(i)%rs_grid)
301               NULLIFY (rs_grids(i)%rs_grid)
302               CALL rs_grid_create(rs_grids(i)%rs_grid, rs_descs(i)%rs_desc)
303            END IF
304         END DO
305      END IF
306
307      ! Now we have the final list of tasks, setup the task_list with the
308      ! data needed for the loops in integrate_v/calculate_rho
309
310      IF (ASSOCIATED(task_list%taskstart)) THEN
311         DEALLOCATE (task_list%taskstart)
312      END IF
313      IF (ASSOCIATED(task_list%taskstop)) THEN
314         DEALLOCATE (task_list%taskstop)
315      END IF
316      IF (ASSOCIATED(task_list%npairs)) THEN
317         DEALLOCATE (task_list%npairs)
318      END IF
319
320      ! First, count the number of unique atom pairs per grid level
321
322      ALLOCATE (task_list%npairs(SIZE(rs_descs)))
323
324      iatom_old = -1; jatom_old = -1; igrid_level_old = -1; img_old = -1
325      ipair = 0
326      task_list%npairs = 0
327
328      DO i = 1, task_list%ntasks
329         CALL int2pair(task_list%tasks(3, i), igrid_level, img, iatom, jatom, iset, jset, ipgf, jpgf, &
330                       nimages, natoms, maxset, maxpgf)
331         IF (igrid_level .NE. igrid_level_old) THEN
332            IF (igrid_level_old .NE. -1) THEN
333               task_list%npairs(igrid_level_old) = ipair
334            END IF
335            ipair = 1
336            igrid_level_old = igrid_level
337            iatom_old = iatom
338            jatom_old = jatom
339            img_old = img
340         ELSE IF (iatom .NE. iatom_old .OR. jatom .NE. jatom_old .OR. img .NE. img_old) THEN
341            ipair = ipair + 1
342            iatom_old = iatom
343            jatom_old = jatom
344            img_old = img
345         END IF
346      END DO
347      ! Take care of the last iteration
348      IF (task_list%ntasks /= 0) THEN
349         task_list%npairs(igrid_level) = ipair
350      END IF
351
352      ! Second, for each atom pair, find the indices in the task list
353      ! of the first and last task
354
355      ! Array sized for worst case
356      ALLOCATE (task_list%taskstart(MAXVAL(task_list%npairs), SIZE(rs_descs)))
357      ALLOCATE (task_list%taskstop(MAXVAL(task_list%npairs), SIZE(rs_descs)))
358
359      iatom_old = -1; jatom_old = -1; igrid_level_old = -1; img_old = -1
360      ipair = 0
361      task_list%taskstart = 0
362      task_list%taskstop = 0
363
364      DO i = 1, task_list%ntasks
365         CALL int2pair(task_list%tasks(3, i), igrid_level, img, iatom, jatom, iset, jset, ipgf, jpgf, &
366                       nimages, natoms, maxset, maxpgf)
367         IF (igrid_level .NE. igrid_level_old) THEN
368            IF (igrid_level_old .NE. -1) THEN
369               task_list%taskstop(ipair, igrid_level_old) = i - 1
370            END IF
371            ipair = 1
372            task_list%taskstart(ipair, igrid_level) = i
373            igrid_level_old = igrid_level
374            iatom_old = iatom
375            jatom_old = jatom
376            img_old = img
377         ELSE IF (iatom .NE. iatom_old .OR. jatom .NE. jatom_old .OR. img .NE. img_old) THEN
378            ipair = ipair + 1
379            task_list%taskstart(ipair, igrid_level) = i
380            task_list%taskstop(ipair - 1, igrid_level) = i - 1
381            iatom_old = iatom
382            jatom_old = jatom
383            img_old = img
384         END IF
385      END DO
386      ! Take care of the last iteration
387      IF (task_list%ntasks /= 0) THEN
388         task_list%taskstop(ipair, igrid_level) = task_list%ntasks
389      END IF
390
391      ! Debug task destribution
392      IF (debug_this_module) THEN
393         tasks => task_list%tasks
394         WRITE (6, *)
395         WRITE (6, *) "Total number of tasks              ", task_list%ntasks
396         DO igrid_level = 1, gridlevel_info%ngrid_levels
397            WRITE (6, *) "Total number of pairs(grid_level)  ", igrid_level, task_list%npairs(igrid_level)
398         END DO
399         WRITE (6, *)
400
401         DO igrid_level = 1, gridlevel_info%ngrid_levels
402
403            ALLOCATE (blocks(natoms, natoms, nimages))
404            blocks = -1
405            DO ipair = 1, task_list%npairs(igrid_level)
406               itask = task_list%taskstart(ipair, igrid_level)
407               CALL int2pair(tasks(3, itask), ilevel, img, iatom, jatom, iset, jset, ipgf, jpgf, &
408                             nimages, natoms, maxset, maxpgf)
409               IF (blocks(iatom, jatom, img) == -1 .AND. blocks(jatom, iatom, img) == -1) THEN
410                  blocks(iatom, jatom, img) = 1
411                  blocks(jatom, iatom, img) = 1
412               ELSE
413                  WRITE (6, *) "TASK LIST CONFLICT IN PAIR       ", ipair
414                  WRITE (6, *) "Reuse of iatom, jatom, image     ", iatom, jatom, img
415               END IF
416
417               iatom_old = iatom
418               jatom_old = jatom
419               img_old = img
420               DO itask = task_list%taskstart(ipair, igrid_level), task_list%taskstop(ipair, igrid_level)
421
422                  CALL int2pair(tasks(3, itask), ilevel, img, iatom, jatom, iset, jset, ipgf, jpgf, &
423                                nimages, natoms, maxset, maxpgf)
424                  IF (iatom /= iatom_old .OR. jatom /= jatom_old .OR. img /= img_old) THEN
425                     WRITE (6, *) "TASK LIST CONFLICT IN TASK       ", itask
426                     WRITE (6, *) "Inconsistent iatom, jatom, image ", iatom, jatom, img
427                     WRITE (6, *) "Should be    iatom, jatom, image ", iatom_old, jatom_old, img_old
428                  END IF
429
430               END DO
431            END DO
432            DEALLOCATE (blocks)
433
434         END DO
435
436      END IF
437
438      CALL timestop(handle)
439
440   END SUBROUTINE generate_qs_task_list
441
442! **************************************************************************************************
443!> \brief ...
444!> \param tasks ...
445!> \param dist_ab ...
446!> \param ntasks ...
447!> \param curr_tasks ...
448!> \param rs_descs ...
449!> \param dft_control ...
450!> \param cube_info ...
451!> \param gridlevel_info ...
452!> \param cindex ...
453!> \param iatom ...
454!> \param jatom ...
455!> \param rpgfa ...
456!> \param rpgfb ...
457!> \param zeta ...
458!> \param zetb ...
459!> \param kind_radius_b ...
460!> \param set_radius_a ...
461!> \param set_radius_b ...
462!> \param ra ...
463!> \param rab ...
464!> \param la_max ...
465!> \param la_min ...
466!> \param lb_max ...
467!> \param lb_min ...
468!> \param npgfa ...
469!> \param npgfb ...
470!> \param maxpgf ...
471!> \param maxset ...
472!> \param natoms ...
473!> \param nimages ...
474!> \param nseta ...
475!> \param nsetb ...
476!> \par History
477!>      Joost VandeVondele: 10.2008 refactored
478! **************************************************************************************************
479   SUBROUTINE task_list_inner_loop(tasks, dist_ab, ntasks, curr_tasks, rs_descs, dft_control, &
480                                   cube_info, gridlevel_info, cindex, &
481                                   iatom, jatom, rpgfa, rpgfb, zeta, zetb, kind_radius_b, set_radius_a, set_radius_b, ra, rab, &
482                                   la_max, la_min, lb_max, lb_min, npgfa, npgfb, maxpgf, maxset, natoms, nimages, nseta, nsetb)
483
484      INTEGER(KIND=int_8), DIMENSION(:, :), POINTER      :: tasks
485      REAL(KIND=dp), DIMENSION(:, :), POINTER            :: dist_ab
486      INTEGER                                            :: ntasks, curr_tasks
487      TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
488         POINTER                                         :: rs_descs
489      TYPE(dft_control_type), POINTER                    :: dft_control
490      TYPE(cube_info_type), DIMENSION(:), POINTER        :: cube_info
491      TYPE(gridlevel_info_type), POINTER                 :: gridlevel_info
492      INTEGER                                            :: cindex, iatom, jatom
493      REAL(KIND=dp), DIMENSION(:, :), POINTER            :: rpgfa, rpgfb, zeta, zetb
494      REAL(KIND=dp)                                      :: kind_radius_b
495      REAL(KIND=dp), DIMENSION(:), POINTER               :: set_radius_a, set_radius_b
496      REAL(KIND=dp), DIMENSION(3)                        :: ra, rab
497      INTEGER, DIMENSION(:), POINTER                     :: la_max, la_min, lb_max, lb_min, npgfa, &
498                                                            npgfb
499      INTEGER                                            :: maxpgf, maxset, natoms, nimages, nseta, &
500                                                            nsetb
501
502      CHARACTER(LEN=*), PARAMETER :: routineN = 'task_list_inner_loop', &
503         routineP = moduleN//':'//routineN
504
505      INTEGER                                            :: cube_center(3), igrid_level, ipgf, iset, &
506                                                            jpgf, jset, lb_cube(3), ub_cube(3)
507      REAL(KIND=dp)                                      :: dab, rab2, radius, zetp
508
509      rab2 = rab(1)*rab(1) + rab(2)*rab(2) + rab(3)*rab(3)
510      dab = SQRT(rab2)
511
512      loop_iset: DO iset = 1, nseta
513
514         IF (set_radius_a(iset) + kind_radius_b < dab) CYCLE
515
516         loop_jset: DO jset = 1, nsetb
517
518            IF (set_radius_a(iset) + set_radius_b(jset) < dab) CYCLE
519
520            loop_ipgf: DO ipgf = 1, npgfa(iset)
521
522               IF (rpgfa(ipgf, iset) + set_radius_b(jset) < dab) CYCLE
523
524               loop_jpgf: DO jpgf = 1, npgfb(jset)
525
526                  IF (rpgfa(ipgf, iset) + rpgfb(jpgf, jset) < dab) CYCLE
527
528                  zetp = zeta(ipgf, iset) + zetb(jpgf, jset)
529                  igrid_level = gaussian_gridlevel(gridlevel_info, zetp)
530
531                  CALL compute_pgf_properties(cube_center, lb_cube, ub_cube, radius, &
532                                              rs_descs(igrid_level)%rs_desc, cube_info(igrid_level), &
533                                              la_max(iset), zeta(ipgf, iset), la_min(iset), &
534                                              lb_max(jset), zetb(jpgf, jset), lb_min(jset), &
535                                              ra, rab, rab2, dft_control%qs_control%eps_rho_rspace)
536
537                  CALL pgf_to_tasks(tasks, dist_ab, ntasks, curr_tasks, &
538                                    rab, cindex, iatom, jatom, iset, jset, ipgf, jpgf, nimages, natoms, maxset, maxpgf, &
539                                    la_max(iset), lb_max(jset), rs_descs(igrid_level)%rs_desc, &
540                                    igrid_level, gridlevel_info%ngrid_levels, cube_center, &
541                                    lb_cube, ub_cube)
542
543               END DO loop_jpgf
544
545            END DO loop_ipgf
546
547         END DO loop_jset
548
549      END DO loop_iset
550
551   END SUBROUTINE task_list_inner_loop
552
553! **************************************************************************************************
554!> \brief combines the calculation of several basic properties of a given pgf:
555!>  its center, the bounding cube, the radius, the cost,
556!>  tries to predict the time needed for processing this task
557!>      in this way an improved load balance might be obtained
558!> \param cube_center ...
559!> \param lb_cube ...
560!> \param ub_cube ...
561!> \param radius ...
562!> \param rs_desc ...
563!> \param cube_info ...
564!> \param la_max ...
565!> \param zeta ...
566!> \param la_min ...
567!> \param lb_max ...
568!> \param zetb ...
569!> \param lb_min ...
570!> \param ra ...
571!> \param rab ...
572!> \param rab2 ...
573!> \param eps ...
574!> \par History
575!>      10.2008 refactored [Joost VandeVondele]
576!> \note
577!>      -) this requires the radius to be computed in the same way as
578!>      collocate_pgf_product_rspace, we should factor that part into a subroutine
579!>      -) we're assuming that integrate_pgf and collocate_pgf are the same cost for load balancing
580!>         this is more or less true for map_consistent
581!>      -) in principle, the computed radius could be recycled in integrate_pgf/collocate_pgf if it is certainly
582!>         the same, this could lead to a small speedup
583!>      -) the cost function is a fit through the median cost of mapping a pgf with a given l and a given radius (in grid points)
584!>         fitting the measured data on an opteron/g95 using the expression
585!>         a*(l+b)(r+c)**3+d which is based on the innerloop of the collocating routines
586! **************************************************************************************************
587   SUBROUTINE compute_pgf_properties(cube_center, lb_cube, ub_cube, radius, &
588                                     rs_desc, cube_info, la_max, zeta, la_min, lb_max, zetb, lb_min, ra, rab, rab2, eps)
589
590      INTEGER, DIMENSION(3), INTENT(OUT)                 :: cube_center, lb_cube, ub_cube
591      REAL(KIND=dp), INTENT(OUT)                         :: radius
592      TYPE(realspace_grid_desc_type), POINTER            :: rs_desc
593      TYPE(cube_info_type), INTENT(IN)                   :: cube_info
594      INTEGER, INTENT(IN)                                :: la_max
595      REAL(KIND=dp), INTENT(IN)                          :: zeta
596      INTEGER, INTENT(IN)                                :: la_min, lb_max
597      REAL(KIND=dp), INTENT(IN)                          :: zetb
598      INTEGER, INTENT(IN)                                :: lb_min
599      REAL(KIND=dp), INTENT(IN)                          :: ra(3), rab(3), rab2, eps
600
601      CHARACTER(LEN=*), PARAMETER :: routineN = 'compute_pgf_properties', &
602         routineP = moduleN//':'//routineN
603
604      INTEGER                                            :: extent(3)
605      INTEGER, DIMENSION(:), POINTER                     :: sphere_bounds
606      REAL(KIND=dp)                                      :: cutoff, f, prefactor, rb(3), zetp
607      REAL(KIND=dp), DIMENSION(3)                        :: rp
608
609! the radius for this task
610
611      zetp = zeta + zetb
612      rp(:) = ra(:) + zetb/zetp*rab(:)
613      rb(:) = ra(:) + rab(:)
614      cutoff = 1.0_dp
615      f = zetb/zetp
616      prefactor = EXP(-zeta*f*rab2)
617      radius = exp_radius_very_extended(la_min, la_max, lb_min, lb_max, ra=ra, rb=rb, rp=rp, &
618                                        zetp=zetp, eps=eps, prefactor=prefactor, cutoff=cutoff)
619
620      CALL compute_cube_center(cube_center, rs_desc, zeta, zetb, ra, rab)
621      ! compute cube_center, the center of the gaussian product to map (folded to within the unit cell)
622      cube_center(:) = MODULO(cube_center(:), rs_desc%npts(:))
623      cube_center(:) = cube_center(:) + rs_desc%lb(:)
624
625      IF (rs_desc%orthorhombic) THEN
626         CALL return_cube(cube_info, radius, lb_cube, ub_cube, sphere_bounds)
627      ELSE
628         CALL return_cube_nonortho(cube_info, radius, lb_cube, ub_cube, rp)
629         !? unclear if extent is computed correctly.
630         extent(:) = ub_cube(:) - lb_cube(:)
631         lb_cube(:) = -extent(:)/2 - 1
632         ub_cube(:) = extent(:)/2
633      ENDIF
634
635   END SUBROUTINE compute_pgf_properties
636! **************************************************************************************************
637!> \brief predicts the cost of a task in kcycles for a given task
638!>        the model is based on a fit of actual data, and might need updating
639!>        as collocate_pgf_product_rspace changes (or CPUs/compilers change)
640!>        maybe some dynamic approach, improving the cost model on the fly could
641!>        work as well
642!>        the cost model does not yet take into account the fraction of space
643!>        that is mapped locally for a given cube and rs_grid (generalised tasks)
644!> \param lb_cube ...
645!> \param ub_cube ...
646!> \param fraction ...
647!> \param lmax ...
648!> \param is_ortho ...
649!> \return ...
650! **************************************************************************************************
651   INTEGER FUNCTION cost_model(lb_cube, ub_cube, fraction, lmax, is_ortho)
652      INTEGER, DIMENSION(3), INTENT(IN)                  :: lb_cube, ub_cube
653      REAL(KIND=dp), INTENT(IN)                          :: fraction
654      INTEGER                                            :: lmax
655      LOGICAL                                            :: is_ortho
656
657      INTEGER                                            :: cmax
658      REAL(KIND=dp)                                      :: v1, v2, v3, v4, v5
659
660      cmax = MAXVAL(((ub_cube - lb_cube) + 1)/2)
661
662      IF (is_ortho) THEN
663         v1 = 1.504760E+00_dp
664         v2 = 3.126770E+00_dp
665         v3 = 5.074106E+00_dp
666         v4 = 1.091568E+00_dp
667         v5 = 1.070187E+00_dp
668      ELSE
669         v1 = 7.831105E+00_dp
670         v2 = 2.675174E+00_dp
671         v3 = 7.546553E+00_dp
672         v4 = 6.122446E-01_dp
673         v5 = 3.886382E+00_dp
674      ENDIF
675      cost_model = CEILING(((lmax + v1)*(cmax + v2)**3*v3*fraction + v4 + v5*lmax**7)/1000.0_dp)
676
677   END FUNCTION cost_model
678! **************************************************************************************************
679!> \brief pgf_to_tasks converts a given pgf to one or more tasks, in particular
680!>        this determines by which CPUs a given pgf gets collocated
681!>        the format of the task array is as follows
682!>        tasks(1,i) := destination
683!>        tasks(2,i) := source
684!>        tasks(3,i) := compressed type (iatom, jatom, ....)
685!>        tasks(4,i) := type (0: replicated, 1: distributed local, 2: distributed generalised)
686!>        tasks(5,i) := cost
687!>        tasks(6,i) := alternate destination code (0 if none available)
688!>
689!> \param tasks ...
690!> \param dist_ab ...
691!> \param ntasks ...
692!> \param curr_tasks ...
693!> \param rab ...
694!> \param cindex ...
695!> \param iatom ...
696!> \param jatom ...
697!> \param iset ...
698!> \param jset ...
699!> \param ipgf ...
700!> \param jpgf ...
701!> \param nimages ...
702!> \param natoms ...
703!> \param maxset ...
704!> \param maxpgf ...
705!> \param la_max ...
706!> \param lb_max ...
707!> \param rs_desc ...
708!> \param igrid_level ...
709!> \param n_levels ...
710!> \param cube_center ...
711!> \param lb_cube ...
712!> \param ub_cube ...
713!> \par History
714!>      10.2008 Refactored based on earlier routines by MattW [Joost VandeVondele]
715! **************************************************************************************************
716   SUBROUTINE pgf_to_tasks(tasks, dist_ab, ntasks, curr_tasks, &
717                           rab, cindex, iatom, jatom, iset, jset, ipgf, jpgf, nimages, natoms, maxset, maxpgf, &
718                           la_max, lb_max, rs_desc, igrid_level, n_levels, cube_center, lb_cube, ub_cube)
719
720      INTEGER(KIND=int_8), DIMENSION(:, :), POINTER      :: tasks
721      REAL(KIND=dp), DIMENSION(:, :), POINTER            :: dist_ab
722      INTEGER, INTENT(INOUT)                             :: ntasks, curr_tasks
723      REAL(KIND=dp), DIMENSION(3), INTENT(IN)            :: rab
724      INTEGER, INTENT(IN)                                :: cindex, iatom, jatom, iset, jset, ipgf, &
725                                                            jpgf, nimages, natoms, maxset, maxpgf, &
726                                                            la_max, lb_max
727      TYPE(realspace_grid_desc_type), POINTER            :: rs_desc
728      INTEGER, INTENT(IN)                                :: igrid_level, n_levels
729      INTEGER, DIMENSION(3), INTENT(IN)                  :: cube_center, lb_cube, ub_cube
730
731      INTEGER, PARAMETER                                 :: add_tasks = 1000
732      REAL(kind=dp), PARAMETER                           :: mult_tasks = 2.0_dp
733
734      INTEGER                                            :: added_tasks, cost, j, lmax
735      LOGICAL                                            :: is_ortho
736      REAL(KIND=dp)                                      :: tfraction
737
738      ntasks = ntasks + 1
739      IF (ntasks > curr_tasks) THEN
740         curr_tasks = INT((curr_tasks + add_tasks)*mult_tasks)
741         CALL reallocate(tasks, 1, 6, 1, curr_tasks)
742      END IF
743
744      IF (rs_desc%distributed) THEN
745
746         ! finds the node(s) that need to process this task
747         ! on exit tasks(4,:) is 1 for distributed tasks and 2 for generalised tasks
748         CALL rs_find_node(rs_desc, igrid_level, n_levels, cube_center, &
749                           ntasks=ntasks, tasks=tasks, lb_cube=lb_cube, ub_cube=ub_cube, added_tasks=added_tasks)
750
751      ELSE
752         tasks(1, ntasks) = encode_rank(rs_desc%my_pos, igrid_level, n_levels)
753         tasks(4, ntasks) = 0
754         tasks(6, ntasks) = 0
755         added_tasks = 1
756      ENDIF
757
758      IF (SIZE(dist_ab, 2) .NE. SIZE(tasks, 2)) THEN
759         CALL reallocate(dist_ab, 1, 3, 1, SIZE(tasks, 2))
760      ENDIF
761
762      lmax = la_max + lb_max
763      is_ortho = (tasks(4, ntasks) == 0 .OR. tasks(4, ntasks) == 1) .AND. rs_desc%orthorhombic
764      ! we assume the load is shared equally between processes dealing with a generalised Gaussian.
765      ! this could be refined in the future
766      tfraction = 1.0_dp/added_tasks
767
768      cost = cost_model(lb_cube, ub_cube, tfraction, lmax, is_ortho)
769
770      DO j = 1, added_tasks
771
772         tasks(2, ntasks - added_tasks + j) = encode_rank(rs_desc%my_pos, igrid_level, n_levels)
773         tasks(5, ntasks - added_tasks + j) = cost
774
775         !encode the atom pairs and basis info as a single long integer
776         CALL pair2int(tasks(3, ntasks - added_tasks + j), igrid_level, cindex, &
777                       iatom, jatom, iset, jset, ipgf, jpgf, nimages, natoms, maxset, maxpgf)
778
779         dist_ab(1, ntasks - added_tasks + j) = rab(1)
780         dist_ab(2, ntasks - added_tasks + j) = rab(2)
781         dist_ab(3, ntasks - added_tasks + j) = rab(3)
782
783      ENDDO
784
785   END SUBROUTINE pgf_to_tasks
786
787! **************************************************************************************************
788!> \brief converts a pgf index pair (ipgf,iset,iatom),(jpgf,jset,jatom)
789!>      to a unique integer.
790!>      a list of integers can be sorted, and will result in a list of pgf pairs
791!>      for which all atom pairs are grouped, and for each atom pair all set pairs are grouped
792!>      and for each set pair, all pgfs are grouped
793!> \param res ...
794!> \param ilevel ...
795!> \param image ...
796!> \param iatom ...
797!> \param jatom ...
798!> \param iset ...
799!> \param jset ...
800!> \param ipgf ...
801!> \param jpgf ...
802!> \param nimages ...
803!> \param natom ...
804!> \param maxset ...
805!> \param maxpgf ...
806!> \par History
807!>      11.2007 created [Joost]
808!> \note
809!>      will hopefully not overflow any more
810! **************************************************************************************************
811   SUBROUTINE pair2int(res, ilevel, image, iatom, jatom, iset, jset, ipgf, jpgf, &
812                       nimages, natom, maxset, maxpgf)
813      INTEGER(KIND=int_8), INTENT(OUT)                   :: res
814      INTEGER, INTENT(IN)                                :: ilevel, image, iatom, jatom, iset, jset, &
815                                                            ipgf, jpgf, nimages, natom, maxset, &
816                                                            maxpgf
817
818      INTEGER(KIND=int_8)                                :: maxpgf8, maxset8, natom8, nimages8, &
819                                                            nlev1, nlev2, nlev3, nlev4
820
821      natom8 = natom; maxset8 = maxset; maxpgf8 = maxpgf; nimages8 = nimages
822      !
823      ! this encoding yields the right order of the tasks for collocation after the sort
824      ! in distribute_tasks. E.g. for a atom pair, all sets and pgfs are computed in one go.
825      ! The exception is the gridlevel. Tasks are first ordered wrt to grid_level. This implies
826      ! that a given density matrix block will be decontracted several times, but cache effects on the
827      ! grid make up for this.
828      !
829      nlev1 = maxpgf8**2
830      nlev2 = maxset8**2*nlev1
831      nlev3 = natom8**2*nlev2
832      nlev4 = nimages8*nlev3
833      !
834      res = ilevel*nlev4 + &
835            (image - 1)*nlev3 + &
836            ((iatom - 1)*natom8 + jatom - 1)*nlev2 + &
837            ((iset - 1)*maxset8 + jset - 1)*nlev1 + &
838            (ipgf - 1)*maxpgf8 + jpgf - 1
839
840   END SUBROUTINE pair2int
841
842! **************************************************************************************************
843!> \brief ...
844!> \param res ...
845!> \param ilevel ...
846!> \param image ...
847!> \param iatom ...
848!> \param jatom ...
849!> \param iset ...
850!> \param jset ...
851!> \param ipgf ...
852!> \param jpgf ...
853!> \param nimages ...
854!> \param natom ...
855!> \param maxset ...
856!> \param maxpgf ...
857! **************************************************************************************************
858   SUBROUTINE int2pair(res, ilevel, image, iatom, jatom, iset, jset, ipgf, jpgf, &
859                       nimages, natom, maxset, maxpgf)
860      INTEGER(KIND=int_8), INTENT(IN)                    :: res
861      INTEGER, INTENT(OUT)                               :: ilevel, image, iatom, jatom, iset, jset, &
862                                                            ipgf, jpgf
863      INTEGER, INTENT(IN)                                :: nimages, natom, maxset, maxpgf
864
865      INTEGER(KIND=int_8) :: iatom8, ijatom, ijset, img, ipgf8, iset8, jatom8, jpgf8, jset8, &
866         maxpgf8, maxset8, natom8, nimages8, nlev1, nlev2, nlev3, nlev4, tmp
867
868      natom8 = natom; maxset8 = maxset; maxpgf8 = maxpgf; nimages8 = nimages
869      !
870      nlev1 = maxpgf8**2
871      nlev2 = maxset8**2*nlev1
872      nlev3 = natom8**2*nlev2
873      nlev4 = nimages8*nlev3
874      !
875      ilevel = INT(res/nlev4)
876      tmp = MOD(res, nlev4)
877      img = tmp/nlev3 + 1
878      tmp = MOD(tmp, nlev3)
879      ijatom = tmp/nlev2
880      iatom8 = ijatom/natom8 + 1
881      jatom8 = MOD(ijatom, natom8) + 1
882      tmp = MOD(tmp, nlev2)
883      ijset = tmp/nlev1
884      iset8 = ijset/maxset8 + 1
885      jset8 = MOD(ijset, maxset8) + 1
886      tmp = MOD(tmp, nlev1)
887      ipgf8 = tmp/maxpgf8 + 1
888      jpgf8 = MOD(tmp, maxpgf8) + 1
889      !
890      image = INT(img)
891      iatom = INT(iatom8); jatom = INT(jatom8); iset = INT(iset8); jset = INT(jset8)
892      ipgf = INT(ipgf8); jpgf = INT(jpgf8)
893
894   END SUBROUTINE int2pair
895
896! **************************************************************************************************
897!> \brief performs load balancing of the tasks on the distributed grids
898!> \param tasks ...
899!> \param ntasks ...
900!> \param rs_descs ...
901!> \param grid_level ...
902!> \param nimages ...
903!> \param natoms ...
904!> \param maxset ...
905!> \param maxpgf ...
906!> \par History
907!>      created 2008-10-03 [Joost VandeVondele]
908! **************************************************************************************************
909   SUBROUTINE load_balance_distributed(tasks, ntasks, rs_descs, grid_level, &
910                                       nimages, natoms, maxset, maxpgf)
911
912      INTEGER(KIND=int_8), DIMENSION(:, :), POINTER      :: tasks
913      INTEGER                                            :: ntasks
914      TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
915         POINTER                                         :: rs_descs
916      INTEGER                                            :: grid_level, nimages, natoms, maxset, &
917                                                            maxpgf
918
919      CHARACTER(LEN=*), PARAMETER :: routineN = 'load_balance_distributed', &
920         routineP = moduleN//':'//routineN
921
922      INTEGER                                            :: handle
923      INTEGER, DIMENSION(:, :, :), POINTER               :: list
924
925      CALL timeset(routineN, handle)
926
927      NULLIFY (list)
928      ! here we create for each cpu (0:ncpu-1) a list of possible destinations.
929      ! if a destination would not be in this list, it is a bug
930      CALL create_destination_list(list, rs_descs, grid_level)
931
932      ! now, walk over the tasks, filling in the loads of each destination
933      CALL compute_load_list(list, rs_descs, grid_level, tasks, ntasks, nimages, natoms, maxset, maxpgf, &
934                             create_list=.TRUE.)
935
936      ! optimize loads & fluxes
937      CALL optimize_load_list(list, rs_descs(1)%rs_desc%group, rs_descs(1)%rs_desc%my_pos)
938
939      ! now, walk over the tasks, using the list to set the destinations
940      CALL compute_load_list(list, rs_descs, grid_level, tasks, ntasks, nimages, natoms, maxset, maxpgf, &
941                             create_list=.FALSE.)
942
943      DEALLOCATE (list)
944
945      CALL timestop(handle)
946
947   END SUBROUTINE load_balance_distributed
948
949! **************************************************************************************************
950!> \brief this serial routine adjusts the fluxes in the global list
951!>
952!> \param list_global ...
953!> \par History
954!>      created 2008-10-06 [Joost VandeVondele]
955! **************************************************************************************************
956   SUBROUTINE balance_global_list(list_global)
957      INTEGER, DIMENSION(:, :, 0:)                       :: list_global
958
959      CHARACTER(LEN=*), PARAMETER :: routineN = 'balance_global_list', &
960         routineP = moduleN//':'//routineN
961      INTEGER, PARAMETER                                 :: Max_Iter = 100
962      REAL(KIND=dp), PARAMETER                           :: Tolerance_factor = 0.005_dp
963
964      INTEGER                                            :: dest, handle, icpu, idest, iflux, &
965                                                            ilocal, k, maxdest, Ncpu, Nflux
966      INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: flux_connections
967      LOGICAL                                            :: solution_optimal
968      REAL(KIND=dp)                                      :: average, load_shift, max_load_shift, &
969                                                            tolerance
970      REAL(KIND=dp), ALLOCATABLE, DIMENSION(:)           :: load, optimized_flux, optimized_load
971      REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: flux_limits
972
973      CALL timeset(routineN, handle)
974
975      Ncpu = SIZE(list_global, 3)
976      maxdest = SIZE(list_global, 2)
977      ALLOCATE (load(0:Ncpu - 1))
978      load = 0.0_dp
979      ALLOCATE (optimized_load(0:Ncpu - 1))
980
981      ! figure out the number of fluxes
982      ! we assume that the global_list is symmetric
983      Nflux = 0
984      DO icpu = 0, ncpu - 1
985         DO idest = 1, maxdest
986            dest = list_global(1, idest, icpu)
987            IF (dest < ncpu .AND. dest > icpu) Nflux = Nflux + 1
988         ENDDO
989      ENDDO
990      ALLOCATE (optimized_flux(Nflux))
991      ALLOCATE (flux_limits(2, Nflux))
992      ALLOCATE (flux_connections(2, Nflux))
993
994      ! reorder data
995      flux_limits = 0
996      Nflux = 0
997      DO icpu = 0, ncpu - 1
998         load(icpu) = SUM(list_global(2, :, icpu))
999         DO idest = 1, maxdest
1000            dest = list_global(1, idest, icpu)
1001            IF (dest < ncpu) THEN
1002               IF (dest .NE. icpu) THEN
1003                  IF (dest > icpu) THEN
1004                     Nflux = Nflux + 1
1005                     flux_limits(2, Nflux) = list_global(2, idest, icpu)
1006                     flux_connections(1, Nflux) = icpu
1007                     flux_connections(2, Nflux) = dest
1008                  ELSE
1009                     DO iflux = 1, Nflux
1010                        IF (flux_connections(1, iflux) == dest .AND. flux_connections(2, iflux) == icpu) THEN
1011                           flux_limits(1, iflux) = -list_global(2, idest, icpu)
1012                           EXIT
1013                        ENDIF
1014                     ENDDO
1015                  ENDIF
1016               ENDIF
1017            ENDIF
1018         ENDDO
1019      ENDDO
1020
1021      solution_optimal = .FALSE.
1022      optimized_flux = 0.0_dp
1023
1024      ! an iterative solver, if iterated till convergence the maximum load is minimal
1025      ! we terminate before things are fully converged, since this does show up in the timings
1026      ! once the largest shift becomes less than a small fraction of the average load, we're done
1027      ! we're perfectly happy if the load balance is within 1 percent or so
1028      ! the maximum load normally converges even faster
1029      average = SUM(load)/SIZE(load)
1030      tolerance = Tolerance_factor*average
1031
1032      optimized_load(:) = load
1033      DO k = 1, Max_iter
1034         max_load_shift = 0.0_dp
1035         DO iflux = 1, Nflux
1036            load_shift = (optimized_load(flux_connections(1, iflux)) - optimized_load(flux_connections(2, iflux)))/2
1037            load_shift = MAX(flux_limits(1, iflux) - optimized_flux(iflux), load_shift)
1038            load_shift = MIN(flux_limits(2, iflux) - optimized_flux(iflux), load_shift)
1039            max_load_shift = MAX(ABS(load_shift), max_load_shift)
1040            optimized_load(flux_connections(1, iflux)) = optimized_load(flux_connections(1, iflux)) - load_shift
1041            optimized_load(flux_connections(2, iflux)) = optimized_load(flux_connections(2, iflux)) + load_shift
1042            optimized_flux(iflux) = optimized_flux(iflux) + load_shift
1043         ENDDO
1044         IF (max_load_shift < tolerance) THEN
1045            solution_optimal = .TRUE.
1046            EXIT
1047         ENDIF
1048      ENDDO
1049
1050      ! now adjust the load list to reflect the optimized fluxes
1051      ! reorder data
1052      Nflux = 0
1053      DO icpu = 0, ncpu - 1
1054         DO idest = 1, maxdest
1055            IF (list_global(1, idest, icpu) == icpu) ilocal = idest
1056         ENDDO
1057         DO idest = 1, maxdest
1058            dest = list_global(1, idest, icpu)
1059            IF (dest < ncpu) THEN
1060               IF (dest .NE. icpu) THEN
1061                  IF (dest > icpu) THEN
1062                     Nflux = Nflux + 1
1063                     IF (optimized_flux(Nflux) > 0) THEN
1064                        list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
1065                                                       list_global(2, idest, icpu) - NINT(optimized_flux(Nflux))
1066                        list_global(2, idest, icpu) = NINT(optimized_flux(Nflux))
1067                     ELSE
1068                        list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
1069                                                       list_global(2, idest, icpu)
1070                        list_global(2, idest, icpu) = 0
1071                     ENDIF
1072                  ELSE
1073                     DO iflux = 1, Nflux
1074                        IF (flux_connections(1, iflux) == dest .AND. flux_connections(2, iflux) == icpu) THEN
1075                           IF (optimized_flux(iflux) > 0) THEN
1076                              list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
1077                                                             list_global(2, idest, icpu)
1078                              list_global(2, idest, icpu) = 0
1079                           ELSE
1080                              list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
1081                                                             list_global(2, idest, icpu) + NINT(optimized_flux(iflux))
1082                              list_global(2, idest, icpu) = -NINT(optimized_flux(iflux))
1083                           ENDIF
1084                           EXIT
1085                        ENDIF
1086                     ENDDO
1087                  ENDIF
1088               ENDIF
1089            ENDIF
1090         ENDDO
1091      ENDDO
1092
1093      CALL timestop(handle)
1094
1095   END SUBROUTINE balance_global_list
1096
1097! **************************************************************************************************
1098!> \brief this routine gets back optimized loads for all destinations
1099!>
1100!> \param list ...
1101!> \param group ...
1102!> \param my_pos ...
1103!> \par History
1104!>      created 2008-10-06 [Joost VandeVondele]
1105!>      Modified 2016-01   [EPCC] Reduce memory requirements on P processes
1106!>                                from O(P^2) to O(P)
1107! **************************************************************************************************
1108   SUBROUTINE optimize_load_list(list, group, my_pos)
1109      INTEGER, DIMENSION(:, :, 0:)                       :: list
1110      INTEGER, INTENT(IN)                                :: group, my_pos
1111
1112      CHARACTER(LEN=*), PARAMETER :: routineN = 'optimize_load_list', &
1113         routineP = moduleN//':'//routineN
1114      INTEGER, PARAMETER                                 :: rank_of_root = 0
1115
1116      INTEGER                                            :: handle, icpu, idest, maxdest, ncpu
1117      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: load_all
1118      INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: load_partial
1119      INTEGER, ALLOCATABLE, DIMENSION(:, :, :)           :: list_global
1120
1121      CALL timeset(routineN, handle)
1122
1123      ncpu = SIZE(list, 3)
1124      maxdest = SIZE(list, 2)
1125
1126      !find total workload ...
1127      ALLOCATE (load_all(maxdest*ncpu))
1128      load_all(:) = RESHAPE(list(2, :, :), (/maxdest*ncpu/))
1129      CALL mp_sum(load_all(:), rank_of_root, group)
1130
1131      ! ... and optimise the work per process
1132      ALLOCATE (list_global(2, maxdest, ncpu))
1133      IF (rank_of_root .EQ. my_pos) THEN
1134         list_global(1, :, :) = list(1, :, :)
1135         list_global(2, :, :) = RESHAPE(load_all, (/maxdest, ncpu/))
1136         CALL balance_global_list(list_global)
1137      ENDIF
1138      CALL mp_bcast(list_global, rank_of_root, group)
1139
1140      !figure out how much can be sent to other processes
1141      ALLOCATE (load_partial(maxdest, ncpu))
1142      ! send 'load_all', which is a copy of 'list' (but without leading dimension/stride)
1143      CALL mp_sum_partial(RESHAPE(load_all, (/maxdest, ncpu/)), load_partial(:, :), group)
1144
1145      DO icpu = 1, ncpu
1146         DO idest = 1, maxdest
1147
1148            !need to deduct 1 because `list' was passed in to this routine as being indexed from zero
1149            IF (load_partial(idest, icpu) > list_global(2, idest, icpu)) THEN
1150               IF (load_partial(idest, icpu) - list(2, idest, icpu - 1) < list_global(2, idest, icpu)) THEN
1151                  list(2, idest, icpu - 1) = list_global(2, idest, icpu) &
1152                                             - (load_partial(idest, icpu) - list(2, idest, icpu - 1))
1153               ELSE
1154                  list(2, idest, icpu - 1) = 0
1155               ENDIF
1156            ENDIF
1157
1158         ENDDO
1159      ENDDO
1160
1161      !clean up before leaving
1162      DEALLOCATE (load_all)
1163      DEALLOCATE (list_global)
1164      DEALLOCATE (load_partial)
1165
1166      CALL timestop(handle)
1167   END SUBROUTINE optimize_load_list
1168
1169! **************************************************************************************************
1170!> \brief fill the load list with values derived from the tasks array
1171!>        from the alternate locations, we select the alternate location that
1172!>        can be used without increasing the number of matrix blocks needed to
1173!>        distribute.
1174!>        Replicated tasks are not yet considered
1175!>
1176!> \param list ...
1177!> \param rs_descs ...
1178!> \param grid_level ...
1179!> \param tasks ...
1180!> \param ntasks ...
1181!> \param nimages ...
1182!> \param natoms ...
1183!> \param maxset ...
1184!> \param maxpgf ...
1185!> \param create_list ...
1186!> \par History
1187!>      created 2008-10-06 [Joost VandeVondele]
1188! **************************************************************************************************
1189   SUBROUTINE compute_load_list(list, rs_descs, grid_level, tasks, &
1190                                ntasks, nimages, natoms, maxset, maxpgf, create_list)
1191      INTEGER, DIMENSION(:, :, 0:)                       :: list
1192      TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1193         POINTER                                         :: rs_descs
1194      INTEGER                                            :: grid_level
1195      INTEGER(KIND=int_8), DIMENSION(:, :), POINTER      :: tasks
1196      INTEGER                                            :: ntasks, nimages, natoms, maxset, maxpgf
1197      LOGICAL                                            :: create_list
1198
1199      CHARACTER(LEN=*), PARAMETER :: routineN = 'compute_load_list', &
1200         routineP = moduleN//':'//routineN
1201
1202      INTEGER :: cost, dest, handle, i, iatom, ilevel, img, img_old, iopt, ipgf, iset, itask, &
1203         itask_start, itask_stop, jatom, jpgf, jset, li, maxdest, ncpu, ndest_pair, nopt, nshort, &
1204         rank
1205      INTEGER(KIND=int_8)                                :: bit_pattern, ipair, ipair_old, natom8
1206      INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:)     :: loads
1207      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: all_dests, index
1208      INTEGER, DIMENSION(6)                              :: options
1209
1210      CALL timeset(routineN, handle)
1211
1212      ALLOCATE (loads(0:rs_descs(grid_level)%rs_desc%group_size - 1))
1213      CALL get_current_loads(loads, rs_descs, grid_level, ntasks, nimages, natoms, maxset, maxpgf, &
1214                             tasks, use_reordered_ranks=.FALSE.)
1215
1216      maxdest = SIZE(list, 2)
1217      ncpu = SIZE(list, 3)
1218      natom8 = natoms
1219
1220      ! first find the tasks that deal with the same atom pair
1221      itask_stop = 0
1222      ipair_old = HUGE(ipair_old)
1223      img_old = -1
1224      ALLOCATE (all_dests(0))
1225      ALLOCATE (INDEX(0))
1226
1227      DO
1228
1229         ! first find the range of tasks that deal with the same atom pair
1230         itask_start = itask_stop + 1
1231         itask_stop = itask_start
1232         IF (itask_stop > ntasks) EXIT
1233         CALL int2pair(tasks(3, itask_stop), ilevel, img_old, iatom, jatom, iset, jset, ipgf, jpgf, &
1234                       nimages, natoms, maxset, maxpgf)
1235         ipair_old = (iatom - 1)*natom8 + (jatom - 1)
1236         DO
1237            IF (itask_stop + 1 > ntasks) EXIT
1238            CALL int2pair(tasks(3, itask_stop + 1), ilevel, img, iatom, jatom, iset, jset, ipgf, jpgf, &
1239                          nimages, natoms, maxset, maxpgf)
1240            ipair = (iatom - 1)*natom8 + (jatom - 1)
1241            IF (ipair == ipair_old .AND. img == img_old) THEN
1242               itask_stop = itask_stop + 1
1243            ELSE
1244               EXIT
1245            ENDIF
1246         ENDDO
1247         ipair = ipair_old
1248         nshort = itask_stop - itask_start + 1
1249
1250         ! find the unique list of destinations on this grid level only
1251         DEALLOCATE (all_dests)
1252         ALLOCATE (all_dests(nshort))
1253         DEALLOCATE (index)
1254         ALLOCATE (INDEX(nshort))
1255         DO i = 1, nshort
1256            CALL int2pair(tasks(3, itask_start + i - 1), ilevel, img, iatom, jatom, iset, jset, ipgf, jpgf, &
1257                          nimages, natoms, maxset, maxpgf)
1258            IF (ilevel .EQ. grid_level) THEN
1259               all_dests(i) = decode_rank(tasks(1, itask_start + i - 1), SIZE(rs_descs))
1260            ELSE
1261               all_dests(i) = HUGE(all_dests(i))
1262            END IF
1263         ENDDO
1264         CALL sort(all_dests, nshort, index)
1265         ndest_pair = 1
1266         DO i = 2, nshort
1267            IF ((all_dests(ndest_pair) .NE. all_dests(i)) .AND. (all_dests(i) .NE. HUGE(all_dests(i)))) THEN
1268               ndest_pair = ndest_pair + 1
1269               all_dests(ndest_pair) = all_dests(i)
1270            ENDIF
1271         ENDDO
1272
1273         DO itask = itask_start, itask_stop
1274
1275            dest = decode_rank(tasks(1, itask), SIZE(rs_descs)) ! notice that dest can be changed
1276            CALL int2pair(tasks(3, itask), ilevel, img, iatom, jatom, iset, jset, ipgf, jpgf, &
1277                          nimages, natoms, maxset, maxpgf)
1278            ! Only proceed with tasks which are on this grid level
1279            IF (ilevel .NE. grid_level) CYCLE
1280            ipair = (iatom - 1)*natom8 + (jatom - 1)
1281            cost = INT(tasks(5, itask))
1282
1283            SELECT CASE (tasks(4, itask))
1284            CASE (1)
1285               bit_pattern = tasks(6, itask)
1286               nopt = 0
1287               IF (BTEST(bit_pattern, 0)) THEN
1288                  rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/-1, 0, 0/))
1289                  IF (ANY(all_dests(1:ndest_pair) .EQ. rank)) THEN
1290                     nopt = nopt + 1
1291                     options(nopt) = rank
1292                  ENDIF
1293               ENDIF
1294               IF (BTEST(bit_pattern, 1)) THEN
1295                  rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/+1, 0, 0/))
1296                  IF (ANY(all_dests(1:ndest_pair) .EQ. rank)) THEN
1297                     nopt = nopt + 1
1298                     options(nopt) = rank
1299                  ENDIF
1300               ENDIF
1301               IF (BTEST(bit_pattern, 2)) THEN
1302                  rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/0, -1, 0/))
1303                  IF (ANY(all_dests(1:ndest_pair) .EQ. rank)) THEN
1304                     nopt = nopt + 1
1305                     options(nopt) = rank
1306                  ENDIF
1307               ENDIF
1308               IF (BTEST(bit_pattern, 3)) THEN
1309                  rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/0, +1, 0/))
1310                  IF (ANY(all_dests(1:ndest_pair) .EQ. rank)) THEN
1311                     nopt = nopt + 1
1312                     options(nopt) = rank
1313                  ENDIF
1314               ENDIF
1315               IF (BTEST(bit_pattern, 4)) THEN
1316                  rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/0, 0, -1/))
1317                  IF (ANY(all_dests(1:ndest_pair) .EQ. rank)) THEN
1318                     nopt = nopt + 1
1319                     options(nopt) = rank
1320                  ENDIF
1321               ENDIF
1322               IF (BTEST(bit_pattern, 5)) THEN
1323                  rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/0, 0, +1/))
1324                  IF (ANY(all_dests(1:ndest_pair) .EQ. rank)) THEN
1325                     nopt = nopt + 1
1326                     options(nopt) = rank
1327                  ENDIF
1328               ENDIF
1329               IF (nopt > 0) THEN
1330                  ! set it to the rank with the lowest load
1331                  rank = options(1)
1332                  DO iopt = 2, nopt
1333                     IF (loads(rank) > loads(options(iopt))) rank = options(iopt)
1334                  ENDDO
1335               ELSE
1336                  rank = dest
1337               ENDIF
1338               li = list_index(list, rank, dest)
1339               IF (create_list) THEN
1340                  list(2, li, dest) = list(2, li, dest) + cost
1341               ELSE
1342                  IF (list(1, li, dest) == dest) THEN
1343                     tasks(1, itask) = encode_rank(dest, ilevel, SIZE(rs_descs))
1344                  ELSE
1345                     IF (list(2, li, dest) >= cost) THEN
1346                        list(2, li, dest) = list(2, li, dest) - cost
1347                        tasks(1, itask) = encode_rank(list(1, li, dest), ilevel, SIZE(rs_descs))
1348                     ELSE
1349                        tasks(1, itask) = encode_rank(dest, ilevel, SIZE(rs_descs))
1350                     ENDIF
1351                  ENDIF
1352               ENDIF
1353            CASE (2) ! generalised
1354               li = list_index(list, dest, dest)
1355               IF (create_list) THEN
1356                  list(2, li, dest) = list(2, li, dest) + cost
1357               ELSE
1358                  IF (list(1, li, dest) == dest) THEN
1359                     tasks(1, itask) = encode_rank(dest, ilevel, SIZE(rs_descs))
1360                  ELSE
1361                     IF (list(2, li, dest) >= cost) THEN
1362                        list(2, li, dest) = list(2, li, dest) - cost
1363                        tasks(1, itask) = encode_rank(list(1, li, dest), ilevel, SIZE(rs_descs))
1364                     ELSE
1365                        tasks(1, itask) = encode_rank(dest, ilevel, SIZE(rs_descs))
1366                     ENDIF
1367                  ENDIF
1368               ENDIF
1369            CASE DEFAULT
1370               CPABORT("")
1371            END SELECT
1372
1373         ENDDO
1374
1375      ENDDO
1376
1377      CALL timestop(handle)
1378
1379   END SUBROUTINE compute_load_list
1380! **************************************************************************************************
1381!> \brief small helper function to return the proper index in the list array
1382!>
1383!> \param list ...
1384!> \param rank ...
1385!> \param dest ...
1386!> \return ...
1387!> \par History
1388!>      created 2008-10-06 [Joost VandeVondele]
1389! **************************************************************************************************
1390   INTEGER FUNCTION list_index(list, rank, dest)
1391      INTEGER, DIMENSION(:, :, 0:), INTENT(IN)           :: list
1392      INTEGER, INTENT(IN)                                :: rank, dest
1393
1394      list_index = 1
1395      DO
1396         IF (list(1, list_index, dest) == rank) EXIT
1397         list_index = list_index + 1
1398      ENDDO
1399   END FUNCTION list_index
1400! **************************************************************************************************
1401!> \brief create a list with possible destinations (i.e. the central cpu and neighbors) for each cpu
1402!>        note that we allocate it with an additional field to store the load of this destination
1403!>
1404!> \param list ...
1405!> \param rs_descs ...
1406!> \param grid_level ...
1407!> \par History
1408!>      created 2008-10-06 [Joost VandeVondele]
1409! **************************************************************************************************
1410   SUBROUTINE create_destination_list(list, rs_descs, grid_level)
1411      INTEGER, DIMENSION(:, :, :), POINTER               :: list
1412      TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1413         POINTER                                         :: rs_descs
1414      INTEGER, INTENT(IN)                                :: grid_level
1415
1416      CHARACTER(LEN=*), PARAMETER :: routineN = 'create_destination_list', &
1417         routineP = moduleN//':'//routineN
1418
1419      INTEGER                                            :: handle, i, icpu, j, maxcount, ncpu, &
1420                                                            ultimate_max
1421      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: index, sublist
1422
1423      CALL timeset(routineN, handle)
1424
1425      CPASSERT(.NOT. ASSOCIATED(list))
1426      ncpu = rs_descs(grid_level)%rs_desc%group_size
1427      ultimate_max = 7
1428
1429      ALLOCATE (list(2, ultimate_max, 0:ncpu - 1))
1430
1431      ALLOCATE (INDEX(ultimate_max))
1432      ALLOCATE (sublist(ultimate_max))
1433      sublist = HUGE(sublist)
1434
1435      maxcount = 1
1436      DO icpu = 0, ncpu - 1
1437         sublist(1) = icpu
1438         sublist(2) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/-1, 0, 0/))
1439         sublist(3) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/+1, 0, 0/))
1440         sublist(4) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/0, -1, 0/))
1441         sublist(5) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/0, +1, 0/))
1442         sublist(6) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/0, 0, -1/))
1443         sublist(7) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/0, 0, +1/))
1444         ! only retain unique values of the destination
1445         CALL sort(sublist, ultimate_max, index)
1446         j = 1
1447         DO i = 2, 7
1448            IF (sublist(i) .NE. sublist(j)) THEN
1449               j = j + 1
1450               sublist(j) = sublist(i)
1451            ENDIF
1452         ENDDO
1453         maxcount = MAX(maxcount, j)
1454         sublist(j + 1:ultimate_max) = HUGE(sublist)
1455         list(1, :, icpu) = sublist
1456         list(2, :, icpu) = 0
1457      ENDDO
1458
1459      CALL reallocate(list, 1, 2, 1, maxcount, 0, ncpu - 1)
1460
1461      CALL timestop(handle)
1462
1463   END SUBROUTINE create_destination_list
1464
1465! **************************************************************************************************
1466!> \brief given a task list, compute the load of each process everywhere
1467!>        giving this function the ability to loop over a (sub)set of rs_grids,
1468!>        and do all the communication in one shot, would speed it up
1469!> \param loads ...
1470!> \param rs_descs ...
1471!> \param grid_level ...
1472!> \param ntasks ...
1473!> \param nimages ...
1474!> \param natom ...
1475!> \param maxset ...
1476!> \param maxpgf ...
1477!> \param tasks ...
1478!> \param use_reordered_ranks ...
1479!> \par History
1480!>      none
1481!> \author MattW 21/11/2007
1482! **************************************************************************************************
1483   SUBROUTINE get_current_loads(loads, rs_descs, grid_level, ntasks, nimages, natom, maxset, &
1484                                maxpgf, tasks, use_reordered_ranks)
1485      INTEGER(KIND=int_8), DIMENSION(:)                  :: loads
1486      TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1487         POINTER                                         :: rs_descs
1488      INTEGER                                            :: grid_level, ntasks, nimages, natom, &
1489                                                            maxset, maxpgf
1490      INTEGER(KIND=int_8), DIMENSION(:, :), POINTER      :: tasks
1491      LOGICAL, INTENT(IN)                                :: use_reordered_ranks
1492
1493      CHARACTER(LEN=*), PARAMETER :: routineN = 'get_current_loads', &
1494         routineP = moduleN//':'//routineN
1495
1496      INTEGER                                            :: handle, i, iatom, ilevel, img, ipgf, &
1497                                                            iset, jatom, jpgf, jset
1498      INTEGER(KIND=int_8)                                :: total_cost_local
1499      INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:)     :: recv_buf_i, send_buf_i
1500      TYPE(realspace_grid_desc_type), POINTER            :: desc
1501
1502      CALL timeset(routineN, handle)
1503
1504      desc => rs_descs(grid_level)%rs_desc
1505
1506      ! allocate local arrays
1507      ALLOCATE (send_buf_i(desc%group_size))
1508      ALLOCATE (recv_buf_i(desc%group_size))
1509
1510      ! communication step 1 : compute the total local cost of the tasks
1511      !                        each proc needs to know the amount of work he will receive
1512
1513      ! send buffer now contains for each target the cost of the tasks it will receive
1514      send_buf_i = 0
1515      DO i = 1, ntasks
1516         CALL int2pair(tasks(3, i), ilevel, img, iatom, jatom, iset, jset, ipgf, jpgf, nimages, natom, maxset, maxpgf)
1517         IF (ilevel .NE. grid_level) CYCLE
1518         IF (use_reordered_ranks) THEN
1519            send_buf_i(rs_descs(ilevel)%rs_desc%virtual2real(decode_rank(tasks(1, i), SIZE(rs_descs))) + 1) = &
1520               send_buf_i(rs_descs(ilevel)%rs_desc%virtual2real(decode_rank(tasks(1, i), SIZE(rs_descs))) + 1) &
1521               + tasks(5, i)
1522         ELSE
1523            send_buf_i(decode_rank(tasks(1, i), SIZE(rs_descs)) + 1) = &
1524               send_buf_i(decode_rank(tasks(1, i), SIZE(rs_descs)) + 1) &
1525               + tasks(5, i)
1526         END IF
1527      END DO
1528      CALL mp_alltoall(send_buf_i, recv_buf_i, 1, desc%group)
1529
1530      ! communication step 2 : compute the global cost of the tasks
1531      total_cost_local = SUM(recv_buf_i)
1532
1533      ! after this step, the recv buffer contains the local cost for each CPU
1534      CALL mp_allgather(total_cost_local, loads, desc%group)
1535
1536      CALL timestop(handle)
1537
1538   END SUBROUTINE get_current_loads
1539! **************************************************************************************************
1540!> \brief performs load balancing shifting tasks on the replicated grids
1541!>        this modifies the destination of some of the tasks on replicated
1542!>        grids, and in this way balances the load
1543!> \param rs_descs ...
1544!> \param ntasks ...
1545!> \param tasks ...
1546!> \param nimages ...
1547!> \param natoms ...
1548!> \param maxset ...
1549!> \param maxpgf ...
1550!> \par History
1551!>      none
1552!> \author MattW 21/11/2007
1553! **************************************************************************************************
1554   SUBROUTINE load_balance_replicated(rs_descs, ntasks, tasks, nimages, natoms, maxset, maxpgf)
1555
1556      TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1557         POINTER                                         :: rs_descs
1558      INTEGER                                            :: ntasks
1559      INTEGER(KIND=int_8), DIMENSION(:, :), POINTER      :: tasks
1560      INTEGER, INTENT(IN)                                :: nimages, natoms, maxset, maxpgf
1561
1562      CHARACTER(LEN=*), PARAMETER :: routineN = 'load_balance_replicated', &
1563         routineP = moduleN//':'//routineN
1564
1565      INTEGER                                            :: handle, i, iatom, ilevel, img, ipgf, &
1566                                                            iset, j, jatom, jpgf, jset, &
1567                                                            no_overloaded, no_underloaded, &
1568                                                            proc_receiving
1569      INTEGER(KIND=int_8)                                :: average_cost, cost_task_rep, count, &
1570                                                            offset, total_cost_global
1571      INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:)     :: load_imbalance, loads, recv_buf_i
1572      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: index
1573      TYPE(realspace_grid_desc_type), POINTER            :: desc
1574
1575      CALL timeset(routineN, handle)
1576
1577      desc => rs_descs(1)%rs_desc
1578
1579      ! allocate local arrays
1580      ALLOCATE (recv_buf_i(desc%group_size))
1581      ALLOCATE (loads(desc%group_size))
1582
1583      recv_buf_i = 0
1584      DO i = 1, SIZE(rs_descs)
1585         CALL get_current_loads(loads, rs_descs, i, ntasks, nimages, natoms, maxset, maxpgf, &
1586                                tasks, use_reordered_ranks=.TRUE.)
1587         recv_buf_i(:) = recv_buf_i + loads
1588      END DO
1589
1590      total_cost_global = SUM(recv_buf_i)
1591      average_cost = total_cost_global/desc%group_size
1592
1593      !
1594      ! compute how to redistribute the replicated tasks so that the average cost is reached
1595      !
1596
1597      ! load imbalance measures the load of a given CPU relative
1598      ! to the optimal load distribution (load=average)
1599      ALLOCATE (load_imbalance(desc%group_size))
1600      ALLOCATE (INDEX(desc%group_size))
1601
1602      load_imbalance(:) = recv_buf_i - average_cost
1603      no_overloaded = 0
1604      no_underloaded = 0
1605
1606      DO i = 1, desc%group_size
1607         IF (load_imbalance(i) .GT. 0) no_overloaded = no_overloaded + 1
1608         IF (load_imbalance(i) .LT. 0) no_underloaded = no_underloaded + 1
1609      ENDDO
1610
1611      ! sort the recv_buffer on number of tasks, gives us index which provides a
1612      ! mapping between processor ranks and how overloaded the processor
1613      CALL sort(recv_buf_i, SIZE(recv_buf_i), index)
1614
1615      ! find out the number of replicated tasks each proc has
1616      ! but only those tasks which have not yet been assigned
1617      cost_task_rep = 0
1618      DO i = 1, ntasks
1619         IF (tasks(4, i) .EQ. 0 &
1620             .AND. decode_rank(tasks(1, i), SIZE(rs_descs)) == decode_rank(tasks(2, i), SIZE(rs_descs))) THEN
1621            cost_task_rep = cost_task_rep + tasks(5, i)
1622         ENDIF
1623      ENDDO
1624
1625      ! now, correct the load imbalance for the overloaded CPUs
1626      ! they will send away not more than the total load of replicated tasks
1627      CALL mp_allgather(cost_task_rep, recv_buf_i, desc%group)
1628
1629      DO i = 1, desc%group_size
1630         ! At the moment we can only offload replicated tasks
1631         IF (load_imbalance(i) .GT. 0) &
1632            load_imbalance(i) = MIN(load_imbalance(i), recv_buf_i(i))
1633      ENDDO
1634
1635      ! simplest algorithm I can think of of is that the processor with the most
1636      ! excess tasks fills up the process needing most, then moves on to next most.
1637      ! At the moment if we've got less replicated tasks than we're overloaded then
1638      ! task balancing will be incomplete
1639
1640      ! only need to do anything if I've excess tasks
1641      IF (load_imbalance(desc%my_pos + 1) .GT. 0) THEN
1642
1643         count = 0 ! weighted amount of tasks offloaded
1644         offset = 0 ! no of underloaded processes already filled by other more overloaded procs
1645
1646         ! calculate offset
1647         DO i = desc%group_size, desc%group_size - no_overloaded + 1, -1
1648            IF (INDEX(i) .EQ. desc%my_pos + 1) THEN
1649               EXIT
1650            ELSE
1651               offset = offset + load_imbalance(INDEX(i))
1652            ENDIF
1653         ENDDO
1654
1655         ! find my starting processor to send to
1656         proc_receiving = HUGE(proc_receiving)
1657         DO i = 1, no_underloaded
1658            offset = offset + load_imbalance(INDEX(i))
1659            IF (offset .LE. 0) THEN
1660               proc_receiving = i
1661               EXIT
1662            ENDIF
1663         ENDDO
1664
1665         ! offset now contains minus the number of tasks proc_receiving requires
1666         ! we fill this up by adjusting the destination of tasks on the replicated grid,
1667         ! then move to next most underloaded proc
1668         DO j = 1, ntasks
1669            IF (tasks(4, j) .EQ. 0 &
1670                .AND. decode_rank(tasks(1, j), SIZE(rs_descs)) == decode_rank(tasks(2, j), SIZE(rs_descs))) THEN
1671               ! just avoid sending to non existing procs due to integer truncation
1672               ! in the computation of the average
1673               IF (proc_receiving .GT. no_underloaded) EXIT
1674               ! set new destination
1675               CALL int2pair(tasks(3, j), ilevel, img, iatom, jatom, iset, jset, ipgf, jpgf, nimages, natoms, maxset, maxpgf)
1676               tasks(1, j) = encode_rank(INDEX(proc_receiving) - 1, ilevel, SIZE(rs_descs))
1677               offset = offset + tasks(5, j)
1678               count = count + tasks(5, j)
1679               IF (count .GE. load_imbalance(desc%my_pos + 1)) EXIT
1680               IF (offset .GT. 0) THEN
1681                  proc_receiving = proc_receiving + 1
1682                  ! just avoid sending to non existing procs due to integer truncation
1683                  ! in the computation of the average
1684                  IF (proc_receiving .GT. no_underloaded) EXIT
1685                  offset = load_imbalance(INDEX(proc_receiving))
1686               ENDIF
1687            ENDIF
1688         ENDDO
1689      ENDIF
1690
1691      DEALLOCATE (index)
1692      DEALLOCATE (load_imbalance)
1693
1694      CALL timestop(handle)
1695
1696   END SUBROUTINE load_balance_replicated
1697
1698! **************************************************************************************************
1699!> \brief given an input task list, redistribute so that all tasks can be processed locally,
1700!>        i.e. dest equals rank
1701!> \param rs_descs ...
1702!> \param ntasks ...
1703!> \param tasks ...
1704!> \param rval ...
1705!> \param ntasks_recv ...
1706!> \param tasks_recv ...
1707!> \param rval_recv ...
1708!> \par History
1709!>      none
1710!> \author MattW 21/11/2007
1711! **************************************************************************************************
1712   SUBROUTINE create_local_tasks(rs_descs, ntasks, tasks, rval, &
1713                                 ntasks_recv, tasks_recv, rval_recv)
1714
1715      TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1716         POINTER                                         :: rs_descs
1717      INTEGER                                            :: ntasks
1718      INTEGER(KIND=int_8), DIMENSION(:, :), POINTER      :: tasks
1719      REAL(KIND=dp), DIMENSION(:, :), POINTER            :: rval
1720      INTEGER                                            :: ntasks_recv
1721      INTEGER(KIND=int_8), DIMENSION(:, :), POINTER      :: tasks_recv
1722      REAL(KIND=dp), DIMENSION(:, :), POINTER            :: rval_recv
1723
1724      CHARACTER(LEN=*), PARAMETER :: routineN = 'create_local_tasks', &
1725         routineP = moduleN//':'//routineN
1726
1727      INTEGER                                            :: handle, i, j, k, l, rank, task_dim
1728      INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:)     :: recv_buf_i, send_buf_i
1729      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: recv_disps, recv_sizes, send_disps, &
1730                                                            send_sizes
1731      REAL(KIND=dp), ALLOCATABLE, DIMENSION(:)           :: recv_buf_r, send_buf_r
1732      TYPE(realspace_grid_desc_type), POINTER            :: desc
1733
1734      CALL timeset(routineN, handle)
1735
1736      desc => rs_descs(1)%rs_desc
1737
1738      ! allocate local arrays
1739      ALLOCATE (send_sizes(desc%group_size))
1740      ALLOCATE (recv_sizes(desc%group_size))
1741      ALLOCATE (send_disps(desc%group_size))
1742      ALLOCATE (recv_disps(desc%group_size))
1743      ALLOCATE (send_buf_i(desc%group_size))
1744      ALLOCATE (recv_buf_i(desc%group_size))
1745
1746      ! fill send buffer, now counting how many tasks will be send (stored in an int8 array for convenience only).
1747      send_buf_i = 0
1748      DO i = 1, ntasks
1749         rank = rs_descs(decode_level(tasks(1, i), SIZE(rs_descs)))%rs_desc%virtual2real(decode_rank(tasks(1, i), SIZE(rs_descs)))
1750         send_buf_i(rank + 1) = send_buf_i(rank + 1) + 1
1751      END DO
1752
1753      CALL mp_alltoall(send_buf_i, recv_buf_i, 1, desc%group)
1754
1755      ! pack the tasks, and send them around
1756
1757      task_dim = SIZE(tasks, 1)
1758
1759      send_sizes = 0
1760      send_disps = 0
1761      recv_sizes = 0
1762      recv_disps = 0
1763
1764      send_sizes(1) = INT(send_buf_i(1)*task_dim)
1765      recv_sizes(1) = INT(recv_buf_i(1)*task_dim)
1766      DO i = 2, desc%group_size
1767         send_sizes(i) = INT(send_buf_i(i)*task_dim)
1768         recv_sizes(i) = INT(recv_buf_i(i)*task_dim)
1769         send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
1770         recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
1771      ENDDO
1772
1773      ! deallocate old send/recv buffers
1774      DEALLOCATE (send_buf_i)
1775      DEALLOCATE (recv_buf_i)
1776
1777      ! allocate them with new sizes
1778      ALLOCATE (send_buf_i(SUM(send_sizes)))
1779      ALLOCATE (recv_buf_i(SUM(recv_sizes)))
1780
1781      ! do packing
1782      send_buf_i = 0
1783      send_sizes = 0
1784      DO j = 1, ntasks
1785         i = rs_descs(decode_level(tasks(1, j), SIZE(rs_descs)))%rs_desc%virtual2real(decode_rank(tasks(1, j), SIZE(rs_descs))) + 1
1786         DO k = 1, task_dim
1787            send_buf_i(send_disps(i) + send_sizes(i) + k) = tasks(k, j)
1788         ENDDO
1789         send_sizes(i) = send_sizes(i) + task_dim
1790      ENDDO
1791
1792      ! do communication
1793      CALL mp_alltoall(send_buf_i, send_sizes, send_disps, recv_buf_i, recv_sizes, recv_disps, desc%group)
1794
1795      DEALLOCATE (send_buf_i)
1796
1797      ntasks_recv = SUM(recv_sizes)/task_dim
1798      ALLOCATE (tasks_recv(task_dim, ntasks_recv))
1799
1800      ! do unpacking
1801      l = 0
1802      DO i = 1, desc%group_size
1803         DO j = 0, recv_sizes(i)/task_dim - 1
1804            l = l + 1
1805            DO k = 1, task_dim
1806               tasks_recv(k, l) = recv_buf_i(recv_disps(i) + j*task_dim + k)
1807            ENDDO
1808         ENDDO
1809      ENDDO
1810
1811      DEALLOCATE (recv_buf_i)
1812
1813      ! send rvals (to be removed :-)
1814
1815      ! take care of the new task_dim in the send_sizes
1816      send_sizes(:) = (send_sizes(:)/task_dim)*SIZE(rval, 1)
1817      recv_sizes(:) = (recv_sizes(:)/task_dim)*SIZE(rval, 1)
1818      send_disps(:) = (send_disps(:)/task_dim)*SIZE(rval, 1)
1819      recv_disps(:) = (recv_disps(:)/task_dim)*SIZE(rval, 1)
1820      task_dim = SIZE(rval, 1)
1821
1822      ALLOCATE (send_buf_r(SUM(send_sizes)))
1823      ALLOCATE (recv_buf_r(SUM(recv_sizes)))
1824
1825      !do packing
1826      send_sizes = 0
1827      DO j = 1, ntasks
1828         i = rs_descs(decode_level(tasks(1, j), SIZE(rs_descs)))%rs_desc%virtual2real(decode_rank(tasks(1, j), SIZE(rs_descs))) + 1
1829         DO k = 1, task_dim
1830            send_buf_r(send_disps(i) + send_sizes(i) + k) = rval(k, j)
1831         ENDDO
1832         send_sizes(i) = send_sizes(i) + task_dim
1833      ENDDO
1834
1835      ! do communication
1836      CALL mp_alltoall(send_buf_r, send_sizes, send_disps, &
1837                       recv_buf_r, recv_sizes, recv_disps, desc%group)
1838
1839      DEALLOCATE (send_buf_r)
1840      ALLOCATE (rval_recv(task_dim, SUM(recv_sizes)/task_dim))
1841
1842      ! do unpacking
1843      l = 0
1844      DO i = 1, desc%group_size
1845         DO j = 0, recv_sizes(i)/task_dim - 1
1846            l = l + 1
1847            DO k = 1, task_dim
1848               rval_recv(k, l) = recv_buf_r(recv_disps(i) + j*task_dim + k)
1849            ENDDO
1850         ENDDO
1851      ENDDO
1852
1853      DEALLOCATE (recv_buf_r)
1854      DEALLOCATE (send_sizes)
1855      DEALLOCATE (recv_sizes)
1856      DEALLOCATE (send_disps)
1857      DEALLOCATE (recv_disps)
1858
1859      CALL timestop(handle)
1860
1861   END SUBROUTINE create_local_tasks
1862
1863! **************************************************************************************************
1864!> \brief Assembles tasks to be performed on local grid
1865!> \param rs_descs the grids
1866!> \param ntasks Number of tasks for local processing
1867!> \param natoms ...
1868!> \param maxset ...
1869!> \param maxpgf ...
1870!> \param nimages ...
1871!> \param tasks the task set generated on this processor
1872!> \param rval ...
1873!> \param atom_pair_send ...
1874!> \param atom_pair_recv ...
1875!> \param symmetric ...
1876!> \param reorder_rs_grid_ranks ...
1877!> \param skip_load_balance_distributed ...
1878!> \par History
1879!>      none
1880!> \author MattW 21/11/2007
1881! **************************************************************************************************
1882   SUBROUTINE distribute_tasks(rs_descs, ntasks, natoms, maxset, maxpgf, nimages, &
1883                               tasks, rval, atom_pair_send, atom_pair_recv, &
1884                               symmetric, reorder_rs_grid_ranks, skip_load_balance_distributed)
1885
1886      TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1887         POINTER                                         :: rs_descs
1888      INTEGER                                            :: ntasks, natoms, maxset, maxpgf, nimages
1889      INTEGER(KIND=int_8), DIMENSION(:, :), POINTER      :: tasks
1890      REAL(KIND=dp), DIMENSION(:, :), POINTER            :: rval
1891      INTEGER(KIND=int_8), DIMENSION(:), POINTER         :: atom_pair_send, atom_pair_recv
1892      LOGICAL, INTENT(IN)                                :: symmetric, reorder_rs_grid_ranks, &
1893                                                            skip_load_balance_distributed
1894
1895      CHARACTER(LEN=*), PARAMETER :: routineN = 'distribute_tasks', &
1896         routineP = moduleN//':'//routineN
1897
1898      INTEGER                                            :: handle, igrid_level, irank, k, &
1899                                                            ntasks_recv
1900      INTEGER(KIND=int_8)                                :: load_gap, max_load, replicated_load
1901      INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:)     :: taskid, total_loads, total_loads_tmp, &
1902                                                            trial_loads
1903      INTEGER(KIND=int_8), DIMENSION(:, :), POINTER      :: loads, tasks_recv
1904      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: index, real2virtual, total_index
1905      LOGICAL                                            :: distributed_grids, fixed_first_grid
1906      REAL(KIND=dp), DIMENSION(:, :), POINTER            :: rval_recv
1907      TYPE(realspace_grid_desc_type), POINTER            :: desc
1908
1909      CALL timeset(routineN, handle)
1910
1911      CPASSERT(ASSOCIATED(tasks))
1912
1913      ! *** figure out if we have distributed grids
1914      distributed_grids = .FALSE.
1915      DO igrid_level = 1, SIZE(rs_descs)
1916         IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
1917            distributed_grids = .TRUE.
1918         ENDIF
1919      END DO
1920      desc => rs_descs(1)%rs_desc
1921
1922      IF (distributed_grids) THEN
1923
1924         ALLOCATE (loads(0:desc%group_size - 1, SIZE(rs_descs)))
1925         ALLOCATE (total_loads(0:desc%group_size - 1))
1926
1927         total_loads = 0
1928
1929         ! First round of balancing on the distributed grids
1930         ! we just balance each of the distributed grids independently
1931         DO igrid_level = 1, SIZE(rs_descs)
1932            IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
1933
1934               IF (.NOT. skip_load_balance_distributed) &
1935                  CALL load_balance_distributed(tasks, ntasks, rs_descs, igrid_level, nimages, natoms, maxset, maxpgf)
1936
1937               CALL get_current_loads(loads(:, igrid_level), rs_descs, igrid_level, ntasks, nimages, natoms, maxset, maxpgf, &
1938                                      tasks, use_reordered_ranks=.FALSE.)
1939
1940               total_loads(:) = total_loads + loads(:, igrid_level)
1941
1942            END IF
1943         END DO
1944
1945         ! calculate the total load of replicated tasks, so we can decide if it is worth
1946         ! reordering the distributed grid levels
1947
1948         replicated_load = 0
1949         DO igrid_level = 1, SIZE(rs_descs)
1950            IF (.NOT. rs_descs(igrid_level)%rs_desc%distributed) THEN
1951               CALL get_current_loads(loads(:, igrid_level), rs_descs, igrid_level, ntasks, nimages, natoms, maxset, maxpgf, &
1952                                      tasks, use_reordered_ranks=.FALSE.)
1953               replicated_load = replicated_load + SUM(loads(:, igrid_level))
1954            END IF
1955         END DO
1956
1957         !IF (desc%my_pos==0) THEN
1958         ! WRITE(*,*) "Total replicated load is ",replicated_load
1959         !END IF
1960
1961         ! Now we adjust the rank ordering based on the current loads
1962         ! we leave the first distributed level and all the replicated levels in the default order
1963         IF (reorder_rs_grid_ranks) THEN
1964            fixed_first_grid = .FALSE.
1965            DO igrid_level = 1, SIZE(rs_descs)
1966               IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
1967                  IF (fixed_first_grid .EQV. .FALSE.) THEN
1968                     total_loads(:) = loads(:, igrid_level)
1969                     fixed_first_grid = .TRUE.
1970                  ELSE
1971                     ALLOCATE (trial_loads(0:desc%group_size - 1))
1972
1973                     trial_loads(:) = total_loads + loads(:, igrid_level)
1974                     max_load = MAXVAL(trial_loads)
1975                     load_gap = 0
1976                     DO irank = 0, desc%group_size - 1
1977                        load_gap = load_gap + max_load - trial_loads(irank)
1978                     END DO
1979
1980                     ! If there is not enough replicated load to load balance well enough
1981                     ! then we will reorder this grid level
1982                     IF (load_gap > replicated_load*1.05_dp) THEN
1983
1984                        ALLOCATE (INDEX(0:desc%group_size - 1))
1985                        ALLOCATE (total_index(0:desc%group_size - 1))
1986                        ALLOCATE (total_loads_tmp(0:desc%group_size - 1))
1987                        ALLOCATE (real2virtual(0:desc%group_size - 1))
1988
1989                        total_loads_tmp(:) = total_loads
1990                        CALL sort(total_loads_tmp, desc%group_size, total_index)
1991                        CALL sort(loads(:, igrid_level), desc%group_size, index)
1992
1993                        ! Reorder so that the rank with smallest load on this grid level is paired with
1994                        ! the highest load in total
1995                        DO irank = 0, desc%group_size - 1
1996                           total_loads(total_index(irank) - 1) = total_loads(total_index(irank) - 1) + &
1997                                                                 loads(desc%group_size - irank - 1, igrid_level)
1998                           real2virtual(total_index(irank) - 1) = INDEX(desc%group_size - irank - 1) - 1
1999                        END DO
2000
2001                        CALL rs_grid_reorder_ranks(rs_descs(igrid_level)%rs_desc, real2virtual)
2002
2003                        DEALLOCATE (index)
2004                        DEALLOCATE (total_index)
2005                        DEALLOCATE (total_loads_tmp)
2006                        DEALLOCATE (real2virtual)
2007                     ELSE
2008                        total_loads(:) = trial_loads
2009                     ENDIF
2010
2011                     DEALLOCATE (trial_loads)
2012
2013                  ENDIF
2014               ENDIF
2015            END DO
2016         END IF
2017
2018         ! Now we use the replicated tasks to balance out the rest of the load
2019         CALL load_balance_replicated(rs_descs, ntasks, tasks, nimages, natoms, maxset, maxpgf)
2020
2021         !total_loads = 0
2022         !DO igrid_level=1,SIZE(rs_descs)
2023         !  CALL get_current_loads(loads(:,igrid_level), rs_descs, igrid_level, ntasks, nimages, natoms, maxset,maxpgf, &
2024         !                         tasks, use_reordered_ranks=.TRUE.)
2025         !  total_loads = total_loads + loads(:, igrid_level)
2026         !END DO
2027
2028         !IF (desc%my_pos==0) THEN
2029         !  WRITE(*,*) ""
2030         !  WRITE(*,*) "At the end of the load balancing procedure"
2031         !  WRITE(*,*) "Maximum  load:",MAXVAL(total_loads)
2032         !  WRITE(*,*) "Average  load:",SUM(total_loads)/SIZE(total_loads)
2033         !  WRITE(*,*) "Minimum  load:",MINVAL(total_loads)
2034         !ENDIF
2035
2036         ! given a list of tasks, this will do the needed reshuffle so that all tasks will be local
2037         CALL create_local_tasks(rs_descs, ntasks, tasks, rval, ntasks_recv, tasks_recv, rval_recv)
2038
2039         !
2040         ! tasks list are complete, we can compute the list of atomic blocks (atom pairs)
2041         ! we will be sending. These lists are needed for redistribute_matrix.
2042         !
2043         ALLOCATE (atom_pair_send(ntasks))
2044         CALL get_atom_pair(atom_pair_send, tasks, send=.TRUE., &
2045                            symmetric=symmetric, natoms=natoms, maxset=maxset, maxpgf=maxpgf, &
2046                            nimages=nimages, rs_descs=rs_descs)
2047
2048         ! natom_send=SIZE(atom_pair_send)
2049         ! CALL mp_sum(natom_send,desc%group)
2050         ! IF (desc%my_pos==0) THEN
2051         !     WRITE(*,*) ""
2052         !     WRITE(*,*) "Total number of atomic blocks to be send:",natom_send
2053         ! ENDIF
2054
2055         ALLOCATE (atom_pair_recv(ntasks_recv))
2056         CALL get_atom_pair(atom_pair_recv, tasks_recv, send=.FALSE., &
2057                            symmetric=symmetric, natoms=natoms, maxset=maxset, maxpgf=maxpgf, &
2058                            nimages=nimages, rs_descs=rs_descs)
2059
2060         ! cleanup, at this point we  don't need the original tasks & rvals anymore
2061         DEALLOCATE (tasks)
2062         DEALLOCATE (rval)
2063         DEALLOCATE (loads)
2064         DEALLOCATE (total_loads)
2065
2066      ELSE
2067
2068         tasks_recv => tasks
2069         ntasks_recv = ntasks
2070         rval_recv => rval
2071
2072      ENDIF
2073
2074      !
2075      ! here we sort the task list we will process locally.
2076      ! the sort is determined by pair2int
2077      !
2078      rval => rval_recv
2079
2080      ALLOCATE (taskid(ntasks_recv))
2081      ALLOCATE (INDEX(ntasks_recv))
2082
2083      taskid(:) = tasks_recv(3, 1:ntasks_recv)
2084      CALL sort(taskid, SIZE(taskid), index)
2085
2086      DO k = 1, SIZE(tasks_recv, 1)
2087         tasks_recv(k, 1:ntasks_recv) = tasks_recv(k, index)
2088      ENDDO
2089
2090      DO k = 1, SIZE(rval, 1)
2091         rval(k, 1:ntasks_recv) = rval(k, index)
2092      ENDDO
2093
2094      DEALLOCATE (taskid)
2095      DEALLOCATE (index)
2096
2097      !
2098      ! final lists are ready
2099      !
2100
2101      tasks => tasks_recv
2102      ntasks = ntasks_recv
2103
2104      CALL timestop(handle)
2105
2106   END SUBROUTINE distribute_tasks
2107
2108! **************************************************************************************************
2109!> \brief ...
2110!> \param atom_pair ...
2111!> \param my_tasks ...
2112!> \param send ...
2113!> \param symmetric ...
2114!> \param natoms ...
2115!> \param maxset ...
2116!> \param maxpgf ...
2117!> \param nimages ...
2118!> \param rs_descs ...
2119! **************************************************************************************************
2120   SUBROUTINE get_atom_pair(atom_pair, my_tasks, send, symmetric, natoms, maxset, &
2121                            maxpgf, nimages, rs_descs)
2122
2123      INTEGER(KIND=int_8), DIMENSION(:), POINTER         :: atom_pair
2124      INTEGER(KIND=int_8), DIMENSION(:, :), POINTER      :: my_tasks
2125      LOGICAL                                            :: send, symmetric
2126      INTEGER                                            :: natoms, maxset, maxpgf, nimages
2127      TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
2128         POINTER                                         :: rs_descs
2129
2130      CHARACTER(LEN=*), PARAMETER :: routineN = 'get_atom_pair', routineP = moduleN//':'//routineN
2131
2132      INTEGER                                            :: acol, arow, i, iatom, ilevel, img, ipgf, &
2133                                                            iset, j, jatom, jpgf, jset
2134      INTEGER(KIND=int_8)                                :: natom8, nim8
2135      INTEGER, DIMENSION(:), POINTER                     :: index
2136
2137      ! calculate list of atom pairs
2138      ! fill pair list taking into acount symmetry
2139
2140      natom8 = natoms
2141      nim8 = nimages
2142
2143      atom_pair = 0
2144      DO i = 1, SIZE(atom_pair)
2145         CALL int2pair(my_tasks(3, i), ilevel, img, iatom, jatom, iset, jset, ipgf, jpgf, nimages, natoms, maxset, maxpgf)
2146         IF (symmetric) THEN
2147            IF (iatom .LE. jatom) THEN
2148               arow = iatom
2149               acol = jatom
2150            ELSE
2151               arow = jatom
2152               acol = iatom
2153            ENDIF
2154         ELSE
2155            arow = iatom
2156            acol = jatom
2157         ENDIF
2158         IF (send) THEN
2159            ! If sending, we need to use the 'real rank' as the pair has to be sent to the process which
2160            ! actually has the correct part of the rs_grid to do the mapping
2161            atom_pair(i) = rs_descs(ilevel)%rs_desc%virtual2real(decode_rank(my_tasks(1, i), SIZE(rs_descs))) &
2162                           *natom8*natom8*nim8 + (arow - 1)*natom8*nim8 + (acol - 1)*nim8 + (img - 1)
2163         ELSE
2164            ! If we are recieving, then no conversion is needed as the rank is that of the process with the
2165            ! required matrix block, and the ordering of the rs grid is irrelevant
2166            atom_pair(i) = decode_rank(my_tasks(2, i), SIZE(rs_descs)) &
2167                           *natom8*natom8*nim8 + (arow - 1)*natom8*nim8 + (acol - 1)*nim8 + (img - 1)
2168         ENDIF
2169      ENDDO
2170
2171      ALLOCATE (INDEX(SIZE(atom_pair)))
2172
2173      ! find unique atom pairs that I'm sending/receiving
2174      CALL sort(atom_pair, SIZE(atom_pair), index)
2175
2176      DEALLOCATE (index)
2177
2178      IF (SIZE(atom_pair) > 1) THEN
2179         j = 1
2180         ! first atom pair must be allowed
2181         DO i = 2, SIZE(atom_pair)
2182            IF (atom_pair(i) .GT. atom_pair(i - 1)) THEN
2183               j = j + 1
2184               atom_pair(j) = atom_pair(i)
2185            ENDIF
2186         ENDDO
2187         ! reallocate the atom pair list
2188         CALL reallocate(atom_pair, 1, j)
2189      ENDIF
2190
2191   END SUBROUTINE get_atom_pair
2192
2193! **************************************************************************************************
2194!> \brief redistributes the matrix so that it can be used in realspace operations
2195!>        i.e. according to the task lists for collocate and integrate.
2196!>        This routine can become a bottleneck in large calculations.
2197!> \param rs_descs ...
2198!> \param pmats ...
2199!> \param atom_pair_send ...
2200!> \param atom_pair_recv ...
2201!> \param natoms ...
2202!> \param nimages ...
2203!> \param scatter ...
2204!> \param hmats ...
2205! **************************************************************************************************
2206   SUBROUTINE rs_distribute_matrix(rs_descs, pmats, atom_pair_send, atom_pair_recv, &
2207                                   natoms, nimages, scatter, hmats)
2208
2209      TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
2210         POINTER                                         :: rs_descs
2211      TYPE(dbcsr_p_type), DIMENSION(:), POINTER          :: pmats
2212      INTEGER(KIND=int_8), DIMENSION(:), POINTER         :: atom_pair_send, atom_pair_recv
2213      INTEGER                                            :: natoms, nimages
2214      LOGICAL                                            :: scatter
2215      TYPE(dbcsr_p_type), DIMENSION(:), OPTIONAL, &
2216         POINTER                                         :: hmats
2217
2218      CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_distribute_matrix', &
2219         routineP = moduleN//':'//routineN
2220
2221      INTEGER                                            :: acol, arow, handle, i, img, j, k, l, me, &
2222                                                            nblkcols_total, nblkrows_total, ncol, &
2223                                                            nrow, nthread, nthread_left
2224      INTEGER(KIND=int_8)                                :: natom8, nim8, pair
2225      INTEGER, ALLOCATABLE, DIMENSION(:) :: first_col, first_row, last_col, last_row, recv_disps, &
2226         recv_pair_count, recv_pair_disps, recv_sizes, send_disps, send_pair_count, &
2227         send_pair_disps, send_sizes
2228      INTEGER, DIMENSION(:), POINTER                     :: col_blk_size, row_blk_size
2229      LOGICAL                                            :: found
2230      REAL(KIND=dp), ALLOCATABLE, DIMENSION(:)           :: recv_buf_r, send_buf_r
2231      REAL(KIND=dp), DIMENSION(:, :), POINTER            :: h_block, p_block
2232      TYPE(dbcsr_type), POINTER                          :: hmat, pmat
2233      TYPE(realspace_grid_desc_type), POINTER            :: desc
2234
2235!$    INTEGER(kind=omp_lock_kind), ALLOCATABLE, DIMENSION(:) :: locks
2236
2237      CALL timeset(routineN, handle)
2238
2239      IF (.NOT. scatter) THEN
2240         CPASSERT(PRESENT(hmats))
2241      END IF
2242
2243      desc => rs_descs(1)%rs_desc
2244      me = desc%my_pos + 1
2245
2246      ! allocate local arrays
2247      ALLOCATE (send_sizes(desc%group_size))
2248      ALLOCATE (recv_sizes(desc%group_size))
2249      ALLOCATE (send_disps(desc%group_size))
2250      ALLOCATE (recv_disps(desc%group_size))
2251      ALLOCATE (send_pair_count(desc%group_size))
2252      ALLOCATE (recv_pair_count(desc%group_size))
2253      ALLOCATE (send_pair_disps(desc%group_size))
2254      ALLOCATE (recv_pair_disps(desc%group_size))
2255
2256      pmat => pmats(1)%matrix
2257      CALL dbcsr_get_info(pmat, &
2258                          row_blk_size=row_blk_size, &
2259                          col_blk_size=col_blk_size, &
2260                          nblkrows_total=nblkrows_total, &
2261                          nblkcols_total=nblkcols_total)
2262      ALLOCATE (first_row(nblkrows_total), last_row(nblkrows_total), &
2263                first_col(nblkcols_total), last_col(nblkcols_total))
2264      CALL dbcsr_convert_sizes_to_offsets(row_blk_size, first_row, last_row)
2265      CALL dbcsr_convert_sizes_to_offsets(col_blk_size, first_col, last_col)
2266      ! set up send buffer sizes
2267      natom8 = natoms
2268      nim8 = nimages
2269
2270      send_sizes = 0
2271      send_pair_count = 0
2272      DO i = 1, SIZE(atom_pair_send)
2273
2274         ! proc we're sending this block to
2275         k = INT(atom_pair_send(i)/(nim8*natom8**2)) + 1
2276
2277         pair = MOD(atom_pair_send(i), nim8*natom8**2)
2278
2279         arow = INT(pair/(nim8*natom8)) + 1
2280         acol = INT(MOD(pair, nim8*natom8)/nim8) + 1
2281
2282         nrow = last_row(arow) - first_row(arow) + 1
2283         ncol = last_col(acol) - first_col(acol) + 1
2284
2285         send_sizes(k) = send_sizes(k) + nrow*ncol
2286         send_pair_count(k) = send_pair_count(k) + 1
2287
2288      ENDDO
2289
2290      send_disps = 0
2291      send_pair_disps = 0
2292      DO i = 2, desc%group_size
2293         send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
2294         send_pair_disps(i) = send_pair_disps(i - 1) + send_pair_count(i - 1)
2295      ENDDO
2296
2297      ALLOCATE (send_buf_r(SUM(send_sizes)))
2298
2299      ! set up recv buffer
2300
2301      recv_sizes = 0
2302      recv_pair_count = 0
2303      DO i = 1, SIZE(atom_pair_recv)
2304
2305         ! proc we're receiving this data from
2306         k = INT(atom_pair_recv(i)/(nim8*natom8**2)) + 1
2307
2308         pair = MOD(atom_pair_recv(i), nim8*natom8**2)
2309
2310         arow = INT(pair/(nim8*natom8)) + 1
2311         acol = INT(MOD(pair, nim8*natom8)/nim8) + 1
2312
2313         nrow = last_row(arow) - first_row(arow) + 1
2314         ncol = last_col(acol) - first_col(acol) + 1
2315
2316         recv_sizes(k) = recv_sizes(k) + nrow*ncol
2317         recv_pair_count(k) = recv_pair_count(k) + 1
2318
2319      ENDDO
2320
2321      recv_disps = 0
2322      recv_pair_disps = 0
2323      DO i = 2, desc%group_size
2324         recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
2325         recv_pair_disps(i) = recv_pair_disps(i - 1) + recv_pair_count(i - 1)
2326      ENDDO
2327      ALLOCATE (recv_buf_r(SUM(recv_sizes)))
2328
2329!$OMP PARALLEL DEFAULT(NONE), &
2330!$OMP          SHARED(desc,send_pair_count,send_pair_disps,natom8,nim8,nimages),&
2331!$OMP          SHARED(last_row,first_row,last_col,first_col),&
2332!$OMP          SHARED(pmats,send_buf_r,send_disps,send_sizes),&
2333!$OMP          SHARED(atom_pair_send,me,hmats,nblkrows_total),&
2334!$OMP          SHARED(atom_pair_recv,recv_buf_r,scatter,recv_pair_disps), &
2335!$OMP          SHARED(recv_sizes,recv_disps,recv_pair_count,locks), &
2336!$OMP          PRIVATE(i,img,pair,arow,acol,nrow,ncol,p_block,found,j,k,l),&
2337!$OMP          PRIVATE(nthread,h_block,nthread_left,hmat,pmat)
2338
2339      nthread = 1
2340!$    nthread = omp_get_num_threads()
2341      nthread_left = 1
2342!$    nthread_left = MAX(1, nthread - 1)
2343
2344      ! do packing
2345!$OMP DO schedule(guided)
2346      DO l = 1, desc%group_size
2347         IF (l .EQ. me) CYCLE
2348         send_sizes(l) = 0
2349         DO i = 1, send_pair_count(l)
2350            pair = MOD(atom_pair_send(send_pair_disps(l) + i), nim8*natom8**2)
2351
2352            arow = INT(pair/(nim8*natom8)) + 1
2353            acol = INT(MOD(pair, nim8*natom8)/nim8) + 1
2354            img = INT(MOD(pair, nim8)) + 1
2355
2356            nrow = last_row(arow) - first_row(arow) + 1
2357            ncol = last_col(acol) - first_col(acol) + 1
2358
2359            pmat => pmats(img)%matrix
2360            CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, &
2361                                   block=p_block, found=found)
2362            CPASSERT(found)
2363
2364            DO k = 1, ncol
2365               DO j = 1, nrow
2366                  send_buf_r(send_disps(l) + send_sizes(l) + j + (k - 1)*nrow) = p_block(j, k)
2367               ENDDO
2368            ENDDO
2369            send_sizes(l) = send_sizes(l) + nrow*ncol
2370         ENDDO
2371      ENDDO
2372!$OMP END DO
2373
2374      IF (.NOT. scatter) THEN
2375         ! We need locks to protect concurrent summation into H
2376!$OMP SINGLE
2377!$       ALLOCATE (locks(nthread*10))
2378!$OMP END SINGLE
2379
2380!$OMP DO
2381!$       do i = 1, nthread*10
2382!$          call omp_init_lock(locks(i))
2383!$       end do
2384!$OMP END DO
2385      END IF
2386
2387!$OMP MASTER
2388      ! do communication
2389      CALL mp_alltoall(send_buf_r, send_sizes, send_disps, &
2390                       recv_buf_r, recv_sizes, recv_disps, desc%group)
2391!$OMP END MASTER
2392
2393      ! If this is a scatter, then no need to copy local blocks,
2394      ! If not, we sum them directly into H (bypassing the alltoall)
2395      IF (.NOT. scatter) THEN
2396
2397         ! Distribute work over remaining threads assuming one is still in the alltoall
2398!$OMP DO schedule(dynamic,MAX(1,send_pair_count(me)/nthread_left))
2399         DO i = 1, send_pair_count(me)
2400            pair = MOD(atom_pair_send(send_pair_disps(me) + i), nim8*natom8**2)
2401
2402            arow = INT(pair/(nim8*natom8)) + 1
2403            acol = INT(MOD(pair, nim8*natom8)/nim8) + 1
2404            img = INT(MOD(pair, nim8)) + 1
2405
2406            nrow = last_row(arow) - first_row(arow) + 1
2407            ncol = last_col(acol) - first_col(acol) + 1
2408
2409            hmat => hmats(img)%matrix
2410            pmat => pmats(img)%matrix
2411            CALL dbcsr_get_block_p(matrix=hmat, row=arow, col=acol, &
2412                                   BLOCK=h_block, found=found)
2413            CPASSERT(found)
2414            CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, &
2415                                   BLOCK=p_block, found=found)
2416            CPASSERT(found)
2417
2418!$          call omp_set_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
2419            DO k = 1, ncol
2420               DO j = 1, nrow
2421                  h_block(j, k) = h_block(j, k) + p_block(j, k)
2422               ENDDO
2423            ENDDO
2424!$          call omp_unset_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
2425         ENDDO
2426!$OMP END DO
2427      ELSE
2428         ! We will insert new blocks into P, so create mutable work matrices
2429         DO img = 1, nimages
2430            pmat => pmats(img)%matrix
2431            CALL dbcsr_work_create(pmat, work_mutable=.TRUE., &
2432                                   nblks_guess=SIZE(atom_pair_recv)/nthread, sizedata_guess=SIZE(recv_buf_r)/nthread, &
2433                                   n=nthread)
2434         END DO
2435      END IF
2436
2437! wait for comm and setup to finish
2438!$OMP BARRIER
2439
2440      !do unpacking
2441!$OMP DO schedule(guided)
2442      DO l = 1, desc%group_size
2443         IF (l .EQ. me) CYCLE
2444         recv_sizes(l) = 0
2445         DO i = 1, recv_pair_count(l)
2446            pair = MOD(atom_pair_recv(recv_pair_disps(l) + i), nim8*natom8**2)
2447
2448            arow = INT(pair/(nim8*natom8)) + 1
2449            acol = INT(MOD(pair, nim8*natom8)/nim8) + 1
2450            img = INT(MOD(pair, nim8)) + 1
2451
2452            nrow = last_row(arow) - first_row(arow) + 1
2453            ncol = last_col(acol) - first_col(acol) + 1
2454
2455            pmat => pmats(img)%matrix
2456            NULLIFY (p_block)
2457            CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, &
2458                                   BLOCK=p_block, found=found)
2459
2460            IF (PRESENT(hmats)) THEN
2461               hmat => hmats(img)%matrix
2462               CALL dbcsr_get_block_p(matrix=hmat, row=arow, col=acol, &
2463                                      BLOCK=h_block, found=found)
2464               CPASSERT(found)
2465            ENDIF
2466
2467            IF (scatter .AND. .NOT. ASSOCIATED(p_block)) THEN
2468               CALL dbcsr_put_block(pmat, arow, acol, &
2469                                    block=recv_buf_r(recv_disps(l) + recv_sizes(l) + 1:recv_disps(l) + recv_sizes(l) + nrow*ncol))
2470            END IF
2471            IF (.NOT. scatter) THEN
2472!$             call omp_set_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
2473               DO k = 1, ncol
2474                  DO j = 1, nrow
2475                     h_block(j, k) = h_block(j, k) + recv_buf_r(recv_disps(l) + recv_sizes(l) + j + (k - 1)*nrow)
2476                  ENDDO
2477               ENDDO
2478!$             call omp_unset_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
2479            ENDIF
2480            recv_sizes(l) = recv_sizes(l) + nrow*ncol
2481         ENDDO
2482      ENDDO
2483!$OMP END DO
2484
2485!$    IF (.not. scatter) THEN
2486!$OMP DO
2487!$       do i = 1, nthread*10
2488!$          call omp_destroy_lock(locks(i))
2489!$       end do
2490!$OMP END DO
2491!$    END IF
2492
2493!$OMP SINGLE
2494!$    IF (.not. scatter) THEN
2495!$       DEALLOCATE (locks)
2496!$    END IF
2497!$OMP END SINGLE NOWAIT
2498
2499      IF (scatter) THEN
2500         ! Blocks were added to P
2501         DO img = 1, nimages
2502            pmat => pmats(img)%matrix
2503            CALL dbcsr_finalize(pmat)
2504         END DO
2505      END IF
2506!$OMP END PARALLEL
2507
2508      DEALLOCATE (send_buf_r)
2509      DEALLOCATE (recv_buf_r)
2510
2511      DEALLOCATE (send_sizes)
2512      DEALLOCATE (recv_sizes)
2513      DEALLOCATE (send_disps)
2514      DEALLOCATE (recv_disps)
2515      DEALLOCATE (send_pair_count)
2516      DEALLOCATE (recv_pair_count)
2517      DEALLOCATE (send_pair_disps)
2518      DEALLOCATE (recv_pair_disps)
2519
2520      DEALLOCATE (first_row, last_row, first_col, last_col)
2521
2522      CALL timestop(handle)
2523
2524   END SUBROUTINE rs_distribute_matrix
2525
2526! **************************************************************************************************
2527!> \brief determines the rank of the processor who's real rs grid contains point
2528!> \param rs_desc ...
2529!> \param igrid_level ...
2530!> \param n_levels ...
2531!> \param cube_center ...
2532!> \param ntasks ...
2533!> \param tasks ...
2534!> \param lb_cube ...
2535!> \param ub_cube ...
2536!> \param added_tasks ...
2537!> \par History
2538!>      11.2007 created [MattW]
2539!>      10.2008 rewritten [Joost VandeVondele]
2540!> \author MattW
2541! **************************************************************************************************
2542   SUBROUTINE rs_find_node(rs_desc, igrid_level, n_levels, cube_center, ntasks, tasks, &
2543                           lb_cube, ub_cube, added_tasks)
2544
2545      TYPE(realspace_grid_desc_type), POINTER            :: rs_desc
2546      INTEGER, INTENT(IN)                                :: igrid_level, n_levels
2547      INTEGER, DIMENSION(3), INTENT(IN)                  :: cube_center
2548      INTEGER, INTENT(INOUT)                             :: ntasks
2549      INTEGER(KIND=int_8), DIMENSION(:, :), POINTER      :: tasks
2550      INTEGER, DIMENSION(3), INTENT(IN)                  :: lb_cube, ub_cube
2551      INTEGER, INTENT(OUT)                               :: added_tasks
2552
2553      CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_find_node', routineP = moduleN//':'//routineN
2554      INTEGER, PARAMETER                                 :: add_tasks = 1000
2555      REAL(kind=dp), PARAMETER                           :: mult_tasks = 2.0_dp
2556
2557      INTEGER :: bit_index, coord(3), curr_tasks, dest, i, icoord(3), idest, itask, ix, iy, iz, &
2558         lb_coord(3), lb_domain(3), lbc(3), ub_coord(3), ub_domain(3), ubc(3)
2559      INTEGER(KIND=int_8)                                :: bit_pattern
2560      LOGICAL                                            :: dir_periodic(3)
2561
2562      coord(1) = rs_desc%x2coord(cube_center(1))
2563      coord(2) = rs_desc%y2coord(cube_center(2))
2564      coord(3) = rs_desc%z2coord(cube_center(3))
2565      dest = rs_desc%coord2rank(coord(1), coord(2), coord(3))
2566
2567      ! the real cube coordinates
2568      lbc = lb_cube + cube_center
2569      ubc = ub_cube + cube_center
2570
2571      IF (ALL((rs_desc%lb_global(:, dest) - rs_desc%border) .LE. lbc) .AND. &
2572          ALL((rs_desc%ub_global(:, dest) + rs_desc%border) .GE. ubc)) THEN
2573         !standard distributed collocation/integration
2574         tasks(1, ntasks) = encode_rank(dest, igrid_level, n_levels)
2575         tasks(4, ntasks) = 1
2576         tasks(6, ntasks) = 0
2577         added_tasks = 1
2578
2579         ! here we figure out if there is an alternate location for this task
2580         ! i.e. even though the cube_center is not in the real local domain,
2581         ! it might fully fit in the halo of the neighbor
2582         ! if its radius is smaller than the maximum radius
2583         ! the list of possible neighbors is stored here as a bit pattern
2584         ! to reconstruct what the rank of the neigbor is,
2585         ! we can use (note this requires the correct rs_grid)
2586         !  IF (BTEST(bit_pattern,0)) rank=rs_grid_locate_rank(rs_desc,tasks(1,ntasks),(/-1,0,0/))
2587         !  IF (BTEST(bit_pattern,1)) rank=rs_grid_locate_rank(rs_desc,tasks(1,ntasks),(/+1,0,0/))
2588         !  IF (BTEST(bit_pattern,2)) rank=rs_grid_locate_rank(rs_desc,tasks(1,ntasks),(/0,-1,0/))
2589         !  IF (BTEST(bit_pattern,3)) rank=rs_grid_locate_rank(rs_desc,tasks(1,ntasks),(/0,+1,0/))
2590         !  IF (BTEST(bit_pattern,4)) rank=rs_grid_locate_rank(rs_desc,tasks(1,ntasks),(/0,0,-1/))
2591         !  IF (BTEST(bit_pattern,5)) rank=rs_grid_locate_rank(rs_desc,tasks(1,ntasks),(/0,0,+1/))
2592         bit_index = 0
2593         bit_pattern = 0
2594         DO i = 1, 3
2595            IF (rs_desc%perd(i) == 1) THEN
2596               bit_pattern = IBCLR(bit_pattern, bit_index)
2597               bit_index = bit_index + 1
2598               bit_pattern = IBCLR(bit_pattern, bit_index)
2599               bit_index = bit_index + 1
2600            ELSE
2601               ! fits the left neighbor ?
2602               IF (ubc(i) <= rs_desc%lb_global(i, dest) - 1 + rs_desc%border) THEN
2603                  bit_pattern = IBSET(bit_pattern, bit_index)
2604                  bit_index = bit_index + 1
2605               ELSE
2606                  bit_pattern = IBCLR(bit_pattern, bit_index)
2607                  bit_index = bit_index + 1
2608               ENDIF
2609               ! fits the right neighbor ?
2610               IF (lbc(i) >= rs_desc%ub_global(i, dest) + 1 - rs_desc%border) THEN
2611                  bit_pattern = IBSET(bit_pattern, bit_index)
2612                  bit_index = bit_index + 1
2613               ELSE
2614                  bit_pattern = IBCLR(bit_pattern, bit_index)
2615                  bit_index = bit_index + 1
2616               ENDIF
2617            ENDIF
2618         ENDDO
2619         tasks(6, ntasks) = bit_pattern
2620
2621      ELSE
2622         ! generalised collocation/integration needed
2623         ! first we figure out how many neighbors we have to add to include the lbc/ubc
2624         ! in the available domains (inclusive of halo points)
2625         ! first we 'ignore' periodic boundary conditions
2626         ! i.e. ub_coord-lb_coord+1 might be larger than group_dim
2627         lb_coord = coord
2628         ub_coord = coord
2629         lb_domain = rs_desc%lb_global(:, dest) - rs_desc%border
2630         ub_domain = rs_desc%ub_global(:, dest) + rs_desc%border
2631         DO i = 1, 3
2632            ! only if the grid is not periodic in this direction we need to take care of adding neighbors
2633            IF (rs_desc%perd(i) == 0) THEN
2634               ! if the domain lower bound is greater than the lbc we need to add the size of the neighbor domain
2635               DO
2636                  IF (lb_domain(i) > lbc(i)) THEN
2637                     lb_coord(i) = lb_coord(i) - 1
2638                     icoord = MODULO(lb_coord, rs_desc%group_dim)
2639                     idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
2640                     lb_domain(i) = lb_domain(i) - (rs_desc%ub_global(i, idest) - rs_desc%lb_global(i, idest) + 1)
2641                  ELSE
2642                     EXIT
2643                  ENDIF
2644               ENDDO
2645               ! same for the upper bound
2646               DO
2647                  IF (ub_domain(i) < ubc(i)) THEN
2648                     ub_coord(i) = ub_coord(i) + 1
2649                     icoord = MODULO(ub_coord, rs_desc%group_dim)
2650                     idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
2651                     ub_domain(i) = ub_domain(i) + (rs_desc%ub_global(i, idest) - rs_desc%lb_global(i, idest) + 1)
2652                  ELSE
2653                     EXIT
2654                  ENDIF
2655               ENDDO
2656            ENDIF
2657         ENDDO
2658
2659         ! some care is needed for the periodic boundaries
2660         DO i = 1, 3
2661            IF (ub_domain(i) - lb_domain(i) + 1 >= rs_desc%npts(i)) THEN
2662               dir_periodic(i) = .TRUE.
2663               lb_coord(i) = 0
2664               ub_coord(i) = rs_desc%group_dim(i) - 1
2665            ELSE
2666               dir_periodic(i) = .FALSE.
2667            ENDIF
2668         ENDDO
2669
2670         added_tasks = PRODUCT(ub_coord - lb_coord + 1)
2671         itask = ntasks
2672         ntasks = ntasks + added_tasks - 1
2673         IF (ntasks > SIZE(tasks, 2)) THEN
2674            curr_tasks = INT((SIZE(tasks, 2) + add_tasks)*mult_tasks)
2675            CALL reallocate(tasks, 1, 6, 1, curr_tasks)
2676         END IF
2677         DO iz = lb_coord(3), ub_coord(3)
2678         DO iy = lb_coord(2), ub_coord(2)
2679         DO ix = lb_coord(1), ub_coord(1)
2680            icoord = MODULO((/ix, iy, iz/), rs_desc%group_dim)
2681            idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
2682            tasks(1, itask) = encode_rank(idest, igrid_level, n_levels)
2683            tasks(4, itask) = 2
2684            tasks(6, itask) = 0
2685            ! encode the domain size for this task
2686            ! if the bit is set, we need to add the border in that direction
2687            IF (ix == lb_coord(1) .AND. .NOT. dir_periodic(1)) tasks(6, itask) = IBSET(tasks(6, itask), 0)
2688            IF (ix == ub_coord(1) .AND. .NOT. dir_periodic(1)) tasks(6, itask) = IBSET(tasks(6, itask), 1)
2689            IF (iy == lb_coord(2) .AND. .NOT. dir_periodic(2)) tasks(6, itask) = IBSET(tasks(6, itask), 2)
2690            IF (iy == ub_coord(2) .AND. .NOT. dir_periodic(2)) tasks(6, itask) = IBSET(tasks(6, itask), 3)
2691            IF (iz == lb_coord(3) .AND. .NOT. dir_periodic(3)) tasks(6, itask) = IBSET(tasks(6, itask), 4)
2692            IF (iz == ub_coord(3) .AND. .NOT. dir_periodic(3)) tasks(6, itask) = IBSET(tasks(6, itask), 5)
2693            itask = itask + 1
2694         ENDDO
2695         ENDDO
2696         ENDDO
2697      ENDIF
2698
2699   END SUBROUTINE rs_find_node
2700
2701! **************************************************************************************************
2702!> \brief utility functions for encoding the grid level with a rank, allowing
2703!>        different grid levels to maintain a different rank ordering without
2704!>        losing information.  These encoded_ints are stored in the tasks(1:2,:) array
2705!> \param rank ...
2706!> \param grid_level ...
2707!> \param n_levels ...
2708!> \return ...
2709!> \par History
2710!>      4.2009 created [Iain Bethune]
2711!>        (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
2712! **************************************************************************************************
2713   FUNCTION encode_rank(rank, grid_level, n_levels) RESULT(encoded_int)
2714
2715      INTEGER, INTENT(IN)                                :: rank, grid_level, n_levels
2716      INTEGER(KIND=int_8)                                :: encoded_int
2717
2718! ordered so can still sort by rank
2719
2720      encoded_int = rank*n_levels + grid_level - 1
2721
2722   END FUNCTION
2723
2724! **************************************************************************************************
2725!> \brief ...
2726!> \param encoded_int ...
2727!> \param n_levels ...
2728!> \return ...
2729! **************************************************************************************************
2730   FUNCTION decode_rank(encoded_int, n_levels) RESULT(rank)
2731
2732      INTEGER(KIND=int_8), INTENT(IN)                    :: encoded_int
2733      INTEGER, INTENT(IN)                                :: n_levels
2734      INTEGER                                            :: rank
2735
2736      rank = INT(encoded_int/n_levels)
2737
2738   END FUNCTION
2739
2740! **************************************************************************************************
2741!> \brief ...
2742!> \param encoded_int ...
2743!> \param n_levels ...
2744!> \return ...
2745! **************************************************************************************************
2746   FUNCTION decode_level(encoded_int, n_levels) RESULT(grid_level)
2747
2748      INTEGER(KIND=int_8), INTENT(IN)                    :: encoded_int
2749      INTEGER, INTENT(IN)                                :: n_levels
2750      INTEGER                                            :: grid_level
2751
2752      INTEGER(KIND=int_8)                                :: n_levels8
2753
2754      n_levels8 = n_levels
2755
2756      grid_level = INT(MODULO(encoded_int, n_levels8)) + 1
2757
2758   END FUNCTION decode_level
2759
2760END MODULE task_list_methods
2761