1!--------------------------------------------------------------------------------------------------! 2! CP2K: A general program to perform molecular dynamics simulations ! 3! Copyright (C) 2000 - 2019 CP2K developers group ! 4!--------------------------------------------------------------------------------------------------! 5 6! ************************************************************************************************** 7!> \brief generate the tasks lists used by collocate and integrate routines 8!> \par History 9!> 01.2008 [Joost VandeVondele] refactered out of qs_collocate / qs_integrate 10!> \author Joost VandeVondele 11! ************************************************************************************************** 12MODULE task_list_methods 13 USE ao_util, ONLY: exp_radius_very_extended 14 USE basis_set_types, ONLY: get_gto_basis_set,& 15 gto_basis_set_p_type,& 16 gto_basis_set_type 17 USE cell_types, ONLY: cell_type,& 18 pbc 19 USE cp_control_types, ONLY: dft_control_type 20 USE cube_utils, ONLY: compute_cube_center,& 21 cube_info_type,& 22 return_cube,& 23 return_cube_nonortho 24 USE dbcsr_api, ONLY: dbcsr_convert_sizes_to_offsets,& 25 dbcsr_finalize,& 26 dbcsr_get_block_p,& 27 dbcsr_get_info,& 28 dbcsr_p_type,& 29 dbcsr_put_block,& 30 dbcsr_type,& 31 dbcsr_work_create 32 USE gaussian_gridlevels, ONLY: gaussian_gridlevel,& 33 gridlevel_info_type 34 USE kinds, ONLY: default_string_length,& 35 dp,& 36 int_8 37 USE kpoint_types, ONLY: get_kpoint_info,& 38 kpoint_type 39 USE memory_utilities, ONLY: reallocate 40 USE message_passing, ONLY: mp_allgather,& 41 mp_alltoall,& 42 mp_bcast,& 43 mp_sum,& 44 mp_sum_partial 45 USE particle_types, ONLY: particle_type 46 USE pw_env_types, ONLY: pw_env_get,& 47 pw_env_type 48 USE qs_kind_types, ONLY: get_qs_kind,& 49 qs_kind_type 50 USE qs_ks_types, ONLY: get_ks_env,& 51 qs_ks_env_type 52 USE qs_neighbor_list_types, ONLY: get_iterator_info,& 53 neighbor_list_iterate,& 54 neighbor_list_iterator_create,& 55 neighbor_list_iterator_p_type,& 56 neighbor_list_iterator_release,& 57 neighbor_list_set_p_type 58 USE realspace_grid_types, ONLY: realspace_grid_desc_p_type,& 59 realspace_grid_desc_type,& 60 realspace_grid_p_type,& 61 rs_grid_create,& 62 rs_grid_locate_rank,& 63 rs_grid_release,& 64 rs_grid_reorder_ranks 65 USE task_list_types, ONLY: task_list_type 66 USE util, ONLY: sort 67 68!$ USE OMP_LIB, ONLY: omp_destroy_lock, omp_get_num_threads, omp_init_lock, & 69!$ omp_lock_kind, omp_set_lock, omp_unset_lock 70#include "./base/base_uses.f90" 71 72 IMPLICIT NONE 73 74 LOGICAL, PRIVATE, PARAMETER :: debug_this_module = .FALSE. 75 76 PRIVATE 77 78 CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'task_list_methods' 79 80 PUBLIC :: generate_qs_task_list, & 81 task_list_inner_loop 82 PUBLIC :: distribute_tasks, & 83 int2pair, & 84 rs_distribute_matrix 85 86CONTAINS 87 88! ************************************************************************************************** 89!> \brief ... 90!> \param ks_env ... 91!> \param task_list ... 92!> \param reorder_rs_grid_ranks Flag that indicates if this routine should 93!> or should not overwrite the rs descriptor (see comment below) 94!> \param skip_load_balance_distributed ... 95!> \param soft_valid ... 96!> \param basis_type ... 97!> \param pw_env_external ... 98!> \param sab_orb_external ... 99!> \par History 100!> 01.2008 factored out of calculate_rho_elec [Joost VandeVondele] 101!> 04.2010 divides tasks into grid levels and atom pairs for integrate/collocate [Iain Bethune] 102!> (c) The Numerical Algorithms Group (NAG) Ltd, 2010 on behalf of the HECToR project 103!> 06.2015 adjusted to be used with multiple images (k-points) [JGH] 104!> \note If this routine is called several times with different task lists, 105!> the default behaviour is to re-optimize the grid ranks and overwrite 106!> the rs descriptor and grids. reorder_rs_grid_ranks = .FALSE. prevents the code 107!> of performing a new optimization by leaving the rank order in 108!> its current state. 109! ************************************************************************************************** 110 SUBROUTINE generate_qs_task_list(ks_env, task_list, & 111 reorder_rs_grid_ranks, skip_load_balance_distributed, & 112 soft_valid, basis_type, pw_env_external, sab_orb_external) 113 114 TYPE(qs_ks_env_type), POINTER :: ks_env 115 TYPE(task_list_type), POINTER :: task_list 116 LOGICAL, INTENT(IN) :: reorder_rs_grid_ranks, & 117 skip_load_balance_distributed 118 LOGICAL, INTENT(IN), OPTIONAL :: soft_valid 119 CHARACTER(LEN=*), INTENT(IN), OPTIONAL :: basis_type 120 TYPE(pw_env_type), OPTIONAL, POINTER :: pw_env_external 121 TYPE(neighbor_list_set_p_type), DIMENSION(:), & 122 OPTIONAL, POINTER :: sab_orb_external 123 124 CHARACTER(LEN=*), PARAMETER :: routineN = 'generate_qs_task_list', & 125 routineP = moduleN//':'//routineN 126 INTEGER, PARAMETER :: max_tasks = 2000 127 128 CHARACTER(LEN=default_string_length) :: my_basis_type 129 INTEGER :: cindex, curr_tasks, handle, i, iatom, iatom_old, igrid_level, igrid_level_old, & 130 ikind, ilevel, img, img_old, inode, ipair, ipgf, iset, itask, jatom, jatom_old, jkind, & 131 jpgf, jset, maxpgf, maxset, natoms, nimages, nkind, nseta, nsetb 132 INTEGER(kind=int_8), DIMENSION(:, :), POINTER :: tasks 133 INTEGER, ALLOCATABLE, DIMENSION(:, :, :) :: blocks 134 INTEGER, DIMENSION(3) :: cellind 135 INTEGER, DIMENSION(:), POINTER :: la_max, la_min, lb_max, lb_min, npgfa, & 136 npgfb 137 INTEGER, DIMENSION(:, :, :), POINTER :: cell_to_index 138 LOGICAL :: dokp, my_soft 139 REAL(KIND=dp) :: kind_radius_a, kind_radius_b 140 REAL(KIND=dp), DIMENSION(3) :: ra, rab 141 REAL(KIND=dp), DIMENSION(:), POINTER :: set_radius_a, set_radius_b 142 REAL(KIND=dp), DIMENSION(:, :), POINTER :: rpgfa, rpgfb, zeta, zetb 143 TYPE(cell_type), POINTER :: cell 144 TYPE(cube_info_type), DIMENSION(:), POINTER :: cube_info 145 TYPE(dft_control_type), POINTER :: dft_control 146 TYPE(gridlevel_info_type), POINTER :: gridlevel_info 147 TYPE(gto_basis_set_p_type), DIMENSION(:), POINTER :: basis_set_list 148 TYPE(gto_basis_set_type), POINTER :: basis_set_a, basis_set_b, orb_basis_set 149 TYPE(kpoint_type), POINTER :: kpoints 150 TYPE(neighbor_list_iterator_p_type), & 151 DIMENSION(:), POINTER :: nl_iterator 152 TYPE(neighbor_list_set_p_type), DIMENSION(:), & 153 POINTER :: sab_orb 154 TYPE(particle_type), DIMENSION(:), POINTER :: particle_set 155 TYPE(pw_env_type), POINTER :: pw_env 156 TYPE(qs_kind_type), DIMENSION(:), POINTER :: qs_kind_set 157 TYPE(qs_kind_type), POINTER :: qs_kind 158 TYPE(realspace_grid_desc_p_type), DIMENSION(:), & 159 POINTER :: rs_descs 160 TYPE(realspace_grid_p_type), DIMENSION(:), POINTER :: rs_grids 161 162 CALL timeset(routineN, handle) 163 164 ! by default, the full density is calculated 165 my_soft = .FALSE. 166 IF (PRESENT(soft_valid)) my_soft = soft_valid 167 168 CALL get_ks_env(ks_env, & 169 qs_kind_set=qs_kind_set, & 170 cell=cell, & 171 particle_set=particle_set, & 172 dft_control=dft_control) 173 174 IF (PRESENT(basis_type)) THEN 175 my_basis_type = basis_type 176 ELSE 177 my_basis_type = "ORB" 178 END IF 179 180 CALL get_ks_env(ks_env, sab_orb=sab_orb) 181 IF (PRESENT(sab_orb_external)) sab_orb => sab_orb_external 182 183 CALL get_ks_env(ks_env, pw_env=pw_env) 184 IF (PRESENT(pw_env_external)) pw_env => pw_env_external 185 CALL pw_env_get(pw_env, rs_descs=rs_descs, rs_grids=rs_grids) 186 187 ! *** assign from pw_env 188 gridlevel_info => pw_env%gridlevel_info 189 cube_info => pw_env%cube_info 190 191 ! find maximum numbers 192 nkind = SIZE(qs_kind_set) 193 natoms = SIZE(particle_set) 194 maxset = 0 195 maxpgf = 0 196 DO ikind = 1, nkind 197 qs_kind => qs_kind_set(ikind) 198 CALL get_qs_kind(qs_kind=qs_kind, & 199 softb=my_soft, & 200 basis_set=orb_basis_set, basis_type=my_basis_type) 201 202 IF (.NOT. ASSOCIATED(orb_basis_set)) CYCLE 203 CALL get_gto_basis_set(gto_basis_set=orb_basis_set, npgf=npgfa, nset=nseta) 204 205 maxset = MAX(nseta, maxset) 206 maxpgf = MAX(MAXVAL(npgfa), maxpgf) 207 END DO 208 209 ! kpoint related 210 nimages = dft_control%nimages 211 IF (nimages > 1) THEN 212 dokp = .TRUE. 213 NULLIFY (kpoints) 214 CALL get_ks_env(ks_env=ks_env, kpoints=kpoints) 215 CALL get_kpoint_info(kpoint=kpoints, cell_to_index=cell_to_index) 216 ELSE 217 dokp = .FALSE. 218 NULLIFY (cell_to_index) 219 END IF 220 221 ! free the atom_pair lists if allocated 222 IF (ASSOCIATED(task_list%atom_pair_send)) DEALLOCATE (task_list%atom_pair_send) 223 IF (ASSOCIATED(task_list%atom_pair_recv)) DEALLOCATE (task_list%atom_pair_recv) 224 225 ! construct a list of all tasks 226 IF (.NOT. ASSOCIATED(task_list%tasks)) THEN 227 CALL reallocate(task_list%tasks, 1, 6, 1, max_tasks) 228 CALL reallocate(task_list%dist_ab, 1, 3, 1, max_tasks) 229 ENDIF 230 task_list%ntasks = 0 231 curr_tasks = SIZE(task_list%tasks, 2) 232 233 ALLOCATE (basis_set_list(nkind)) 234 DO ikind = 1, nkind 235 qs_kind => qs_kind_set(ikind) 236 CALL get_qs_kind(qs_kind=qs_kind, softb=my_soft, basis_set=basis_set_a, & 237 basis_type=my_basis_type) 238 IF (ASSOCIATED(basis_set_a)) THEN 239 basis_set_list(ikind)%gto_basis_set => basis_set_a 240 ELSE 241 NULLIFY (basis_set_list(ikind)%gto_basis_set) 242 END IF 243 END DO 244 CALL neighbor_list_iterator_create(nl_iterator, sab_orb) 245 DO WHILE (neighbor_list_iterate(nl_iterator) == 0) 246 CALL get_iterator_info(nl_iterator, ikind=ikind, jkind=jkind, inode=inode, & 247 iatom=iatom, jatom=jatom, r=rab, cell=cellind) 248 basis_set_a => basis_set_list(ikind)%gto_basis_set 249 IF (.NOT. ASSOCIATED(basis_set_a)) CYCLE 250 basis_set_b => basis_set_list(jkind)%gto_basis_set 251 IF (.NOT. ASSOCIATED(basis_set_b)) CYCLE 252 ra(:) = pbc(particle_set(iatom)%r, cell) 253 ! basis ikind 254 la_max => basis_set_a%lmax 255 la_min => basis_set_a%lmin 256 npgfa => basis_set_a%npgf 257 nseta = basis_set_a%nset 258 rpgfa => basis_set_a%pgf_radius 259 set_radius_a => basis_set_a%set_radius 260 kind_radius_a = basis_set_a%kind_radius 261 zeta => basis_set_a%zet 262 ! basis jkind 263 lb_max => basis_set_b%lmax 264 lb_min => basis_set_b%lmin 265 npgfb => basis_set_b%npgf 266 nsetb = basis_set_b%nset 267 rpgfb => basis_set_b%pgf_radius 268 set_radius_b => basis_set_b%set_radius 269 kind_radius_b = basis_set_b%kind_radius 270 zetb => basis_set_b%zet 271 272 IF (dokp) THEN 273 cindex = cell_to_index(cellind(1), cellind(2), cellind(3)) 274 ELSE 275 cindex = 1 276 END IF 277 278 CALL task_list_inner_loop(task_list%tasks, task_list%dist_ab, task_list%ntasks, curr_tasks, & 279 rs_descs, dft_control, cube_info, gridlevel_info, cindex, & 280 iatom, jatom, rpgfa, rpgfb, zeta, zetb, kind_radius_b, set_radius_a, set_radius_b, ra, rab, & 281 la_max, la_min, lb_max, lb_min, npgfa, npgfb, maxpgf, maxset, natoms, nimages, nseta, nsetb) 282 283 END DO 284 CALL neighbor_list_iterator_release(nl_iterator) 285 286 DEALLOCATE (basis_set_list) 287 288 ! redistribute the task list so that all tasks map on the local rs grids 289 CALL distribute_tasks( & 290 rs_descs, task_list%ntasks, natoms, maxset, maxpgf, nimages, & 291 task_list%tasks, rval=task_list%dist_ab, atom_pair_send=task_list%atom_pair_send, & 292 atom_pair_recv=task_list%atom_pair_recv, symmetric=.TRUE., & 293 reorder_rs_grid_ranks=reorder_rs_grid_ranks, skip_load_balance_distributed=skip_load_balance_distributed) 294 295 ! If the rank order has changed, reallocate any of the distributed rs_grids 296 297 IF (reorder_rs_grid_ranks) THEN 298 DO i = 1, gridlevel_info%ngrid_levels 299 IF (rs_descs(i)%rs_desc%distributed) THEN 300 CALL rs_grid_release(rs_grids(i)%rs_grid) 301 NULLIFY (rs_grids(i)%rs_grid) 302 CALL rs_grid_create(rs_grids(i)%rs_grid, rs_descs(i)%rs_desc) 303 END IF 304 END DO 305 END IF 306 307 ! Now we have the final list of tasks, setup the task_list with the 308 ! data needed for the loops in integrate_v/calculate_rho 309 310 IF (ASSOCIATED(task_list%taskstart)) THEN 311 DEALLOCATE (task_list%taskstart) 312 END IF 313 IF (ASSOCIATED(task_list%taskstop)) THEN 314 DEALLOCATE (task_list%taskstop) 315 END IF 316 IF (ASSOCIATED(task_list%npairs)) THEN 317 DEALLOCATE (task_list%npairs) 318 END IF 319 320 ! First, count the number of unique atom pairs per grid level 321 322 ALLOCATE (task_list%npairs(SIZE(rs_descs))) 323 324 iatom_old = -1; jatom_old = -1; igrid_level_old = -1; img_old = -1 325 ipair = 0 326 task_list%npairs = 0 327 328 DO i = 1, task_list%ntasks 329 CALL int2pair(task_list%tasks(3, i), igrid_level, img, iatom, jatom, iset, jset, ipgf, jpgf, & 330 nimages, natoms, maxset, maxpgf) 331 IF (igrid_level .NE. igrid_level_old) THEN 332 IF (igrid_level_old .NE. -1) THEN 333 task_list%npairs(igrid_level_old) = ipair 334 END IF 335 ipair = 1 336 igrid_level_old = igrid_level 337 iatom_old = iatom 338 jatom_old = jatom 339 img_old = img 340 ELSE IF (iatom .NE. iatom_old .OR. jatom .NE. jatom_old .OR. img .NE. img_old) THEN 341 ipair = ipair + 1 342 iatom_old = iatom 343 jatom_old = jatom 344 img_old = img 345 END IF 346 END DO 347 ! Take care of the last iteration 348 IF (task_list%ntasks /= 0) THEN 349 task_list%npairs(igrid_level) = ipair 350 END IF 351 352 ! Second, for each atom pair, find the indices in the task list 353 ! of the first and last task 354 355 ! Array sized for worst case 356 ALLOCATE (task_list%taskstart(MAXVAL(task_list%npairs), SIZE(rs_descs))) 357 ALLOCATE (task_list%taskstop(MAXVAL(task_list%npairs), SIZE(rs_descs))) 358 359 iatom_old = -1; jatom_old = -1; igrid_level_old = -1; img_old = -1 360 ipair = 0 361 task_list%taskstart = 0 362 task_list%taskstop = 0 363 364 DO i = 1, task_list%ntasks 365 CALL int2pair(task_list%tasks(3, i), igrid_level, img, iatom, jatom, iset, jset, ipgf, jpgf, & 366 nimages, natoms, maxset, maxpgf) 367 IF (igrid_level .NE. igrid_level_old) THEN 368 IF (igrid_level_old .NE. -1) THEN 369 task_list%taskstop(ipair, igrid_level_old) = i - 1 370 END IF 371 ipair = 1 372 task_list%taskstart(ipair, igrid_level) = i 373 igrid_level_old = igrid_level 374 iatom_old = iatom 375 jatom_old = jatom 376 img_old = img 377 ELSE IF (iatom .NE. iatom_old .OR. jatom .NE. jatom_old .OR. img .NE. img_old) THEN 378 ipair = ipair + 1 379 task_list%taskstart(ipair, igrid_level) = i 380 task_list%taskstop(ipair - 1, igrid_level) = i - 1 381 iatom_old = iatom 382 jatom_old = jatom 383 img_old = img 384 END IF 385 END DO 386 ! Take care of the last iteration 387 IF (task_list%ntasks /= 0) THEN 388 task_list%taskstop(ipair, igrid_level) = task_list%ntasks 389 END IF 390 391 ! Debug task destribution 392 IF (debug_this_module) THEN 393 tasks => task_list%tasks 394 WRITE (6, *) 395 WRITE (6, *) "Total number of tasks ", task_list%ntasks 396 DO igrid_level = 1, gridlevel_info%ngrid_levels 397 WRITE (6, *) "Total number of pairs(grid_level) ", igrid_level, task_list%npairs(igrid_level) 398 END DO 399 WRITE (6, *) 400 401 DO igrid_level = 1, gridlevel_info%ngrid_levels 402 403 ALLOCATE (blocks(natoms, natoms, nimages)) 404 blocks = -1 405 DO ipair = 1, task_list%npairs(igrid_level) 406 itask = task_list%taskstart(ipair, igrid_level) 407 CALL int2pair(tasks(3, itask), ilevel, img, iatom, jatom, iset, jset, ipgf, jpgf, & 408 nimages, natoms, maxset, maxpgf) 409 IF (blocks(iatom, jatom, img) == -1 .AND. blocks(jatom, iatom, img) == -1) THEN 410 blocks(iatom, jatom, img) = 1 411 blocks(jatom, iatom, img) = 1 412 ELSE 413 WRITE (6, *) "TASK LIST CONFLICT IN PAIR ", ipair 414 WRITE (6, *) "Reuse of iatom, jatom, image ", iatom, jatom, img 415 END IF 416 417 iatom_old = iatom 418 jatom_old = jatom 419 img_old = img 420 DO itask = task_list%taskstart(ipair, igrid_level), task_list%taskstop(ipair, igrid_level) 421 422 CALL int2pair(tasks(3, itask), ilevel, img, iatom, jatom, iset, jset, ipgf, jpgf, & 423 nimages, natoms, maxset, maxpgf) 424 IF (iatom /= iatom_old .OR. jatom /= jatom_old .OR. img /= img_old) THEN 425 WRITE (6, *) "TASK LIST CONFLICT IN TASK ", itask 426 WRITE (6, *) "Inconsistent iatom, jatom, image ", iatom, jatom, img 427 WRITE (6, *) "Should be iatom, jatom, image ", iatom_old, jatom_old, img_old 428 END IF 429 430 END DO 431 END DO 432 DEALLOCATE (blocks) 433 434 END DO 435 436 END IF 437 438 CALL timestop(handle) 439 440 END SUBROUTINE generate_qs_task_list 441 442! ************************************************************************************************** 443!> \brief ... 444!> \param tasks ... 445!> \param dist_ab ... 446!> \param ntasks ... 447!> \param curr_tasks ... 448!> \param rs_descs ... 449!> \param dft_control ... 450!> \param cube_info ... 451!> \param gridlevel_info ... 452!> \param cindex ... 453!> \param iatom ... 454!> \param jatom ... 455!> \param rpgfa ... 456!> \param rpgfb ... 457!> \param zeta ... 458!> \param zetb ... 459!> \param kind_radius_b ... 460!> \param set_radius_a ... 461!> \param set_radius_b ... 462!> \param ra ... 463!> \param rab ... 464!> \param la_max ... 465!> \param la_min ... 466!> \param lb_max ... 467!> \param lb_min ... 468!> \param npgfa ... 469!> \param npgfb ... 470!> \param maxpgf ... 471!> \param maxset ... 472!> \param natoms ... 473!> \param nimages ... 474!> \param nseta ... 475!> \param nsetb ... 476!> \par History 477!> Joost VandeVondele: 10.2008 refactored 478! ************************************************************************************************** 479 SUBROUTINE task_list_inner_loop(tasks, dist_ab, ntasks, curr_tasks, rs_descs, dft_control, & 480 cube_info, gridlevel_info, cindex, & 481 iatom, jatom, rpgfa, rpgfb, zeta, zetb, kind_radius_b, set_radius_a, set_radius_b, ra, rab, & 482 la_max, la_min, lb_max, lb_min, npgfa, npgfb, maxpgf, maxset, natoms, nimages, nseta, nsetb) 483 484 INTEGER(KIND=int_8), DIMENSION(:, :), POINTER :: tasks 485 REAL(KIND=dp), DIMENSION(:, :), POINTER :: dist_ab 486 INTEGER :: ntasks, curr_tasks 487 TYPE(realspace_grid_desc_p_type), DIMENSION(:), & 488 POINTER :: rs_descs 489 TYPE(dft_control_type), POINTER :: dft_control 490 TYPE(cube_info_type), DIMENSION(:), POINTER :: cube_info 491 TYPE(gridlevel_info_type), POINTER :: gridlevel_info 492 INTEGER :: cindex, iatom, jatom 493 REAL(KIND=dp), DIMENSION(:, :), POINTER :: rpgfa, rpgfb, zeta, zetb 494 REAL(KIND=dp) :: kind_radius_b 495 REAL(KIND=dp), DIMENSION(:), POINTER :: set_radius_a, set_radius_b 496 REAL(KIND=dp), DIMENSION(3) :: ra, rab 497 INTEGER, DIMENSION(:), POINTER :: la_max, la_min, lb_max, lb_min, npgfa, & 498 npgfb 499 INTEGER :: maxpgf, maxset, natoms, nimages, nseta, & 500 nsetb 501 502 CHARACTER(LEN=*), PARAMETER :: routineN = 'task_list_inner_loop', & 503 routineP = moduleN//':'//routineN 504 505 INTEGER :: cube_center(3), igrid_level, ipgf, iset, & 506 jpgf, jset, lb_cube(3), ub_cube(3) 507 REAL(KIND=dp) :: dab, rab2, radius, zetp 508 509 rab2 = rab(1)*rab(1) + rab(2)*rab(2) + rab(3)*rab(3) 510 dab = SQRT(rab2) 511 512 loop_iset: DO iset = 1, nseta 513 514 IF (set_radius_a(iset) + kind_radius_b < dab) CYCLE 515 516 loop_jset: DO jset = 1, nsetb 517 518 IF (set_radius_a(iset) + set_radius_b(jset) < dab) CYCLE 519 520 loop_ipgf: DO ipgf = 1, npgfa(iset) 521 522 IF (rpgfa(ipgf, iset) + set_radius_b(jset) < dab) CYCLE 523 524 loop_jpgf: DO jpgf = 1, npgfb(jset) 525 526 IF (rpgfa(ipgf, iset) + rpgfb(jpgf, jset) < dab) CYCLE 527 528 zetp = zeta(ipgf, iset) + zetb(jpgf, jset) 529 igrid_level = gaussian_gridlevel(gridlevel_info, zetp) 530 531 CALL compute_pgf_properties(cube_center, lb_cube, ub_cube, radius, & 532 rs_descs(igrid_level)%rs_desc, cube_info(igrid_level), & 533 la_max(iset), zeta(ipgf, iset), la_min(iset), & 534 lb_max(jset), zetb(jpgf, jset), lb_min(jset), & 535 ra, rab, rab2, dft_control%qs_control%eps_rho_rspace) 536 537 CALL pgf_to_tasks(tasks, dist_ab, ntasks, curr_tasks, & 538 rab, cindex, iatom, jatom, iset, jset, ipgf, jpgf, nimages, natoms, maxset, maxpgf, & 539 la_max(iset), lb_max(jset), rs_descs(igrid_level)%rs_desc, & 540 igrid_level, gridlevel_info%ngrid_levels, cube_center, & 541 lb_cube, ub_cube) 542 543 END DO loop_jpgf 544 545 END DO loop_ipgf 546 547 END DO loop_jset 548 549 END DO loop_iset 550 551 END SUBROUTINE task_list_inner_loop 552 553! ************************************************************************************************** 554!> \brief combines the calculation of several basic properties of a given pgf: 555!> its center, the bounding cube, the radius, the cost, 556!> tries to predict the time needed for processing this task 557!> in this way an improved load balance might be obtained 558!> \param cube_center ... 559!> \param lb_cube ... 560!> \param ub_cube ... 561!> \param radius ... 562!> \param rs_desc ... 563!> \param cube_info ... 564!> \param la_max ... 565!> \param zeta ... 566!> \param la_min ... 567!> \param lb_max ... 568!> \param zetb ... 569!> \param lb_min ... 570!> \param ra ... 571!> \param rab ... 572!> \param rab2 ... 573!> \param eps ... 574!> \par History 575!> 10.2008 refactored [Joost VandeVondele] 576!> \note 577!> -) this requires the radius to be computed in the same way as 578!> collocate_pgf_product_rspace, we should factor that part into a subroutine 579!> -) we're assuming that integrate_pgf and collocate_pgf are the same cost for load balancing 580!> this is more or less true for map_consistent 581!> -) in principle, the computed radius could be recycled in integrate_pgf/collocate_pgf if it is certainly 582!> the same, this could lead to a small speedup 583!> -) the cost function is a fit through the median cost of mapping a pgf with a given l and a given radius (in grid points) 584!> fitting the measured data on an opteron/g95 using the expression 585!> a*(l+b)(r+c)**3+d which is based on the innerloop of the collocating routines 586! ************************************************************************************************** 587 SUBROUTINE compute_pgf_properties(cube_center, lb_cube, ub_cube, radius, & 588 rs_desc, cube_info, la_max, zeta, la_min, lb_max, zetb, lb_min, ra, rab, rab2, eps) 589 590 INTEGER, DIMENSION(3), INTENT(OUT) :: cube_center, lb_cube, ub_cube 591 REAL(KIND=dp), INTENT(OUT) :: radius 592 TYPE(realspace_grid_desc_type), POINTER :: rs_desc 593 TYPE(cube_info_type), INTENT(IN) :: cube_info 594 INTEGER, INTENT(IN) :: la_max 595 REAL(KIND=dp), INTENT(IN) :: zeta 596 INTEGER, INTENT(IN) :: la_min, lb_max 597 REAL(KIND=dp), INTENT(IN) :: zetb 598 INTEGER, INTENT(IN) :: lb_min 599 REAL(KIND=dp), INTENT(IN) :: ra(3), rab(3), rab2, eps 600 601 CHARACTER(LEN=*), PARAMETER :: routineN = 'compute_pgf_properties', & 602 routineP = moduleN//':'//routineN 603 604 INTEGER :: extent(3) 605 INTEGER, DIMENSION(:), POINTER :: sphere_bounds 606 REAL(KIND=dp) :: cutoff, f, prefactor, rb(3), zetp 607 REAL(KIND=dp), DIMENSION(3) :: rp 608 609! the radius for this task 610 611 zetp = zeta + zetb 612 rp(:) = ra(:) + zetb/zetp*rab(:) 613 rb(:) = ra(:) + rab(:) 614 cutoff = 1.0_dp 615 f = zetb/zetp 616 prefactor = EXP(-zeta*f*rab2) 617 radius = exp_radius_very_extended(la_min, la_max, lb_min, lb_max, ra=ra, rb=rb, rp=rp, & 618 zetp=zetp, eps=eps, prefactor=prefactor, cutoff=cutoff) 619 620 CALL compute_cube_center(cube_center, rs_desc, zeta, zetb, ra, rab) 621 ! compute cube_center, the center of the gaussian product to map (folded to within the unit cell) 622 cube_center(:) = MODULO(cube_center(:), rs_desc%npts(:)) 623 cube_center(:) = cube_center(:) + rs_desc%lb(:) 624 625 IF (rs_desc%orthorhombic) THEN 626 CALL return_cube(cube_info, radius, lb_cube, ub_cube, sphere_bounds) 627 ELSE 628 CALL return_cube_nonortho(cube_info, radius, lb_cube, ub_cube, rp) 629 !? unclear if extent is computed correctly. 630 extent(:) = ub_cube(:) - lb_cube(:) 631 lb_cube(:) = -extent(:)/2 - 1 632 ub_cube(:) = extent(:)/2 633 ENDIF 634 635 END SUBROUTINE compute_pgf_properties 636! ************************************************************************************************** 637!> \brief predicts the cost of a task in kcycles for a given task 638!> the model is based on a fit of actual data, and might need updating 639!> as collocate_pgf_product_rspace changes (or CPUs/compilers change) 640!> maybe some dynamic approach, improving the cost model on the fly could 641!> work as well 642!> the cost model does not yet take into account the fraction of space 643!> that is mapped locally for a given cube and rs_grid (generalised tasks) 644!> \param lb_cube ... 645!> \param ub_cube ... 646!> \param fraction ... 647!> \param lmax ... 648!> \param is_ortho ... 649!> \return ... 650! ************************************************************************************************** 651 INTEGER FUNCTION cost_model(lb_cube, ub_cube, fraction, lmax, is_ortho) 652 INTEGER, DIMENSION(3), INTENT(IN) :: lb_cube, ub_cube 653 REAL(KIND=dp), INTENT(IN) :: fraction 654 INTEGER :: lmax 655 LOGICAL :: is_ortho 656 657 INTEGER :: cmax 658 REAL(KIND=dp) :: v1, v2, v3, v4, v5 659 660 cmax = MAXVAL(((ub_cube - lb_cube) + 1)/2) 661 662 IF (is_ortho) THEN 663 v1 = 1.504760E+00_dp 664 v2 = 3.126770E+00_dp 665 v3 = 5.074106E+00_dp 666 v4 = 1.091568E+00_dp 667 v5 = 1.070187E+00_dp 668 ELSE 669 v1 = 7.831105E+00_dp 670 v2 = 2.675174E+00_dp 671 v3 = 7.546553E+00_dp 672 v4 = 6.122446E-01_dp 673 v5 = 3.886382E+00_dp 674 ENDIF 675 cost_model = CEILING(((lmax + v1)*(cmax + v2)**3*v3*fraction + v4 + v5*lmax**7)/1000.0_dp) 676 677 END FUNCTION cost_model 678! ************************************************************************************************** 679!> \brief pgf_to_tasks converts a given pgf to one or more tasks, in particular 680!> this determines by which CPUs a given pgf gets collocated 681!> the format of the task array is as follows 682!> tasks(1,i) := destination 683!> tasks(2,i) := source 684!> tasks(3,i) := compressed type (iatom, jatom, ....) 685!> tasks(4,i) := type (0: replicated, 1: distributed local, 2: distributed generalised) 686!> tasks(5,i) := cost 687!> tasks(6,i) := alternate destination code (0 if none available) 688!> 689!> \param tasks ... 690!> \param dist_ab ... 691!> \param ntasks ... 692!> \param curr_tasks ... 693!> \param rab ... 694!> \param cindex ... 695!> \param iatom ... 696!> \param jatom ... 697!> \param iset ... 698!> \param jset ... 699!> \param ipgf ... 700!> \param jpgf ... 701!> \param nimages ... 702!> \param natoms ... 703!> \param maxset ... 704!> \param maxpgf ... 705!> \param la_max ... 706!> \param lb_max ... 707!> \param rs_desc ... 708!> \param igrid_level ... 709!> \param n_levels ... 710!> \param cube_center ... 711!> \param lb_cube ... 712!> \param ub_cube ... 713!> \par History 714!> 10.2008 Refactored based on earlier routines by MattW [Joost VandeVondele] 715! ************************************************************************************************** 716 SUBROUTINE pgf_to_tasks(tasks, dist_ab, ntasks, curr_tasks, & 717 rab, cindex, iatom, jatom, iset, jset, ipgf, jpgf, nimages, natoms, maxset, maxpgf, & 718 la_max, lb_max, rs_desc, igrid_level, n_levels, cube_center, lb_cube, ub_cube) 719 720 INTEGER(KIND=int_8), DIMENSION(:, :), POINTER :: tasks 721 REAL(KIND=dp), DIMENSION(:, :), POINTER :: dist_ab 722 INTEGER, INTENT(INOUT) :: ntasks, curr_tasks 723 REAL(KIND=dp), DIMENSION(3), INTENT(IN) :: rab 724 INTEGER, INTENT(IN) :: cindex, iatom, jatom, iset, jset, ipgf, & 725 jpgf, nimages, natoms, maxset, maxpgf, & 726 la_max, lb_max 727 TYPE(realspace_grid_desc_type), POINTER :: rs_desc 728 INTEGER, INTENT(IN) :: igrid_level, n_levels 729 INTEGER, DIMENSION(3), INTENT(IN) :: cube_center, lb_cube, ub_cube 730 731 INTEGER, PARAMETER :: add_tasks = 1000 732 REAL(kind=dp), PARAMETER :: mult_tasks = 2.0_dp 733 734 INTEGER :: added_tasks, cost, j, lmax 735 LOGICAL :: is_ortho 736 REAL(KIND=dp) :: tfraction 737 738 ntasks = ntasks + 1 739 IF (ntasks > curr_tasks) THEN 740 curr_tasks = INT((curr_tasks + add_tasks)*mult_tasks) 741 CALL reallocate(tasks, 1, 6, 1, curr_tasks) 742 END IF 743 744 IF (rs_desc%distributed) THEN 745 746 ! finds the node(s) that need to process this task 747 ! on exit tasks(4,:) is 1 for distributed tasks and 2 for generalised tasks 748 CALL rs_find_node(rs_desc, igrid_level, n_levels, cube_center, & 749 ntasks=ntasks, tasks=tasks, lb_cube=lb_cube, ub_cube=ub_cube, added_tasks=added_tasks) 750 751 ELSE 752 tasks(1, ntasks) = encode_rank(rs_desc%my_pos, igrid_level, n_levels) 753 tasks(4, ntasks) = 0 754 tasks(6, ntasks) = 0 755 added_tasks = 1 756 ENDIF 757 758 IF (SIZE(dist_ab, 2) .NE. SIZE(tasks, 2)) THEN 759 CALL reallocate(dist_ab, 1, 3, 1, SIZE(tasks, 2)) 760 ENDIF 761 762 lmax = la_max + lb_max 763 is_ortho = (tasks(4, ntasks) == 0 .OR. tasks(4, ntasks) == 1) .AND. rs_desc%orthorhombic 764 ! we assume the load is shared equally between processes dealing with a generalised Gaussian. 765 ! this could be refined in the future 766 tfraction = 1.0_dp/added_tasks 767 768 cost = cost_model(lb_cube, ub_cube, tfraction, lmax, is_ortho) 769 770 DO j = 1, added_tasks 771 772 tasks(2, ntasks - added_tasks + j) = encode_rank(rs_desc%my_pos, igrid_level, n_levels) 773 tasks(5, ntasks - added_tasks + j) = cost 774 775 !encode the atom pairs and basis info as a single long integer 776 CALL pair2int(tasks(3, ntasks - added_tasks + j), igrid_level, cindex, & 777 iatom, jatom, iset, jset, ipgf, jpgf, nimages, natoms, maxset, maxpgf) 778 779 dist_ab(1, ntasks - added_tasks + j) = rab(1) 780 dist_ab(2, ntasks - added_tasks + j) = rab(2) 781 dist_ab(3, ntasks - added_tasks + j) = rab(3) 782 783 ENDDO 784 785 END SUBROUTINE pgf_to_tasks 786 787! ************************************************************************************************** 788!> \brief converts a pgf index pair (ipgf,iset,iatom),(jpgf,jset,jatom) 789!> to a unique integer. 790!> a list of integers can be sorted, and will result in a list of pgf pairs 791!> for which all atom pairs are grouped, and for each atom pair all set pairs are grouped 792!> and for each set pair, all pgfs are grouped 793!> \param res ... 794!> \param ilevel ... 795!> \param image ... 796!> \param iatom ... 797!> \param jatom ... 798!> \param iset ... 799!> \param jset ... 800!> \param ipgf ... 801!> \param jpgf ... 802!> \param nimages ... 803!> \param natom ... 804!> \param maxset ... 805!> \param maxpgf ... 806!> \par History 807!> 11.2007 created [Joost] 808!> \note 809!> will hopefully not overflow any more 810! ************************************************************************************************** 811 SUBROUTINE pair2int(res, ilevel, image, iatom, jatom, iset, jset, ipgf, jpgf, & 812 nimages, natom, maxset, maxpgf) 813 INTEGER(KIND=int_8), INTENT(OUT) :: res 814 INTEGER, INTENT(IN) :: ilevel, image, iatom, jatom, iset, jset, & 815 ipgf, jpgf, nimages, natom, maxset, & 816 maxpgf 817 818 INTEGER(KIND=int_8) :: maxpgf8, maxset8, natom8, nimages8, & 819 nlev1, nlev2, nlev3, nlev4 820 821 natom8 = natom; maxset8 = maxset; maxpgf8 = maxpgf; nimages8 = nimages 822 ! 823 ! this encoding yields the right order of the tasks for collocation after the sort 824 ! in distribute_tasks. E.g. for a atom pair, all sets and pgfs are computed in one go. 825 ! The exception is the gridlevel. Tasks are first ordered wrt to grid_level. This implies 826 ! that a given density matrix block will be decontracted several times, but cache effects on the 827 ! grid make up for this. 828 ! 829 nlev1 = maxpgf8**2 830 nlev2 = maxset8**2*nlev1 831 nlev3 = natom8**2*nlev2 832 nlev4 = nimages8*nlev3 833 ! 834 res = ilevel*nlev4 + & 835 (image - 1)*nlev3 + & 836 ((iatom - 1)*natom8 + jatom - 1)*nlev2 + & 837 ((iset - 1)*maxset8 + jset - 1)*nlev1 + & 838 (ipgf - 1)*maxpgf8 + jpgf - 1 839 840 END SUBROUTINE pair2int 841 842! ************************************************************************************************** 843!> \brief ... 844!> \param res ... 845!> \param ilevel ... 846!> \param image ... 847!> \param iatom ... 848!> \param jatom ... 849!> \param iset ... 850!> \param jset ... 851!> \param ipgf ... 852!> \param jpgf ... 853!> \param nimages ... 854!> \param natom ... 855!> \param maxset ... 856!> \param maxpgf ... 857! ************************************************************************************************** 858 SUBROUTINE int2pair(res, ilevel, image, iatom, jatom, iset, jset, ipgf, jpgf, & 859 nimages, natom, maxset, maxpgf) 860 INTEGER(KIND=int_8), INTENT(IN) :: res 861 INTEGER, INTENT(OUT) :: ilevel, image, iatom, jatom, iset, jset, & 862 ipgf, jpgf 863 INTEGER, INTENT(IN) :: nimages, natom, maxset, maxpgf 864 865 INTEGER(KIND=int_8) :: iatom8, ijatom, ijset, img, ipgf8, iset8, jatom8, jpgf8, jset8, & 866 maxpgf8, maxset8, natom8, nimages8, nlev1, nlev2, nlev3, nlev4, tmp 867 868 natom8 = natom; maxset8 = maxset; maxpgf8 = maxpgf; nimages8 = nimages 869 ! 870 nlev1 = maxpgf8**2 871 nlev2 = maxset8**2*nlev1 872 nlev3 = natom8**2*nlev2 873 nlev4 = nimages8*nlev3 874 ! 875 ilevel = INT(res/nlev4) 876 tmp = MOD(res, nlev4) 877 img = tmp/nlev3 + 1 878 tmp = MOD(tmp, nlev3) 879 ijatom = tmp/nlev2 880 iatom8 = ijatom/natom8 + 1 881 jatom8 = MOD(ijatom, natom8) + 1 882 tmp = MOD(tmp, nlev2) 883 ijset = tmp/nlev1 884 iset8 = ijset/maxset8 + 1 885 jset8 = MOD(ijset, maxset8) + 1 886 tmp = MOD(tmp, nlev1) 887 ipgf8 = tmp/maxpgf8 + 1 888 jpgf8 = MOD(tmp, maxpgf8) + 1 889 ! 890 image = INT(img) 891 iatom = INT(iatom8); jatom = INT(jatom8); iset = INT(iset8); jset = INT(jset8) 892 ipgf = INT(ipgf8); jpgf = INT(jpgf8) 893 894 END SUBROUTINE int2pair 895 896! ************************************************************************************************** 897!> \brief performs load balancing of the tasks on the distributed grids 898!> \param tasks ... 899!> \param ntasks ... 900!> \param rs_descs ... 901!> \param grid_level ... 902!> \param nimages ... 903!> \param natoms ... 904!> \param maxset ... 905!> \param maxpgf ... 906!> \par History 907!> created 2008-10-03 [Joost VandeVondele] 908! ************************************************************************************************** 909 SUBROUTINE load_balance_distributed(tasks, ntasks, rs_descs, grid_level, & 910 nimages, natoms, maxset, maxpgf) 911 912 INTEGER(KIND=int_8), DIMENSION(:, :), POINTER :: tasks 913 INTEGER :: ntasks 914 TYPE(realspace_grid_desc_p_type), DIMENSION(:), & 915 POINTER :: rs_descs 916 INTEGER :: grid_level, nimages, natoms, maxset, & 917 maxpgf 918 919 CHARACTER(LEN=*), PARAMETER :: routineN = 'load_balance_distributed', & 920 routineP = moduleN//':'//routineN 921 922 INTEGER :: handle 923 INTEGER, DIMENSION(:, :, :), POINTER :: list 924 925 CALL timeset(routineN, handle) 926 927 NULLIFY (list) 928 ! here we create for each cpu (0:ncpu-1) a list of possible destinations. 929 ! if a destination would not be in this list, it is a bug 930 CALL create_destination_list(list, rs_descs, grid_level) 931 932 ! now, walk over the tasks, filling in the loads of each destination 933 CALL compute_load_list(list, rs_descs, grid_level, tasks, ntasks, nimages, natoms, maxset, maxpgf, & 934 create_list=.TRUE.) 935 936 ! optimize loads & fluxes 937 CALL optimize_load_list(list, rs_descs(1)%rs_desc%group, rs_descs(1)%rs_desc%my_pos) 938 939 ! now, walk over the tasks, using the list to set the destinations 940 CALL compute_load_list(list, rs_descs, grid_level, tasks, ntasks, nimages, natoms, maxset, maxpgf, & 941 create_list=.FALSE.) 942 943 DEALLOCATE (list) 944 945 CALL timestop(handle) 946 947 END SUBROUTINE load_balance_distributed 948 949! ************************************************************************************************** 950!> \brief this serial routine adjusts the fluxes in the global list 951!> 952!> \param list_global ... 953!> \par History 954!> created 2008-10-06 [Joost VandeVondele] 955! ************************************************************************************************** 956 SUBROUTINE balance_global_list(list_global) 957 INTEGER, DIMENSION(:, :, 0:) :: list_global 958 959 CHARACTER(LEN=*), PARAMETER :: routineN = 'balance_global_list', & 960 routineP = moduleN//':'//routineN 961 INTEGER, PARAMETER :: Max_Iter = 100 962 REAL(KIND=dp), PARAMETER :: Tolerance_factor = 0.005_dp 963 964 INTEGER :: dest, handle, icpu, idest, iflux, & 965 ilocal, k, maxdest, Ncpu, Nflux 966 INTEGER, ALLOCATABLE, DIMENSION(:, :) :: flux_connections 967 LOGICAL :: solution_optimal 968 REAL(KIND=dp) :: average, load_shift, max_load_shift, & 969 tolerance 970 REAL(KIND=dp), ALLOCATABLE, DIMENSION(:) :: load, optimized_flux, optimized_load 971 REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :) :: flux_limits 972 973 CALL timeset(routineN, handle) 974 975 Ncpu = SIZE(list_global, 3) 976 maxdest = SIZE(list_global, 2) 977 ALLOCATE (load(0:Ncpu - 1)) 978 load = 0.0_dp 979 ALLOCATE (optimized_load(0:Ncpu - 1)) 980 981 ! figure out the number of fluxes 982 ! we assume that the global_list is symmetric 983 Nflux = 0 984 DO icpu = 0, ncpu - 1 985 DO idest = 1, maxdest 986 dest = list_global(1, idest, icpu) 987 IF (dest < ncpu .AND. dest > icpu) Nflux = Nflux + 1 988 ENDDO 989 ENDDO 990 ALLOCATE (optimized_flux(Nflux)) 991 ALLOCATE (flux_limits(2, Nflux)) 992 ALLOCATE (flux_connections(2, Nflux)) 993 994 ! reorder data 995 flux_limits = 0 996 Nflux = 0 997 DO icpu = 0, ncpu - 1 998 load(icpu) = SUM(list_global(2, :, icpu)) 999 DO idest = 1, maxdest 1000 dest = list_global(1, idest, icpu) 1001 IF (dest < ncpu) THEN 1002 IF (dest .NE. icpu) THEN 1003 IF (dest > icpu) THEN 1004 Nflux = Nflux + 1 1005 flux_limits(2, Nflux) = list_global(2, idest, icpu) 1006 flux_connections(1, Nflux) = icpu 1007 flux_connections(2, Nflux) = dest 1008 ELSE 1009 DO iflux = 1, Nflux 1010 IF (flux_connections(1, iflux) == dest .AND. flux_connections(2, iflux) == icpu) THEN 1011 flux_limits(1, iflux) = -list_global(2, idest, icpu) 1012 EXIT 1013 ENDIF 1014 ENDDO 1015 ENDIF 1016 ENDIF 1017 ENDIF 1018 ENDDO 1019 ENDDO 1020 1021 solution_optimal = .FALSE. 1022 optimized_flux = 0.0_dp 1023 1024 ! an iterative solver, if iterated till convergence the maximum load is minimal 1025 ! we terminate before things are fully converged, since this does show up in the timings 1026 ! once the largest shift becomes less than a small fraction of the average load, we're done 1027 ! we're perfectly happy if the load balance is within 1 percent or so 1028 ! the maximum load normally converges even faster 1029 average = SUM(load)/SIZE(load) 1030 tolerance = Tolerance_factor*average 1031 1032 optimized_load(:) = load 1033 DO k = 1, Max_iter 1034 max_load_shift = 0.0_dp 1035 DO iflux = 1, Nflux 1036 load_shift = (optimized_load(flux_connections(1, iflux)) - optimized_load(flux_connections(2, iflux)))/2 1037 load_shift = MAX(flux_limits(1, iflux) - optimized_flux(iflux), load_shift) 1038 load_shift = MIN(flux_limits(2, iflux) - optimized_flux(iflux), load_shift) 1039 max_load_shift = MAX(ABS(load_shift), max_load_shift) 1040 optimized_load(flux_connections(1, iflux)) = optimized_load(flux_connections(1, iflux)) - load_shift 1041 optimized_load(flux_connections(2, iflux)) = optimized_load(flux_connections(2, iflux)) + load_shift 1042 optimized_flux(iflux) = optimized_flux(iflux) + load_shift 1043 ENDDO 1044 IF (max_load_shift < tolerance) THEN 1045 solution_optimal = .TRUE. 1046 EXIT 1047 ENDIF 1048 ENDDO 1049 1050 ! now adjust the load list to reflect the optimized fluxes 1051 ! reorder data 1052 Nflux = 0 1053 DO icpu = 0, ncpu - 1 1054 DO idest = 1, maxdest 1055 IF (list_global(1, idest, icpu) == icpu) ilocal = idest 1056 ENDDO 1057 DO idest = 1, maxdest 1058 dest = list_global(1, idest, icpu) 1059 IF (dest < ncpu) THEN 1060 IF (dest .NE. icpu) THEN 1061 IF (dest > icpu) THEN 1062 Nflux = Nflux + 1 1063 IF (optimized_flux(Nflux) > 0) THEN 1064 list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + & 1065 list_global(2, idest, icpu) - NINT(optimized_flux(Nflux)) 1066 list_global(2, idest, icpu) = NINT(optimized_flux(Nflux)) 1067 ELSE 1068 list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + & 1069 list_global(2, idest, icpu) 1070 list_global(2, idest, icpu) = 0 1071 ENDIF 1072 ELSE 1073 DO iflux = 1, Nflux 1074 IF (flux_connections(1, iflux) == dest .AND. flux_connections(2, iflux) == icpu) THEN 1075 IF (optimized_flux(iflux) > 0) THEN 1076 list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + & 1077 list_global(2, idest, icpu) 1078 list_global(2, idest, icpu) = 0 1079 ELSE 1080 list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + & 1081 list_global(2, idest, icpu) + NINT(optimized_flux(iflux)) 1082 list_global(2, idest, icpu) = -NINT(optimized_flux(iflux)) 1083 ENDIF 1084 EXIT 1085 ENDIF 1086 ENDDO 1087 ENDIF 1088 ENDIF 1089 ENDIF 1090 ENDDO 1091 ENDDO 1092 1093 CALL timestop(handle) 1094 1095 END SUBROUTINE balance_global_list 1096 1097! ************************************************************************************************** 1098!> \brief this routine gets back optimized loads for all destinations 1099!> 1100!> \param list ... 1101!> \param group ... 1102!> \param my_pos ... 1103!> \par History 1104!> created 2008-10-06 [Joost VandeVondele] 1105!> Modified 2016-01 [EPCC] Reduce memory requirements on P processes 1106!> from O(P^2) to O(P) 1107! ************************************************************************************************** 1108 SUBROUTINE optimize_load_list(list, group, my_pos) 1109 INTEGER, DIMENSION(:, :, 0:) :: list 1110 INTEGER, INTENT(IN) :: group, my_pos 1111 1112 CHARACTER(LEN=*), PARAMETER :: routineN = 'optimize_load_list', & 1113 routineP = moduleN//':'//routineN 1114 INTEGER, PARAMETER :: rank_of_root = 0 1115 1116 INTEGER :: handle, icpu, idest, maxdest, ncpu 1117 INTEGER, ALLOCATABLE, DIMENSION(:) :: load_all 1118 INTEGER, ALLOCATABLE, DIMENSION(:, :) :: load_partial 1119 INTEGER, ALLOCATABLE, DIMENSION(:, :, :) :: list_global 1120 1121 CALL timeset(routineN, handle) 1122 1123 ncpu = SIZE(list, 3) 1124 maxdest = SIZE(list, 2) 1125 1126 !find total workload ... 1127 ALLOCATE (load_all(maxdest*ncpu)) 1128 load_all(:) = RESHAPE(list(2, :, :), (/maxdest*ncpu/)) 1129 CALL mp_sum(load_all(:), rank_of_root, group) 1130 1131 ! ... and optimise the work per process 1132 ALLOCATE (list_global(2, maxdest, ncpu)) 1133 IF (rank_of_root .EQ. my_pos) THEN 1134 list_global(1, :, :) = list(1, :, :) 1135 list_global(2, :, :) = RESHAPE(load_all, (/maxdest, ncpu/)) 1136 CALL balance_global_list(list_global) 1137 ENDIF 1138 CALL mp_bcast(list_global, rank_of_root, group) 1139 1140 !figure out how much can be sent to other processes 1141 ALLOCATE (load_partial(maxdest, ncpu)) 1142 ! send 'load_all', which is a copy of 'list' (but without leading dimension/stride) 1143 CALL mp_sum_partial(RESHAPE(load_all, (/maxdest, ncpu/)), load_partial(:, :), group) 1144 1145 DO icpu = 1, ncpu 1146 DO idest = 1, maxdest 1147 1148 !need to deduct 1 because `list' was passed in to this routine as being indexed from zero 1149 IF (load_partial(idest, icpu) > list_global(2, idest, icpu)) THEN 1150 IF (load_partial(idest, icpu) - list(2, idest, icpu - 1) < list_global(2, idest, icpu)) THEN 1151 list(2, idest, icpu - 1) = list_global(2, idest, icpu) & 1152 - (load_partial(idest, icpu) - list(2, idest, icpu - 1)) 1153 ELSE 1154 list(2, idest, icpu - 1) = 0 1155 ENDIF 1156 ENDIF 1157 1158 ENDDO 1159 ENDDO 1160 1161 !clean up before leaving 1162 DEALLOCATE (load_all) 1163 DEALLOCATE (list_global) 1164 DEALLOCATE (load_partial) 1165 1166 CALL timestop(handle) 1167 END SUBROUTINE optimize_load_list 1168 1169! ************************************************************************************************** 1170!> \brief fill the load list with values derived from the tasks array 1171!> from the alternate locations, we select the alternate location that 1172!> can be used without increasing the number of matrix blocks needed to 1173!> distribute. 1174!> Replicated tasks are not yet considered 1175!> 1176!> \param list ... 1177!> \param rs_descs ... 1178!> \param grid_level ... 1179!> \param tasks ... 1180!> \param ntasks ... 1181!> \param nimages ... 1182!> \param natoms ... 1183!> \param maxset ... 1184!> \param maxpgf ... 1185!> \param create_list ... 1186!> \par History 1187!> created 2008-10-06 [Joost VandeVondele] 1188! ************************************************************************************************** 1189 SUBROUTINE compute_load_list(list, rs_descs, grid_level, tasks, & 1190 ntasks, nimages, natoms, maxset, maxpgf, create_list) 1191 INTEGER, DIMENSION(:, :, 0:) :: list 1192 TYPE(realspace_grid_desc_p_type), DIMENSION(:), & 1193 POINTER :: rs_descs 1194 INTEGER :: grid_level 1195 INTEGER(KIND=int_8), DIMENSION(:, :), POINTER :: tasks 1196 INTEGER :: ntasks, nimages, natoms, maxset, maxpgf 1197 LOGICAL :: create_list 1198 1199 CHARACTER(LEN=*), PARAMETER :: routineN = 'compute_load_list', & 1200 routineP = moduleN//':'//routineN 1201 1202 INTEGER :: cost, dest, handle, i, iatom, ilevel, img, img_old, iopt, ipgf, iset, itask, & 1203 itask_start, itask_stop, jatom, jpgf, jset, li, maxdest, ncpu, ndest_pair, nopt, nshort, & 1204 rank 1205 INTEGER(KIND=int_8) :: bit_pattern, ipair, ipair_old, natom8 1206 INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: loads 1207 INTEGER, ALLOCATABLE, DIMENSION(:) :: all_dests, index 1208 INTEGER, DIMENSION(6) :: options 1209 1210 CALL timeset(routineN, handle) 1211 1212 ALLOCATE (loads(0:rs_descs(grid_level)%rs_desc%group_size - 1)) 1213 CALL get_current_loads(loads, rs_descs, grid_level, ntasks, nimages, natoms, maxset, maxpgf, & 1214 tasks, use_reordered_ranks=.FALSE.) 1215 1216 maxdest = SIZE(list, 2) 1217 ncpu = SIZE(list, 3) 1218 natom8 = natoms 1219 1220 ! first find the tasks that deal with the same atom pair 1221 itask_stop = 0 1222 ipair_old = HUGE(ipair_old) 1223 img_old = -1 1224 ALLOCATE (all_dests(0)) 1225 ALLOCATE (INDEX(0)) 1226 1227 DO 1228 1229 ! first find the range of tasks that deal with the same atom pair 1230 itask_start = itask_stop + 1 1231 itask_stop = itask_start 1232 IF (itask_stop > ntasks) EXIT 1233 CALL int2pair(tasks(3, itask_stop), ilevel, img_old, iatom, jatom, iset, jset, ipgf, jpgf, & 1234 nimages, natoms, maxset, maxpgf) 1235 ipair_old = (iatom - 1)*natom8 + (jatom - 1) 1236 DO 1237 IF (itask_stop + 1 > ntasks) EXIT 1238 CALL int2pair(tasks(3, itask_stop + 1), ilevel, img, iatom, jatom, iset, jset, ipgf, jpgf, & 1239 nimages, natoms, maxset, maxpgf) 1240 ipair = (iatom - 1)*natom8 + (jatom - 1) 1241 IF (ipair == ipair_old .AND. img == img_old) THEN 1242 itask_stop = itask_stop + 1 1243 ELSE 1244 EXIT 1245 ENDIF 1246 ENDDO 1247 ipair = ipair_old 1248 nshort = itask_stop - itask_start + 1 1249 1250 ! find the unique list of destinations on this grid level only 1251 DEALLOCATE (all_dests) 1252 ALLOCATE (all_dests(nshort)) 1253 DEALLOCATE (index) 1254 ALLOCATE (INDEX(nshort)) 1255 DO i = 1, nshort 1256 CALL int2pair(tasks(3, itask_start + i - 1), ilevel, img, iatom, jatom, iset, jset, ipgf, jpgf, & 1257 nimages, natoms, maxset, maxpgf) 1258 IF (ilevel .EQ. grid_level) THEN 1259 all_dests(i) = decode_rank(tasks(1, itask_start + i - 1), SIZE(rs_descs)) 1260 ELSE 1261 all_dests(i) = HUGE(all_dests(i)) 1262 END IF 1263 ENDDO 1264 CALL sort(all_dests, nshort, index) 1265 ndest_pair = 1 1266 DO i = 2, nshort 1267 IF ((all_dests(ndest_pair) .NE. all_dests(i)) .AND. (all_dests(i) .NE. HUGE(all_dests(i)))) THEN 1268 ndest_pair = ndest_pair + 1 1269 all_dests(ndest_pair) = all_dests(i) 1270 ENDIF 1271 ENDDO 1272 1273 DO itask = itask_start, itask_stop 1274 1275 dest = decode_rank(tasks(1, itask), SIZE(rs_descs)) ! notice that dest can be changed 1276 CALL int2pair(tasks(3, itask), ilevel, img, iatom, jatom, iset, jset, ipgf, jpgf, & 1277 nimages, natoms, maxset, maxpgf) 1278 ! Only proceed with tasks which are on this grid level 1279 IF (ilevel .NE. grid_level) CYCLE 1280 ipair = (iatom - 1)*natom8 + (jatom - 1) 1281 cost = INT(tasks(5, itask)) 1282 1283 SELECT CASE (tasks(4, itask)) 1284 CASE (1) 1285 bit_pattern = tasks(6, itask) 1286 nopt = 0 1287 IF (BTEST(bit_pattern, 0)) THEN 1288 rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/-1, 0, 0/)) 1289 IF (ANY(all_dests(1:ndest_pair) .EQ. rank)) THEN 1290 nopt = nopt + 1 1291 options(nopt) = rank 1292 ENDIF 1293 ENDIF 1294 IF (BTEST(bit_pattern, 1)) THEN 1295 rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/+1, 0, 0/)) 1296 IF (ANY(all_dests(1:ndest_pair) .EQ. rank)) THEN 1297 nopt = nopt + 1 1298 options(nopt) = rank 1299 ENDIF 1300 ENDIF 1301 IF (BTEST(bit_pattern, 2)) THEN 1302 rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/0, -1, 0/)) 1303 IF (ANY(all_dests(1:ndest_pair) .EQ. rank)) THEN 1304 nopt = nopt + 1 1305 options(nopt) = rank 1306 ENDIF 1307 ENDIF 1308 IF (BTEST(bit_pattern, 3)) THEN 1309 rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/0, +1, 0/)) 1310 IF (ANY(all_dests(1:ndest_pair) .EQ. rank)) THEN 1311 nopt = nopt + 1 1312 options(nopt) = rank 1313 ENDIF 1314 ENDIF 1315 IF (BTEST(bit_pattern, 4)) THEN 1316 rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/0, 0, -1/)) 1317 IF (ANY(all_dests(1:ndest_pair) .EQ. rank)) THEN 1318 nopt = nopt + 1 1319 options(nopt) = rank 1320 ENDIF 1321 ENDIF 1322 IF (BTEST(bit_pattern, 5)) THEN 1323 rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/0, 0, +1/)) 1324 IF (ANY(all_dests(1:ndest_pair) .EQ. rank)) THEN 1325 nopt = nopt + 1 1326 options(nopt) = rank 1327 ENDIF 1328 ENDIF 1329 IF (nopt > 0) THEN 1330 ! set it to the rank with the lowest load 1331 rank = options(1) 1332 DO iopt = 2, nopt 1333 IF (loads(rank) > loads(options(iopt))) rank = options(iopt) 1334 ENDDO 1335 ELSE 1336 rank = dest 1337 ENDIF 1338 li = list_index(list, rank, dest) 1339 IF (create_list) THEN 1340 list(2, li, dest) = list(2, li, dest) + cost 1341 ELSE 1342 IF (list(1, li, dest) == dest) THEN 1343 tasks(1, itask) = encode_rank(dest, ilevel, SIZE(rs_descs)) 1344 ELSE 1345 IF (list(2, li, dest) >= cost) THEN 1346 list(2, li, dest) = list(2, li, dest) - cost 1347 tasks(1, itask) = encode_rank(list(1, li, dest), ilevel, SIZE(rs_descs)) 1348 ELSE 1349 tasks(1, itask) = encode_rank(dest, ilevel, SIZE(rs_descs)) 1350 ENDIF 1351 ENDIF 1352 ENDIF 1353 CASE (2) ! generalised 1354 li = list_index(list, dest, dest) 1355 IF (create_list) THEN 1356 list(2, li, dest) = list(2, li, dest) + cost 1357 ELSE 1358 IF (list(1, li, dest) == dest) THEN 1359 tasks(1, itask) = encode_rank(dest, ilevel, SIZE(rs_descs)) 1360 ELSE 1361 IF (list(2, li, dest) >= cost) THEN 1362 list(2, li, dest) = list(2, li, dest) - cost 1363 tasks(1, itask) = encode_rank(list(1, li, dest), ilevel, SIZE(rs_descs)) 1364 ELSE 1365 tasks(1, itask) = encode_rank(dest, ilevel, SIZE(rs_descs)) 1366 ENDIF 1367 ENDIF 1368 ENDIF 1369 CASE DEFAULT 1370 CPABORT("") 1371 END SELECT 1372 1373 ENDDO 1374 1375 ENDDO 1376 1377 CALL timestop(handle) 1378 1379 END SUBROUTINE compute_load_list 1380! ************************************************************************************************** 1381!> \brief small helper function to return the proper index in the list array 1382!> 1383!> \param list ... 1384!> \param rank ... 1385!> \param dest ... 1386!> \return ... 1387!> \par History 1388!> created 2008-10-06 [Joost VandeVondele] 1389! ************************************************************************************************** 1390 INTEGER FUNCTION list_index(list, rank, dest) 1391 INTEGER, DIMENSION(:, :, 0:), INTENT(IN) :: list 1392 INTEGER, INTENT(IN) :: rank, dest 1393 1394 list_index = 1 1395 DO 1396 IF (list(1, list_index, dest) == rank) EXIT 1397 list_index = list_index + 1 1398 ENDDO 1399 END FUNCTION list_index 1400! ************************************************************************************************** 1401!> \brief create a list with possible destinations (i.e. the central cpu and neighbors) for each cpu 1402!> note that we allocate it with an additional field to store the load of this destination 1403!> 1404!> \param list ... 1405!> \param rs_descs ... 1406!> \param grid_level ... 1407!> \par History 1408!> created 2008-10-06 [Joost VandeVondele] 1409! ************************************************************************************************** 1410 SUBROUTINE create_destination_list(list, rs_descs, grid_level) 1411 INTEGER, DIMENSION(:, :, :), POINTER :: list 1412 TYPE(realspace_grid_desc_p_type), DIMENSION(:), & 1413 POINTER :: rs_descs 1414 INTEGER, INTENT(IN) :: grid_level 1415 1416 CHARACTER(LEN=*), PARAMETER :: routineN = 'create_destination_list', & 1417 routineP = moduleN//':'//routineN 1418 1419 INTEGER :: handle, i, icpu, j, maxcount, ncpu, & 1420 ultimate_max 1421 INTEGER, ALLOCATABLE, DIMENSION(:) :: index, sublist 1422 1423 CALL timeset(routineN, handle) 1424 1425 CPASSERT(.NOT. ASSOCIATED(list)) 1426 ncpu = rs_descs(grid_level)%rs_desc%group_size 1427 ultimate_max = 7 1428 1429 ALLOCATE (list(2, ultimate_max, 0:ncpu - 1)) 1430 1431 ALLOCATE (INDEX(ultimate_max)) 1432 ALLOCATE (sublist(ultimate_max)) 1433 sublist = HUGE(sublist) 1434 1435 maxcount = 1 1436 DO icpu = 0, ncpu - 1 1437 sublist(1) = icpu 1438 sublist(2) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/-1, 0, 0/)) 1439 sublist(3) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/+1, 0, 0/)) 1440 sublist(4) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/0, -1, 0/)) 1441 sublist(5) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/0, +1, 0/)) 1442 sublist(6) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/0, 0, -1/)) 1443 sublist(7) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/0, 0, +1/)) 1444 ! only retain unique values of the destination 1445 CALL sort(sublist, ultimate_max, index) 1446 j = 1 1447 DO i = 2, 7 1448 IF (sublist(i) .NE. sublist(j)) THEN 1449 j = j + 1 1450 sublist(j) = sublist(i) 1451 ENDIF 1452 ENDDO 1453 maxcount = MAX(maxcount, j) 1454 sublist(j + 1:ultimate_max) = HUGE(sublist) 1455 list(1, :, icpu) = sublist 1456 list(2, :, icpu) = 0 1457 ENDDO 1458 1459 CALL reallocate(list, 1, 2, 1, maxcount, 0, ncpu - 1) 1460 1461 CALL timestop(handle) 1462 1463 END SUBROUTINE create_destination_list 1464 1465! ************************************************************************************************** 1466!> \brief given a task list, compute the load of each process everywhere 1467!> giving this function the ability to loop over a (sub)set of rs_grids, 1468!> and do all the communication in one shot, would speed it up 1469!> \param loads ... 1470!> \param rs_descs ... 1471!> \param grid_level ... 1472!> \param ntasks ... 1473!> \param nimages ... 1474!> \param natom ... 1475!> \param maxset ... 1476!> \param maxpgf ... 1477!> \param tasks ... 1478!> \param use_reordered_ranks ... 1479!> \par History 1480!> none 1481!> \author MattW 21/11/2007 1482! ************************************************************************************************** 1483 SUBROUTINE get_current_loads(loads, rs_descs, grid_level, ntasks, nimages, natom, maxset, & 1484 maxpgf, tasks, use_reordered_ranks) 1485 INTEGER(KIND=int_8), DIMENSION(:) :: loads 1486 TYPE(realspace_grid_desc_p_type), DIMENSION(:), & 1487 POINTER :: rs_descs 1488 INTEGER :: grid_level, ntasks, nimages, natom, & 1489 maxset, maxpgf 1490 INTEGER(KIND=int_8), DIMENSION(:, :), POINTER :: tasks 1491 LOGICAL, INTENT(IN) :: use_reordered_ranks 1492 1493 CHARACTER(LEN=*), PARAMETER :: routineN = 'get_current_loads', & 1494 routineP = moduleN//':'//routineN 1495 1496 INTEGER :: handle, i, iatom, ilevel, img, ipgf, & 1497 iset, jatom, jpgf, jset 1498 INTEGER(KIND=int_8) :: total_cost_local 1499 INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: recv_buf_i, send_buf_i 1500 TYPE(realspace_grid_desc_type), POINTER :: desc 1501 1502 CALL timeset(routineN, handle) 1503 1504 desc => rs_descs(grid_level)%rs_desc 1505 1506 ! allocate local arrays 1507 ALLOCATE (send_buf_i(desc%group_size)) 1508 ALLOCATE (recv_buf_i(desc%group_size)) 1509 1510 ! communication step 1 : compute the total local cost of the tasks 1511 ! each proc needs to know the amount of work he will receive 1512 1513 ! send buffer now contains for each target the cost of the tasks it will receive 1514 send_buf_i = 0 1515 DO i = 1, ntasks 1516 CALL int2pair(tasks(3, i), ilevel, img, iatom, jatom, iset, jset, ipgf, jpgf, nimages, natom, maxset, maxpgf) 1517 IF (ilevel .NE. grid_level) CYCLE 1518 IF (use_reordered_ranks) THEN 1519 send_buf_i(rs_descs(ilevel)%rs_desc%virtual2real(decode_rank(tasks(1, i), SIZE(rs_descs))) + 1) = & 1520 send_buf_i(rs_descs(ilevel)%rs_desc%virtual2real(decode_rank(tasks(1, i), SIZE(rs_descs))) + 1) & 1521 + tasks(5, i) 1522 ELSE 1523 send_buf_i(decode_rank(tasks(1, i), SIZE(rs_descs)) + 1) = & 1524 send_buf_i(decode_rank(tasks(1, i), SIZE(rs_descs)) + 1) & 1525 + tasks(5, i) 1526 END IF 1527 END DO 1528 CALL mp_alltoall(send_buf_i, recv_buf_i, 1, desc%group) 1529 1530 ! communication step 2 : compute the global cost of the tasks 1531 total_cost_local = SUM(recv_buf_i) 1532 1533 ! after this step, the recv buffer contains the local cost for each CPU 1534 CALL mp_allgather(total_cost_local, loads, desc%group) 1535 1536 CALL timestop(handle) 1537 1538 END SUBROUTINE get_current_loads 1539! ************************************************************************************************** 1540!> \brief performs load balancing shifting tasks on the replicated grids 1541!> this modifies the destination of some of the tasks on replicated 1542!> grids, and in this way balances the load 1543!> \param rs_descs ... 1544!> \param ntasks ... 1545!> \param tasks ... 1546!> \param nimages ... 1547!> \param natoms ... 1548!> \param maxset ... 1549!> \param maxpgf ... 1550!> \par History 1551!> none 1552!> \author MattW 21/11/2007 1553! ************************************************************************************************** 1554 SUBROUTINE load_balance_replicated(rs_descs, ntasks, tasks, nimages, natoms, maxset, maxpgf) 1555 1556 TYPE(realspace_grid_desc_p_type), DIMENSION(:), & 1557 POINTER :: rs_descs 1558 INTEGER :: ntasks 1559 INTEGER(KIND=int_8), DIMENSION(:, :), POINTER :: tasks 1560 INTEGER, INTENT(IN) :: nimages, natoms, maxset, maxpgf 1561 1562 CHARACTER(LEN=*), PARAMETER :: routineN = 'load_balance_replicated', & 1563 routineP = moduleN//':'//routineN 1564 1565 INTEGER :: handle, i, iatom, ilevel, img, ipgf, & 1566 iset, j, jatom, jpgf, jset, & 1567 no_overloaded, no_underloaded, & 1568 proc_receiving 1569 INTEGER(KIND=int_8) :: average_cost, cost_task_rep, count, & 1570 offset, total_cost_global 1571 INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: load_imbalance, loads, recv_buf_i 1572 INTEGER, ALLOCATABLE, DIMENSION(:) :: index 1573 TYPE(realspace_grid_desc_type), POINTER :: desc 1574 1575 CALL timeset(routineN, handle) 1576 1577 desc => rs_descs(1)%rs_desc 1578 1579 ! allocate local arrays 1580 ALLOCATE (recv_buf_i(desc%group_size)) 1581 ALLOCATE (loads(desc%group_size)) 1582 1583 recv_buf_i = 0 1584 DO i = 1, SIZE(rs_descs) 1585 CALL get_current_loads(loads, rs_descs, i, ntasks, nimages, natoms, maxset, maxpgf, & 1586 tasks, use_reordered_ranks=.TRUE.) 1587 recv_buf_i(:) = recv_buf_i + loads 1588 END DO 1589 1590 total_cost_global = SUM(recv_buf_i) 1591 average_cost = total_cost_global/desc%group_size 1592 1593 ! 1594 ! compute how to redistribute the replicated tasks so that the average cost is reached 1595 ! 1596 1597 ! load imbalance measures the load of a given CPU relative 1598 ! to the optimal load distribution (load=average) 1599 ALLOCATE (load_imbalance(desc%group_size)) 1600 ALLOCATE (INDEX(desc%group_size)) 1601 1602 load_imbalance(:) = recv_buf_i - average_cost 1603 no_overloaded = 0 1604 no_underloaded = 0 1605 1606 DO i = 1, desc%group_size 1607 IF (load_imbalance(i) .GT. 0) no_overloaded = no_overloaded + 1 1608 IF (load_imbalance(i) .LT. 0) no_underloaded = no_underloaded + 1 1609 ENDDO 1610 1611 ! sort the recv_buffer on number of tasks, gives us index which provides a 1612 ! mapping between processor ranks and how overloaded the processor 1613 CALL sort(recv_buf_i, SIZE(recv_buf_i), index) 1614 1615 ! find out the number of replicated tasks each proc has 1616 ! but only those tasks which have not yet been assigned 1617 cost_task_rep = 0 1618 DO i = 1, ntasks 1619 IF (tasks(4, i) .EQ. 0 & 1620 .AND. decode_rank(tasks(1, i), SIZE(rs_descs)) == decode_rank(tasks(2, i), SIZE(rs_descs))) THEN 1621 cost_task_rep = cost_task_rep + tasks(5, i) 1622 ENDIF 1623 ENDDO 1624 1625 ! now, correct the load imbalance for the overloaded CPUs 1626 ! they will send away not more than the total load of replicated tasks 1627 CALL mp_allgather(cost_task_rep, recv_buf_i, desc%group) 1628 1629 DO i = 1, desc%group_size 1630 ! At the moment we can only offload replicated tasks 1631 IF (load_imbalance(i) .GT. 0) & 1632 load_imbalance(i) = MIN(load_imbalance(i), recv_buf_i(i)) 1633 ENDDO 1634 1635 ! simplest algorithm I can think of of is that the processor with the most 1636 ! excess tasks fills up the process needing most, then moves on to next most. 1637 ! At the moment if we've got less replicated tasks than we're overloaded then 1638 ! task balancing will be incomplete 1639 1640 ! only need to do anything if I've excess tasks 1641 IF (load_imbalance(desc%my_pos + 1) .GT. 0) THEN 1642 1643 count = 0 ! weighted amount of tasks offloaded 1644 offset = 0 ! no of underloaded processes already filled by other more overloaded procs 1645 1646 ! calculate offset 1647 DO i = desc%group_size, desc%group_size - no_overloaded + 1, -1 1648 IF (INDEX(i) .EQ. desc%my_pos + 1) THEN 1649 EXIT 1650 ELSE 1651 offset = offset + load_imbalance(INDEX(i)) 1652 ENDIF 1653 ENDDO 1654 1655 ! find my starting processor to send to 1656 proc_receiving = HUGE(proc_receiving) 1657 DO i = 1, no_underloaded 1658 offset = offset + load_imbalance(INDEX(i)) 1659 IF (offset .LE. 0) THEN 1660 proc_receiving = i 1661 EXIT 1662 ENDIF 1663 ENDDO 1664 1665 ! offset now contains minus the number of tasks proc_receiving requires 1666 ! we fill this up by adjusting the destination of tasks on the replicated grid, 1667 ! then move to next most underloaded proc 1668 DO j = 1, ntasks 1669 IF (tasks(4, j) .EQ. 0 & 1670 .AND. decode_rank(tasks(1, j), SIZE(rs_descs)) == decode_rank(tasks(2, j), SIZE(rs_descs))) THEN 1671 ! just avoid sending to non existing procs due to integer truncation 1672 ! in the computation of the average 1673 IF (proc_receiving .GT. no_underloaded) EXIT 1674 ! set new destination 1675 CALL int2pair(tasks(3, j), ilevel, img, iatom, jatom, iset, jset, ipgf, jpgf, nimages, natoms, maxset, maxpgf) 1676 tasks(1, j) = encode_rank(INDEX(proc_receiving) - 1, ilevel, SIZE(rs_descs)) 1677 offset = offset + tasks(5, j) 1678 count = count + tasks(5, j) 1679 IF (count .GE. load_imbalance(desc%my_pos + 1)) EXIT 1680 IF (offset .GT. 0) THEN 1681 proc_receiving = proc_receiving + 1 1682 ! just avoid sending to non existing procs due to integer truncation 1683 ! in the computation of the average 1684 IF (proc_receiving .GT. no_underloaded) EXIT 1685 offset = load_imbalance(INDEX(proc_receiving)) 1686 ENDIF 1687 ENDIF 1688 ENDDO 1689 ENDIF 1690 1691 DEALLOCATE (index) 1692 DEALLOCATE (load_imbalance) 1693 1694 CALL timestop(handle) 1695 1696 END SUBROUTINE load_balance_replicated 1697 1698! ************************************************************************************************** 1699!> \brief given an input task list, redistribute so that all tasks can be processed locally, 1700!> i.e. dest equals rank 1701!> \param rs_descs ... 1702!> \param ntasks ... 1703!> \param tasks ... 1704!> \param rval ... 1705!> \param ntasks_recv ... 1706!> \param tasks_recv ... 1707!> \param rval_recv ... 1708!> \par History 1709!> none 1710!> \author MattW 21/11/2007 1711! ************************************************************************************************** 1712 SUBROUTINE create_local_tasks(rs_descs, ntasks, tasks, rval, & 1713 ntasks_recv, tasks_recv, rval_recv) 1714 1715 TYPE(realspace_grid_desc_p_type), DIMENSION(:), & 1716 POINTER :: rs_descs 1717 INTEGER :: ntasks 1718 INTEGER(KIND=int_8), DIMENSION(:, :), POINTER :: tasks 1719 REAL(KIND=dp), DIMENSION(:, :), POINTER :: rval 1720 INTEGER :: ntasks_recv 1721 INTEGER(KIND=int_8), DIMENSION(:, :), POINTER :: tasks_recv 1722 REAL(KIND=dp), DIMENSION(:, :), POINTER :: rval_recv 1723 1724 CHARACTER(LEN=*), PARAMETER :: routineN = 'create_local_tasks', & 1725 routineP = moduleN//':'//routineN 1726 1727 INTEGER :: handle, i, j, k, l, rank, task_dim 1728 INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: recv_buf_i, send_buf_i 1729 INTEGER, ALLOCATABLE, DIMENSION(:) :: recv_disps, recv_sizes, send_disps, & 1730 send_sizes 1731 REAL(KIND=dp), ALLOCATABLE, DIMENSION(:) :: recv_buf_r, send_buf_r 1732 TYPE(realspace_grid_desc_type), POINTER :: desc 1733 1734 CALL timeset(routineN, handle) 1735 1736 desc => rs_descs(1)%rs_desc 1737 1738 ! allocate local arrays 1739 ALLOCATE (send_sizes(desc%group_size)) 1740 ALLOCATE (recv_sizes(desc%group_size)) 1741 ALLOCATE (send_disps(desc%group_size)) 1742 ALLOCATE (recv_disps(desc%group_size)) 1743 ALLOCATE (send_buf_i(desc%group_size)) 1744 ALLOCATE (recv_buf_i(desc%group_size)) 1745 1746 ! fill send buffer, now counting how many tasks will be send (stored in an int8 array for convenience only). 1747 send_buf_i = 0 1748 DO i = 1, ntasks 1749 rank = rs_descs(decode_level(tasks(1, i), SIZE(rs_descs)))%rs_desc%virtual2real(decode_rank(tasks(1, i), SIZE(rs_descs))) 1750 send_buf_i(rank + 1) = send_buf_i(rank + 1) + 1 1751 END DO 1752 1753 CALL mp_alltoall(send_buf_i, recv_buf_i, 1, desc%group) 1754 1755 ! pack the tasks, and send them around 1756 1757 task_dim = SIZE(tasks, 1) 1758 1759 send_sizes = 0 1760 send_disps = 0 1761 recv_sizes = 0 1762 recv_disps = 0 1763 1764 send_sizes(1) = INT(send_buf_i(1)*task_dim) 1765 recv_sizes(1) = INT(recv_buf_i(1)*task_dim) 1766 DO i = 2, desc%group_size 1767 send_sizes(i) = INT(send_buf_i(i)*task_dim) 1768 recv_sizes(i) = INT(recv_buf_i(i)*task_dim) 1769 send_disps(i) = send_disps(i - 1) + send_sizes(i - 1) 1770 recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1) 1771 ENDDO 1772 1773 ! deallocate old send/recv buffers 1774 DEALLOCATE (send_buf_i) 1775 DEALLOCATE (recv_buf_i) 1776 1777 ! allocate them with new sizes 1778 ALLOCATE (send_buf_i(SUM(send_sizes))) 1779 ALLOCATE (recv_buf_i(SUM(recv_sizes))) 1780 1781 ! do packing 1782 send_buf_i = 0 1783 send_sizes = 0 1784 DO j = 1, ntasks 1785 i = rs_descs(decode_level(tasks(1, j), SIZE(rs_descs)))%rs_desc%virtual2real(decode_rank(tasks(1, j), SIZE(rs_descs))) + 1 1786 DO k = 1, task_dim 1787 send_buf_i(send_disps(i) + send_sizes(i) + k) = tasks(k, j) 1788 ENDDO 1789 send_sizes(i) = send_sizes(i) + task_dim 1790 ENDDO 1791 1792 ! do communication 1793 CALL mp_alltoall(send_buf_i, send_sizes, send_disps, recv_buf_i, recv_sizes, recv_disps, desc%group) 1794 1795 DEALLOCATE (send_buf_i) 1796 1797 ntasks_recv = SUM(recv_sizes)/task_dim 1798 ALLOCATE (tasks_recv(task_dim, ntasks_recv)) 1799 1800 ! do unpacking 1801 l = 0 1802 DO i = 1, desc%group_size 1803 DO j = 0, recv_sizes(i)/task_dim - 1 1804 l = l + 1 1805 DO k = 1, task_dim 1806 tasks_recv(k, l) = recv_buf_i(recv_disps(i) + j*task_dim + k) 1807 ENDDO 1808 ENDDO 1809 ENDDO 1810 1811 DEALLOCATE (recv_buf_i) 1812 1813 ! send rvals (to be removed :-) 1814 1815 ! take care of the new task_dim in the send_sizes 1816 send_sizes(:) = (send_sizes(:)/task_dim)*SIZE(rval, 1) 1817 recv_sizes(:) = (recv_sizes(:)/task_dim)*SIZE(rval, 1) 1818 send_disps(:) = (send_disps(:)/task_dim)*SIZE(rval, 1) 1819 recv_disps(:) = (recv_disps(:)/task_dim)*SIZE(rval, 1) 1820 task_dim = SIZE(rval, 1) 1821 1822 ALLOCATE (send_buf_r(SUM(send_sizes))) 1823 ALLOCATE (recv_buf_r(SUM(recv_sizes))) 1824 1825 !do packing 1826 send_sizes = 0 1827 DO j = 1, ntasks 1828 i = rs_descs(decode_level(tasks(1, j), SIZE(rs_descs)))%rs_desc%virtual2real(decode_rank(tasks(1, j), SIZE(rs_descs))) + 1 1829 DO k = 1, task_dim 1830 send_buf_r(send_disps(i) + send_sizes(i) + k) = rval(k, j) 1831 ENDDO 1832 send_sizes(i) = send_sizes(i) + task_dim 1833 ENDDO 1834 1835 ! do communication 1836 CALL mp_alltoall(send_buf_r, send_sizes, send_disps, & 1837 recv_buf_r, recv_sizes, recv_disps, desc%group) 1838 1839 DEALLOCATE (send_buf_r) 1840 ALLOCATE (rval_recv(task_dim, SUM(recv_sizes)/task_dim)) 1841 1842 ! do unpacking 1843 l = 0 1844 DO i = 1, desc%group_size 1845 DO j = 0, recv_sizes(i)/task_dim - 1 1846 l = l + 1 1847 DO k = 1, task_dim 1848 rval_recv(k, l) = recv_buf_r(recv_disps(i) + j*task_dim + k) 1849 ENDDO 1850 ENDDO 1851 ENDDO 1852 1853 DEALLOCATE (recv_buf_r) 1854 DEALLOCATE (send_sizes) 1855 DEALLOCATE (recv_sizes) 1856 DEALLOCATE (send_disps) 1857 DEALLOCATE (recv_disps) 1858 1859 CALL timestop(handle) 1860 1861 END SUBROUTINE create_local_tasks 1862 1863! ************************************************************************************************** 1864!> \brief Assembles tasks to be performed on local grid 1865!> \param rs_descs the grids 1866!> \param ntasks Number of tasks for local processing 1867!> \param natoms ... 1868!> \param maxset ... 1869!> \param maxpgf ... 1870!> \param nimages ... 1871!> \param tasks the task set generated on this processor 1872!> \param rval ... 1873!> \param atom_pair_send ... 1874!> \param atom_pair_recv ... 1875!> \param symmetric ... 1876!> \param reorder_rs_grid_ranks ... 1877!> \param skip_load_balance_distributed ... 1878!> \par History 1879!> none 1880!> \author MattW 21/11/2007 1881! ************************************************************************************************** 1882 SUBROUTINE distribute_tasks(rs_descs, ntasks, natoms, maxset, maxpgf, nimages, & 1883 tasks, rval, atom_pair_send, atom_pair_recv, & 1884 symmetric, reorder_rs_grid_ranks, skip_load_balance_distributed) 1885 1886 TYPE(realspace_grid_desc_p_type), DIMENSION(:), & 1887 POINTER :: rs_descs 1888 INTEGER :: ntasks, natoms, maxset, maxpgf, nimages 1889 INTEGER(KIND=int_8), DIMENSION(:, :), POINTER :: tasks 1890 REAL(KIND=dp), DIMENSION(:, :), POINTER :: rval 1891 INTEGER(KIND=int_8), DIMENSION(:), POINTER :: atom_pair_send, atom_pair_recv 1892 LOGICAL, INTENT(IN) :: symmetric, reorder_rs_grid_ranks, & 1893 skip_load_balance_distributed 1894 1895 CHARACTER(LEN=*), PARAMETER :: routineN = 'distribute_tasks', & 1896 routineP = moduleN//':'//routineN 1897 1898 INTEGER :: handle, igrid_level, irank, k, & 1899 ntasks_recv 1900 INTEGER(KIND=int_8) :: load_gap, max_load, replicated_load 1901 INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: taskid, total_loads, total_loads_tmp, & 1902 trial_loads 1903 INTEGER(KIND=int_8), DIMENSION(:, :), POINTER :: loads, tasks_recv 1904 INTEGER, ALLOCATABLE, DIMENSION(:) :: index, real2virtual, total_index 1905 LOGICAL :: distributed_grids, fixed_first_grid 1906 REAL(KIND=dp), DIMENSION(:, :), POINTER :: rval_recv 1907 TYPE(realspace_grid_desc_type), POINTER :: desc 1908 1909 CALL timeset(routineN, handle) 1910 1911 CPASSERT(ASSOCIATED(tasks)) 1912 1913 ! *** figure out if we have distributed grids 1914 distributed_grids = .FALSE. 1915 DO igrid_level = 1, SIZE(rs_descs) 1916 IF (rs_descs(igrid_level)%rs_desc%distributed) THEN 1917 distributed_grids = .TRUE. 1918 ENDIF 1919 END DO 1920 desc => rs_descs(1)%rs_desc 1921 1922 IF (distributed_grids) THEN 1923 1924 ALLOCATE (loads(0:desc%group_size - 1, SIZE(rs_descs))) 1925 ALLOCATE (total_loads(0:desc%group_size - 1)) 1926 1927 total_loads = 0 1928 1929 ! First round of balancing on the distributed grids 1930 ! we just balance each of the distributed grids independently 1931 DO igrid_level = 1, SIZE(rs_descs) 1932 IF (rs_descs(igrid_level)%rs_desc%distributed) THEN 1933 1934 IF (.NOT. skip_load_balance_distributed) & 1935 CALL load_balance_distributed(tasks, ntasks, rs_descs, igrid_level, nimages, natoms, maxset, maxpgf) 1936 1937 CALL get_current_loads(loads(:, igrid_level), rs_descs, igrid_level, ntasks, nimages, natoms, maxset, maxpgf, & 1938 tasks, use_reordered_ranks=.FALSE.) 1939 1940 total_loads(:) = total_loads + loads(:, igrid_level) 1941 1942 END IF 1943 END DO 1944 1945 ! calculate the total load of replicated tasks, so we can decide if it is worth 1946 ! reordering the distributed grid levels 1947 1948 replicated_load = 0 1949 DO igrid_level = 1, SIZE(rs_descs) 1950 IF (.NOT. rs_descs(igrid_level)%rs_desc%distributed) THEN 1951 CALL get_current_loads(loads(:, igrid_level), rs_descs, igrid_level, ntasks, nimages, natoms, maxset, maxpgf, & 1952 tasks, use_reordered_ranks=.FALSE.) 1953 replicated_load = replicated_load + SUM(loads(:, igrid_level)) 1954 END IF 1955 END DO 1956 1957 !IF (desc%my_pos==0) THEN 1958 ! WRITE(*,*) "Total replicated load is ",replicated_load 1959 !END IF 1960 1961 ! Now we adjust the rank ordering based on the current loads 1962 ! we leave the first distributed level and all the replicated levels in the default order 1963 IF (reorder_rs_grid_ranks) THEN 1964 fixed_first_grid = .FALSE. 1965 DO igrid_level = 1, SIZE(rs_descs) 1966 IF (rs_descs(igrid_level)%rs_desc%distributed) THEN 1967 IF (fixed_first_grid .EQV. .FALSE.) THEN 1968 total_loads(:) = loads(:, igrid_level) 1969 fixed_first_grid = .TRUE. 1970 ELSE 1971 ALLOCATE (trial_loads(0:desc%group_size - 1)) 1972 1973 trial_loads(:) = total_loads + loads(:, igrid_level) 1974 max_load = MAXVAL(trial_loads) 1975 load_gap = 0 1976 DO irank = 0, desc%group_size - 1 1977 load_gap = load_gap + max_load - trial_loads(irank) 1978 END DO 1979 1980 ! If there is not enough replicated load to load balance well enough 1981 ! then we will reorder this grid level 1982 IF (load_gap > replicated_load*1.05_dp) THEN 1983 1984 ALLOCATE (INDEX(0:desc%group_size - 1)) 1985 ALLOCATE (total_index(0:desc%group_size - 1)) 1986 ALLOCATE (total_loads_tmp(0:desc%group_size - 1)) 1987 ALLOCATE (real2virtual(0:desc%group_size - 1)) 1988 1989 total_loads_tmp(:) = total_loads 1990 CALL sort(total_loads_tmp, desc%group_size, total_index) 1991 CALL sort(loads(:, igrid_level), desc%group_size, index) 1992 1993 ! Reorder so that the rank with smallest load on this grid level is paired with 1994 ! the highest load in total 1995 DO irank = 0, desc%group_size - 1 1996 total_loads(total_index(irank) - 1) = total_loads(total_index(irank) - 1) + & 1997 loads(desc%group_size - irank - 1, igrid_level) 1998 real2virtual(total_index(irank) - 1) = INDEX(desc%group_size - irank - 1) - 1 1999 END DO 2000 2001 CALL rs_grid_reorder_ranks(rs_descs(igrid_level)%rs_desc, real2virtual) 2002 2003 DEALLOCATE (index) 2004 DEALLOCATE (total_index) 2005 DEALLOCATE (total_loads_tmp) 2006 DEALLOCATE (real2virtual) 2007 ELSE 2008 total_loads(:) = trial_loads 2009 ENDIF 2010 2011 DEALLOCATE (trial_loads) 2012 2013 ENDIF 2014 ENDIF 2015 END DO 2016 END IF 2017 2018 ! Now we use the replicated tasks to balance out the rest of the load 2019 CALL load_balance_replicated(rs_descs, ntasks, tasks, nimages, natoms, maxset, maxpgf) 2020 2021 !total_loads = 0 2022 !DO igrid_level=1,SIZE(rs_descs) 2023 ! CALL get_current_loads(loads(:,igrid_level), rs_descs, igrid_level, ntasks, nimages, natoms, maxset,maxpgf, & 2024 ! tasks, use_reordered_ranks=.TRUE.) 2025 ! total_loads = total_loads + loads(:, igrid_level) 2026 !END DO 2027 2028 !IF (desc%my_pos==0) THEN 2029 ! WRITE(*,*) "" 2030 ! WRITE(*,*) "At the end of the load balancing procedure" 2031 ! WRITE(*,*) "Maximum load:",MAXVAL(total_loads) 2032 ! WRITE(*,*) "Average load:",SUM(total_loads)/SIZE(total_loads) 2033 ! WRITE(*,*) "Minimum load:",MINVAL(total_loads) 2034 !ENDIF 2035 2036 ! given a list of tasks, this will do the needed reshuffle so that all tasks will be local 2037 CALL create_local_tasks(rs_descs, ntasks, tasks, rval, ntasks_recv, tasks_recv, rval_recv) 2038 2039 ! 2040 ! tasks list are complete, we can compute the list of atomic blocks (atom pairs) 2041 ! we will be sending. These lists are needed for redistribute_matrix. 2042 ! 2043 ALLOCATE (atom_pair_send(ntasks)) 2044 CALL get_atom_pair(atom_pair_send, tasks, send=.TRUE., & 2045 symmetric=symmetric, natoms=natoms, maxset=maxset, maxpgf=maxpgf, & 2046 nimages=nimages, rs_descs=rs_descs) 2047 2048 ! natom_send=SIZE(atom_pair_send) 2049 ! CALL mp_sum(natom_send,desc%group) 2050 ! IF (desc%my_pos==0) THEN 2051 ! WRITE(*,*) "" 2052 ! WRITE(*,*) "Total number of atomic blocks to be send:",natom_send 2053 ! ENDIF 2054 2055 ALLOCATE (atom_pair_recv(ntasks_recv)) 2056 CALL get_atom_pair(atom_pair_recv, tasks_recv, send=.FALSE., & 2057 symmetric=symmetric, natoms=natoms, maxset=maxset, maxpgf=maxpgf, & 2058 nimages=nimages, rs_descs=rs_descs) 2059 2060 ! cleanup, at this point we don't need the original tasks & rvals anymore 2061 DEALLOCATE (tasks) 2062 DEALLOCATE (rval) 2063 DEALLOCATE (loads) 2064 DEALLOCATE (total_loads) 2065 2066 ELSE 2067 2068 tasks_recv => tasks 2069 ntasks_recv = ntasks 2070 rval_recv => rval 2071 2072 ENDIF 2073 2074 ! 2075 ! here we sort the task list we will process locally. 2076 ! the sort is determined by pair2int 2077 ! 2078 rval => rval_recv 2079 2080 ALLOCATE (taskid(ntasks_recv)) 2081 ALLOCATE (INDEX(ntasks_recv)) 2082 2083 taskid(:) = tasks_recv(3, 1:ntasks_recv) 2084 CALL sort(taskid, SIZE(taskid), index) 2085 2086 DO k = 1, SIZE(tasks_recv, 1) 2087 tasks_recv(k, 1:ntasks_recv) = tasks_recv(k, index) 2088 ENDDO 2089 2090 DO k = 1, SIZE(rval, 1) 2091 rval(k, 1:ntasks_recv) = rval(k, index) 2092 ENDDO 2093 2094 DEALLOCATE (taskid) 2095 DEALLOCATE (index) 2096 2097 ! 2098 ! final lists are ready 2099 ! 2100 2101 tasks => tasks_recv 2102 ntasks = ntasks_recv 2103 2104 CALL timestop(handle) 2105 2106 END SUBROUTINE distribute_tasks 2107 2108! ************************************************************************************************** 2109!> \brief ... 2110!> \param atom_pair ... 2111!> \param my_tasks ... 2112!> \param send ... 2113!> \param symmetric ... 2114!> \param natoms ... 2115!> \param maxset ... 2116!> \param maxpgf ... 2117!> \param nimages ... 2118!> \param rs_descs ... 2119! ************************************************************************************************** 2120 SUBROUTINE get_atom_pair(atom_pair, my_tasks, send, symmetric, natoms, maxset, & 2121 maxpgf, nimages, rs_descs) 2122 2123 INTEGER(KIND=int_8), DIMENSION(:), POINTER :: atom_pair 2124 INTEGER(KIND=int_8), DIMENSION(:, :), POINTER :: my_tasks 2125 LOGICAL :: send, symmetric 2126 INTEGER :: natoms, maxset, maxpgf, nimages 2127 TYPE(realspace_grid_desc_p_type), DIMENSION(:), & 2128 POINTER :: rs_descs 2129 2130 CHARACTER(LEN=*), PARAMETER :: routineN = 'get_atom_pair', routineP = moduleN//':'//routineN 2131 2132 INTEGER :: acol, arow, i, iatom, ilevel, img, ipgf, & 2133 iset, j, jatom, jpgf, jset 2134 INTEGER(KIND=int_8) :: natom8, nim8 2135 INTEGER, DIMENSION(:), POINTER :: index 2136 2137 ! calculate list of atom pairs 2138 ! fill pair list taking into acount symmetry 2139 2140 natom8 = natoms 2141 nim8 = nimages 2142 2143 atom_pair = 0 2144 DO i = 1, SIZE(atom_pair) 2145 CALL int2pair(my_tasks(3, i), ilevel, img, iatom, jatom, iset, jset, ipgf, jpgf, nimages, natoms, maxset, maxpgf) 2146 IF (symmetric) THEN 2147 IF (iatom .LE. jatom) THEN 2148 arow = iatom 2149 acol = jatom 2150 ELSE 2151 arow = jatom 2152 acol = iatom 2153 ENDIF 2154 ELSE 2155 arow = iatom 2156 acol = jatom 2157 ENDIF 2158 IF (send) THEN 2159 ! If sending, we need to use the 'real rank' as the pair has to be sent to the process which 2160 ! actually has the correct part of the rs_grid to do the mapping 2161 atom_pair(i) = rs_descs(ilevel)%rs_desc%virtual2real(decode_rank(my_tasks(1, i), SIZE(rs_descs))) & 2162 *natom8*natom8*nim8 + (arow - 1)*natom8*nim8 + (acol - 1)*nim8 + (img - 1) 2163 ELSE 2164 ! If we are recieving, then no conversion is needed as the rank is that of the process with the 2165 ! required matrix block, and the ordering of the rs grid is irrelevant 2166 atom_pair(i) = decode_rank(my_tasks(2, i), SIZE(rs_descs)) & 2167 *natom8*natom8*nim8 + (arow - 1)*natom8*nim8 + (acol - 1)*nim8 + (img - 1) 2168 ENDIF 2169 ENDDO 2170 2171 ALLOCATE (INDEX(SIZE(atom_pair))) 2172 2173 ! find unique atom pairs that I'm sending/receiving 2174 CALL sort(atom_pair, SIZE(atom_pair), index) 2175 2176 DEALLOCATE (index) 2177 2178 IF (SIZE(atom_pair) > 1) THEN 2179 j = 1 2180 ! first atom pair must be allowed 2181 DO i = 2, SIZE(atom_pair) 2182 IF (atom_pair(i) .GT. atom_pair(i - 1)) THEN 2183 j = j + 1 2184 atom_pair(j) = atom_pair(i) 2185 ENDIF 2186 ENDDO 2187 ! reallocate the atom pair list 2188 CALL reallocate(atom_pair, 1, j) 2189 ENDIF 2190 2191 END SUBROUTINE get_atom_pair 2192 2193! ************************************************************************************************** 2194!> \brief redistributes the matrix so that it can be used in realspace operations 2195!> i.e. according to the task lists for collocate and integrate. 2196!> This routine can become a bottleneck in large calculations. 2197!> \param rs_descs ... 2198!> \param pmats ... 2199!> \param atom_pair_send ... 2200!> \param atom_pair_recv ... 2201!> \param natoms ... 2202!> \param nimages ... 2203!> \param scatter ... 2204!> \param hmats ... 2205! ************************************************************************************************** 2206 SUBROUTINE rs_distribute_matrix(rs_descs, pmats, atom_pair_send, atom_pair_recv, & 2207 natoms, nimages, scatter, hmats) 2208 2209 TYPE(realspace_grid_desc_p_type), DIMENSION(:), & 2210 POINTER :: rs_descs 2211 TYPE(dbcsr_p_type), DIMENSION(:), POINTER :: pmats 2212 INTEGER(KIND=int_8), DIMENSION(:), POINTER :: atom_pair_send, atom_pair_recv 2213 INTEGER :: natoms, nimages 2214 LOGICAL :: scatter 2215 TYPE(dbcsr_p_type), DIMENSION(:), OPTIONAL, & 2216 POINTER :: hmats 2217 2218 CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_distribute_matrix', & 2219 routineP = moduleN//':'//routineN 2220 2221 INTEGER :: acol, arow, handle, i, img, j, k, l, me, & 2222 nblkcols_total, nblkrows_total, ncol, & 2223 nrow, nthread, nthread_left 2224 INTEGER(KIND=int_8) :: natom8, nim8, pair 2225 INTEGER, ALLOCATABLE, DIMENSION(:) :: first_col, first_row, last_col, last_row, recv_disps, & 2226 recv_pair_count, recv_pair_disps, recv_sizes, send_disps, send_pair_count, & 2227 send_pair_disps, send_sizes 2228 INTEGER, DIMENSION(:), POINTER :: col_blk_size, row_blk_size 2229 LOGICAL :: found 2230 REAL(KIND=dp), ALLOCATABLE, DIMENSION(:) :: recv_buf_r, send_buf_r 2231 REAL(KIND=dp), DIMENSION(:, :), POINTER :: h_block, p_block 2232 TYPE(dbcsr_type), POINTER :: hmat, pmat 2233 TYPE(realspace_grid_desc_type), POINTER :: desc 2234 2235!$ INTEGER(kind=omp_lock_kind), ALLOCATABLE, DIMENSION(:) :: locks 2236 2237 CALL timeset(routineN, handle) 2238 2239 IF (.NOT. scatter) THEN 2240 CPASSERT(PRESENT(hmats)) 2241 END IF 2242 2243 desc => rs_descs(1)%rs_desc 2244 me = desc%my_pos + 1 2245 2246 ! allocate local arrays 2247 ALLOCATE (send_sizes(desc%group_size)) 2248 ALLOCATE (recv_sizes(desc%group_size)) 2249 ALLOCATE (send_disps(desc%group_size)) 2250 ALLOCATE (recv_disps(desc%group_size)) 2251 ALLOCATE (send_pair_count(desc%group_size)) 2252 ALLOCATE (recv_pair_count(desc%group_size)) 2253 ALLOCATE (send_pair_disps(desc%group_size)) 2254 ALLOCATE (recv_pair_disps(desc%group_size)) 2255 2256 pmat => pmats(1)%matrix 2257 CALL dbcsr_get_info(pmat, & 2258 row_blk_size=row_blk_size, & 2259 col_blk_size=col_blk_size, & 2260 nblkrows_total=nblkrows_total, & 2261 nblkcols_total=nblkcols_total) 2262 ALLOCATE (first_row(nblkrows_total), last_row(nblkrows_total), & 2263 first_col(nblkcols_total), last_col(nblkcols_total)) 2264 CALL dbcsr_convert_sizes_to_offsets(row_blk_size, first_row, last_row) 2265 CALL dbcsr_convert_sizes_to_offsets(col_blk_size, first_col, last_col) 2266 ! set up send buffer sizes 2267 natom8 = natoms 2268 nim8 = nimages 2269 2270 send_sizes = 0 2271 send_pair_count = 0 2272 DO i = 1, SIZE(atom_pair_send) 2273 2274 ! proc we're sending this block to 2275 k = INT(atom_pair_send(i)/(nim8*natom8**2)) + 1 2276 2277 pair = MOD(atom_pair_send(i), nim8*natom8**2) 2278 2279 arow = INT(pair/(nim8*natom8)) + 1 2280 acol = INT(MOD(pair, nim8*natom8)/nim8) + 1 2281 2282 nrow = last_row(arow) - first_row(arow) + 1 2283 ncol = last_col(acol) - first_col(acol) + 1 2284 2285 send_sizes(k) = send_sizes(k) + nrow*ncol 2286 send_pair_count(k) = send_pair_count(k) + 1 2287 2288 ENDDO 2289 2290 send_disps = 0 2291 send_pair_disps = 0 2292 DO i = 2, desc%group_size 2293 send_disps(i) = send_disps(i - 1) + send_sizes(i - 1) 2294 send_pair_disps(i) = send_pair_disps(i - 1) + send_pair_count(i - 1) 2295 ENDDO 2296 2297 ALLOCATE (send_buf_r(SUM(send_sizes))) 2298 2299 ! set up recv buffer 2300 2301 recv_sizes = 0 2302 recv_pair_count = 0 2303 DO i = 1, SIZE(atom_pair_recv) 2304 2305 ! proc we're receiving this data from 2306 k = INT(atom_pair_recv(i)/(nim8*natom8**2)) + 1 2307 2308 pair = MOD(atom_pair_recv(i), nim8*natom8**2) 2309 2310 arow = INT(pair/(nim8*natom8)) + 1 2311 acol = INT(MOD(pair, nim8*natom8)/nim8) + 1 2312 2313 nrow = last_row(arow) - first_row(arow) + 1 2314 ncol = last_col(acol) - first_col(acol) + 1 2315 2316 recv_sizes(k) = recv_sizes(k) + nrow*ncol 2317 recv_pair_count(k) = recv_pair_count(k) + 1 2318 2319 ENDDO 2320 2321 recv_disps = 0 2322 recv_pair_disps = 0 2323 DO i = 2, desc%group_size 2324 recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1) 2325 recv_pair_disps(i) = recv_pair_disps(i - 1) + recv_pair_count(i - 1) 2326 ENDDO 2327 ALLOCATE (recv_buf_r(SUM(recv_sizes))) 2328 2329!$OMP PARALLEL DEFAULT(NONE), & 2330!$OMP SHARED(desc,send_pair_count,send_pair_disps,natom8,nim8,nimages),& 2331!$OMP SHARED(last_row,first_row,last_col,first_col),& 2332!$OMP SHARED(pmats,send_buf_r,send_disps,send_sizes),& 2333!$OMP SHARED(atom_pair_send,me,hmats,nblkrows_total),& 2334!$OMP SHARED(atom_pair_recv,recv_buf_r,scatter,recv_pair_disps), & 2335!$OMP SHARED(recv_sizes,recv_disps,recv_pair_count,locks), & 2336!$OMP PRIVATE(i,img,pair,arow,acol,nrow,ncol,p_block,found,j,k,l),& 2337!$OMP PRIVATE(nthread,h_block,nthread_left,hmat,pmat) 2338 2339 nthread = 1 2340!$ nthread = omp_get_num_threads() 2341 nthread_left = 1 2342!$ nthread_left = MAX(1, nthread - 1) 2343 2344 ! do packing 2345!$OMP DO schedule(guided) 2346 DO l = 1, desc%group_size 2347 IF (l .EQ. me) CYCLE 2348 send_sizes(l) = 0 2349 DO i = 1, send_pair_count(l) 2350 pair = MOD(atom_pair_send(send_pair_disps(l) + i), nim8*natom8**2) 2351 2352 arow = INT(pair/(nim8*natom8)) + 1 2353 acol = INT(MOD(pair, nim8*natom8)/nim8) + 1 2354 img = INT(MOD(pair, nim8)) + 1 2355 2356 nrow = last_row(arow) - first_row(arow) + 1 2357 ncol = last_col(acol) - first_col(acol) + 1 2358 2359 pmat => pmats(img)%matrix 2360 CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, & 2361 block=p_block, found=found) 2362 CPASSERT(found) 2363 2364 DO k = 1, ncol 2365 DO j = 1, nrow 2366 send_buf_r(send_disps(l) + send_sizes(l) + j + (k - 1)*nrow) = p_block(j, k) 2367 ENDDO 2368 ENDDO 2369 send_sizes(l) = send_sizes(l) + nrow*ncol 2370 ENDDO 2371 ENDDO 2372!$OMP END DO 2373 2374 IF (.NOT. scatter) THEN 2375 ! We need locks to protect concurrent summation into H 2376!$OMP SINGLE 2377!$ ALLOCATE (locks(nthread*10)) 2378!$OMP END SINGLE 2379 2380!$OMP DO 2381!$ do i = 1, nthread*10 2382!$ call omp_init_lock(locks(i)) 2383!$ end do 2384!$OMP END DO 2385 END IF 2386 2387!$OMP MASTER 2388 ! do communication 2389 CALL mp_alltoall(send_buf_r, send_sizes, send_disps, & 2390 recv_buf_r, recv_sizes, recv_disps, desc%group) 2391!$OMP END MASTER 2392 2393 ! If this is a scatter, then no need to copy local blocks, 2394 ! If not, we sum them directly into H (bypassing the alltoall) 2395 IF (.NOT. scatter) THEN 2396 2397 ! Distribute work over remaining threads assuming one is still in the alltoall 2398!$OMP DO schedule(dynamic,MAX(1,send_pair_count(me)/nthread_left)) 2399 DO i = 1, send_pair_count(me) 2400 pair = MOD(atom_pair_send(send_pair_disps(me) + i), nim8*natom8**2) 2401 2402 arow = INT(pair/(nim8*natom8)) + 1 2403 acol = INT(MOD(pair, nim8*natom8)/nim8) + 1 2404 img = INT(MOD(pair, nim8)) + 1 2405 2406 nrow = last_row(arow) - first_row(arow) + 1 2407 ncol = last_col(acol) - first_col(acol) + 1 2408 2409 hmat => hmats(img)%matrix 2410 pmat => pmats(img)%matrix 2411 CALL dbcsr_get_block_p(matrix=hmat, row=arow, col=acol, & 2412 BLOCK=h_block, found=found) 2413 CPASSERT(found) 2414 CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, & 2415 BLOCK=p_block, found=found) 2416 CPASSERT(found) 2417 2418!$ call omp_set_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1)) 2419 DO k = 1, ncol 2420 DO j = 1, nrow 2421 h_block(j, k) = h_block(j, k) + p_block(j, k) 2422 ENDDO 2423 ENDDO 2424!$ call omp_unset_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1)) 2425 ENDDO 2426!$OMP END DO 2427 ELSE 2428 ! We will insert new blocks into P, so create mutable work matrices 2429 DO img = 1, nimages 2430 pmat => pmats(img)%matrix 2431 CALL dbcsr_work_create(pmat, work_mutable=.TRUE., & 2432 nblks_guess=SIZE(atom_pair_recv)/nthread, sizedata_guess=SIZE(recv_buf_r)/nthread, & 2433 n=nthread) 2434 END DO 2435 END IF 2436 2437! wait for comm and setup to finish 2438!$OMP BARRIER 2439 2440 !do unpacking 2441!$OMP DO schedule(guided) 2442 DO l = 1, desc%group_size 2443 IF (l .EQ. me) CYCLE 2444 recv_sizes(l) = 0 2445 DO i = 1, recv_pair_count(l) 2446 pair = MOD(atom_pair_recv(recv_pair_disps(l) + i), nim8*natom8**2) 2447 2448 arow = INT(pair/(nim8*natom8)) + 1 2449 acol = INT(MOD(pair, nim8*natom8)/nim8) + 1 2450 img = INT(MOD(pair, nim8)) + 1 2451 2452 nrow = last_row(arow) - first_row(arow) + 1 2453 ncol = last_col(acol) - first_col(acol) + 1 2454 2455 pmat => pmats(img)%matrix 2456 NULLIFY (p_block) 2457 CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, & 2458 BLOCK=p_block, found=found) 2459 2460 IF (PRESENT(hmats)) THEN 2461 hmat => hmats(img)%matrix 2462 CALL dbcsr_get_block_p(matrix=hmat, row=arow, col=acol, & 2463 BLOCK=h_block, found=found) 2464 CPASSERT(found) 2465 ENDIF 2466 2467 IF (scatter .AND. .NOT. ASSOCIATED(p_block)) THEN 2468 CALL dbcsr_put_block(pmat, arow, acol, & 2469 block=recv_buf_r(recv_disps(l) + recv_sizes(l) + 1:recv_disps(l) + recv_sizes(l) + nrow*ncol)) 2470 END IF 2471 IF (.NOT. scatter) THEN 2472!$ call omp_set_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1)) 2473 DO k = 1, ncol 2474 DO j = 1, nrow 2475 h_block(j, k) = h_block(j, k) + recv_buf_r(recv_disps(l) + recv_sizes(l) + j + (k - 1)*nrow) 2476 ENDDO 2477 ENDDO 2478!$ call omp_unset_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1)) 2479 ENDIF 2480 recv_sizes(l) = recv_sizes(l) + nrow*ncol 2481 ENDDO 2482 ENDDO 2483!$OMP END DO 2484 2485!$ IF (.not. scatter) THEN 2486!$OMP DO 2487!$ do i = 1, nthread*10 2488!$ call omp_destroy_lock(locks(i)) 2489!$ end do 2490!$OMP END DO 2491!$ END IF 2492 2493!$OMP SINGLE 2494!$ IF (.not. scatter) THEN 2495!$ DEALLOCATE (locks) 2496!$ END IF 2497!$OMP END SINGLE NOWAIT 2498 2499 IF (scatter) THEN 2500 ! Blocks were added to P 2501 DO img = 1, nimages 2502 pmat => pmats(img)%matrix 2503 CALL dbcsr_finalize(pmat) 2504 END DO 2505 END IF 2506!$OMP END PARALLEL 2507 2508 DEALLOCATE (send_buf_r) 2509 DEALLOCATE (recv_buf_r) 2510 2511 DEALLOCATE (send_sizes) 2512 DEALLOCATE (recv_sizes) 2513 DEALLOCATE (send_disps) 2514 DEALLOCATE (recv_disps) 2515 DEALLOCATE (send_pair_count) 2516 DEALLOCATE (recv_pair_count) 2517 DEALLOCATE (send_pair_disps) 2518 DEALLOCATE (recv_pair_disps) 2519 2520 DEALLOCATE (first_row, last_row, first_col, last_col) 2521 2522 CALL timestop(handle) 2523 2524 END SUBROUTINE rs_distribute_matrix 2525 2526! ************************************************************************************************** 2527!> \brief determines the rank of the processor who's real rs grid contains point 2528!> \param rs_desc ... 2529!> \param igrid_level ... 2530!> \param n_levels ... 2531!> \param cube_center ... 2532!> \param ntasks ... 2533!> \param tasks ... 2534!> \param lb_cube ... 2535!> \param ub_cube ... 2536!> \param added_tasks ... 2537!> \par History 2538!> 11.2007 created [MattW] 2539!> 10.2008 rewritten [Joost VandeVondele] 2540!> \author MattW 2541! ************************************************************************************************** 2542 SUBROUTINE rs_find_node(rs_desc, igrid_level, n_levels, cube_center, ntasks, tasks, & 2543 lb_cube, ub_cube, added_tasks) 2544 2545 TYPE(realspace_grid_desc_type), POINTER :: rs_desc 2546 INTEGER, INTENT(IN) :: igrid_level, n_levels 2547 INTEGER, DIMENSION(3), INTENT(IN) :: cube_center 2548 INTEGER, INTENT(INOUT) :: ntasks 2549 INTEGER(KIND=int_8), DIMENSION(:, :), POINTER :: tasks 2550 INTEGER, DIMENSION(3), INTENT(IN) :: lb_cube, ub_cube 2551 INTEGER, INTENT(OUT) :: added_tasks 2552 2553 CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_find_node', routineP = moduleN//':'//routineN 2554 INTEGER, PARAMETER :: add_tasks = 1000 2555 REAL(kind=dp), PARAMETER :: mult_tasks = 2.0_dp 2556 2557 INTEGER :: bit_index, coord(3), curr_tasks, dest, i, icoord(3), idest, itask, ix, iy, iz, & 2558 lb_coord(3), lb_domain(3), lbc(3), ub_coord(3), ub_domain(3), ubc(3) 2559 INTEGER(KIND=int_8) :: bit_pattern 2560 LOGICAL :: dir_periodic(3) 2561 2562 coord(1) = rs_desc%x2coord(cube_center(1)) 2563 coord(2) = rs_desc%y2coord(cube_center(2)) 2564 coord(3) = rs_desc%z2coord(cube_center(3)) 2565 dest = rs_desc%coord2rank(coord(1), coord(2), coord(3)) 2566 2567 ! the real cube coordinates 2568 lbc = lb_cube + cube_center 2569 ubc = ub_cube + cube_center 2570 2571 IF (ALL((rs_desc%lb_global(:, dest) - rs_desc%border) .LE. lbc) .AND. & 2572 ALL((rs_desc%ub_global(:, dest) + rs_desc%border) .GE. ubc)) THEN 2573 !standard distributed collocation/integration 2574 tasks(1, ntasks) = encode_rank(dest, igrid_level, n_levels) 2575 tasks(4, ntasks) = 1 2576 tasks(6, ntasks) = 0 2577 added_tasks = 1 2578 2579 ! here we figure out if there is an alternate location for this task 2580 ! i.e. even though the cube_center is not in the real local domain, 2581 ! it might fully fit in the halo of the neighbor 2582 ! if its radius is smaller than the maximum radius 2583 ! the list of possible neighbors is stored here as a bit pattern 2584 ! to reconstruct what the rank of the neigbor is, 2585 ! we can use (note this requires the correct rs_grid) 2586 ! IF (BTEST(bit_pattern,0)) rank=rs_grid_locate_rank(rs_desc,tasks(1,ntasks),(/-1,0,0/)) 2587 ! IF (BTEST(bit_pattern,1)) rank=rs_grid_locate_rank(rs_desc,tasks(1,ntasks),(/+1,0,0/)) 2588 ! IF (BTEST(bit_pattern,2)) rank=rs_grid_locate_rank(rs_desc,tasks(1,ntasks),(/0,-1,0/)) 2589 ! IF (BTEST(bit_pattern,3)) rank=rs_grid_locate_rank(rs_desc,tasks(1,ntasks),(/0,+1,0/)) 2590 ! IF (BTEST(bit_pattern,4)) rank=rs_grid_locate_rank(rs_desc,tasks(1,ntasks),(/0,0,-1/)) 2591 ! IF (BTEST(bit_pattern,5)) rank=rs_grid_locate_rank(rs_desc,tasks(1,ntasks),(/0,0,+1/)) 2592 bit_index = 0 2593 bit_pattern = 0 2594 DO i = 1, 3 2595 IF (rs_desc%perd(i) == 1) THEN 2596 bit_pattern = IBCLR(bit_pattern, bit_index) 2597 bit_index = bit_index + 1 2598 bit_pattern = IBCLR(bit_pattern, bit_index) 2599 bit_index = bit_index + 1 2600 ELSE 2601 ! fits the left neighbor ? 2602 IF (ubc(i) <= rs_desc%lb_global(i, dest) - 1 + rs_desc%border) THEN 2603 bit_pattern = IBSET(bit_pattern, bit_index) 2604 bit_index = bit_index + 1 2605 ELSE 2606 bit_pattern = IBCLR(bit_pattern, bit_index) 2607 bit_index = bit_index + 1 2608 ENDIF 2609 ! fits the right neighbor ? 2610 IF (lbc(i) >= rs_desc%ub_global(i, dest) + 1 - rs_desc%border) THEN 2611 bit_pattern = IBSET(bit_pattern, bit_index) 2612 bit_index = bit_index + 1 2613 ELSE 2614 bit_pattern = IBCLR(bit_pattern, bit_index) 2615 bit_index = bit_index + 1 2616 ENDIF 2617 ENDIF 2618 ENDDO 2619 tasks(6, ntasks) = bit_pattern 2620 2621 ELSE 2622 ! generalised collocation/integration needed 2623 ! first we figure out how many neighbors we have to add to include the lbc/ubc 2624 ! in the available domains (inclusive of halo points) 2625 ! first we 'ignore' periodic boundary conditions 2626 ! i.e. ub_coord-lb_coord+1 might be larger than group_dim 2627 lb_coord = coord 2628 ub_coord = coord 2629 lb_domain = rs_desc%lb_global(:, dest) - rs_desc%border 2630 ub_domain = rs_desc%ub_global(:, dest) + rs_desc%border 2631 DO i = 1, 3 2632 ! only if the grid is not periodic in this direction we need to take care of adding neighbors 2633 IF (rs_desc%perd(i) == 0) THEN 2634 ! if the domain lower bound is greater than the lbc we need to add the size of the neighbor domain 2635 DO 2636 IF (lb_domain(i) > lbc(i)) THEN 2637 lb_coord(i) = lb_coord(i) - 1 2638 icoord = MODULO(lb_coord, rs_desc%group_dim) 2639 idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3)) 2640 lb_domain(i) = lb_domain(i) - (rs_desc%ub_global(i, idest) - rs_desc%lb_global(i, idest) + 1) 2641 ELSE 2642 EXIT 2643 ENDIF 2644 ENDDO 2645 ! same for the upper bound 2646 DO 2647 IF (ub_domain(i) < ubc(i)) THEN 2648 ub_coord(i) = ub_coord(i) + 1 2649 icoord = MODULO(ub_coord, rs_desc%group_dim) 2650 idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3)) 2651 ub_domain(i) = ub_domain(i) + (rs_desc%ub_global(i, idest) - rs_desc%lb_global(i, idest) + 1) 2652 ELSE 2653 EXIT 2654 ENDIF 2655 ENDDO 2656 ENDIF 2657 ENDDO 2658 2659 ! some care is needed for the periodic boundaries 2660 DO i = 1, 3 2661 IF (ub_domain(i) - lb_domain(i) + 1 >= rs_desc%npts(i)) THEN 2662 dir_periodic(i) = .TRUE. 2663 lb_coord(i) = 0 2664 ub_coord(i) = rs_desc%group_dim(i) - 1 2665 ELSE 2666 dir_periodic(i) = .FALSE. 2667 ENDIF 2668 ENDDO 2669 2670 added_tasks = PRODUCT(ub_coord - lb_coord + 1) 2671 itask = ntasks 2672 ntasks = ntasks + added_tasks - 1 2673 IF (ntasks > SIZE(tasks, 2)) THEN 2674 curr_tasks = INT((SIZE(tasks, 2) + add_tasks)*mult_tasks) 2675 CALL reallocate(tasks, 1, 6, 1, curr_tasks) 2676 END IF 2677 DO iz = lb_coord(3), ub_coord(3) 2678 DO iy = lb_coord(2), ub_coord(2) 2679 DO ix = lb_coord(1), ub_coord(1) 2680 icoord = MODULO((/ix, iy, iz/), rs_desc%group_dim) 2681 idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3)) 2682 tasks(1, itask) = encode_rank(idest, igrid_level, n_levels) 2683 tasks(4, itask) = 2 2684 tasks(6, itask) = 0 2685 ! encode the domain size for this task 2686 ! if the bit is set, we need to add the border in that direction 2687 IF (ix == lb_coord(1) .AND. .NOT. dir_periodic(1)) tasks(6, itask) = IBSET(tasks(6, itask), 0) 2688 IF (ix == ub_coord(1) .AND. .NOT. dir_periodic(1)) tasks(6, itask) = IBSET(tasks(6, itask), 1) 2689 IF (iy == lb_coord(2) .AND. .NOT. dir_periodic(2)) tasks(6, itask) = IBSET(tasks(6, itask), 2) 2690 IF (iy == ub_coord(2) .AND. .NOT. dir_periodic(2)) tasks(6, itask) = IBSET(tasks(6, itask), 3) 2691 IF (iz == lb_coord(3) .AND. .NOT. dir_periodic(3)) tasks(6, itask) = IBSET(tasks(6, itask), 4) 2692 IF (iz == ub_coord(3) .AND. .NOT. dir_periodic(3)) tasks(6, itask) = IBSET(tasks(6, itask), 5) 2693 itask = itask + 1 2694 ENDDO 2695 ENDDO 2696 ENDDO 2697 ENDIF 2698 2699 END SUBROUTINE rs_find_node 2700 2701! ************************************************************************************************** 2702!> \brief utility functions for encoding the grid level with a rank, allowing 2703!> different grid levels to maintain a different rank ordering without 2704!> losing information. These encoded_ints are stored in the tasks(1:2,:) array 2705!> \param rank ... 2706!> \param grid_level ... 2707!> \param n_levels ... 2708!> \return ... 2709!> \par History 2710!> 4.2009 created [Iain Bethune] 2711!> (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project 2712! ************************************************************************************************** 2713 FUNCTION encode_rank(rank, grid_level, n_levels) RESULT(encoded_int) 2714 2715 INTEGER, INTENT(IN) :: rank, grid_level, n_levels 2716 INTEGER(KIND=int_8) :: encoded_int 2717 2718! ordered so can still sort by rank 2719 2720 encoded_int = rank*n_levels + grid_level - 1 2721 2722 END FUNCTION 2723 2724! ************************************************************************************************** 2725!> \brief ... 2726!> \param encoded_int ... 2727!> \param n_levels ... 2728!> \return ... 2729! ************************************************************************************************** 2730 FUNCTION decode_rank(encoded_int, n_levels) RESULT(rank) 2731 2732 INTEGER(KIND=int_8), INTENT(IN) :: encoded_int 2733 INTEGER, INTENT(IN) :: n_levels 2734 INTEGER :: rank 2735 2736 rank = INT(encoded_int/n_levels) 2737 2738 END FUNCTION 2739 2740! ************************************************************************************************** 2741!> \brief ... 2742!> \param encoded_int ... 2743!> \param n_levels ... 2744!> \return ... 2745! ************************************************************************************************** 2746 FUNCTION decode_level(encoded_int, n_levels) RESULT(grid_level) 2747 2748 INTEGER(KIND=int_8), INTENT(IN) :: encoded_int 2749 INTEGER, INTENT(IN) :: n_levels 2750 INTEGER :: grid_level 2751 2752 INTEGER(KIND=int_8) :: n_levels8 2753 2754 n_levels8 = n_levels 2755 2756 grid_level = INT(MODULO(encoded_int, n_levels8)) + 1 2757 2758 END FUNCTION decode_level 2759 2760END MODULE task_list_methods 2761