1!--------------------------------------------------------------------------------------------------! 2! CP2K: A general program to perform molecular dynamics simulations ! 3! Copyright (C) 2000 - 2019 CP2K developers group ! 4!--------------------------------------------------------------------------------------------------! 5 6! ************************************************************************************************** 7!> \brief Rountines for optimizing load balance between processes in HFX calculations 8!> \par History 9!> 04.2008 created [Manuel Guidon] 10!> \author Manuel Guidon 11! ************************************************************************************************** 12MODULE hfx_load_balance_methods 13 USE cell_types, ONLY: cell_type 14 USE cp_files, ONLY: close_file,& 15 open_file 16 USE cp_para_types, ONLY: cp_para_env_type 17 USE hfx_pair_list_methods, ONLY: build_atomic_pair_list,& 18 build_pair_list 19 USE hfx_types, ONLY: & 20 hfx_basis_type, hfx_block_range_type, hfx_distribution, hfx_load_balance_type, hfx_p_kind, & 21 hfx_screen_coeff_type, hfx_set_distr_energy, hfx_set_distr_forces, hfx_type, & 22 pair_list_type, pair_set_list_type 23 USE input_constants, ONLY: hfx_do_eval_energy,& 24 hfx_do_eval_forces 25 USE kinds, ONLY: dp,& 26 int_8 27 USE message_passing, ONLY: mp_allgather,& 28 mp_gatherv,& 29 mp_isendrecv,& 30 mp_max,& 31 mp_sum,& 32 mp_sync,& 33 mp_waitall 34 USE parallel_rng_types, ONLY: UNIFORM,& 35 create_rng_stream,& 36 delete_rng_stream,& 37 next_random_number,& 38 rng_stream_type 39 USE particle_types, ONLY: particle_type 40 USE util, ONLY: sort 41#include "./base/base_uses.f90" 42 43 IMPLICIT NONE 44 PRIVATE 45 46 PUBLIC :: hfx_load_balance, & 47 hfx_update_load_balance, & 48 collect_load_balance_info, cost_model, p1_energy, p2_energy, p3_energy 49 50 CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'hfx_load_balance_methods' 51 52 REAL(KIND=dp), PARAMETER :: p1_energy(12) = (/2.9461408209700424_dp, 1.0624718662999657_dp, & 53 -1.91570128356921242E-002_dp, -1.6668495454436603_dp, & 54 1.7512639006523709_dp, -9.76074323945336081E-002_dp, & 55 2.6230786127311889_dp, -0.31870737623014189_dp, & 56 7.9588203912690973_dp, 1.8331423413134813_dp, & 57 -0.15427618665346299_dp, 0.19749436090711650_dp/) 58 REAL(KIND=dp), PARAMETER :: p2_energy(12) = (/2.3104682960662593_dp, 1.8744052737304417_dp, & 59 -9.36564055598656797E-002_dp, 0.64284973765086939_dp, & 60 1.0137565430060556_dp, -6.80088178288954567E-003_dp, & 61 1.1692629207374552_dp, -2.6314710080507573_dp, & 62 19.237814781880786_dp, 1.0505934173661349_dp, & 63 0.80382371955699250_dp, 0.49903401991818103_dp/) 64 REAL(KIND=dp), PARAMETER :: p3_energy(2) = (/7.82336287670072350E-002_dp, 0.38073304105744837_dp/) 65 REAL(KIND=dp), PARAMETER :: p1_forces(12) = (/2.5746279948798874_dp, 1.3420575378609276_dp, & 66 -9.41673106447732111E-002_dp, 0.94568006899317825_dp, & 67 -1.4511897117448544_dp, 0.59178934677316952_dp, & 68 2.7291149361757236_dp, -0.50555512044800210_dp, & 69 8.3508180969609871_dp, 1.6829982496141809_dp, & 70 -0.74895370472152600_dp, 0.43801726744197500_dp/) 71 REAL(KIND=dp), PARAMETER :: p2_forces(12) = (/2.6398568961569020_dp, 2.3024918834564101_dp, & 72 5.33216585432061581E-003_dp, 0.45572145697283628_dp, & 73 1.8119743851500618_dp, -0.12533918548421166_dp, & 74 -1.4040312084552751_dp, -4.5331650463917859_dp, & 75 12.593431549069477_dp, 1.1311978374487595_dp, & 76 1.4245996087624646_dp, 1.1425350529853495_dp/) 77 REAL(KIND=dp), PARAMETER :: p3_forces(2) = (/0.12051930516830946_dp, 1.3828051586144336_dp/) 78 79!*** 80 81CONTAINS 82 83! ************************************************************************************************** 84!> \brief Distributes the computation of eri's to all available processes. 85!> \param x_data Object that stores the indices array 86!> \param eps_schwarz screening parameter 87!> \param particle_set , atomic_kind_set, para_env ... 88!> \param max_set Maximum number of set to be considered 89!> \param para_env para_env 90!> \param coeffs_set screening functions 91!> \param coeffs_kind screening functions 92!> \param is_assoc_atomic_block_global KS-matrix sparsity 93!> \param do_periodic flag for periodicity 94!> \param load_balance_parameter Paramters for Monte-Carlo routines 95!> \param kind_of helper array for mapping 96!> \param basis_parameter Basis set parameters 97!> \param pmax_set Initial screening matrix 98!> \param pmax_atom ... 99!> \param i_thread Process ID of current Thread 100!> \param n_threads Total Number of Threads 101!> \param cell cell 102!> \param do_p_screening Flag for initial p screening 103!> \param map_atom_to_kind_atom ... 104!> \param nkind ... 105!> \param eval_type ... 106!> \param pmax_block ... 107!> \param use_virial ... 108!> \par History 109!> 06.2007 created [Manuel Guidon] 110!> 08.2007 new parallel scheme [Manuel Guidon] 111!> 09.2007 new 'modulo' parellel scheme and Monte Carlo step [Manuel Guidon] 112!> 11.2007 parallelize load balance on box_idx1 [Manuel Guidon] 113!> 02.2009 completely refactored [Manuel Guidon] 114!> \author Manuel Guidon 115!> \note 116!> The optimization is done via a binning procedure followed by simple 117!> Monte Carlo procedure: 118!> In a first step the total amount of integrals in the system is calculated, 119!> taking into account the sparsity of the KS-matrix , the screening based 120!> on near/farfield approximations and if desired the screening on an initial 121!> density matrix. 122!> In a second step, bins are generate that contain approximately the same number 123!> of integrals, and a cost for these bins is estimated (currently the number of integrals) 124!> In a third step, a Monte Carlo procedure optimizes the assignment 125!> of the different loads to each process 126!> At the end each process owns an unique array of *atomic* indices-ranges 127!> that are used to decide whether a process has to calculate a certain 128!> bunch of integrals or not 129! ************************************************************************************************** 130 SUBROUTINE hfx_load_balance(x_data, eps_schwarz, particle_set, max_set, para_env, & 131 coeffs_set, coeffs_kind, & 132 is_assoc_atomic_block_global, do_periodic, & 133 load_balance_parameter, kind_of, basis_parameter, pmax_set, & 134 pmax_atom, i_thread, n_threads, cell, & 135 do_p_screening, map_atom_to_kind_atom, nkind, eval_type, & 136 pmax_block, use_virial) 137 TYPE(hfx_type), POINTER :: x_data 138 REAL(dp), INTENT(IN) :: eps_schwarz 139 TYPE(particle_type), DIMENSION(:), POINTER :: particle_set 140 INTEGER, INTENT(IN) :: max_set 141 TYPE(cp_para_env_type), POINTER :: para_env 142 TYPE(hfx_screen_coeff_type), & 143 DIMENSION(:, :, :, :), POINTER :: coeffs_set 144 TYPE(hfx_screen_coeff_type), DIMENSION(:, :), & 145 POINTER :: coeffs_kind 146 INTEGER, DIMENSION(:, :) :: is_assoc_atomic_block_global 147 LOGICAL :: do_periodic 148 TYPE(hfx_load_balance_type), POINTER :: load_balance_parameter 149 INTEGER :: kind_of(*) 150 TYPE(hfx_basis_type), DIMENSION(:), POINTER :: basis_parameter 151 TYPE(hfx_p_kind), DIMENSION(:), POINTER :: pmax_set 152 REAL(dp), DIMENSION(:, :), POINTER :: pmax_atom 153 INTEGER, INTENT(IN) :: i_thread, n_threads 154 TYPE(cell_type), POINTER :: cell 155 LOGICAL, INTENT(IN) :: do_p_screening 156 INTEGER, DIMENSION(:), POINTER :: map_atom_to_kind_atom 157 INTEGER, INTENT(IN) :: nkind, eval_type 158 REAL(dp), DIMENSION(:, :), POINTER :: pmax_block 159 LOGICAL, INTENT(IN) :: use_virial 160 161 CHARACTER(LEN=*), PARAMETER :: routineN = 'hfx_load_balance', & 162 routineP = moduleN//':'//routineN 163 164 CHARACTER(LEN=512) :: error_msg 165 INTEGER :: block_size, current_block_id, data_from, dest, handle, handle_inner, & 166 handle_range, i, iatom_block, iatom_end, iatom_start, ibin, icpu, j, jatom_block, & 167 jatom_end, jatom_start, katom_block, katom_end, katom_start, latom_block, latom_end, & 168 latom_start, mepos, my_process_id, n_processes, natom, nbins, nblocks, ncpu, & 169 new_iatom_end, new_iatom_start, new_jatom_end, new_jatom_start, non_empty_blocks, & 170 objective_block_size, objective_nblocks, req(2), source, total_blocks 171 INTEGER(int_8) :: atom_block, cost_per_bin, cost_per_core, current_cost, & 172 distribution_counter_end, distribution_counter_start, global_quartet_counter, & 173 local_quartet_counter, self_cost_per_block, tmp_block, total_block_self_cost 174 INTEGER(int_8), ALLOCATABLE, DIMENSION(:) :: buffer_in, buffer_out 175 INTEGER(int_8), DIMENSION(:), POINTER :: local_cost_matrix, recbuffer, & 176 sendbuffer, swapbuffer 177 INTEGER(int_8), DIMENSION(:), POINTER, SAVE :: cost_matrix 178 INTEGER(int_8), SAVE :: shm_global_quartet_counter, & 179 shm_local_quartet_counter 180 INTEGER, ALLOCATABLE, DIMENSION(:) :: rcount, rdispl, tmp_index, tmp_pos, & 181 to_be_sorted 182 INTEGER, DIMENSION(:), POINTER, SAVE :: shm_distribution_vector 183 INTEGER, SAVE :: shm_nblocks 184 LOGICAL :: changed, last_bin_needs_to_be_filled, & 185 optimized 186 LOGICAL, DIMENSION(:, :), POINTER, SAVE :: atomic_pair_list 187 REAL(dp) :: coeffs_kind_max0, log10_eps_schwarz, & 188 log_2, pmax_blocks 189 TYPE(hfx_block_range_type), DIMENSION(:), POINTER :: blocks_guess, tmp_blocks, tmp_blocks2 190 TYPE(hfx_block_range_type), DIMENSION(:), & 191 POINTER, SAVE :: shm_blocks 192 TYPE(hfx_distribution), DIMENSION(:), POINTER :: binned_dist, ptr_to_tmp_dist, tmp_dist 193 TYPE(hfx_distribution), DIMENSION(:, :), POINTER, & 194 SAVE :: full_dist 195 TYPE(pair_list_type) :: list_ij, list_kl 196 TYPE(pair_set_list_type), ALLOCATABLE, & 197 DIMENSION(:) :: set_list_ij, set_list_kl 198 199!$OMP BARRIER 200!$OMP MASTER 201 CALL timeset(routineN, handle) 202!$OMP END MASTER 203!$OMP BARRIER 204 205 log10_eps_schwarz = LOG10(eps_schwarz) 206 log_2 = LOG10(2.0_dp) 207 coeffs_kind_max0 = MAXVAL(coeffs_kind(:, :)%x(2)) 208 ncpu = para_env%num_pe 209 n_processes = ncpu*n_threads 210 natom = SIZE(particle_set) 211 212 block_size = load_balance_parameter%block_size 213 ALLOCATE (set_list_ij((max_set*block_size)**2)) 214 ALLOCATE (set_list_kl((max_set*block_size)**2)) 215 216 IF (.NOT. load_balance_parameter%blocks_initialized) THEN 217!$OMP BARRIER 218!$OMP MASTER 219 CALL timeset(routineN//"_range", handle_range) 220 221 nblocks = MAX((natom + block_size - 1)/block_size, 1) 222 ALLOCATE (blocks_guess(nblocks)) 223 ALLOCATE (tmp_blocks(natom)) 224 ALLOCATE (tmp_blocks2(natom)) 225 226 pmax_blocks = 0.0_dp 227 SELECT CASE (eval_type) 228 CASE (hfx_do_eval_energy) 229 atomic_pair_list => x_data%atomic_pair_list 230 CASE (hfx_do_eval_forces) 231 atomic_pair_list => x_data%atomic_pair_list_forces 232 END SELECT 233 atomic_pair_list = .TRUE. 234 CALL init_blocks(nkind, para_env, natom, block_size, nblocks, blocks_guess, & 235 list_ij, list_kl, set_list_ij, set_list_kl, & 236 particle_set, & 237 coeffs_set, coeffs_kind, & 238 is_assoc_atomic_block_global, do_periodic, & 239 kind_of, basis_parameter, pmax_set, pmax_atom, & 240 pmax_blocks, cell, & 241 do_p_screening, map_atom_to_kind_atom, eval_type, & 242 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list) 243 244 total_block_self_cost = 0 245 246 DO i = 1, nblocks 247 total_block_self_cost = total_block_self_cost + blocks_guess(i)%cost 248 END DO 249 250 CALL mp_sum(total_block_self_cost, para_env%group) 251 252 objective_block_size = load_balance_parameter%block_size 253 objective_nblocks = MAX((natom + objective_block_size - 1)/objective_block_size, 1) 254 255 self_cost_per_block = (total_block_self_cost + objective_nblocks - 1)/(objective_nblocks) 256 257 DO i = 1, nblocks 258 tmp_blocks2(i) = blocks_guess(i) 259 END DO 260 261 optimized = .FALSE. 262 i = 0 263 DO WHILE (.NOT. optimized) 264 i = i + 1 265 current_block_id = 0 266 changed = .FALSE. 267 DO atom_block = 1, nblocks 268 current_block_id = current_block_id + 1 269 iatom_start = tmp_blocks2(atom_block)%istart 270 iatom_end = tmp_blocks2(atom_block)%iend 271 IF (tmp_blocks2(atom_block)%cost > 1.5_dp*self_cost_per_block .AND. iatom_end - iatom_start > 0) THEN 272 changed = .TRUE. 273 new_iatom_start = iatom_start 274 new_iatom_end = (iatom_end - iatom_start + 1)/2 + iatom_start - 1 275 new_jatom_start = new_iatom_end + 1 276 new_jatom_end = iatom_end 277 tmp_blocks(current_block_id)%istart = new_iatom_start 278 tmp_blocks(current_block_id)%iend = new_iatom_end 279 tmp_blocks(current_block_id)%cost = estimate_block_cost( & 280 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, & 281 new_iatom_start, new_iatom_end, new_iatom_start, new_iatom_end, & 282 new_iatom_start, new_iatom_end, new_iatom_start, new_iatom_end, & 283 particle_set, & 284 coeffs_set, coeffs_kind, & 285 is_assoc_atomic_block_global, do_periodic, & 286 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, & 287 cell, & 288 do_p_screening, map_atom_to_kind_atom, eval_type, & 289 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list) 290 current_block_id = current_block_id + 1 291 tmp_blocks(current_block_id)%istart = new_jatom_start 292 tmp_blocks(current_block_id)%iend = new_jatom_end 293 tmp_blocks(current_block_id)%cost = estimate_block_cost( & 294 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, & 295 new_jatom_start, new_jatom_end, new_jatom_start, new_jatom_end, & 296 new_jatom_start, new_jatom_end, new_jatom_start, new_jatom_end, & 297 particle_set, & 298 coeffs_set, coeffs_kind, & 299 is_assoc_atomic_block_global, do_periodic, & 300 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, & 301 cell, & 302 do_p_screening, map_atom_to_kind_atom, eval_type, & 303 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list) 304 ELSE 305 tmp_blocks(current_block_id)%istart = iatom_start 306 tmp_blocks(current_block_id)%iend = iatom_end 307 tmp_blocks(current_block_id)%cost = tmp_blocks2(atom_block)%cost 308 END IF 309 END DO 310 IF (.NOT. changed) optimized = .TRUE. 311 IF (i > 20) optimized = .TRUE. 312 nblocks = current_block_id 313 DO atom_block = 1, nblocks 314 tmp_blocks2(atom_block) = tmp_blocks(atom_block) 315 END DO 316 END DO 317 318 DEALLOCATE (tmp_blocks2) 319 320 ! ** count number of non empty blocks on each node 321 non_empty_blocks = 0 322 DO atom_block = 1, nblocks 323 IF (tmp_blocks(atom_block)%istart == 0) CYCLE 324 non_empty_blocks = non_empty_blocks + 1 325 END DO 326 327 ALLOCATE (rcount(ncpu)) 328 rcount = 0 329 rcount(para_env%mepos + 1) = non_empty_blocks 330 CALL mp_sum(rcount, para_env%group) 331 332 ! ** sum all non_empty_blocks 333 total_blocks = 0 334 DO i = 1, ncpu 335 total_blocks = total_blocks + rcount(i) 336 END DO 337 338 ! ** calculate offsets 339 ALLOCATE (rdispl(ncpu)) 340 rcount(:) = rcount(:)*3 341 rdispl(1) = 0 342 DO i = 2, ncpu 343 rdispl(i) = rdispl(i - 1) + rcount(i - 1) 344 END DO 345 346 ALLOCATE (buffer_in(3*non_empty_blocks)) 347 348 non_empty_blocks = 0 349 DO atom_block = 1, nblocks 350 IF (tmp_blocks(atom_block)%istart == 0) CYCLE 351 buffer_in(non_empty_blocks*3 + 1) = tmp_blocks(atom_block)%istart 352 buffer_in(non_empty_blocks*3 + 2) = tmp_blocks(atom_block)%iend 353 buffer_in(non_empty_blocks*3 + 3) = tmp_blocks(atom_block)%cost 354 non_empty_blocks = non_empty_blocks + 1 355 END DO 356 357 nblocks = total_blocks 358 359 ALLOCATE (tmp_blocks2(nblocks)) 360 361 ALLOCATE (buffer_out(3*nblocks)) 362 363 ! ** Gather all three arrays 364 CALL mp_allgather(buffer_in, buffer_out, rcount, rdispl, para_env%group) 365 366 DO i = 1, nblocks 367 tmp_blocks2(i)%istart = INT(buffer_out((i - 1)*3 + 1)) 368 tmp_blocks2(i)%iend = INT(buffer_out((i - 1)*3 + 2)) 369 tmp_blocks2(i)%cost = buffer_out((i - 1)*3 + 3) 370 END DO 371 372 ! ** Now we sort the blocks 373 ALLOCATE (to_be_sorted(nblocks)) 374 ALLOCATE (tmp_index(nblocks)) 375 376 DO atom_block = 1, nblocks 377 to_be_sorted(atom_block) = tmp_blocks2(atom_block)%istart 378 END DO 379 380 CALL sort(to_be_sorted, nblocks, tmp_index) 381 382 ALLOCATE (x_data%blocks(nblocks)) 383 384 DO atom_block = 1, nblocks 385 x_data%blocks(atom_block) = tmp_blocks2(tmp_index(atom_block)) 386 END DO 387 388 shm_blocks => x_data%blocks 389 shm_nblocks = nblocks 390 391 ! ** Set nblocks in structure 392 load_balance_parameter%nblocks = nblocks 393 394 DEALLOCATE (blocks_guess, tmp_blocks, tmp_blocks2) 395 396 DEALLOCATE (rcount, rdispl, buffer_in, buffer_out, to_be_sorted, tmp_index) 397 398 load_balance_parameter%blocks_initialized = .TRUE. 399 400 x_data%blocks = shm_blocks 401 load_balance_parameter%nblocks = shm_nblocks 402 load_balance_parameter%blocks_initialized = .TRUE. 403 404 ALLOCATE (x_data%pmax_block(shm_nblocks, shm_nblocks)) 405 x_data%pmax_block = 0.0_dp 406 pmax_block => x_data%pmax_block 407 CALL timestop(handle_range) 408!$OMP END MASTER 409!$OMP BARRIER 410 411 IF (.NOT. load_balance_parameter%blocks_initialized) THEN 412 ALLOCATE (x_data%blocks(shm_nblocks)) 413 x_data%blocks = shm_blocks 414 load_balance_parameter%nblocks = shm_nblocks 415 load_balance_parameter%blocks_initialized = .TRUE. 416 END IF 417 !! ** precalculate maximum density matrix elements in blocks 418!$OMP BARRIER 419 END IF 420 421!$OMP BARRIER 422!$OMP MASTER 423 pmax_block => x_data%pmax_block 424 pmax_block = 0.0_dp 425 IF (do_p_screening) THEN 426 DO iatom_block = 1, shm_nblocks 427 iatom_start = x_data%blocks(iatom_block)%istart 428 iatom_end = x_data%blocks(iatom_block)%iend 429 DO jatom_block = 1, shm_nblocks 430 jatom_start = x_data%blocks(jatom_block)%istart 431 jatom_end = x_data%blocks(jatom_block)%iend 432 pmax_block(iatom_block, jatom_block) = MAXVAL(pmax_atom(iatom_start:iatom_end, jatom_start:jatom_end)) 433 END DO 434 END DO 435 END IF 436 437 SELECT CASE (eval_type) 438 CASE (hfx_do_eval_energy) 439 atomic_pair_list => x_data%atomic_pair_list 440 CASE (hfx_do_eval_forces) 441 atomic_pair_list => x_data%atomic_pair_list_forces 442 END SELECT 443 CALL build_atomic_pair_list(natom, atomic_pair_list, kind_of, basis_parameter, particle_set, & 444 do_periodic, coeffs_kind, coeffs_kind_max0, log10_eps_schwarz, cell, & 445 x_data%blocks) 446 447!$OMP END MASTER 448!$OMP BARRIER 449 450 !! If there is only 1 cpu skip the binning 451 IF (n_processes == 1) THEN 452 ALLOCATE (tmp_dist(1)) 453 tmp_dist(1)%number_of_atom_quartets = HUGE(tmp_dist(1)%number_of_atom_quartets) 454 tmp_dist(1)%istart = 0_int_8 455 ptr_to_tmp_dist => tmp_dist(:) 456 SELECT CASE (eval_type) 457 CASE (hfx_do_eval_energy) 458 CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data) 459 CASE (hfx_do_eval_forces) 460 CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data) 461 END SELECT 462 DEALLOCATE (tmp_dist) 463 ELSE 464 !! Calculate total numbers of integrals that have to be calculated (wrt screening and symmetry) 465!$OMP BARRIER 466!$OMP MASTER 467 CALL timeset(routineN//"_count", handle_inner) 468!$OMP END MASTER 469!$OMP BARRIER 470 471 cost_per_core = 0_int_8 472 my_process_id = para_env%mepos*n_threads + i_thread 473 nblocks = load_balance_parameter%nblocks 474 475 DO atom_block = my_process_id, INT(nblocks, KIND=int_8)**4 - 1, n_processes 476 477 latom_block = INT(MODULO(atom_block, INT(nblocks, KIND=int_8))) + 1 478 tmp_block = atom_block/nblocks 479 katom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1 480 IF (latom_block < katom_block) CYCLE 481 tmp_block = tmp_block/nblocks 482 jatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1 483 tmp_block = tmp_block/nblocks 484 iatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1 485 IF (jatom_block < iatom_block) CYCLE 486 487 iatom_start = x_data%blocks(iatom_block)%istart 488 iatom_end = x_data%blocks(iatom_block)%iend 489 jatom_start = x_data%blocks(jatom_block)%istart 490 jatom_end = x_data%blocks(jatom_block)%iend 491 katom_start = x_data%blocks(katom_block)%istart 492 katom_end = x_data%blocks(katom_block)%iend 493 latom_start = x_data%blocks(latom_block)%istart 494 latom_end = x_data%blocks(latom_block)%iend 495 496 SELECT CASE (eval_type) 497 CASE (hfx_do_eval_energy) 498 pmax_blocks = MAX(pmax_block(katom_block, iatom_block), & 499 pmax_block(latom_block, jatom_block), & 500 pmax_block(latom_block, iatom_block), & 501 pmax_block(katom_block, jatom_block)) 502 CASE (hfx_do_eval_forces) 503 pmax_blocks = MAX(pmax_block(katom_block, iatom_block) + & 504 pmax_block(latom_block, jatom_block), & 505 pmax_block(latom_block, iatom_block) + & 506 pmax_block(katom_block, jatom_block)) 507 END SELECT 508 509 IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE 510 511 cost_per_core = cost_per_core & 512 + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, & 513 iatom_start, iatom_end, jatom_start, jatom_end, & 514 katom_start, katom_end, latom_start, latom_end, & 515 particle_set, & 516 coeffs_set, coeffs_kind, & 517 is_assoc_atomic_block_global, do_periodic, & 518 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, & 519 cell, & 520 do_p_screening, map_atom_to_kind_atom, eval_type, & 521 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list) 522 523 END DO ! atom_block 524 525 nbins = load_balance_parameter%nbins 526 cost_per_bin = (cost_per_core + nbins - 1)/(nbins) 527 528!$OMP BARRIER 529!$OMP MASTER 530 CALL timestop(handle_inner) 531!$OMP END MASTER 532!$OMP BARRIER 533 534! new load balancing test 535 IF (.FALSE.) THEN 536 CALL hfx_recursive_load_balance(n_processes, my_process_id, nblocks, & 537 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, & 538 particle_set, & 539 coeffs_set, coeffs_kind, & 540 is_assoc_atomic_block_global, do_periodic, & 541 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, & 542 cell, x_data, para_env, pmax_block, & 543 do_p_screening, map_atom_to_kind_atom, eval_type, & 544 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list) 545 END IF 546 547!$OMP BARRIER 548!$OMP MASTER 549 CALL timeset(routineN//"_bin", handle_inner) 550!$OMP END MASTER 551!$OMP BARRIER 552 553 ALLOCATE (binned_dist(nbins)) 554 binned_dist(:)%istart = -1_int_8 555 binned_dist(:)%number_of_atom_quartets = 0_int_8 556 binned_dist(:)%cost = 0_int_8 557 binned_dist(:)%time_first_scf = 0.0_dp 558 binned_dist(:)%time_other_scf = 0.0_dp 559 binned_dist(:)%time_forces = 0.0_dp 560 561 current_cost = 0 562 mepos = 1 563 distribution_counter_start = 1 564 distribution_counter_end = 0 565 ibin = 1 566 567 global_quartet_counter = 0 568 local_quartet_counter = 0 569 last_bin_needs_to_be_filled = .FALSE. 570 DO atom_block = my_process_id, INT(nblocks, KIND=int_8)**4 - 1, n_processes 571 latom_block = INT(MODULO(atom_block, INT(nblocks, KIND=int_8))) + 1 572 tmp_block = atom_block/nblocks 573 katom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1 574 IF (latom_block < katom_block) CYCLE 575 tmp_block = tmp_block/nblocks 576 jatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1 577 tmp_block = tmp_block/nblocks 578 iatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1 579 IF (jatom_block < iatom_block) CYCLE 580 581 distribution_counter_end = distribution_counter_end + 1 582 global_quartet_counter = global_quartet_counter + 1 583 last_bin_needs_to_be_filled = .TRUE. 584 585 IF (binned_dist(ibin)%istart == -1_int_8) binned_dist(ibin)%istart = atom_block 586 587 iatom_start = x_data%blocks(iatom_block)%istart 588 iatom_end = x_data%blocks(iatom_block)%iend 589 jatom_start = x_data%blocks(jatom_block)%istart 590 jatom_end = x_data%blocks(jatom_block)%iend 591 katom_start = x_data%blocks(katom_block)%istart 592 katom_end = x_data%blocks(katom_block)%iend 593 latom_start = x_data%blocks(latom_block)%istart 594 latom_end = x_data%blocks(latom_block)%iend 595 596 SELECT CASE (eval_type) 597 CASE (hfx_do_eval_energy) 598 pmax_blocks = MAX(pmax_block(katom_block, iatom_block), & 599 pmax_block(latom_block, jatom_block), & 600 pmax_block(latom_block, iatom_block), & 601 pmax_block(katom_block, jatom_block)) 602 CASE (hfx_do_eval_forces) 603 pmax_blocks = MAX(pmax_block(katom_block, iatom_block) + & 604 pmax_block(latom_block, jatom_block), & 605 pmax_block(latom_block, iatom_block) + & 606 pmax_block(katom_block, jatom_block)) 607 END SELECT 608 609 IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE 610 611 current_cost = current_cost & 612 + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, & 613 iatom_start, iatom_end, jatom_start, jatom_end, & 614 katom_start, katom_end, latom_start, latom_end, & 615 particle_set, & 616 coeffs_set, coeffs_kind, & 617 is_assoc_atomic_block_global, do_periodic, & 618 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, & 619 cell, & 620 do_p_screening, map_atom_to_kind_atom, eval_type, & 621 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list) 622 623 IF (current_cost >= cost_per_bin) THEN 624 IF (ibin == nbins) THEN 625 binned_dist(ibin)%number_of_atom_quartets = binned_dist(ibin)%number_of_atom_quartets + & 626 distribution_counter_end - distribution_counter_start + 1 627 ELSE 628 binned_dist(ibin)%number_of_atom_quartets = distribution_counter_end - distribution_counter_start + 1 629 END IF 630 binned_dist(ibin)%cost = binned_dist(ibin)%cost + current_cost 631 ibin = MIN(ibin + 1, nbins) 632 distribution_counter_start = distribution_counter_end + 1 633 current_cost = 0 634 last_bin_needs_to_be_filled = .FALSE. 635 END IF 636 ENDDO 637 638!$OMP BARRIER 639!$OMP MASTER 640 CALL timestop(handle_inner) 641 CALL timeset(routineN//"_dist", handle_inner) 642!$OMP END MASTER 643!$OMP BARRIER 644 !! Fill the last bin if necessary 645 IF (last_bin_needs_to_be_filled) THEN 646 binned_dist(ibin)%cost = binned_dist(ibin)%cost + current_cost 647 IF (ibin == nbins) THEN 648 binned_dist(ibin)%number_of_atom_quartets = binned_dist(ibin)%number_of_atom_quartets + & 649 distribution_counter_end - distribution_counter_start + 1 650 ELSE 651 binned_dist(ibin)%number_of_atom_quartets = distribution_counter_end - distribution_counter_start + 1 652 END IF 653 END IF 654 655 !! Sanity-Check 656 DO ibin = 1, nbins 657 local_quartet_counter = local_quartet_counter + binned_dist(ibin)%number_of_atom_quartets 658 END DO 659!$OMP BARRIER 660!$OMP MASTER 661 shm_local_quartet_counter = 0 662 shm_global_quartet_counter = 0 663!$OMP END MASTER 664!$OMP BARRIER 665!$OMP ATOMIC 666 shm_local_quartet_counter = shm_local_quartet_counter + local_quartet_counter 667!$OMP ATOMIC 668 shm_global_quartet_counter = shm_global_quartet_counter + global_quartet_counter 669 670!$OMP BARRIER 671!$OMP MASTER 672 CALL mp_sum(shm_local_quartet_counter, para_env%group) 673 CALL mp_sum(shm_global_quartet_counter, para_env%group) 674 IF (para_env%ionode) THEN 675 IF (shm_local_quartet_counter /= shm_global_quartet_counter) THEN 676 WRITE (error_msg, '(A,I0,A,I0,A)') "HFX Sanity check for parallel distribution failed. "// & 677 "Number of local quartets (", shm_local_quartet_counter, & 678 ") and number of global quartets (", shm_global_quartet_counter, & 679 ") are different. Please send in a bug report." 680 CPABORT(error_msg) 681 END IF 682 END IF 683!$OMP END MASTER 684 685!$OMP BARRIER 686!$OMP MASTER 687 ALLOCATE (cost_matrix(ncpu*nbins*n_threads)) 688 cost_matrix = 0 689!$OMP END MASTER 690!$OMP BARRIER 691 icpu = para_env%mepos + 1 692 DO i = 1, nbins 693 cost_matrix((icpu - 1)*nbins*n_threads + i_thread*nbins + i) = binned_dist(i)%cost 694 END DO 695 mepos = para_env%mepos 696!$OMP BARRIER 697 698!$OMP MASTER 699 ! sync before/after ring of isendrecv 700 CALL mp_sync(para_env%group) 701 702 ALLOCATE (sendbuffer(nbins*n_threads)) 703 ALLOCATE (recbuffer(nbins*n_threads)) 704 705 sendbuffer = cost_matrix(mepos*nbins*n_threads + 1:mepos*nbins*n_threads + nbins*n_threads) 706 707 dest = MODULO(mepos + 1, ncpu) 708 source = MODULO(mepos - 1, ncpu) 709 DO icpu = 0, ncpu - 1 710 IF (icpu .NE. ncpu - 1) THEN 711 CALL mp_isendrecv(sendbuffer, dest, recbuffer, source, & 712 para_env%group, req(1), req(2), 13) 713 ENDIF 714 data_from = MODULO(mepos - icpu, ncpu) 715 cost_matrix(data_from*nbins*n_threads + 1:data_from*nbins*n_threads + nbins*n_threads) = sendbuffer 716 IF (icpu .NE. ncpu - 1) THEN 717 CALL mp_waitall(req) 718 ENDIF 719 swapbuffer => sendbuffer 720 sendbuffer => recbuffer 721 recbuffer => swapbuffer 722 ENDDO 723 DEALLOCATE (recbuffer, sendbuffer) 724!$OMP END MASTER 725!$OMP BARRIER 726 727!$OMP BARRIER 728!$OMP MASTER 729 CALL timestop(handle_inner) 730 CALL timeset(routineN//"_opt", handle_inner) 731!$OMP END MASTER 732!$OMP BARRIER 733 734 !! Find an optimal distribution i.e. assign each element of the cost matrix to a certain process 735!$OMP BARRIER 736 ALLOCATE (local_cost_matrix(SIZE(cost_matrix, 1))) 737 local_cost_matrix = cost_matrix 738!$OMP MASTER 739 ALLOCATE (shm_distribution_vector(ncpu*nbins*n_threads)) 740 741 CALL optimize_distribution(ncpu*nbins*n_threads, ncpu*n_threads, local_cost_matrix, & 742 shm_distribution_vector, x_data%load_balance_parameter%do_randomize) 743 744 CALL timestop(handle_inner) 745 CALL timeset(routineN//"_redist", handle_inner) 746 !! Collect local data to global array 747 ALLOCATE (full_dist(ncpu*n_threads, nbins)) 748 749 full_dist(:, :)%istart = 0_int_8 750 full_dist(:, :)%number_of_atom_quartets = 0_int_8 751 full_dist(:, :)%cost = 0_int_8 752 full_dist(:, :)%time_first_scf = 0.0_dp 753 full_dist(:, :)%time_other_scf = 0.0_dp 754 full_dist(:, :)%time_forces = 0.0_dp 755!$OMP END MASTER 756!$OMP BARRIER 757 mepos = para_env%mepos + 1 758 full_dist((mepos - 1)*n_threads + i_thread + 1, :) = binned_dist(:) 759 760!$OMP BARRIER 761!$OMP MASTER 762 ALLOCATE (sendbuffer(3*nbins*n_threads)) 763 ALLOCATE (recbuffer(3*nbins*n_threads)) 764 mepos = para_env%mepos 765 DO j = 1, n_threads 766 DO i = 1, nbins 767 sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 1) = full_dist(mepos*n_threads + j, i)%istart 768 sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 2) = full_dist(mepos*n_threads + j, i)%number_of_atom_quartets 769 sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 3) = full_dist(mepos*n_threads + j, i)%cost 770 END DO 771 END DO 772 773 ! sync before/after ring of isendrecv 774 CALL mp_sync(para_env%group) 775 dest = MODULO(mepos + 1, ncpu) 776 source = MODULO(mepos - 1, ncpu) 777 DO icpu = 0, ncpu - 1 778 IF (icpu .NE. ncpu - 1) THEN 779 CALL mp_isendrecv(sendbuffer, dest, recbuffer, source, & 780 para_env%group, req(1), req(2), 13) 781 ENDIF 782 data_from = MODULO(mepos - icpu, ncpu) 783 DO j = 1, n_threads 784 DO i = 1, nbins 785 full_dist(data_from*n_threads + j, i)%istart = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 1) 786 full_dist(data_from*n_threads + j, i)%number_of_atom_quartets = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 2) 787 full_dist(data_from*n_threads + j, i)%cost = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 3) 788 END DO 789 END DO 790 791 IF (icpu .NE. ncpu - 1) THEN 792 CALL mp_waitall(req) 793 ENDIF 794 swapbuffer => sendbuffer 795 sendbuffer => recbuffer 796 recbuffer => swapbuffer 797 ENDDO 798 DEALLOCATE (recbuffer, sendbuffer) 799 800 ! sync before/after ring of isendrecv 801 CALL mp_sync(para_env%group) 802!$OMP END MASTER 803!$OMP BARRIER 804 !! reorder the distribution according to the distribution vector 805 ALLOCATE (tmp_pos(ncpu*n_threads)) 806 tmp_pos = 1 807 ALLOCATE (tmp_dist(nbins*ncpu*n_threads)) 808 809 tmp_dist(:)%istart = 0_int_8 810 tmp_dist(:)%number_of_atom_quartets = 0_int_8 811 tmp_dist(:)%cost = 0_int_8 812 tmp_dist(:)%time_first_scf = 0.0_dp 813 tmp_dist(:)%time_other_scf = 0.0_dp 814 tmp_dist(:)%time_forces = 0.0_dp 815 816 DO icpu = 1, n_processes 817 DO i = 1, nbins 818 mepos = my_process_id + 1 819 IF (shm_distribution_vector((icpu - 1)*nbins + i) == mepos) THEN 820 tmp_dist(tmp_pos(mepos)) = full_dist(icpu, i) 821 tmp_pos(mepos) = tmp_pos(mepos) + 1 822 END IF 823 END DO 824 END DO 825 826 !! Assign the load to each process 827 NULLIFY (ptr_to_tmp_dist) 828 mepos = my_process_id + 1 829 ptr_to_tmp_dist => tmp_dist(1:tmp_pos(mepos) - 1) 830 SELECT CASE (eval_type) 831 CASE (hfx_do_eval_energy) 832 CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data) 833 CASE (hfx_do_eval_forces) 834 CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data) 835 END SELECT 836 837!$OMP BARRIER 838!$OMP MASTER 839 DEALLOCATE (full_dist, cost_matrix, shm_distribution_vector) 840!$OMP END MASTER 841!$OMP BARRIER 842 DEALLOCATE (tmp_dist, tmp_pos) 843 DEALLOCATE (binned_dist, local_cost_matrix) 844 DEALLOCATE (set_list_ij, set_list_kl) 845 846!$OMP BARRIER 847!$OMP MASTER 848 CALL timestop(handle_inner) 849!$OMP END MASTER 850!$OMP BARRIER 851 END IF 852!$OMP BARRIER 853!$OMP MASTER 854 CALL timestop(handle) 855!$OMP END MASTER 856!$OMP BARRIER 857 END SUBROUTINE hfx_load_balance 858 859! ************************************************************************************************** 860!> \brief Reference implementation of new recursive load balancing routine 861!> Computes a local list of atom_blocks (p_atom_blocks,q_atom_blocks) for 862!> each process in a P-Q grid such that every process has more or less the 863!> same amount of work. Has no output at the moment (not used) but writes 864!> its computed load balance values into a file. Possible output is ready 865!> to use in the two arrays p_atom_blocks & q_atom_blocks 866!> \param n_processes ... 867!> \param my_process_id ... 868!> \param nblocks ... 869!> \param natom ... 870!> \param nkind ... 871!> \param list_ij ... 872!> \param list_kl ... 873!> \param set_list_ij ... 874!> \param set_list_kl ... 875!> \param particle_set ... 876!> \param coeffs_set ... 877!> \param coeffs_kind ... 878!> \param is_assoc_atomic_block_global ... 879!> \param do_periodic ... 880!> \param kind_of ... 881!> \param basis_parameter ... 882!> \param pmax_set ... 883!> \param pmax_atom ... 884!> \param pmax_blocks ... 885!> \param cell ... 886!> \param x_data ... 887!> \param para_env ... 888!> \param pmax_block ... 889!> \param do_p_screening ... 890!> \param map_atom_to_kind_atom ... 891!> \param eval_type ... 892!> \param log10_eps_schwarz ... 893!> \param log_2 ... 894!> \param coeffs_kind_max0 ... 895!> \param use_virial ... 896!> \param atomic_pair_list ... 897!> \par History 898!> 03.2011 created [Michael Steinlechner] 899!> \author Michael Steinlechner 900! ************************************************************************************************** 901 902 SUBROUTINE hfx_recursive_load_balance(n_processes, my_process_id, nblocks, & 903 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, & 904 particle_set, & 905 coeffs_set, coeffs_kind, & 906 is_assoc_atomic_block_global, do_periodic, & 907 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, & 908 cell, x_data, para_env, pmax_block, & 909 do_p_screening, map_atom_to_kind_atom, eval_type, & 910 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list) 911 912! input variables: 913 INTEGER, INTENT(IN) :: n_processes, my_process_id, nblocks, & 914 natom, nkind 915 TYPE(pair_list_type), INTENT(IN) :: list_ij, list_kl 916 TYPE(pair_set_list_type), ALLOCATABLE, & 917 DIMENSION(:), INTENT(IN) :: set_list_ij, set_list_kl 918 TYPE(particle_type), DIMENSION(:), INTENT(IN), & 919 POINTER :: particle_set 920 TYPE(hfx_screen_coeff_type), & 921 DIMENSION(:, :, :, :), INTENT(IN), POINTER :: coeffs_set 922 TYPE(hfx_screen_coeff_type), DIMENSION(:, :), & 923 INTENT(IN), POINTER :: coeffs_kind 924 INTEGER, DIMENSION(:, :), INTENT(IN) :: is_assoc_atomic_block_global 925 LOGICAL, INTENT(IN) :: do_periodic 926 INTEGER, INTENT(IN) :: kind_of(*) 927 TYPE(hfx_basis_type), DIMENSION(:), INTENT(IN), & 928 POINTER :: basis_parameter 929 TYPE(hfx_p_kind), DIMENSION(:), INTENT(IN), & 930 POINTER :: pmax_set 931 REAL(dp), DIMENSION(:, :), INTENT(IN), POINTER :: pmax_atom 932 REAL(dp) :: pmax_blocks 933 TYPE(cell_type), INTENT(IN), POINTER :: cell 934 TYPE(hfx_type), INTENT(IN), POINTER :: x_data 935 TYPE(cp_para_env_type), INTENT(IN), POINTER :: para_env 936 REAL(dp), DIMENSION(:, :), INTENT(IN), POINTER :: pmax_block 937 LOGICAL, INTENT(IN) :: do_p_screening 938 INTEGER, DIMENSION(:), INTENT(IN), POINTER :: map_atom_to_kind_atom 939 INTEGER, INTENT(IN) :: eval_type 940 REAL(dp), INTENT(IN) :: log10_eps_schwarz, log_2, & 941 coeffs_kind_max0 942 LOGICAL, INTENT(IN) :: use_virial 943 LOGICAL, DIMENSION(:, :), INTENT(IN), POINTER :: atomic_pair_list 944 945 CHARACTER(LEN=*), PARAMETER :: routineN = 'hfx_recursive_load_balance', & 946 routineP = moduleN//':'//routineN 947 948 INTEGER :: handle, i, iatom_block, iatom_end, iatom_start, j, jatom_block, jatom_end, & 949 jatom_start, katom_block, katom_end, katom_start, latom_block, latom_end, latom_start, & 950 nP, nQ, numBins, p, q, sizeP, sizeQ, unit_nr 951 INTEGER(int_8) :: local_cost, pidx, qidx, sumP, sumQ 952 INTEGER(int_8), ALLOCATABLE, DIMENSION(:) :: local_cost_vector 953 INTEGER, ALLOCATABLE, DIMENSION(:) :: blocksize, p_atom_blocks, permute, & 954 q_atom_blocks 955 REAL(dp) :: maximum, mean 956 957! internal variables: 958 959!$OMP BARRIER 960!$OMP MASTER 961 CALL timeset(routineN, handle) 962!$OMP END MASTER 963!$OMP BARRIER 964 965 ! calculate best p/q distribution grid for the n_processes 966 CALL hfx_calculate_PQ(p, q, numBins, n_processes) 967 968 ALLOCATE (blocksize(numBins)) 969 ALLOCATE (permute(nblocks**2)) 970 DO i = 1, nblocks**2 971 permute(i) = i 972 END DO 973 974 ! call the main recursive permutation routine. 975 ! Output: 976 ! blocksize :: vector (size numBins) with the sizes for each column/row block 977 ! permute :: permutation vector 978 CALL hfx_recursive_permute(blocksize, 1, nblocks**2, numBins, & 979 permute, 1, & 980 my_process_id, n_processes, nblocks, & 981 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, & 982 particle_set, & 983 coeffs_set, coeffs_kind, & 984 is_assoc_atomic_block_global, do_periodic, & 985 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, & 986 cell, x_data, para_env, pmax_block, & 987 do_p_screening, map_atom_to_kind_atom, eval_type, & 988 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list) 989 990 ! number of blocks per processor in p-direction (vertical) 991 nP = numBins/p 992 ! number of blocks per processor in q-direction (horizontal) 993 nQ = numBins/q 994 995 ! calc own position in P-Q-processor grid (PQ-grid is column-major) 996 pidx = MODULO(INT(my_process_id), INT(p)) + 1 997 qidx = my_process_id/p + 1 998 999 sizeP = SUM(blocksize((nP*(pidx - 1) + 1):(nP*pidx))) 1000 sizeQ = SUM(blocksize((nQ*(qidx - 1) + 1):(nQ*qidx))) 1001 1002 sumP = SUM(blocksize(1:(nP*(pidx - 1)))) 1003 sumQ = SUM(blocksize(1:(nQ*(qidx - 1)))) 1004 1005 ALLOCATE (p_atom_blocks(sizeP)) 1006 ALLOCATE (q_atom_blocks(sizeQ)) 1007 1008 p_atom_blocks(:) = permute((sumP + 1):(sumP + sizeP)) 1009 q_atom_blocks(:) = permute((sumQ + 1):(sumQ + sizeQ)) 1010 1011 ! from here on, we are actually finished, each process has been 1012 ! assigned a (p_atom_blocks,q_atom_blocks) pair list. 1013 ! what follows is just a small routine to calculate the local cost 1014 ! for each processor which is then written to a file. 1015 1016 ! calculate local cost for each processor! 1017 ! **************************************** 1018 local_cost = 0 1019 DO i = 1, sizeP 1020 DO j = 1, sizeQ 1021 1022 ! get corresponding 4D block indices out of our own P-Q-block 1023 latom_block = MODULO(q_atom_blocks(j), nblocks) 1024 iatom_block = q_atom_blocks(j)/nblocks + 1 1025 jatom_block = MODULO(p_atom_blocks(i), nblocks) 1026 katom_block = p_atom_blocks(i)/nblocks + 1 1027 1028 ! symmetry checks. 1029 IF (latom_block < katom_block) CYCLE 1030 IF (jatom_block < iatom_block) CYCLE 1031 1032 iatom_start = x_data%blocks(iatom_block)%istart 1033 iatom_end = x_data%blocks(iatom_block)%iend 1034 jatom_start = x_data%blocks(jatom_block)%istart 1035 jatom_end = x_data%blocks(jatom_block)%iend 1036 katom_start = x_data%blocks(katom_block)%istart 1037 katom_end = x_data%blocks(katom_block)%iend 1038 latom_start = x_data%blocks(latom_block)%istart 1039 latom_end = x_data%blocks(latom_block)%iend 1040 1041 ! whatever. 1042 SELECT CASE (eval_type) 1043 CASE (hfx_do_eval_energy) 1044 pmax_blocks = MAX(pmax_block(katom_block, iatom_block), & 1045 pmax_block(latom_block, jatom_block), & 1046 pmax_block(latom_block, iatom_block), & 1047 pmax_block(katom_block, jatom_block)) 1048 CASE (hfx_do_eval_forces) 1049 pmax_blocks = MAX(pmax_block(katom_block, iatom_block) + & 1050 pmax_block(latom_block, jatom_block), & 1051 pmax_block(latom_block, iatom_block) + & 1052 pmax_block(katom_block, jatom_block)) 1053 END SELECT 1054 1055 ! screening. 1056 IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE 1057 1058 ! estimate the cost of this atom_block. 1059 local_cost = local_cost + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, & 1060 set_list_kl, & 1061 iatom_start, iatom_end, jatom_start, jatom_end, & 1062 katom_start, katom_end, latom_start, latom_end, & 1063 particle_set, & 1064 coeffs_set, coeffs_kind, & 1065 is_assoc_atomic_block_global, do_periodic, & 1066 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, & 1067 cell, & 1068 do_p_screening, map_atom_to_kind_atom, eval_type, & 1069 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list) 1070 END DO 1071 END DO 1072 1073 ALLOCATE (local_cost_vector(n_processes)) 1074 local_cost_vector = 0 1075 local_cost_vector(my_process_id + 1) = local_cost 1076 CALL mp_sum(local_cost_vector, para_env%group) 1077 1078 mean = SUM(local_cost_vector)/n_processes 1079 maximum = MAXVAL(local_cost_vector) 1080 1081!$OMP BARRIER 1082!$OMP MASTER 1083 ! only output once 1084 IF (my_process_id == 0) THEN 1085 CALL open_file(unit_number=unit_nr, file_name="loads.dat") 1086 WRITE (unit_nr, *) 'maximum cost:', maximum 1087 WRITE (unit_nr, *) 'mean cost:', mean 1088 WRITE (unit_nr, *) 'load balance ratio max/mean: ', maximum/mean 1089 WRITE (unit_nr, *) '-------- detailed per-process costs ---------' 1090 DO i = 1, n_processes 1091 WRITE (unit_nr, *) local_cost_vector(i) 1092 END DO 1093 CALL close_file(unit_nr) 1094 END IF 1095!$OMP END MASTER 1096!$OMP BARRIER 1097 1098 DEALLOCATE (local_cost_vector) 1099 DEALLOCATE (p_atom_blocks, q_atom_blocks) 1100 DEALLOCATE (blocksize, permute) 1101 1102!$OMP BARRIER 1103!$OMP MASTER 1104 CALL timestop(handle) 1105!$OMP END MASTER 1106!$OMP BARRIER 1107 1108 END SUBROUTINE hfx_recursive_load_balance 1109 1110! ************************************************************************************************** 1111!> \brief Small routine to calculate the optimal P-Q-processor grid distribution 1112!> for a given number of processors N 1113!> and the corresponding number of Bins for the load balancing routine 1114!> \param p number of rows on P-Q process grid (output) 1115!> \param q number of columns on P-Q process grid (output) 1116!> \param nBins number of Bins (output) 1117!> \param N number of processes (input) 1118!> \par History 1119!> 03.2011 created [Michael Steinlechner] 1120!> \author Michael Steinlechner 1121! ************************************************************************************************** 1122 SUBROUTINE hfx_calculate_PQ(p, q, nBins, N) 1123 1124 INTEGER, INTENT(OUT) :: p, q, nBins 1125 INTEGER, INTENT(IN) :: N 1126 1127 INTEGER :: a, b, k 1128 REAL(dp) :: sqN 1129 1130 k = 2 1131 sqN = SQRT(REAL(N, KIND=dp)) 1132 p = 1 1133 1134 DO WHILE (REAL(k, KIND=dp) <= sqN) 1135 IF (MODULO(N, k) == 0) THEN 1136 p = k 1137 END IF 1138 k = k + 1 1139 END DO 1140 q = N/p 1141 1142 ! now compute the least common multiple of p & q to get the number of necessary bins 1143 ! compute using the relation LCM(p,q) = abs(p*q) / GCD(p,q) 1144 ! and use euclid's algorithm for GCD computation. 1145 a = p 1146 b = q 1147 1148 DO WHILE (b .NE. 0) 1149 IF (a > b) THEN 1150 a = a - b 1151 ELSE 1152 b = b - a 1153 END IF 1154 END DO 1155 ! gcd(p,q) is now saved in a 1156 1157 nBins = p*q/a 1158 1159 END SUBROUTINE 1160 1161! ************************************************************************************************** 1162!> \brief Recursive permutation routine for the load balancing of the integral 1163!> computation 1164!> \param blocksize vector of blocksizes, size(nProc), which contains for 1165!> each process the local blocksize (OUTPUT) 1166!> \param blockstart starting row/column idx of the block which is to be examined 1167!> at this point (INPUT) 1168!> \param blockend ending row/column idx of the block which is to be examined 1169!> (INPUT) 1170!> \param nProc_in number of bins into which the current block has to be divided 1171!> (INPUT) 1172!> \param permute permutation vector which balances column/row cost 1173!> size(nblocks^2). (OUTPUT) 1174!> \param step ... 1175!> \param my_process_id ... 1176!> \param n_processes ... 1177!> \param nblocks ... 1178!> \param natom ... 1179!> \param nkind ... 1180!> \param list_ij ... 1181!> \param list_kl ... 1182!> \param set_list_ij ... 1183!> \param set_list_kl ... 1184!> \param particle_set ... 1185!> \param coeffs_set ... 1186!> \param coeffs_kind ... 1187!> \param is_assoc_atomic_block_global ... 1188!> \param do_periodic ... 1189!> \param kind_of ... 1190!> \param basis_parameter ... 1191!> \param pmax_set ... 1192!> \param pmax_atom ... 1193!> \param pmax_blocks ... 1194!> \param cell ... 1195!> \param x_data ... 1196!> \param para_env ... 1197!> \param pmax_block ... 1198!> \param do_p_screening ... 1199!> \param map_atom_to_kind_atom ... 1200!> \param eval_type ... 1201!> \param log10_eps_schwarz ... 1202!> \param log_2 ... 1203!> \param coeffs_kind_max0 ... 1204!> \param use_virial ... 1205!> \param atomic_pair_list ... 1206!> \par History 1207!> 03.2011 created [Michael Steinlechner] 1208!> \author Michael Steinlechner 1209! ************************************************************************************************** 1210 RECURSIVE SUBROUTINE hfx_recursive_permute(blocksize, blockstart, blockend, nProc_in, & 1211 permute, step, & 1212 my_process_id, n_processes, nblocks, & 1213 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, & 1214 particle_set, & 1215 coeffs_set, coeffs_kind, & 1216 is_assoc_atomic_block_global, do_periodic, & 1217 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, & 1218 cell, x_data, para_env, pmax_block, & 1219 do_p_screening, map_atom_to_kind_atom, eval_type, & 1220 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list) 1221 1222 INTEGER :: nProc_in, blockend, blockstart 1223 INTEGER, DIMENSION(nProc_in) :: blocksize 1224 INTEGER :: nblocks, n_processes, my_process_id 1225 INTEGER, INTENT(IN) :: step 1226 INTEGER, DIMENSION(nblocks*nblocks) :: permute 1227 INTEGER :: natom 1228 INTEGER, INTENT(IN) :: nkind 1229 TYPE(pair_list_type) :: list_ij, list_kl 1230 TYPE(pair_set_list_type), ALLOCATABLE, & 1231 DIMENSION(:) :: set_list_ij, set_list_kl 1232 TYPE(particle_type), DIMENSION(:), POINTER :: particle_set 1233 TYPE(hfx_screen_coeff_type), & 1234 DIMENSION(:, :, :, :), POINTER :: coeffs_set 1235 TYPE(hfx_screen_coeff_type), DIMENSION(:, :), & 1236 POINTER :: coeffs_kind 1237 INTEGER, DIMENSION(:, :) :: is_assoc_atomic_block_global 1238 LOGICAL :: do_periodic 1239 INTEGER :: kind_of(*) 1240 TYPE(hfx_basis_type), DIMENSION(:), POINTER :: basis_parameter 1241 TYPE(hfx_p_kind), DIMENSION(:), POINTER :: pmax_set 1242 REAL(dp), DIMENSION(:, :), POINTER :: pmax_atom 1243 REAL(dp) :: pmax_blocks 1244 TYPE(cell_type), POINTER :: cell 1245 TYPE(hfx_type), POINTER :: x_data 1246 TYPE(cp_para_env_type), POINTER :: para_env 1247 REAL(dp), DIMENSION(:, :), POINTER :: pmax_block 1248 LOGICAL, INTENT(IN) :: do_p_screening 1249 INTEGER, DIMENSION(:), POINTER :: map_atom_to_kind_atom 1250 INTEGER, INTENT(IN) :: eval_type 1251 REAL(dp) :: log10_eps_schwarz, log_2, & 1252 coeffs_kind_max0 1253 LOGICAL, INTENT(IN) :: use_virial 1254 LOGICAL, DIMENSION(:, :), POINTER :: atomic_pair_list 1255 1256 INTEGER :: col, endoffset, i, iatom_block, iatom_end, iatom_start, idx, inv_perm, & 1257 jatom_block, jatom_end, jatom_start, katom_block, katom_end, katom_start, latom_block, & 1258 latom_end, latom_start, nbins, nProc, row, startoffset 1259 INTEGER(int_8) :: atom_block, tmp_block 1260 INTEGER, ALLOCATABLE, DIMENSION(:) :: ithblocksize, localblocksize 1261 INTEGER, DIMENSION(blockend-blockstart+1) :: bin_perm, tmp_perm 1262 REAL(dp) :: partialcost 1263 REAL(dp), DIMENSION(nblocks*nblocks) :: cost_vector 1264 1265 nProc = nProc_in 1266 cost_vector = 0.0_dp 1267 1268! loop over local atom_blocks. 1269 DO atom_block = my_process_id, INT(nblocks, KIND=int_8)**4 - 1, n_processes 1270 1271! get corresponding 4D block indices 1272 latom_block = INT(MODULO(atom_block, INT(nblocks, KIND=int_8))) + 1 1273 tmp_block = atom_block/nblocks 1274 katom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1 1275 IF (latom_block < katom_block) CYCLE 1276 tmp_block = tmp_block/nblocks 1277 jatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1 1278 tmp_block = tmp_block/nblocks 1279 iatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1 1280 IF (jatom_block < iatom_block) CYCLE 1281 1282! get 2D indices of this atom_block (with permutation applied) 1283! for this, we need to invert the permutation, this means 1284! find position in permutation vector where value==idx 1285 1286 row = (katom_block - 1)*nblocks + jatom_block 1287 inv_perm = 1 1288 DO WHILE (permute(inv_perm) .NE. row) 1289 inv_perm = inv_perm + 1 1290 END DO 1291 row = inv_perm 1292 1293 col = (iatom_block - 1)*nblocks + latom_block 1294 inv_perm = 1 1295 DO WHILE (permute(inv_perm) .NE. col) 1296 inv_perm = inv_perm + 1 1297 END DO 1298 col = inv_perm 1299 1300! if row/col outside our current diagonal block, skip calculation. 1301 IF (col < blockstart .OR. col > blockend) CYCLE 1302 IF (row < blockstart .OR. row > blockend) CYCLE 1303 1304 iatom_start = x_data%blocks(iatom_block)%istart 1305 iatom_end = x_data%blocks(iatom_block)%iend 1306 jatom_start = x_data%blocks(jatom_block)%istart 1307 jatom_end = x_data%blocks(jatom_block)%iend 1308 katom_start = x_data%blocks(katom_block)%istart 1309 katom_end = x_data%blocks(katom_block)%iend 1310 latom_start = x_data%blocks(latom_block)%istart 1311 latom_end = x_data%blocks(latom_block)%iend 1312 1313! whatever. 1314 SELECT CASE (eval_type) 1315 CASE (hfx_do_eval_energy) 1316 pmax_blocks = MAX(pmax_block(katom_block, iatom_block), & 1317 pmax_block(latom_block, jatom_block), & 1318 pmax_block(latom_block, iatom_block), & 1319 pmax_block(katom_block, jatom_block)) 1320 CASE (hfx_do_eval_forces) 1321 pmax_blocks = MAX(pmax_block(katom_block, iatom_block) + & 1322 pmax_block(latom_block, jatom_block), & 1323 pmax_block(latom_block, iatom_block) + & 1324 pmax_block(katom_block, jatom_block)) 1325 END SELECT 1326 1327! screening. 1328 IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE 1329 1330! every second recursion step, compute row sum instead of column sum 1331 1332 IF (MODULO(step, 2) .EQ. 0) THEN 1333 idx = row 1334 ELSE 1335 idx = col 1336 END IF 1337 1338! estimate the cost of this atom_block. 1339 partialcost = estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, & 1340 set_list_kl, & 1341 iatom_start, iatom_end, jatom_start, jatom_end, & 1342 katom_start, katom_end, latom_start, latom_end, & 1343 particle_set, & 1344 coeffs_set, coeffs_kind, & 1345 is_assoc_atomic_block_global, do_periodic, & 1346 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, & 1347 cell, & 1348 do_p_screening, map_atom_to_kind_atom, eval_type, & 1349 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list) 1350 1351 cost_vector(idx) = cost_vector(idx) + partialcost 1352 END DO ! atom_block 1353 1354! sum costvector over all processes 1355 CALL mp_sum(cost_vector, para_env%group) 1356 1357! calculate next prime factor of nProc 1358 nBins = 2 1359 DO WHILE (MODULO(INT(nProc), INT(nBins)) .NE. 0) 1360 nBins = nBins + 1 1361 END DO 1362 1363 nProc = nProc/nBins 1364 1365! ... do the binning... 1366 1367 ALLOCATE (localblocksize(nBins)) 1368 CALL hfx_permute_binning(nBins, cost_vector(blockstart:blockend), blockend - blockstart + 1, bin_perm, localblocksize) 1369 1370!... and update the permutation vector 1371 1372 tmp_perm = permute(blockstart:blockend) 1373 permute(blockstart:blockend) = tmp_perm(bin_perm) 1374 1375! split recursion into the nBins Bins 1376 IF (nProc > 1) THEN 1377 ALLOCATE (ithblocksize(nProc)) 1378 DO i = 1, nBins 1379 startoffset = SUM(localblocksize(1:(i - 1))) 1380 endoffset = SUM(localblocksize(1:i)) - 1 1381 1382 CALL hfx_recursive_permute(ithblocksize, blockstart + startoffset, blockstart + endoffset, nProc, & 1383 permute, step + 1, & 1384 my_process_id, n_processes, nblocks, & 1385 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, & 1386 particle_set, & 1387 coeffs_set, coeffs_kind, & 1388 is_assoc_atomic_block_global, do_periodic, & 1389 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, & 1390 cell, x_data, para_env, pmax_block, & 1391 do_p_screening, map_atom_to_kind_atom, eval_type, & 1392 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list) 1393 blocksize(((i - 1)*nProc + 1):(i*nProc)) = ithblocksize 1394 END DO 1395 DEALLOCATE (ithblocksize) 1396 ELSE 1397 DO i = 1, nBins 1398 blocksize(i) = localblocksize(i) 1399 END DO 1400 END IF 1401 1402 DEALLOCATE (localblocksize) 1403 1404 END SUBROUTINE hfx_recursive_permute 1405 1406! ************************************************************************************************** 1407!> \brief small binning routine for the recursive load balancing 1408!> 1409!> \param nBins number of Bins (INPUT) 1410!> \param costvector vector of current row/column costs which have to be binned (INPUT) 1411!> \param maxbinsize upper bound for bin size (INPUT) 1412!> \param perm resulting permutation due to be binning routine (OUTPUT) 1413!> \param block_count vector of size(nbins) which contains the size of each bin (OUTPUT) 1414!> \par History 1415!> 03.2011 created [Michael Steinlechner] 1416!> \author Michael Steinlechner 1417! ************************************************************************************************** 1418 SUBROUTINE hfx_permute_binning(nBins, costvector, maxbinsize, perm, block_count) 1419 1420 INTEGER, INTENT(IN) :: nBins, maxbinsize 1421 REAL(dp), DIMENSION(maxbinsize), INTENT(IN) :: costvector 1422 INTEGER, DIMENSION(maxbinsize), INTENT(OUT) :: perm 1423 INTEGER, DIMENSION(nBins), INTENT(OUT) :: block_count 1424 1425 INTEGER :: i, j, mod_idx, offset 1426 INTEGER, DIMENSION(nBins, maxbinsize) :: bin 1427 INTEGER, DIMENSION(nBins) :: bin_idx 1428 INTEGER, DIMENSION(maxbinsize) :: idx 1429 REAL(dp), DIMENSION(maxbinsize) :: vec 1430 REAL(dp), DIMENSION(nBins) :: bincosts 1431 1432! be careful not to change costvector (copy it!) 1433 1434 vec = costvector 1435 block_count = 0 1436 bincosts = 0 1437 1438 !sort the array (ascending) 1439 CALL sort(vec, maxbinsize, idx) 1440 1441 ! count the loop down to distribute the largest cols/rows first 1442 DO i = maxbinsize, 1, -1 1443 IF (vec(i) == 0) THEN 1444 ! spread zero-cost col/rows evenly among procs 1445 mod_idx = MODULO(i, nBins) + 1 !(note the fortran offset by one!) 1446 block_count(mod_idx) = block_count(mod_idx) + 1 1447 bin(mod_idx, block_count(mod_idx)) = idx(i) 1448 ELSE 1449 ! sort the bins so that the one with the lowest cost is at the 1450 ! first place, where we then assign the current col/row 1451 CALL sort(bincosts, nBins, bin_idx) 1452 block_count = block_count(bin_idx) 1453 bin = bin(bin_idx, :) 1454 1455 bincosts(1) = bincosts(1) + vec(i) 1456 block_count(1) = block_count(1) + 1 1457 bin(1, block_count(1)) = idx(i) 1458 END IF 1459 END DO 1460 1461 ! construct permutation vector from the binning 1462 offset = 0 1463 DO i = 1, nBins 1464 DO j = 1, block_count(i) 1465 perm(offset + j) = bin(i, j) 1466 END DO 1467 offset = offset + block_count(i) 1468 END DO 1469 1470 END SUBROUTINE hfx_permute_binning 1471 1472! ************************************************************************************************** 1473!> \brief Cheap way of redistributing the eri's 1474!> \param x_data Object that stores the indices array 1475!> \param para_env para_env 1476!> \param load_balance_parameter contains parmameter for Monte-Carlo routines 1477!> \param i_thread current thread ID 1478!> \param n_threads Total Number of threads 1479!> \param eval_type ... 1480!> \par History 1481!> 12.2007 created [Manuel Guidon] 1482!> 02.2009 optimize Memory Usage [Manuel Guidon] 1483!> \author Manuel Guidon 1484!> \note 1485!> The cost matrix is given by the walltime for each bin that is measured 1486!> during the calculation 1487! ************************************************************************************************** 1488 SUBROUTINE hfx_update_load_balance(x_data, para_env, & 1489 load_balance_parameter, & 1490 i_thread, n_threads, eval_type) 1491 1492 TYPE(hfx_type), POINTER :: x_data 1493 TYPE(cp_para_env_type), POINTER :: para_env 1494 TYPE(hfx_load_balance_type) :: load_balance_parameter 1495 INTEGER, INTENT(IN) :: i_thread, n_threads, eval_type 1496 1497 CHARACTER(LEN=*), PARAMETER :: routineN = 'hfx_update_load_balance', & 1498 routineP = moduleN//':'//routineN 1499 1500 INTEGER :: data_from, dest, end_idx, handle, i, ibin, icpu, iprocess, j, mepos, my_bin_size, & 1501 my_global_start_idx, my_process_id, n_processes, nbins, ncpu, req(2), source, start_idx 1502 INTEGER(int_8), DIMENSION(:), POINTER :: local_cost_matrix, recbuffer, & 1503 sendbuffer, swapbuffer 1504 INTEGER(int_8), DIMENSION(:), POINTER, SAVE :: cost_matrix 1505 INTEGER, ALLOCATABLE, DIMENSION(:) :: tmp_pos 1506 INTEGER, ALLOCATABLE, DIMENSION(:), SAVE :: bins_per_rank 1507 INTEGER, ALLOCATABLE, DIMENSION(:, :), SAVE :: bin_histogram 1508 INTEGER, DIMENSION(:), POINTER, SAVE :: shm_distribution_vector 1509 INTEGER, SAVE :: max_bin_size 1510 TYPE(hfx_distribution), DIMENSION(:), POINTER :: binned_dist, ptr_to_tmp_dist, tmp_dist 1511 TYPE(hfx_distribution), DIMENSION(:, :), POINTER, & 1512 SAVE :: full_dist 1513 1514!$OMP BARRIER 1515!$OMP MASTER 1516 CALL timeset(routineN, handle) 1517!$OMP END MASTER 1518!$OMP BARRIER 1519 1520 ncpu = para_env%num_pe 1521 n_processes = ncpu*n_threads 1522 !! If there is only 1 cpu skip the binning 1523 IF (n_processes == 1) THEN 1524 ALLOCATE (tmp_dist(1)) 1525 tmp_dist(1)%number_of_atom_quartets = HUGE(tmp_dist(1)%number_of_atom_quartets) 1526 tmp_dist(1)%istart = 0_int_8 1527 ptr_to_tmp_dist => tmp_dist(:) 1528 SELECT CASE (eval_type) 1529 CASE (hfx_do_eval_energy) 1530 CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data) 1531 CASE (hfx_do_eval_forces) 1532 CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data) 1533 END SELECT 1534 DEALLOCATE (tmp_dist) 1535 ELSE 1536 mepos = para_env%mepos 1537 my_process_id = para_env%mepos*n_threads + i_thread 1538 nbins = load_balance_parameter%nbins 1539!$OMP MASTER 1540 ALLOCATE (bin_histogram(n_processes, 2)) 1541 bin_histogram = 0 1542!$OMP END MASTER 1543!$OMP BARRIER 1544 SELECT CASE (eval_type) 1545 CASE (hfx_do_eval_energy) 1546 my_bin_size = SIZE(x_data%distribution_energy) 1547 CASE (hfx_do_eval_forces) 1548 my_bin_size = SIZE(x_data%distribution_forces) 1549 END SELECT 1550 bin_histogram(my_process_id + 1, 1) = my_bin_size 1551!$OMP BARRIER 1552!$OMP MASTER 1553 CALL mp_sum(bin_histogram(:, 1), para_env%group) 1554 bin_histogram(1, 2) = bin_histogram(1, 1) 1555 DO iprocess = 2, n_processes 1556 bin_histogram(iprocess, 2) = bin_histogram(iprocess - 1, 2) + bin_histogram(iprocess, 1) 1557 END DO 1558 1559 max_bin_size = MAXVAL(bin_histogram(para_env%mepos*n_threads + 1:para_env%mepos*n_threads + n_threads, 1)) 1560 CALL mp_max(max_bin_size, para_env%group) 1561!$OMP END MASTER 1562!$OMP BARRIER 1563 ALLOCATE (binned_dist(my_bin_size)) 1564 !! Use old binned_dist, but with timings cost 1565 SELECT CASE (eval_type) 1566 CASE (hfx_do_eval_energy) 1567 binned_dist = x_data%distribution_energy 1568 CASE (hfx_do_eval_forces) 1569 binned_dist = x_data%distribution_forces 1570 END SELECT 1571 1572 DO ibin = 1, my_bin_size 1573 IF (binned_dist(ibin)%number_of_atom_quartets == 0) THEN 1574 binned_dist(ibin)%cost = 0 1575 ELSE 1576 SELECT CASE (eval_type) 1577 CASE (hfx_do_eval_energy) 1578 IF (.NOT. load_balance_parameter%rtp_redistribute) THEN 1579 binned_dist(ibin)%cost = INT((binned_dist(ibin)%time_first_scf + & 1580 binned_dist(ibin)%time_other_scf)*10000.0_dp, int_8) 1581 ELSE 1582 binned_dist(ibin)%cost = INT((binned_dist(ibin)%time_other_scf)*10000.0_dp, int_8) 1583 END IF 1584 CASE (hfx_do_eval_forces) 1585 binned_dist(ibin)%cost = INT((binned_dist(ibin)%time_forces)*10000.0_dp, int_8) 1586 END SELECT 1587 END IF 1588 END DO 1589!$OMP BARRIER 1590!$OMP MASTER 1591 !! store all local results in a big cost matrix 1592 ALLOCATE (cost_matrix(ncpu*nbins*n_threads)) 1593 cost_matrix = 0 1594 ALLOCATE (sendbuffer(max_bin_size*n_threads)) 1595 ALLOCATE (recbuffer(max_bin_size*n_threads)) 1596!$OMP END MASTER 1597!$OMP BARRIER 1598 my_global_start_idx = bin_histogram(my_process_id + 1, 2) - my_bin_size 1599 icpu = para_env%mepos + 1 1600 DO i = 1, my_bin_size 1601 cost_matrix(my_global_start_idx + i) = binned_dist(i)%cost 1602 END DO 1603 1604 mepos = para_env%mepos 1605!$OMP BARRIER 1606!$OMP MASTER 1607 ALLOCATE (bins_per_rank(ncpu)) 1608 bins_per_rank = 0 1609 DO icpu = 1, ncpu 1610 bins_per_rank(icpu) = SUM(bin_histogram((icpu - 1)*n_threads + 1:(icpu - 1)*n_threads + n_threads, 1)) 1611 END DO 1612 sendbuffer(1:bins_per_rank(para_env%mepos + 1)) = & 1613 cost_matrix(my_global_start_idx + 1:my_global_start_idx + bins_per_rank(para_env%mepos + 1)) 1614 1615 dest = MODULO(mepos + 1, ncpu) 1616 source = MODULO(mepos - 1, ncpu) 1617 ! sync before/after ring of isendrecv 1618 CALL mp_sync(para_env%group) 1619 DO icpu = 0, ncpu - 1 1620 IF (icpu .NE. ncpu - 1) THEN 1621 CALL mp_isendrecv(sendbuffer, dest, recbuffer, source, & 1622 para_env%group, req(1), req(2), 13) 1623 ENDIF 1624 data_from = MODULO(mepos - icpu, ncpu) 1625 start_idx = SUM(bins_per_rank(1:data_from + 1)) - bins_per_rank(data_from + 1) + 1 1626 end_idx = start_idx + bins_per_rank(data_from + 1) - 1 1627 cost_matrix(start_idx:end_idx) = sendbuffer(1:end_idx - start_idx + 1) 1628 1629 IF (icpu .NE. ncpu - 1) THEN 1630 CALL mp_waitall(req) 1631 ENDIF 1632 swapbuffer => sendbuffer 1633 sendbuffer => recbuffer 1634 recbuffer => swapbuffer 1635 ENDDO 1636 DEALLOCATE (recbuffer, sendbuffer) 1637 ! sync before/after ring of isendrecv 1638 CALL mp_sync(para_env%group) 1639!$OMP END MASTER 1640!$OMP BARRIER 1641 ALLOCATE (local_cost_matrix(SIZE(cost_matrix, 1))) 1642 local_cost_matrix = cost_matrix 1643!$OMP MASTER 1644 ALLOCATE (shm_distribution_vector(ncpu*nbins*n_threads)) 1645 CALL optimize_distribution(ncpu*nbins*n_threads, ncpu*n_threads, local_cost_matrix, & 1646 shm_distribution_vector, x_data%load_balance_parameter%do_randomize) 1647 1648 ALLOCATE (full_dist(ncpu*n_threads, max_bin_size)) 1649 1650 full_dist(:, :)%istart = 0_int_8 1651 full_dist(:, :)%number_of_atom_quartets = 0_int_8 1652 full_dist(:, :)%cost = 0_int_8 1653 full_dist(:, :)%time_first_scf = 0.0_dp 1654 full_dist(:, :)%time_other_scf = 0.0_dp 1655 full_dist(:, :)%time_forces = 0.0_dp 1656!$OMP END MASTER 1657 1658!$OMP BARRIER 1659 mepos = para_env%mepos + 1 1660 full_dist((mepos - 1)*n_threads + i_thread + 1, 1:my_bin_size) = binned_dist(1:my_bin_size) 1661!$OMP BARRIER 1662!$OMP MASTER 1663 ALLOCATE (sendbuffer(3*max_bin_size*n_threads)) 1664 ALLOCATE (recbuffer(3*max_bin_size*n_threads)) 1665 mepos = para_env%mepos 1666 DO j = 1, n_threads 1667 DO i = 1, max_bin_size 1668 sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 1) = full_dist(mepos*n_threads + j, i)%istart 1669 sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 2) = full_dist(mepos*n_threads + j, i)%number_of_atom_quartets 1670 sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 3) = full_dist(mepos*n_threads + j, i)%cost 1671 END DO 1672 END DO 1673 dest = MODULO(mepos + 1, ncpu) 1674 source = MODULO(mepos - 1, ncpu) 1675 ! sync before/after ring of isendrecv 1676 CALL mp_sync(para_env%group) 1677 DO icpu = 0, ncpu - 1 1678 IF (icpu .NE. ncpu - 1) THEN 1679 CALL mp_isendrecv(sendbuffer, dest, recbuffer, source, & 1680 para_env%group, req(1), req(2), 13) 1681 ENDIF 1682 data_from = MODULO(mepos - icpu, ncpu) 1683 DO j = 1, n_threads 1684 DO i = 1, max_bin_size 1685 full_dist(data_from*n_threads + j, i)%istart = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 1) 1686 full_dist(data_from*n_threads + j, i)%number_of_atom_quartets = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 2) 1687 full_dist(data_from*n_threads + j, i)%cost = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 3) 1688 END DO 1689 END DO 1690 1691 IF (icpu .NE. ncpu - 1) THEN 1692 CALL mp_waitall(req) 1693 ENDIF 1694 swapbuffer => sendbuffer 1695 sendbuffer => recbuffer 1696 recbuffer => swapbuffer 1697 ENDDO 1698 ! sync before/after ring of isendrecv 1699 DEALLOCATE (recbuffer, sendbuffer) 1700 CALL mp_sync(para_env%group) 1701!$OMP END MASTER 1702!$OMP BARRIER 1703 !! reorder the distribution according to the distribution vector 1704 ALLOCATE (tmp_pos(ncpu*n_threads)) 1705 tmp_pos = 1 1706 ALLOCATE (tmp_dist(nbins*ncpu*n_threads)) 1707 1708 tmp_dist(:)%istart = 0_int_8 1709 tmp_dist(:)%number_of_atom_quartets = 0_int_8 1710 tmp_dist(:)%cost = 0_int_8 1711 tmp_dist(:)%time_first_scf = 0.0_dp 1712 tmp_dist(:)%time_other_scf = 0.0_dp 1713 tmp_dist(:)%time_forces = 0.0_dp 1714 1715 mepos = my_process_id + 1 1716 DO icpu = 1, n_processes 1717 DO i = 1, bin_histogram(icpu, 1) 1718 IF (shm_distribution_vector(bin_histogram(icpu, 2) - bin_histogram(icpu, 1) + i) == mepos) THEN 1719 tmp_dist(tmp_pos(mepos)) = full_dist(icpu, i) 1720 tmp_pos(mepos) = tmp_pos(mepos) + 1 1721 END IF 1722 END DO 1723 END DO 1724 1725 !! Assign the load to each process 1726 NULLIFY (ptr_to_tmp_dist) 1727 mepos = my_process_id + 1 1728 ptr_to_tmp_dist => tmp_dist(1:tmp_pos(mepos) - 1) 1729 SELECT CASE (eval_type) 1730 CASE (hfx_do_eval_energy) 1731 CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data) 1732 CASE (hfx_do_eval_forces) 1733 CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data) 1734 END SELECT 1735 1736!$OMP BARRIER 1737!$OMP MASTER 1738 DEALLOCATE (full_dist, cost_matrix, shm_distribution_vector) 1739 DEALLOCATE (bins_per_rank, bin_histogram) 1740!$OMP END MASTER 1741!$OMP BARRIER 1742 DEALLOCATE (tmp_dist, tmp_pos) 1743 DEALLOCATE (binned_dist, local_cost_matrix) 1744 END IF 1745!$OMP BARRIER 1746!$OMP MASTER 1747 CALL timestop(handle) 1748!$OMP END MASTER 1749!$OMP BARRIER 1750 1751 END SUBROUTINE hfx_update_load_balance 1752 1753! ************************************************************************************************** 1754!> \brief estimates the cost of a set quartet with info available at load balance time 1755!> i.e. without much info on the primitives primitives 1756!> \param nsa ... 1757!> \param nsb ... 1758!> \param nsc ... 1759!> \param nsd ... 1760!> \param npgfa ... 1761!> \param npgfb ... 1762!> \param npgfc ... 1763!> \param npgfd ... 1764!> \param ratio ... 1765!> \param p1 ... 1766!> \param p2 ... 1767!> \param p3 ... 1768!> \return ... 1769!> \par History 1770!> 08.2009 created Joost VandeVondele 1771!> \author Joost VandeVondele 1772! ************************************************************************************************** 1773 FUNCTION cost_model(nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd, ratio, p1, p2, p3) RESULT(res) 1774 IMPLICIT NONE 1775 REAL(KIND=dp) :: estimate1, estimate2, estimate, ratio, switch, mu, sigma 1776 INTEGER(KIND=int_8) :: res 1777 REAL(KIND=dp), INTENT(IN) :: p1(12), p2(12), p3(2) 1778 1779 INTEGER :: nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd 1780 1781 estimate1 = estimate_basic(p1) 1782 estimate2 = estimate_basic(p2) 1783 mu = LOG(ABS(1.0E6_dp*p3(1)) + 1) 1784 sigma = p3(2)*0.1_dp*mu 1785 switch = 1.0_dp/(1.0_dp + EXP((LOG(estimate1) - mu)/sigma)) 1786 estimate = estimate1*(1.0_dp - switch) + estimate2*switch 1787 res = INT(estimate*0.001_dp, KIND=int_8) + 1 1788 1789 CONTAINS 1790 1791! ************************************************************************************************** 1792!> \brief ... 1793!> \param p ... 1794!> \return ... 1795! ************************************************************************************************** 1796 REAL(KIND=dp) FUNCTION estimate_basic(p) RESULT(res) 1797 REAL(KIND=dp) :: p(12) 1798 1799 REAL(KIND=dp) :: p1, p10, p11, p12, p2, p3, p4, p5, p6, & 1800 p7, p8, p9 1801 1802 p1 = p(1); p2 = p(2); p3 = p(3); p4 = p(4) 1803 p5 = p(5); p6 = p(6); p7 = p(7); p8 = p(8) 1804 p9 = p(9); p10 = p(10); p11 = p(11); p12 = p(12) 1805 res = poly2(nsa, p1, p2, p3)*poly2(nsb, p1, p2, p3)*poly2(nsc, p1, p2, p3)*poly2(nsd, p1, p2, p3)* & 1806 poly2(npgfa, p4, p5, p6)*poly2(npgfb, p4, p5, p6)*poly2(npgfc, p4, p5, p6)* & 1807 poly2(npgfd, p4, p5, p6)*EXP(-p7*ratio + p8*ratio**2) + & 1808 1000.0_dp*p9 + poly2(nsa, p10, p11, p12)*poly2(nsb, p10, p11, p12)*poly2(nsc, p10, p11, p12)*poly2(nsd, p10, p11, p12) 1809 res = 1 + ABS(res) 1810 END FUNCTION estimate_basic 1811 1812! ************************************************************************************************** 1813!> \brief ... 1814!> \param x ... 1815!> \param a0 ... 1816!> \param a1 ... 1817!> \param a2 ... 1818!> \return ... 1819! ************************************************************************************************** 1820 REAL(KIND=dp) FUNCTION poly2(x, a0, a1, a2) 1821 INTEGER :: x 1822 REAL(KIND=dp) :: a0, a1, a2 1823 1824 poly2 = a0 + a1*x + a2*x*x 1825 END FUNCTION poly2 1826 1827 END FUNCTION cost_model 1828! ************************************************************************************************** 1829!> \brief Minimizes the maximum cost per cpu by shuffling around all bins 1830!> \param total_number_of_bins ... 1831!> \param number_of_processes ... 1832!> \param bin_costs costs per bin 1833!> \param distribution_vector will contain the final distribution 1834!> \param do_randomize ... 1835!> \par History 1836!> 03.2009 created from a hack by Joost [Manuel Guidon] 1837!> \author Manuel Guidon 1838! ************************************************************************************************** 1839 SUBROUTINE optimize_distribution(total_number_of_bins, number_of_processes, bin_costs, & 1840 distribution_vector, do_randomize) 1841 INTEGER :: total_number_of_bins, number_of_processes 1842 INTEGER(int_8), DIMENSION(:), POINTER :: bin_costs 1843 INTEGER, DIMENSION(:), POINTER :: distribution_vector 1844 LOGICAL, INTENT(IN) :: do_randomize 1845 1846 CHARACTER(LEN=*), PARAMETER :: routineN = 'optimize_distribution', & 1847 routineP = moduleN//':'//routineN 1848 1849 INTEGER :: i, itmp, j, nstep 1850 INTEGER(int_8), DIMENSION(:), POINTER :: my_cost_cpu, tmp_cost, tmp_cpu_cost 1851 INTEGER, DIMENSION(:), POINTER :: tmp_cpu_index, tmp_index 1852 TYPE(rng_stream_type), POINTER :: rng_stream 1853 1854 nstep = MAX(1, INT(number_of_processes)/2) 1855 1856 ALLOCATE (tmp_cost(total_number_of_bins)) 1857 ALLOCATE (tmp_index(total_number_of_bins)) 1858 ALLOCATE (tmp_cpu_cost(number_of_processes)) 1859 ALLOCATE (tmp_cpu_index(number_of_processes)) 1860 ALLOCATE (my_cost_cpu(number_of_processes)) 1861 tmp_cost = bin_costs 1862 1863 CALL sort(tmp_cost, total_number_of_bins, tmp_index) 1864 my_cost_cpu = 0 1865 ! 1866 ! assign the largest remaining bin to the CPU with the smallest load 1867 ! gives near perfect distributions for a sufficient number of bins ... 1868 ! doing this in chunks of nstep (where nstep ~ number_of_processes) makes this n log n and gives 1869 ! each cpu a similar number of tasks. 1870 ! it also avoids degenerate cases where thousands of zero sized tasks 1871 ! are assigned to the same (least loaded) cpu 1872 ! 1873 IF (do_randomize) THEN 1874 NULLIFY (rng_stream) 1875 CALL create_rng_stream(rng_stream=rng_stream, & 1876 name="uniform_rng", & 1877 distribution_type=UNIFORM) 1878 END IF 1879 1880 DO i = total_number_of_bins, 1, -nstep 1881 tmp_cpu_cost = my_cost_cpu 1882 CALL sort(tmp_cpu_cost, INT(number_of_processes), tmp_cpu_index) 1883 IF (do_randomize) THEN 1884 CALL reshuffle(MIN(i, nstep), tmp_cpu_index(1:MIN(i, nstep)), rng_stream) 1885 END IF 1886 DO j = 1, MIN(i, nstep) 1887 itmp = tmp_cpu_index(j) 1888 distribution_vector(tmp_index(i - j + 1)) = itmp 1889 my_cost_cpu(itmp) = my_cost_cpu(itmp) + bin_costs(tmp_index(i - j + 1)) 1890 ENDDO 1891 ENDDO 1892 1893 IF (do_randomize) THEN 1894 CALL delete_rng_stream(rng_stream) 1895 END IF 1896 1897 DEALLOCATE (tmp_cost, tmp_index, tmp_cpu_cost) 1898 DEALLOCATE (tmp_cpu_index, my_cost_cpu) 1899 END SUBROUTINE optimize_distribution 1900 1901! ************************************************************************************************** 1902!> \brief Given a 2d index pair, this function returns a 1d index pair for 1903!> a symmetric upper triangle NxN matrix 1904!> The compiler should inline this function, therefore it appears in 1905!> several modules 1906!> \param i 2d index 1907!> \param j 2d index 1908!> \param N matrix size 1909!> \return ... 1910!> \par History 1911!> 03.2009 created [Manuel Guidon] 1912!> \author Manuel Guidon 1913! ************************************************************************************************** 1914 PURE FUNCTION get_1D_idx(i, j, N) 1915 INTEGER, INTENT(IN) :: i, j 1916 INTEGER(int_8), INTENT(IN) :: N 1917 INTEGER(int_8) :: get_1D_idx 1918 1919 INTEGER(int_8) :: min_ij 1920 1921 min_ij = MIN(i, j) 1922 get_1D_idx = min_ij*N + MAX(i, j) - (min_ij - 1)*min_ij/2 - N 1923 1924 END FUNCTION get_1D_idx 1925 1926! ************************************************************************************************** 1927!> \brief ... 1928!> \param natom ... 1929!> \param nkind ... 1930!> \param list_ij ... 1931!> \param list_kl ... 1932!> \param set_list_ij ... 1933!> \param set_list_kl ... 1934!> \param iatom_start ... 1935!> \param iatom_end ... 1936!> \param jatom_start ... 1937!> \param jatom_end ... 1938!> \param katom_start ... 1939!> \param katom_end ... 1940!> \param latom_start ... 1941!> \param latom_end ... 1942!> \param particle_set ... 1943!> \param coeffs_set ... 1944!> \param coeffs_kind ... 1945!> \param is_assoc_atomic_block_global ... 1946!> \param do_periodic ... 1947!> \param kind_of ... 1948!> \param basis_parameter ... 1949!> \param pmax_set ... 1950!> \param pmax_atom ... 1951!> \param pmax_blocks ... 1952!> \param cell ... 1953!> \param do_p_screening ... 1954!> \param map_atom_to_kind_atom ... 1955!> \param eval_type ... 1956!> \param log10_eps_schwarz ... 1957!> \param log_2 ... 1958!> \param coeffs_kind_max0 ... 1959!> \param use_virial ... 1960!> \param atomic_pair_list ... 1961!> \return ... 1962! ************************************************************************************************** 1963 FUNCTION estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, & 1964 iatom_start, iatom_end, jatom_start, jatom_end, & 1965 katom_start, katom_end, latom_start, latom_end, & 1966 particle_set, & 1967 coeffs_set, coeffs_kind, & 1968 is_assoc_atomic_block_global, do_periodic, & 1969 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, & 1970 cell, & 1971 do_p_screening, map_atom_to_kind_atom, eval_type, & 1972 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, & 1973 atomic_pair_list) 1974 1975 INTEGER, INTENT(IN) :: natom, nkind 1976 TYPE(pair_list_type) :: list_ij, list_kl 1977 TYPE(pair_set_list_type), DIMENSION(:) :: set_list_ij, set_list_kl 1978 INTEGER, INTENT(IN) :: iatom_start, iatom_end, jatom_start, & 1979 jatom_end, katom_start, katom_end, & 1980 latom_start, latom_end 1981 TYPE(particle_type), DIMENSION(:), POINTER :: particle_set 1982 TYPE(hfx_screen_coeff_type), & 1983 DIMENSION(:, :, :, :), POINTER :: coeffs_set 1984 TYPE(hfx_screen_coeff_type), & 1985 DIMENSION(nkind, nkind) :: coeffs_kind 1986 INTEGER, DIMENSION(:, :) :: is_assoc_atomic_block_global 1987 LOGICAL :: do_periodic 1988 INTEGER :: kind_of(*) 1989 TYPE(hfx_basis_type), DIMENSION(:), POINTER :: basis_parameter 1990 TYPE(hfx_p_kind), DIMENSION(:), POINTER :: pmax_set 1991 REAL(dp), DIMENSION(:, :), POINTER :: pmax_atom 1992 REAL(dp) :: pmax_blocks 1993 TYPE(cell_type), POINTER :: cell 1994 LOGICAL, INTENT(IN) :: do_p_screening 1995 INTEGER, DIMENSION(:), POINTER :: map_atom_to_kind_atom 1996 INTEGER, INTENT(IN) :: eval_type 1997 REAL(dp) :: log10_eps_schwarz, log_2, & 1998 coeffs_kind_max0 1999 LOGICAL, INTENT(IN) :: use_virial 2000 LOGICAL, DIMENSION(natom, natom) :: atomic_pair_list 2001 INTEGER(int_8) :: estimate_block_cost 2002 2003 INTEGER :: i_list_ij, i_list_kl, i_set_list_ij, i_set_list_ij_start, i_set_list_ij_stop, & 2004 i_set_list_kl, i_set_list_kl_start, i_set_list_kl_stop, iatom, ikind, iset, jatom, jkind, & 2005 jset, katom, kind_kind_idx, kkind, kset, latom, lkind, lset, swap_id 2006 INTEGER, DIMENSION(:), POINTER :: npgfa, npgfb, npgfc, npgfd, nsgfa, & 2007 nsgfb, nsgfc, nsgfd 2008 REAL(dp) :: actual_pmax_atom, cost_tmp, max_val1, & 2009 max_val2, pmax_entry, rab2, rcd2, & 2010 screen_kind_ij, screen_kind_kl 2011 REAL(dp), DIMENSION(:, :), POINTER :: ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4 2012 2013 estimate_block_cost = 0_int_8 2014 2015 CALL build_pair_list(natom, list_ij, set_list_ij, iatom_start, iatom_end, jatom_start, jatom_end, & 2016 kind_of, basis_parameter, particle_set, & 2017 do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, & 2018 log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list) 2019 2020 CALL build_pair_list(natom, list_kl, set_list_kl, katom_start, katom_end, latom_start, latom_end, & 2021 kind_of, basis_parameter, particle_set, & 2022 do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, & 2023 log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list) 2024 2025 DO i_list_ij = 1, list_ij%n_element 2026 iatom = list_ij%elements(i_list_ij)%pair(1) 2027 jatom = list_ij%elements(i_list_ij)%pair(2) 2028 i_set_list_ij_start = list_ij%elements(i_list_ij)%set_bounds(1) 2029 i_set_list_ij_stop = list_ij%elements(i_list_ij)%set_bounds(2) 2030 ikind = list_ij%elements(i_list_ij)%kind_pair(1) 2031 jkind = list_ij%elements(i_list_ij)%kind_pair(2) 2032 rab2 = list_ij%elements(i_list_ij)%dist2 2033 2034 nsgfa => basis_parameter(ikind)%nsgf 2035 nsgfb => basis_parameter(jkind)%nsgf 2036 npgfa => basis_parameter(ikind)%npgf 2037 npgfb => basis_parameter(jkind)%npgf 2038 2039 DO i_list_kl = 1, list_kl%n_element 2040 2041 katom = list_kl%elements(i_list_kl)%pair(1) 2042 latom = list_kl%elements(i_list_kl)%pair(2) 2043 2044 IF (.NOT. (katom + latom <= iatom + jatom)) CYCLE 2045 IF (((iatom + jatom) .EQ. (katom + latom)) .AND. (katom < iatom)) CYCLE 2046 2047 IF (eval_type == hfx_do_eval_forces) THEN 2048 IF (.NOT. use_virial) THEN 2049 IF ((iatom == jatom .AND. iatom == katom .AND. iatom == latom)) CYCLE 2050 END IF 2051 END IF 2052 2053 i_set_list_kl_start = list_kl%elements(i_list_kl)%set_bounds(1) 2054 i_set_list_kl_stop = list_kl%elements(i_list_kl)%set_bounds(2) 2055 kkind = list_kl%elements(i_list_kl)%kind_pair(1) 2056 lkind = list_kl%elements(i_list_kl)%kind_pair(2) 2057 rcd2 = list_kl%elements(i_list_kl)%dist2 2058 2059 nsgfc => basis_parameter(kkind)%nsgf 2060 nsgfd => basis_parameter(lkind)%nsgf 2061 npgfc => basis_parameter(kkind)%npgf 2062 npgfd => basis_parameter(lkind)%npgf 2063 2064 IF (do_p_screening) THEN 2065 actual_pmax_atom = MAX(pmax_atom(katom, iatom), & 2066 pmax_atom(latom, jatom), & 2067 pmax_atom(latom, iatom), & 2068 pmax_atom(katom, jatom)) 2069 ELSE 2070 actual_pmax_atom = 0.0_dp 2071 END IF 2072 2073 screen_kind_ij = coeffs_kind(jkind, ikind)%x(1)*rab2 + & 2074 coeffs_kind(jkind, ikind)%x(2) 2075 screen_kind_kl = coeffs_kind(lkind, kkind)%x(1)*rcd2 + & 2076 coeffs_kind(lkind, kkind)%x(2) 2077 IF (screen_kind_ij + screen_kind_kl + actual_pmax_atom < log10_eps_schwarz) CYCLE 2078 2079 IF (.NOT. (is_assoc_atomic_block_global(latom, iatom) >= 1 .AND. & 2080 is_assoc_atomic_block_global(katom, iatom) >= 1 .AND. & 2081 is_assoc_atomic_block_global(katom, jatom) >= 1 .AND. & 2082 is_assoc_atomic_block_global(latom, jatom) >= 1)) CYCLE 2083 2084 IF (do_p_screening) THEN 2085 SELECT CASE (eval_type) 2086 CASE (hfx_do_eval_energy) 2087 swap_id = 0 2088 kind_kind_idx = INT(get_1D_idx(kkind, ikind, INT(nkind, int_8))) 2089 IF (ikind >= kkind) THEN 2090 ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, & 2091 map_atom_to_kind_atom(katom), & 2092 map_atom_to_kind_atom(iatom)) 2093 ELSE 2094 ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, & 2095 map_atom_to_kind_atom(iatom), & 2096 map_atom_to_kind_atom(katom)) 2097 swap_id = swap_id + 1 2098 END IF 2099 kind_kind_idx = INT(get_1D_idx(lkind, jkind, INT(nkind, int_8))) 2100 IF (jkind >= lkind) THEN 2101 ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, & 2102 map_atom_to_kind_atom(latom), & 2103 map_atom_to_kind_atom(jatom)) 2104 ELSE 2105 ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, & 2106 map_atom_to_kind_atom(jatom), & 2107 map_atom_to_kind_atom(latom)) 2108 swap_id = swap_id + 2 2109 END IF 2110 kind_kind_idx = INT(get_1D_idx(lkind, ikind, INT(nkind, int_8))) 2111 IF (ikind >= lkind) THEN 2112 ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, & 2113 map_atom_to_kind_atom(latom), & 2114 map_atom_to_kind_atom(iatom)) 2115 ELSE 2116 ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, & 2117 map_atom_to_kind_atom(iatom), & 2118 map_atom_to_kind_atom(latom)) 2119 swap_id = swap_id + 4 2120 END IF 2121 kind_kind_idx = INT(get_1D_idx(kkind, jkind, INT(nkind, int_8))) 2122 IF (jkind >= kkind) THEN 2123 ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, & 2124 map_atom_to_kind_atom(katom), & 2125 map_atom_to_kind_atom(jatom)) 2126 ELSE 2127 ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, & 2128 map_atom_to_kind_atom(jatom), & 2129 map_atom_to_kind_atom(katom)) 2130 swap_id = swap_id + 8 2131 END IF 2132 CASE (hfx_do_eval_forces) 2133 swap_id = 16 2134 kind_kind_idx = INT(get_1D_idx(kkind, ikind, INT(nkind, int_8))) 2135 IF (ikind >= kkind) THEN 2136 ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, & 2137 map_atom_to_kind_atom(katom), & 2138 map_atom_to_kind_atom(iatom)) 2139 ELSE 2140 ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, & 2141 map_atom_to_kind_atom(iatom), & 2142 map_atom_to_kind_atom(katom)) 2143 swap_id = swap_id + 1 2144 END IF 2145 kind_kind_idx = INT(get_1D_idx(lkind, jkind, INT(nkind, int_8))) 2146 IF (jkind >= lkind) THEN 2147 ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, & 2148 map_atom_to_kind_atom(latom), & 2149 map_atom_to_kind_atom(jatom)) 2150 ELSE 2151 ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, & 2152 map_atom_to_kind_atom(jatom), & 2153 map_atom_to_kind_atom(latom)) 2154 swap_id = swap_id + 2 2155 END IF 2156 kind_kind_idx = INT(get_1D_idx(lkind, ikind, INT(nkind, int_8))) 2157 IF (ikind >= lkind) THEN 2158 ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, & 2159 map_atom_to_kind_atom(latom), & 2160 map_atom_to_kind_atom(iatom)) 2161 ELSE 2162 ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, & 2163 map_atom_to_kind_atom(iatom), & 2164 map_atom_to_kind_atom(latom)) 2165 swap_id = swap_id + 4 2166 END IF 2167 kind_kind_idx = INT(get_1D_idx(kkind, jkind, INT(nkind, int_8))) 2168 IF (jkind >= kkind) THEN 2169 ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, & 2170 map_atom_to_kind_atom(katom), & 2171 map_atom_to_kind_atom(jatom)) 2172 ELSE 2173 ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, & 2174 map_atom_to_kind_atom(jatom), & 2175 map_atom_to_kind_atom(katom)) 2176 swap_id = swap_id + 8 2177 END IF 2178 END SELECT 2179 END IF 2180 2181 DO i_set_list_ij = i_set_list_ij_start, i_set_list_ij_stop 2182 iset = set_list_ij(i_set_list_ij)%pair(1) 2183 jset = set_list_ij(i_set_list_ij)%pair(2) 2184 2185 max_val1 = coeffs_set(jset, iset, jkind, ikind)%x(1)*rab2 + & 2186 coeffs_set(jset, iset, jkind, ikind)%x(2) 2187 2188 IF (max_val1 + screen_kind_kl + actual_pmax_atom < log10_eps_schwarz) CYCLE 2189 DO i_set_list_kl = i_set_list_kl_start, i_set_list_kl_stop 2190 kset = set_list_kl(i_set_list_kl)%pair(1) 2191 lset = set_list_kl(i_set_list_kl)%pair(2) 2192 2193 max_val2 = max_val1 + (coeffs_set(lset, kset, lkind, kkind)%x(1)*rcd2 + & 2194 coeffs_set(lset, kset, lkind, kkind)%x(2)) 2195 2196 IF (max_val2 + actual_pmax_atom < log10_eps_schwarz) CYCLE 2197 IF (do_p_screening) THEN 2198 CALL get_pmax_val(ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4, & 2199 iset, jset, kset, lset, & 2200 pmax_entry, swap_id) 2201 IF (eval_type == hfx_do_eval_forces) THEN 2202 pmax_entry = log_2 + pmax_entry 2203 END IF 2204 ELSE 2205 pmax_entry = 0.0_dp 2206 END IF 2207 max_val2 = max_val2 + pmax_entry 2208 IF (max_val2 < log10_eps_schwarz) CYCLE 2209 SELECT CASE (eval_type) 2210 CASE (hfx_do_eval_energy) 2211 cost_tmp = cost_model(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), & 2212 npgfa(iset), npgfb(jset), npgfc(kset), npgfd(lset), & 2213 max_val2/log10_eps_schwarz, & 2214 p1_energy, p2_energy, p3_energy) 2215 estimate_block_cost = estimate_block_cost + INT(cost_tmp, KIND=int_8) 2216 CASE (hfx_do_eval_forces) 2217 cost_tmp = cost_model(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), & 2218 npgfa(iset), npgfb(jset), npgfc(kset), npgfd(lset), & 2219 max_val2/log10_eps_schwarz, & 2220 p1_forces, p2_forces, p3_forces) 2221 estimate_block_cost = estimate_block_cost + INT(cost_tmp, KIND=int_8) 2222 END SELECT 2223 ENDDO ! i_set_list_kl 2224 ENDDO ! i_set_list_ij 2225 ENDDO ! i_list_kl 2226 ENDDO ! i_list_ij 2227 2228 END FUNCTION estimate_block_cost 2229 2230! ************************************************************************************************** 2231!> \brief ... 2232!> \param nkind ... 2233!> \param para_env ... 2234!> \param natom ... 2235!> \param block_size ... 2236!> \param nblock ... 2237!> \param blocks ... 2238!> \param list_ij ... 2239!> \param list_kl ... 2240!> \param set_list_ij ... 2241!> \param set_list_kl ... 2242!> \param particle_set ... 2243!> \param coeffs_set ... 2244!> \param coeffs_kind ... 2245!> \param is_assoc_atomic_block_global ... 2246!> \param do_periodic ... 2247!> \param kind_of ... 2248!> \param basis_parameter ... 2249!> \param pmax_set ... 2250!> \param pmax_atom ... 2251!> \param pmax_blocks ... 2252!> \param cell ... 2253!> \param do_p_screening ... 2254!> \param map_atom_to_kind_atom ... 2255!> \param eval_type ... 2256!> \param log10_eps_schwarz ... 2257!> \param log_2 ... 2258!> \param coeffs_kind_max0 ... 2259!> \param use_virial ... 2260!> \param atomic_pair_list ... 2261! ************************************************************************************************** 2262 SUBROUTINE init_blocks(nkind, para_env, natom, block_size, nblock, blocks, & 2263 list_ij, list_kl, set_list_ij, set_list_kl, & 2264 particle_set, & 2265 coeffs_set, coeffs_kind, & 2266 is_assoc_atomic_block_global, do_periodic, & 2267 kind_of, basis_parameter, pmax_set, pmax_atom, & 2268 pmax_blocks, cell, & 2269 do_p_screening, map_atom_to_kind_atom, eval_type, & 2270 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, & 2271 atomic_pair_list) 2272 2273 INTEGER, INTENT(IN) :: nkind 2274 TYPE(cp_para_env_type), POINTER :: para_env 2275 INTEGER :: natom, block_size, nblock 2276 TYPE(hfx_block_range_type), DIMENSION(1:nblock) :: blocks 2277 TYPE(pair_list_type) :: list_ij, list_kl 2278 TYPE(pair_set_list_type), DIMENSION(:) :: set_list_ij, set_list_kl 2279 TYPE(particle_type), DIMENSION(:), POINTER :: particle_set 2280 TYPE(hfx_screen_coeff_type), & 2281 DIMENSION(:, :, :, :), POINTER :: coeffs_set 2282 TYPE(hfx_screen_coeff_type), DIMENSION(:, :), & 2283 POINTER :: coeffs_kind 2284 INTEGER, DIMENSION(:, :) :: is_assoc_atomic_block_global 2285 LOGICAL :: do_periodic 2286 INTEGER :: kind_of(*) 2287 TYPE(hfx_basis_type), DIMENSION(:), POINTER :: basis_parameter 2288 TYPE(hfx_p_kind), DIMENSION(:), POINTER :: pmax_set 2289 REAL(dp), DIMENSION(:, :), POINTER :: pmax_atom 2290 REAL(dp) :: pmax_blocks 2291 TYPE(cell_type), POINTER :: cell 2292 LOGICAL, INTENT(IN) :: do_p_screening 2293 INTEGER, DIMENSION(:), POINTER :: map_atom_to_kind_atom 2294 INTEGER, INTENT(IN) :: eval_type 2295 REAL(dp) :: log10_eps_schwarz, log_2, & 2296 coeffs_kind_max0 2297 LOGICAL, INTENT(IN) :: use_virial 2298 LOGICAL, DIMENSION(natom, natom) :: atomic_pair_list 2299 2300 INTEGER :: atom_block, i, iatom_block, iatom_end, & 2301 iatom_start, my_cpu_rank, ncpus 2302 2303 DO atom_block = 0, nblock - 1 2304 iatom_block = MODULO(atom_block, nblock) + 1 2305 iatom_start = (iatom_block - 1)*block_size + 1 2306 iatom_end = MIN(iatom_block*block_size, natom) 2307 blocks(atom_block + 1)%istart = iatom_start 2308 blocks(atom_block + 1)%iend = iatom_end 2309 blocks(atom_block + 1)%cost = 0_int_8 2310 END DO 2311 2312 ncpus = para_env%num_pe 2313 my_cpu_rank = para_env%mepos 2314 DO i = 1, nblock 2315 IF (MODULO(i, ncpus) /= my_cpu_rank) THEN 2316 blocks(i)%istart = 0 2317 blocks(i)%iend = 0 2318 CYCLE 2319 END IF 2320 iatom_start = blocks(i)%istart 2321 iatom_end = blocks(i)%iend 2322 blocks(i)%cost = estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, & 2323 iatom_start, iatom_end, iatom_start, iatom_end, & 2324 iatom_start, iatom_end, iatom_start, iatom_end, & 2325 particle_set, & 2326 coeffs_set, coeffs_kind, & 2327 is_assoc_atomic_block_global, do_periodic, & 2328 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, & 2329 cell, & 2330 do_p_screening, map_atom_to_kind_atom, eval_type, & 2331 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list) 2332 2333 END DO 2334 END SUBROUTINE init_blocks 2335 2336! ************************************************************************************************** 2337!> \brief ... 2338!> \param para_env ... 2339!> \param x_data ... 2340!> \param iw ... 2341!> \param n_threads ... 2342!> \param i_thread ... 2343!> \param eval_type ... 2344! ************************************************************************************************** 2345 SUBROUTINE collect_load_balance_info(para_env, x_data, iw, n_threads, i_thread, & 2346 eval_type) 2347 2348 TYPE(cp_para_env_type), POINTER :: para_env 2349 TYPE(hfx_type), POINTER :: x_data 2350 INTEGER, INTENT(IN) :: iw, n_threads, i_thread, eval_type 2351 2352 CHARACTER(LEN=*), PARAMETER :: routineN = 'collect_load_balance_info', & 2353 routineP = moduleN//':'//routineN 2354 2355 INTEGER :: i, j, k, my_rank, nbins, nranks, & 2356 total_bins 2357 INTEGER(int_8) :: avg_bin, avg_rank, max_bin, max_rank, & 2358 min_bin, min_rank, sum_bin, sum_rank 2359 INTEGER(int_8), ALLOCATABLE, DIMENSION(:) :: buffer, buffer_in, buffer_out, summary 2360 INTEGER(int_8), ALLOCATABLE, DIMENSION(:), SAVE :: shm_cost_vector 2361 INTEGER, ALLOCATABLE, DIMENSION(:) :: bins_per_rank, rdispl, sort_idx 2362 INTEGER, ALLOCATABLE, DIMENSION(:), SAVE :: shm_bins_per_rank, shm_displ 2363 2364 SELECT CASE (eval_type) 2365 CASE (hfx_do_eval_energy) 2366 nbins = SIZE(x_data%distribution_energy) 2367 CASE (hfx_do_eval_forces) 2368 nbins = SIZE(x_data%distribution_forces) 2369 END SELECT 2370 2371!$OMP MASTER 2372 ALLOCATE (shm_bins_per_rank(n_threads)) 2373 ALLOCATE (shm_displ(n_threads + 1)) 2374!$OMP END MASTER 2375!$OMP BARRIER 2376 2377 shm_bins_per_rank(i_thread + 1) = nbins 2378!$OMP BARRIER 2379 nbins = 0 2380 DO i = 1, n_threads 2381 nbins = nbins + shm_bins_per_rank(i) 2382 END DO 2383 my_rank = para_env%mepos 2384 nranks = para_env%num_pe 2385 2386!$OMP BARRIER 2387!$OMP MASTER 2388 ALLOCATE (bins_per_rank(nranks)) 2389 bins_per_rank = 0 2390 2391 bins_per_rank(my_rank + 1) = nbins 2392 2393 CALL mp_sum(bins_per_rank, para_env%group) 2394 2395 total_bins = 0 2396 DO i = 1, nranks 2397 total_bins = total_bins + bins_per_rank(i) 2398 END DO 2399 2400 ALLOCATE (shm_cost_vector(2*total_bins)) 2401 shm_cost_vector = -1_int_8 2402 shm_displ(1) = 1 2403 DO i = 2, n_threads 2404 shm_displ(i) = shm_displ(i - 1) + shm_bins_per_rank(i - 1) 2405 END DO 2406 shm_displ(n_threads + 1) = nbins + 1 2407!$OMP END MASTER 2408!$OMP BARRIER 2409 j = 0 2410 SELECT CASE (eval_type) 2411 CASE (hfx_do_eval_energy) 2412 DO i = shm_displ(i_thread + 1), shm_displ(i_thread + 2) - 1 2413 j = j + 1 2414 shm_cost_vector(2*(i - 1) + 1) = x_data%distribution_energy(j)%cost 2415 shm_cost_vector(2*i) = INT(x_data%distribution_energy(j)%time_first_scf*10000.0_dp, KIND=int_8) 2416 END DO 2417 CASE (hfx_do_eval_forces) 2418 DO i = shm_displ(i_thread + 1), shm_displ(i_thread + 2) - 1 2419 j = j + 1 2420 shm_cost_vector(2*(i - 1) + 1) = x_data%distribution_forces(j)%cost 2421 shm_cost_vector(2*i) = INT(x_data%distribution_forces(j)%time_forces*10000.0_dp, KIND=int_8) 2422 END DO 2423 END SELECT 2424!$OMP BARRIER 2425!$OMP MASTER 2426 ! ** calculate offsets 2427 ALLOCATE (rdispl(nranks)) 2428 bins_per_rank(:) = bins_per_rank(:)*2 2429 rdispl(1) = 0 2430 DO i = 2, nranks 2431 rdispl(i) = rdispl(i - 1) + bins_per_rank(i - 1) 2432 END DO 2433 2434 ALLOCATE (buffer_in(2*nbins)) 2435 ALLOCATE (buffer_out(2*total_bins)) 2436 2437 DO i = 1, nbins 2438 buffer_in(2*(i - 1) + 1) = shm_cost_vector(2*(i - 1) + 1) 2439 buffer_in(2*i) = shm_cost_vector(2*i) 2440 END DO 2441 2442 CALL mp_gatherv(buffer_in, buffer_out, bins_per_rank, rdispl, para_env%source, para_env%group) 2443 2444 IF (iw > 0) THEN 2445 2446 ALLOCATE (summary(2*nranks)) 2447 summary = 0_int_8 2448 2449 WRITE (iw, '( /, 1X, 79("-") )') 2450 WRITE (iw, '( " -", 77X, "-" )') 2451 SELECT CASE (eval_type) 2452 CASE (hfx_do_eval_energy) 2453 WRITE (iw, '( " -", 20X, A, 19X, "-" )') ' HFX LOAD BALANCE INFORMATION - ENERGY ' 2454 CASE (hfx_do_eval_forces) 2455 WRITE (iw, '( " -", 20X, A, 19X, "-" )') ' HFX LOAD BALANCE INFORMATION - FORCES ' 2456 END SELECT 2457 WRITE (iw, '( " -", 77X, "-" )') 2458 WRITE (iw, '( 1X, 79("-") )') 2459 2460 WRITE (iw, FMT="(T3,A,T15,A,T35,A,T55,A)") "MPI RANK", "BIN #", "EST cost", "Processing time [s]" 2461 WRITE (iw, '( 1X, 79("-"), / )') 2462 k = 0 2463 DO i = 1, nranks 2464 DO j = 1, bins_per_rank(i)/2 2465 k = k + 1 2466 WRITE (iw, FMT="(T6,I5,T15,I5,T27,I16,T55,F19.8)") & 2467 i - 1, j, buffer_out(2*(k - 1) + 1), REAL(buffer_out(2*k), dp)/10000.0_dp 2468 summary(2*(i - 1) + 1) = summary(2*(i - 1) + 1) + buffer_out(2*(k - 1) + 1) 2469 summary(2*i) = summary(2*i) + buffer_out(2*k) 2470 END DO 2471 END DO 2472 2473 !** Summary 2474 max_bin = 0_int_8 2475 min_bin = HUGE(min_bin) 2476 sum_bin = 0_int_8 2477 DO i = 1, total_bins 2478 sum_bin = sum_bin + buffer_out(2*i) 2479 max_bin = MAX(max_bin, buffer_out(2*i)) 2480 min_bin = MIN(min_bin, buffer_out(2*i)) 2481 END DO 2482 avg_bin = sum_bin/total_bins 2483 2484 max_rank = 0_int_8 2485 min_rank = HUGE(min_rank) 2486 sum_rank = 0_int_8 2487 DO i = 1, nranks 2488 sum_rank = sum_rank + summary(2*i) 2489 max_rank = MAX(max_rank, summary(2*i)) 2490 min_rank = MIN(min_rank, summary(2*i)) 2491 END DO 2492 avg_rank = sum_rank/nranks 2493 2494 WRITE (iw, FMT='(/,T3,A,/)') "SUMMARY:" 2495 WRITE (iw, FMT="(T3,A,T35,F19.8)") "Max bin", REAL(max_bin, dp)/10000.0_dp 2496 WRITE (iw, FMT="(T3,A,T35,F19.8)") "Min bin", REAL(min_bin, dp)/10000.0_dp 2497 WRITE (iw, FMT="(T3,A,T35,F19.8)") "Sum bin", REAL(sum_bin, dp)/10000.0_dp 2498 WRITE (iw, FMT="(T3,A,T35,F19.8,/)") "Avg bin", REAL(avg_bin, dp)/10000.0_dp 2499 WRITE (iw, FMT="(T3,A,T35,F19.8)") "Max rank", REAL(max_rank, dp)/10000.0_dp 2500 WRITE (iw, FMT="(T3,A,T35,F19.8)") "Min rank", REAL(min_rank, dp)/10000.0_dp 2501 WRITE (iw, FMT="(T3,A,T35,F19.8)") "Sum rank", REAL(sum_rank, dp)/10000.0_dp 2502 WRITE (iw, FMT="(T3,A,T35,F19.8,/)") "Avg rank", REAL(avg_rank, dp)/10000.0_dp 2503 2504 ALLOCATE (buffer(nranks)) 2505 ALLOCATE (sort_idx(nranks)) 2506 2507 DO i = 1, nranks 2508 buffer(i) = summary(2*i) 2509 END DO 2510 2511 CALL sort(buffer, nranks, sort_idx) 2512 2513 WRITE (iw, FMT="(T3,A,T35,A,T55,A,/)") "MPI RANK", "EST cost", "Processing time [s]" 2514 DO i = nranks, 1, -1 2515 WRITE (iw, FMT="(T6,I5,T27,I16,T55,F19.8)") sort_idx(i) - 1, summary(2*(sort_idx(i) - 1) + 1), REAL(buffer(i), dp)/10000.0_dp 2516 END DO 2517 2518 DEALLOCATE (summary, buffer, sort_idx) 2519 2520 END IF 2521 2522 DEALLOCATE (buffer_in, buffer_out, rdispl) 2523 2524 CALL mp_sync(para_env%group) 2525 2526 DEALLOCATE (shm_bins_per_rank, shm_displ, shm_cost_vector) 2527!$OMP END MASTER 2528!$OMP BARRIER 2529 2530 END SUBROUTINE collect_load_balance_info 2531 2532! ************************************************************************************************** 2533!> \brief ... 2534!> \param size ... 2535!> \param array ... 2536!> \param rng_stream ... 2537! ************************************************************************************************** 2538 SUBROUTINE reshuffle(size, array, rng_stream) 2539 INTEGER :: size 2540 INTEGER, DIMENSION(size) :: array 2541 TYPE(rng_stream_type), POINTER :: rng_stream 2542 2543 INTEGER :: i, idx1, idx2, tmp 2544 REAL(dp) :: x 2545 2546 DO i = 1, size*10 2547 x = next_random_number(rng_stream) 2548 idx1 = INT(x*(size + 1 - 1)) + 1 2549 x = next_random_number(rng_stream) 2550 idx2 = INT(x*(size + 1 - 1)) + 1 2551 2552 tmp = array(idx1) 2553 array(idx1) = array(idx2) 2554 array(idx2) = tmp 2555 END DO 2556 2557 END SUBROUTINE 2558 2559#include "hfx_get_pmax_val.f90" 2560 2561END MODULE hfx_load_balance_methods 2562