!! Copyright (C) 2002-2015 M. Marques, A. Castro, A. Rubio, G. Bertsch, X. Andrade
!!
!! This program is free software; you can redistribute it and/or modify
!! it under the terms of the GNU General Public License as published by
!! the Free Software Foundation; either version 2, or (at your option)
!! any later version.
!!
!! This program is distributed in the hope that it will be useful,
!! but WITHOUT ANY WARRANTY; without even the implied warranty of
!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
!! GNU General Public License for more details.
!!
!! You should have received a copy of the GNU General Public License
!! along with this program; if not, write to the Free Software
!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
!! 02110-1301, USA.
!!

#include "global.h"

!> Given a number n_index of indices and their ranges index_range(1:n_index),
!! we divide the n_nodes into groups and create communicators for each group.
!! Each group is associated with one index. The min_range indicates the minimum
!! number of elements in each processor. For example, given min_range(1) = 25000,
!! the algorithm will try to put at least 25000 points on each processor.
!!
!! Example:
!! \verbatim
!! Given 3 indices with ranges
!!   index_range(1) = 100000, (number of points in the mesh)
!!   index_range(2) = 15,     (number of states)
!!   index_range(3) = 2,      (number of k-points)
!! \endverbatim
!! and 12 processors, we could get
!! \verbatim
!!   mc%group_sizes = (2, 3, 2)
!! \endverbatim
!! which means that
!! - space is divided into 2 domains per state,
!! - the states are divided into 3 groups, i.e. 5 states per processor, and
!! - the whole setting is duplicated because of the 2 k-points.
!!
!! To perform collective operations (like a reduce), you can use the communicators
!! provided in mc%group_comm(:). For example, to sum over states, the communicator
!! to use is mc%group_comm(P_STRATEGY_STATES).
!!
!! You can use the routine multicomm_strategy_is_parallel to know if a certain
!! index is parallelized.
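!!
!! As an illustration (a sketch only; my_sum is a hypothetical real(8) variable
!! and an MPI-enabled build is assumed), a sum over the states group could be
!! performed as
!! \verbatim
!!   call MPI_Allreduce(MPI_IN_PLACE, my_sum, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
!!     mc%group_comm(P_STRATEGY_STATES), mpi_err)
!! \endverbatim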

module multicomm_oct_m
  use blacs_oct_m
  use global_oct_m
  use messages_oct_m
  use mpi_oct_m
  use namespace_oct_m
#if defined(HAVE_OPENMP)
  use omp_lib
#endif
  use parser_oct_m
  use profiling_oct_m
  use utils_oct_m

  implicit none

  private

  public ::                          &
    multicomm_divide_range,          &
    multicomm_divide_range_omp,      &
#if defined(HAVE_MPI)
    multicomm_create_all_pairs,      &
#endif
    multicomm_t,                     &
    multicomm_all_pairs_t,           &
    multicomm_init,                  &
    multicomm_end,                   &
    multicomm_all_pairs_copy,        &
    multicomm_strategy_is_parallel,  &
    multicomm_is_slave,              &
    multicomm_have_slaves

  !> possible parallelization strategies
  integer, public, parameter ::  &
    P_STRATEGY_SERIAL  = 0,      & !< single domain, all states, k-points on a single processor
    P_STRATEGY_DOMAINS = 1,      & !< parallelization in domains
    P_STRATEGY_STATES  = 2,      & !< parallelization in states
    P_STRATEGY_KPOINTS = 3,      & !< parallelization in k-points
    P_STRATEGY_OTHER   = 4,      & !< something else like e-h pairs
    P_STRATEGY_MAX     = 4

  integer, public, parameter ::  &
    P_MASTER = 1,                &
    P_SLAVE  = 2

  integer, public, parameter ::  &
    PAR_AUTO = -1,               &
    PAR_NO   = 0

  integer, parameter :: n_par_types = 4
  character(len=11), parameter :: par_types(0:n_par_types) = &
    (/                &
    "serial     ",    &
    "ParDomains ",    &
    "ParStates  ",    &
    "ParKPoints ",    &
    "ParOther   "     &
    /)

  integer, parameter :: MAX_INDEX = 5

  !> Stores all communicators and groups
  type multicomm_t
    private
    integer                      :: n_node            !< Total number of nodes.

    integer, public              :: par_strategy      !< What kind of parallelization strategy should we use?

    integer, allocatable         :: group_sizes(:)    !< Number of processors in each group.
    integer, allocatable, public :: who_am_i(:)       !< Rank in the "line"-communicators.
    integer, allocatable, public :: group_comm(:)     !< "Line"-communicators I belong to.
    integer, public              :: dom_st_comm       !< States-domain plane communicator.
    integer, public              :: st_kpt_comm       !< K-points-states plane communicator.
    integer, public              :: dom_st_kpt_comm   !< K-points-states-domain cube communicator.

    integer                      :: nthreads          !< Number of OMP threads
    integer                      :: node_type         !< Is this node a P_MASTER or a P_SLAVE?
    logical                      :: have_slaves       !< are slaves available?

    integer                      :: full_comm         !< The base communicator.
    integer                      :: full_comm_rank    !< The rank in the base communicator.
    integer, public              :: master_comm       !< The communicator without slaves.
    integer                      :: master_comm_rank  !< The rank in the communicator without slaves.
    integer, public              :: slave_intercomm   !< the intercomm to communicate with slaves

    logical                      :: reorder_ranks     !< do we reorder ranks in a more compact way?
  end type multicomm_t

  !> An all-pairs communication schedule for a given group.
  type multicomm_all_pairs_t
    private
    type(mpi_grp_t)          :: grp             !< Schedule for this group.
    integer                  :: rounds          !< This many comm. rounds.
    integer, pointer, public :: schedule(:, :)  !< This is the schedule.
  end type multicomm_all_pairs_t

contains

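  ! A note on the encoding of par_strategy (a sketch of the convention used below):
  ! each strategy occupies one bit, so a run that is parallel in domains and in
  ! states has par_strategy = 2**(P_STRATEGY_DOMAINS - 1) + 2**(P_STRATEGY_STATES - 1) = 3,
  ! and multicomm_strategy_is_parallel(mc, P_STRATEGY_KPOINTS) returns .false. for it.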

  ! ---------------------------------------------------------
  subroutine multicomm_all_pairs_copy(apout, apin)
    type(multicomm_all_pairs_t), intent(inout) :: apout
    type(multicomm_all_pairs_t), intent(in)    :: apin

    PUSH_SUB(multicomm_all_pairs_copy)

    call mpi_grp_copy(apout%grp, apin%grp)
    apout%rounds = apin%rounds
    if(associated(apin%schedule)) then
      SAFE_ALLOCATE(apout%schedule(1:size(apin%schedule, 1), 1:size(apin%schedule, 2)))
      apout%schedule = apin%schedule
    end if

    POP_SUB(multicomm_all_pairs_copy)
  end subroutine multicomm_all_pairs_copy

  ! ---------------------------------------------------------
  !> create index and domain communicators
  subroutine multicomm_init(mc, namespace, base_grp, parallel_mask, default_mask, n_node, index_range, min_range)
    type(multicomm_t), intent(out)   :: mc
    type(namespace_t), intent(in)    :: namespace
    type(mpi_grp_t),   intent(inout) :: base_grp
    integer,           intent(in)    :: parallel_mask
    integer,           intent(in)    :: default_mask
    integer,           intent(in)    :: n_node
    integer,           intent(inout) :: index_range(:)
    integer,           intent(in)    :: min_range(:)

    integer :: ii, num_slaves, slave_level, ipar
    integer :: parse(1:P_STRATEGY_MAX), default(1:P_STRATEGY_MAX)

    PUSH_SUB(multicomm_init)

    mc%n_node = n_node

    call messages_print_stress(stdout, "Parallelization")

    !%Variable ReorderRanks
    !%Default no
    !%Type logical
    !%Section Execution::Parallelization
    !%Description
    !% This variable controls whether the ranks are reorganized to have a more
    !% compact distribution with respect to domain parallelization, which needs
    !% to communicate most often. Depending on the system, this can improve
    !% communication speeds.
    !%End
    call parse_variable(namespace, 'ReorderRanks', .false., mc%reorder_ranks)

    call messages_obsolete_variable(namespace, 'ParallelizationStrategy')
    call messages_obsolete_variable(namespace, 'ParallelizationGroupRanks')

    do ipar = 1, P_STRATEGY_MAX
      default(ipar) = PAR_NO
      if(bitand(default_mask, ibset(0, ipar - 1)) /= 0) then
        default(ipar) = PAR_AUTO
      end if
    end do

    !%Variable ParDomains
    !%Type integer
    !%Default auto
    !%Section Execution::Parallelization
    !%Description
    !% This variable controls the number of processors used for the
    !% parallelization in domains.
    !% The special value <tt>auto</tt>, the default, lets Octopus
    !% decide how many processors will be assigned to this
    !% strategy. To disable parallelization in domains, you can use
    !% <tt>ParDomains = no</tt> (or set the number of processors to
    !% 1).
    !%
    !% The total number of processors required is the product
    !% of the processors assigned to each parallelization strategy.
    !%Option auto -1
    !% The number of processors is assigned automatically.
    !%Option no 0
    !% This parallelization strategy is not used.
    !%End
    call parse_variable(namespace, 'ParDomains', default(P_STRATEGY_DOMAINS), parse(P_STRATEGY_DOMAINS))

    !%Variable ParStates
    !%Type integer
    !%Section Execution::Parallelization
    !%Description
    !% This variable controls the number of processors used for the
    !% parallelization in states. The special value <tt>auto</tt> lets
    !% Octopus decide how many processors will be assigned to this
    !% strategy. To disable parallelization in states, you can use
    !% <tt>ParStates = no</tt> (or set the number of processors to 1).
    !%
    !% The default value depends on the <tt>CalculationMode</tt>. For
    !% <tt>CalculationMode = td</tt> the default is <tt>auto</tt>, while
    !% for other modes the default is <tt>no</tt>.
    !%
    !% The total number of processors required is the product
    !% of the processors assigned to each parallelization strategy.
    !%Option auto -1
    !% The number of processors is assigned automatically.
    !%Option no 0
    !% This parallelization strategy is not used.
    !%End
    call parse_variable(namespace, 'ParStates', default(P_STRATEGY_STATES), parse(P_STRATEGY_STATES))

    !%Variable ParKPoints
    !%Type integer
    !%Default auto
    !%Section Execution::Parallelization
    !%Description
    !% This variable controls the number of processors used for the
    !% parallelization in k-points and/or spin.
    !% The special value <tt>auto</tt> lets Octopus decide how many processors will be
    !% assigned to this strategy. To disable parallelization in
    !% k-points, you can use <tt>ParKPoints = no</tt> (or set the
    !% number of processors to 1).
    !%
    !% The total number of processors required is the product
    !% of the processors assigned to each parallelization strategy.
    !%Option auto -1
    !% The number of processors is assigned automatically.
    !%Option no 0
    !% This parallelization strategy is not used.
    !%End
    call parse_variable(namespace, 'ParKPoints', default(P_STRATEGY_KPOINTS), parse(P_STRATEGY_KPOINTS))

    !%Variable ParOther
    !%Type integer
    !%Default auto
    !%Section Execution::Parallelization
    !%Description
    !% This variable controls the number of processors used for the
    !% 'other' parallelization mode, which is <tt>CalculationMode</tt>
    !% dependent. For <tt>CalculationMode = casida</tt>, it means
    !% parallelization in electron-hole pairs.
    !%
    !% The special value <tt>auto</tt>,
    !% the default, lets Octopus decide how many processors will be
    !% assigned to this strategy. To disable this kind of
    !% parallelization, you can use <tt>ParOther = no</tt> (or set the
    !% number of processors to 1).
    !%
    !% The total number of processors required is the product
    !% of the processors assigned to each parallelization strategy.
    !%Option auto -1
    !% The number of processors is assigned automatically.
    !%Option no 0
    !% This parallelization strategy is not used.
    !%End
    call parse_variable(namespace, 'ParOther', default(P_STRATEGY_OTHER), parse(P_STRATEGY_OTHER))

    do ipar = 1, P_STRATEGY_MAX
      if(parse(ipar) == PAR_NO) parse(ipar) = 1
    end do

    call strategy()

    mc%have_slaves = .false.

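    ! What follows distributes the processes over the parallel groups. As a sketch
    ! (assuming 12 processes with ParDomains = 2, ParStates = 3 and ParKPoints = 2,
    ! as in the example at the top of this file), group_sizes ends up as (2, 3, 2, 1),
    ! and sanity_check() verifies that its product matches the number of processes.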
    if(mc%par_strategy /= P_STRATEGY_SERIAL) then
      SAFE_ALLOCATE(mc%group_sizes(1:P_STRATEGY_MAX))

      mc%group_sizes = 1

      do ipar = 1, P_STRATEGY_MAX
        if(multicomm_strategy_is_parallel(mc, ipar)) then
          mc%group_sizes(ipar) = parse(ipar)
        else if(parse(ipar) /= 1) then
          call messages_write('Ignoring specification for ' // par_types(ipar))
          call messages_new_line()
          call messages_write('This parallelization strategy is not available.')
          call messages_warning()
        end if
      end do

      call assign_nodes()


      !%Variable ParallelizationNumberSlaves
      !%Type integer
      !%Default 0
      !%Section Execution::Parallelization
      !%Description
      !% Slaves are nodes used for task parallelization. The number of
      !% such nodes is given by this variable multiplied by the number
      !% of domains used in domain parallelization.
      !%End
      call parse_variable(namespace, 'ParallelizationNumberSlaves', 0, num_slaves)

      ! the slaves must be defined at a certain parallelization level; for the moment this is state parallelization
      slave_level = P_STRATEGY_STATES
      mc%have_slaves = (num_slaves > 0)

      if(mc%have_slaves) then
#ifndef HAVE_MPI2
        message(1) = 'Task parallelization requires MPI 2.'
        call messages_fatal(1)
#endif
        call messages_experimental('Task parallelization')
      end if

      ! clear parallel strategies that were available but will not be used
      do ii = 1, P_STRATEGY_MAX
        if(mc%group_sizes(ii) == 1) mc%par_strategy = ibclr(mc%par_strategy, ii - 1)
      end do

      ! check that the resulting distribution is consistent
      call sanity_check()
    end if

    call group_comm_create()

    call messages_print_stress(stdout)

    POP_SUB(multicomm_init)

  contains

    ! ---------------------------------------------------------
    subroutine strategy()
      integer :: jj, ipar

      PUSH_SUB(multicomm_init.strategy)

      if(base_grp%size > 1) then

        mc%par_strategy = 0

        do ipar = 1, P_STRATEGY_MAX
          if(parse(ipar) == PAR_AUTO .or. parse(ipar) > 1) then
            mc%par_strategy = ibset(mc%par_strategy, ipar - 1)
          end if
        end do

        if(mc%par_strategy /= bitand(mc%par_strategy, parallel_mask)) then
          call messages_write('Parallelization strategies unavailable for this run mode are being discarded.')
          call messages_warning()
        end if

        mc%par_strategy = bitand(mc%par_strategy, parallel_mask)

        if(mc%par_strategy == P_STRATEGY_SERIAL) then
          message(1) = "More than one node is available, but this run mode cannot run with the requested parallelization."
          message(2) = "Please select a parallelization strategy compatible with:"
          jj = 2
          do ii = 1, n_par_types
            if(bitand(parallel_mask, 2**(ii - 1)) /= 0) then
              jj = jj + 1
              write(message(jj), '(2a)') "  -> ", par_types(ii)
            end if
          end do
          jj = jj + 1
          write(message(jj),'(a,i6)') "mc%par_strategy is: ", mc%par_strategy
          call messages_fatal(jj, only_root_writes = .true.)
        end if
      else
        mc%par_strategy = P_STRATEGY_SERIAL
      end if

      mc%nthreads = 1
#if defined(HAVE_OPENMP)
      !$omp parallel
      !$omp master
      mc%nthreads = omp_get_num_threads()
      !$omp end master
      !$omp end parallel
#endif

      if(mc%par_strategy == P_STRATEGY_SERIAL .and. mc%nthreads == 1) then
        message(1) = "Info: Octopus will run in *serial*"
        call messages_info(1)
      else
        write(message(1),'(a)') 'Info: Octopus will run in *parallel*'
        write(message(2),'(a)') ''
        write(message(3),'(a, i8)') '      Number of processes           :', base_grp%size
        write(message(4),'(a, i8)') '      Number of threads per process :', mc%nthreads
        write(message(5),'(a)') ''
        call messages_info(5)
      end if

      POP_SUB(multicomm_init.strategy)
    end subroutine strategy

    ! ---------------------------------------------------------

    subroutine assign_nodes()
      integer :: ii, nn, kk, n_divisors, divisors(1:50)
      integer :: n_group_max(1:P_STRATEGY_MAX)

      PUSH_SUB(multicomm_init.assign_nodes)

      ! this is the maximum number of processors in each group
      n_group_max(1:P_STRATEGY_MAX) = max(index_range(1:P_STRATEGY_MAX), 1)
      do kk = 1, P_STRATEGY_MAX
        if(.not. multicomm_strategy_is_parallel(mc, kk)) n_group_max(kk) = 1
      end do

      if(debug%info) then
        call messages_write('Debug info: Allowable group ranks:', new_line = .true.)
        do kk = 1, P_STRATEGY_MAX
          call messages_write(par_types(kk), fmt = '2x,a12,":",1x')
          call messages_write(n_group_max(kk), new_line = .true.)
        end do
        call messages_info()
      end if

      nn = mc%n_node

      ! first loop: check the processor counts specified by the user
      do ipar = P_STRATEGY_MAX, 1, -1

        if(mc%group_sizes(ipar) == PAR_AUTO) cycle

        if(mc%group_sizes(ipar) > n_group_max(ipar)) then
          call messages_write('The number of processors specified for '//par_types(ipar)//' (')
          call messages_write(mc%group_sizes(ipar))
          call messages_write(')', new_line = .true.)
          call messages_write('is larger than the degrees of freedom for that level (')
          call messages_write(n_group_max(ipar))
          call messages_write(').')
          call messages_warning()
        end if

        if(mod(nn, mc%group_sizes(ipar)) /= 0) then
          call messages_write('The number of processors specified for '//par_types(ipar)//' (')
          call messages_write(mc%group_sizes(ipar))
          call messages_write(')', new_line = .true.)
          call messages_write('is not a divisor of the number of processors (')
          call messages_write(mc%n_node)
          call messages_write(').')
          call messages_fatal()
        end if

        nn = nn/mc%group_sizes(ipar)

      end do

      ! second loop: assign the remaining levels automatically
      do ipar = P_STRATEGY_MAX, 1, -1

        if(mc%group_sizes(ipar) /= PAR_AUTO) cycle

        n_divisors = ubound(divisors, dim = 1)
        call get_divisors(nn, n_divisors, divisors)

        mc%group_sizes(ipar) = nn
        do ii = 2, n_divisors
          if(divisors(ii) > n_group_max(ipar)) then
            mc%group_sizes(ipar) = divisors(ii - 1)
            exit
          end if
        end do

        nn = nn/mc%group_sizes(ipar)

      end do

      POP_SUB(multicomm_init.assign_nodes)
    end subroutine assign_nodes

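    ! A worked example of the automatic assignment above (a sketch): if 12 processes
    ! remain (nn = 12) and ParKPoints = auto with 2 k-points, the divisors of 12 are
    ! 1, 2, 3, 4, 6, 12; the largest one not exceeding the 2 degrees of freedom is 2,
    ! so the k-point group gets 2 processes and nn = 6 is left for the lower levels.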

    ! ---------------------------------------------------------
    !> check if a balanced distribution of nodes will be used
    subroutine sanity_check()
      FLOAT :: frac
      integer :: ii, kk, n_max
      integer :: real_group_sizes(1:MAX_INDEX)

      PUSH_SUB(multicomm_init.sanity_check)

      if(num_slaves > 0) then

        if(mc%group_sizes(slave_level) < num_slaves + 1) then
          message(1) = 'Too many nodes assigned to task parallelization.'
          call messages_fatal(1)
        end if

        write(message(1),'(a,i6)') 'Info: Number of slave nodes :', &
          num_slaves*product(mc%group_sizes(1:slave_level - 1))
        call messages_info(1)

      end if

      ! print out some info
      ii = 0
      do kk = P_STRATEGY_MAX, 1, -1
        real_group_sizes(kk) = mc%group_sizes(kk)
        if(.not. multicomm_strategy_is_parallel(mc, kk)) cycle
        ii = ii + 1
        if(kk == slave_level) INCR(real_group_sizes(kk), -num_slaves)
        write(message(ii),'(3a,i6,a,i8,a)') 'Info: Number of nodes in ', &
          par_types(kk), ' group:', real_group_sizes(kk), ' (', index_range(kk), ')'
      end do
      call messages_info(ii)

      ! do we have the correct number of processors?
      if(product(mc%group_sizes(1:P_STRATEGY_MAX)) /= base_grp%size) then
        write(message(1),'(a)') 'Inconsistent number of processors:'
        write(message(2),'(a,i6)') '  MPI processes      = ', base_grp%size
        write(message(3),'(a,i6)') '  Required processes = ', product(mc%group_sizes(1:P_STRATEGY_MAX))
        message(4) = ''
        message(5) = 'You probably have a problem in ParDomains, ParStates, ParKPoints or ParOther.'
        call messages_fatal(5, only_root_writes = .true.)
      end if

      if(any(real_group_sizes(1:P_STRATEGY_MAX) > index_range(1:P_STRATEGY_MAX))) then
        message(1) = "Could not distribute nodes in parallel job. Most likely you are trying to"
        message(2) = "use too many nodes for the job."
        call messages_fatal(2, only_root_writes = .true.)
      end if

      if(any(index_range(1:P_STRATEGY_MAX) / real_group_sizes(1:P_STRATEGY_MAX) < min_range(1:P_STRATEGY_MAX) .and. &
        real_group_sizes(1:P_STRATEGY_MAX) > 1)) then
        message(1) = "Some parallel groups have fewer elements than recommended."
        message(2) = "Maybe you should reduce the number of nodes."
        call messages_warning(2)
      end if

      ! calculate fraction of idle time
      frac = M_ONE
      do ii = 1, P_STRATEGY_MAX
        n_max = ceiling(TOFLOAT(index_range(ii)) / TOFLOAT(real_group_sizes(ii)))
        kk = n_max*real_group_sizes(ii)
        frac = frac*(M_ONE - TOFLOAT(kk - index_range(ii)) / TOFLOAT(kk))
      end do

      write(message(1), '(a,f5.2,a)') "Info: Octopus will waste at least ", &
        (M_ONE - frac)*CNST(100.0), "% of computer time."
      if(frac < CNST(0.8)) then
        message(2) = "Usually a number of processors which is a multiple of small primes is best."
        call messages_warning(2)
      else
        call messages_info(1)
      end if

      POP_SUB(multicomm_init.sanity_check)
    end subroutine sanity_check

    ! ---------------------------------------------------------
    subroutine group_comm_create()
#if defined(HAVE_MPI)
      logical :: dim_mask(MAX_INDEX)
      integer :: i_strategy, irank
      logical :: reorder, periodic_mask(MAX_INDEX)
      integer :: coords(MAX_INDEX)
      integer :: new_comm, new_comm_size
      character(len=6) :: node_type
      type(mpi_grp_t) :: reorder_grp
      integer :: base_group, reorder_group, ranks(base_grp%size)
      integer :: ii, jj, kk, ll, nn, reorder_comm
#endif

      PUSH_SUB(multicomm_init.group_comm_create)

      mc%node_type = P_MASTER

      SAFE_ALLOCATE(mc%group_comm(1:P_STRATEGY_MAX))
      SAFE_ALLOCATE(mc%who_am_i(1:P_STRATEGY_MAX))

#if defined(HAVE_MPI)
      mc%full_comm = MPI_COMM_NULL
      mc%slave_intercomm = MPI_COMM_NULL
      if(mc%par_strategy /= P_STRATEGY_SERIAL) then
        if(mc%reorder_ranks) then
          ! first, reorder the ranks
          ! this is done to get a column-major ordering of the ranks in the
          ! Cartesian communicator, since they are ordered row-major otherwise
          call MPI_Comm_group(base_grp%comm, base_group, mpi_err)
          if(mpi_err /= MPI_SUCCESS) then
            message(1) = "Error in getting MPI group!"
            call messages_fatal(1)
          end if
          ! now transpose the hypercube => get rank numbers in column-major order
          nn = 1
          do ii = 1, mc%group_sizes(1)
            do jj = 1, mc%group_sizes(2)
              do kk = 1, mc%group_sizes(3)
                do ll = 1, mc%group_sizes(4)
                  ranks(nn) = (ll - 1)*mc%group_sizes(3)*mc%group_sizes(2)*mc%group_sizes(1) &
                    + (kk - 1)*mc%group_sizes(2)*mc%group_sizes(1) &
                    + (jj - 1)*mc%group_sizes(1) + ii - 1
                  nn = nn + 1
                end do
              end do
            end do
          end do
          call MPI_Group_incl(base_group, base_grp%size, ranks, reorder_group, mpi_err)
          if(mpi_err /= MPI_SUCCESS) then
            message(1) = "Error in creating MPI group!"
            call messages_fatal(1)
          end if
          ! now get the reordered communicator
          call MPI_Comm_create(base_grp%comm, reorder_group, reorder_comm, mpi_err)
          if(mpi_err /= MPI_SUCCESS) then
            message(1) = "Error in creating reordered communicator!"
            call messages_fatal(1)
          end if
          call mpi_grp_init(reorder_grp, reorder_comm)
          call mpi_grp_copy(base_grp, reorder_grp)
        else
          call mpi_grp_copy(reorder_grp, base_grp)
        end if

        ! Multilevel parallelization is organized in a hypercube. We
        ! use an MPI Cartesian topology to generate the communicators
        ! that correspond to each level.

        ! create the topology
        periodic_mask = .false.
        reorder = .true.

        ! The domain and states dimensions have to be periodic (2D torus)
        ! in order to circulate matrix blocks.
        periodic_mask(P_STRATEGY_DOMAINS) = multicomm_strategy_is_parallel(mc, P_STRATEGY_DOMAINS)
        periodic_mask(P_STRATEGY_STATES) = multicomm_strategy_is_parallel(mc, P_STRATEGY_STATES)

        ! We allow reordering of ranks.
        call MPI_Cart_create(reorder_grp%comm, P_STRATEGY_MAX, mc%group_sizes, periodic_mask, reorder, mc%full_comm, mpi_err)

        call MPI_Comm_rank(mc%full_comm, mc%full_comm_rank, mpi_err)

        ! get the coordinates of the current processor
        call MPI_Cart_coords(mc%full_comm, mc%full_comm_rank, P_STRATEGY_MAX, coords, mpi_err)

        ! find out what type of node this is
        if(coords(slave_level) >= mc%group_sizes(slave_level) - num_slaves) then
          mc%node_type = P_SLAVE
        end if

        if(mc%node_type == P_MASTER) then
          INCR(mc%group_sizes(slave_level), -num_slaves)
        else
          mc%group_sizes(slave_level) = num_slaves
        end if

        call MPI_Comm_split(mc%full_comm, mc%node_type, mc%full_comm_rank, new_comm, mpi_err)
        ASSERT(new_comm /= MPI_COMM_NULL)
        call MPI_Comm_size(new_comm, new_comm_size, mpi_err)

        reorder = .false.
        if(product(mc%group_sizes(:)) /= new_comm_size) then
          write(stderr,*) 'node ', mpi_world%rank, ': mc%group_sizes = ', mc%group_sizes, ' new_comm_size = ', new_comm_size
          call MPI_Barrier(mpi_world%comm, mpi_err)
          ASSERT(product(mc%group_sizes(:)) == new_comm_size)
        end if
        call MPI_Cart_create(new_comm, P_STRATEGY_MAX, mc%group_sizes, periodic_mask, reorder, mc%master_comm, mpi_err)
        ASSERT(mc%master_comm /= MPI_COMM_NULL)

        call MPI_Comm_free(new_comm, mpi_err)

        call MPI_Comm_rank(mc%master_comm, mc%master_comm_rank, mpi_err)

        ! The "lines" of the Cartesian grid.
        ! Initialize all the communicators, even if they are not parallelized.
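        ! (For MPI_Cart_sub, the mask selects which dimensions of the Cartesian grid
        ! are kept in each sub-communicator. Keeping only P_STRATEGY_DOMAINS, for
        ! instance, yields one "line" communicator per fixed (states, k-points, other)
        ! coordinate, which is what mc%group_comm(P_STRATEGY_DOMAINS) then holds.)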
        do i_strategy = 1, P_STRATEGY_MAX
          dim_mask = .false.
          dim_mask(i_strategy) = .true.
          call MPI_Cart_sub(mc%master_comm, dim_mask, mc%group_comm(i_strategy), mpi_err)
          call MPI_Comm_rank(mc%group_comm(i_strategy), mc%who_am_i(i_strategy), mpi_err)
        end do

        ! The domain-states "planes" of the grid (the ones with periodic dimensions).
        dim_mask = .false.
        dim_mask(P_STRATEGY_DOMAINS) = .true.
        dim_mask(P_STRATEGY_STATES) = .true.
        call MPI_Cart_sub(mc%master_comm, dim_mask, mc%dom_st_comm, mpi_err)

        ! The states-kpoints "planes" of the grid.
        dim_mask = .false.
        dim_mask(P_STRATEGY_STATES) = .true.
        dim_mask(P_STRATEGY_KPOINTS) = .true.
        call MPI_Cart_sub(mc%master_comm, dim_mask, mc%st_kpt_comm, mpi_err)

        ! The domains-states-kpoints "cubes" of the grid.
        dim_mask = .false.
        dim_mask(P_STRATEGY_DOMAINS) = .true.
        dim_mask(P_STRATEGY_STATES) = .true.
        dim_mask(P_STRATEGY_KPOINTS) = .true.
        call MPI_Cart_sub(mc%master_comm, dim_mask, mc%dom_st_kpt_comm, mpi_err)

        if(num_slaves > 0) call create_slave_intercommunicators()
      else
        ! we initialize these communicators so we can use them even in serial
        mc%group_comm = base_grp%comm
        mc%who_am_i = 0
        mc%master_comm = base_grp%comm
        mc%dom_st_comm = base_grp%comm
        mc%st_kpt_comm = base_grp%comm
        mc%dom_st_kpt_comm = base_grp%comm
      end if

      ! This is temporary debugging information.
      if(debug%info .and. mc%par_strategy /= P_STRATEGY_SERIAL) then
        write(message(1),'(a)') 'Debug: MPI Task Assignment to MPI Groups'
        write(message(2),'(5a10)') 'World', 'Domains', 'States', 'K-Points', 'Other'
        call messages_info(2)

        if(mc%node_type == P_SLAVE) then
          node_type = "slave"
        else
          node_type = "master"
        end if
        do irank = 0, mpi_world%size - 1
          if(mpi_world%rank == irank) then
            write(message(1),'(5i10,5x,a)') mpi_world%rank, mc%who_am_i(P_STRATEGY_DOMAINS), mc%who_am_i(P_STRATEGY_STATES), &
              mc%who_am_i(P_STRATEGY_KPOINTS), mc%who_am_i(P_STRATEGY_OTHER), trim(node_type)
            call messages_info(1, all_nodes = .true.)
          end if
          call MPI_Barrier(mpi_world%comm, mpi_err)
        end do
      end if

#else
      mc%group_comm = -1
      mc%who_am_i = 0
      mc%full_comm = -1
      mc%master_comm = -1
      mc%dom_st_comm = -1
      mc%st_kpt_comm = -1
      mc%dom_st_kpt_comm = -1
      mc%slave_intercomm = -1
#endif

      POP_SUB(multicomm_init.group_comm_create)
    end subroutine group_comm_create

    ! -----------------------------------------------------

    subroutine create_slave_intercommunicators()
#ifdef HAVE_MPI2
      integer :: remote_leader
      integer :: tag
      integer :: coords(MAX_INDEX)

      PUSH_SUB(multicomm_init.create_slave_intercommunicators)

      ! create the intercommunicators to communicate with slaves

      ! get the coordinates of the current processor
      call MPI_Cart_coords(mc%full_comm, mc%full_comm_rank, P_STRATEGY_MAX, coords, mpi_err)

      ! now get the rank of the remote leader
      if(mc%node_type == P_SLAVE) then
        coords(slave_level) = 0
      else
        coords(slave_level) = mc%group_sizes(slave_level)
      end if
      call MPI_Cart_rank(mc%full_comm, coords, remote_leader, mpi_err)

      ! now create the intercommunicator
      tag = coords(P_STRATEGY_DOMAINS)
      call MPI_Intercomm_create(mc%group_comm(slave_level), 0, base_grp%comm, remote_leader, tag, mc%slave_intercomm, mpi_err)

      POP_SUB(multicomm_init.create_slave_intercommunicators)
#endif
    end subroutine create_slave_intercommunicators

  end subroutine multicomm_init


  ! ---------------------------------------------------------
  subroutine multicomm_end(mc)
    type(multicomm_t), intent(inout) :: mc

#if defined(HAVE_MPI)
    integer :: ii
#endif

    PUSH_SUB(multicomm_end)

    if(mc%par_strategy /= P_STRATEGY_SERIAL) then
#if defined(HAVE_MPI)
      ! Delete communicators.
      do ii = 1, P_STRATEGY_MAX
        ! initialized even if not parallelized
        call MPI_Comm_free(mc%group_comm(ii), mpi_err)
      end do
      call MPI_Comm_free(mc%dom_st_comm, mpi_err)
      call MPI_Comm_free(mc%st_kpt_comm, mpi_err)
      call MPI_Comm_free(mc%dom_st_kpt_comm, mpi_err)
      call MPI_Comm_free(mc%full_comm, mpi_err)
      call MPI_Comm_free(mc%master_comm, mpi_err)

#ifdef HAVE_MPI2
      if(multicomm_have_slaves(mc)) call MPI_Comm_free(mc%slave_intercomm, mpi_err)
#endif

#endif
    end if

    SAFE_DEALLOCATE_A(mc%group_sizes)
    SAFE_DEALLOCATE_A(mc%group_comm)
    SAFE_DEALLOCATE_A(mc%who_am_i)

    POP_SUB(multicomm_end)
  end subroutine multicomm_end


  ! ---------------------------------------------------------
  logical pure function multicomm_strategy_is_parallel(mc, level) result(rr)
    type(multicomm_t), intent(in) :: mc
    integer,           intent(in) :: level

    rr = bitand(mc%par_strategy, 2**(level - 1)) /= 0

  end function multicomm_strategy_is_parallel


  ! ---------------------------------------------------------
  !> This routine uses the one-factorization (or near-one-factorization)
  !! of a complete graph to construct an all-pair communication
  !! schedule (cf. Wang, X., Blum, E. K., Parker, D. S., and Massey,
  !! D. 1997. The dance-party problem and its application to collective
  !! communication in computer networks. Parallel Comput. 23, 8
  !! (Aug. 1997), 1141-1156).
#if defined(HAVE_MPI)

  subroutine multicomm_create_all_pairs(mpi_grp, ap)
    type(mpi_grp_t),             intent(in)  :: mpi_grp
    type(multicomm_all_pairs_t), intent(out) :: ap

    integer :: grp_size, rounds, ir, in

    PUSH_SUB(create_all_pairs)

    ap%grp = mpi_grp
    grp_size = mpi_grp%size

    ! Number of rounds.
    if(mod(grp_size, 2) == 0) then
      rounds = grp_size - 1
    else
      rounds = grp_size
    end if
    ap%rounds = rounds

    ! Calculate schedule.
    SAFE_ALLOCATE(ap%schedule(0:grp_size - 1, 1:rounds))
    do ir = 1, rounds
      do in = 0, grp_size - 1
        ap%schedule(in, ir) = get_partner(in + 1, ir) - 1
      end do
    end do

    POP_SUB(create_all_pairs)

  contains

    ! ---------------------------------------------------------
    !> These are from the paper cited above.
    integer pure function get_partner(in, ir)
      integer, intent(in) :: in, ir

      ! No PUSH SUB, called too often.

      if(mod(grp_size, 2) == 0) then
        get_partner = get_partner_even(grp_size, in - 1, ir - 1) + 1
      else
        get_partner = get_partner_odd(grp_size, in - 1, ir - 1) + 1
      end if

    end function get_partner

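    ! A worked example (a sketch): for grp_size = 4 the schedule has 3 rounds that
    ! pair the ranks as (0,1)(2,3), then (0,2)(1,3), then (0,3)(1,2), so every pair
    ! of ranks communicates exactly once.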

    ! ---------------------------------------------------------
    integer pure function get_partner_even(grp_size, ii, rr) result(pp)
      integer, intent(in) :: grp_size, ii, rr

      integer :: mm

      ! No PUSH SUB, called too often.

      mm = grp_size / 2

      if(ii == 0) then
        pp = rr + 1
      else if(ii == rr + 1) then
        pp = 0
      else
        ! I never know when to use which remainder function, but here
        ! it has to be the modulo one. Do not change that!
        pp = modulo(2 * rr - ii + 1, 2 * mm - 1) + 1
      end if

    end function get_partner_even

    ! ---------------------------------------------------------
    integer pure function get_partner_odd(grp_size, ii, rr) result(pp)
      integer, intent(in) :: grp_size, ii, rr

      integer :: mm

      ! No PUSH SUB, called too often.

      mm = (grp_size + 1) / 2

      pp = get_partner_even(grp_size + 1, ii, rr)

      if(pp == 2 * mm - 1) then
        pp = ii
      end if

    end function get_partner_odd

  end subroutine multicomm_create_all_pairs
#endif

  !---------------------------------------------------
  !> Function to divide the range of numbers from 1 to nobjs
  !! between nprocs processors.
  !! THREADSAFE
  subroutine multicomm_divide_range(nobjs, nprocs, istart, ifinal, lsize, scalapack_compat)
    integer,           intent(in)  :: nobjs    !< Number of points to divide
    integer,           intent(in)  :: nprocs   !< Number of processors
    integer,           intent(out) :: istart(:)
    integer,           intent(out) :: ifinal(:)
    integer, optional, intent(out) :: lsize(:) !< Number of objects in each partition
    logical, optional, intent(in)  :: scalapack_compat

    integer :: ii, jj, rank
    logical :: scalapack_compat_
    integer :: nbl, size

    ! no push_sub, threadsafe

    scalapack_compat_ = optional_default(scalapack_compat, .false.)
#ifndef HAVE_SCALAPACK
    scalapack_compat_ = .false.
#endif

    if (scalapack_compat_) then
      nbl = nobjs/nprocs
      if (mod(nobjs, nprocs) /= 0) INCR(nbl, 1)

      istart(1) = 1
      do rank = 1, nprocs
#ifdef HAVE_SCALAPACK
        size = numroc(nobjs, nbl, rank - 1, 0, nprocs)
#endif
        if(size > 0) then
          if(rank > 1) istart(rank) = ifinal(rank - 1) + 1
          ifinal(rank) = istart(rank) + size - 1
        else
          istart(rank) = 1
          ifinal(rank) = 0
        end if
      end do
    else

      if(nprocs <= nobjs) then

        ! each rank gets a contiguous block of objects; the first mod(nobjs, nprocs)
        ! ranks get one extra object
        do rank = 0, nprocs - 1
          jj = nobjs / nprocs
          ii = nobjs - jj*nprocs
          if(ii > 0 .and. rank < ii) then
            jj = jj + 1
            istart(rank + 1) = rank*jj + 1
            ifinal(rank + 1) = istart(rank + 1) + jj - 1
          else
            ifinal(rank + 1) = nobjs - (nprocs - rank - 1)*jj
            istart(rank + 1) = ifinal(rank + 1) - jj + 1
          end if
        end do

      else
        do ii = 1, nprocs
          if(ii <= nobjs) then
            istart(ii) = ii
            ifinal(ii) = ii
          else
            istart(ii) = 1
            ifinal(ii) = 0
          end if
        end do
      end if
    end if

    if(present(lsize)) then
      lsize(1:nprocs) = ifinal(1:nprocs) - istart(1:nprocs) + 1
      ASSERT(sum(lsize(1:nprocs)) == nobjs)
    end if

  end subroutine multicomm_divide_range

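  ! Usage example (a sketch): multicomm_divide_range(10, 3, istart, ifinal, lsize)
  ! returns istart = (1, 5, 8), ifinal = (4, 7, 10) and lsize = (4, 3, 3); the first
  ! rank gets one extra object because 10 is not divisible by 3.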

  ! ---------------------------------------------------------
  !> Function to divide the range of numbers from 1 to nobjs
  !! between all available threads with OpenMP.
  ! THREADSAFE
  subroutine multicomm_divide_range_omp(nobjs, ini, nobjs_loc)
    integer, intent(in)  :: nobjs     !< Number of points to divide
    integer, intent(out) :: ini       !< Start point of this thread's partition
    integer, intent(out) :: nobjs_loc !< Number of objects in this thread's partition

    integer :: rank
#ifdef HAVE_OPENMP
    integer, allocatable :: istart(:), ifinal(:), lsize(:)
    integer :: nthreads
#endif

    ! no push_sub, threadsafe
    rank = 1
#ifdef HAVE_OPENMP
    nthreads = omp_get_num_threads()
    SAFE_ALLOCATE(istart(1:nthreads))
    SAFE_ALLOCATE(ifinal(1:nthreads))
    SAFE_ALLOCATE(lsize(1:nthreads))
    call multicomm_divide_range(nobjs, nthreads, istart, ifinal, lsize)
    rank = 1 + omp_get_thread_num()
    ini = istart(rank)
    nobjs_loc = lsize(rank)
    SAFE_DEALLOCATE_A(istart)
    SAFE_DEALLOCATE_A(ifinal)
    SAFE_DEALLOCATE_A(lsize)
#else
    ini = 1
    nobjs_loc = nobjs
#endif

  end subroutine multicomm_divide_range_omp

  ! ---------------------------------------------------------

  logical pure function multicomm_is_slave(this) result(slave)
    type(multicomm_t), intent(in) :: this

    slave = this%node_type == P_SLAVE
  end function multicomm_is_slave

  ! ---------------------------------------------------------

  logical pure function multicomm_have_slaves(this) result(have_slaves)
    type(multicomm_t), intent(in) :: this

    have_slaves = this%have_slaves
  end function multicomm_have_slaves

end module multicomm_oct_m


!! Local Variables:
!! mode: f90
!! coding: utf-8
!! End: