!! Copyright (C) 2002-2015 M. Marques, A. Castro, A. Rubio, G. Bertsch, X. Andrade
!!
!! This program is free software; you can redistribute it and/or modify
!! it under the terms of the GNU General Public License as published by
!! the Free Software Foundation; either version 2, or (at your option)
!! any later version.
!!
!! This program is distributed in the hope that it will be useful,
!! but WITHOUT ANY WARRANTY; without even the implied warranty of
!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!! GNU General Public License for more details.
!!
!! You should have received a copy of the GNU General Public License
!! along with this program; if not, write to the Free Software
!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
!! 02110-1301, USA.
!!

#include "global.h"

!> Given a number n_index of indices and their ranges index_range(1:n_index),
!! we divide the n_nodes nodes into groups and create communicators for each group.
!! Each group is associated with one index. The min_range argument gives the
!! minimum number of elements per processor: for example, with min_range(1) = 25000
!! the algorithm will try to put at least 25000 mesh points on each processor.
!!
!! Example:
!! \verbatim
!! Given 3 indices with ranges
!!   index_range(1) = 100000, (number of points in the mesh)
!!   index_range(2) = 15,     (number of states)
!!   index_range(3) = 2,      (number of k-points)
!! \endverbatim
!! and 12 processors, we could get
!! \verbatim
!!   mc%group_sizes = (2, 3, 2)
!! \endverbatim
!! which means that
!! - space is divided into 2 domains per state,
!! - the states are divided into 3 groups, i.e. 5 states per processor, and
!! - the whole setting is duplicated because of the 2 k-points.
!!
!! To perform collective operations (like a reduce), you can use the communicators
!! provided in mc%group_comm(:). For example, to sum over states, the communicator
!! to use is mc%group_comm(P_STRATEGY_STATES).
!!
!! You can use the routine multicomm_strategy_is_parallel to check whether a certain
!! index is parallelized.
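!!
!! A minimal usage sketch (assuming plain MPI, the mpi_err variable provided by
!! the mpi module, and a real(8) buffer zz local to this process); the call
!! sums zz over the states group:
!! \verbatim
!!   call MPI_Allreduce(MPI_IN_PLACE, zz, size(zz), MPI_DOUBLE_PRECISION, &
!!     MPI_SUM, mc%group_comm(P_STRATEGY_STATES), mpi_err)
!! \endverbatim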

module multicomm_oct_m
  use blacs_oct_m
  use global_oct_m
  use messages_oct_m
  use mpi_oct_m
  use namespace_oct_m
#if defined(HAVE_OPENMP)
  use omp_lib
#endif
  use parser_oct_m
  use profiling_oct_m
  use utils_oct_m

  implicit none

  private

  public ::                          &
    multicomm_divide_range,          &
    multicomm_divide_range_omp,      &
#if defined(HAVE_MPI)
    multicomm_create_all_pairs,      &
#endif
    multicomm_t,                     &
    multicomm_all_pairs_t,           &
    multicomm_init,                  &
    multicomm_end,                   &
    multicomm_all_pairs_copy,        &
    multicomm_strategy_is_parallel,  &
    multicomm_is_slave,              &
    multicomm_have_slaves

  !> possible parallelization strategies
  integer, public, parameter ::      &
    P_STRATEGY_SERIAL  = 0,          & !< single domain, all states, k-points on a single processor
    P_STRATEGY_DOMAINS = 1,          & !< parallelization in domains
    P_STRATEGY_STATES  = 2,          & !< parallelization in states
    P_STRATEGY_KPOINTS = 3,          & !< parallelization in k-points
    P_STRATEGY_OTHER   = 4,          & !< something else like e-h pairs
    P_STRATEGY_MAX     = 4

  integer, public, parameter ::      &
    P_MASTER           = 1,          &
    P_SLAVE            = 2

  integer, public, parameter ::      &
    PAR_AUTO            = -1,        &
    PAR_NO              =  0

  integer,           parameter :: n_par_types = 4
  character(len=11), parameter :: par_types(0:n_par_types) = &
    (/                              &
    "serial    ",                   &
    "ParDomains",                   &
    "ParStates ",                   &
    "ParKPoints",                   &
    "ParOther  "                    &
    /)

  integer, parameter :: MAX_INDEX = 5

  !> Stores all communicators and groups
  type multicomm_t
    private
    integer          :: n_node           !< Total number of nodes.

    integer, public  :: par_strategy     !< What kind of parallelization strategy should we use?

    integer, allocatable         :: group_sizes(:)   !< Number of processors in each group.
    integer, allocatable, public :: who_am_i(:)      !< Rank in the "line"-communicators.
    integer, allocatable, public :: group_comm(:)    !< "Line"-communicators I belong to.
    integer, public              :: dom_st_comm      !< States-domain plane communicator.
    integer, public              :: st_kpt_comm      !< Kpoints-states plane communicator.
    integer, public              :: dom_st_kpt_comm  !< Kpoints-states-domain cube communicator.

    integer          :: nthreads         !< Number of OMP threads
    integer          :: node_type        !< Is this node a P_MASTER or a P_SLAVE?
    logical          :: have_slaves      !< are slaves available?

    integer          :: full_comm        !< The base communicator.
    integer          :: full_comm_rank   !< The rank in the base communicator.
    integer, public  :: master_comm      !< The communicator without slaves.
    integer          :: master_comm_rank !< The rank in the communicator without slaves.
    integer, public  :: slave_intercomm  !< the intercomm to communicate with slaves

    logical          :: reorder_ranks    !< do we reorder ranks in a more compact way?
  end type multicomm_t

  !> An all-pairs communication schedule for a given group.
  type multicomm_all_pairs_t
    private
    type(mpi_grp_t)  :: grp                    !< The group this schedule is for.
    integer          :: rounds                 !< This many comm. rounds.
    integer, pointer, public :: schedule(:, :) !< This is the schedule.
  end type multicomm_all_pairs_t

contains

  ! ---------------------------------------------------------
  subroutine multicomm_all_pairs_copy(apout, apin)
    type(multicomm_all_pairs_t), intent(inout) :: apout
    type(multicomm_all_pairs_t), intent(in)    :: apin

    PUSH_SUB(multicomm_all_pairs_copy)

    call mpi_grp_copy(apout%grp, apin%grp)
    apout%rounds = apin%rounds
    if(associated(apin%schedule)) then
      SAFE_ALLOCATE(apout%schedule(1:size(apin%schedule, 1), 1:size(apin%schedule, 2)))
      apout%schedule = apin%schedule
    end if

    POP_SUB(multicomm_all_pairs_copy)
  end subroutine multicomm_all_pairs_copy

  ! ---------------------------------------------------------
  !> create index and domain communicators
  subroutine multicomm_init(mc, namespace, base_grp, parallel_mask, default_mask, n_node, index_range, min_range)
    type(multicomm_t), intent(out)   :: mc
    type(namespace_t), intent(in)    :: namespace
    type(mpi_grp_t),   intent(inout) :: base_grp
    integer,           intent(in)    :: parallel_mask
    integer,           intent(in)    :: default_mask
    integer,           intent(in)    :: n_node
    integer,           intent(inout) :: index_range(:)
    integer,           intent(in)    :: min_range(:)

    integer :: ii, num_slaves, slave_level, ipar
    integer :: parse(1:P_STRATEGY_MAX), default(1:P_STRATEGY_MAX)

    PUSH_SUB(multicomm_init)

    mc%n_node  = n_node

    call messages_print_stress(stdout, "Parallelization")

    !%Variable ReorderRanks
    !%Default no
    !%Type logical
    !%Section Execution::Parallelization
    !%Description
    !% This variable controls whether the ranks are reorganized to have a more
    !% compact distribution with respect to domain parallelization, which needs
    !% to communicate most often. Depending on the system, this can improve
    !% communication speeds.
    !%End
    call parse_variable(namespace, 'ReorderRanks', .false., mc%reorder_ranks)

    call messages_obsolete_variable(namespace, 'ParallelizationStrategy')
    call messages_obsolete_variable(namespace, 'ParallelizationGroupRanks')

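    ! parallel_mask and default_mask encode the allowed and the default
    ! strategies as bit masks, with bit ipar-1 corresponding to strategy ipar;
    ! for example, a default_mask of 5 = 101b would select domain and k-point
    ! parallelization by default.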
    do ipar = 1, P_STRATEGY_MAX
      default(ipar) = PAR_NO
      if(bitand(default_mask, ibset(0, ipar - 1)) /= 0) then
        default(ipar) = PAR_AUTO
      end if
    end do

    !%Variable ParDomains
    !%Type integer
    !%Default auto
    !%Section Execution::Parallelization
    !%Description
    !% This variable controls the number of processors used for the
    !% parallelization in domains.
    !% The special value <tt>auto</tt>, the default, lets Octopus
    !% decide how many processors will be assigned for this
    !% strategy. To disable parallelization in domains, you can use
    !% <tt>ParDomains = no</tt> (or set the number of processors to
    !% 1).
    !%
    !% The total number of processors required is the product of the
    !% processors assigned to each parallelization strategy.
    !%Option auto -1
    !% The number of processors is assigned automatically.
    !%Option no 0
    !% This parallelization strategy is not used.
    !%End
    call parse_variable(namespace, 'ParDomains', default(P_STRATEGY_DOMAINS), parse(P_STRATEGY_DOMAINS))

    !%Variable ParStates
    !%Type integer
    !%Section Execution::Parallelization
    !%Description
    !% This variable controls the number of processors used for the
    !% parallelization in states. The special value <tt>auto</tt> lets
    !% Octopus decide how many processors will be assigned for this
    !% strategy. To disable parallelization in states, you can use
    !% <tt>ParStates = no</tt> (or set the number of processors to 1).
    !%
    !% The default value depends on the <tt>CalculationMode</tt>. For
    !% <tt>CalculationMode = td</tt> the default is <tt>auto</tt>, while
    !% for other modes the default is <tt>no</tt>.
    !%
    !% The total number of processors required is the product of the
    !% processors assigned to each parallelization strategy.
    !%Option auto -1
    !% The number of processors is assigned automatically.
    !%Option no 0
    !% This parallelization strategy is not used.
    !%End
    call parse_variable(namespace, 'ParStates', default(P_STRATEGY_STATES), parse(P_STRATEGY_STATES))

    !%Variable ParKPoints
    !%Type integer
    !%Default auto
    !%Section Execution::Parallelization
    !%Description
    !% This variable controls the number of processors used for the
    !% parallelization in K-Points and/or spin.
    !% The special value <tt>auto</tt> lets Octopus decide how many processors will be
    !% assigned for this strategy. To disable parallelization in
    !% KPoints, you can use <tt>ParKPoints = no</tt> (or set the
    !% number of processors to 1).
    !%
    !% The total number of processors required is the product of the
    !% processors assigned to each parallelization strategy.
    !%Option auto -1
    !% The number of processors is assigned automatically.
    !%Option no 0
    !% This parallelization strategy is not used.
    !%End
    call parse_variable(namespace, 'ParKPoints', default(P_STRATEGY_KPOINTS), parse(P_STRATEGY_KPOINTS))

    !%Variable ParOther
    !%Type integer
    !%Default auto
    !%Section Execution::Parallelization
    !%Description
    !% This variable controls the number of processors used for the
    !% 'other' parallelization mode, which is <tt>CalculationMode</tt>
    !% dependent. For <tt>CalculationMode = casida</tt>, it means
    !% parallelization in electron-hole pairs.
    !%
    !% The special value <tt>auto</tt>,
    !% the default, lets Octopus decide how many processors will be
    !% assigned for this strategy. To disable parallelization in
    !% Other, you can use <tt>ParOther = no</tt> (or set the
    !% number of processors to 1).
    !%
    !% The total number of processors required is the product of the
    !% processors assigned to each parallelization strategy.
    !%Option auto -1
    !% The number of processors is assigned automatically.
    !%Option no 0
    !% This parallelization strategy is not used.
    !%End
    call parse_variable(namespace, 'ParOther', default(P_STRATEGY_OTHER), parse(P_STRATEGY_OTHER))

    do ipar = 1, P_STRATEGY_MAX
      if(parse(ipar) == PAR_NO) parse(ipar) = 1
    end do

    call strategy()

    mc%have_slaves = .false.

    if(mc%par_strategy /= P_STRATEGY_SERIAL) then
      SAFE_ALLOCATE(mc%group_sizes(1:P_STRATEGY_MAX))

      mc%group_sizes = 1

      do ipar = 1, P_STRATEGY_MAX
        if(multicomm_strategy_is_parallel(mc, ipar)) then
          mc%group_sizes(ipar) = parse(ipar)
        else if(parse(ipar) /= 1) then
          call messages_write('Ignoring specification for ' // par_types(ipar))
          call messages_new_line()
          call messages_write('This parallelization strategy is not available.')
          call messages_warning()
        end if
      end do

      call assign_nodes()


      !%Variable ParallelizationNumberSlaves
      !%Type integer
      !%Default 0
      !%Section Execution::Parallelization
      !%Description
      !% Slaves are nodes used for task parallelization. The number of
      !% such nodes is given by this variable multiplied by the number
      !% of domains used in domain parallelization.
      !%End
      call parse_variable(namespace, 'ParallelizationNumberSlaves', 0, num_slaves)

      ! the slaves must be defined at a certain parallelization level, for the moment this is state parallelization.
      slave_level = P_STRATEGY_STATES
      mc%have_slaves = (num_slaves > 0)

      if(mc%have_slaves) then
#ifndef HAVE_MPI2
        message(1) = 'Task parallelization requires MPI 2.'
        call messages_fatal(1)
#endif
        call messages_experimental('Task parallelization')
      end if

      ! clear parallel strategies that were available but will not be used
      do ii = 1, P_STRATEGY_MAX
        if(mc%group_sizes(ii) == 1) mc%par_strategy = ibclr(mc%par_strategy, ii - 1)
      end do

      ! check that the resulting distribution of processors is consistent
      call sanity_check()
    end if

    call group_comm_create()

    call messages_print_stress(stdout)

    POP_SUB(multicomm_init)

  contains

    ! ---------------------------------------------------------
    subroutine strategy()
      integer :: jj, ipar

      PUSH_SUB(multicomm_init.strategy)

      if(base_grp%size > 1) then

        mc%par_strategy = 0

        do ipar = 1, P_STRATEGY_MAX
          if(parse(ipar) == PAR_AUTO .or. parse(ipar) > 1) then
            mc%par_strategy = ibset(mc%par_strategy, ipar - 1)
          end if
        end do

        if(mc%par_strategy /= bitand(mc%par_strategy, parallel_mask)) then
          call messages_write('Parallelization strategies unavailable for this run mode are being discarded.')
          call messages_warning()
        end if

        mc%par_strategy = bitand(mc%par_strategy, parallel_mask)

        if(mc%par_strategy == P_STRATEGY_SERIAL) then
          message(1) = "More than one node is available, but this run mode cannot run with the requested parallelization."
          message(2) = "Please select a parallelization strategy compatible with"
          jj = 2
          do ii = 1, n_par_types
            if(bitand(parallel_mask, 2**(ii - 1)) /= 0) then
              jj = jj + 1
              write(message(jj), '(2a)') "  -> ", par_types(ii)
            end if
          end do
          jj = jj + 1
          write(message(jj), '(a,i6)') "mc%par_strategy is : ", mc%par_strategy
          call messages_fatal(jj, only_root_writes = .true.)
        end if
      else
        mc%par_strategy = P_STRATEGY_SERIAL
      end if

      mc%nthreads = 1
#if defined(HAVE_OPENMP)
      !$omp parallel
      !$omp master
      mc%nthreads = omp_get_num_threads()
      !$omp end master
      !$omp end parallel
#endif

      if(mc%par_strategy == P_STRATEGY_SERIAL .and. mc%nthreads == 1) then
        message(1) = "Info: Octopus will run in *serial*"
        call messages_info(1)
      else
        write(message(1),'(a)')     'Info: Octopus will run in *parallel*'
        write(message(2),'(a)')     ''
        write(message(3),'(a, i8)') '      Number of processes           :', base_grp%size
        write(message(4),'(a, i8)') '      Number of threads per process :', mc%nthreads
        write(message(5),'(a)')     ''
        call messages_info(5)
      end if

      POP_SUB(multicomm_init.strategy)
    end subroutine strategy

    ! ---------------------------------------------------------

    subroutine assign_nodes()
      integer :: ii, nn, kk, n_divisors, divisors(1:50)
      integer :: n_group_max(1:P_STRATEGY_MAX)

      PUSH_SUB(multicomm_init.assign_nodes)

      ! this is the maximum number of processors in each group
      n_group_max(1:P_STRATEGY_MAX) = max(index_range(1:P_STRATEGY_MAX), 1)
      do kk = 1, P_STRATEGY_MAX
        if(.not. multicomm_strategy_is_parallel(mc, kk)) n_group_max(kk) = 1
      end do

      if(debug%info) then
        call messages_write('Debug info: Allowable group ranks:', new_line = .true.)
        do kk = 1, P_STRATEGY_MAX
          call messages_write(par_types(kk), fmt = '2x,a12,":",1x')
          call messages_write(n_group_max(kk), new_line = .true.)
        end do
        call messages_info()
      end if

      nn = mc%n_node

      ! first loop, check the processors assigned by the user
      do ipar = P_STRATEGY_MAX, 1, -1

        if(mc%group_sizes(ipar) == PAR_AUTO) cycle

        if(mc%group_sizes(ipar) > n_group_max(ipar)) then
          call messages_write('The number of processors specified for '//par_types(ipar)//'(')
          call messages_write(mc%group_sizes(ipar))
          call messages_write(')', new_line = .true.)
          call messages_write('is larger than the degrees of freedom for that level (')
          call messages_write(n_group_max(ipar))
          call messages_write(').')
          call messages_warning()
        end if

        if(mod(nn, mc%group_sizes(ipar)) /= 0) then
          call messages_write('The number of processors specified for '//par_types(ipar)//'(')
          call messages_write(mc%group_sizes(ipar))
          call messages_write(')', new_line = .true.)
          call messages_write('is not a divisor of the number of processors (')
          call messages_write(mc%n_node)
          call messages_write(').')
          call messages_fatal()
        end if

        nn = nn/mc%group_sizes(ipar)

      end do

      ! second loop, now assign the rest automatically
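      ! For each automatic level we take the largest divisor of the remaining
      ! number of processors that does not exceed n_group_max for that level;
      ! for example, with 12 processors left and 5 degrees of freedom the
      ! divisors of 12 are 1, 2, 3, 4, 6, 12, so the level gets 4 processors.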
      do ipar = P_STRATEGY_MAX, 1, -1

        if(mc%group_sizes(ipar) /= PAR_AUTO) cycle

        n_divisors = ubound(divisors, dim = 1)
        call get_divisors(nn, n_divisors, divisors)

        mc%group_sizes(ipar) = nn
        do ii = 2, n_divisors
          if(divisors(ii) > n_group_max(ipar)) then
            mc%group_sizes(ipar) = divisors(ii - 1)
            exit
          end if
        end do

        nn = nn/mc%group_sizes(ipar)

      end do

      POP_SUB(multicomm_init.assign_nodes)
    end subroutine assign_nodes


    ! ---------------------------------------------------------
    !> check if a balanced distribution of nodes will be used
    subroutine sanity_check()
      FLOAT :: frac
      integer :: ii, kk, n_max
      integer :: real_group_sizes(1:MAX_INDEX)

      PUSH_SUB(multicomm_init.sanity_check)

      if(num_slaves > 0) then

        if(mc%group_sizes(slave_level) < num_slaves + 1) then
          message(1) = 'Too many nodes assigned to task parallelization.'
          call messages_fatal(1)
        end if

        write(message(1),'(a,i6)') 'Info: Number of slave nodes               :', &
          num_slaves*product(mc%group_sizes(1:slave_level - 1))
        call messages_info(1)

      end if

      ! print out some info
      ii = 0
      do kk = P_STRATEGY_MAX, 1, -1
        real_group_sizes(kk) = mc%group_sizes(kk)
        if(.not. multicomm_strategy_is_parallel(mc, kk)) cycle
        ii = ii + 1
        if(kk == slave_level) INCR(real_group_sizes(kk), -num_slaves)
        write(message(ii),'(3a,i6,a,i8,a)') 'Info: Number of nodes in ', &
          par_types(kk), ' group:', real_group_sizes(kk), ' (', index_range(kk), ')'
      end do
      call messages_info(ii)

      ! do we have the correct number of processors?
      if(product(mc%group_sizes(1:P_STRATEGY_MAX)) /= base_grp%size) then
        write(message(1),'(a)') 'Inconsistent number of processors:'
        write(message(2),'(a,i6)') '  MPI processes      = ', base_grp%size
        write(message(3),'(a,i6)') '  Required processes = ', product(mc%group_sizes(1:P_STRATEGY_MAX))
        message(4) = ''
        message(5) = 'You probably have a problem with ParDomains, ParStates, ParKPoints or ParOther.'
        call messages_fatal(5, only_root_writes = .true.)
      end if

      if(any(real_group_sizes(1:P_STRATEGY_MAX) > index_range(1:P_STRATEGY_MAX))) then
        message(1) = "Could not distribute nodes in parallel job. Most likely you are trying to"
        message(2) = "use too many nodes for the job."
        call messages_fatal(2, only_root_writes = .true.)
      end if

      if(any(index_range(1:P_STRATEGY_MAX) / real_group_sizes(1:P_STRATEGY_MAX) < min_range(1:P_STRATEGY_MAX) .and. &
             real_group_sizes(1:P_STRATEGY_MAX) >  1)) then
        message(1) = "I have fewer elements in a parallel group than recommended."
        message(2) = "Maybe you should reduce the number of nodes."
        call messages_warning(2)
      end if

      ! calculate fraction of idle time
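      ! frac is the product over the levels of
      ! index_range/(ceiling(index_range/size)*size); for example, 15 states
      ! distributed over 4 processors give a factor of 15/16, i.e. at least
      ! 6.25% of the state-parallel work is idle time.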
      frac = M_ONE
      do ii = 1, P_STRATEGY_MAX
        n_max = ceiling(TOFLOAT(index_range(ii)) / TOFLOAT(real_group_sizes(ii)))
        kk = n_max*real_group_sizes(ii)
        frac = frac*(M_ONE - TOFLOAT(kk - index_range(ii)) / TOFLOAT(kk))
      end do

      write(message(1), '(a,f5.2,a)') "Info: Octopus will waste at least ", &
        (M_ONE - frac)*CNST(100.0), "% of computer time."
      if(frac < CNST(0.8)) then
        message(2) = "Usually a number of processors which is a multiple of small primes is best."
        call messages_warning(2)
      else
        call messages_info(1)
      end if

      POP_SUB(multicomm_init.sanity_check)
    end subroutine sanity_check

    ! ---------------------------------------------------------
    subroutine group_comm_create()
#if defined(HAVE_MPI)
      logical :: dim_mask(MAX_INDEX)
      integer :: i_strategy, irank
      logical :: reorder, periodic_mask(MAX_INDEX)
      integer :: coords(MAX_INDEX)
      integer :: new_comm, new_comm_size
      character(len=6) :: node_type
      type(mpi_grp_t) :: reorder_grp
      integer :: base_group, reorder_group, ranks(base_grp%size)
      integer :: ii, jj, kk, ll, nn, reorder_comm
#endif

      PUSH_SUB(multicomm_init.group_comm_create)

      mc%node_type = P_MASTER

      SAFE_ALLOCATE(mc%group_comm(1:P_STRATEGY_MAX))
      SAFE_ALLOCATE(mc%who_am_i(1:P_STRATEGY_MAX))

#if defined(HAVE_MPI)
      mc%full_comm = MPI_COMM_NULL
      mc%slave_intercomm = MPI_COMM_NULL
      if(mc%par_strategy /= P_STRATEGY_SERIAL) then
        if(mc%reorder_ranks) then
          ! first, reorder the ranks
          ! this is done to get a column-major ordering of the ranks in the
          ! Cartesian communicator, since they are ordered row-major otherwise
          call MPI_Comm_group(base_grp%comm, base_group, mpi_err)
          if(mpi_err /= MPI_SUCCESS) then
            message(1) = "Error in getting MPI group!"
            call messages_fatal(1)
          end if
          ! now transpose the hypercube => get rank numbers in column-major order
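          ! (for example, for group_sizes = (2, 3, 1, 1) the include list
          ! computed below is ranks = (0, 2, 4, 1, 3, 5))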
          nn = 1
          do ii = 1, mc%group_sizes(1)
            do jj = 1, mc%group_sizes(2)
              do kk = 1, mc%group_sizes(3)
                do ll = 1, mc%group_sizes(4)
                  ranks(nn) = (ll-1)*mc%group_sizes(3)*mc%group_sizes(2)*mc%group_sizes(1) &
                            + (kk-1)*mc%group_sizes(2)*mc%group_sizes(1) &
                            + (jj-1)*mc%group_sizes(1) + ii - 1
                  nn = nn + 1
                end do
              end do
            end do
          end do
          call MPI_Group_incl(base_group, base_grp%size, ranks, reorder_group, mpi_err)
          if(mpi_err /= MPI_SUCCESS) then
            message(1) = "Error in creating MPI group!"
            call messages_fatal(1)
          end if
          ! now get the reordered communicator
          call MPI_Comm_create(base_grp%comm, reorder_group, reorder_comm, mpi_err)
          if(mpi_err /= MPI_SUCCESS) then
            message(1) = "Error in creating reordered communicator!"
            call messages_fatal(1)
          end if
          call mpi_grp_init(reorder_grp, reorder_comm)
          call mpi_grp_copy(base_grp, reorder_grp)
        else
          call mpi_grp_copy(reorder_grp, base_grp)
        end if

        ! Multilevel parallelization is organized in a hypercube. We
        ! use an MPI Cartesian topology to generate the communicators
        ! that correspond to each level.

        ! create the topology
        periodic_mask = .false.
        reorder = .true.

        ! The domain and states dimensions have to be periodic (2D torus)
        ! in order to circulate matrix blocks.
        periodic_mask(P_STRATEGY_DOMAINS) = multicomm_strategy_is_parallel(mc, P_STRATEGY_DOMAINS)
        periodic_mask(P_STRATEGY_STATES)  = multicomm_strategy_is_parallel(mc, P_STRATEGY_STATES)

        ! We allow reordering of ranks.
        call MPI_Cart_create(reorder_grp%comm, P_STRATEGY_MAX, mc%group_sizes, periodic_mask, reorder, mc%full_comm, mpi_err)

        call MPI_Comm_rank(mc%full_comm, mc%full_comm_rank, mpi_err)

        ! get the coordinates of the current processor
        call MPI_Cart_coords(mc%full_comm, mc%full_comm_rank, P_STRATEGY_MAX, coords, mpi_err)

        ! find out what type of node this is
        if(coords(slave_level) >= mc%group_sizes(slave_level) - num_slaves) then
          mc%node_type = P_SLAVE
        end if

        if(mc%node_type == P_MASTER) then
          INCR(mc%group_sizes(slave_level), -num_slaves)
        else
          mc%group_sizes(slave_level) = num_slaves
        end if

        call MPI_Comm_split(mc%full_comm, mc%node_type, mc%full_comm_rank, new_comm, mpi_err)
        ASSERT(new_comm /= MPI_COMM_NULL)
        call MPI_Comm_size(new_comm, new_comm_size, mpi_err)

        reorder = .false.
        if(product(mc%group_sizes(:)) /= new_comm_size) then
          write(stderr,*) 'node ', mpi_world%rank, ': mc%group_sizes = ', mc%group_sizes, ' new_comm_size = ', new_comm_size
          call MPI_Barrier(mpi_world%comm, mpi_err)
          ASSERT(product(mc%group_sizes(:)) == new_comm_size)
        end if
        call MPI_Cart_create(new_comm, P_STRATEGY_MAX, mc%group_sizes, periodic_mask, reorder, mc%master_comm, mpi_err)
        ASSERT(mc%master_comm /= MPI_COMM_NULL)

        call MPI_Comm_free(new_comm, mpi_err)

        call MPI_Comm_rank(mc%master_comm, mc%master_comm_rank, mpi_err)

        ! The "lines" of the Cartesian grid.
        ! Initialize all the communicators, even if they are not parallelized
        do i_strategy = 1, P_STRATEGY_MAX
          dim_mask             = .false.
          dim_mask(i_strategy) = .true.
          call MPI_Cart_sub(mc%master_comm, dim_mask, mc%group_comm(i_strategy), mpi_err)
          call MPI_Comm_rank(mc%group_comm(i_strategy), mc%who_am_i(i_strategy), mpi_err)
        end do

        ! The domain-state "planes" of the grid (the ones with periodic dimensions).
        dim_mask                     = .false.
        dim_mask(P_STRATEGY_DOMAINS) = .true.
        dim_mask(P_STRATEGY_STATES)  = .true.
        call MPI_Cart_sub(mc%master_comm, dim_mask, mc%dom_st_comm, mpi_err)

        ! The state-kpoints "planes" of the grid
        dim_mask                     = .false.
        dim_mask(P_STRATEGY_STATES)  = .true.
        dim_mask(P_STRATEGY_KPOINTS) = .true.
        call MPI_Cart_sub(mc%master_comm, dim_mask, mc%st_kpt_comm, mpi_err)

        ! The domains-states-kpoints "cubes" of the grid
        dim_mask                     = .false.
        dim_mask(P_STRATEGY_DOMAINS) = .true.
        dim_mask(P_STRATEGY_STATES)  = .true.
        dim_mask(P_STRATEGY_KPOINTS) = .true.
        call MPI_Cart_sub(mc%master_comm, dim_mask, mc%dom_st_kpt_comm, mpi_err)

        if(num_slaves > 0) call create_slave_intercommunicators()
      else
        ! we initialize these communicators so we can use them even in serial
        mc%group_comm = base_grp%comm
        mc%who_am_i   = 0
        mc%master_comm = base_grp%comm
        mc%dom_st_comm = base_grp%comm
        mc%st_kpt_comm = base_grp%comm
        mc%dom_st_kpt_comm = base_grp%comm
      end if

      ! This is temporary debugging information.
      if(debug%info .and. mc%par_strategy /= P_STRATEGY_SERIAL) then
        write(message(1),'(a)') 'Debug: MPI Task Assignment to MPI Groups'
        write(message(2),'(5a10)') 'World', 'Domains', 'States', 'K-Points', 'Other'
        call messages_info(2)

        if(mc%node_type == P_SLAVE) then
          node_type = "slave"
        else
          node_type = "master"
        end if
        do irank = 0, mpi_world%size - 1
          if(mpi_world%rank == irank) then
            write(message(1),'(5i10,5x,a)') mpi_world%rank, mc%who_am_i(P_STRATEGY_DOMAINS), mc%who_am_i(P_STRATEGY_STATES), &
              mc%who_am_i(P_STRATEGY_KPOINTS), mc%who_am_i(P_STRATEGY_OTHER), trim(node_type)
            call messages_info(1, all_nodes = .true.)
          end if
          call MPI_Barrier(mpi_world%comm, mpi_err)
        end do
      end if

#else
      mc%group_comm = -1
      mc%who_am_i   = 0
      mc%full_comm = -1
      mc%master_comm = -1
      mc%dom_st_comm = -1
      mc%st_kpt_comm = -1
      mc%dom_st_kpt_comm = -1
      mc%slave_intercomm = -1
#endif

      POP_SUB(multicomm_init.group_comm_create)
    end subroutine group_comm_create

    ! -----------------------------------------------------

    subroutine create_slave_intercommunicators()
#ifdef HAVE_MPI2
      integer :: remote_leader
      integer :: tag
      integer :: coords(MAX_INDEX)

      PUSH_SUB(multicomm_init.create_slave_intercommunicators)

      ! create the intercommunicators to communicate with slaves

      ! get the coordinates of the current processor
      call MPI_Cart_coords(mc%full_comm, mc%full_comm_rank, P_STRATEGY_MAX, coords, mpi_err)

      ! now get the rank of the remote_leader
      if(mc%node_type == P_SLAVE) then
        coords(slave_level) = 0
      else
        coords(slave_level) = mc%group_sizes(slave_level)
      end if
      call MPI_Cart_rank(mc%full_comm, coords, remote_leader, mpi_err)

      ! now create the intercommunicator
      tag = coords(P_STRATEGY_DOMAINS)
      call MPI_Intercomm_create(mc%group_comm(slave_level), 0, base_grp%comm, remote_leader, tag, mc%slave_intercomm, mpi_err)

      POP_SUB(multicomm_init.create_slave_intercommunicators)
#endif
    end subroutine create_slave_intercommunicators

  end subroutine multicomm_init


  ! ---------------------------------------------------------
  subroutine multicomm_end(mc)
    type(multicomm_t), intent(inout) :: mc

#if defined(HAVE_MPI)
    integer :: ii
#endif

    PUSH_SUB(multicomm_end)

    if(mc%par_strategy /= P_STRATEGY_SERIAL) then
#if defined(HAVE_MPI)
      ! Delete communicators.
      do ii = 1, P_STRATEGY_MAX
        ! initialized even if not parallelized
        call MPI_Comm_free(mc%group_comm(ii), mpi_err)
      end do
      call MPI_Comm_free(mc%dom_st_comm, mpi_err)
      call MPI_Comm_free(mc%st_kpt_comm, mpi_err)
      call MPI_Comm_free(mc%dom_st_kpt_comm, mpi_err)
      call MPI_Comm_free(mc%full_comm, mpi_err)
      call MPI_Comm_free(mc%master_comm, mpi_err)

#ifdef HAVE_MPI2
      if(multicomm_have_slaves(mc)) call MPI_Comm_free(mc%slave_intercomm, mpi_err)
#endif

#endif
    end if

    SAFE_DEALLOCATE_A(mc%group_sizes)
    SAFE_DEALLOCATE_A(mc%group_comm)
    SAFE_DEALLOCATE_A(mc%who_am_i)

    POP_SUB(multicomm_end)
  end subroutine multicomm_end


  ! ---------------------------------------------------------
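  !> Returns whether the given parallelization level (one of the P_STRATEGY_*
  !! constants) is active. mc%par_strategy stores the strategy as a bit mask,
  !! with bit level-1 set for each parallel level; for example, a value of
  !! 5 = 101b means parallelization in domains and k-points.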
  logical pure function multicomm_strategy_is_parallel(mc, level) result(rr)
    type(multicomm_t), intent(in) :: mc
    integer,           intent(in) :: level

    rr = bitand(mc%par_strategy, 2**(level - 1)) /= 0

  end function multicomm_strategy_is_parallel


  ! ---------------------------------------------------------
  !> This routine uses the one-factorization (or near-one-factorization)
  !! of a complete graph to construct an all-pair communication
  !! schedule (cf. Wang, X., Blum, E. K., Parker, D. S., and Massey,
  !! D. 1997. The dance-party problem and its application to collective
  !! communication in computer networks. Parallel Comput. 23, 8
  !! (Aug. 1997), 1141-1156).
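  !!
  !! For example, for a group of 4 ranks the schedule has 3 rounds that pair
  !! the ranks as (0-1)(2-3), (0-2)(1-3), and (0-3)(1-2); for an odd group
  !! size there are grp_size rounds and each rank is paired with itself
  !! (i.e. it idles) in exactly one of them.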
#if defined(HAVE_MPI)

  subroutine multicomm_create_all_pairs(mpi_grp, ap)
    type(mpi_grp_t),             intent(in)  :: mpi_grp
    type(multicomm_all_pairs_t), intent(out) :: ap

    integer :: grp_size, rounds, ir, in

    PUSH_SUB(create_all_pairs)

    ap%grp = mpi_grp
    grp_size = mpi_grp%size

    ! Number of rounds.
    if(mod(grp_size, 2)  ==  0) then
      rounds = grp_size - 1
    else
      rounds = grp_size
    end if
    ap%rounds = rounds

    ! Calculate schedule.
    SAFE_ALLOCATE(ap%schedule(0:grp_size - 1, 1:rounds))
    do ir = 1, rounds
      do in = 0, grp_size - 1
        ap%schedule(in, ir) = get_partner(in + 1, ir) - 1
      end do
    end do

    POP_SUB(create_all_pairs)

  contains

    ! ---------------------------------------------------------
    !> These are from the paper cited above.
    integer pure function get_partner(in, ir)
      integer, intent(in) :: in, ir

      ! No PUSH SUB, called too often.

      if(mod(grp_size, 2) == 0) then
        get_partner = get_partner_even(grp_size, in - 1, ir - 1) + 1
      else
        get_partner = get_partner_odd(grp_size, in - 1, ir - 1) + 1
      end if

    end function get_partner

    ! ---------------------------------------------------------
    integer pure function get_partner_even(grp_size, ii, rr) result(pp)
      integer, intent(in) :: grp_size, ii, rr

      integer :: mm

      ! No PUSH SUB, called too often.

      mm = grp_size / 2

      if(ii  ==  0) then
        pp = rr + 1
      elseif(ii  ==  rr + 1) then
        pp = 0
      else
        ! I never know when to use which remainder function, but here
        ! it has to be the modulo one. Do not change that!
        pp = modulo(2 * rr - ii + 1, 2 * mm - 1) + 1
      end if

    end function get_partner_even

    ! ---------------------------------------------------------
    integer pure function get_partner_odd(grp_size, ii, rr) result(pp)
      integer, intent(in) :: grp_size, ii, rr

      integer :: mm

      ! No PUSH SUB, called too often.

      mm = (grp_size + 1) / 2

      pp = get_partner_even(grp_size + 1, ii, rr)

      if(pp  ==  2 * mm - 1) then
        pp = ii
      end if

    end function get_partner_odd

  end subroutine multicomm_create_all_pairs
#endif

  ! ---------------------------------------------------------
  !> Divides the range of numbers from 1 to nobjs
  !! among nprocs processors.
  !! THREADSAFE
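  !!
  !! For example, in the default (non-ScaLAPACK) mode, dividing nobjs = 10
  !! among nprocs = 3 yields the contiguous ranges 1-4, 5-7 and 8-10,
  !! i.e. lsize = (4, 3, 3).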
  subroutine multicomm_divide_range(nobjs, nprocs, istart, ifinal, lsize, scalapack_compat)
    integer,           intent(in)  :: nobjs !< Number of points to divide
    integer,           intent(in)  :: nprocs !< Number of processors
    integer,           intent(out) :: istart(:)
    integer,           intent(out) :: ifinal(:)
    integer, optional, intent(out) :: lsize(:) !< Number of objects in each partition
    logical, optional, intent(in)  :: scalapack_compat

    integer :: ii, jj, rank
    logical :: scalapack_compat_
    integer :: nbl, size

    ! no push_sub, threadsafe

    scalapack_compat_ = optional_default(scalapack_compat, .false.)
#ifndef HAVE_SCALAPACK
    scalapack_compat_ = .false.
#endif

    if (scalapack_compat_) then
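      ! Use a ScaLAPACK-compatible block-cyclic distribution; with the block
      ! size nbl = ceiling(nobjs/nprocs) every process owns at most one
      ! contiguous block, e.g. nobjs = 10 and nprocs = 3 give nbl = 4 and the
      ! ranges 1-4, 5-8 and 9-10 (numroc returning 4, 4 and 2).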
      nbl = nobjs/nprocs
      if (mod(nobjs, nprocs) /= 0) INCR(nbl, 1)

      istart(1) = 1
      do rank = 1, nprocs
#ifdef HAVE_SCALAPACK
        size = numroc(nobjs, nbl, rank - 1, 0, nprocs)
#endif
        if(size > 0) then
          if(rank > 1) istart(rank) = ifinal(rank - 1) + 1
          ifinal(rank) = istart(rank) + size - 1
        else
          istart(rank) = 1
          ifinal(rank) = 0
        end if
      end do
    else

      if(nprocs <= nobjs) then

        ! distribute the objects in contiguous blocks; the first mod(nobjs, nprocs)
        ! processors get one object more than the others
        do rank = 0, nprocs - 1
          jj = nobjs / nprocs
          ii = nobjs - jj*nprocs
          if(ii > 0 .and. rank < ii) then
            jj = jj + 1
            istart(rank + 1) = rank*jj + 1
            ifinal(rank + 1) = istart(rank + 1) + jj - 1
          else
            ifinal(rank + 1) = nobjs - (nprocs - rank - 1)*jj
            istart(rank + 1) = ifinal(rank + 1) - jj + 1
          end if
        end do

      else
        do ii = 1, nprocs
          if(ii <= nobjs) then
            istart(ii) = ii
            ifinal(ii) = ii
          else
            istart(ii) = 1
            ifinal(ii) = 0
          end if
        end do
      end if
    end if

    if(present(lsize)) then
      lsize(1:nprocs) = ifinal(1:nprocs) - istart(1:nprocs) + 1
      ASSERT(sum(lsize(1:nprocs)) == nobjs)
    end if

  end subroutine multicomm_divide_range

  ! ---------------------------------------------------------
  !> Divides the range of numbers from 1 to nobjs
  !! among all available OpenMP threads.
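  !!
  !! A usage sketch (assuming integer variables ini and nloc, and that the
  !! call is made from inside an OpenMP parallel region):
  !! \verbatim
  !!   !$omp parallel private(ini, nloc)
  !!   call multicomm_divide_range_omp(nobjs, ini, nloc)
  !!   ! this thread works on elements ini, ..., ini + nloc - 1
  !!   !$omp end parallel
  !! \endverbatim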
  ! THREADSAFE
  subroutine multicomm_divide_range_omp(nobjs, ini, nobjs_loc)
    integer, intent(in)    :: nobjs       !< Number of points to divide
    integer, intent(out)   :: ini         !< Start point of the partition
    integer, intent(out)   :: nobjs_loc   !< Number of objects in each partition

    integer :: rank
#ifdef HAVE_OPENMP
    integer, allocatable :: istart(:), ifinal(:), lsize(:)
    integer :: nthreads
#endif

    ! no push_sub, threadsafe
    rank = 1
#ifdef HAVE_OPENMP
    nthreads = omp_get_num_threads()
    SAFE_ALLOCATE(istart(1:nthreads))
    SAFE_ALLOCATE(ifinal(1:nthreads))
    SAFE_ALLOCATE(lsize(1:nthreads))
    call multicomm_divide_range(nobjs, nthreads, istart, ifinal, lsize)
    rank   = 1 + omp_get_thread_num()
    ini    = istart(rank)
    nobjs_loc = lsize(rank)
    SAFE_DEALLOCATE_A(istart)
    SAFE_DEALLOCATE_A(ifinal)
    SAFE_DEALLOCATE_A(lsize)
#else
    ini = 1
    nobjs_loc = nobjs
#endif

  end subroutine multicomm_divide_range_omp

  ! ---------------------------------------------------------

  logical pure function multicomm_is_slave(this) result(slave)
    type(multicomm_t), intent(in) :: this

    slave = this%node_type == P_SLAVE
  end function multicomm_is_slave

  ! ---------------------------------------------------------

  logical pure function multicomm_have_slaves(this) result(have_slaves)
    type(multicomm_t), intent(in) :: this

    have_slaves = this%have_slaves
  end function multicomm_have_slaves

end module multicomm_oct_m


!! Local Variables:
!! mode: f90
!! coding: utf-8
!! End: