1 /*============================================================================
2  * Functions dealing with parallelism
3  *============================================================================*/
4 
5 /*
6   This file is part of Code_Saturne, a general-purpose CFD tool.
7 
8   Copyright (C) 1998-2021 EDF S.A.
9 
10   This program is free software; you can redistribute it and/or modify it under
11   the terms of the GNU General Public License as published by the Free Software
12   Foundation; either version 2 of the License, or (at your option) any later
13   version.
14 
15   This program is distributed in the hope that it will be useful, but WITHOUT
16   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
18   details.
19 
20   You should have received a copy of the GNU General Public License along with
21   this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
22   Street, Fifth Floor, Boston, MA 02110-1301, USA.
23 */
24 
25 /*----------------------------------------------------------------------------*/
26 
27 #include "cs_defs.h"
28 
29 /*----------------------------------------------------------------------------
30  * Standard C library headers
31  *----------------------------------------------------------------------------*/
32 
33 #include <assert.h>
34 #include <stdarg.h>
35 #include <string.h>
36 
37 /*----------------------------------------------------------------------------
38  *  Local headers
39  *----------------------------------------------------------------------------*/
40 
41 #include "bft_error.h"
42 #include "bft_mem.h"
43 #include "cs_order.h"
44 
45 /*----------------------------------------------------------------------------
46  *  Header for the current file
47  *----------------------------------------------------------------------------*/
48 
49 #include "cs_parall.h"
50 
51 /*----------------------------------------------------------------------------*/
52 
53 BEGIN_C_DECLS
54 
55 /*=============================================================================
56  * Additional doxygen documentation
57  *============================================================================*/
58 
59 /*! \fn inline static void cs_parall_counter(cs_gnum_t cpt[], const int n)
60  *
61  * \brief Sum values of a counter on all default communicator processes.
62  *
63  * \param[in, out]  cpt local counter in, global counter out (size: n)
64  * \param[in]       n   number of values
65  */
66 
67 /*! \fn inline static void cs_parall_counter_max(cs_lnum_t cpt[], const int n)
68  *
69  * \brief Maximum values of a counter on all default communicator processes.
70  *
71  * \param[in, out]  cpt local counter in, global counter out (size: n)
72  * \param[in]       n   number of values
73  */
74 
75 /*! \fn inline static void cs_parall_sum(int n, \
76                                          cs_datatype_t datatype, \
77                                          void *val)
78 
79  * \brief Sum values of a given datatype on all default communicator processes.
80  *
81  * \param[in]       n         number of values
82  * \param[in]       datatype  matching Code_Saturne datatype
83  * \param[in, out]  val       local sum in, global sum out (size: n)
84  */
85 
86 /*! \fn inline static void cs_parall_max(int n, \
87                                          cs_datatype_t datatype, \
88                                          void *val)
89  *
90  * \brief Maximum values of a given datatype on all
91  *        default communicator processes.
92  *
93  * \param[in]       n         number of values
94  * \param[in]       datatype  matching Code_Saturne datatype
95  * \param[in, out]  val       local maximum in, global maximum out (size: n)
96  */
97 
98 /*! \fn inline static void cs_parall_min(int n, \
99                                          cs_datatype_t datatype, \
100                                          void *val)
101  *
102  *
103  * \brief Minimum values of a given datatype on all
104  *        default communicator processes.
105  *
106  * \param[in]       n         number of values
107  * \param[in]       datatype  matching Code_Saturne datatype
108  * \param[in, out]  val       local minimum in, global minimum out (size: n)
109  */
110 
111 /*! \fn inline static void cs_parall_bcast(int             root_rank, \
112                                            int             n,         \
113                                            cs_datatype_t   datatype,  \
114                                            void           *val)
115  *
116  * \brief Broadcast values of a given datatype to all
117  *        default communicator processes.
118  *
119  * \param[in]       root_rank  rank from which to broadcast
120  * \param[in]       n          number of values
121  * \param[in]       datatype   matching Code_Saturne datatype
122  * \param[in, out]  val        values to broadcast; input on root_rank,
123  *                             output on others (size: n)
124  */
125 
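/*
  Illustrative usage sketch for the reduction and broadcast wrappers
  documented above (not part of the library; variable names such as
  n_clip, vmin, and flag are hypothetical):

    cs_gnum_t  n_clip = ...;           // local number of clipped values
    cs_parall_counter(&n_clip, 1);     // global sum over all ranks

    double     vmin[3] = {...};        // local minima of 3 quantities
    cs_parall_min(3, CS_DOUBLE, vmin); // global minima over all ranks

    int        flag = ...;             // value defined on rank 0 only
    cs_parall_bcast(0, 1, CS_INT_TYPE, &flag);
*/
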
126 /*!
127   \file cs_parall.c
128         Utility functions dealing with parallelism.
129 */
130 
131 /*! \cond DOXYGEN_SHOULD_SKIP_THIS */
132 
133 /*============================================================================
134  * Local structure and type definitions
135  *============================================================================*/
136 
137 #define CS_PARALL_ARRAY_SIZE  500
138 
139 typedef struct
140 {
141   double  val;
142   int     rank;
143 } _mpi_double_int_t;
144 
145 /*============================================================================
146  * Static global variables
147  *============================================================================*/
148 
149 #if defined(HAVE_MPI)
150 
151 /* Minimum recommended scatter/gather buffer size */
152 
153 static size_t _cs_parall_min_coll_buf_size = 1024*1024;
154 
155 #endif
156 
157 /*============================================================================
158  * Prototypes for functions intended for use only by Fortran wrappers.
159  * (descriptions follow, with function bodies).
160  *============================================================================*/
161 
162 void
163 cs_f_parall_max_i(int  *max);
164 
165 void
166 cs_f_parall_max_r(double  *max);
167 
168 void
169 cs_f_parall_min_i(int  *min);
170 
171 void
172 cs_f_parall_min_r(double  *min);
173 
174 void
175 cs_f_parall_sum_i(int  *sum);
176 
177 void
178 cs_f_parall_sum_r(double  *sum);
179 
180 void
181 cs_f_parall_max_n_i(int   n,
182                     int  *max);
183 
184 void
185 cs_f_parall_max_n_r(int      n,
186                     double  *max);
187 
188 void
189 cs_f_parall_min_n_i(int   n,
190                     int  *min);
191 
192 void
193 cs_f_parall_min_n_r(int      n,
194                     double  *min);
195 
196 void
197 cs_f_parall_sum_n_i(int   n,
198                     int  *sum);
199 
200 void
201 cs_f_parall_sum_n_r(int      n,
202                     double  *sum);
203 
204 void
205 cs_f_parall_bcast_i(int   root_rank,
206                     int  *val);
207 
208 void
209 cs_f_parall_bcast_r(int      root_rank,
210                     double  *val);
211 
212 void
213 cs_f_parall_bcast_n_i(int   root_rank,
214                       int   n,
215                       int  *val);
216 
217 void
218 cs_f_parall_bcast_n_r(int      root_rank,
219                       int      n,
220                       double  *val);
221 
222 void
223 cs_f_parall_barrier(void);
224 
225 /*=============================================================================
226  * Private function definitions
227  *============================================================================*/
228 
229 /*----------------------------------------------------------------------------*/
230 /*!
231  * \brief Define the array distribution over all ranks on a given root rank.
232  *        The size of each local array may be different.
233  *
234  * \param[in]   root_rank  rank which stores the global array
235  * \param[in]   n_elts     size of the local array
236  * \param[in]   n_g_elts   size of the global array
237  * \param[out]  count      number of elements in each rank
238  * \param[out]  shift      shift to access each local array related to a rank
239  */
240 /*----------------------------------------------------------------------------*/
241 
242 #if defined(HAVE_MPI)
243 
244 static void
245 _get_array_distribution(int     root_rank,
246                         int     n_elts,
247                         int     n_g_elts,
248                         int    *p_count[],
249                         int    *p_shift[])
250 {
251   int  *count = NULL;
252   int  *shift = NULL;
253 
254   const int  n_ranks = cs_glob_n_ranks;
255 
256   assert(sizeof(double) == sizeof(cs_real_t));
257 
258   BFT_MALLOC(count, n_ranks, int);
259   BFT_MALLOC(shift, n_ranks, int);
260 
261   MPI_Gather(&n_elts, 1, MPI_INT,
262              count, 1, MPI_INT, root_rank, cs_glob_mpi_comm);
263 
264   shift[0] = 0;
265   for (cs_lnum_t i = 1; i < n_ranks; i++)
266     shift[i] = shift[i-1] + count[i-1];
267 
268   if (cs_glob_rank_id == root_rank)
269     if (n_g_elts != (shift[n_ranks - 1] + count[n_ranks - 1]))
270       bft_error(__FILE__, __LINE__, 0,
271                 _("Incorrect arguments to %s:\n"
272                   "  sum of arg. 1 (n_elts) on ranks "
273                   "is not equal to arg. 2 (n_g_elts)."), __func__);
274 
275   /* Return pointers */
276   *p_count = count;
277   *p_shift = shift;
278 }
279 
280 #endif  /* HAVE_MPI */
281 
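/*
  Illustration (hypothetical values): with 3 ranks contributing
  n_elts = 4, 2 and 3 elements respectively, the arrays built by
  _get_array_distribution on the root rank are

    count[] = {4, 2, 3}
    shift[] = {0, 4, 6}

  so that rank r's portion of the global array spans indices
  shift[r] .. shift[r] + count[r] - 1.
*/
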
282 /*----------------------------------------------------------------------------
283  * Call MPI_Allreduce for a given Code_Saturne datatype and MPI
284  * operation on all default communicator processes.
285  *
286  * parameters:
287  *   n         <-- number of values
288  *   datatype  <-- matching Code_Saturne datatype
289  *   operation <-- MPI operation
290  *   val       <-> local value  input, global value output (array)
291  *----------------------------------------------------------------------------*/
292 
293 #if !defined(HAVE_MPI_IN_PLACE) && defined(HAVE_MPI)
294 
295 static void
296 _cs_parall_allreduce(int             n,
297                      cs_datatype_t   datatype,
298                      MPI_Op          operation,
299                      void           *val)
300 {
301   int             data_size = n*cs_datatype_size[datatype];
302   unsigned char  *locval;
303   unsigned char  _locval[256];
304 
305   if (data_size > 256)
306     BFT_MALLOC(locval, data_size, unsigned char);
307   else
308     locval = _locval;
309 
310   memcpy(locval, val, data_size);
311 
312   MPI_Allreduce(locval, val, n, cs_datatype_to_mpi[datatype], operation,
313                 cs_glob_mpi_comm);
314 
315   if (locval != _locval)
316     BFT_FREE(locval);
317 }
318 
319 #endif
320 
321 /*============================================================================
322  * Fortran wrapper function definitions
323  *============================================================================*/
324 
325 /*----------------------------------------------------------------------------
326  * Compute the global maximum of an integer in case of parallelism
327  *
328  * parameters:
329  *   max <->  input = local max; output = global max
330  *----------------------------------------------------------------------------*/
331 
332 void
333 cs_f_parall_max_i(int  *max)
334 {
335 #if defined(HAVE_MPI)
336 
337   int  global_max;
338 
339   assert (sizeof (double) == sizeof (cs_real_t));
340 
341   MPI_Allreduce (max, &global_max, 1, MPI_INT, MPI_MAX,
342                  cs_glob_mpi_comm);
343 
344   *max = global_max;
345 
346 #endif
347 }
348 
349 /*----------------------------------------------------------------------------
350  * Compute the global minimum of a real in case of parallelism
351  *
352  * parameters:
353  *   min <->  input = local min; output = global min
354  *----------------------------------------------------------------------------*/
355 
356 void
357 cs_f_parall_min_r(double  *min)
358 {
359 #if defined(HAVE_MPI)
360 
361   double  global_min;
362 
363   assert (sizeof (double) == sizeof (cs_real_t));
364 
365   MPI_Allreduce (min, &global_min, 1, MPI_DOUBLE, MPI_MIN,
366                  cs_glob_mpi_comm);
367 
368   *min = global_min;
369 
370 #endif
371 }
372 
373 /*----------------------------------------------------------------------------
374  * Compute the global minimum of an integer in case of parallelism
375  *
376  * parameters:
377  *   min <->  input = local min; output = global min
378  *----------------------------------------------------------------------------*/
379 
380 void
381 cs_f_parall_min_i(int  *min)
382 {
383 #if defined(HAVE_MPI)
384 
385   int  global_min;
386 
387   assert (sizeof (double) == sizeof (cs_real_t));
388 
389   MPI_Allreduce (min, &global_min, 1, MPI_INT, MPI_MIN,
390                  cs_glob_mpi_comm);
391 
392   *min = global_min;
393 
394 #endif
395 }
396 
397 /*----------------------------------------------------------------------------
398  * Compute the global maximum of a real in case of parallelism
399  *
400  * parameters:
401  *   max <->  input = local max; output = global max
402  *----------------------------------------------------------------------------*/
403 
404 void
405 cs_f_parall_max_r(double  *max)
406 {
407 #if defined(HAVE_MPI)
408 
409   double  global_max;
410 
411   assert (sizeof (double) == sizeof (cs_real_t));
412 
413   MPI_Allreduce (max, &global_max, 1, MPI_DOUBLE, MPI_MAX,
414                  cs_glob_mpi_comm);
415 
416   *max = global_max;
417 
418 #endif
419 }
420 
421 /*----------------------------------------------------------------------------
422  * Compute the global sum of an integer in case of parallelism
423  *
424  * parameters:
425  *   sum <->  input = local sum; output = global sum
426  *----------------------------------------------------------------------------*/
427 
428 void
429 cs_f_parall_sum_i(int  *sum)
430 {
431 #if defined(HAVE_MPI)
432 
433   int  global_sum;
434 
435   assert (sizeof (double) == sizeof (cs_real_t));
436 
437   MPI_Allreduce (sum, &global_sum, 1, MPI_INT, MPI_SUM,
438                  cs_glob_mpi_comm);
439 
440   *sum = global_sum;
441 
442 #endif
443 }
444 
445 /*----------------------------------------------------------------------------
446  * Compute the global sum of a real in case of parallelism
447  *
448  * parameters:
449  *   sum <->  input = local sum; output = global sum
450  *----------------------------------------------------------------------------*/
451 
452 void
453 cs_f_parall_sum_r(double  *sum)
454 {
455 #if defined(HAVE_MPI)
456 
457   double  global_sum;
458 
459   assert (sizeof (double) == sizeof (cs_real_t));
460 
461   MPI_Allreduce (sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM,
462                  cs_glob_mpi_comm);
463 
464   *sum = global_sum;
465 
466 #endif
467 }
468 
469 /*----------------------------------------------------------------------------
470  * Compute the global maxima of an array of integers in case of parallelism
471  *
472  * parameters:
473  *   n   <-- array size
474  *   max <-> input = local max; output = global max
475  *----------------------------------------------------------------------------*/
476 
477 void
478 cs_f_parall_max_n_i(int   n,
479                     int  *max)
480 {
481   cs_parall_max(n, CS_INT_TYPE, max);
482 }
483 
484 /*----------------------------------------------------------------------------
485  * Compute the global maxima of an array of reals in case of parallelism
486  *
487  * parameters:
488  *   n   <-- array size
489  *   max <-> input = local max; output = global max
490  *----------------------------------------------------------------------------*/
491 
492 void
493 cs_f_parall_max_n_r(int      n,
494                     double  *max)
495 {
496   cs_parall_max(n, CS_DOUBLE, max);
497 }
498 
499 /*----------------------------------------------------------------------------
500  * Compute the global minima of an array of integers in case of parallelism
501  *
502  * parameters:
503  *   n   <-- array size
504  *   min <-> input = local min; output = global min
505  *----------------------------------------------------------------------------*/
506 
507 void
508 cs_f_parall_min_n_i(int   n,
509                     int  *min)
510 {
511   cs_parall_min(n, CS_INT_TYPE, min);
512 }
513 
514 /*----------------------------------------------------------------------------
515  * Compute the global minima of an array of reals in case of parallelism
516  *
517  * parameters:
518  *   n   <-- array size
519  *   min <-> input = local min; output = global min
520  *----------------------------------------------------------------------------*/
521 
522 void
523 cs_f_parall_min_n_r(int      n,
524                     double  *min)
525 {
526   cs_parall_min(n, CS_DOUBLE, min);
527 }
528 
529 /*----------------------------------------------------------------------------
530  * Compute the global sums of an array of integers in case of parallelism
531  *
532  * parameters:
533  *   n   <-- array size
534  *   sum <-> input = local sum; output = global sum
535  *----------------------------------------------------------------------------*/
536 
537 void
538 cs_f_parall_sum_n_i(int   n,
539                     int  *sum)
540 {
541   cs_parall_sum(n, CS_INT_TYPE, sum);
542 }
543 
544 /*----------------------------------------------------------------------------
545  * Compute the global sums of an array of reals in case of parallelism
546  *
547  * parameters:
548  *   n   <-- array size
549  *   sum <-> input = local sum; output = global sum
550  *----------------------------------------------------------------------------*/
551 
552 void
553 cs_f_parall_sum_n_r(int      n,
554                     double  *sum)
555 {
556   cs_parall_sum(n, CS_DOUBLE, sum);
557 }
558 
559 /*----------------------------------------------------------------------------
560  * Broadcast an integer to all ranks
561  *
562  * parameters:
563  *   root_rank <-- id of root rank
564  *   val       <-> input = local value; output = global value
565  *----------------------------------------------------------------------------*/
566 
567 void
568 cs_f_parall_bcast_i(int   root_rank,
569                     int  *val)
570 {
571   cs_parall_bcast(root_rank, 1, CS_INT_TYPE, val);
572 }
573 
574 /*----------------------------------------------------------------------------
575  * Broadcast a double to all ranks
576  *
577  * parameters:
578  *   root_rank <-- id of root rank
579  *   val       <-> input = local value; output = global value
580  *----------------------------------------------------------------------------*/
581 
582 void
583 cs_f_parall_bcast_r(int      root_rank,
584                     double  *val)
585 {
586   cs_parall_bcast(root_rank, 1, CS_DOUBLE, val);
587 }
588 
589 /*----------------------------------------------------------------------------
590  * Broadcast an array of integers to all ranks
591  *
592  * parameters:
593  *   root_rank <-- id of root rank
594  *   n         <-- array size
595  *   val       <-> input = local value; output = global value
596  *----------------------------------------------------------------------------*/
597 
598 void
599 cs_f_parall_bcast_n_i(int   root_rank,
600                       int   n,
601                       int  *val)
602 {
603   cs_parall_bcast(root_rank, n, CS_INT_TYPE, val);
604 }
605 
606 /*----------------------------------------------------------------------------
607  * Broadcast an array of doubles to all ranks
608  *
609  * parameters:
610  *   root_rank <-- id of root rank
611  *   n         <-- array size
612  *   val       <-> input = local value; output = global value
613  *----------------------------------------------------------------------------*/
614 
615 void
616 cs_f_parall_bcast_n_r(int      root_rank,
617                       int      n,
618                       double  *val)
619 {
620   cs_parall_bcast(root_rank, n, CS_DOUBLE, val);
621 }
622 
623 /*----------------------------------------------------------------------------
624  * Call a barrier in case of parallelism
625  *
626  * This function should not be necessary in production code,
627  * but it may be useful for debugging purposes.
628  *----------------------------------------------------------------------------*/
629 
630 void
631 cs_f_parall_barrier(void)
632 {
633 #if defined(HAVE_MPI)
634   if (cs_glob_n_ranks > 1)
635     MPI_Barrier(cs_glob_mpi_comm);
636 #endif
637 }
638 
639 /*! (DOXYGEN_SHOULD_SKIP_THIS) \endcond */
640 
641 /*=============================================================================
642  * Public function definitions
643  *============================================================================*/
644 
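/*----------------------------------------------------------------------------
 * Sum values of a counter on all default communicator processes.
 *
 * parameters:
 *   cpt <-> local counter value  input, global counter value output (array)
 *   n   <-- number of counter array values
 *----------------------------------------------------------------------------*/
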
645 #if !defined(HAVE_MPI_IN_PLACE) && defined(HAVE_MPI)
646 
647 void
648 cs_parall_counter(cs_gnum_t   cpt[],
649                   const int   n)
650 {
651   if (cs_glob_n_ranks > 1)
652     _cs_parall_allreduce(n, CS_GNUM_TYPE, MPI_SUM, cpt);
653 }
654 
655 #endif
656 
657 /*----------------------------------------------------------------------------
658  * Maximum values of a counter on all default communicator processes.
659  *
660  * parameters:
661  *   cpt <-> local counter value  input, global counter value output (array)
662  *   n   <-- number of counter array values
663  *----------------------------------------------------------------------------*/
664 
665 #if !defined(HAVE_MPI_IN_PLACE) && defined(HAVE_MPI)
666 
667 void
668 cs_parall_counter_max(cs_lnum_t   cpt[],
669                       const int   n)
670 {
671   if (cs_glob_n_ranks > 1)
672     _cs_parall_allreduce(n, CS_LNUM_TYPE, MPI_MAX, cpt);
673 }
674 
675 #endif
676 
677 /*----------------------------------------------------------------------------
678  * Sum values of a given datatype on all default communicator processes.
679  *
680  * parameters:
681  *   n        <-- number of values
682  *   datatype <-- matching Code_Saturne datatype
683  *   val      <-> local value  input, global value output (array)
684  *----------------------------------------------------------------------------*/
685 
686 #if !defined(HAVE_MPI_IN_PLACE) && defined(HAVE_MPI)
687 
688 void
689 cs_parall_sum(int             n,
690               cs_datatype_t   datatype,
691               void           *val)
692 {
693   if (cs_glob_n_ranks > 1)
694     _cs_parall_allreduce(n, datatype, MPI_SUM, val);
695 }
696 
697 #endif
698 
699 /*----------------------------------------------------------------------------
700  * Maximum values of a given datatype on all default communicator processes.
701  *
702  * parameters:
703  *   n        <-- number of values
704  *   datatype <-- matching Code_Saturne datatype
705  *   val      <-> local value  input, global value output (array)
706  *----------------------------------------------------------------------------*/
707 
708 #if !defined(HAVE_MPI_IN_PLACE) && defined(HAVE_MPI)
709 
710 void
711 cs_parall_max(int             n,
712               cs_datatype_t   datatype,
713               void           *val)
714 {
715   if (cs_glob_n_ranks > 1)
716     _cs_parall_allreduce(n, datatype, MPI_MAX, val);
717 }
718 
719 #endif
720 
721 /*----------------------------------------------------------------------------
722  * Minimum values of a given datatype on all default communicator processes.
723  *
724  * parameters:
725  *   n        <-- number of values
726  *   datatype <-- matching Code_Saturne datatype
727  *   val      <-> local value  input, global value output (array)
728  *----------------------------------------------------------------------------*/
729 
730 #if !defined(HAVE_MPI_IN_PLACE) && defined(HAVE_MPI)
731 
732 void
733 cs_parall_min(int             n,
734               cs_datatype_t   datatype,
735               void           *val)
736 {
737   if (cs_glob_n_ranks > 1)
738     _cs_parall_allreduce(n, datatype, MPI_MIN, val);
739 }
740 
741 #endif
742 
743 /*----------------------------------------------------------------------------*/
744 /*!
745  * \brief Maximum value of a real and the values of a related array on all
746  * default communicator processes.
747  *
748  * \param[in]       n             size of the related array
749  * \param[in, out]  max           local max in, global max out
750  * \param[in, out]  max_loc_vals  array values at location of local max in,
751  *                                and at location of global max out
752  */
753 /*----------------------------------------------------------------------------*/
754 
755 void
756 cs_parall_max_loc_vals(int         n,
757                        cs_real_t  *max,
758                        cs_real_t   max_loc_vals[])
759 {
760 #if defined(HAVE_MPI)
761 
762   if (cs_glob_n_ranks > 1) {
763 
764     _mpi_double_int_t  val_in, val_max;
765 
766     val_in.val  = *max;
767     val_in.rank = cs_glob_rank_id;
768 
769     MPI_Allreduce(&val_in, &val_max, 1, MPI_DOUBLE_INT, MPI_MAXLOC,
770                   cs_glob_mpi_comm);
771 
772     *max = val_max.val;
773 
774     MPI_Bcast(max_loc_vals, n, CS_MPI_REAL, val_max.rank, cs_glob_mpi_comm);
775 
776   }
777 
778 #endif
779 }
780 
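/*
  Illustrative usage sketch (hypothetical variable names): find the global
  maximum of a quantity and retrieve the coordinates of the point where it
  occurs.

    cs_real_t  w_max = ...;          // local maximum of some quantity
    cs_real_t  xyz_max[3] = {...};   // coordinates of the local maximum

    cs_parall_max_loc_vals(3, &w_max, xyz_max);
    // w_max now holds the global maximum; xyz_max holds the coordinates
    // taken from the rank owning that maximum.
*/
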
781 /*----------------------------------------------------------------------------*/
782 /*!
783  * \brief Minimum value of a real and the values of a related array on all
784  * default communicator processes.
785  *
786  * \param[in]       n             size of the related array
787  * \param[in, out]  min           local min in, global min out
788  * \param[in, out]  min_loc_vals  array values at location of local min in,
789  *                                and at location of global min out
790  */
791 /*----------------------------------------------------------------------------*/
792 
793 void
794 cs_parall_min_loc_vals(int         n,
795                        cs_real_t  *min,
796                        cs_real_t   min_loc_vals[])
797 {
798 #if defined(HAVE_MPI)
799 
800   if (cs_glob_n_ranks > 1) {
801 
802     _mpi_double_int_t  val_in, val_min;
803 
804     val_in.val  = *min;
805     val_in.rank = cs_glob_rank_id;
806 
807     MPI_Allreduce(&val_in, &val_min, 1, MPI_DOUBLE_INT, MPI_MINLOC,
808                   cs_glob_mpi_comm);
809 
810     *min = val_min.val;
811 
812     MPI_Bcast(min_loc_vals, n, CS_MPI_REAL, val_min.rank, cs_glob_mpi_comm);
813 
814   }
815 
816 #endif
817 }
818 
819 /*----------------------------------------------------------------------------*/
820 /*!
821  * \brief Given an (id, rank, value) tuple, return the local id and rank
822  *        corresponding to the global minimum value.
823  *
824  * \param[in, out]   elt_id   element id for which the value is the smallest
825  *                            (local in, global out)
826  * \param[in, out]   rank_id  rank id for which the value is the smallest
827  *                            (local in, global out)
828  * \param[in]        val      associated local minimum value
829  */
830 /*----------------------------------------------------------------------------*/
831 
832 void
833 cs_parall_min_id_rank_r(cs_lnum_t  *elt_id,
834                         int        *rank_id,
835                         cs_real_t   val)
836 {
837 #if defined(HAVE_MPI)
838 
839   if (cs_glob_n_ranks > 1) {
840 
841     cs_lnum_t buf[2];
842 
843     _mpi_double_int_t  val_in, val_min;
844 
845     assert(sizeof(double) == sizeof(cs_real_t));
846 
847     val_in.val  = val;
848     val_in.rank = cs_glob_rank_id;
849 
850     MPI_Allreduce(&val_in, &val_min, 1, MPI_DOUBLE_INT, MPI_MINLOC,
851                   cs_glob_mpi_comm);
852 
853     *rank_id = cs_glob_rank_id;
854 
855     buf[0] = *elt_id;
856     buf[1] = *rank_id;
857 
858     MPI_Bcast(buf, 2, CS_MPI_LNUM, val_min.rank, cs_glob_mpi_comm);
859 
860     *elt_id = buf[0];
861     *rank_id = buf[1];
862 
863   }
864 
865 #endif
866 }
867 
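/*
  Illustrative usage sketch (hypothetical variable names): identify the
  element and rank holding the smallest value.

    cs_lnum_t  elt_id  = ...;             // id of the local minimum
    int        rank_id = cs_glob_rank_id;
    cs_real_t  val_min = ...;             // associated local minimum value

    cs_parall_min_id_rank_r(&elt_id, &rank_id, val_min);
    // elt_id and rank_id now identify the global minimum; the associated
    // value itself is not returned by this function.
*/
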
868 /*----------------------------------------------------------------------------*/
869 /*!
870  * \brief Build a global array from each local array in each domain.
871  *
872  * Local arrays are appended in order of owning MPI rank.
873  * The size of each local array may be different.
874  *
875  * Use of this function may be quite practical, but should be limited
876  * to user functions, as it may limit scalability (especially as regards
877  * memory usage).
878  *
879  * \param[in]   n_elts    size of the local array
880  * \param[in]   n_g_elts  size of the global array
881  * \param[in]   array     local array (size: n_elts)
882  * \param[out]  g_array   global array  (size: n_g_elts)
883  */
884 /*----------------------------------------------------------------------------*/
885 
886 void
887 cs_parall_allgather_r(int        n_elts,
888                       int        n_g_elts,
889                       cs_real_t  array[],
890                       cs_real_t  g_array[])
891 {
892 #if defined(HAVE_MPI)
893 
894   if (cs_glob_n_ranks > 1) {
895 
896     int  i;
897     int  *count = NULL;
898     int  *shift = NULL;
899 
900     const int  n_domains = cs_glob_n_ranks;
901 
902     assert(sizeof(double) == sizeof(cs_real_t));
903 
904     BFT_MALLOC(count, n_domains, int);
905     BFT_MALLOC(shift, n_domains, int);
906 
907     MPI_Allgather(&n_elts, 1, MPI_INT, count, 1, MPI_INT,
908                   cs_glob_mpi_comm);
909 
910     shift[0] = 0;
911     for (i = 1; i < n_domains; i++)
912       shift[i] = shift[i-1] + count[i-1];
913 
914     if (n_g_elts != (shift[n_domains - 1] + count[n_domains - 1]))
915       bft_error(__FILE__, __LINE__, 0,
916                 _("Incorrect arguments to %s:\n"
917                   "  sum of arg. 1 (n_elts) on ranks "
918                   "is not equal to arg. 2 (n_g_elts)."),
919                 __func__);
920 
921     MPI_Allgatherv(array, n_elts, CS_MPI_REAL,
922                    g_array, count, shift, CS_MPI_REAL, cs_glob_mpi_comm);
923 
924     BFT_FREE(count);
925     BFT_FREE(shift);
926 
927   }
928 
929 #endif
930 
931   if (cs_glob_n_ranks == 1) {
932 
933     assert(n_elts == n_g_elts);
934 
935     for (int i = 0; i < n_elts; i++)
936       g_array[i] = array[i];
937 
938   }
939 }
940 
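/*
  Illustrative usage sketch (hypothetical variable names): concatenate the
  values selected by each rank into a single global array available on
  every rank.

    int         n_sel = ...;      // number of selected elements on this rank
    cs_real_t  *vals  = ...;      // local values (size n_sel)

    int  n_g_sel = n_sel;
    cs_parall_sum(1, CS_INT_TYPE, &n_g_sel);   // global number of elements

    cs_real_t  *g_vals;
    BFT_MALLOC(g_vals, n_g_sel, cs_real_t);
    cs_parall_allgather_r(n_sel, n_g_sel, vals, g_vals);
*/
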
941 /*----------------------------------------------------------------------------*/
942 /*!
943  * \brief Build an ordered global array from each local array in each domain.
944  *
945  * Local array elements are ordered based on a given key value (usually
946  * some form of coordinate), so the result should be independent of
947  * partitioning, as long as the definition of the o_key array
948  * is itself independent of the partitioning.
949  *
950  * Use of this function may be quite practical, but should be limited
951  * to user functions, as it may limit scalability (especially as regards
952  * memory usage).
953  *
954  * \param[in]   n_elts    number of local elements
955  * \param[in]   n_g_elts  number of global elements
956  * \param[in]   stride    number of values per element
957  * \param[in]   o_key     ordering key (coordinate) value per element
958  * \param[in]   array     local array (size: n_elts*stride)
959  * \param[out]  g_array   global array  (size: n_g_elts*stride)
960  */
961 /*----------------------------------------------------------------------------*/
962 
963 void
964 cs_parall_allgather_ordered_r(int        n_elts,
965                               int        n_g_elts,
966                               int        stride,
967                               cs_real_t  o_key[],
968                               cs_real_t  array[],
969                               cs_real_t  g_array[])
970 {
971   cs_lnum_t  *order;
972   cs_real_t  *g_o_key;
973 
974   BFT_MALLOC(g_o_key, n_g_elts, cs_real_t);
975   BFT_MALLOC(order, n_g_elts, cs_lnum_t);
976 
977   cs_parall_allgather_r(n_elts, n_g_elts, o_key, g_o_key);
978   cs_parall_allgather_r(n_elts, n_g_elts*stride, array, g_array);
979 
980   cs_order_real_allocated(NULL, g_o_key, order, n_g_elts);
981   cs_order_reorder_data(n_g_elts, sizeof(cs_real_t)*stride, order, g_array);
982 
983   BFT_FREE(order);
984   BFT_FREE(g_o_key);
985 }
986 
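/*
  Illustrative usage sketch (hypothetical variable names): gather profile
  values from all ranks and order them by their x coordinate, so that the
  result does not depend on the partitioning.

    // x_coo[i] is the x coordinate associated with local value vals[i]
    cs_parall_allgather_ordered_r(n_sel, n_g_sel, 1, x_coo, vals, g_vals);
*/
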
987 /*----------------------------------------------------------------------------*/
988 /*!
989  * \brief Build a global array on the given root rank from all local arrays.
990  *
991  * Local arrays are appended in order of owning MPI rank.
992  * The size of each local array may be different.
993  *
994  * Use of this function may be quite practical, but should be limited
995  * to user functions, as it may limit scalability (especially as regards
996  * memory usage).
997  *
998  * \param[in]   root_rank  rank which stores the global array
999  * \param[in]   n_elts     size of the local array
1000  * \param[in]   n_g_elts   size of the global array
1001  * \param[in]   array      local array (size: n_elts)
1002  * \param[out]  g_array    global array  (size: n_g_elts) only usable by the
1003  *                         root rank
1004  */
1005 /*----------------------------------------------------------------------------*/
1006 
1007 void
1008 cs_parall_gather_r(int               root_rank,
1009                    int               n_elts,
1010                    int               n_g_elts,
1011                    const cs_real_t   array[],
1012                    cs_real_t         g_array[])
1013 {
1014 #if defined(HAVE_MPI)
1015 
1016   if (cs_glob_n_ranks > 1) {
1017 
1018     /* Sanity check */
1019 
1020     if (cs_glob_rank_id == root_rank && g_array == NULL)
1021       bft_error(__FILE__, __LINE__, 0,
1022                 _(" %s: Global array is not allocated on the root_rank %d\n"),
1023                 __func__, root_rank);
1024 
1025     int  *count = NULL, *shift = NULL;
1026 
1027     _get_array_distribution(root_rank, n_elts, n_g_elts, &count, &shift);
1028 
1029     MPI_Gatherv(array, n_elts, CS_MPI_REAL,
1030                 g_array, count, shift, CS_MPI_REAL,
1031                 root_rank, cs_glob_mpi_comm);
1032 
1033     BFT_FREE(count);
1034     BFT_FREE(shift);
1035 
1036   }
1037 
1038 #endif
1039 
1040   if (cs_glob_n_ranks == 1) {
1041 
1042     assert(n_elts == n_g_elts);
1043     assert(g_array != NULL && n_g_elts > 0);
1044     for (int i = 0; i < n_elts; i++)
1045       g_array[i] = array[i];
1046 
1047   }
1048 }
1049 
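/*
  Illustrative usage sketch (hypothetical variable names): gather local
  contributions on rank 0, for example before writing a profile to file.

    cs_real_t  *g_vals = NULL;
    if (cs_glob_rank_id <= 0)                  // allocate on root rank only
      BFT_MALLOC(g_vals, n_g_sel, cs_real_t);

    cs_parall_gather_r(0, n_sel, n_g_sel, vals, g_vals);
    // g_vals is only usable on rank 0.
*/
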
1050 /*----------------------------------------------------------------------------*/
1051 /*!
1052  * \brief Build an ordered global array on the given root rank from
1053  *        all local arrays.
1054  *
1055  * Local array elements are ordered based on a given key value (usually
1056  * some form of coordinate), so the result should be independent of
1057  * partitioning, as long as the definition of the o_key array
1058  * is itself independent of the partitioning.
1059  *
1060  * Use of this function may be quite practical, but should be limited
1061  * to user functions, as it may limit scalability (especially as regards
1062  * memory usage).
1063  *
1064  * \param[in]   root_rank  rank which stores the global array
1065  * \param[in]   n_elts     number of local elements
1066  * \param[in]   n_g_elts   number of global elements
1067  * \param[in]   stride     number of values per element
1068  * \param[in]   o_key      ordering key (coordinate) value per element
1069  * \param[in]   array      local array (size: n_elts*stride)
1070  * \param[out]  g_array    global array  (size: n_g_elts*stride)
1071  */
1072 /*----------------------------------------------------------------------------*/
1073 
1074 void
1075 cs_parall_gather_ordered_r(int        root_rank,
1076                            int        n_elts,
1077                            int        n_g_elts,
1078                            int        stride,
1079                            cs_real_t  o_key[],
1080                            cs_real_t  array[],
1081                            cs_real_t  g_array[])
1082 {
1083   cs_lnum_t  *order = NULL;
1084   cs_real_t  *g_o_key = NULL;
1085 
1086   if (cs_glob_rank_id == root_rank) {
1087     BFT_MALLOC(g_o_key, n_g_elts, cs_real_t);
1088     BFT_MALLOC(order, n_g_elts, cs_lnum_t);
1089   }
1090 
1091   cs_parall_gather_r(root_rank, n_elts, n_g_elts, o_key, g_o_key);
1092   cs_parall_gather_r(root_rank, n_elts, n_g_elts*stride, array, g_array);
1093 
1094   if (cs_glob_rank_id == root_rank) {
1095     cs_order_real_allocated(NULL, g_o_key, order, n_g_elts);
1096     cs_order_reorder_data(n_g_elts, sizeof(cs_real_t)*stride, order, g_array);
1097   }
1098 
1099   BFT_FREE(order);
1100   BFT_FREE(g_o_key);
1101 }
1102 
1103 /*----------------------------------------------------------------------------*/
1104 /*!
1105  * \brief Distribute a global array from a given root rank over all ranks.
1106  *        Each rank receives the part related to its local elements.
1107  *
1108  * The size of each local array may be different.
1109  *
1110  * Use of this function may be quite practical, but should be limited
1111  * to specific usage, as it may limit scalability (especially as regards
1112  * memory usage).
1113  *
1114  * \param[in]   root_rank  rank which stores the global array
1115  * \param[in]   n_elts     size of the local array
1116  * \param[in]   n_g_elts   size of the global array
1117  * \param[in]   g_array    global array  (size: n_g_elts) only usable by the
1118  *                         root rank
1119  * \param[out]  array      local array (size: n_elts)
1120  */
1121 /*----------------------------------------------------------------------------*/
1122 
1123 void
1124 cs_parall_scatter_r(int               root_rank,
1125                     int               n_elts,
1126                     int               n_g_elts,
1127                     const cs_real_t   g_array[],
1128                     cs_real_t         array[])
1129 {
1130   assert(array != NULL && n_elts > 0);
1131 
1132 #if defined(HAVE_MPI)
1133 
1134   if (cs_glob_n_ranks > 1) {
1135 
1136     /* Sanity check */
1137 
1138     if (cs_glob_rank_id == root_rank && g_array == NULL)
1139       bft_error(__FILE__, __LINE__, 0,
1140                 _(" %s: Global array is not allocated on the root_rank %d\n"),
1141                 __func__, root_rank);
1142 
1143     int  *count = NULL, *shift = NULL;
1144 
1145     _get_array_distribution(root_rank, n_elts, n_g_elts, &count, &shift);
1146 
1147     MPI_Scatterv(g_array, count, shift, CS_MPI_REAL,
1148                  array, n_elts, CS_MPI_REAL, root_rank, cs_glob_mpi_comm);
1149 
1150     BFT_FREE(count);
1151     BFT_FREE(shift);
1152 
1153   }
1154 
1155 #endif
1156 
1157   if (cs_glob_n_ranks == 1) {
1158 
1159     assert(n_elts == n_g_elts);
1160     assert(g_array != NULL && n_g_elts > 0);
1161     for (int i = 0; i < n_elts; i++)
1162       array[i] = g_array[i];
1163 
1164   }
1165 }
1166 
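/*
  Illustrative usage sketch (hypothetical variable names): read a global
  array on rank 0 (for example from a file), then distribute to each rank
  the portion matching its local elements.

    cs_real_t  *g_vals = NULL;
    if (cs_glob_rank_id <= 0)
      BFT_MALLOC(g_vals, n_g_sel, cs_real_t);  // filled on rank 0 only

    cs_parall_scatter_r(0, n_sel, n_g_sel, g_vals, vals);
    // vals now holds this rank's n_sel values.
*/
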
1167 /*----------------------------------------------------------------------------*/
1168 /*!
1169  * \brief Build a global array on the given root rank from all local arrays.
1170  *        Function dealing with single-precision arrays.
1171  *
1172  * Local arrays are appended in order of owning MPI rank.
1173  * The size of each local array may be different.
1174  *
1175  * Use of this function may be quite practical, but should be limited
1176  * to user functions, as it may limit scalability (especially as regards
1177  * memory usage).
1178  *
1179  * \param[in]   root_rank  rank which stores the global array
1180  * \param[in]   n_elts     size of the local array
1181  * \param[in]   n_g_elts   size of the global array
1182  * \param[in]   array      local array (size: n_elts)
1183  * \param[out]  g_array    global array  (size: n_g_elts) only usable by the
1184  *                         root rank
1185  */
1186 /*----------------------------------------------------------------------------*/
1187 
1188 void
1189 cs_parall_gather_f(int             root_rank,
1190                    int             n_elts,
1191                    int             n_g_elts,
1192                    const float     array[],
1193                    float           g_array[])
1194 {
1195   assert(array != NULL && n_elts > 0);
1196 
1197 #if defined(HAVE_MPI)
1198 
1199   if (cs_glob_n_ranks > 1) {
1200 
1201     /* Sanity check */
1202 
1203     if (cs_glob_rank_id == root_rank && g_array == NULL)
1204       bft_error(__FILE__, __LINE__, 0,
1205                 _(" %s: Global array is not allocated on the root_rank %d\n"),
1206                 __func__, root_rank);
1207 
1208     int  *count = NULL, *shift = NULL;
1209 
1210     _get_array_distribution(root_rank, n_elts, n_g_elts, &count, &shift);
1211 
1212     MPI_Gatherv(array, n_elts, MPI_FLOAT,
1213                 g_array, count, shift, MPI_FLOAT, root_rank, cs_glob_mpi_comm);
1214 
1215     BFT_FREE(count);
1216     BFT_FREE(shift);
1217 
1218   }
1219 
1220 #endif
1221 
1222   if (cs_glob_n_ranks == 1) {
1223 
1224     assert(n_elts == n_g_elts);
1225     assert(g_array != NULL && n_g_elts > 0);
1226     for (int i = 0; i < n_elts; i++)
1227       g_array[i] = array[i];
1228 
1229   }
1230 }
1231 
1232 /*----------------------------------------------------------------------------*/
1233 /*!
1234  * \brief Distribute a global array from a given root rank over all ranks.
1235  *        Each rank receives the part related to its local elements.
1236  *        Function dealing with single-precision arrays.
1237  *
1238  * The size of each local array may be different.
1239  *
1240  * Use of this function may be quite practical, but should be limited
1241  * to specific usage, as it may limit scalability (especially as regards
1242  * memory usage).
1243  *
1244  * \param[in]   root_rank  rank which stores the global array
1245  * \param[in]   n_elts     size of the local array
1246  * \param[in]   n_g_elts   size of the global array
1247  * \param[in]   g_array    global array  (size: n_g_elts) only usable by the
1248  *                         root rank
1249  * \param[out]  array      local array (size: n_elts)
1250  */
1251 /*----------------------------------------------------------------------------*/
1252 
1253 void
1254 cs_parall_scatter_f(int           root_rank,
1255                     int           n_elts,
1256                     int           n_g_elts,
1257                     const float   g_array[],
1258                     float         array[])
1259 {
1260   assert(array != NULL && n_elts > 0);
1261 
1262 #if defined(HAVE_MPI)
1263 
1264   if (cs_glob_n_ranks > 1) {
1265 
1266     /* Sanity check */
1267 
1268     if (cs_glob_rank_id == root_rank && g_array == NULL)
1269       bft_error(__FILE__, __LINE__, 0,
1270                 _(" %s: Global array is not allocated on the root_rank %d\n"),
1271                 __func__, root_rank);
1272 
1273     int  *count = NULL, *shift = NULL;
1274 
1275     _get_array_distribution(root_rank, n_elts, n_g_elts, &count, &shift);
1276 
1277     MPI_Scatterv(g_array, count, shift, MPI_FLOAT,
1278                  array, n_elts, MPI_FLOAT, root_rank, cs_glob_mpi_comm);
1279 
1280     BFT_FREE(count);
1281     BFT_FREE(shift);
1282 
1283   }
1284 
1285 #endif
1286 
1287   if (cs_glob_n_ranks == 1) {
1288 
1289     assert(n_elts == n_g_elts);
1290     assert(g_array != NULL && n_g_elts > 0);
1291     for (int i = 0; i < n_elts; i++)
1292       array[i] = g_array[i];
1293 
1294   }
1295 }
1296 
1297 /*----------------------------------------------------------------------------*/
1298 /*!
1299  * \brief Return minimum recommended scatter or gather buffer size.
1300  *
1301  * This is used by some internal part-to-block or scatter/gather algorithms,
1302  * so as to allow I/O buffer size tuning.
1303  *
1304  * \return  minimum recommended part-to-block or gather buffer size (in bytes)
1305  */
1306 /*----------------------------------------------------------------------------*/
1307 
1308 size_t
1309 cs_parall_get_min_coll_buf_size(void)
1310 {
1311 #if defined(HAVE_MPI)
1312   return _cs_parall_min_coll_buf_size;
1313 #else
1314   return 0;
1315 #endif
1316 }
1317 
1318 /*----------------------------------------------------------------------------*/
1319 /*!
1320  * \brief Define minimum recommended scatter or gather buffer size.
1321  *
1322  * This is used by some internal part-to-block or scatter/gather algorithms,
1323  * so as to allow I/O buffer size tuning.
1324  *
1325  * \param[in]  buffer_size  minimum recommended part-to-block
1326  *             or gather buffer size (in bytes)
1327  */
1328 /*----------------------------------------------------------------------------*/
1329 
1330 void
1331 cs_parall_set_min_coll_buf_size(size_t buffer_size)
1332 {
1333 #if defined(HAVE_MPI)
1334   _cs_parall_min_coll_buf_size = buffer_size;
1335 #endif
1336 }
1337 
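/*
  Illustrative usage sketch: raise the recommended collective buffer size
  (the 4 MB value shown is arbitrary).

    cs_parall_set_min_coll_buf_size(4*1024*1024);
    size_t buf_size = cs_parall_get_min_coll_buf_size();
*/
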
1338 /*----------------------------------------------------------------------------*/
1339 
1340 END_C_DECLS
1341