1 /*============================================================================
2 * Functions dealing with parallelism
3 *============================================================================*/
4
5 /*
6 This file is part of Code_Saturne, a general-purpose CFD tool.
7
8 Copyright (C) 1998-2021 EDF S.A.
9
10 This program is free software; you can redistribute it and/or modify it under
11 the terms of the GNU General Public License as published by the Free Software
12 Foundation; either version 2 of the License, or (at your option) any later
13 version.
14
15 This program is distributed in the hope that it will be useful, but WITHOUT
16 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
18 details.
19
20 You should have received a copy of the GNU General Public License along with
21 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
22 Street, Fifth Floor, Boston, MA 02110-1301, USA.
23 */
24
25 /*----------------------------------------------------------------------------*/
26
27 #include "cs_defs.h"
28
29 /*----------------------------------------------------------------------------
30 * Standard C library headers
31 *----------------------------------------------------------------------------*/
32
33 #include <assert.h>
34 #include <stdarg.h>
35 #include <string.h>
36
37 /*----------------------------------------------------------------------------
38 * Local headers
39 *----------------------------------------------------------------------------*/
40
41 #include "bft_error.h"
42 #include "bft_mem.h"
43 #include "cs_order.h"
44
45 /*----------------------------------------------------------------------------
46 * Header for the current file
47 *----------------------------------------------------------------------------*/
48
49 #include "cs_parall.h"
50
51 /*----------------------------------------------------------------------------*/
52
53 BEGIN_C_DECLS
54
55 /*=============================================================================
56 * Additional doxygen documentation
57 *============================================================================*/
58
59 /*! \fn inline static void cs_parall_counter(cs_gnum_t cpt[], const int n)
60 *
61 * \brief Sum values of a counter on all default communicator processes.
62 *
63 * \param[in, out] cpt local counter in, global counter out (size: n)
64 * \param[in] n number of values
65 */
66
67 /*! \fn inline static void cs_parall_counter_max(cs_lnum_t cpt[], const int n)
68 *
69 * \brief Maximum values of a counter on all default communicator processes.
70 *
71 * \param[in, out] cpt local counter in, global counter out (size: n)
72 * \param[in] n number of values
73 */
74
75 /*! \fn inline static void cs_parall_sum(int n, \
76 cs_datatype_t datatype, \
77 void *val)
78
79 * \brief Sum values of a given datatype on all default communicator processes.
80 *
81 * \param[in] n number of values
82 * \param[in] datatype matching Code_Saturne datatype
83 * \param[in, out] val local sum in, global sum out (size: n)
84 */
85
86 /*! \fn inline static void cs_parall_max(int n, \
87 cs_datatype_t datatype, \
88 void *val)
89 *
90 * \brief Maximum values of a given datatype on all
91 * default communicator processes.
92 *
93 * \param[in] n number of values
94 * \param[in] datatype matching Code_Saturne datatype
95 * \param[in, out] val local maximum in, global maximum out (size: n)
96 */
97
98 /*! \fn inline static void cs_parall_min(int n, \
99 cs_datatype_t datatype, \
100 void *val)
101 *
102 *
103 * \brief Minimum values of a given datatype on all
104 * default communicator processes.
105 *
106 * \param[in] n number of values
107 * \param[in] datatype matching Code_Saturne datatype
108 * \param[in, out] val local minimum in, global minimum out (size: n)
109 */
110
111 /*! \fn inline static void cs_parall_bcast(int root_rank, \
112 int n, \
113 cs_datatype_t datatype, \
114 void *val)
115 *
116 * \brief Broadcast values of a given datatype to all
117 * default communicator processes.
118 *
119 * \param[in] root_rank rank from which to broadcast
120 * \param[in] n number of values
121 * \param[in] datatype matching Code_Saturne datatype
122 * \param[in, out] val values to broadcast; input on root_rank,
123 * output on others (size: n)
124 */
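
/*
  Illustrative sketch (not part of this file's API): a typical use of the
  reduction and broadcast wrappers documented above. The array "v", its size
  "n_vals" and the helper "compute_flag" are hypothetical placeholders.

    cs_real_t vmin = v[0], vmax = v[0], vsum = 0.;

    for (cs_lnum_t i = 0; i < n_vals; i++) {
      vmin = CS_MIN(vmin, v[i]);
      vmax = CS_MAX(vmax, v[i]);
      vsum += v[i];
    }

    cs_parall_min(1, CS_REAL_TYPE, &vmin);
    cs_parall_max(1, CS_REAL_TYPE, &vmax);
    cs_parall_sum(1, CS_REAL_TYPE, &vsum);

    int flag = (cs_glob_rank_id <= 0) ? compute_flag() : 0;
    cs_parall_bcast(0, 1, CS_INT_TYPE, &flag);
*/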
125
126 /*!
127 \file cs_parall.c
128 Utility functions dealing with parallelism.
129 */
130
131 /*! \cond DOXYGEN_SHOULD_SKIP_THIS */
132
133 /*============================================================================
134 * Local structure and type definitions
135 *============================================================================*/
136
137 #define CS_PARALL_ARRAY_SIZE 500
138
139 typedef struct
140 {
141 double val;
142 int rank;
143 } _mpi_double_int_t;
144
145 /*============================================================================
146 * Static global variables
147 *============================================================================*/
148
149 #if defined(HAVE_MPI)
150
151 /* Minimum recommended scatter/gather buffer size */
152
153 static size_t _cs_parall_min_coll_buf_size = 1024*1024;
154
155 #endif
156
157 /*============================================================================
158 * Prototypes for functions intended for use only by Fortran wrappers.
159 * (descriptions follow, with function bodies).
160 *============================================================================*/
161
162 void
163 cs_f_parall_max_i(int *max);
164
165 void
166 cs_f_parall_max_r(double *max);
167
168 void
169 cs_f_parall_min_i(int *min);
170
171 void
172 cs_f_parall_min_r(double *min);
173
174 void
175 cs_f_parall_sum_i(int *sum);
176
177 void
178 cs_f_parall_sum_r(double *sum);
179
180 void
181 cs_f_parall_max_n_i(int n,
182 int *max);
183
184 void
185 cs_f_parall_max_n_r(int n,
186 double *max);
187
188 void
189 cs_f_parall_min_n_i(int n,
190 int *min);
191
192 void
193 cs_f_parall_min_n_r(int n,
194 double *min);
195
196 void
197 cs_f_parall_sum_n_i(int n,
198 int *sum);
199
200 void
201 cs_f_parall_sum_n_r(int n,
202 double *sum);
203
204 void
205 cs_f_parall_bcast_i(int root_rank,
206 int *val);
207
208 void
209 cs_f_parall_bcast_r(int root_rank,
210 double *val);
211
212 void
213 cs_f_parall_bcast_n_i(int root_rank,
214 int n,
215 int *val);
216
217 void
218 cs_f_parall_bcast_n_r(int root_rank,
219 int n,
220 double *val);
221
222 void
223 cs_f_parall_barrier(void);
224
225 /*=============================================================================
226 * Private function definitions
227 *============================================================================*/
228
229 /*----------------------------------------------------------------------------*/
230 /*!
231 * \brief Define the array distribution over all ranks on a given root rank.
232 * The size of each local array may be different.
233 *
234 * \param[in] root_rank rank which stores the global array
235 * \param[in] n_elts size of the local array
236 * \param[in] n_g_elts size of the global array
237 * \param[out] count number of elements in each rank
238 * \param[out] shift shift to access each local array related to a rank
239 */
240 /*----------------------------------------------------------------------------*/
241
242 #if defined(HAVE_MPI)
243
244 static void
245 _get_array_distribution(int root_rank,
246 int n_elts,
247 int n_g_elts,
248 int *p_count[],
249 int *p_shift[])
250 {
251 int *count = NULL;
252 int *shift = NULL;
253
254 const int n_ranks = cs_glob_n_ranks;
255
256 assert(sizeof(double) == sizeof(cs_real_t));
257
258 BFT_MALLOC(count, n_ranks, int);
259 BFT_MALLOC(shift, n_ranks, int);
260
261 MPI_Gather(&n_elts, 1, MPI_INT,
262 count, 1, MPI_INT, root_rank, cs_glob_mpi_comm);
263
264 shift[0] = 0;
265 for (cs_lnum_t i = 1; i < n_ranks; i++)
266 shift[i] = shift[i-1] + count[i-1];
267
268 if (cs_glob_rank_id == root_rank)
269 if (n_g_elts != (shift[n_ranks - 1] + count[n_ranks - 1]))
270 bft_error(__FILE__, __LINE__, 0,
271 _("Incorrect arguments to %s:\n"
272 " sum of arg. 1 (n_elts) on ranks "
273 "is not equal to arg. 2 (n_g_elts)."), __func__);
274
275 /* Return pointers */
276 *p_count = count;
277 *p_shift = shift;
278 }
279
280 #endif /* HAVE_MPI */
281
282 /*----------------------------------------------------------------------------
283 * Call MPI_Allreduce for a given Code_Saturne datatype and MPI
284 * operation on all default communicator processes.
285 *
286 * parameters:
287 * n <-- number of values
288 * datatype <-- matching Code_Saturne datatype
289 * operation <-- MPI operation
290 * val <-> local value input, global value output (array)
291 *----------------------------------------------------------------------------*/
292
293 #if !defined(HAVE_MPI_IN_PLACE) && defined(HAVE_MPI)
294
295 static void
296 _cs_parall_allreduce(int n,
297 cs_datatype_t datatype,
298 MPI_Op operation,
299 void *val)
300 {
301 int data_size = n*cs_datatype_size[datatype];
302 unsigned char *locval;
303 unsigned char _locval[256];
304
305 if (data_size > 256)
306 BFT_MALLOC(locval, data_size, unsigned char);
307 else
308 locval = _locval;
309
310 memcpy(locval, val, data_size);
311
312 MPI_Allreduce(locval, val, n, cs_datatype_to_mpi[datatype], operation,
313 cs_glob_mpi_comm);
314
315 if (locval != _locval)
316 BFT_FREE(locval);
317 }
318
319 #endif
320
321 /*============================================================================
322 * Fortran wrapper function definitions
323 *============================================================================*/
324
325 /*----------------------------------------------------------------------------
326 * Compute the global maximum of an integer in case of parallelism
327 *
328 * parameters:
329 * max <-> input = local max; output = global max
330 *----------------------------------------------------------------------------*/
331
332 void
333 cs_f_parall_max_i(int *max)
334 {
335 #if defined(HAVE_MPI)
336
337 int global_max;
338
339 assert (sizeof (double) == sizeof (cs_real_t));
340
341 MPI_Allreduce (max, &global_max, 1, MPI_INT, MPI_MAX,
342 cs_glob_mpi_comm);
343
344 *max = global_max;
345
346 #endif
347 }
348
349 /*----------------------------------------------------------------------------
350 * Compute the global minimum of a real in case of parallelism
351 *
352 * parameters:
353 * min <-> input = local min; output = global min
354 *----------------------------------------------------------------------------*/
355
356 void
357 cs_f_parall_min_r(double *min)
358 {
359 #if defined(HAVE_MPI)
360
361 double global_min;
362
363 assert (sizeof (double) == sizeof (cs_real_t));
364
365 MPI_Allreduce (min, &global_min, 1, MPI_DOUBLE, MPI_MIN,
366 cs_glob_mpi_comm);
367
368 *min = global_min;
369
370 #endif
371 }
372
373 /*----------------------------------------------------------------------------
374 * Compute the global minimum of an integer in case of parallelism
375 *
376 * parameters:
377 * min <-> input = local min; output = global min
378 *----------------------------------------------------------------------------*/
379
380 void
381 cs_f_parall_min_i(int *min)
382 {
383 #if defined(HAVE_MPI)
384
385 int global_min;
386
387 assert (sizeof (double) == sizeof (cs_real_t));
388
389 MPI_Allreduce (min, &global_min, 1, MPI_INT, MPI_MIN,
390 cs_glob_mpi_comm);
391
392 *min = global_min;
393
394 #endif
395 }
396
397 /*----------------------------------------------------------------------------
398 * Compute the global maximum of a real in case of parallelism
399 *
400 * parameters:
401 * max <-> input = local max; output = global max
402 *----------------------------------------------------------------------------*/
403
404 void
405 cs_f_parall_max_r(double *max)
406 {
407 #if defined(HAVE_MPI)
408
409 double global_max;
410
411 assert (sizeof (double) == sizeof (cs_real_t));
412
413 MPI_Allreduce (max, &global_max, 1, MPI_DOUBLE, MPI_MAX,
414 cs_glob_mpi_comm);
415
416 *max = global_max;
417
418 #endif
419 }
420
421 /*----------------------------------------------------------------------------
422 * Compute the global sum of an integer in case of parallelism
423 *
424 * parameters:
425 * sum <-> input = local sum; output = global sum
426 *----------------------------------------------------------------------------*/
427
428 void
429 cs_f_parall_sum_i(int *sum)
430 {
431 #if defined(HAVE_MPI)
432
433 int global_sum;
434
435 assert (sizeof (double) == sizeof (cs_real_t));
436
437 MPI_Allreduce (sum, &global_sum, 1, MPI_INT, MPI_SUM,
438 cs_glob_mpi_comm);
439
440 *sum = global_sum;
441
442 #endif
443 }
444
445 /*----------------------------------------------------------------------------
446 * Compute the global sum of a real in case of parallelism
447 *
448 * parameters:
449 * sum <-> input = local sum; output = global sum
450 *----------------------------------------------------------------------------*/
451
452 void
453 cs_f_parall_sum_r(double *sum)
454 {
455 #if defined(HAVE_MPI)
456
457 double global_sum;
458
459 assert (sizeof (double) == sizeof (cs_real_t));
460
461 MPI_Allreduce (sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM,
462 cs_glob_mpi_comm);
463
464 *sum = global_sum;
465
466 #endif
467 }
468
469 /*----------------------------------------------------------------------------
470 * Compute the global maxima of an array of integers in case of parallelism
471 *
472 * parameters:
473 * n <-- array size
474 * max <-> input = local max; output = global max
475 *----------------------------------------------------------------------------*/
476
477 void
478 cs_f_parall_max_n_i(int n,
479 int *max)
480 {
481 cs_parall_max(n, CS_INT_TYPE, max);
482 }
483
484 /*----------------------------------------------------------------------------
485 * Compute the global maxima of an array of reals in case of parallelism
486 *
487 * parameters:
488 * n <-- array size
489 * max <-> input = local max; output = global max
490 *----------------------------------------------------------------------------*/
491
492 void
493 cs_f_parall_max_n_r(int n,
494 double *max)
495 {
496 cs_parall_max(n, CS_DOUBLE, max);
497 }
498
499 /*----------------------------------------------------------------------------
500 * Compute the global minima of an array of integers in case of parallelism
501 *
502 * parameters:
503 * n <-- array size
504 * min <-> input = local min; output = global min
505 *----------------------------------------------------------------------------*/
506
507 void
508 cs_f_parall_min_n_i(int n,
509 int *min)
510 {
511 cs_parall_min(n, CS_INT_TYPE, min);
512 }
513
514 /*----------------------------------------------------------------------------
515 * Compute the global minima of an array of reals in case of parallelism
516 *
517 * parameters:
518 * n <-- array size
519 * min <-> input = local min; output = global min
520 *----------------------------------------------------------------------------*/
521
522 void
523 cs_f_parall_min_n_r(int n,
524 double *min)
525 {
526 cs_parall_min(n, CS_DOUBLE, min);
527 }
528
529 /*----------------------------------------------------------------------------
530 * Compute the global sums of an array of integers in case of parallelism
531 *
532 * parameters:
533 * n <-- array size
534 * sum <-> input = local sum; output = global sum
535 *----------------------------------------------------------------------------*/
536
537 void
538 cs_f_parall_sum_n_i(int n,
539 int *sum)
540 {
541 cs_parall_sum(n, CS_INT_TYPE, sum);
542 }
543
544 /*----------------------------------------------------------------------------
545 * Compute the global sums of an array of reals in case of parallelism
546 *
547 * parameters:
548 * n <-- array size
549 * sum <-> input = local sum; output = global sum
550 *----------------------------------------------------------------------------*/
551
552 void
553 cs_f_parall_sum_n_r(int n,
554 double *sum)
555 {
556 cs_parall_sum(n, CS_DOUBLE, sum);
557 }
558
559 /*----------------------------------------------------------------------------
560 * Broadcast an integer to all ranks
561 *
562 * parameters:
563 * root_rank <-- id of root rank
564 * val <-> input = local value; output = global value
565 *----------------------------------------------------------------------------*/
566
567 void
568 cs_f_parall_bcast_i(int root_rank,
569 int *val)
570 {
571 cs_parall_bcast(root_rank, 1, CS_INT_TYPE, val);
572 }
573
574 /*----------------------------------------------------------------------------
575 * Broadcast a double to all ranks
576 *
577 * parameters:
578 * root_rank <-- id of root rank
579 * val <-> input = local value; output = global value
580 *----------------------------------------------------------------------------*/
581
582 void
583 cs_f_parall_bcast_r(int root_rank,
584 double *val)
585 {
586 cs_parall_bcast(root_rank, 1, CS_DOUBLE, val);
587 }
588
589 /*----------------------------------------------------------------------------
590 * Broadcast an array of integers to all ranks
591 *
592 * parameters:
593 * root_rank <-- id of root rank
594 * n <-- array size
595 * val <-> input = local value; output = global value
596 *----------------------------------------------------------------------------*/
597
598 void
599 cs_f_parall_bcast_n_i(int root_rank,
600 int n,
601 int *val)
602 {
603 cs_parall_bcast(root_rank, n, CS_INT_TYPE, val);
604 }
605
606 /*----------------------------------------------------------------------------
607 * Broadcast an array of doubles to all ranks
608 *
609 * parameters:
610 * root_rank <-- id of root rank
611 * n <-- array size
612 * val <-> input = local value; output = global value
613 *----------------------------------------------------------------------------*/
614
615 void
616 cs_f_parall_bcast_n_r(int root_rank,
617 int n,
618 double *val)
619 {
620 cs_parall_bcast(root_rank, n, CS_DOUBLE, val);
621 }
622
623 /*----------------------------------------------------------------------------
624 * Call a barrier in case of parallelism
625 *
626 * This function should not be necessary in production code,
627 * but it may be useful for debugging purposes.
628 *----------------------------------------------------------------------------*/
629
630 void
631 cs_f_parall_barrier(void)
632 {
633 #if defined(HAVE_MPI)
634 if (cs_glob_n_ranks > 1)
635 MPI_Barrier(cs_glob_mpi_comm);
636 #endif
637 }
638
639 /*! (DOXYGEN_SHOULD_SKIP_THIS) \endcond */
640
641 /*=============================================================================
642 * Public function definitions
643 *============================================================================*/
644
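/*----------------------------------------------------------------------------
 * Sum values of a counter on all default communicator processes.
 *
 * parameters:
 *   cpt <-> local counter value input, global counter value output (array)
 *   n   <-- number of counter array values
 *----------------------------------------------------------------------------*/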
645 #if !defined(HAVE_MPI_IN_PLACE) && defined(HAVE_MPI)
646
647 void
648 cs_parall_counter(cs_gnum_t cpt[],
649 const int n)
650 {
651 if (cs_glob_n_ranks > 1)
652 _cs_parall_allreduce(n, CS_GNUM_TYPE, MPI_SUM, cpt);
653 }
654
655 #endif
656
657 /*----------------------------------------------------------------------------
658 * Maximum values of a counter on all default communicator processes.
659 *
660 * parameters:
661 * cpt <-> local counter value input, global counter value output (array)
662 * n <-- number of counter array values
663 *----------------------------------------------------------------------------*/
664
665 #if !defined(HAVE_MPI_IN_PLACE) && defined(HAVE_MPI)
666
667 void
668 cs_parall_counter_max(cs_lnum_t cpt[],
669 const int n)
670 {
671 if (cs_glob_n_ranks > 1)
672 _cs_parall_allreduce(n, CS_LNUM_TYPE, MPI_MAX, cpt);
673 }
674
675 #endif
676
677 /*----------------------------------------------------------------------------
678 * Sum values of a given datatype on all default communicator processes.
679 *
680 * parameters:
681 * n <-- number of values
682 * datatype <-- matching Code_Saturne datatype
683 * val <-> local value input, global value output (array)
684 *----------------------------------------------------------------------------*/
685
686 #if !defined(HAVE_MPI_IN_PLACE) && defined(HAVE_MPI)
687
688 void
689 cs_parall_sum(int n,
690 cs_datatype_t datatype,
691 void *val)
692 {
693 if (cs_glob_n_ranks > 1)
694 _cs_parall_allreduce(n, datatype, MPI_SUM, val);
695 }
696
697 #endif
698
699 /*----------------------------------------------------------------------------
700 * Maximum values of a given datatype on all default communicator processes.
701 *
702 * parameters:
703 * n <-- number of values
704 * datatype <-- matching Code_Saturne datatype
705 * val <-> local value input, global value output (array)
706 *----------------------------------------------------------------------------*/
707
708 #if !defined(HAVE_MPI_IN_PLACE) && defined(HAVE_MPI)
709
710 void
711 cs_parall_max(int n,
712 cs_datatype_t datatype,
713 void *val)
714 {
715 if (cs_glob_n_ranks > 1)
716 _cs_parall_allreduce(n, datatype, MPI_MAX, val);
717 }
718
719 #endif
720
721 /*----------------------------------------------------------------------------
722 * Minimum values of a given datatype on all default communicator processes.
723 *
724 * parameters:
725 * n <-- number of values
726 * datatype <-- matching Code_Saturne datatype
727 * val <-> local value input, global value output (array)
728 *----------------------------------------------------------------------------*/
729
730 #if !defined(HAVE_MPI_IN_PLACE) && defined(HAVE_MPI)
731
732 void
733 cs_parall_min(int n,
734 cs_datatype_t datatype,
735 void *val)
736 {
737 if (cs_glob_n_ranks > 1)
738 _cs_parall_allreduce(n, datatype, MPI_MIN, val);
739 }
740
741 #endif
742
743 /*----------------------------------------------------------------------------*/
744 /*!
745 * \brief Maximum value of a real and the value of related array on all
746 * default communicator processes.
747 *
748 * \param[in] n size of the related array
749 * \param[in, out] max local max in, global max out
750 * \param[in, out] max_loc_vals array values at location of local max in,
751 * and at location of global max out
752 */
753 /*----------------------------------------------------------------------------*/
754
755 void
756 cs_parall_max_loc_vals(int n,
757 cs_real_t *max,
758 cs_real_t max_loc_vals[])
759 {
760 #if defined(HAVE_MPI)
761
762 if (cs_glob_n_ranks > 1) {
763
764 _mpi_double_int_t val_in, val_max;
765
766 val_in.val = *max;
767 val_in.rank = cs_glob_rank_id;
768
769 MPI_Allreduce(&val_in, &val_max, 1, MPI_DOUBLE_INT, MPI_MAXLOC,
770 cs_glob_mpi_comm);
771
772 *max = val_max.val;
773
774 MPI_Bcast(max_loc_vals, n, CS_MPI_REAL, val_max.rank, cs_glob_mpi_comm);
775
776 }
777
778 #endif
779 }
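
/*
  Illustrative sketch (not part of this file's API): retrieve the position of
  the global maximum of a field. "val", "cell_cen" and "n_cells" are
  hypothetical local arrays and sizes (value and cell center per cell).

    cs_real_t vmax = -1.e30;
    cs_real_t xyz_max[3] = {0., 0., 0.};

    for (cs_lnum_t i = 0; i < n_cells; i++) {
      if (val[i] > vmax) {
        vmax = val[i];
        for (int j = 0; j < 3; j++)
          xyz_max[j] = cell_cen[i][j];
      }
    }

    cs_parall_max_loc_vals(3, &vmax, xyz_max);
    // vmax and xyz_max now refer to the global maximum
*/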
780
781 /*----------------------------------------------------------------------------*/
782 /*!
783 * \brief Minimum value of a real and the value of related array on all
784 * default communicator processes.
785 *
786 * \param[in] n size of the related array
787 * \param[in, out] min local min in, global min out
788 * \param[in, out] min_loc_vals array values at location of local min in,
789 * and at location of global min out
790 */
791 /*----------------------------------------------------------------------------*/
792
793 void
794 cs_parall_min_loc_vals(int n,
795 cs_real_t *min,
796 cs_real_t min_loc_vals[])
797 {
798 #if defined(HAVE_MPI)
799
800 if (cs_glob_n_ranks > 1) {
801
802 _mpi_double_int_t val_in, val_min;
803
804 val_in.val = *min;
805 val_in.rank = cs_glob_rank_id;
806
807 MPI_Allreduce(&val_in, &val_min, 1, MPI_DOUBLE_INT, MPI_MINLOC,
808 cs_glob_mpi_comm);
809
810 *min = val_min.val;
811
812 MPI_Bcast(min_loc_vals, n, CS_MPI_REAL, val_min.rank, cs_glob_mpi_comm);
813
814 }
815
816 #endif
817 }
818
819 /*----------------------------------------------------------------------------*/
820 /*!
821 * \brief Given an (id, rank, value) tuple, return the local id and rank
822 * corresponding to the global minimum value.
823 *
824 * \param[in, out] elt_id element id for which the value is the smallest
825 * (local in, global out)
826 * \param[in, out] rank_id rank id for which the value is the smallest
827 * (local in, global out)
828 * \param[in] val associated local minimum value
829 */
830 /*----------------------------------------------------------------------------*/
831
832 void
833 cs_parall_min_id_rank_r(cs_lnum_t *elt_id,
834 int *rank_id,
835 cs_real_t val)
836 {
837 #if defined(HAVE_MPI)
838
839 if (cs_glob_n_ranks > 1) {
840
841 cs_lnum_t buf[2];
842
843 _mpi_double_int_t val_in, val_min;
844
845 assert(sizeof(double) == sizeof(cs_real_t));
846
847 val_in.val = val;
848 val_in.rank = cs_glob_rank_id;
849
850 MPI_Allreduce(&val_in, &val_min, 1, MPI_DOUBLE_INT, MPI_MINLOC,
851 cs_glob_mpi_comm);
852
853 *rank_id = cs_glob_rank_id;
854
855 buf[0] = *elt_id;
856 buf[1] = *rank_id;
857
858 MPI_Bcast(buf, 2, CS_MPI_LNUM, val_min.rank, cs_glob_mpi_comm);
859
860 *elt_id = buf[0];
861 *rank_id = buf[1];
862
863 }
864
865 #endif
866 }
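
/*
  Illustrative sketch (not part of this file's API): locate the element and
  rank owning the smallest value of a hypothetical local array "val" of size
  "n_cells".

    cs_real_t vmin = 1.e30;
    cs_lnum_t elt_id = -1;
    int       rank_id = cs_glob_rank_id;

    for (cs_lnum_t i = 0; i < n_cells; i++) {
      if (val[i] < vmin) {
        vmin = val[i];
        elt_id = i;
      }
    }

    cs_parall_min_id_rank_r(&elt_id, &rank_id, vmin);
    // elt_id is now the local id, on rank rank_id, of the global minimum
*/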
867
868 /*----------------------------------------------------------------------------*/
869 /*!
870 * \brief Build a global array from each local array in each domain.
871 *
872 * Local arrays are appended in order of owning MPI rank.
873 * The size of each local array may be different.
874 *
875 * Use of this function may be quite practical, but should be limited
876 * to user functions, as it may limit scalability (especially as regards
877 * memory usage).
878 *
879 * \param[in] n_elts size of the local array
880 * \param[in] n_g_elts size of the global array
881 * \param[in] array local array (size: n_elts)
882 * \param[out] g_array global array (size: n_g_elts)
883 */
884 /*----------------------------------------------------------------------------*/
885
886 void
887 cs_parall_allgather_r(int n_elts,
888 int n_g_elts,
889 cs_real_t array[],
890 cs_real_t g_array[])
891 {
892 #if defined(HAVE_MPI)
893
894 if (cs_glob_n_ranks > 1) {
895
896 int i;
897 int *count = NULL;
898 int *shift = NULL;
899
900 const int n_domains = cs_glob_n_ranks;
901
902 assert(sizeof(double) == sizeof(cs_real_t));
903
904 BFT_MALLOC(count, n_domains, int);
905 BFT_MALLOC(shift, n_domains, int);
906
907 MPI_Allgather(&n_elts, 1, MPI_INT, count, 1, MPI_INT,
908 cs_glob_mpi_comm);
909
910 shift[0] = 0;
911 for (i = 1; i < n_domains; i++)
912 shift[i] = shift[i-1] + count[i-1];
913
914 if (n_g_elts != (shift[n_domains - 1] + count[n_domains - 1]))
915 bft_error(__FILE__, __LINE__, 0,
916 _("Incorrect arguments to %s:\n"
917 " sum of arg. 1 (n_elts) on ranks "
918 "is not equal to arg. 2 (n_g_elts)."),
919 __func__);
920
921 MPI_Allgatherv(array, n_elts, CS_MPI_REAL,
922 g_array, count, shift, CS_MPI_REAL, cs_glob_mpi_comm);
923
924 BFT_FREE(count);
925 BFT_FREE(shift);
926
927 }
928
929 #endif
930
931 if (cs_glob_n_ranks == 1) {
932
933 assert(n_elts == n_g_elts);
934
935 for (int i = 0; i < n_elts; i++)
936 g_array[i] = array[i];
937
938 }
939 }
940
941 /*----------------------------------------------------------------------------*/
942 /*!
943 * \brief Build an ordered global array from each local array in each domain.
944 *
945 * Local array elements are ordered based on a given key value (usually
946 * some form of coordinate), so the result should be independent of
947 * partitioning as long as the definition of the o_key array is itself
948 * independent of the partitioning.
949 *
950 * Use of this function may be quite practical, but should be limited
951 * to user functions, as it may limit scalability (especially as regards
952 * memory usage).
953 *
954 * \param[in] n_elts number of local elements
955 * \param[in] n_g_elts number of global elements
956 * \param[in] stride number of values per element
957 * \param[in] o_key ordering key (coordinate) value per element
958 * \param[in] array local array (size: n_elts*stride)
959 * \param[out] g_array global array (size: n_g_elts*stride)
960 */
961 /*----------------------------------------------------------------------------*/
962
963 void
964 cs_parall_allgather_ordered_r(int n_elts,
965 int n_g_elts,
966 int stride,
967 cs_real_t o_key[],
968 cs_real_t array[],
969 cs_real_t g_array[])
970 {
971 cs_lnum_t *order;
972 cs_real_t *g_o_key;
973
974 BFT_MALLOC(g_o_key, n_g_elts, cs_real_t);
975 BFT_MALLOC(order, n_g_elts, cs_lnum_t);
976
977 cs_parall_allgather_r(n_elts, n_g_elts, o_key, g_o_key);
978 cs_parall_allgather_r(n_elts, n_g_elts*stride, array, g_array);
979
980 cs_order_real_allocated(NULL, g_o_key, order, n_g_elts);
981 cs_order_reorder_data(n_g_elts, sizeof(cs_real_t)*stride, order, g_array);
982
983 BFT_FREE(order);
984 BFT_FREE(g_o_key);
985 }
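
/*
  Illustrative sketch (not part of this file's API): gather a 1-d profile on
  all ranks, ordered by an x coordinate used as key. "n_p", "x_p" and "v_p"
  are hypothetical local selections (point count, coordinate, and value).

    int n_g_p = n_p;
    cs_parall_sum(1, CS_INT_TYPE, &n_g_p);

    cs_real_t *g_v_p = NULL;
    BFT_MALLOC(g_v_p, n_g_p, cs_real_t);

    cs_parall_allgather_ordered_r(n_p, n_g_p, 1, x_p, v_p, g_v_p);
    // g_v_p now holds all profile values, sorted by increasing x_p

    BFT_FREE(g_v_p);
*/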
986
987 /*----------------------------------------------------------------------------*/
988 /*!
989 * \brief Build a global array on the given root rank from all local arrays.
990 *
991 * Local arrays are appended in order of owning MPI rank.
992 * The size of each local array may be different.
993 *
994 * Use of this function may be quite practical, but should be limited
995 * to user functions, as it may limit scalability (especially as regards
996 * memory usage).
997 *
998 * \param[in] root_rank rank which stores the global array
999 * \param[in] n_elts size of the local array
1000 * \param[in] n_g_elts size of the global array
1001 * \param[in] array local array (size: n_elts)
1002 * \param[out] g_array global array (size: n_g_elts) only usable by the
1003 * root rank
1004 */
1005 /*----------------------------------------------------------------------------*/
1006
1007 void
1008 cs_parall_gather_r(int root_rank,
1009 int n_elts,
1010 int n_g_elts,
1011 const cs_real_t array[],
1012 cs_real_t g_array[])
1013 {
1014 #if defined(HAVE_MPI)
1015
1016 if (cs_glob_n_ranks > 1) {
1017
1018 /* Sanity check */
1019
1020 if (cs_glob_rank_id == root_rank && g_array == NULL)
1021 bft_error(__FILE__, __LINE__, 0,
1022 _(" %s: Global array is not allocated on the root_rank %d\n"),
1023 __func__, root_rank);
1024
1025 int *count = NULL, *shift = NULL;
1026
1027 _get_array_distribution(root_rank, n_elts, n_g_elts, &count, &shift);
1028
1029 MPI_Gatherv(array, n_elts, CS_MPI_REAL,
1030 g_array, count, shift, CS_MPI_REAL,
1031 root_rank, cs_glob_mpi_comm);
1032
1033 BFT_FREE(count);
1034 BFT_FREE(shift);
1035
1036 }
1037
1038 #endif
1039
1040 if (cs_glob_n_ranks == 1) {
1041
1042 assert(n_elts == n_g_elts);
1043 assert(g_array != NULL && n_g_elts > 0);
1044 for (int i = 0; i < n_elts; i++)
1045 g_array[i] = array[i];
1046
1047 }
1048 }
1049
1050 /*----------------------------------------------------------------------------*/
1051 /*!
1052 * \brief Build an ordered global array on the given root rank from
1053 * all local arrays.
1054 *
1055 * Local array elements are ordered based on a given key value (usually
1056 * some form of coordinate), so the result should be independent of
1057 * partitioning as long as the definition of the o_key array is itself
1058 * independent of the partitioning.
1059 *
1060 * Use of this function may be quite practical, but should be limited
1061 * to user functions, as it may limit scalability (especially as regards
1062 * memory usage).
1063 *
1064 * \param[in] root_rank rank which stores the global array
1065 * \param[in] n_elts number of local elements
1066 * \param[in] n_g_elts number of global elements
1067 * \param[in] stride number of values per element
1068 * \param[in] o_key ordering key (coordinate) value per element
1069 * \param[in] array local array (size: n_elts*stride)
1070 * \param[out] g_array global array (size: n_g_elts*stride)
1071 */
1072 /*----------------------------------------------------------------------------*/
1073
1074 void
1075 cs_parall_gather_ordered_r(int root_rank,
1076 int n_elts,
1077 int n_g_elts,
1078 int stride,
1079 cs_real_t o_key[],
1080 cs_real_t array[],
1081 cs_real_t g_array[])
1082 {
1083 cs_lnum_t *order = NULL;
1084 cs_real_t *g_o_key = NULL;
1085
1086 if (cs_glob_rank_id == root_rank) {
1087 BFT_MALLOC(g_o_key, n_g_elts, cs_real_t);
1088 BFT_MALLOC(order, n_g_elts, cs_lnum_t);
1089 }
1090
1091 cs_parall_gather_r(root_rank, n_elts, n_g_elts, o_key, g_o_key);
1092 cs_parall_gather_r(root_rank, n_elts, n_g_elts*stride, array, g_array);
1093
1094 if (cs_glob_rank_id == root_rank) {
1095 cs_order_real_allocated(NULL, g_o_key, order, n_g_elts);
1096 cs_order_reorder_data(n_g_elts, sizeof(cs_real_t)*stride, order, g_array);
1097 }
1098
1099 BFT_FREE(order);
1100 BFT_FREE(g_o_key);
1101 }
1102
1103 /*----------------------------------------------------------------------------*/
1104 /*!
1105 * \brief Distribute a global array from a given root rank over all ranks.
1106 * Each rank receives the part related to its local elements.
1107 *
1108 * The size of each local array may be different.
1109 *
1110 * Use of this function may be quite practical, but should be limited
1111 * to specific usage, as it may limit scalability (especially as regards
1112 * memory usage).
1113 *
1114 * \param[in] root_rank rank which stores the global array
1115 * \param[in] n_elts size of the local array
1116 * \param[in] n_g_elts size of the global array
1117 * \param[in] g_array global array (size: n_g_elts) only usable by the
1118 * root rank
1119 * \param[out] array local array (size: n_elts)
1120 */
1121 /*----------------------------------------------------------------------------*/
1122
1123 void
1124 cs_parall_scatter_r(int root_rank,
1125 int n_elts,
1126 int n_g_elts,
1127 const cs_real_t g_array[],
1128 cs_real_t array[])
1129 {
1130 assert(array != NULL && n_elts > 0);
1131
1132 #if defined(HAVE_MPI)
1133
1134 if (cs_glob_n_ranks > 1) {
1135
1136 /* Sanity check */
1137
1138 if (cs_glob_rank_id == root_rank && g_array == NULL)
1139 bft_error(__FILE__, __LINE__, 0,
1140 _(" %s: Global array is not allocated on the root_rank %d\n"),
1141 __func__, root_rank);
1142
1143 int *count = NULL, *shift = NULL;
1144
1145 _get_array_distribution(root_rank, n_elts, n_g_elts, &count, &shift);
1146
1147 MPI_Scatterv(g_array, count, shift, CS_MPI_REAL,
1148 array, n_elts, CS_MPI_REAL, root_rank, cs_glob_mpi_comm);
1149
1150 BFT_FREE(count);
1151 BFT_FREE(shift);
1152
1153 }
1154
1155 #endif
1156
1157 if (cs_glob_n_ranks == 1) {
1158
1159 assert(n_elts == n_g_elts);
1160 assert(g_array != NULL && n_g_elts > 0);
1161 for (int i = 0; i < n_elts; i++)
1162 array[i] = g_array[i];
1163
1164 }
1165 }
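
/*
  Illustrative sketch (not part of this file's API): gather a local array on
  rank 0, operate on it there, then scatter the result back. "n_l", "v_l"
  and "n_g" are hypothetical (local size, local values, global size).

    cs_real_t *v_g = NULL;
    if (cs_glob_rank_id <= 0)
      BFT_MALLOC(v_g, n_g, cs_real_t);

    cs_parall_gather_r(0, n_l, n_g, v_l, v_g);

    if (cs_glob_rank_id <= 0) {
      // ... modify v_g on the root rank ...
    }

    cs_parall_scatter_r(0, n_l, n_g, v_g, v_l);

    BFT_FREE(v_g);
*/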
1166
1167 /*----------------------------------------------------------------------------*/
1168 /*!
1169 * \brief Build a global array on the given root rank from all local arrays.
1170 * Function dealing with single-precision arrays.
1171 *
1172 * Local arrays are appended in order of owning MPI rank.
1173 * The size of each local array may be different.
1174 *
1175 * Use of this function may be quite practical, but should be limited
1176 * to user functions, as it may limit scalability (especially as regards
1177 * memory usage).
1178 *
1179 * \param[in] root_rank rank which stores the global array
1180 * \param[in] n_elts size of the local array
1181 * \param[in] n_g_elts size of the global array
1182 * \param[in] array local array (size: n_elts)
1183 * \param[out] g_array global array (size: n_g_elts) only usable by the
1184 * root rank
1185 */
1186 /*----------------------------------------------------------------------------*/
1187
1188 void
1189 cs_parall_gather_f(int root_rank,
1190 int n_elts,
1191 int n_g_elts,
1192 const float array[],
1193 float g_array[])
1194 {
1195 assert(array != NULL && n_elts > 0);
1196
1197 #if defined(HAVE_MPI)
1198
1199 if (cs_glob_n_ranks > 1) {
1200
1201 /* Sanity check */
1202
1203 if (cs_glob_rank_id == root_rank && g_array == NULL)
1204 bft_error(__FILE__, __LINE__, 0,
1205 _(" %s: Global array is not allocated on the root_rank %d\n"),
1206 __func__, root_rank);
1207
1208 int *count = NULL, *shift = NULL;
1209
1210 _get_array_distribution(root_rank, n_elts, n_g_elts, &count, &shift);
1211
1212 MPI_Gatherv(array, n_elts, MPI_FLOAT,
1213 g_array, count, shift, MPI_FLOAT, root_rank, cs_glob_mpi_comm);
1214
1215 BFT_FREE(count);
1216 BFT_FREE(shift);
1217
1218 }
1219
1220 #endif
1221
1222 if (cs_glob_n_ranks == 1) {
1223
1224 assert(n_elts == n_g_elts);
1225 assert(g_array != NULL && n_g_elts > 0);
1226 for (int i = 0; i < n_elts; i++)
1227 g_array[i] = array[i];
1228
1229 }
1230 }
1231
1232 /*----------------------------------------------------------------------------*/
1233 /*!
1234 * \brief Distribute a global array from a given root rank over all ranks.
1235 * Each rank receives the part related to its local elements.
1236 * Function dealing with single-precision arrays.
1237 *
1238 * The size of each local array may be different.
1239 *
1240 * Use of this function may be quite practical, but should be limited
1241 * to specific usage, as it may limit scalability (especially as regards
1242 * memory usage).
1243 *
1244 * \param[in] root_rank rank which stores the global array
1245 * \param[in] n_elts size of the local array
1246 * \param[in] n_g_elts size of the global array
1247 * \param[in] g_array global array (size: n_g_elts) only usable by the
1248 * root rank
1249 * \param[out] array local array (size: n_elts)
1250 */
1251 /*----------------------------------------------------------------------------*/
1252
1253 void
1254 cs_parall_scatter_f(int root_rank,
1255 int n_elts,
1256 int n_g_elts,
1257 const float g_array[],
1258 float array[])
1259 {
1260 assert(array != NULL && n_elts > 0);
1261
1262 #if defined(HAVE_MPI)
1263
1264 if (cs_glob_n_ranks > 1) {
1265
1266 /* Sanity check */
1267
1268 if (cs_glob_rank_id == root_rank && g_array == NULL)
1269 bft_error(__FILE__, __LINE__, 0,
1270 _(" %s: Global array is not allocated on the root_rank %d\n"),
1271 __func__, root_rank);
1272
1273 int *count = NULL, *shift = NULL;
1274
1275 _get_array_distribution(root_rank, n_elts, n_g_elts, &count, &shift);
1276
1277 MPI_Scatterv(g_array, count, shift, MPI_FLOAT,
1278 array, n_elts, MPI_FLOAT, root_rank, cs_glob_mpi_comm);
1279
1280 BFT_FREE(count);
1281 BFT_FREE(shift);
1282
1283 }
1284
1285 #endif
1286
1287 if (cs_glob_n_ranks == 1) {
1288
1289 assert(n_elts == n_g_elts);
1290 assert(g_array != NULL && n_g_elts > 0);
1291 for (int i = 0; i < n_elts; i++)
1292 array[i] = g_array[i];
1293
1294 }
1295 }
1296
1297 /*----------------------------------------------------------------------------*/
1298 /*!
1299 * \brief Return minimum recommended scatter or gather buffer size.
1300 *
1301 * This is used by some internal part-to-block or scatter/gather algorithms,
1302 * so as to allow I/O buffer size tuning.
1303 *
1304 * \return minimum recommended part-to-block or gather buffer size (in bytes)
1305 */
1306 /*----------------------------------------------------------------------------*/
1307
1308 size_t
1309 cs_parall_get_min_coll_buf_size(void)
1310 {
1311 #if defined(HAVE_MPI)
1312 return _cs_parall_min_coll_buf_size;
1313 #else
1314 return 0;
1315 #endif
1316 }
1317
1318 /*----------------------------------------------------------------------------*/
1319 /*!
1320 * \brief Define minimum recommended scatter or gather buffer size.
1321 *
1322 * This is used by some internal part-to-block or scatter/gather algorithms,
1323 * so as to allow I/O buffer size tuning.
1324 *
1325 * \param[in] buffer_size minimum recommended part-to-block
1326 * or gather buffer size (in bytes)
1327 */
1328 /*----------------------------------------------------------------------------*/
1329
1330 void
1331 cs_parall_set_min_coll_buf_size(size_t buffer_size)
1332 {
1333 #if defined(HAVE_MPI)
1334 _cs_parall_min_coll_buf_size = buffer_size;
1335 #endif
1336 }
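
/*
  Illustrative sketch (not part of this file's API): temporarily raise the
  recommended collective buffer size around a gather/scatter-heavy section.

    size_t prev_size = cs_parall_get_min_coll_buf_size();
    cs_parall_set_min_coll_buf_size(prev_size * 2);

    // ... I/O or gather/scatter-heavy section ...

    cs_parall_set_min_coll_buf_size(prev_size);
*/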
1337
1338 /*----------------------------------------------------------------------------*/
1339
1340 END_C_DECLS
1341