1 #ifndef __CS_PARALL_H__
2 #define __CS_PARALL_H__
3 
4 /*============================================================================
5  * Functions dealing with parallelism
6  *============================================================================*/
7 
8 /*
9   This file is part of Code_Saturne, a general-purpose CFD tool.
10 
11   Copyright (C) 1998-2021 EDF S.A.
12 
13   This program is free software; you can redistribute it and/or modify it under
14   the terms of the GNU General Public License as published by the Free Software
15   Foundation; either version 2 of the License, or (at your option) any later
16   version.
17 
18   This program is distributed in the hope that it will be useful, but WITHOUT
19   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
20   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
21   details.
22 
23   You should have received a copy of the GNU General Public License along with
24   this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
25   Street, Fifth Floor, Boston, MA 02110-1301, USA.
26 */
27 
28 /*----------------------------------------------------------------------------*/
29 
30 /*----------------------------------------------------------------------------
31  *  Local headers
32  *----------------------------------------------------------------------------*/
33 
34 #include "cs_defs.h"
35 
36 /*----------------------------------------------------------------------------*/
37 
38 BEGIN_C_DECLS
39 
40 /*=============================================================================
41  * Public function prototypes
42  *============================================================================*/
43 
44 /*----------------------------------------------------------------------------
45  * Sum values of a counter on all default communicator processes.
46  *
47  * parameters:
48  *   cpt <-> local counter in, global counter out (size: n)
49  *   n   <-- number of values
50  *----------------------------------------------------------------------------*/
51 
#if defined(HAVE_MPI_IN_PLACE)

/* Sum counters over all ranks of the default communicator, in place. */
inline static void
cs_parall_counter(cs_gnum_t   cpt[],
                  const int   n)
{
  /* Nothing to reduce in a single-rank run. */
  if (cs_glob_n_ranks < 2)
    return;

  MPI_Allreduce(MPI_IN_PLACE, cpt, n, CS_MPI_GNUM, MPI_SUM,
                cs_glob_mpi_comm);
}

#elif defined(HAVE_MPI)

void
cs_parall_counter(cs_gnum_t   cpt[],
                  const int   n);

#else

#define cs_parall_counter(_cpt, _n)

#endif
75 
76 /*----------------------------------------------------------------------------
77  * Maximum values of a counter on all default communicator processes.
78  *
79  * parameters:
80  *   cpt <-> local counter in, global counter out (size: n)
 *   n   <-- number of values
82  *----------------------------------------------------------------------------*/
83 
#if defined(HAVE_MPI_IN_PLACE)

/* Take the element-wise maximum of counters over all ranks, in place. */
inline static void
cs_parall_counter_max(cs_lnum_t   cpt[],
                      const int   n)
{
  /* Nothing to reduce in a single-rank run. */
  if (cs_glob_n_ranks < 2)
    return;

  MPI_Allreduce(MPI_IN_PLACE, cpt, n, CS_MPI_LNUM, MPI_MAX,
                cs_glob_mpi_comm);
}

#elif defined(HAVE_MPI)

void
cs_parall_counter_max(cs_lnum_t   cpt[],
                      const int   n);

#else

#define cs_parall_counter_max(_cpt, _n)

#endif
107 
108 /*----------------------------------------------------------------------------
109  * Sum values of a given datatype on all default communicator processes.
110  *
111  * parameters:
112  *   n        <-- number of values
113  *   datatype <-- matching Code_Saturne datatype
114  *   val      <-> local sum in, global sum out (array)
115  *----------------------------------------------------------------------------*/
116 
#if defined(HAVE_MPI_IN_PLACE)

/* Sum n values of the given datatype over all ranks, in place. */
inline static void
cs_parall_sum(int             n,
              cs_datatype_t   datatype,
              void           *val)
{
  if (cs_glob_n_ranks > 1) {
    MPI_Allreduce(MPI_IN_PLACE, val, n, cs_datatype_to_mpi[datatype], MPI_SUM,
                  cs_glob_mpi_comm);
  }
}

#elif defined(HAVE_MPI)

void
cs_parall_sum(int             n,
              cs_datatype_t   datatype,
              void           *val);

#else

/* Serial build: no-op. The macro expands to nothing so that
   "cs_parall_sum(...);" remains valid in every statement context;
   the previous "{ };" expansion broke unbraced if/else constructs. */
#define cs_parall_sum(_n, _datatype, _val)

#endif
142 
143 /*----------------------------------------------------------------------------
144  * Maximum values of a given datatype on all default communicator processes.
145  *
146  * parameters:
147  *   n        <-- number of values
148  *   datatype <-- matching Code_Saturne datatype
149  *   val      <-> local value  input, global value output (array)
150  *----------------------------------------------------------------------------*/
151 
#if defined(HAVE_MPI_IN_PLACE)

/* Take the element-wise maximum of n values over all ranks, in place. */
inline static void
cs_parall_max(int             n,
              cs_datatype_t   datatype,
              void           *val)
{
  if (cs_glob_n_ranks > 1) {
    MPI_Allreduce(MPI_IN_PLACE, val, n, cs_datatype_to_mpi[datatype], MPI_MAX,
                  cs_glob_mpi_comm);
  }
}

#elif defined(HAVE_MPI)

void
cs_parall_max(int             n,
              cs_datatype_t   datatype,
              void           *val);

#else

/* Serial build: no-op. The macro expands to nothing; the previous
   definition left a stray ";" in the expansion, which broke
   unbraced if/else constructs around calls. */
#define cs_parall_max(_n, _datatype, _val)

#endif
177 
178 /*----------------------------------------------------------------------------
179  * Minimum values of a given datatype on all default communicator processes.
180  *
181  * parameters:
182  *   n        <-- number of values
183  *   datatype <-- matching Code_Saturne datatype
184  *   val      <-> local value  input, global value output (array)
185  *----------------------------------------------------------------------------*/
186 
#if defined(HAVE_MPI_IN_PLACE)

/* Take the element-wise minimum of n values over all ranks, in place. */
inline static void
cs_parall_min(int             n,
              cs_datatype_t   datatype,
              void           *val)
{
  if (cs_glob_n_ranks > 1) {
    MPI_Allreduce(MPI_IN_PLACE, val, n, cs_datatype_to_mpi[datatype], MPI_MIN,
                  cs_glob_mpi_comm);
  }
}

#elif defined(HAVE_MPI)

void
cs_parall_min(int             n,
              cs_datatype_t   datatype,
              void           *val);

#else

/* Serial build: no-op. The macro expands to nothing; the previous
   definition left a stray ";" in the expansion, which broke
   unbraced if/else constructs around calls. */
#define cs_parall_min(_n, _datatype, _val)

#endif
212 
213 /*----------------------------------------------------------------------------
214  * Broadcast values of a given datatype to all
215  * default communicator processes.
216  *
217  * parameters:
218  *   root_rank <-- rank from which to broadcast
219  *   n         <-- number of values
220  *   datatype  <-- matching Code_Saturne datatype
221  *   val       <-- values to broadcast; input on root_rank,
222  *                 output on others (size: n)
223  *----------------------------------------------------------------------------*/
224 
#if defined(HAVE_MPI)

/* Broadcast n values of the given datatype from root_rank to all ranks. */
inline static void
cs_parall_bcast(int             root_rank,
                int             n,
                cs_datatype_t   datatype,
                void           *val)
{
  if (cs_glob_n_ranks > 1) {
    MPI_Bcast(val, n, cs_datatype_to_mpi[datatype], root_rank,
              cs_glob_mpi_comm);
  }
}

#else

/* Serial build: no-op. The macro expands to nothing; the previous
   definition left a stray ";" in the expansion, which broke
   unbraced if/else constructs around calls. */
#define cs_parall_bcast(_root_rank, _n, _datatype, _val)

#endif
243 
244 /*----------------------------------------------------------------------------
245  * Build a global array from each local array in each domain.
246  *
247  * Local arrays are appended in order of owning MPI rank.
248  * The size of each local array may be different.
249  *
250  * Use of this function may be quite practical, but should be limited
251  * to user functions, as it may limit scalability (especially as regards
252  * memory usage).
253  *
254  * parameters:
255  *   n_elts   <-- size of the local array
256  *   n_g_elts <-- size of the global array
257  *   array    <-- local array (size: n_elts)
258  *   g_array  --> global array  (size: n_g_elts)
259  *----------------------------------------------------------------------------*/
260 
261 void
262 cs_parall_allgather_r(int        n_elts,
263                       int        n_g_elts,
264                       cs_real_t  array[],
265                       cs_real_t  g_array[]);
266 
267 /*----------------------------------------------------------------------------
268  * Build an ordered global array from each local array in each domain.
269  *
 * Local array elements are ordered based on a given key value (usually
 * some form of coordinate), so the result should be independent of
 * partitioning as long as the definition of the o_key array is itself
 * independent of the partitioning.
274  *
275  * Use of this function may be quite practical, but should be limited
276  * to user functions, as it may limit scalability (especially as regards
277  * memory usage).
278  *
279  * parameters:
280  *   n_elts   <-- number of local elements
281  *   n_g_elts <-- number of global elements
282  *   stride   <-- number of values per element
283  *   o_key    <-- ordering key (coordinate) value per element
284  *   array    <-- local array (size: n_elts*stride)
285  *   g_array  --> global array  (size: n_g_elts*stride)
286  *----------------------------------------------------------------------------*/
287 
288 void
289 cs_parall_allgather_ordered_r(int        n_elts,
290                               int        n_g_elts,
291                               int        stride,
292                               cs_real_t  o_key[],
293                               cs_real_t  array[],
294                               cs_real_t  g_array[]);
295 
296 /*----------------------------------------------------------------------------*/
297 /*!
298  * \brief Build a global array on the given root rank from all local arrays.
299  *
300  * Local arrays are appended in order of owning MPI rank.
301  * The size of each local array may be different.
302  *
303  * Use of this function may be quite practical, but should be limited
304  * to user functions, as it may limit scalability (especially as regards
305  * memory usage).
306  *
307  * \param[in]   root_rank  rank which stores the global array
308  * \param[in]   n_elts     size of the local array
309  * \param[in]   n_g_elts   size of the global array
310  * \param[in]   array      local array (size: n_elts)
311  * \param[out]  g_array    global array  (size: n_g_elts) only usable by the
312  *                         root rank
313  */
314 /*----------------------------------------------------------------------------*/
315 
316 void
317 cs_parall_gather_r(int               root_rank,
318                    int               n_elts,
319                    int               n_g_elts,
320                    const cs_real_t   array[],
321                    cs_real_t         g_array[]);
322 
323 /*----------------------------------------------------------------------------*/
324 /*!
325  * \brief Build an ordered global array on the given root rank from
326  *        all local arrays.
327  *
 * Local array elements are ordered based on a given key value (usually
 * some form of coordinate), so the result should be independent of
 * partitioning as long as the definition of the o_key array is itself
 * independent of the partitioning.
332  *
333  * Use of this function may be quite practical, but should be limited
334  * to user functions, as it may limit scalability (especially as regards
335  * memory usage).
336  *
337  * \param[in]   root_rank  rank which stores the global array
338  * \param[in]   n_elts     number of local elements
339  * \param[in]   n_g_elts   number of global elements
340  * \param[in]   stride     number of values per element
341  * \param[in]   o_key      ordering key (coordinate) value per element
342  * \param[in]   array      local array (size: n_elts*stride)
343  * \param[out]  g_array    global array  (size: n_g_elts*stride)
344  */
345 /*----------------------------------------------------------------------------*/
346 
347 void
348 cs_parall_gather_ordered_r(int        root_rank,
349                            int        n_elts,
350                            int        n_g_elts,
351                            int        stride,
352                            cs_real_t  o_key[],
353                            cs_real_t  array[],
354                            cs_real_t  g_array[]);
355 
356 /*----------------------------------------------------------------------------*/
357 /*!
358  * \brief Distribute a global array from a given root rank over all ranks.
359  *        Each rank receive the part related to its local elements.
360  *
361  * The size of each local array may be different.
362  *
363  * Use of this function may be quite practical, but should be limited
364  * to specific usage, as it may limit scalability (especially as regards
365  * memory usage).
366  *
367  * \param[in]   root_rank  rank which stores the global array
368  * \param[in]   n_elts     size of the local array
369  * \param[in]   n_g_elts   size of the global array
370  * \param[in]   g_array    global array  (size: n_g_elts) only usable by the
371  *                         root rank
372  * \param[out]  array      local array (size: n_elts)
373  */
374 /*----------------------------------------------------------------------------*/
375 
376 void
377 cs_parall_scatter_r(int               root_rank,
378                     int               n_elts,
379                     int               n_g_elts,
380                     const cs_real_t   g_array[],
381                     cs_real_t         array[]);
382 
383 /*----------------------------------------------------------------------------*/
384 /*!
385  * \brief Build a global array on the given root rank from all local arrays.
386  *        Function dealing with single-precision arrays.
387  *
388  * Local arrays are appended in order of owning MPI rank.
389  * The size of each local array may be different.
390  *
391  * Use of this function may be quite practical, but should be limited
392  * to user functions, as it may limit scalability (especially as regards
393  * memory usage).
394  *
395  * \param[in]   root_rank  rank which stores the global array
396  * \param[in]   n_elts     size of the local array
397  * \param[in]   n_g_elts   size of the global array
398  * \param[in]   array      local array (size: n_elts)
399  * \param[out]  g_array    global array  (size: n_g_elts) only usable by the
400  *                         root rank
401  */
402 /*----------------------------------------------------------------------------*/
403 
404 void
405 cs_parall_gather_f(int             root_rank,
406                    int             n_elts,
407                    int             n_g_elts,
408                    const float     array[],
409                    float           g_array[]);
410 
411 /*----------------------------------------------------------------------------*/
412 /*!
413  * \brief Distribute a global array from a given root rank over all ranks.
414  *        Each rank receive the part related to its local elements.
415  *        Function dealing with single-precision arrays.
416  *
417  * The size of each local array may be different.
418  *
419  * Use of this function may be quite practical, but should be limited
420  * to specific usage, as it may limit scalability (especially as regards
421  * memory usage).
422  *
423  * \param[in]   root_rank  rank which stores the global array
424  * \param[in]   n_elts     size of the local array
425  * \param[in]   n_g_elts   size of the global array
426  * \param[in]   g_array    global array  (size: n_g_elts) only usable by the
427  *                         root rank
428  * \param[out]  array      local array (size: n_elts)
429  */
430 /*----------------------------------------------------------------------------*/
431 
432 void
433 cs_parall_scatter_f(int           root_rank,
434                     int           n_elts,
435                     int           n_g_elts,
436                     const float   g_array[],
437                     float         array[]);
438 
439 /*----------------------------------------------------------------------------
440  * Maximum value of a real and the value of related array on all
441  * default communicator processes.
442  *
443  * parameters:
444  *   n            <-- size of the related array
445  *   max          <-> local max in, global max out
446  *   max_loc_vals <-> array values at location of local max in,
447  *                    and at location of global max out
448  *----------------------------------------------------------------------------*/
449 
450 void
451 cs_parall_max_loc_vals(int         n,
452                        cs_real_t  *max,
453                        cs_real_t   max_loc_vals[]);
454 
455 /*----------------------------------------------------------------------------
456  * Minimum value of a real and the value of related array on all
457  * default communicator processes.
458  *
459  * parameters:
460  *   n            <-- size of the related array
461  *   min          <-> local min in, global min out
462  *   min_loc_vals <-> array values at location of local min in,
463  *                    and at location of global min out
464  *----------------------------------------------------------------------------*/
465 
466 void
467 cs_parall_min_loc_vals(int         n,
468                        cs_real_t  *min,
469                        cs_real_t   min_loc_vals[]);
470 
471 /*----------------------------------------------------------------------------
472  * Given an (id, rank, value) tuple, return the local id and rank
473  * corresponding to the global minimum value.
474  *
475  * parameters:
476  *   elt_id  <-> element id for which the value is the smallest
477  *               (local in, global out)
478  *   rank_id <-> rank id for which the value is the smallest
479  *               (local in, global out)
480  *   val     <-- associated local minimum value
481  *----------------------------------------------------------------------------*/
482 
483 void
484 cs_parall_min_id_rank_r(cs_lnum_t  *elt_id,
485                         int        *rank_id,
486                         cs_real_t   dis2mn);
487 
488 /*----------------------------------------------------------------------------
489  * Return minimum recommended scatter or gather buffer size.
490  *
491  * This is used by some internal part to block or scatter/gather algorithms,
492  * so as to allow I/O buffer size tuning.
493  *
494  * returns:
495  *   minimum recommended part to block or gather buffer size (in bytes)
496  *----------------------------------------------------------------------------*/
497 
498 size_t
499 cs_parall_get_min_coll_buf_size(void);
500 
501 /*----------------------------------------------------------------------------
502  * Define minimum recommended gather buffer size.
503  *
504  * This is used by some internal part to block or scatter/gather algorithms,
505  * so as to allow I/O buffer size tuning.
506  *
507  * parameters:
508  *   minimum recommended part to block or gather buffer size (in bytes)
509  *----------------------------------------------------------------------------*/
510 
511 void
512 cs_parall_set_min_coll_buf_size(size_t buffer_size);
513 
514 /*----------------------------------------------------------------------------*/
515 /*!
516  * \brief Compute array index bounds for a local thread.
 *        When called inside an OpenMP parallel section, this will return the
 *        start and past-the-end indexes for the array range assigned to that
 *        thread. In other cases, the start index is 0, and the past-the-end
 *        index is n.
521  *
522  * \param[in]       n          size of array
523  * \param[in]       type_size  element type size (or multiple)
524  * \param[in, out]  s_id       start index for the current thread
525  * \param[in, out]  e_id       past-the-end index for the current thread
526  */
527 /*----------------------------------------------------------------------------*/
528 
inline static void
cs_parall_thread_range(cs_lnum_t    n,
                       size_t       type_size,
                       cs_lnum_t   *s_id,
                       cs_lnum_t   *e_id)
{
#if defined(HAVE_OPENMP)
  const int t_id = omp_get_thread_num();
  const int n_t = omp_get_num_threads();
  /* Per-thread chunk size, rounded up so all n elements are covered. */
  const cs_lnum_t t_n = (n + n_t - 1) / n_t;
  const cs_lnum_t cl_m = CS_CL_SIZE / type_size;  /* Cache line multiple */

  *s_id =  t_id    * t_n;
  *e_id = (t_id+1) * t_n;
  /* Align both bounds to a cache-line multiple of elements so that
     neighboring threads do not write to the same cache line
     (avoids false sharing); cs_align is assumed to round up —
     NOTE(review): if type_size > CS_CL_SIZE, cl_m is 0; behavior
     then depends on cs_align's handling of a 0 multiple — confirm. */
  *s_id = cs_align(*s_id, cl_m);
  *e_id = cs_align(*e_id, cl_m);
  /* Alignment may push the last thread's bound past n; clamp it.
     (A thread's range may be empty, i.e. *s_id >= *e_id.) */
  if (*e_id > n) *e_id = n;
#else
  /* Serial (no OpenMP): the single "thread" gets the whole range. */
  CS_UNUSED(type_size);         /* avoid compiler warning */
  *s_id = 0;
  *e_id = n;
#endif
}
552 
553 /*----------------------------------------------------------------------------*/
554 
555 END_C_DECLS
556 
557 #endif /* __CS_PARALL_H__ */
558