1 #ifndef __CS_PARALL_H__
2 #define __CS_PARALL_H__
3
4 /*============================================================================
5 * Functions dealing with parallelism
6 *============================================================================*/
7
8 /*
9 This file is part of Code_Saturne, a general-purpose CFD tool.
10
11 Copyright (C) 1998-2021 EDF S.A.
12
13 This program is free software; you can redistribute it and/or modify it under
14 the terms of the GNU General Public License as published by the Free Software
15 Foundation; either version 2 of the License, or (at your option) any later
16 version.
17
18 This program is distributed in the hope that it will be useful, but WITHOUT
19 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
20 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
21 details.
22
23 You should have received a copy of the GNU General Public License along with
24 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
25 Street, Fifth Floor, Boston, MA 02110-1301, USA.
26 */
27
28 /*----------------------------------------------------------------------------*/
29
30 /*----------------------------------------------------------------------------
31 * Local headers
32 *----------------------------------------------------------------------------*/
33
34 #include "cs_defs.h"
35
36 /*----------------------------------------------------------------------------*/
37
38 BEGIN_C_DECLS
39
40 /*=============================================================================
41 * Public function prototypes
42 *============================================================================*/
43
44 /*----------------------------------------------------------------------------
45 * Sum values of a counter on all default communicator processes.
46 *
47 * parameters:
48 * cpt <-> local counter in, global counter out (size: n)
49 * n <-- number of values
50 *----------------------------------------------------------------------------*/
51
52 #if defined(HAVE_MPI_IN_PLACE)
53
54 inline static void
cs_parall_counter(cs_gnum_t cpt[],const int n)55 cs_parall_counter(cs_gnum_t cpt[],
56 const int n)
57 {
58 if (cs_glob_n_ranks > 1) {
59 MPI_Allreduce(MPI_IN_PLACE, cpt, n, CS_MPI_GNUM, MPI_SUM,
60 cs_glob_mpi_comm);
61 }
62 }
63
64 #elif defined(HAVE_MPI)
65
66 void
67 cs_parall_counter(cs_gnum_t cpt[],
68 const int n);
69
70 #else
71
72 #define cs_parall_counter(_cpt, _n)
73
74 #endif
75
76 /*----------------------------------------------------------------------------
77 * Maximum values of a counter on all default communicator processes.
78 *
79 * parameters:
80 * cpt <-> local counter in, global counter out (size: n)
81 * n <-> number of values
82 *----------------------------------------------------------------------------*/
83
84 #if defined(HAVE_MPI_IN_PLACE)
85
86 inline static void
cs_parall_counter_max(cs_lnum_t cpt[],const int n)87 cs_parall_counter_max(cs_lnum_t cpt[],
88 const int n)
89 {
90 if (cs_glob_n_ranks > 1) {
91 MPI_Allreduce(MPI_IN_PLACE, cpt, n, CS_MPI_LNUM, MPI_MAX,
92 cs_glob_mpi_comm);
93 }
94 }
95
96 #elif defined(HAVE_MPI)
97
98 void
99 cs_parall_counter_max(cs_lnum_t cpt[],
100 const int n);
101
102 #else
103
104 #define cs_parall_counter_max(_cpt, _n)
105
106 #endif
107
108 /*----------------------------------------------------------------------------
109 * Sum values of a given datatype on all default communicator processes.
110 *
111 * parameters:
112 * n <-- number of values
113 * datatype <-- matching Code_Saturne datatype
114 * val <-> local sum in, global sum out (array)
115 *----------------------------------------------------------------------------*/
116
117 #if defined(HAVE_MPI_IN_PLACE)
118
119 inline static void
cs_parall_sum(int n,cs_datatype_t datatype,void * val)120 cs_parall_sum(int n,
121 cs_datatype_t datatype,
122 void *val)
123 {
124 if (cs_glob_n_ranks > 1) {
125 MPI_Allreduce(MPI_IN_PLACE, val, n, cs_datatype_to_mpi[datatype], MPI_SUM,
126 cs_glob_mpi_comm);
127 }
128 }
129
130 #elif defined(HAVE_MPI)
131
132 void
133 cs_parall_sum(int n,
134 cs_datatype_t datatype,
135 void *val);
136
137 #else
138
139 #define cs_parall_sum(_n, _datatype, _val) { };
140
141 #endif
142
143 /*----------------------------------------------------------------------------
144 * Maximum values of a given datatype on all default communicator processes.
145 *
146 * parameters:
147 * n <-- number of values
148 * datatype <-- matching Code_Saturne datatype
149 * val <-> local value input, global value output (array)
150 *----------------------------------------------------------------------------*/
151
152 #if defined(HAVE_MPI_IN_PLACE)
153
154 inline static void
cs_parall_max(int n,cs_datatype_t datatype,void * val)155 cs_parall_max(int n,
156 cs_datatype_t datatype,
157 void *val)
158 {
159 if (cs_glob_n_ranks > 1) {
160 MPI_Allreduce(MPI_IN_PLACE, val, n, cs_datatype_to_mpi[datatype], MPI_MAX,
161 cs_glob_mpi_comm);
162 }
163 }
164
165 #elif defined(HAVE_MPI)
166
167 void
168 cs_parall_max(int n,
169 cs_datatype_t datatype,
170 void *val);
171
172 #else
173
174 #define cs_parall_max(_n, _datatype, _val);
175
176 #endif
177
178 /*----------------------------------------------------------------------------
179 * Minimum values of a given datatype on all default communicator processes.
180 *
181 * parameters:
182 * n <-- number of values
183 * datatype <-- matching Code_Saturne datatype
184 * val <-> local value input, global value output (array)
185 *----------------------------------------------------------------------------*/
186
187 #if defined(HAVE_MPI_IN_PLACE)
188
189 inline static void
cs_parall_min(int n,cs_datatype_t datatype,void * val)190 cs_parall_min(int n,
191 cs_datatype_t datatype,
192 void *val)
193 {
194 if (cs_glob_n_ranks > 1) {
195 MPI_Allreduce(MPI_IN_PLACE, val, n, cs_datatype_to_mpi[datatype], MPI_MIN,
196 cs_glob_mpi_comm);
197 }
198 }
199
200 #elif defined(HAVE_MPI)
201
202 void
203 cs_parall_min(int n,
204 cs_datatype_t datatype,
205 void *val);
206
207 #else
208
209 #define cs_parall_min(_n, _datatype, _val);
210
211 #endif
212
213 /*----------------------------------------------------------------------------
214 * Broadcast values of a given datatype to all
215 * default communicator processes.
216 *
217 * parameters:
218 * root_rank <-- rank from which to broadcast
219 * n <-- number of values
220 * datatype <-- matching Code_Saturne datatype
221 * val <-- values to broadcast; input on root_rank,
222 * output on others (size: n)
223 *----------------------------------------------------------------------------*/
224
225 #if defined(HAVE_MPI)
226
227 inline static void
cs_parall_bcast(int root_rank,int n,cs_datatype_t datatype,void * val)228 cs_parall_bcast(int root_rank,
229 int n,
230 cs_datatype_t datatype,
231 void *val)
232 {
233 if (cs_glob_n_ranks > 1)
234 MPI_Bcast(val, n, cs_datatype_to_mpi[datatype], root_rank,
235 cs_glob_mpi_comm);
236 }
237
238 #else
239
240 #define cs_parall_bcast(_root_rank, _n, _datatype, _val);
241
242 #endif
243
244 /*----------------------------------------------------------------------------
245 * Build a global array from each local array in each domain.
246 *
247 * Local arrays are appended in order of owning MPI rank.
248 * The size of each local array may be different.
249 *
250 * Use of this function may be quite practical, but should be limited
251 * to user functions, as it may limit scalability (especially as regards
252 * memory usage).
253 *
254 * parameters:
255 * n_elts <-- size of the local array
256 * n_g_elts <-- size of the global array
257 * array <-- local array (size: n_elts)
258 * g_array --> global array (size: n_g_elts)
259 *----------------------------------------------------------------------------*/
260
261 void
262 cs_parall_allgather_r(int n_elts,
263 int n_g_elts,
264 cs_real_t array[],
265 cs_real_t g_array[]);
266
267 /*----------------------------------------------------------------------------
268 * Build an ordered global array from each local array in each domain.
269 *
270 * Local array elements are ordered based on a given key value (usually
271 * some form of coordinate, so the result should be independent of
272 * partitioning as long as the definition of the o_key array's defintion
273 * is itself independent of the partitioning.
274 *
275 * Use of this function may be quite practical, but should be limited
276 * to user functions, as it may limit scalability (especially as regards
277 * memory usage).
278 *
279 * parameters:
280 * n_elts <-- number of local elements
281 * n_g_elts <-- number of global elements
282 * stride <-- number of values per element
283 * o_key <-- ordering key (coordinate) value per element
284 * array <-- local array (size: n_elts*stride)
285 * g_array --> global array (size: n_g_elts*stride)
286 *----------------------------------------------------------------------------*/
287
288 void
289 cs_parall_allgather_ordered_r(int n_elts,
290 int n_g_elts,
291 int stride,
292 cs_real_t o_key[],
293 cs_real_t array[],
294 cs_real_t g_array[]);
295
296 /*----------------------------------------------------------------------------*/
297 /*!
298 * \brief Build a global array on the given root rank from all local arrays.
299 *
300 * Local arrays are appended in order of owning MPI rank.
301 * The size of each local array may be different.
302 *
303 * Use of this function may be quite practical, but should be limited
304 * to user functions, as it may limit scalability (especially as regards
305 * memory usage).
306 *
307 * \param[in] root_rank rank which stores the global array
308 * \param[in] n_elts size of the local array
309 * \param[in] n_g_elts size of the global array
310 * \param[in] array local array (size: n_elts)
311 * \param[out] g_array global array (size: n_g_elts) only usable by the
312 * root rank
313 */
314 /*----------------------------------------------------------------------------*/
315
316 void
317 cs_parall_gather_r(int root_rank,
318 int n_elts,
319 int n_g_elts,
320 const cs_real_t array[],
321 cs_real_t g_array[]);
322
323 /*----------------------------------------------------------------------------*/
324 /*!
325 * \brief Build an ordered global array on the given root rank from
326 * all local arrays.
327 *
328 * Local array elements are ordered based on a given key value (usually
329 * some form of coordinate, so the result should be independent of
330 * partitioning as long as the definition of the o_key array's defintion
331 * is itself independent of the partitioning.
332 *
333 * Use of this function may be quite practical, but should be limited
334 * to user functions, as it may limit scalability (especially as regards
335 * memory usage).
336 *
337 * \param[in] root_rank rank which stores the global array
338 * \param[in] n_elts number of local elements
339 * \param[in] n_g_elts number of global elements
340 * \param[in] stride number of values per element
341 * \param[in] o_key ordering key (coordinate) value per element
342 * \param[in] array local array (size: n_elts*stride)
343 * \param[out] g_array global array (size: n_g_elts*stride)
344 */
345 /*----------------------------------------------------------------------------*/
346
347 void
348 cs_parall_gather_ordered_r(int root_rank,
349 int n_elts,
350 int n_g_elts,
351 int stride,
352 cs_real_t o_key[],
353 cs_real_t array[],
354 cs_real_t g_array[]);
355
356 /*----------------------------------------------------------------------------*/
357 /*!
358 * \brief Distribute a global array from a given root rank over all ranks.
359 * Each rank receive the part related to its local elements.
360 *
361 * The size of each local array may be different.
362 *
363 * Use of this function may be quite practical, but should be limited
364 * to specific usage, as it may limit scalability (especially as regards
365 * memory usage).
366 *
367 * \param[in] root_rank rank which stores the global array
368 * \param[in] n_elts size of the local array
369 * \param[in] n_g_elts size of the global array
370 * \param[in] g_array global array (size: n_g_elts) only usable by the
371 * root rank
372 * \param[out] array local array (size: n_elts)
373 */
374 /*----------------------------------------------------------------------------*/
375
376 void
377 cs_parall_scatter_r(int root_rank,
378 int n_elts,
379 int n_g_elts,
380 const cs_real_t g_array[],
381 cs_real_t array[]);
382
383 /*----------------------------------------------------------------------------*/
384 /*!
385 * \brief Build a global array on the given root rank from all local arrays.
386 * Function dealing with single-precision arrays.
387 *
388 * Local arrays are appended in order of owning MPI rank.
389 * The size of each local array may be different.
390 *
391 * Use of this function may be quite practical, but should be limited
392 * to user functions, as it may limit scalability (especially as regards
393 * memory usage).
394 *
395 * \param[in] root_rank rank which stores the global array
396 * \param[in] n_elts size of the local array
397 * \param[in] n_g_elts size of the global array
398 * \param[in] array local array (size: n_elts)
399 * \param[out] g_array global array (size: n_g_elts) only usable by the
400 * root rank
401 */
402 /*----------------------------------------------------------------------------*/
403
404 void
405 cs_parall_gather_f(int root_rank,
406 int n_elts,
407 int n_g_elts,
408 const float array[],
409 float g_array[]);
410
411 /*----------------------------------------------------------------------------*/
412 /*!
413 * \brief Distribute a global array from a given root rank over all ranks.
414 * Each rank receive the part related to its local elements.
415 * Function dealing with single-precision arrays.
416 *
417 * The size of each local array may be different.
418 *
419 * Use of this function may be quite practical, but should be limited
420 * to specific usage, as it may limit scalability (especially as regards
421 * memory usage).
422 *
423 * \param[in] root_rank rank which stores the global array
424 * \param[in] n_elts size of the local array
425 * \param[in] n_g_elts size of the global array
426 * \param[in] g_array global array (size: n_g_elts) only usable by the
427 * root rank
428 * \param[out] array local array (size: n_elts)
429 */
430 /*----------------------------------------------------------------------------*/
431
432 void
433 cs_parall_scatter_f(int root_rank,
434 int n_elts,
435 int n_g_elts,
436 const float g_array[],
437 float array[]);
438
439 /*----------------------------------------------------------------------------
440 * Maximum value of a real and the value of related array on all
441 * default communicator processes.
442 *
443 * parameters:
444 * n <-- size of the related array
445 * max <-> local max in, global max out
446 * max_loc_vals <-> array values at location of local max in,
447 * and at location of global max out
448 *----------------------------------------------------------------------------*/
449
450 void
451 cs_parall_max_loc_vals(int n,
452 cs_real_t *max,
453 cs_real_t max_loc_vals[]);
454
455 /*----------------------------------------------------------------------------
456 * Minimum value of a real and the value of related array on all
457 * default communicator processes.
458 *
459 * parameters:
460 * n <-- size of the related array
461 * min <-> local min in, global min out
462 * min_loc_vals <-> array values at location of local min in,
463 * and at location of global min out
464 *----------------------------------------------------------------------------*/
465
466 void
467 cs_parall_min_loc_vals(int n,
468 cs_real_t *min,
469 cs_real_t min_loc_vals[]);
470
471 /*----------------------------------------------------------------------------
472 * Given an (id, rank, value) tuple, return the local id and rank
473 * corresponding to the global minimum value.
474 *
475 * parameters:
476 * elt_id <-> element id for which the value is the smallest
477 * (local in, global out)
478 * rank_id <-> rank id for which the value is the smallest
479 * (local in, global out)
480 * val <-- associated local minimum value
481 *----------------------------------------------------------------------------*/
482
483 void
484 cs_parall_min_id_rank_r(cs_lnum_t *elt_id,
485 int *rank_id,
486 cs_real_t dis2mn);
487
488 /*----------------------------------------------------------------------------
489 * Return minimum recommended scatter or gather buffer size.
490 *
491 * This is used by some internal part to block or scatter/gather algorithms,
492 * so as to allow I/O buffer size tuning.
493 *
494 * returns:
495 * minimum recommended part to block or gather buffer size (in bytes)
496 *----------------------------------------------------------------------------*/
497
498 size_t
499 cs_parall_get_min_coll_buf_size(void);
500
501 /*----------------------------------------------------------------------------
502 * Define minimum recommended gather buffer size.
503 *
504 * This is used by some internal part to block or scatter/gather algorithms,
505 * so as to allow I/O buffer size tuning.
506 *
507 * parameters:
508 * minimum recommended part to block or gather buffer size (in bytes)
509 *----------------------------------------------------------------------------*/
510
511 void
512 cs_parall_set_min_coll_buf_size(size_t buffer_size);
513
514 /*----------------------------------------------------------------------------*/
515 /*!
516 * \brief Compute array index bounds for a local thread.
517 * When called inside an OpenMP parallel section, this will return the
518 * start an past-the-end indexes for the array range assigned to that
519 * thread. In other cases, the start index is 1, and the past-the-end
520 * index is n;
521 *
522 * \param[in] n size of array
523 * \param[in] type_size element type size (or multiple)
524 * \param[in, out] s_id start index for the current thread
525 * \param[in, out] e_id past-the-end index for the current thread
526 */
527 /*----------------------------------------------------------------------------*/
528
529 inline static void
cs_parall_thread_range(cs_lnum_t n,size_t type_size,cs_lnum_t * s_id,cs_lnum_t * e_id)530 cs_parall_thread_range(cs_lnum_t n,
531 size_t type_size,
532 cs_lnum_t *s_id,
533 cs_lnum_t *e_id)
534 {
535 #if defined(HAVE_OPENMP)
536 const int t_id = omp_get_thread_num();
537 const int n_t = omp_get_num_threads();
538 const cs_lnum_t t_n = (n + n_t - 1) / n_t;
539 const cs_lnum_t cl_m = CS_CL_SIZE / type_size; /* Cache line multiple */
540
541 *s_id = t_id * t_n;
542 *e_id = (t_id+1) * t_n;
543 *s_id = cs_align(*s_id, cl_m);
544 *e_id = cs_align(*e_id, cl_m);
545 if (*e_id > n) *e_id = n;
546 #else
547 CS_UNUSED(type_size); /* avoid compiler warning */
548 *s_id = 0;
549 *e_id = n;
550 #endif
551 }
552
553 /*----------------------------------------------------------------------------*/
554
555 END_C_DECLS
556
557 #endif /* __CS_PARALL_H__ */
558