1 /*============================================================================
2 * All-to-all parallel data exchange.
3 *============================================================================*/
4
5 /*
6 This file is part of Code_Saturne, a general-purpose CFD tool.
7
8 Copyright (C) 1998-2021 EDF S.A.
9
10 This program is free software; you can redistribute it and/or modify it under
11 the terms of the GNU General Public License as published by the Free Software
12 Foundation; either version 2 of the License, or (at your option) any later
13 version.
14
15 This program is distributed in the hope that it will be useful, but WITHOUT
16 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
18 details.
19
20 You should have received a copy of the GNU General Public License along with
21 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
22 Street, Fifth Floor, Boston, MA 02110-1301, USA.
23 */
24
25 /*----------------------------------------------------------------------------*/
26
27 #include "cs_defs.h"
28
29 /*----------------------------------------------------------------------------
30 * Standard C library headers
31 *----------------------------------------------------------------------------*/
32
33 #include <assert.h>
34 #include <errno.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38
39 /*----------------------------------------------------------------------------
40 * Local headers
41 *----------------------------------------------------------------------------*/
42
43 #include "bft_mem.h"
44 #include "bft_mem_usage.h"
45 #include "bft_error.h"
46 #include "bft_printf.h"
47
48 #include "cs_base.h"
49 #include "cs_assert.h"
50 #include "cs_block_dist.h"
51 #include "cs_crystal_router.h"
52 #include "cs_log.h"
53 #include "cs_order.h"
54 #include "cs_rank_neighbors.h"
55 #include "cs_timer.h"
56
57 /*----------------------------------------------------------------------------
58 * Header for the current file
59 *----------------------------------------------------------------------------*/
60
61 #include "cs_all_to_all.h"
62
63 /*----------------------------------------------------------------------------*/
64
65 BEGIN_C_DECLS
66
67 /*=============================================================================
68 * Additional Doxygen documentation
69 *============================================================================*/
70
71 /*!
72 \file cs_all_to_all.c
73 All-to-all parallel data exchange.
74
75 \typedef cs_all_to_all_t
76 Opaque all-to-all distribution structure
77
78 \enum cs_all_to_all_type_t
79
80 \brief All-to-all algorithm selection
81
82 \var CS_ALL_TO_ALL_MPI_DEFAULT
83 Use MPI_Alltoall and MPI_Alltoallv sequences
84
85 \var CS_ALL_TO_ALL_CRYSTAL_ROUTER
86 Use crystal router algorithm
87
88 \paragraph all_to_all_flags Using flags
89 \parblock
90
91 Flags are defined as a sum (bitwise or) of constants, which may include
92 \ref CS_ALL_TO_ALL_USE_DEST_ID, \ref CS_ALL_TO_ALL_ORDER_BY_SRC_RANK,
93 \ref CS_ALL_TO_ALL_NO_REVERSE, and \ref CS_ALL_TO_ALL_NEED_SRC_RANK.
94
95 \endparblock
96 */
97
98 /*! \cond DOXYGEN_SHOULD_SKIP_THIS */
99
100 /* Note for future optimization:
101 for cases where data to be sent is sorted by increasing destination rank,
102 using communication schemes requiring ordering by ranks (such as
103 CS_ALL_TO_ALL_MPI_DEFAULT, we should not need to build an extra array,
104 but only to send the correct parts of the indexed list to the correct
105 ranks, so cs_all_to_all_copy_indexed may internally use an extra array;
106 this could be avoided by adding an additional send option or flag,
107 or adding cs_all_to_copy_... variants for cases ordered by rank */
108
109 /*=============================================================================
110 * Macro definitions
111 *============================================================================*/
112
113 /*=============================================================================
114 * Local type definitions
115 *============================================================================*/
116
117 #if defined(HAVE_MPI)
118
/* Timer / call-counter slots (indices into _all_to_all_timers and
   _all_to_all_calls) */

typedef enum {

  CS_ALL_TO_ALL_TIME_TOTAL,     /* 0: total time over all operations */
  CS_ALL_TO_ALL_TIME_METADATA,  /* 1: time in metadata (counts) exchange */
  CS_ALL_TO_ALL_TIME_EXCHANGE   /* 2: time in main data exchange */

} cs_all_to_all_timer_t;
126
/* Base structure for MPI_Alltoall exchanges */

typedef struct {

  cs_datatype_t   datatype;          /* associated datatype */
  cs_datatype_t   dest_id_datatype;  /* type of destination id (CS_LNUM_TYPE,
                                        or CS_DATATYPE_NULL) */

  size_t          stride;            /* stride if strided, 0 otherwise */

  size_t          elt_shift;         /* starting byte for element data */
  size_t          comp_size;         /* Composite element size, with padding */

  size_t          send_size;         /* Send buffer element count */
  size_t          recv_size;         /* Receive buffer element count */

  const void     *send_buffer;       /* Send buffer (possibly pointing to
                                        user data, or to _send_buffer) */
  unsigned char  *_send_buffer;      /* Send buffer, if owner (NULL when
                                        send_buffer points to user data) */

  int            *send_count;        /* Send counts for MPI_Alltoall */
  int            *recv_count;        /* Receive counts for MPI_Alltoall */
  int            *send_displ;        /* Send displs for MPI_Alltoall */
  int            *recv_displ;        /* Receive displs for MPI_Alltoall */

  int            *recv_count_save;   /* Saved (strided) receive counts for
                                        MPI_Alltoall for indexed exchanges */

  MPI_Comm        comm;              /* Associated MPI communicator */
  MPI_Datatype    comp_type;         /* Associated MPI datatype */

  int             n_ranks;           /* Number of ranks associated with
                                        communicator */

} _mpi_all_to_all_caller_t;
161
/* Base structure for cs_rank_neighbors-based (hybrid PEX) exchanges */

typedef struct {

  cs_rank_neighbors_t  *rn_send;     /* rank neighbors structure for sending */
  cs_rank_neighbors_t  *rn_recv;     /* rank neighbors structure for receiving */

  cs_datatype_t   datatype;          /* associated datatype */
  cs_datatype_t   dest_id_datatype;  /* type of destination id (CS_LNUM_TYPE,
                                        or CS_DATATYPE_NULL) */

  size_t          stride;            /* stride if strided, 0 otherwise */

  size_t          elt_shift;         /* starting byte for element data */
  size_t          comp_size;         /* Composite element size, with padding */

  size_t          send_size;         /* Send buffer element count */
  size_t          recv_size;         /* Receive buffer element count */

  int            *elt_rank_index;    /* Element rank index matching owner
                                        rank_index */

  const void     *send_buffer;       /* Send buffer (possibly pointing to
                                        user data, or to _send_buffer) */
  unsigned char  *_send_buffer;      /* Send buffer, if owner (NULL when
                                        send_buffer points to user data) */

  int            *send_count;        /* Send counts for MPI_Alltoall */
  int            *recv_count;        /* Receive counts for MPI_Alltoall */
  int            *send_displ;        /* Send displs for MPI_Alltoall */
  int            *recv_displ;        /* Receive displs for MPI_Alltoall */

  int            *recv_count_save;   /* Saved (strided) receive counts for
                                        MPI_Alltoall for indexed exchanges */

  MPI_Comm        comm;              /* Associated MPI communicator */
  MPI_Datatype    comp_type;         /* Associated MPI datatype */

} _hybrid_pex_t;
199
200 #endif /* defined(HAVE_MPI) */
201
/* Structure used to redistribute data */

#if defined(HAVE_MPI)

struct _cs_all_to_all_t {

  cs_lnum_t        n_elts_src;    /* Number of source elements */
  cs_lnum_t        n_elts_dest;   /* Number of destination elements
                                     (-1 before metadata available) */
  cs_lnum_t        n_elts_dest_e; /* Number of destination elements
                                     exchanged, independently of
                                     dest_id (-1 before metadata
                                     available) */

  int              flags;         /* option flags */

  /* Send metadata */

  const int       *dest_rank;     /* optional element destination
                                     rank (possibly shared) */
  int             *_dest_rank;    /* dest_rank if owner, or NULL */

  const cs_lnum_t *dest_id;       /* optional element destination id
                                     (possibly shared) */
  cs_lnum_t       *_dest_id;      /* dest_id if owner, or NULL */

  /* Receive metadata */

  cs_lnum_t       *recv_id;       /* received match for dest_id */

  /* Data needed only for Crystal Router reverse communication */

  cs_lnum_t       *src_id;        /* matching source element id;
                                     NOTE(review): original comment here
                                     duplicated recv_id's description */
  int             *src_rank;      /* received source rank */

  /* Sub-structures */

  _mpi_all_to_all_caller_t  *dc;  /* Default MPI_Alltoall(v) caller */
  _hybrid_pex_t             *hc;  /* Hybrid PEX */
  cs_crystal_router_t       *cr;  /* associated crystal-router */

  /* MPI data */

  int              n_ranks;       /* Number of associated ranks */
  MPI_Comm         comm;          /* associated communicator */

  /* Other metadata */

  cs_all_to_all_type_t  type;     /* Communication protocol */

};

#endif /* defined(HAVE_MPI) */
255
256 /*============================================================================
257 * Static global variables
258 *============================================================================*/
259
/* Default all-to-all type (selectable algorithm) */

static cs_all_to_all_type_t _all_to_all_type = CS_ALL_TO_ALL_MPI_DEFAULT;

/* Exchange algorithm used for metadata in hybrid (rank-neighbors) mode */

static cs_rank_neighbors_exchange_t _hybrid_meta_type
  = CS_RANK_NEIGHBORS_CRYSTAL_ROUTER;

#if defined(HAVE_MPI)

/* Call counter and timer: 0: total, 1: metadata comm, 2: data comm */

static size_t              _all_to_all_calls[3] = {0, 0, 0};
static cs_timer_counter_t  _all_to_all_timers[3];

/* Instrumentation, enabled through the CS_ALL_TO_ALL_TRACE environment
   variable; _all_to_all_trace holds records of 9 uint64_t values per
   traced event (timestamp, delta with previous record, event type,
   current and peak memory use, send/recv sizes, send/recv rank counts) */

static int       _n_trace = 0;                    /* records used */
static int       _n_trace_max = 0;                /* records capacity */
static uint64_t *_all_to_all_trace = NULL;        /* trace records */
static FILE     *_all_to_all_trace_bt_log = NULL; /* trace log file */

#endif /* defined(HAVE_MPI) */
282
283 /*============================================================================
 * Local function definitions
285 *============================================================================*/
286
287 #if defined(HAVE_MPI)
288
289 /*----------------------------------------------------------------------------
290 * Common portion of different all-to-all distributor contructors.
291 *
292 * arguments:
293 * n_elts <-- number of elements
294 * flags <-- sum of ordering and metadata flag constants
295 * comm <-- associated MPI communicator
296 *
297 * returns:
298 * pointer to new all-to-all distributor
299 *----------------------------------------------------------------------------*/
300
301 static cs_all_to_all_t *
_all_to_all_create_base(size_t n_elts,int flags,MPI_Comm comm)302 _all_to_all_create_base(size_t n_elts,
303 int flags,
304 MPI_Comm comm)
305 {
306 cs_all_to_all_t *d;
307
308 /* Initialize timers if required */
309
310 if (_all_to_all_calls[0] == 0) {
311 int n_timers = sizeof(_all_to_all_timers)/sizeof(_all_to_all_timers[0]);
312 for (int i = 0; i < n_timers; i++)
313 CS_TIMER_COUNTER_INIT(_all_to_all_timers[i]);
314
315 const char *p = getenv("CS_ALL_TO_ALL_TRACE");
316 if (p != NULL) {
317 if (atoi(p) > 0) {
318 _n_trace_max = atoi(p);
319 bft_printf(_("\n cs_all_2_all_trace: %d.\n\n"), _n_trace_max);
320 BFT_MALLOC(_all_to_all_trace, _n_trace_max*9, uint64_t);
321 _all_to_all_trace_bt_log = fopen("all_to_all_trace_bt.txt", "w");
322 }
323 }
324 }
325
326 /* Check flags */
327
328 if ( (flags & CS_ALL_TO_ALL_USE_DEST_ID)
329 && (flags & CS_ALL_TO_ALL_ORDER_BY_SRC_RANK))
330 bft_error(__FILE__, __LINE__, 0,
331 "%s: flags may not match both\n"
332 "CS_ALL_TO_ALL_USE_DEST_ID and\n"
333 "CS_ALL_TO_ALL_ORDER_BY_SRC_RANK.",
334 __func__);
335
336 /* Allocate structure */
337
338 BFT_MALLOC(d, 1, cs_all_to_all_t);
339
340 /* Create associated sub-structure */
341
342 d->n_elts_src = n_elts;
343 d->n_elts_dest = -1; /* undetermined as yet */
344 d->n_elts_dest_e = -1;
345
346 d->flags = flags;
347
348 d->dest_rank = NULL;
349 d->_dest_rank = NULL;
350
351 d->dest_id = NULL;
352 d->_dest_id = NULL;
353
354 d->recv_id = NULL;
355
356 d->src_id = NULL;
357 d->src_rank = NULL;
358
359 d->cr = NULL;
360 d->hc = NULL;
361 d->dc = NULL;
362
363 d->comm = comm;
364 MPI_Comm_size(comm, &(d->n_ranks));
365
366 d->type = _all_to_all_type;
367
368 return d;
369 }
370
371 /*----------------------------------------------------------------------------
372 * Compute rank displacement based on count.
373 *
374 * arguments:
375 * n_ranks <-- number of ranks
376 * count <-- number of elements per rank (size: n_ranks)
377 * displ --> element displacement in cumulative array (size: n_ranks)
378 *
379 * returns:
380 * cumulative count for all ranks
381 *----------------------------------------------------------------------------*/
382
383 static cs_lnum_t
_compute_displ(int n_ranks,const int count[],int displ[])384 _compute_displ(int n_ranks,
385 const int count[],
386 int displ[])
387 {
388 int i;
389 cs_lnum_t total_count = 0;
390
391 displ[0] = 0;
392
393 for (i = 0; i < n_ranks; i++)
394 displ[i+1] = displ[i] + count[i];
395
396 if (n_ranks > 0)
397 total_count = displ[n_ranks-1] + count[n_ranks-1];
398
399 return total_count;
400 }
401
402 /*----------------------------------------------------------------------------
403 * First stage of creation for an MPI_Alltoall(v) caller for strided data.
404 *
405 * parameters:
406 * flags <-- metadata flags
407 * comm <-- associated MPI communicator
408 *---------------------------------------------------------------------------*/
409
410 static _mpi_all_to_all_caller_t *
_alltoall_caller_create_meta(int flags,MPI_Comm comm)411 _alltoall_caller_create_meta(int flags,
412 MPI_Comm comm)
413 {
414 _mpi_all_to_all_caller_t *dc;
415
416 /* Allocate structure */
417
418 BFT_MALLOC(dc, 1, _mpi_all_to_all_caller_t);
419
420 dc->datatype = CS_DATATYPE_NULL;
421
422 dc->dest_id_datatype = CS_DATATYPE_NULL;
423
424 if (flags & CS_ALL_TO_ALL_USE_DEST_ID)
425 dc->dest_id_datatype = CS_LNUM_TYPE;
426
427 dc->stride = 0;
428
429 dc->send_size = 0;
430 dc->recv_size = 0;
431
432 dc->comm = comm;
433
434 MPI_Comm_size(comm, &(dc->n_ranks));
435
436 dc->send_buffer = NULL;
437 dc->_send_buffer = NULL;
438
439 BFT_MALLOC(dc->send_count, dc->n_ranks, int);
440 BFT_MALLOC(dc->recv_count, dc->n_ranks, int);
441 BFT_MALLOC(dc->send_displ, dc->n_ranks + 1, int);
442 BFT_MALLOC(dc->recv_displ, dc->n_ranks + 1, int);
443 dc->recv_count_save = NULL;
444
445 /* Compute data size and alignment */
446
447 if (dc->dest_id_datatype == CS_LNUM_TYPE)
448 dc->elt_shift = sizeof(cs_lnum_t);
449 else
450 dc->elt_shift = 0;
451
452 size_t align_size = sizeof(cs_lnum_t);
453
454 if (dc->elt_shift % align_size)
455 dc->elt_shift += align_size - (dc->elt_shift % align_size);
456
457 dc->comp_size = dc->elt_shift;
458
459 dc->comp_type = MPI_BYTE;
460
461 /* Return pointer to structure */
462
463 return dc;
464 }
465
466 /*----------------------------------------------------------------------------
467 * Destroy a MPI_Alltoall(v) caller.
468 *
469 * parameters:
470 * dc <-> pointer to pointer to MPI_Alltoall(v) caller structure
471 *---------------------------------------------------------------------------*/
472
473 static void
_alltoall_caller_destroy(_mpi_all_to_all_caller_t ** dc)474 _alltoall_caller_destroy(_mpi_all_to_all_caller_t **dc)
475 {
476 if (dc != NULL) {
477 _mpi_all_to_all_caller_t *_dc = *dc;
478 if (_dc->comp_type != MPI_BYTE)
479 MPI_Type_free(&(_dc->comp_type));
480 BFT_FREE(_dc->_send_buffer);
481 BFT_FREE(_dc->recv_count_save);
482 BFT_FREE(_dc->recv_displ);
483 BFT_FREE(_dc->send_displ);
484 BFT_FREE(_dc->recv_count);
485 BFT_FREE(_dc->send_count);
486 BFT_FREE(*dc);
487 }
488 }
489
/*----------------------------------------------------------------------------
 * Update all MPI_Alltoall(v) caller metadata.
 *
 * Recomputes the composite element layout (optional destination id,
 * padding, element data) and rebuilds the matching MPI datatype.
 *
 * parameters:
 *   dc       <-> distributor caller
 *   datatype <-- associated datatype
 *   stride   <-- associated stride (0 if indexed)
 *---------------------------------------------------------------------------*/

static void
_alltoall_caller_update_meta(_mpi_all_to_all_caller_t  *dc,
                             cs_datatype_t              datatype,
                             int                        stride)
{
  size_t elt_size = cs_datatype_size[datatype]*stride;
  size_t align_size = sizeof(int);

  /* Free previous associated datatype if needed */

  if (dc->comp_type != MPI_BYTE)
    MPI_Type_free(&(dc->comp_type));

  /* Now update metadata */

  dc->datatype = datatype;
  dc->stride = stride;

  /* Recompute data size and alignment: when a destination id is
     carried with the data, it is placed first and the element data
     start (elt_shift) is aligned past it. */

  if (dc->dest_id_datatype == CS_LNUM_TYPE) {
    dc->elt_shift = sizeof(cs_lnum_t);
    if (sizeof(cs_lnum_t) > align_size)
      align_size = sizeof(cs_lnum_t);
  }
  else
    dc->elt_shift = 0;

  if (dc->elt_shift % align_size)
    dc->elt_shift += align_size - (dc->elt_shift % align_size);

  /* Composite size: strided data is padded to the largest alignment;
     indexed data (stride == 0) carries a single datatype value per
     composite element (elt_size is 0 then, so no padding is added). */

  if (stride > 0) {
    if (cs_datatype_size[datatype] > align_size)
      align_size = cs_datatype_size[datatype];
    dc->comp_size = dc->elt_shift + elt_size;
  }
  else
    dc->comp_size = dc->elt_shift + cs_datatype_size[datatype];

  if (elt_size % align_size)
    dc->comp_size += align_size - (elt_size % align_size);

  /* Update associated MPI datatype (one composite element = comp_size
     bytes) */

  MPI_Type_contiguous(dc->comp_size, MPI_BYTE, &(dc->comp_type));
  MPI_Type_commit(&(dc->comp_type));
}
546
547 /*----------------------------------------------------------------------------
548 * Save partial metadata before indexed MPI_Alltoall(v) call.
549 *
550 * parameters:
551 * dc <-> associated MPI_Alltoall(v) caller structure
552 *---------------------------------------------------------------------------*/
553
554 static void
_alltoall_caller_save_meta_i(_mpi_all_to_all_caller_t * dc)555 _alltoall_caller_save_meta_i(_mpi_all_to_all_caller_t *dc)
556 {
557 if (dc->recv_count_save == NULL) {
558 BFT_MALLOC(dc->recv_count_save, dc->n_ranks, int);
559 memcpy(dc->recv_count_save, dc->recv_count, sizeof(int)*(dc->n_ranks));
560 }
561 }
562
563 /*----------------------------------------------------------------------------
564 * Reset metadate to that used for strided exchanges after indexed
565 * MPI_Alltoall(v) call.
566 *
567 * parameters:
568 * dc <-> associated MPI_Alltoall(v) caller structure
569 * dest_rank <-- destination rank (in direct mode), used here to reorder
570 * data in reverse mode.
571 *---------------------------------------------------------------------------*/
572
573 static void
_alltoall_caller_reset_meta_i(_mpi_all_to_all_caller_t * dc,const int dest_rank[])574 _alltoall_caller_reset_meta_i(_mpi_all_to_all_caller_t *dc,
575 const int dest_rank[])
576 {
577 /* Re-count values to send */
578 for (int i = 0; i < dc->n_ranks; i++)
579 dc->send_count[i] = 0;
580 for (size_t j = 0; j < dc->send_size; j++)
581 dc->send_count[dest_rank[j]] += 1;
582
583 /* Revert to saved values for receive */
584 if (dc->recv_count_save != NULL) {
585 memcpy(dc->recv_count, dc->recv_count_save, sizeof(int)*(dc->n_ranks));
586 BFT_FREE(dc->recv_count_save);
587 }
588
589 /* Recompute associated displacements */
590 _compute_displ(dc->n_ranks, dc->send_count, dc->send_displ);
591 _compute_displ(dc->n_ranks, dc->recv_count, dc->recv_displ);
592 }
593
594 /*----------------------------------------------------------------------------
595 * Swap source and destination ranks of all-to-all distributor.
596 *
597 * parameters:
598 * d <-> pointer to associated all-to-all distributor
599 *----------------------------------------------------------------------------*/
600
601 static void
_alltoall_caller_swap_src_dest(_mpi_all_to_all_caller_t * dc)602 _alltoall_caller_swap_src_dest(_mpi_all_to_all_caller_t *dc)
603 {
604 size_t tmp_size[2] = {dc->send_size, dc->recv_size};
605 int *tmp_count = dc->recv_count;
606 int *tmp_displ = dc->recv_displ;
607
608 dc->send_size = tmp_size[1];
609 dc->recv_size = tmp_size[0];
610
611 dc->recv_count = dc->send_count;
612 dc->recv_displ = dc->send_displ;
613
614 dc->send_count = tmp_count;
615 dc->send_displ = tmp_displ;
616 }
617
/*----------------------------------------------------------------------------
 * Prepare a MPI_Alltoall(v) caller for strided data.
 *
 * Packs element data (and optional destination ids) into the send
 * buffer, ordered by destination rank; in reverse mode without recv_id,
 * user data is used in place.
 *
 * parameters:
 *   dc        <-> associated MPI_Alltoall(v) caller structure
 *   n_elts    <-- number of elements
 *   stride    <-- number of values per entity (interlaced)
 *   datatype  <-- type of data considered
 *   reverse   <-- true if reverse mode
 *   data      <-- element values
 *   dest_id   <-- element destination id, or NULL
 *   recv_id   <-- element receive id (for reverse mode), or NULL
 *   dest_rank <-- destination rank for each element
 *---------------------------------------------------------------------------*/

static void
_alltoall_caller_prepare_s(_mpi_all_to_all_caller_t  *dc,
                           size_t                     n_elts,
                           int                        stride,
                           cs_datatype_t              datatype,
                           bool                       reverse,
                           const void                *data,
                           const cs_lnum_t           *dest_id,
                           const cs_lnum_t           *recv_id,
                           const int                  dest_rank[])
{
  int i;

  size_t elt_size = cs_datatype_size[datatype]*stride;

  unsigned const char *_data = data;

  assert(data != NULL || (n_elts == 0 || stride == 0));

  _alltoall_caller_update_meta(dc, datatype, stride);

  /* Allocate send buffer; in reverse mode without receive ids the data
     is already in rank order, so user data is referenced directly. */

  if (reverse && recv_id == NULL) {
    BFT_FREE(dc->_send_buffer);
    dc->send_buffer = data;
  }
  else {
    BFT_REALLOC(dc->_send_buffer, dc->send_size*dc->comp_size, unsigned char);
    dc->send_buffer = dc->_send_buffer;
  }

  /* Copy metadata (destination ids) at the start of each composite
     element; send_displ is advanced per element then reset below. */

  if (dc->dest_id_datatype == CS_LNUM_TYPE) {
    unsigned const char *_dest_id = (unsigned const char *)dest_id;
    const size_t id_size = sizeof(cs_lnum_t);
    for (size_t j = 0; j < n_elts; j++) {
      size_t w_displ = dc->send_displ[dest_rank[j]]*dc->comp_size;
      size_t r_displ = j*id_size;
      dc->send_displ[dest_rank[j]] += 1;
      for (size_t k = 0; k < id_size; k++)
        dc->_send_buffer[w_displ + k] = _dest_id[r_displ + k];
    }
    /* Reset send_displ */
    for (i = 0; i < dc->n_ranks; i++)
      dc->send_displ[i] -= dc->send_count[i];
  }

  /* Copy data; in case of reverse send with destination ids, the
     matching indirection must be applied here. */

  if (!reverse) {
    for (size_t j = 0; j < n_elts; j++) {
      size_t w_displ =   dc->send_displ[dest_rank[j]]*dc->comp_size
                       + dc->elt_shift;
      size_t r_displ = j*elt_size;
      dc->send_displ[dest_rank[j]] += 1;
      for (size_t k = 0; k < elt_size; k++)
        dc->_send_buffer[w_displ + k] = _data[r_displ + k];
    }
    /* Reset send_displ */
    for (i = 0; i < dc->n_ranks; i++)
      dc->send_displ[i] -= dc->send_count[i];
  }
  else if (recv_id != NULL) { /* reverse here: gather through recv_id */
    for (size_t j = 0; j < dc->send_size; j++) {
      size_t w_displ = dc->comp_size*j + dc->elt_shift;
      size_t r_displ = recv_id[j]*elt_size;
      for (size_t k = 0; k < elt_size; k++)
        dc->_send_buffer[w_displ + k] = _data[r_displ + k];
    }
  }
  else { /* If revert and no recv_id, user data was used in place above */
    assert(dc->send_buffer == data);
  }
}
709
/*----------------------------------------------------------------------------
 * Prepare a MPI_Alltoall(v) caller for indexed data.
 *
 * Per-element sub-sizes are taken from src_index/dest_index, so send
 * and receive counts and displacements are rebuilt here in terms of
 * sub-elements (avoiding an additional communication).
 *
 * parameters:
 *   dc         <-> associated MPI_Alltoall(v) caller structure
 *   n_elts     <-- number of elements
 *   datatype   <-- type of data considered
 *   reverse    <-- true if reverse mode
 *   src_index  <-- source index
 *   dest_index <-- destination index
 *   data       <-- element values
 *   recv_id    <-- element receive id (for reverse mode), or NULL
 *   dest_rank  <-- destination rank for each element
 *---------------------------------------------------------------------------*/

static void
_alltoall_caller_prepare_i(_mpi_all_to_all_caller_t  *dc,
                           size_t                     n_elts,
                           cs_datatype_t              datatype,
                           bool                       reverse,
                           const cs_lnum_t            src_index[],
                           const cs_lnum_t            dest_index[],
                           const void                *data,
                           const cs_lnum_t           *recv_id,
                           const int                  dest_rank[])
{
  int i;

  size_t elt_size = cs_datatype_size[datatype];

  unsigned const char *_data = data;

  /* stride 0 indicates indexed mode */

  _alltoall_caller_update_meta(dc, datatype, 0);

  /* Allocate send buffer; in reverse mode without receive ids the data
     is already in rank order, so user data is referenced directly. */

  if (reverse && recv_id == NULL) {
    BFT_FREE(dc->_send_buffer);
    dc->send_buffer = data;
  }
  else {
    size_t n_sub_elts = 0;
    if (reverse == false || recv_id == NULL)
      n_sub_elts = src_index[n_elts];
    else {
      for (size_t j = 0; j < dc->send_size; j++) {
        cs_lnum_t k = recv_id[j];
        n_sub_elts += src_index[k+1] - src_index[k];
      }
    }
    BFT_REALLOC(dc->_send_buffer, n_sub_elts*dc->comp_size, unsigned char);
    dc->send_buffer = dc->_send_buffer;
  }

  /* Update send and receive counts and displacements
     (avoiding additional communication) */

  for (i = 0; i < dc->n_ranks; i++) {
    dc->send_count[i] = 0;
    dc->recv_count[i] = 0;
  }

  /* Send counts: sub-element totals per destination rank */

  if (reverse) {
    if (recv_id != NULL) {
      for (i = 0; i < dc->n_ranks; i++) {
        size_t i_s = dc->send_displ[i];
        size_t i_e = dc->send_displ[i+1];
        for (size_t j = i_s; j < i_e; j++) {
          cs_lnum_t k = recv_id[j];
          cs_lnum_t n_sub_send = src_index[k+1] - src_index[k];
          dc->send_count[i] += n_sub_send;
        }
      }
    }
    else {
      for (i = 0; i < dc->n_ranks; i++) {
        size_t i_s = dc->send_displ[i];
        size_t i_e = dc->send_displ[i+1];
        for (size_t j = i_s; j < i_e; j++) {
          cs_lnum_t n_sub_send = src_index[j+1] - src_index[j];
          dc->send_count[i] += n_sub_send;
        }
      }
    }
  }
  else { /* !reverse */
    for (size_t j = 0; j < dc->send_size; j++) {
      cs_lnum_t n_sub_send = src_index[j+1] - src_index[j];
      dc->send_count[dest_rank[j]] += n_sub_send;
    }
  }

  /* Receive counts: sub-element totals per source rank */

  if (reverse) {
    for (size_t j = 0; j < dc->recv_size; j++) {
      cs_lnum_t n_sub_recv = dest_index[j+1] - dest_index[j];
      dc->recv_count[dest_rank[j]] += n_sub_recv;
    }
  }
  else {
    if (recv_id != NULL) {
      for (i = 0; i < dc->n_ranks; i++) {
        size_t i_s = dc->recv_displ[i];
        size_t i_e = dc->recv_displ[i+1];
        for (size_t j = i_s; j < i_e; j++) {
          cs_lnum_t k = recv_id[j];
          cs_lnum_t n_sub_recv = dest_index[k+1] - dest_index[k];
          dc->recv_count[i] += n_sub_recv;
        }
      }
    }
    else {
      for (i = 0; i < dc->n_ranks; i++) {
        size_t i_s = dc->recv_displ[i];
        size_t i_e = dc->recv_displ[i+1];
        for (size_t j = i_s; j < i_e; j++) {
          cs_lnum_t n_sub_recv = dest_index[j+1] - dest_index[j];
          dc->recv_count[i] += n_sub_recv;
        }
      }
    }
  }

  _compute_displ(dc->n_ranks, dc->send_count, dc->send_displ);
  _compute_displ(dc->n_ranks, dc->recv_count, dc->recv_displ);

  /* Copy data; in case of reverse send with destination ids, the
     matching indirection must be applied here. */

  if (!reverse) {
    /* send_displ is advanced per element while packing, then reset */
    for (size_t j = 0; j < n_elts; j++) {
      cs_lnum_t n_sub_send = src_index[j+1] - src_index[j];
      size_t w_displ = dc->send_displ[dest_rank[j]]*elt_size;
      size_t r_displ = src_index[j]*elt_size;
      size_t sub_size = elt_size*n_sub_send;
      dc->send_displ[dest_rank[j]] += n_sub_send;
      for (size_t l = 0; l < sub_size; l++)
        dc->_send_buffer[w_displ + l] = _data[r_displ + l];
    }
    /* Reset send_displ */
    for (i = 0; i < dc->n_ranks; i++)
      dc->send_displ[i] -= dc->send_count[i];
  }
  else if (recv_id != NULL) { /* reverse here: gather through recv_id */
    size_t w_displ = 0;
    for (size_t j = 0; j < dc->send_size; j++) {
      cs_lnum_t k = recv_id[j];
      cs_lnum_t n_sub_send = src_index[k+1] - src_index[k];
      size_t r_displ = src_index[k]*elt_size;
      size_t sub_size = elt_size*n_sub_send;
      for (size_t l = 0; l < sub_size; l++)
        dc->_send_buffer[w_displ + l] = _data[r_displ + l];
      w_displ += sub_size;
    }

    /* Recompute associated displacements */
    _compute_displ(dc->n_ranks, dc->send_count, dc->send_displ);
    _compute_displ(dc->n_ranks, dc->recv_count, dc->recv_displ);

  }
  else { /* If revert and no recv_id, user data was used in place above */
    assert(dc->send_buffer == data);
  }
}
873
/*----------------------------------------------------------------------------
 * Exchange metadata for an MPI_Alltoall(v) caller.
 *
 * Order of data from a same source rank is preserved.
 *
 * parameters:
 *   dc        <-> associated MPI_Alltoall(v) caller structure
 *   n_elts    <-- number of elements
 *   dest_rank <-- destination rank for each element
 *---------------------------------------------------------------------------*/

static void
_alltoall_caller_exchange_meta(_mpi_all_to_all_caller_t  *dc,
                               size_t                     n_elts,
                               const int                  dest_rank[])
{
  /* Count values to send and receive */

  for (int i = 0; i < dc->n_ranks; i++)
    dc->send_count[i] = 0;

  for (size_t j = 0; j < n_elts; j++)
    dc->send_count[dest_rank[j]] += 1;

  dc->send_size = _compute_displ(dc->n_ranks, dc->send_count, dc->send_displ);

  /* Exchange counts */

  cs_timer_t t0 = cs_timer_time();

  /* Optional trace record before the exchange (9 values per record) */

  if (_n_trace < _n_trace_max) {
    /* Time in units of 1e-5 s */
    _all_to_all_trace[_n_trace*9] = t0.sec*1e5 + t0.nsec/1e4;
    _all_to_all_trace[_n_trace*9+1] = 0;
    _all_to_all_trace[_n_trace*9+2] = 0;
    _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
    _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
    _all_to_all_trace[_n_trace*9+5] = 0;
    _all_to_all_trace[_n_trace*9+6] = 0;
    _all_to_all_trace[_n_trace*9+7] = 0;
    _all_to_all_trace[_n_trace*9+8] = 0;
    _n_trace += 1;
  }

  MPI_Alltoall(dc->send_count, 1, MPI_INT,
               dc->recv_count, 1, MPI_INT,
               dc->comm);

  cs_timer_t t1 = cs_timer_time();
  cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_METADATA,
                            &t0, &t1);

  /* Optional trace record after the exchange, including the elapsed
     time relative to the previous record */

  if (_n_trace < _n_trace_max) {
    /* Time in units of 1e-5 s */
    _all_to_all_trace[_n_trace*9] = t1.sec*1e5 + t1.nsec/1e4;
    _all_to_all_trace[_n_trace*9+1] =   _all_to_all_trace[_n_trace*9]
                                      - _all_to_all_trace[(_n_trace-1)*9];
    _all_to_all_trace[_n_trace*9+2] = 1;
    _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
    _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
    _all_to_all_trace[_n_trace*9+5] = 0;
    _all_to_all_trace[_n_trace*9+6] = 0;
    _all_to_all_trace[_n_trace*9+7] = 0;
    _all_to_all_trace[_n_trace*9+8] = 0;
    _n_trace += 1;
  }

  _all_to_all_calls[CS_ALL_TO_ALL_TIME_METADATA] += 1;

  dc->recv_size = _compute_displ(dc->n_ranks, dc->recv_count, dc->recv_displ);
}
945
946 /*----------------------------------------------------------------------------
947 * Exchange strided data with a MPI_Alltoall(v) caller.
948 *
949 * parameters:
950 * d <-> pointer to associated all-to-all distributor
951 * dc <-> associated MPI_Alltoall(v) caller structure
952 * reverse <-- true if reverse mode
953 * dest_rank <-- destination rank (in direct mode), used here to reorder
954 * data in reverse mode.
955 * dest_data <-> destination data buffer, or NULL
956 *
957 * returns:
958 * pointer to dest_data, or newly allocated buffer
959 *---------------------------------------------------------------------------*/
960
static void *
_alltoall_caller_exchange_s(cs_all_to_all_t           *d,
                            _mpi_all_to_all_caller_t  *dc,
                            bool                       reverse,
                            const int                  dest_rank[],
                            void                      *dest_data)
{
  size_t elt_size = cs_datatype_size[dc->datatype]*dc->stride;
  unsigned char *_dest_data = dest_data, *_recv_data = dest_data;

  /* Final data buffer (allocated here if not provided by the caller;
     ownership then passes to the caller through the return value) */

  if (_dest_data == NULL && dc->recv_size*elt_size > 0)
    BFT_MALLOC(_dest_data, dc->recv_size*elt_size, unsigned char);

  /* Data buffer for MPI exchange (may merge data and metadata);
     a separate receive buffer is needed whenever received components
     carry an embedded destination id, received data must be scattered
     through d->recv_id, or reverse mode requires reordering. */
  if (   dc->dest_id_datatype == CS_LNUM_TYPE || d->recv_id != NULL
      || reverse)
    BFT_MALLOC(_recv_data, dc->recv_size*dc->comp_size, unsigned char);
  else
    _recv_data = _dest_data;

  cs_timer_t t0 = cs_timer_time();

  /* Optional instrumentation: one trace entry of 9 values (timestamp,
     delta vs previous entry, entry type, current and peak process
     memory, send and receive sizes in bytes, number of send and
     receive partner ranks); type 2 marks the pre-exchange state. */

  if (_n_trace < _n_trace_max) {
    int n_r_send = 0, n_r_recv = 0;
    for (int i = 0; i < dc->n_ranks; i++) {
      if (dc->send_count[i] > 0)
        n_r_send += 1;
      if (dc->recv_count[i] > 0)
        n_r_recv += 1;
    }
    /* Time to 1-5 s */
    _all_to_all_trace[_n_trace*9] = t0.sec*1e5 + t0.nsec/1e4;
    _all_to_all_trace[_n_trace*9+1] = 0;
    _all_to_all_trace[_n_trace*9+2] = 2;
    _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
    _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
    _all_to_all_trace[_n_trace*9+5] = dc->send_size*dc->comp_size;
    _all_to_all_trace[_n_trace*9+6] = dc->recv_size*dc->comp_size;
    _all_to_all_trace[_n_trace*9+7] = n_r_send;
    _all_to_all_trace[_n_trace*9+8] = n_r_recv;
    _n_trace += 1;
  }

  MPI_Alltoallv(dc->send_buffer, dc->send_count, dc->send_displ, dc->comp_type,
                _recv_data, dc->recv_count, dc->recv_displ, dc->comp_type,
                dc->comm);

  cs_timer_t t1 = cs_timer_time();
  cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_EXCHANGE,
                            &t0, &t1);
  _all_to_all_calls[CS_ALL_TO_ALL_TIME_EXCHANGE] += 1;

  /* Optional instrumentation: post-exchange entry (type 0), with the
     exchange duration stored as the delta relative to the entry above. */

  if (_n_trace < _n_trace_max) {
    int n_r_send = 0, n_r_recv = 0;
    for (int i = 0; i < dc->n_ranks; i++) {
      if (dc->send_count[i] > 0)
        n_r_send += 1;
      if (dc->recv_count[i] > 0)
        n_r_recv += 1;
    }
    /* Time to 1-5 s */
    _all_to_all_trace[_n_trace*9] = t1.sec*1e5 + t1.nsec/1e4;
    _all_to_all_trace[_n_trace*9+1] = _all_to_all_trace[_n_trace*9]
                                      - _all_to_all_trace[(_n_trace-1)*9];
    _all_to_all_trace[_n_trace*9+2] = 0;
    _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
    _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
    _all_to_all_trace[_n_trace*9+5] = dc->send_size*dc->comp_size;
    _all_to_all_trace[_n_trace*9+6] = dc->recv_size*dc->comp_size;
    _all_to_all_trace[_n_trace*9+7] = n_r_send;
    _all_to_all_trace[_n_trace*9+8] = n_r_recv;
    _n_trace += 1;
  }

  /* dest id datatype only used for first exchange: extract the
     cs_lnum_t id embedded at the head of each received component
     (memcpy used to avoid alignment issues), then deduce the number
     of destination elements from the highest id seen. */

  if (dc->dest_id_datatype == CS_LNUM_TYPE) {
    assert(d->recv_id == NULL);
    BFT_MALLOC(d->recv_id, d->dc->recv_size, cs_lnum_t);
    for (size_t i = 0; i < d->dc->recv_size; i++)
      d->recv_id[i] = -1;
    const unsigned char *sp = _recv_data;
    for (size_t i = 0; i < d->dc->recv_size; i++)
      memcpy(d->recv_id + i,
             sp + d->dc->comp_size*i,
             sizeof(cs_lnum_t));
    dc->dest_id_datatype = CS_DATATYPE_NULL;
    cs_lnum_t dest_id_max = -1;
    for (size_t i = 0; i < d->dc->recv_size; i++) {
      if (d->recv_id[i] > dest_id_max)
        dest_id_max = d->recv_id[i];
      d->n_elts_dest = dest_id_max + 1;
    }
    d->n_elts_dest_e = d->dc->recv_size;
  }

  /* Now handle main data buffer (reverse implies reordering data) */

  if (_dest_data != _recv_data) {
    const unsigned char *sp = _recv_data + d->dc->elt_shift;
    if (d->recv_id != NULL && !reverse) {
      /* Direct mode with ids: scatter each element to the slot given
         by its receive id, stripping per-component metadata. */
      for (size_t i = 0; i < d->dc->recv_size; i++) {
        size_t w_displ = d->recv_id[i]*elt_size;
        size_t r_displ = d->dc->comp_size*i;
        for (size_t j = 0; j < elt_size; j++)
          _dest_data[w_displ + j] = sp[r_displ + j];
      }
    }
    else if (reverse) {
      /* Reverse mode: restore original element order using dest_rank;
         recv_count is reused (and left modified) as a per-rank read
         cursor into each rank's received segment. */
      for (int i = 0; i < dc->n_ranks; i++)
        dc->recv_count[i] = 0;
      for (size_t i = 0; i < d->dc->recv_size; i++) {
        int rank_id = dest_rank[i];
        size_t w_displ = i*elt_size;
        size_t r_displ = (  dc->recv_displ[rank_id]
                          + dc->recv_count[rank_id])*dc->comp_size;
        for (size_t j = 0; j < elt_size; j++)
          _dest_data[w_displ + j] = sp[r_displ + j];
        dc->recv_count[rank_id] += 1;
      }
    }
    else {
      /* No indirection: straight copy, stripping component metadata. */
      for (size_t i = 0; i < d->dc->recv_size; i++) {
        size_t w_displ = i*elt_size;
        size_t r_displ = dc->comp_size*i;
        for (size_t j = 0; j < elt_size; j++)
          _dest_data[w_displ + j] = sp[r_displ + j];
      }
    }
    BFT_FREE(_recv_data);
  }

  return _dest_data;
}
1097
1098 /*----------------------------------------------------------------------------
1099 * Exchange indexed data with a MPI_Alltoall(v) caller.
1100 *
1101 * parameters:
1102 * d <-> pointer to associated all-to-all distributor
1103 * dc <-> associated MPI_Alltoall(v) caller structure
1104 * reverse <-- true if reverse mode
1105 * dest_index <-- destination index
1106 * dest_rank <-- destination rank (in direct mode), used here to reorder
1107 * data in reverse mode.
1108 * dest_data <-> destination data buffer, or NULL
1109 *
1110 * returns:
1111 * pointer to dest_data, or newly allocated buffer
1112 *---------------------------------------------------------------------------*/
1113
static void *
_alltoall_caller_exchange_i(cs_all_to_all_t           *d,
                            _mpi_all_to_all_caller_t  *dc,
                            bool                       reverse,
                            const cs_lnum_t            dest_index[],
                            const int                  dest_rank[],
                            void                      *dest_data)
{
  size_t elt_size = cs_datatype_size[dc->datatype];
  unsigned char *_dest_data = dest_data, *_recv_data = dest_data;

  /* Final data buffer, sized from the destination index
     (allocated here if not provided by the caller) */

  size_t n_elts_dest = (reverse) ? d->n_elts_src : d->n_elts_dest;

  size_t _n_dest_sub = dest_index[n_elts_dest];
  if (_dest_data == NULL && _n_dest_sub*elt_size > 0)
    BFT_MALLOC(_dest_data, _n_dest_sub*elt_size, unsigned char);

  /* Data buffer for MPI exchange (may merge data and metadata);
     separate from the final buffer whenever data must be reordered
     after reception. */

  if (d->recv_id != NULL || reverse) {
    size_t _n_dest_buf = dc->recv_displ[dc->n_ranks] * dc->comp_size;
    BFT_MALLOC(_recv_data, _n_dest_buf, unsigned char);
  }
  else
    _recv_data = _dest_data;

  cs_timer_t t0 = cs_timer_time();

  /* Optional instrumentation: one trace entry of 9 values (timestamp,
     delta, entry type, current and peak process memory, send/receive
     sizes in bytes, number of send/receive partner ranks); type 2
     marks the pre-exchange state. */

  if (_n_trace < _n_trace_max) {
    int n_r_send = 0, n_r_recv = 0;
    for (int i = 0; i < dc->n_ranks; i++) {
      if (dc->send_count[i] > 0)
        n_r_send += 1;
      if (dc->recv_count[i] > 0)
        n_r_recv += 1;
    }
    /* Time to 1-5 s */
    _all_to_all_trace[_n_trace*9] = t0.sec*1e5 + t0.nsec/1e4;
    _all_to_all_trace[_n_trace*9+1] = 0;
    _all_to_all_trace[_n_trace*9+2] = 2;
    _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
    _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
    _all_to_all_trace[_n_trace*9+5] = dc->send_size*dc->comp_size;
    _all_to_all_trace[_n_trace*9+6] = dc->recv_size*dc->comp_size;
    _all_to_all_trace[_n_trace*9+7] = n_r_send;
    _all_to_all_trace[_n_trace*9+8] = n_r_recv;
    _n_trace += 1;
  }

  MPI_Alltoallv(dc->send_buffer, dc->send_count, dc->send_displ, dc->comp_type,
                _recv_data, dc->recv_count, dc->recv_displ, dc->comp_type,
                dc->comm);

  cs_timer_t t1 = cs_timer_time();
  cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_EXCHANGE,
                            &t0, &t1);
  _all_to_all_calls[CS_ALL_TO_ALL_TIME_EXCHANGE] += 1;

  /* Optional instrumentation: post-exchange entry (type 0), the delta
     field holding the exchange duration. */

  if (_n_trace < _n_trace_max) {
    int n_r_send = 0, n_r_recv = 0;
    for (int i = 0; i < dc->n_ranks; i++) {
      if (dc->send_count[i] > 0)
        n_r_send += 1;
      if (dc->recv_count[i] > 0)
        n_r_recv += 1;
    }
    /* Time to 1-5 s */
    _all_to_all_trace[_n_trace*9] = t1.sec*1e5 + t1.nsec/1e4;
    _all_to_all_trace[_n_trace*9+1] = _all_to_all_trace[_n_trace*9]
                                      - _all_to_all_trace[(_n_trace-1)*9];
    _all_to_all_trace[_n_trace*9+2] = 0;
    _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
    _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
    _all_to_all_trace[_n_trace*9+5] = dc->send_size*dc->comp_size;
    _all_to_all_trace[_n_trace*9+6] = dc->recv_size*dc->comp_size;
    _all_to_all_trace[_n_trace*9+7] = n_r_send;
    _all_to_all_trace[_n_trace*9+8] = n_r_recv;
    _n_trace += 1;
  }

  /* Handle main data buffer (reverse implies reordering data) */

  if (_dest_data != _recv_data) {
    const unsigned char *sp = _recv_data;
    if (d->recv_id != NULL && !reverse) {
      /* Scatter each received sub-array to the index slot given by its
         receive id; data is read sequentially from the receive buffer. */
      size_t r_displ = 0;
      for (size_t i = 0; i < dc->recv_size; i++) {
        cs_lnum_t k = d->recv_id[i];
        cs_lnum_t n_sub_recv = dest_index[k+1] - dest_index[k];
        size_t w_displ = dest_index[k]*elt_size;
        size_t sub_size = n_sub_recv*elt_size;
        for (size_t l = 0; l < sub_size; l++)
          _dest_data[w_displ + l] = sp[r_displ + l];
        r_displ += sub_size;
      }
    }
    else if (reverse) {
      /* Restore original element order using dest_rank; recv_count is
         reused (and left modified) as a per-rank read cursor, counted
         in sub-elements here. */
      for (int i = 0; i < dc->n_ranks; i++)
        dc->recv_count[i] = 0;
      for (size_t i = 0; i < d->dc->recv_size; i++) {
        int rank_id = dest_rank[i];
        size_t w_displ = dest_index[i]*elt_size;
        size_t r_displ = (  dc->recv_displ[rank_id]
                          + dc->recv_count[rank_id])*elt_size;
        cs_lnum_t n_sub_recv = dest_index[i+1] - dest_index[i];
        size_t sub_size = n_sub_recv*elt_size;
        for (size_t l = 0; l < sub_size; l++)
          _dest_data[w_displ + l] = sp[r_displ + l];
        dc->recv_count[rank_id] += n_sub_recv;
      }
    }
    else {
      /* NOTE(review): unreachable given the enclosing condition
         (_dest_data != _recv_data); kept as a sanity check. */
      assert(_dest_data == _recv_data);
    }
    BFT_FREE(_recv_data);
  }

  return _dest_data;
}
1235
1236 /*----------------------------------------------------------------------------
1237 * First stage of creation for a hybrid caller for strided data.
1238 *
1239 * parameters:
1240 * flags <-- metadata flags
1241 * comm <-- associated MPI communicator
1242 *---------------------------------------------------------------------------*/
1243
1244 static _hybrid_pex_t *
_hybrid_pex_create_meta(int flags,MPI_Comm comm)1245 _hybrid_pex_create_meta(int flags,
1246 MPI_Comm comm)
1247 {
1248 _hybrid_pex_t *hc;
1249
1250 /* Allocate structure */
1251
1252 BFT_MALLOC(hc, 1, _hybrid_pex_t);
1253
1254 hc->rn_send = NULL;
1255 hc->rn_recv = NULL;
1256
1257 hc->datatype = CS_DATATYPE_NULL;
1258
1259 hc->dest_id_datatype = CS_DATATYPE_NULL;
1260
1261 if (flags & CS_ALL_TO_ALL_USE_DEST_ID)
1262 hc->dest_id_datatype = CS_LNUM_TYPE;
1263
1264 hc->stride = 0;
1265
1266 hc->send_size = 0;
1267 hc->recv_size = 0;
1268
1269 hc->comm = comm;
1270
1271 hc->elt_rank_index = NULL;
1272
1273 hc->send_buffer = NULL;
1274 hc->_send_buffer = NULL;
1275
1276 hc->send_count = NULL;
1277 hc->recv_count = NULL;
1278 hc->send_displ = NULL;
1279 hc->recv_displ = NULL;
1280 hc->recv_count_save = NULL;
1281
1282 /* Compute data size and alignment */
1283
1284 if (hc->dest_id_datatype == CS_LNUM_TYPE)
1285 hc->elt_shift = sizeof(cs_lnum_t);
1286 else
1287 hc->elt_shift = 0;
1288
1289 size_t align_size = sizeof(cs_lnum_t);
1290
1291 if (hc->elt_shift % align_size)
1292 hc->elt_shift += align_size - (hc->elt_shift % align_size);
1293
1294 hc->comp_size = hc->elt_shift;
1295
1296 hc->comp_type = MPI_BYTE;
1297
1298 /* Return pointer to structure */
1299
1300 return hc;
1301 }
1302
1303 /*----------------------------------------------------------------------------
1304 * Destroy a hybrid caller.
1305 *
1306 * parameters:
1307 * hc <-> pointer to pointer to MPI_Alltoall(v) caller structure
1308 *---------------------------------------------------------------------------*/
1309
1310 static void
_hybrid_pex_destroy(_hybrid_pex_t ** hc)1311 _hybrid_pex_destroy(_hybrid_pex_t **hc)
1312 {
1313 if (hc != NULL) {
1314 _hybrid_pex_t *_hc = *hc;
1315 if (_hc->comp_type != MPI_BYTE)
1316 MPI_Type_free(&(_hc->comp_type));
1317 BFT_FREE(_hc->elt_rank_index);
1318 BFT_FREE(_hc->_send_buffer);
1319 BFT_FREE(_hc->recv_count_save);
1320 BFT_FREE(_hc->recv_displ);
1321 BFT_FREE(_hc->send_displ);
1322 BFT_FREE(_hc->recv_count);
1323 BFT_FREE(_hc->send_count);
1324
1325 cs_rank_neighbors_destroy(&(_hc->rn_send));
1326 cs_rank_neighbors_destroy(&(_hc->rn_recv));
1327
1328 BFT_FREE(*hc);
1329 }
1330 }
1331
1332 /*----------------------------------------------------------------------------
1333 * Update all hybrid caller metadata.
1334 *
1335 * parameters:
1336 * hc <-> distributor caller
1337 * datatype <-- associated datatype
1338 * stride <-- associated stride (0 if indexed)
1339 *---------------------------------------------------------------------------*/
1340
static void
_hybrid_pex_update_meta(_hybrid_pex_t  *hc,
                        cs_datatype_t   datatype,
                        int             stride)
{
  size_t elt_size = cs_datatype_size[datatype]*stride;
  size_t align_size = sizeof(int);

  /* Free previous associated datatype if needed */

  if (hc->comp_type != MPI_BYTE)
    MPI_Type_free(&(hc->comp_type));

  /* Now update metadata */

  hc->datatype = datatype;
  hc->stride = stride;

  /* Recompute data size and alignment: an optional embedded
     destination id occupies the head of each component, followed by
     padding so the data part starts suitably aligned. */

  if (hc->dest_id_datatype == CS_LNUM_TYPE) {
    hc->elt_shift = sizeof(cs_lnum_t);
    if (sizeof(cs_lnum_t) > align_size)
      align_size = sizeof(cs_lnum_t);
  }
  else
    hc->elt_shift = 0;

  if (hc->elt_shift % align_size)
    hc->elt_shift += align_size - (hc->elt_shift % align_size);

  /* Component size: id shift plus data; a stride of 0 indicates
     indexed data, exchanged one value at a time. */

  if (stride > 0) {
    if (cs_datatype_size[datatype] > align_size)
      align_size = cs_datatype_size[datatype];
    hc->comp_size = hc->elt_shift + elt_size;
  }
  else
    hc->comp_size = hc->elt_shift + cs_datatype_size[datatype];

  /* Pad trailing data so successive components stay aligned. */

  if (elt_size % align_size)
    hc->comp_size += align_size - (elt_size % align_size);

  /* Update associated MPI datatype: one contiguous block of
     comp_size bytes per exchanged component. */

  MPI_Type_contiguous(hc->comp_size, MPI_BYTE, &(hc->comp_type));
  MPI_Type_commit(&(hc->comp_type));
}
1388
1389 /*----------------------------------------------------------------------------
1390 * Save partial metadata before indexed hybrid call.
1391 *
1392 * parameters:
1393 * hc <-> associated hybrid caller structure
1394 *---------------------------------------------------------------------------*/
1395
1396 static void
_hybrid_pex_save_meta_i(_hybrid_pex_t * hc)1397 _hybrid_pex_save_meta_i(_hybrid_pex_t *hc)
1398 {
1399 if (hc->recv_count_save == NULL) {
1400 int n_n_ranks = hc->rn_recv->size;
1401 BFT_MALLOC(hc->recv_count_save, n_n_ranks, int);
1402 memcpy(hc->recv_count_save, hc->recv_count, sizeof(int)*(n_n_ranks));
1403 }
1404 }
1405
1406 /*----------------------------------------------------------------------------
1407 * Reset metadate to that used for strided exchanges after indexed
1408 * hybrid call.
1409 *
1410 * parameters:
1411 * hc <-> associated hybrid caller structure
1412 *---------------------------------------------------------------------------*/
1413
1414 static void
_hybrid_pex_reset_meta_i(_hybrid_pex_t * hc)1415 _hybrid_pex_reset_meta_i(_hybrid_pex_t *hc)
1416 {
1417 int n_s_ranks = hc->rn_send->size;
1418 int n_r_ranks = hc->rn_recv->size;
1419
1420 const int *dest_rank_index = hc->elt_rank_index;
1421
1422 /* Re-count values to send */
1423 for (int i = 0; i < n_s_ranks; i++)
1424 hc->send_count[i] = 0;
1425 for (size_t j = 0; j < hc->send_size; j++)
1426 hc->send_count[dest_rank_index[j]] += 1;
1427
1428 /* Revert to saved values for receive */
1429 if (hc->recv_count_save != NULL) {
1430 memcpy(hc->recv_count, hc->recv_count_save, sizeof(int)*(n_r_ranks));
1431 BFT_FREE(hc->recv_count_save);
1432 }
1433
1434 /* Recompute associated displacements */
1435 _compute_displ(n_s_ranks, hc->send_count, hc->send_displ);
1436 _compute_displ(n_r_ranks, hc->recv_count, hc->recv_displ);
1437 }
1438
1439 /*----------------------------------------------------------------------------
1440 * Swap source and destination ranks of all-to-all distributor.
1441 *
1442 * parameters:
1443 * d <-> pointer to associated all-to-all distributor
1444 *----------------------------------------------------------------------------*/
1445
1446 static void
_hybrid_pex_swap_src_dest(_hybrid_pex_t * hc)1447 _hybrid_pex_swap_src_dest(_hybrid_pex_t *hc)
1448 {
1449 size_t tmp_size[2] = {hc->send_size, hc->recv_size};
1450 int *tmp_count = hc->recv_count;
1451 int *tmp_displ = hc->recv_displ;
1452 cs_rank_neighbors_t *tmp_rn = hc->rn_send;
1453
1454 hc->rn_send = hc->rn_recv;
1455 hc->rn_recv = tmp_rn;
1456
1457 hc->send_size = tmp_size[1];
1458 hc->recv_size = tmp_size[0];
1459
1460 hc->recv_count = hc->send_count;
1461 hc->recv_displ = hc->send_displ;
1462
1463 hc->send_count = tmp_count;
1464 hc->send_displ = tmp_displ;
1465 }
1466
1467 /*----------------------------------------------------------------------------
1468 * Prepare a hybrid caller for strided data.
1469 *
1470 * parameters:
1471 * n_elts <-- number of elements
1472 * stride <-- number of values per entity (interlaced)
1473 * datatype <-- type of data considered
1474 * reverse <-- true if reverse mode
1475 * data <-- element values
1476 * dest_id <-- element destination id, or NULL
1477 * recv_id <-- element receive id (for reverse mode), or NULL
1478 *---------------------------------------------------------------------------*/
1479
static void
_hybrid_pex_prepare_s(_hybrid_pex_t    *hc,
                      size_t            n_elts,
                      int               stride,
                      cs_datatype_t     datatype,
                      bool              reverse,
                      const void       *data,
                      const cs_lnum_t  *dest_id,
                      const cs_lnum_t  *recv_id)
{
  int i;

  size_t elt_size = cs_datatype_size[datatype]*stride;

  unsigned const char *_data = data;

  assert(data != NULL || (n_elts == 0 || stride == 0));

  _hybrid_pex_update_meta(hc, datatype, stride);

  /* Allocate send buffer */

  if (reverse && recv_id == NULL) {
    /* Reverse mode with no receive-id indirection: data is already in
       send order, so it can be sent in place without copying. */
    BFT_FREE(hc->_send_buffer);
    hc->send_buffer = data;
  }
  else {
    BFT_REALLOC(hc->_send_buffer, hc->send_size*hc->comp_size, unsigned char);
    hc->send_buffer = hc->_send_buffer;
  }

  const int *dest_rank_index = hc->elt_rank_index;

  /* Copy metadata: embed each element's destination id at the head of
     its component; send_displ is used as a per-rank write cursor and
     reset to its original values afterwards. Byte-wise copy avoids
     alignment constraints on the packed buffer. */

  if (hc->dest_id_datatype == CS_LNUM_TYPE) {
    unsigned const char *_dest_id = (unsigned const char *)dest_id;
    const size_t id_size = sizeof(cs_lnum_t);
    for (size_t j = 0; j < n_elts; j++) {
      size_t w_displ = hc->send_displ[dest_rank_index[j]]*hc->comp_size;
      size_t r_displ = j*id_size;
      hc->send_displ[dest_rank_index[j]] += 1;
      for (size_t k = 0; k < id_size; k++)
        hc->_send_buffer[w_displ + k] = _dest_id[r_displ + k];
    }
    /* Reset send_displ */
    int n_s_ranks = hc->rn_send->size;
    for (i = 0; i < n_s_ranks; i++)
      hc->send_displ[i] -= hc->send_count[i];
  }

  /* Copy data; in case of reverse send with destination ids, the
     matching indirection must be applied here. */

  if (!reverse) {
    /* Pack elements by destination rank, again using send_displ as a
       write cursor, with data placed after the elt_shift metadata. */
    for (size_t j = 0; j < n_elts; j++) {
      size_t w_displ = hc->send_displ[dest_rank_index[j]]*hc->comp_size
                       + hc->elt_shift;
      size_t r_displ = j*elt_size;
      hc->send_displ[dest_rank_index[j]] += 1;
      for (size_t k = 0; k < elt_size; k++)
        hc->_send_buffer[w_displ + k] = _data[r_displ + k];
    }
    /* Reset send_displ */
    int n_s_ranks = hc->rn_send->size;
    for (i = 0; i < n_s_ranks; i++)
      hc->send_displ[i] -= hc->send_count[i];
  }
  else if (recv_id != NULL) { /* reverse here */
    /* Gather source elements through the recv_id indirection into
       send order (buffer is already ordered by rank here). */
    for (size_t j = 0; j < hc->send_size; j++) {
      size_t w_displ = hc->comp_size*j + hc->elt_shift;
      size_t r_displ = recv_id[j]*elt_size;
      for (size_t k = 0; k < elt_size; k++)
        hc->_send_buffer[w_displ + k] = _data[r_displ + k];
    }
  }
  else { /* If revert and no recv_id */
    assert(hc->send_buffer == data);
  }
}
1560
1561 /*----------------------------------------------------------------------------
1562 * Prepare a hybrid caller for indexed data.
1563 *
1564 * parameters:
1565 * n_elts <-- number of elements
1566 * datatype <-- type of data considered
1567 * reverse <-- true if reverse mode
1568 * src_index <-- source index
1569 * dest_index <-- destination index
1570 * data <-- element values
1571 * recv_id <-- element receive id (for reverse mode), or NULL
1572 *---------------------------------------------------------------------------*/
1573
static void
_hybrid_pex_prepare_i(_hybrid_pex_t    *hc,
                      size_t            n_elts,
                      cs_datatype_t     datatype,
                      bool              reverse,
                      const cs_lnum_t   src_index[],
                      const cs_lnum_t   dest_index[],
                      const void       *data,
                      const cs_lnum_t  *recv_id)
{
  int i;

  size_t elt_size = cs_datatype_size[datatype];

  unsigned const char *_data = data;

  /* Stride 0 marks indexed data (one value per component). */

  _hybrid_pex_update_meta(hc, datatype, 0);

  /* Allocate send buffer */

  if (reverse && recv_id == NULL) {
    /* Reverse mode with no receive-id indirection: data is already in
       send order, so it can be sent in place without copying. */
    BFT_FREE(hc->_send_buffer);
    hc->send_buffer = data;
  }
  else {
    /* Size the buffer from the total number of sub-elements sent,
       applying the recv_id indirection when present. */
    size_t n_sub_elts = 0;
    if (reverse == false || recv_id == NULL)
      n_sub_elts = src_index[n_elts];
    else {
      for (size_t j = 0; j < hc->send_size; j++) {
        cs_lnum_t k = recv_id[j];
        n_sub_elts += src_index[k+1] - src_index[k];
      }
    }
    BFT_REALLOC(hc->_send_buffer, n_sub_elts*hc->comp_size, unsigned char);
    hc->send_buffer = hc->_send_buffer;
  }

  /* Update send and receive counts and displacements
     (avoiding additional communication) */

  const int n_s_ranks = hc->rn_send->size;
  const int n_r_ranks = hc->rn_recv->size;

  for (i = 0; i < n_s_ranks; i++)
    hc->send_count[i] = 0;

  for (i = 0; i < n_r_ranks; i++)
    hc->recv_count[i] = 0;

  const int *dest_rank_index = hc->elt_rank_index;

  /* Send counts: sum sub-element counts per destination rank.  The
     displacements from the preceding strided (metadata) exchange still
     delimit per-rank element ranges at this point. */

  if (reverse) {
    if (recv_id != NULL) {
      for (i = 0; i < n_s_ranks; i++) {
        size_t i_s = hc->send_displ[i];
        size_t i_e = hc->send_displ[i+1];
        for (size_t j = i_s; j < i_e; j++) {
          cs_lnum_t k = recv_id[j];
          cs_lnum_t n_sub_send = src_index[k+1] - src_index[k];
          hc->send_count[i] += n_sub_send;
        }
      }
    }
    else {
      for (i = 0; i < n_s_ranks; i++) {
        size_t i_s = hc->send_displ[i];
        size_t i_e = hc->send_displ[i+1];
        for (size_t j = i_s; j < i_e; j++) {
          cs_lnum_t n_sub_send = src_index[j+1] - src_index[j];
          hc->send_count[i] += n_sub_send;
        }
      }
    }
  }
  else { /* !reverse */
    for (size_t j = 0; j < hc->send_size; j++) {
      cs_lnum_t n_sub_send = src_index[j+1] - src_index[j];
      hc->send_count[dest_rank_index[j]] += n_sub_send;
    }
  }

  /* Receive counts, mirroring the send-count logic on the
     destination index. */

  if (reverse) {
    for (size_t j = 0; j < hc->recv_size; j++) {
      cs_lnum_t n_sub_recv = dest_index[j+1] - dest_index[j];
      hc->recv_count[dest_rank_index[j]] += n_sub_recv;
    }
  }
  else {
    if (recv_id != NULL) {
      for (i = 0; i < n_r_ranks; i++) {
        size_t i_s = hc->recv_displ[i];
        size_t i_e = hc->recv_displ[i+1];
        for (size_t j = i_s; j < i_e; j++) {
          cs_lnum_t k = recv_id[j];
          cs_lnum_t n_sub_recv = dest_index[k+1] - dest_index[k];
          hc->recv_count[i] += n_sub_recv;
        }
      }
    }
    else {
      for (i = 0; i < n_r_ranks; i++) {
        size_t i_s = hc->recv_displ[i];
        size_t i_e = hc->recv_displ[i+1];
        for (size_t j = i_s; j < i_e; j++) {
          cs_lnum_t n_sub_recv = dest_index[j+1] - dest_index[j];
          hc->recv_count[i] += n_sub_recv;
        }
      }
    }
  }

  /* Switch displacements from element-based to sub-element-based. */

  _compute_displ(n_s_ranks, hc->send_count, hc->send_displ);
  _compute_displ(n_r_ranks, hc->recv_count, hc->recv_displ);

  /* Copy data; in case of reverse send with destination ids, the
     matching indirection must be applied here. */

  if (!reverse) {
    /* Pack sub-arrays by destination rank; send_displ is used as a
       per-rank write cursor and reset afterwards. */
    for (size_t j = 0; j < n_elts; j++) {
      cs_lnum_t n_sub_send = src_index[j+1] - src_index[j];
      size_t w_displ = hc->send_displ[dest_rank_index[j]]*elt_size;
      size_t r_displ = src_index[j]*elt_size;
      size_t sub_size = elt_size*n_sub_send;
      hc->send_displ[dest_rank_index[j]] += n_sub_send;
      for (size_t l = 0; l < sub_size; l++)
        hc->_send_buffer[w_displ + l] = _data[r_displ + l];
    }
    /* Reset send_displ */
    for (i = 0; i < n_s_ranks; i++)
      hc->send_displ[i] -= hc->send_count[i];
  }
  else if (recv_id != NULL) { /* reverse here */
    /* Gather sub-arrays through the recv_id indirection, writing the
       send buffer sequentially. */
    size_t w_displ = 0;
    for (size_t j = 0; j < hc->send_size; j++) {
      cs_lnum_t k = recv_id[j];
      cs_lnum_t n_sub_send = src_index[k+1] - src_index[k];
      size_t r_displ = src_index[k]*elt_size;
      size_t sub_size = elt_size*n_sub_send;
      for (size_t l = 0; l < sub_size; l++)
        hc->_send_buffer[w_displ + l] = _data[r_displ + l];
      w_displ += sub_size;
    }

    /* Recompute associated displacements */
    _compute_displ(n_s_ranks, hc->send_count, hc->send_displ);
    _compute_displ(n_r_ranks, hc->recv_count, hc->recv_displ);

  }
  else { /* If revert and no recv_id */
    assert(hc->send_buffer == data);
  }
}
1728
1729 /*----------------------------------------------------------------------------
1730 * Exchange metadata for an hybrid caller.
1731 *
1732 * Order of data from a same source rank is preserved.
1733 *
1734 * parameters:
1735 * hc <-> associated hybrid caller structure
1736 * n_elts <-- number of elements
1737 * dest_rank <-- destination rank for each element
1738 *---------------------------------------------------------------------------*/
1739
static void
_hybrid_pex_exchange_meta(_hybrid_pex_t  *hc,
                          size_t          n_elts,
                          const int       dest_rank[])
{
  /* First call only: build the send-side rank neighborhood and the
     per-element index into it; both are reused on subsequent calls. */

  if (hc->rn_send == NULL) {
    hc->rn_send = cs_rank_neighbors_create(n_elts, dest_rank);

    BFT_MALLOC(hc->elt_rank_index, n_elts, int);
    BFT_MALLOC(hc->send_count, hc->rn_send->size, int);
    BFT_MALLOC(hc->send_displ, hc->rn_send->size + 1, int);

    cs_rank_neighbors_to_index(hc->rn_send,
                               n_elts,
                               dest_rank,
                               hc->elt_rank_index);

  }

  /* Count values to send and receive */

  cs_rank_neighbors_count(hc->rn_send,
                          n_elts,
                          hc->elt_rank_index,
                          hc->send_count);

  hc->send_size = _compute_displ(hc->rn_send->size,
                                 hc->send_count,
                                 hc->send_displ);

  /* Exchange counts */

  cs_timer_t t0 = cs_timer_time();

  /* Optional instrumentation: trace entry of 9 values (timestamp,
     delta, entry type, current and peak process memory, sizes and
     rank counts, unused here); type 0 marks the pre-metadata state. */

  if (_n_trace < _n_trace_max) {
    /* Time to 1-5 s */
    _all_to_all_trace[_n_trace*9] = t0.sec*1e5 + t0.nsec/1e4;
    _all_to_all_trace[_n_trace*9+1] = 0;
    _all_to_all_trace[_n_trace*9+2] = 0;
    _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
    _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
    _all_to_all_trace[_n_trace*9+5] = 0;
    _all_to_all_trace[_n_trace*9+6] = 0;
    _all_to_all_trace[_n_trace*9+7] = 0;
    _all_to_all_trace[_n_trace*9+8] = 0;
    _n_trace += 1;
  }

  assert(hc->rn_recv == NULL);

  /* NOTE(review): the assert above makes this branch unreachable in
     debug builds; kept as defensive cleanup for release builds. */

  if (hc->rn_recv != NULL) {
    cs_rank_neighbors_destroy(&(hc->rn_recv));
    BFT_FREE(hc->recv_count);
    BFT_FREE(hc->recv_displ);
  }

  /* Determine the receive-side neighborhood and counts from the
     send side (allocates rn_recv and recv_count). */

  cs_rank_neighbors_sync_count_m(hc->rn_send,
                                 &(hc->rn_recv),
                                 hc->send_count,
                                 &(hc->recv_count),
                                 _hybrid_meta_type,
                                 hc->comm);

  cs_timer_t t1 = cs_timer_time();
  cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_METADATA,
                            &t0, &t1);

  /* Post-metadata trace entry (type 1); the delta field holds the
     metadata exchange duration. */

  if (_n_trace < _n_trace_max) {
    /* Time to 1-5 s */
    _all_to_all_trace[_n_trace*9] = t1.sec*1e5 + t1.nsec/1e4;
    _all_to_all_trace[_n_trace*9+1] = _all_to_all_trace[_n_trace*9]
                                      - _all_to_all_trace[(_n_trace-1)*9];
    _all_to_all_trace[_n_trace*9+2] = 1;
    _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
    _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
    _all_to_all_trace[_n_trace*9+5] = 0;
    _all_to_all_trace[_n_trace*9+6] = 0;
    _all_to_all_trace[_n_trace*9+7] = 0;
    _all_to_all_trace[_n_trace*9+8] = 0;
    _n_trace += 1;
  }

  _all_to_all_calls[CS_ALL_TO_ALL_TIME_METADATA] += 1;

  BFT_MALLOC(hc->recv_displ, hc->rn_recv->size + 1, int);

  hc->recv_size = _compute_displ(hc->rn_recv->size,
                                 hc->recv_count,
                                 hc->recv_displ);
}
1830
1831 /*----------------------------------------------------------------------------*/
1832 /*!
1833 * \brief Exchange indexed data with a hybrid caller.
1834 *
1835 * \param[in] hc associated hybrid caller structure
1836 * \param[in] sendbuf starting address of send buffer.
1837 * \param[out] recvbuf starting address of receive buffer.
1838 */
1839 /*----------------------------------------------------------------------------*/
1840
1841 static void
_hybrid_alltoallv(_hybrid_pex_t * hc,const void * sendbuf,void * recvbuf)1842 _hybrid_alltoallv(_hybrid_pex_t *hc,
1843 const void *sendbuf,
1844 void *recvbuf)
1845 {
1846 /* Currently available: MPI_Alltoallv */
1847
1848 if (true) {
1849 int n_ranks;
1850 MPI_Comm_size(hc->comm, &n_ranks);
1851
1852 int *send_count, *send_displ, *recv_count, *recv_displ;
1853
1854 BFT_MALLOC(send_count, n_ranks, int);
1855 BFT_MALLOC(send_displ, n_ranks+1, int);
1856 BFT_MALLOC(recv_count, n_ranks, int);
1857 BFT_MALLOC(recv_displ, n_ranks+1, int);
1858
1859 const int n_s_ranks = hc->rn_send->size;
1860 const int n_r_ranks = hc->rn_recv->size;
1861 const int *s_rank = hc->rn_send->rank;
1862 const int *r_rank = hc->rn_recv->rank;
1863
1864 for (int i = 0; i < n_ranks; i++) {
1865 send_count[i] = 0;
1866 recv_count[i] = 0;
1867 }
1868 for (int i = 0; i < n_s_ranks; i++)
1869 send_count[s_rank[i]] = hc->send_displ[i+1] - hc->send_displ[i];
1870 for (int i = 0; i < n_r_ranks; i++)
1871 recv_count[r_rank[i]] = hc->recv_displ[i+1] - hc->recv_displ[i];
1872
1873 _compute_displ(n_ranks, send_count, send_displ);
1874 _compute_displ(n_ranks, recv_count, recv_displ);
1875
1876 MPI_Alltoallv(sendbuf, send_count, send_displ, hc->comp_type,
1877 recvbuf, recv_count, recv_displ, hc->comp_type,
1878 hc->comm);
1879
1880 BFT_FREE(recv_displ);
1881 BFT_FREE(recv_count);
1882 BFT_FREE(send_displ);
1883 BFT_FREE(send_count);
1884 }
1885 }
1886
1887 /*----------------------------------------------------------------------------
1888 * Exchange strided data with a hybrid caller.
1889 *
1890 * parameters:
1891 * d <-> pointer to associated all-to-all distributor
1892 * hc <-> associated hybrid caller structure
1893 * reverse <-- true if reverse mode
1894 * dest_data <-> destination data buffer, or NULL
1895 *
1896 * returns:
1897 * pointer to dest_data, or newly allocated buffer
1898 *---------------------------------------------------------------------------*/
1899
1900 static void *
_hybrid_pex_exchange_s(cs_all_to_all_t * d,_hybrid_pex_t * hc,bool reverse,void * dest_data)1901 _hybrid_pex_exchange_s(cs_all_to_all_t *d,
1902 _hybrid_pex_t *hc,
1903 bool reverse,
1904 void *dest_data)
1905 {
1906 size_t elt_size = cs_datatype_size[hc->datatype]*hc->stride;
1907 unsigned char *_dest_data = dest_data, *_recv_data = dest_data;
1908
1909 /* Final data buffer */
1910
1911 if (_dest_data == NULL && hc->recv_size*elt_size > 0)
1912 BFT_MALLOC(_dest_data, hc->recv_size*elt_size, unsigned char);
1913
1914 /* Data buffer for MPI exchange (may merge data and metadata) */
1915 if ( hc->dest_id_datatype == CS_LNUM_TYPE || d->recv_id != NULL
1916 || reverse)
1917 BFT_MALLOC(_recv_data, hc->recv_size*hc->comp_size, unsigned char);
1918 else
1919 _recv_data = _dest_data;
1920
1921 cs_timer_t t0 = cs_timer_time();
1922
1923 if (_n_trace < _n_trace_max) {
1924 /* Time to 1-5 s */
1925 _all_to_all_trace[_n_trace*9] = t0.sec*1e5 + t0.nsec/1e4;
1926 _all_to_all_trace[_n_trace*9+1] = 0;
1927 _all_to_all_trace[_n_trace*9+2] = 2;
1928 _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
1929 _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
1930 _all_to_all_trace[_n_trace*9+5] = hc->send_size*hc->comp_size;
1931 _all_to_all_trace[_n_trace*9+6] = hc->recv_size*hc->comp_size;
1932 _all_to_all_trace[_n_trace*9+7] = hc->rn_send->size;
1933 _all_to_all_trace[_n_trace*9+8] = hc->rn_recv->size;
1934 _n_trace += 1;
1935 }
1936
1937 _hybrid_alltoallv(hc, hc->send_buffer, _recv_data);
1938
1939 cs_timer_t t1 = cs_timer_time();
1940 cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_EXCHANGE,
1941 &t0, &t1);
1942 _all_to_all_calls[CS_ALL_TO_ALL_TIME_EXCHANGE] += 1;
1943
1944 if (_n_trace < _n_trace_max) {
1945 /* Time to 1-5 s */
1946 _all_to_all_trace[_n_trace*9] = t1.sec*1e5 + t1.nsec/1e4;
1947 _all_to_all_trace[_n_trace*9+1] = _all_to_all_trace[_n_trace*9]
1948 - _all_to_all_trace[(_n_trace-1)*9];
1949 _all_to_all_trace[_n_trace*9+2] = 0;
1950 _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
1951 _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
1952 _all_to_all_trace[_n_trace*9+5] = hc->send_size*hc->comp_size;
1953 _all_to_all_trace[_n_trace*9+6] = hc->recv_size*hc->comp_size;
1954 _all_to_all_trace[_n_trace*9+7] = hc->rn_send->size;
1955 _all_to_all_trace[_n_trace*9+8] = hc->rn_recv->size;
1956 _n_trace += 1;
1957 }
1958
1959 /* dest id datatype only used for first exchange */
1960
1961 if (hc->dest_id_datatype == CS_LNUM_TYPE) {
1962 assert(d->recv_id == NULL);
1963 BFT_MALLOC(d->recv_id, d->hc->recv_size, cs_lnum_t);
1964 for (size_t i = 0; i < d->hc->recv_size; i++)
1965 d->recv_id[i] = -1;
1966 const unsigned char *sp = _recv_data;
1967 for (size_t i = 0; i < d->hc->recv_size; i++)
1968 memcpy(d->recv_id + i,
1969 sp + d->hc->comp_size*i,
1970 sizeof(cs_lnum_t));
1971 hc->dest_id_datatype = CS_DATATYPE_NULL;
1972 cs_lnum_t dest_id_max = -1;
1973 for (size_t i = 0; i < d->hc->recv_size; i++) {
1974 if (d->recv_id[i] > dest_id_max)
1975 dest_id_max = d->recv_id[i];
1976 d->n_elts_dest = dest_id_max + 1;
1977 }
1978 d->n_elts_dest_e = d->hc->recv_size;
1979 }
1980
1981 /* Now handle main data buffer (reverse implies reordering data) */
1982
1983 if (_dest_data != _recv_data) {
1984 const unsigned char *sp = _recv_data + d->hc->elt_shift;
1985 if (d->recv_id != NULL && !reverse) {
1986 for (size_t i = 0; i < d->hc->recv_size; i++) {
1987 size_t w_displ = d->recv_id[i]*elt_size;
1988 size_t r_displ = d->hc->comp_size*i;
1989 for (size_t j = 0; j < elt_size; j++)
1990 _dest_data[w_displ + j] = sp[r_displ + j];
1991 }
1992 }
1993 else if (reverse) {
1994 const int n_r_ranks = hc->rn_recv->size;
1995 const int *dest_rank_index = hc->elt_rank_index;
1996 for (int i = 0; i < n_r_ranks; i++)
1997 hc->recv_count[i] = 0;
1998 for (size_t i = 0; i < d->hc->recv_size; i++) {
1999 int rank_idx = dest_rank_index[i];
2000 size_t w_displ = i*elt_size;
2001 size_t r_displ = ( hc->recv_displ[rank_idx]
2002 + hc->recv_count[rank_idx])*hc->comp_size;
2003 for (size_t j = 0; j < elt_size; j++)
2004 _dest_data[w_displ + j] = sp[r_displ + j];
2005 hc->recv_count[rank_idx] += 1;
2006 }
2007 }
2008 else {
2009 for (size_t i = 0; i < d->hc->recv_size; i++) {
2010 size_t w_displ = i*elt_size;
2011 size_t r_displ = hc->comp_size*i;
2012 for (size_t j = 0; j < elt_size; j++)
2013 _dest_data[w_displ + j] = sp[r_displ + j];
2014 }
2015 }
2016 BFT_FREE(_recv_data);
2017 }
2018
2019 return _dest_data;
2020 }
2021
2022 /*----------------------------------------------------------------------------
2023 * Exchange indexed data with a hybrid caller.
2024 *
2025 * parameters:
2026 * d <-> pointer to associated all-to-all distributor
2027 * hc <-> associated hybrid caller structure
2028 * reverse <-- true if reverse mode
2029 * dest_index <-- destination index
2030 * dest_data <-> destination data buffer, or NULL
2031 *
2032 * returns:
2033 * pointer to dest_data, or newly allocated buffer
2034 *---------------------------------------------------------------------------*/
2035
static void *
_hybrid_pex_exchange_i(cs_all_to_all_t   *d,
                       _hybrid_pex_t     *hc,
                       bool               reverse,
                       const cs_lnum_t    dest_index[],
                       void              *dest_data)
{
  /* Indexed (variable-size per element) exchange: element sizes are
     given by differences of dest_index entries. */

  size_t elt_size = cs_datatype_size[hc->datatype];
  unsigned char *_dest_data = dest_data, *_recv_data = dest_data;

  /* Final data buffer */

  size_t n_elts_dest = (reverse) ? d->n_elts_src : d->n_elts_dest;

  size_t _n_dest_sub = dest_index[n_elts_dest];
  if (_dest_data == NULL && _n_dest_sub*elt_size > 0)
    BFT_MALLOC(_dest_data, _n_dest_sub*elt_size, unsigned char);

  /* Data buffer for MPI exchange (may merge data and metadata);
     only needed separately when data must be reordered afterwards. */

  if (d->recv_id != NULL || reverse) {
    int n_r_ranks = hc->rn_recv->size;
    size_t _n_dest_buf = hc->recv_displ[n_r_ranks] * hc->comp_size;
    BFT_MALLOC(_recv_data, _n_dest_buf, unsigned char);
  }
  else
    _recv_data = _dest_data;

  cs_timer_t t0 = cs_timer_time();

  if (_n_trace < _n_trace_max) {
    /* Time to 1-5 s */
    _all_to_all_trace[_n_trace*9] = t0.sec*1e5 + t0.nsec/1e4;
    _all_to_all_trace[_n_trace*9+1] = 0;
    _all_to_all_trace[_n_trace*9+2] = 2;
    _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
    _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
    _all_to_all_trace[_n_trace*9+5] = hc->send_size*hc->comp_size;
    _all_to_all_trace[_n_trace*9+6] = hc->recv_size*hc->comp_size;
    _all_to_all_trace[_n_trace*9+7] = hc->rn_send->size;
    _all_to_all_trace[_n_trace*9+8] = hc->rn_recv->size;
    _n_trace += 1;
  }

  _hybrid_alltoallv(hc, hc->send_buffer, _recv_data);

  cs_timer_t t1 = cs_timer_time();
  cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_EXCHANGE,
                            &t0, &t1);
  _all_to_all_calls[CS_ALL_TO_ALL_TIME_EXCHANGE] += 1;

  if (_n_trace < _n_trace_max) {
    /* Time to 1-5 s */
    _all_to_all_trace[_n_trace*9] = t1.sec*1e5 + t1.nsec/1e4;
    _all_to_all_trace[_n_trace*9+1] = _all_to_all_trace[_n_trace*9]
                                      - _all_to_all_trace[(_n_trace-1)*9];
    _all_to_all_trace[_n_trace*9+2] = 0;
    _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
    _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
    _all_to_all_trace[_n_trace*9+5] = hc->send_size*hc->comp_size;
    _all_to_all_trace[_n_trace*9+6] = hc->recv_size*hc->comp_size;
    _all_to_all_trace[_n_trace*9+7] = hc->rn_send->size;
    _all_to_all_trace[_n_trace*9+8] = hc->rn_recv->size;
    _n_trace += 1;
  }

  /* Handle main data buffer (reverse implies reordering data) */

  if (_dest_data != _recv_data) {
    const unsigned char *sp = _recv_data;
    if (d->recv_id != NULL && !reverse) {
      /* Scatter variable-size sub-arrays to positions given by recv_id;
         reads are sequential, writes are indexed. */
      size_t r_displ = 0;
      for (size_t i = 0; i < hc->recv_size; i++) {
        cs_lnum_t k = d->recv_id[i];
        cs_lnum_t n_sub_recv = dest_index[k+1] - dest_index[k];
        size_t w_displ = dest_index[k]*elt_size;
        size_t sub_size = n_sub_recv*elt_size;
        for (size_t l = 0; l < sub_size; l++)
          _dest_data[w_displ + l] = sp[r_displ + l];
        r_displ += sub_size;
      }
    }
    else if (reverse) {
      /* Reverse mode: per-rank receive counts track the read position
         inside each rank's contiguous received section. */
      const int n_r_ranks = hc->rn_recv->size;
      const int *dest_rank_index = hc->elt_rank_index;
      for (int i = 0; i < n_r_ranks; i++)
        hc->recv_count[i] = 0;
      for (size_t i = 0; i < d->hc->recv_size; i++) {
        int rank_idx = dest_rank_index[i];
        size_t w_displ = dest_index[i]*elt_size;
        size_t r_displ = (  hc->recv_displ[rank_idx]
                          + hc->recv_count[rank_idx])*elt_size;
        cs_lnum_t n_sub_recv = dest_index[i+1] - dest_index[i];
        size_t sub_size = n_sub_recv*elt_size;
        for (size_t l = 0; l < sub_size; l++)
          _dest_data[w_displ + l] = sp[r_displ + l];
        hc->recv_count[rank_idx] += n_sub_recv;
      }
    }
    else {
      /* Unreachable: a separate _recv_data buffer is only allocated
         when recv_id != NULL or reverse is true, so this assert
         documents that this branch should never be entered. */
      assert(_dest_data == _recv_data);
    }
    BFT_FREE(_recv_data);
  }

  return _dest_data;
}
2143
2144 /*----------------------------------------------------------------------------*/
2145 /*!
2146 * \brief Compute receive id by ordering based on source rank.
2147 *
2148 * The caller is responsible for freeing the returned array.
2149 *
2150 * As communication schemes (such as Crystal Router) should lead to
2151 * elements being grouped by rank, this is used here to optimize sorting.
2152 * Sorting should also be stable as a result.
2153 *
2154 * \param[in] d pointer to all-to-all distributor
2155 * \param[in] src_rank associated source rank (size: d->n_elts_dest)
2156 */
2157 /*----------------------------------------------------------------------------*/
2158
2159 static void
_recv_id_by_src_rank_order(cs_all_to_all_t * d,const int src_rank[])2160 _recv_id_by_src_rank_order(cs_all_to_all_t *d,
2161 const int src_rank[])
2162 {
2163 assert(d->recv_id == NULL);
2164
2165 const cs_lnum_t n_elts = d->n_elts_dest;
2166
2167 BFT_MALLOC(d->recv_id, n_elts, cs_lnum_t);
2168
2169 cs_lnum_t n_rs = 0;
2170 cs_lnum_2_t *rs;
2171
2172 BFT_MALLOC(rs, n_elts+1, cs_lnum_2_t);
2173
2174 /* Extract source rank and build rank ranges. */
2175
2176 int prev_rank = -1;
2177
2178 for (cs_lnum_t i = 0; i < n_elts; i++) {
2179 if (src_rank[i] != prev_rank) {
2180 prev_rank = src_rank[i];
2181 rs[n_rs][0] = src_rank[i];
2182 rs[n_rs][1] = i;
2183 n_rs++;
2184 }
2185 }
2186
2187 /* Extend array for future range tests. */
2188
2189 rs[n_rs][0] = -1;
2190 rs[n_rs][1] = n_elts;
2191
2192 /* Order ranges. */
2193
2194 cs_lnum_t *rs_order;
2195 BFT_MALLOC(rs_order, n_rs, cs_lnum_t);
2196
2197 cs_order_lnum_allocated_s(NULL,
2198 (const cs_lnum_t *)rs,
2199 2,
2200 rs_order,
2201 n_rs);
2202
2203 cs_lnum_t k = 0;
2204 for (cs_lnum_t i = 0; i < n_rs; i++) {
2205 cs_lnum_t j = rs_order[i];
2206 cs_lnum_t s_id = rs[j][1];
2207 cs_lnum_t e_id = rs[j+1][1];
2208 for (cs_lnum_t l = s_id; l < e_id; l++) {
2209 d->recv_id[l] = k;
2210 k++;
2211 }
2212 }
2213 assert(k == n_elts);
2214
2215 BFT_FREE(rs_order);
2216 BFT_FREE(rs);
2217 }
2218
2219 /*----------------------------------------------------------------------------*/
2220 /*!
2221 * \brief Determine Crystal Router flags for a distributor.
2222 *
2223 * Some flags are only required for the first exchange.
2224 *
2225 * \param[in] d pointer to associated all-to-all distributor
2226 * \param[in] reverse if true, communicate in reverse direction
2227 *
2228 * \return flags required for first Crystal Router call
2229 */
2230 /*----------------------------------------------------------------------------*/
2231
2232 static int
_cr_flags(cs_all_to_all_t * d,bool reverse)2233 _cr_flags(cs_all_to_all_t *d,
2234 bool reverse)
2235 {
2236 cs_assert(d != NULL);
2237
2238 int cr_flags = 0;
2239
2240 if (!reverse) {
2241 if (d->n_elts_dest < 0) {
2242 if (d->flags & CS_ALL_TO_ALL_USE_DEST_ID)
2243 cr_flags = cr_flags | CS_CRYSTAL_ROUTER_USE_DEST_ID;
2244 if (! (d->flags & CS_ALL_TO_ALL_NO_REVERSE)) {
2245 cr_flags = cr_flags | CS_CRYSTAL_ROUTER_ADD_SRC_ID;
2246 cr_flags = cr_flags | CS_CRYSTAL_ROUTER_ADD_SRC_RANK;
2247 }
2248 if ( d->flags & CS_ALL_TO_ALL_NEED_SRC_RANK
2249 || d->flags & CS_ALL_TO_ALL_ORDER_BY_SRC_RANK)
2250 cr_flags = cr_flags | CS_CRYSTAL_ROUTER_ADD_SRC_RANK;
2251 }
2252 }
2253 else
2254 cr_flags = cr_flags | CS_CRYSTAL_ROUTER_USE_DEST_ID;
2255
2256 return cr_flags;
2257 }
2258
2259 /*----------------------------------------------------------------------------*/
2260 /*!
2261 * \brief Compute destination id by ordering based on source rank after
2262 * the first exchange with a Crystal Router.
2263 *
2264 * \param[in] d pointer to associated all-to-all distributor
2265 * \param[in] cr pointer to associated Crystal Router
2266 */
2267 /*----------------------------------------------------------------------------*/
2268
2269 static void
_cr_recv_id_by_src_rank(cs_all_to_all_t * d,cs_crystal_router_t * cr)2270 _cr_recv_id_by_src_rank(cs_all_to_all_t *d,
2271 cs_crystal_router_t *cr)
2272 {
2273 cs_assert(d != NULL);
2274
2275 assert(d->recv_id == NULL);
2276
2277 int *src_rank;
2278 BFT_MALLOC(src_rank, d->n_elts_dest_e, int);
2279
2280 cs_crystal_router_get_data(cr,
2281 &src_rank,
2282 NULL,
2283 NULL,
2284 NULL, /* dest_index */
2285 NULL);
2286
2287 if (d->n_elts_dest < 0)
2288 d->n_elts_dest = cs_crystal_router_n_elts(cr);
2289
2290 _recv_id_by_src_rank_order(d, src_rank);
2291
2292 BFT_FREE(src_rank);
2293 }
2294
2295 /*----------------------------------------------------------------------------*/
2296 /*!
2297 * \brief Indicate if source rank info must be maintained.
2298 *
2299 * The source rank is needed if either the CS_CRYSTAL_ROUTER_ADD_SRC_RANK
2300 * flag is set, or CS_ALL_TO_ALL_NO_REVERSE is not set.
2301 *
2302 * It is recommended that if the CS_ALL_TO_ALL_ORDER_BY_SRC_RANK flag is set,
2303 * for a communication scheme which does not ensure this by default
2304 * (such as for a Crystal Router), the source rank has been converted
 * to a recv_id, using \ref _recv_id_by_src_rank_order, so if this array
 * is available, the CS_ALL_TO_ALL_ORDER_BY_SRC_RANK flag does not imply
 * by itself that source rank info is still needed.
2308 *
2309 * \param[in] d pointer to associated all-to-all distributor
2310 *
2311 * \return true if source rank info is needed, false otherwise
2312 */
2313 /*----------------------------------------------------------------------------*/
2314
2315 static bool
_is_src_rank_info_needed(cs_all_to_all_t * d)2316 _is_src_rank_info_needed(cs_all_to_all_t *d)
2317 {
2318 cs_assert(d != NULL);
2319
2320 bool retval = false;
2321
2322 if (d->flags & CS_ALL_TO_ALL_NO_REVERSE) {
2323 if (d->flags & CS_ALL_TO_ALL_NEED_SRC_RANK)
2324 retval = true;
2325 else if (d->flags & CS_ALL_TO_ALL_ORDER_BY_SRC_RANK) {
2326 if (d->recv_id == NULL && d->n_elts_dest > 0)
2327 retval = true;
2328 }
2329 }
2330 else
2331 retval = true;
2332
2333 return retval;
2334 }
2335
2336 #endif /* defined(HAVE_MPI) */
2337
2338 /*! (DOXYGEN_SHOULD_SKIP_THIS) \endcond */
2339
2340 /*============================================================================
2341 * Public function definitions
2342 *============================================================================*/
2343
2344 #if defined(HAVE_MPI)
2345
2346 /*----------------------------------------------------------------------------*/
2347 /*!
2348 * \brief Create an all-to-all distributor based on destination rank.
2349 *
2350 * This is a collective operation on communicator comm.
2351 *
2352 * If the flags bit mask matches \ref CS_ALL_TO_ALL_USE_DEST_ID,
2353 * data exchanged will be ordered by the array passed to the
2354 * \c dest_id argument. For \c n total values received on a rank
2355 * (as given by \ref cs_all_to_all_n_elts_dest), those destination ids
2356 * must be in the range [0, \c n[.
2357 *
2358 * If the flags bit mask matches \ref CS_ALL_TO_ALL_ORDER_BY_SRC_RANK,
 * data exchanged will be ordered by source rank (this is incompatible
 * with \ref CS_ALL_TO_ALL_USE_DEST_ID).
2361 *
2362 * \attention
2363 * The \c dest_rank and \c dest_id arrays are only referenced by
2364 * the distributor, not copied, and must remain available throughout
2365 * the distributor's lifetime. They may be fully transferred to
2366 * the structure if not needed elsewhere using the
2367 * \ref cs_all_to_all_transfer_dest_rank and
2368 * \ref cs_all_to_all_transfer_dest_id functions.
2369 *
2370 * \param[in] n_elts number of elements
2371 * \param[in] flags sum of ordering and metadata flag constants
2372 * \param[in] dest_id element destination id (required if flags
2373 * contain \ref CS_ALL_TO_ALL_USE_DEST_ID),
2374 * or NULL
2375 * \param[in] dest_rank destination rank for each element
2376 * \param[in] comm associated MPI communicator
2377 *
2378 * \return pointer to new all-to-all distributor
2379 */
2380 /*----------------------------------------------------------------------------*/
2381
2382 cs_all_to_all_t *
cs_all_to_all_create(size_t n_elts,int flags,const cs_lnum_t * dest_id,const int dest_rank[],MPI_Comm comm)2383 cs_all_to_all_create(size_t n_elts,
2384 int flags,
2385 const cs_lnum_t *dest_id,
2386 const int dest_rank[],
2387 MPI_Comm comm)
2388 {
2389 cs_timer_t t0, t1;
2390
2391 t0 = cs_timer_time();
2392
2393 cs_all_to_all_t *d = _all_to_all_create_base(n_elts,
2394 flags,
2395 comm);
2396
2397 d->dest_id = dest_id;
2398 d->dest_rank = dest_rank;
2399
2400 /* Create substructures based on info available at this stage
2401 (for Crystal Router, delay creation as data is not passed yet) */
2402
2403 if (d->type == CS_ALL_TO_ALL_MPI_DEFAULT)
2404 d->dc = _alltoall_caller_create_meta(flags, comm);
2405 else if (d->type == CS_ALL_TO_ALL_HYBRID)
2406 d->hc = _hybrid_pex_create_meta(flags, comm);
2407
2408 t1 = cs_timer_time();
2409 cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_TOTAL,
2410 &t0, &t1);
2411 _all_to_all_calls[CS_ALL_TO_ALL_TIME_TOTAL] += 1;
2412
2413 return d;
2414 }
2415
2416 /*----------------------------------------------------------------------------*/
2417 /*!
2418 * \brief Create an all-to-all distributor for elements whose destination
2419 * rank is determined from global numbers and block distribution information.
2420 *
2421 * This is a collective operation on communicator comm.
2422 *
2423 * If the flags bit mask matches \ref CS_ALL_TO_ALL_USE_DEST_ID,
2424 * data exchanged will be ordered by global element number.
2425 *
2426 * If the flags bit mask matches \ref CS_ALL_TO_ALL_ORDER_BY_SRC_RANK,
 * data exchanged will be ordered by source rank (this is incompatible
 * with \ref CS_ALL_TO_ALL_USE_DEST_ID).
2429 *
2430 * \param[in] n_elts number of elements
2431 * \param[in] flags sum of ordering and metadata flag constants
2432 * \param[in] src_gnum global source element numbers
2433 * \param[in] bi destination block distribution info
2434 * \param[in] comm associated MPI communicator
2435 *
2436 * \return pointer to new all-to-all distributor
2437 */
2438 /*----------------------------------------------------------------------------*/
2439
2440 cs_all_to_all_t *
cs_all_to_all_create_from_block(size_t n_elts,int flags,const cs_gnum_t * src_gnum,cs_block_dist_info_t bi,MPI_Comm comm)2441 cs_all_to_all_create_from_block(size_t n_elts,
2442 int flags,
2443 const cs_gnum_t *src_gnum,
2444 cs_block_dist_info_t bi,
2445 MPI_Comm comm)
2446 {
2447 cs_timer_t t0, t1;
2448
2449 t0 = cs_timer_time();
2450
2451 cs_all_to_all_t *d = _all_to_all_create_base(n_elts,
2452 flags,
2453 comm);
2454
2455 /* Compute arrays based on block distribution
2456 (using global ids and block info at lower levels could
2457 save some memory, but would make things less modular;
2458 in addition, rank is an int, while global ids are usually
2459 64-bit uints, so the overhead is only 50%, and access should
2460 be faster). */
2461
2462 BFT_MALLOC(d->_dest_rank, n_elts, int);
2463 d->dest_rank = d->_dest_rank;
2464
2465 if (flags & CS_ALL_TO_ALL_USE_DEST_ID) {
2466 BFT_MALLOC(d->_dest_id, n_elts, cs_lnum_t);
2467 d->dest_id = d->_dest_id;
2468 }
2469
2470 const int rank_step = bi.rank_step;
2471 const cs_gnum_t block_size = bi.block_size;
2472
2473 if (d->_dest_id != NULL) {
2474 #pragma omp parallel for if (n_elts > CS_THR_MIN)
2475 for (size_t i = 0; i < n_elts; i++) {
2476 cs_gnum_t g_elt_id = src_gnum[i] -1;
2477 cs_gnum_t _dest_rank = g_elt_id / block_size;
2478 d->_dest_rank[i] = _dest_rank*rank_step;
2479 d->_dest_id[i] = g_elt_id % block_size;
2480 }
2481 }
2482 else {
2483 #pragma omp parallel for if (n_elts > CS_THR_MIN)
2484 for (size_t i = 0; i < n_elts; i++) {
2485 cs_gnum_t g_elt_id = src_gnum[i] -1;
2486 cs_gnum_t _dest_rank = g_elt_id / block_size;
2487 d->_dest_rank[i] = _dest_rank*rank_step;
2488 }
2489 }
2490
2491 /* Create substructures based on info available at this stage
2492 (for Crystal Router, delay creation as data is not passed yet) */
2493
2494 if (d->type == CS_ALL_TO_ALL_MPI_DEFAULT)
2495 d->dc = _alltoall_caller_create_meta(flags, comm);
2496 else if (d->type == CS_ALL_TO_ALL_HYBRID)
2497 d->hc = _hybrid_pex_create_meta(flags, comm);
2498
2499 t1 = cs_timer_time();
2500 cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_TOTAL,
2501 &t0, &t1);
2502 _all_to_all_calls[CS_ALL_TO_ALL_TIME_TOTAL] += 1;
2503
2504 return d;
2505 }
2506
2507 /*----------------------------------------------------------------------------*/
2508 /*!
2509 * \brief Destroy an all-to-all distributor.
2510 *
2511 * \param[in, out] d pointer to associated all-to-all distributor
2512 */
2513 /*----------------------------------------------------------------------------*/
2514
2515 void
cs_all_to_all_destroy(cs_all_to_all_t ** d)2516 cs_all_to_all_destroy(cs_all_to_all_t **d)
2517 {
2518 if (d != NULL) {
2519
2520 cs_timer_t t0, t1;
2521
2522 t0 = cs_timer_time();
2523
2524 cs_all_to_all_t *_d = *d;
2525 if (_d->cr != NULL)
2526 cs_crystal_router_destroy(&(_d->cr));
2527 else if (_d->hc != NULL)
2528 _hybrid_pex_destroy(&(_d->hc));
2529 else if (_d->dc != NULL)
2530 _alltoall_caller_destroy(&(_d->dc));
2531
2532 BFT_FREE(_d->src_rank);
2533 BFT_FREE(_d->src_id);
2534
2535 BFT_FREE(_d->_dest_id);
2536 BFT_FREE(_d->_dest_rank);
2537
2538 BFT_FREE(_d->recv_id);
2539 BFT_FREE(_d->src_id);
2540 BFT_FREE(_d->src_rank);
2541
2542 BFT_FREE(_d);
2543
2544 t1 = cs_timer_time();
2545 cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_TOTAL,
2546 &t0, &t1);
2547
2548 /* no increment to _all_to_all_calls[0] as create/destroy are grouped */
2549 }
2550 }
2551
2552 /*----------------------------------------------------------------------------*/
2553 /*!
2554 * \brief Transfer ownership of destination rank to an all-to-all distributor.
2555 *
2556 * The dest_rank array should be the same as the one used for the creation of
2557 * the distributor.
2558 *
2559 * \param[in, out] d pointer to associated all-to-all distributor
2560 * \param[in, out] dest_rank pointer to element destination rank
2561 */
2562 /*----------------------------------------------------------------------------*/
2563
2564 void
cs_all_to_all_transfer_dest_rank(cs_all_to_all_t * d,int ** dest_rank)2565 cs_all_to_all_transfer_dest_rank(cs_all_to_all_t *d,
2566 int **dest_rank)
2567 {
2568 cs_assert(d != NULL);
2569
2570 if (d->dest_rank == *dest_rank) {
2571 d->_dest_rank = *dest_rank;
2572 *dest_rank = NULL;
2573 }
2574 else {
2575 bft_error(__FILE__, __LINE__, 0,
2576 "%s: array transferred (%p)does not match the one used\n"
2577 "for all-to-all distributor creation (%p).",
2578 __func__, (void *)*dest_rank, (const void *)d->dest_rank);
2579 }
2580 }
2581
2582 /*----------------------------------------------------------------------------*/
2583 /*!
2584 * \brief Transfer ownership of destination ids to an
2585 * all-to-all distributor.
2586 *
2587 * The dest_id array should be the same as the one used for the creation of
2588 * the distributor.
2589 *
2590 * \param[in, out] d pointer to associated all-to-all distributor
2591 * \param[in, out] dest_id pointer to element destination id
2592 */
2593 /*----------------------------------------------------------------------------*/
2594
2595 void
cs_all_to_all_transfer_dest_id(cs_all_to_all_t * d,cs_lnum_t ** dest_id)2596 cs_all_to_all_transfer_dest_id(cs_all_to_all_t *d,
2597 cs_lnum_t **dest_id)
2598 {
2599 cs_assert(d != NULL);
2600
2601 if (d->dest_id == *dest_id) {
2602 d->_dest_id = *dest_id;
2603 *dest_id = NULL;
2604 }
2605 else {
2606 bft_error(__FILE__, __LINE__, 0,
2607 "%s: array transferred (%p)does not match the one used\n"
2608 "for all-to-all distributor creation (%p).",
2609 __func__, (void *)*dest_id, (const void *)d->dest_id);
2610 }
2611 }
2612
2613 /*----------------------------------------------------------------------------*/
2614 /*!
2615 * \brief Get number of elements associated with all-to-all distributor.
2616 *
2617 * The number of elements is the number of elements received after exchange.
2618 *
2619 * If no exchange has been done yet (depending on the communication protocol),
2620 * metadata will be exchanged by this call, so it is a collective operation.
2621 *
2622 * \param[in] d pointer to associated all-to-all distributor
2623 *
2624 * \return number of elements associated with distributor.
2625 */
2626 /*----------------------------------------------------------------------------*/
2627
cs_lnum_t
cs_all_to_all_n_elts_dest(cs_all_to_all_t  *d)
{
  cs_assert(d != NULL);

  /* Obtain count if not available yet; n_elts_dest < 0 means no
     metadata exchange has taken place yet, so one is triggered here
     (collective operation). */

  if (d->n_elts_dest < 0) {

    cs_timer_t t0, t1;

    t0 = cs_timer_time();

    switch(d->type) {
    case CS_ALL_TO_ALL_MPI_DEFAULT:
      {
        _alltoall_caller_exchange_meta(d->dc,
                                       d->n_elts_src,
                                       d->dest_rank);
        /* When destination ids travel with the data, a first (empty)
           data exchange is needed to determine the true count. */
        if (d->dc->dest_id_datatype == CS_LNUM_TYPE)
          cs_all_to_all_copy_array(d,
                                   CS_DATATYPE_NULL,
                                   0,
                                   false,
                                   NULL,
                                   NULL);
        else {
          d->n_elts_dest = d->dc->recv_size;
          d->n_elts_dest_e = d->dc->recv_size;
        }
      }
      break;

    case CS_ALL_TO_ALL_HYBRID:
      {
        _hybrid_pex_exchange_meta(d->hc,
                                  d->n_elts_src,
                                  d->dest_rank);
        /* Per-element dest rank info is folded into the hybrid
           structure's metadata, so the private copy can be freed. */
        BFT_FREE(d->_dest_rank);
        if (d->hc->dest_id_datatype == CS_LNUM_TYPE)
          cs_all_to_all_copy_array(d,
                                   CS_DATATYPE_NULL,
                                   0,
                                   false,
                                   NULL,
                                   NULL);
        else {
          d->n_elts_dest = d->hc->recv_size;
          d->n_elts_dest_e = d->hc->recv_size;
        }
      }
      break;

    case CS_ALL_TO_ALL_CRYSTAL_ROUTER:
      {
        /* Metadata-only (zero stride, null datatype) exchange. */
        cs_crystal_router_t *cr
          = cs_crystal_router_create_s(d->n_elts_src,
                                       0,
                                       CS_DATATYPE_NULL,
                                       _cr_flags(d, false),
                                       NULL,
                                       NULL,
                                       d->dest_id,
                                       d->dest_rank,
                                       d->comm);

        cs_timer_t tcr0 = cs_timer_time();

        if (_n_trace < _n_trace_max) {
          /* Time to 1-5 s */
          _all_to_all_trace[_n_trace*9] = tcr0.sec*1e5 + tcr0.nsec/1e4;
          _all_to_all_trace[_n_trace*9+1] = 0;
          _all_to_all_trace[_n_trace*9+2] = 0;
          _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
          _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
          _all_to_all_trace[_n_trace*9+5] = 0;
          _all_to_all_trace[_n_trace*9+6] = 0;
          _all_to_all_trace[_n_trace*9+7] = 0;
          _all_to_all_trace[_n_trace*9+8] = 0;
          _n_trace += 1;
        }

        cs_crystal_router_exchange(cr);

        cs_timer_t tcr1 = cs_timer_time();
        cs_timer_counter_add_diff
          (_all_to_all_timers + CS_ALL_TO_ALL_TIME_METADATA, &tcr0, &tcr1);
        _all_to_all_calls[CS_ALL_TO_ALL_TIME_METADATA] += 1;

        if (_n_trace < _n_trace_max) {
          /* Time to 1-5 s */
          _all_to_all_trace[_n_trace*9] = tcr1.sec*1e5 + tcr1.nsec/1e4;
          _all_to_all_trace[_n_trace*9+1] = _all_to_all_trace[_n_trace*9]
                                            - _all_to_all_trace[(_n_trace-1)*9];
          _all_to_all_trace[_n_trace*9+2] = 1;
          _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
          _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
          _all_to_all_trace[_n_trace*9+5] = 0;
          _all_to_all_trace[_n_trace*9+6] = 0;
          _all_to_all_trace[_n_trace*9+7] = 0;
          _all_to_all_trace[_n_trace*9+8] = 0;
          _n_trace += 1;
        }

        d->n_elts_dest = cs_crystal_router_n_elts(cr);
        d->n_elts_dest_e = cs_crystal_router_n_recv_elts(cr);

        /* Optionally build recv_id ordered by source rank before
           extracting the remaining metadata. */
        if (d->flags & CS_ALL_TO_ALL_ORDER_BY_SRC_RANK)
          _cr_recv_id_by_src_rank(d, cr);

        int **p_src_rank = _is_src_rank_info_needed(d) ? &(d->src_rank) : NULL;
        cs_crystal_router_get_data(cr,
                                   p_src_rank,
                                   &(d->recv_id),
                                   &(d->src_id),
                                   NULL, /* dest_index */
                                   NULL);

        cs_crystal_router_destroy(&cr);

      }
      break;

    }

    t1 = cs_timer_time();
    cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_TOTAL,
                              &t0, &t1);
    _all_to_all_calls[CS_ALL_TO_ALL_TIME_TOTAL] += 1;

  }

  return d->n_elts_dest;
}
2762
2763 /*----------------------------------------------------------------------------*/
2764 /*!
2765 * \brief Communicate array data using all-to-all distributor.
2766 *
2767 * If a destination buffer is provided, it should be of sufficient size for
2768 * the number of elements returned by \ref cs_all_to_all_n_elts_dest
2769 * (multiplied by stride and datatype size).
2770 *
2771 * If no buffer is provided, one is allocated automatically, and transferred
2772 * to the caller (who is responsible for freeing it when no longer needed).
2773 *
2774 * If used in reverse mode, data is still communicated from src_data
2775 * to dest_buffer or an internal buffer, but communication direction
2776 * (i.e. source and destination ranks) are reversed.
2777 *
2778 * This is obviously a collective operation, and all ranks must provide
2779 * the same datatype, stride, and reverse values.
2780 *
2781 * \param[in, out] d pointer to associated all-to-all distributor
2782 * \param[in] datatype type of data considered
2783 * \param[in] stride number of values per entity (interlaced),
2784 * \param[in] reverse if true, communicate in reverse direction
2785 * \param[in] src_data source data
2786 * \param[out] dest_data pointer to destination data, or NULL
2787 *
2788 * \return pointer to destination data (dest_buffer if non-NULL)
2789 */
2790 /*----------------------------------------------------------------------------*/
2791
2792 void *
cs_all_to_all_copy_array(cs_all_to_all_t * d,cs_datatype_t datatype,int stride,bool reverse,const void * src_data,void * dest_data)2793 cs_all_to_all_copy_array(cs_all_to_all_t *d,
2794 cs_datatype_t datatype,
2795 int stride,
2796 bool reverse,
2797 const void *src_data,
2798 void *dest_data)
2799 {
2800 cs_assert(d != NULL);
2801
2802 void *_dest_data = NULL;
2803
2804 if (_n_trace > 0) {
2805 fprintf(_all_to_all_trace_bt_log, "\ncs_all_to_all_copy_array: %d\n\n",
2806 _n_trace);
2807 cs_base_backtrace_dump(_all_to_all_trace_bt_log, 1);
2808 bft_printf("cs_all_to_all_copy_array: %d\n", _n_trace);
2809 }
2810
2811 cs_timer_t t0, t1;
2812
2813 t0 = cs_timer_time();
2814
2815 /* Reverse can only be called after direct exchange in most cases
     (this case should be rare, and requires additional exchanges,
2817 but let's play it safe and make sure it works if needed) */
2818
2819 if (d->n_elts_dest < 0 && reverse)
2820 cs_all_to_all_copy_array(d,
2821 CS_DATATYPE_NULL,
2822 0,
2823 false,
2824 NULL,
2825 NULL);
2826
2827 /* Now do regular exchange */
2828
2829 switch(d->type) {
2830
2831 case CS_ALL_TO_ALL_MPI_DEFAULT:
2832 {
2833 if (d->n_elts_dest < 0) { /* Exchange metadata if not done yet */
2834 _alltoall_caller_exchange_meta(d->dc,
2835 d->n_elts_src,
2836 d->dest_rank);
2837 d->n_elts_dest = d->dc->recv_size;
2838 d->n_elts_dest_e = d->dc->recv_size;
2839 }
2840 size_t n_elts = (reverse) ? d->n_elts_dest : d->n_elts_src;
2841 if (reverse)
2842 _alltoall_caller_swap_src_dest(d->dc);
2843 _alltoall_caller_prepare_s(d->dc,
2844 n_elts,
2845 stride,
2846 datatype,
2847 reverse,
2848 src_data,
2849 d->dest_id,
2850 d->recv_id,
2851 d->dest_rank);
2852 _dest_data = _alltoall_caller_exchange_s(d,
2853 d->dc,
2854 reverse,
2855 d->dest_rank,
2856 dest_data);
2857 if (reverse) {
2858 _alltoall_caller_swap_src_dest(d->dc);
2859 if (d->dc->send_buffer == src_data)
2860 d->dc->send_buffer = NULL;
2861 }
2862 }
2863 break;
2864
2865 case CS_ALL_TO_ALL_HYBRID:
2866 {
2867 if (d->n_elts_dest < 0) { /* Exchange metadata if not done yet */
2868 _hybrid_pex_exchange_meta(d->hc,
2869 d->n_elts_src,
2870 d->dest_rank);
2871 BFT_FREE(d->_dest_rank);
2872 d->n_elts_dest = d->hc->recv_size;
2873 d->n_elts_dest_e = d->hc->recv_size;
2874 }
2875 size_t n_elts = (reverse) ? d->n_elts_dest : d->n_elts_src;
2876 if (reverse)
2877 _hybrid_pex_swap_src_dest(d->hc);
2878 _hybrid_pex_prepare_s(d->hc,
2879 n_elts,
2880 stride,
2881 datatype,
2882 reverse,
2883 src_data,
2884 d->dest_id,
2885 d->recv_id);
2886 _dest_data = _hybrid_pex_exchange_s(d,
2887 d->hc,
2888 reverse,
2889 dest_data);
2890 if (reverse) {
2891 _hybrid_pex_swap_src_dest(d->hc);
2892 if (d->hc->send_buffer == src_data)
2893 d->hc->send_buffer = NULL;
2894 }
2895 }
2896 break;
2897
2898 case CS_ALL_TO_ALL_CRYSTAL_ROUTER:
2899 {
2900 _dest_data = dest_data;
2901 cs_timer_t tcr0, tcr1;
2902 cs_crystal_router_t *cr;
2903 if (!reverse) {
2904 cr = cs_crystal_router_create_s(d->n_elts_src,
2905 stride,
2906 datatype,
2907 _cr_flags(d, reverse),
2908 src_data,
2909 NULL,
2910 d->dest_id,
2911 d->dest_rank,
2912 d->comm);
2913 tcr0 = cs_timer_time();
2914
2915 if (_n_trace < _n_trace_max) {
2916 /* Time to 1-5 s */
2917 _all_to_all_trace[_n_trace*9] = tcr0.sec*1e5 + tcr0.nsec/1e4;
2918 _all_to_all_trace[_n_trace*9+1] = 0;
2919 _all_to_all_trace[_n_trace*9+2] = 0;
2920 _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
2921 _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
2922 _all_to_all_trace[_n_trace*9+5] = 0;
2923 _all_to_all_trace[_n_trace*9+6] = 0;
2924 _all_to_all_trace[_n_trace*9+7] = 0;
2925 _all_to_all_trace[_n_trace*9+8] = 0;
2926 _n_trace += 1;
2927 }
2928
2929 cs_crystal_router_exchange(cr);
2930 tcr1 = cs_timer_time();
2931
2932 if (_n_trace < _n_trace_max) {
2933 size_t max_sizes[2];
2934 cs_crystal_router_get_max_sizes(cr, max_sizes);
2935
2936 /* Time to 1-5 s */
2937 _all_to_all_trace[_n_trace*9] = tcr1.sec*1e5 + tcr1.nsec/1e4;
2938 _all_to_all_trace[_n_trace*9+1] = _all_to_all_trace[_n_trace*9]
2939 - _all_to_all_trace[(_n_trace-1)*9];
2940 _all_to_all_trace[_n_trace*9+2] = 1;
2941 _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
2942 _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
2943 _all_to_all_trace[_n_trace*9+5] = max_sizes[0];
2944 _all_to_all_trace[_n_trace*9+6] = max_sizes[1];
2945 _all_to_all_trace[_n_trace*9+7] = 0;
2946 _all_to_all_trace[_n_trace*9+8] = 0;
2947 _n_trace += 1;
2948 }
2949
2950 if (d->n_elts_dest_e < 0) {
2951 d->n_elts_dest_e = cs_crystal_router_n_recv_elts(cr);
2952 if (d->flags & CS_ALL_TO_ALL_ORDER_BY_SRC_RANK)
2953 _cr_recv_id_by_src_rank(d, cr);
2954 }
2955 int **p_src_rank = _is_src_rank_info_needed(d) ? &(d->src_rank) : NULL;
2956 cs_crystal_router_get_data(cr,
2957 p_src_rank,
2958 &(d->recv_id),
2959 &(d->src_id),
2960 NULL, /* dest_index */
2961 &_dest_data);
2962 if (d->n_elts_dest < 0)
2963 d->n_elts_dest = cs_crystal_router_n_elts(cr);
2964 }
2965 else {
2966 const cs_lnum_t *elt_id
2967 = (d->n_elts_dest < d->n_elts_dest_e) ? d->recv_id : NULL;
2968 cr = cs_crystal_router_create_s(d->n_elts_dest_e,
2969 stride,
2970 datatype,
2971 _cr_flags(d, reverse),
2972 src_data,
2973 elt_id,
2974 d->src_id,
2975 d->src_rank,
2976 d->comm);
2977 tcr0 = cs_timer_time();
2978
2979 if (_n_trace < _n_trace_max) {
2980 /* Time to 1-5 s */
2981 _all_to_all_trace[_n_trace*9] = tcr0.sec*1e5 + tcr0.nsec/1e4;
2982 _all_to_all_trace[_n_trace*9+1] = 0;
2983 _all_to_all_trace[_n_trace*9+2] = 0;
2984 _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
2985 _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
2986 _all_to_all_trace[_n_trace*9+5] = 0;
2987 _all_to_all_trace[_n_trace*9+6] = 0;
2988 _all_to_all_trace[_n_trace*9+7] = 0;
2989 _all_to_all_trace[_n_trace*9+8] = 0;
2990 _n_trace += 1;
2991 }
2992
2993 cs_crystal_router_exchange(cr);
2994 tcr1 = cs_timer_time();
2995
2996 if (_n_trace < _n_trace_max) {
2997 size_t max_sizes[2];
2998 cs_crystal_router_get_max_sizes(cr, max_sizes);
2999
3000 /* Time to 1-5 s */
3001 _all_to_all_trace[_n_trace*9] = tcr1.sec*1e5 + tcr1.nsec/1e4;
3002 _all_to_all_trace[_n_trace*9+1] = _all_to_all_trace[_n_trace*9]
3003 - _all_to_all_trace[(_n_trace-1)*9];
3004 _all_to_all_trace[_n_trace*9+2] = 1;
3005 _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
3006 _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
3007 _all_to_all_trace[_n_trace*9+5] = max_sizes[0];
3008 _all_to_all_trace[_n_trace*9+6] = max_sizes[1];
3009 _all_to_all_trace[_n_trace*9+7] = 0;
3010 _all_to_all_trace[_n_trace*9+8] = 0;
3011 _n_trace += 1;
3012 }
3013
3014 cs_crystal_router_get_data(cr,
3015 NULL,
3016 NULL,
3017 NULL,
3018 NULL, /* dest_index */
3019 &_dest_data);
3020 }
3021 cs_crystal_router_destroy(&cr);
3022 cs_timer_counter_add_diff
3023 (_all_to_all_timers + CS_ALL_TO_ALL_TIME_EXCHANGE, &tcr0, &tcr1);
3024 _all_to_all_calls[CS_ALL_TO_ALL_TIME_EXCHANGE] += 1;
3025 }
3026 break;
3027
3028 }
3029
3030 t1 = cs_timer_time();
3031 cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_TOTAL,
3032 &t0, &t1);
3033 _all_to_all_calls[CS_ALL_TO_ALL_TIME_TOTAL] += 1;
3034
3035 return _dest_data;
3036 }
3037
3038 /*----------------------------------------------------------------------------*/
3039 /*!
3040 * \brief Communicate local index using all-to-all distributor.
3041 *
3042 * If a destination buffer is provided, it should be of sufficient size for
3043 * the number of elements returned by \ref cs_all_to_all_n_elts_dest.
3044 *
3045 * If no buffer is provided, one is allocated automatically, and transferred
3046 * to the caller (who is responsible for freeing it when no longer needed).
3047 *
3048 * If used in reverse mode, data is still communicated from src_index
3049 * to dest_index or an internal buffer, but communication direction
3050 * (i.e. source and destination ranks) are reversed.
3051 *
3052 * This is obviously a collective operation, and all ranks must provide
3053 * the same value for the reverse parameter.
3054 *
3055 * \param[in, out] d pointer to associated all-to-all distributor
3056 * \param[in] reverse if true, communicate in reverse direction
3057 * \param[in] src_index source index
3058 * \param[out] dest_index pointer to destination index, or NULL
3059 *
 * \return pointer to destination index (dest_index if non-NULL)
3061 */
3062 /*----------------------------------------------------------------------------*/
3063
3064 cs_lnum_t *
cs_all_to_all_copy_index(cs_all_to_all_t * d,bool reverse,const cs_lnum_t * src_index,cs_lnum_t * dest_index)3065 cs_all_to_all_copy_index(cs_all_to_all_t *d,
3066 bool reverse,
3067 const cs_lnum_t *src_index,
3068 cs_lnum_t *dest_index)
3069 {
3070 cs_timer_t t0, t1;
3071
3072 if (_n_trace > 0) {
3073 fprintf(_all_to_all_trace_bt_log, "\ncs_all_to_all_copy_index: %d\n\n",
3074 _n_trace);
3075 cs_base_backtrace_dump(_all_to_all_trace_bt_log, 1);
3076 bft_printf("cs_all_to_all_copy_index: %d\n", _n_trace);
3077 }
3078
3079 cs_assert(d != NULL);
3080
3081 cs_lnum_t *src_count = NULL;
3082 cs_lnum_t *_dest_index = dest_index;
3083
3084 cs_all_to_all_n_elts_dest(d); /* force sync. if needed */
3085
3086 cs_lnum_t n_src = (reverse) ? d->n_elts_dest : d->n_elts_src;
3087 cs_lnum_t n_dest = -1;
3088
3089 if (dest_index == NULL) {
3090 n_dest = (reverse) ? d->n_elts_src : d->n_elts_dest;
3091 }
3092
3093 t0 = cs_timer_time();
3094
3095 if (dest_index == NULL)
3096 BFT_MALLOC(_dest_index, n_dest + 1, cs_lnum_t);
3097
3098 /* Convert send index to count, then exchange */
3099
3100 BFT_MALLOC(src_count, n_src, cs_lnum_t);
3101
3102 for (cs_lnum_t i = 0; i < n_src; i++)
3103 src_count[i] = src_index[i+1] - src_index[i];
3104
3105 t1 = cs_timer_time();
3106 cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_TOTAL,
3107 &t0, &t1);
3108
3109 cs_all_to_all_copy_array(d,
3110 CS_LNUM_TYPE,
3111 1,
3112 reverse,
3113 src_count,
3114 _dest_index + 1);
3115
3116 t0 = cs_timer_time();
3117
3118 BFT_FREE(src_count);
3119
3120 _dest_index[0] = 0;
3121
3122 if (n_dest < 1)
3123 n_dest = (reverse) ? d->n_elts_src : d->n_elts_dest;
3124
3125 for (cs_lnum_t i = 0; i < n_dest; i++)
3126 _dest_index[i+1] += _dest_index[i];
3127
3128 t1 = cs_timer_time();
3129 cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_TOTAL,
3130 &t0, &t1);
3131
3132 return _dest_index;
3133 }
3134
3135 /*----------------------------------------------------------------------------*/
3136 /*!
 * \brief Communicate local indexed data using all-to-all distributor.
3138 *
3139 * If a destination buffer is provided, it should be of sufficient size for
3140 * the number of elements indicated by
3141 * dest_index[\ref cs_all_to_all_n_elts_dest "cs_all_to_all_n_elts_dest(d)"];
3142 *
3143 * If no buffer is provided, one is allocated automatically, and transferred
3144 * to the caller (who is responsible for freeing it when no longer needed).
3145 *
3146 * If used in reverse mode, data is still communicated from src_index
3147 * to dest_index or an internal buffer, but communication direction
3148 * (i.e. source and destination ranks) are reversed.
3149 *
3150 * This is obviously a collective operation, and all ranks must provide
3151 * the same value for the reverse parameter.
3152 *
3153 * \param[in, out] d pointer to associated all-to-all distributor
3154 * \param[in] datatype type of data considered
3155 * \param[in] reverse if true, communicate in reverse direction
3156 * \param[in] src_index source index
3157 * \param[in] src_data source data
3158 * \param[in] dest_index destination index
3159 * \param[out] dest_data pointer to destination data, or NULL
3160 *
 * \return pointer to destination data (dest_data if non-NULL)
3162 */
3163 /*----------------------------------------------------------------------------*/
3164
void *
cs_all_to_all_copy_indexed(cs_all_to_all_t   *d,
                           cs_datatype_t      datatype,
                           bool               reverse,
                           const cs_lnum_t   *src_index,
                           const void        *src_data,
                           const cs_lnum_t   *dest_index,
                           void              *dest_data)
{
  cs_assert(d != NULL);

  /* Optional instrumentation: log call and backtrace when tracing. */

  if (_n_trace > 0) {
    fprintf(_all_to_all_trace_bt_log, "\ncs_all_to_all_copy_indexed: %d\n\n",
            _n_trace);
    cs_base_backtrace_dump(_all_to_all_trace_bt_log, 1);
    bft_printf("cs_all_to_all_copy_indexed: %d\n", _n_trace);
  }

  void  *_dest_data = NULL;

  cs_timer_t t0, t1;

  t0 = cs_timer_time();

  /* Reverse can only be called after direct exchange in most cases
     (this case should be rare, and requires additional exchanges,
     but let's play it safe and make sure it works if needed);
     the empty forward exchange below only synchronizes metadata. */

  if (d->n_elts_dest < 0 && reverse)
    cs_all_to_all_copy_array(d,
                             CS_DATATYPE_NULL,
                             0,
                             false,
                             NULL,
                             NULL);

  /* Now do regular exchange, dispatching on the configured algorithm */

  switch(d->type) {

  case CS_ALL_TO_ALL_MPI_DEFAULT:
    {
      if (d->n_elts_dest < 0) { /* Exchange metadata if not done yet */
        _alltoall_caller_exchange_meta(d->dc,
                                       d->n_elts_src,
                                       d->dest_rank);
        d->n_elts_dest = d->dc->recv_size;
      }
      size_t n_elts = (reverse) ? d->n_elts_dest : d->n_elts_src;
      /* Counts/displacements are rebuilt for indexed data, so save the
         element-based metadata first and restore it once done. */
      _alltoall_caller_save_meta_i(d->dc);
      if (reverse)
        _alltoall_caller_swap_src_dest(d->dc);
      _alltoall_caller_prepare_i(d->dc,
                                 n_elts,
                                 datatype,
                                 reverse,
                                 src_index,
                                 dest_index,
                                 src_data,
                                 d->recv_id,
                                 d->dest_rank);
      _dest_data = _alltoall_caller_exchange_i(d,
                                               d->dc,
                                               reverse,
                                               dest_index,
                                               d->dest_rank,
                                               dest_data);
      if (reverse) {
        _alltoall_caller_swap_src_dest(d->dc);
        /* Do not keep a (non-owned) pointer to caller data as send buffer */
        if (d->dc->send_buffer == src_data)
          d->dc->send_buffer = NULL;
      }
      _alltoall_caller_reset_meta_i(d->dc, d->dest_rank);
    }
    break;

  case CS_ALL_TO_ALL_HYBRID:
    {
      if (d->n_elts_dest < 0) { /* Exchange metadata if not done yet */
        _hybrid_pex_exchange_meta(d->hc,
                                  d->n_elts_src,
                                  d->dest_rank);
        BFT_FREE(d->_dest_rank);
        d->n_elts_dest = d->hc->recv_size;
      }
      size_t n_elts = (reverse) ? d->n_elts_dest : d->n_elts_src;
      /* Same save/prepare/exchange/reset sequence as the default variant,
         using the hybrid (rank-neighbors) communicator. */
      _hybrid_pex_save_meta_i(d->hc);
      if (reverse)
        _hybrid_pex_swap_src_dest(d->hc);
      _hybrid_pex_prepare_i(d->hc,
                            n_elts,
                            datatype,
                            reverse,
                            src_index,
                            dest_index,
                            src_data,
                            d->recv_id);
      _dest_data = _hybrid_pex_exchange_i(d,
                                          d->hc,
                                          reverse,
                                          dest_index,
                                          dest_data);
      if (reverse) {
        _hybrid_pex_swap_src_dest(d->hc);
        /* Do not keep a (non-owned) pointer to caller data as send buffer */
        if (d->hc->send_buffer == src_data)
          d->hc->send_buffer = NULL;
      }
      _hybrid_pex_reset_meta_i(d->hc);
    }
    break;

  case CS_ALL_TO_ALL_CRYSTAL_ROUTER:
    {
      _dest_data = dest_data;
      cs_timer_t tcr0, tcr1;
      cs_crystal_router_t *cr;
      if (!reverse) {
        cr = cs_crystal_router_create_i(d->n_elts_src,
                                        datatype,
                                        _cr_flags(d, reverse),
                                        src_index,
                                        src_data,
                                        NULL,
                                        d->dest_id,
                                        d->dest_rank,
                                        d->comm);
        tcr0 = cs_timer_time();

        /* Pre-exchange trace sample: 9 values per entry
           (timestamp, dt, stage, memory and size info; column meanings
           match the CSV written by cs_all_to_all_log_finalize). */

        if (_n_trace < _n_trace_max) {
          /* Time to 1-5 s */
          _all_to_all_trace[_n_trace*9] = tcr0.sec*1e5 + tcr0.nsec/1e4;
          _all_to_all_trace[_n_trace*9+1] = 0;
          _all_to_all_trace[_n_trace*9+2] = 0;
          _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
          _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
          _all_to_all_trace[_n_trace*9+5] = 0;
          _all_to_all_trace[_n_trace*9+6] = 0;
          _all_to_all_trace[_n_trace*9+7] = 0;
          _all_to_all_trace[_n_trace*9+8] = 0;
          _n_trace += 1;
        }

        cs_crystal_router_exchange(cr);
        tcr1 = cs_timer_time();

        /* Post-exchange trace sample (stage 1); dt is relative to the
           previous sample, which is always present when we reach here. */

        if (_n_trace < _n_trace_max) {
          size_t max_sizes[2];
          cs_crystal_router_get_max_sizes(cr, max_sizes);

          /* Time to 1-5 s */
          _all_to_all_trace[_n_trace*9] = tcr1.sec*1e5 + tcr1.nsec/1e4;
          _all_to_all_trace[_n_trace*9+1] =   _all_to_all_trace[_n_trace*9]
                                            - _all_to_all_trace[(_n_trace-1)*9];
          _all_to_all_trace[_n_trace*9+2] = 1;
          _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
          _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
          _all_to_all_trace[_n_trace*9+5] = max_sizes[0];
          _all_to_all_trace[_n_trace*9+6] = max_sizes[1];
          _all_to_all_trace[_n_trace*9+7] = 0;
          _all_to_all_trace[_n_trace*9+8] = 0;
          _n_trace += 1;
        }

        if (d->n_elts_dest < 0) {
          d->n_elts_dest = cs_crystal_router_n_elts(cr);
          d->n_elts_dest_e = cs_crystal_router_n_recv_elts(cr);
        }
        /* Only request source ranks if not already available */
        int **p_src_rank = (d->src_rank == NULL) ? &(d->src_rank) : NULL;
        cs_crystal_router_get_data(cr,
                                   p_src_rank,
                                   &(d->recv_id),
                                   &(d->src_id),
                                   NULL, /* dest_index */
                                   &_dest_data);
      }
      else {
        /* Reverse exchange: send from the (possibly extended) destination
           side back to the original source ranks/ids. */
        const cs_lnum_t *elt_id
          = (d->n_elts_dest < d->n_elts_dest_e) ? d->recv_id : NULL;
        cr = cs_crystal_router_create_i(d->n_elts_dest_e,
                                        datatype,
                                        _cr_flags(d, reverse),
                                        src_index,
                                        src_data,
                                        elt_id,
                                        d->src_id,
                                        d->src_rank,
                                        d->comm);
        tcr0 = cs_timer_time();

        if (_n_trace < _n_trace_max) {
          /* Time to 1-5 s */
          _all_to_all_trace[_n_trace*9] = tcr0.sec*1e5 + tcr0.nsec/1e4;
          _all_to_all_trace[_n_trace*9+1] = 0;
          _all_to_all_trace[_n_trace*9+2] = 0;
          _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
          _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
          _all_to_all_trace[_n_trace*9+5] = 0;
          _all_to_all_trace[_n_trace*9+6] = 0;
          _all_to_all_trace[_n_trace*9+7] = 0;
          _all_to_all_trace[_n_trace*9+8] = 0;
          _n_trace += 1;
        }

        cs_crystal_router_exchange(cr);
        tcr1 = cs_timer_time();

        if (_n_trace < _n_trace_max) {
          size_t max_sizes[2];
          cs_crystal_router_get_max_sizes(cr, max_sizes);

          /* Time to 1-5 s */
          _all_to_all_trace[_n_trace*9] = tcr1.sec*1e5 + tcr1.nsec/1e4;
          _all_to_all_trace[_n_trace*9+1] =   _all_to_all_trace[_n_trace*9]
                                            - _all_to_all_trace[(_n_trace-1)*9];
          _all_to_all_trace[_n_trace*9+2] = 1;
          _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
          _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
          _all_to_all_trace[_n_trace*9+5] = max_sizes[0];
          _all_to_all_trace[_n_trace*9+6] = max_sizes[1];
          _all_to_all_trace[_n_trace*9+7] = 0;
          _all_to_all_trace[_n_trace*9+8] = 0;
          _n_trace += 1;
        }

        /* No metadata needed in reverse mode; retrieve data only */
        cs_crystal_router_get_data(cr,
                                   NULL,
                                   NULL,
                                   NULL,
                                   NULL, /* dest_index */
                                   &_dest_data);
      }
      cs_crystal_router_destroy(&cr);
      cs_timer_counter_add_diff
        (_all_to_all_timers + CS_ALL_TO_ALL_TIME_EXCHANGE, &tcr0, &tcr1);
      _all_to_all_calls[CS_ALL_TO_ALL_TIME_EXCHANGE] += 1;
    }
    break;

  }

  t1 = cs_timer_time();
  cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_TOTAL,
                            &t0, &t1);
  _all_to_all_calls[CS_ALL_TO_ALL_TIME_TOTAL] += 1;

  return _dest_data;
}
3412
3413 /*----------------------------------------------------------------------------*/
3414 /*!
3415 * \brief Get array of source element ranks associated with an
3416 * all-to-all distributor.
3417 *
3418 * This function should be called only after \ref cs_all_to_all_copy_array,
3419 * and allocates and returns an array of source element ranks matching the
3420 * exchanged data elements.
3421 *
3422 * It should also be called only if the distributor creation flags match
3423 * CS_ALL_TO_ALL_NEED_SRC_RANK or CS_ALL_TO_ALL_ORDER_BY_SRC_RANK.
3424 *
3425 * The returned data is owned by the caller, who is responsible for freeing
3426 * it when no longer needed.
3427 *
3428 * If source ranks are not available (depending on the distributor
3429 * creation function and options), the matching pointer will
3430 * be set to NULL.
3431 *
3432 * \param[in] d pointer to associated all-to-all distributor
3433 *
3434 * \return array of source ranks (or NULL)
3435 */
3436 /*----------------------------------------------------------------------------*/
3437
3438 int *
cs_all_to_all_get_src_rank(cs_all_to_all_t * d)3439 cs_all_to_all_get_src_rank(cs_all_to_all_t *d)
3440 {
3441 cs_timer_t t0 = cs_timer_time();
3442
3443 int *src_rank = NULL;
3444
3445 cs_assert(d != NULL);
3446
3447 if (! ( d->flags & CS_ALL_TO_ALL_NEED_SRC_RANK
3448 || d->flags & CS_ALL_TO_ALL_ORDER_BY_SRC_RANK))
3449 bft_error(__FILE__, __LINE__, 0,
3450 "%s: is called for a distributor with flags %d, which does not\n"
3451 "match masks CS_ALL_TO_ALL_NEED_SRC_RANK (%d) or "
3452 "CS_ALL_TO_ALL_ORDER_BY_SRC_RANK (%d).",
3453 __func__, d->flags,
3454 CS_ALL_TO_ALL_NEED_SRC_RANK,
3455 CS_ALL_TO_ALL_ORDER_BY_SRC_RANK);
3456
3457 BFT_MALLOC(src_rank, d->n_elts_dest, int);
3458
3459 switch(d->type) {
3460
3461 case CS_ALL_TO_ALL_MPI_DEFAULT:
3462 {
3463 const _mpi_all_to_all_caller_t *dc = d->dc;
3464
3465 for (int i = 0; i < dc->n_ranks; i++) {
3466 for (cs_lnum_t j = dc->recv_displ[i]; j < dc->recv_displ[i+1]; j++)
3467 src_rank[j] = i;
3468 }
3469 }
3470 break;
3471
3472 case CS_ALL_TO_ALL_HYBRID:
3473 {
3474 const _hybrid_pex_t *hc = d->hc;
3475
3476 for (int i = 0; i < hc->rn_recv->size; i++) {
3477 int rank_id = hc->rn_recv->rank[i];
3478 for (cs_lnum_t j = hc->recv_displ[i]; j < hc->recv_displ[i+1]; j++)
3479 src_rank[j] = rank_id;
3480 }
3481 }
3482 break;
3483
3484 case CS_ALL_TO_ALL_CRYSTAL_ROUTER:
3485 {
3486 if (d->src_rank != NULL)
3487 memcpy(src_rank, d->src_rank, d->n_elts_dest*sizeof(int));
3488 }
3489
3490 }
3491
3492 cs_timer_t t1 = cs_timer_time();
3493 cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_TOTAL,
3494 &t0, &t1);
3495
3496 return src_rank;
3497 }
3498
3499 #endif /* defined(HAVE_MPI) */
3500
3501 /*----------------------------------------------------------------------------*/
3502 /*!
3503 * \brief Get current type of all-to-all distributor algorithm choice.
3504 *
3505 * \return current type of all-to-all distributor algorithm choice
3506 */
3507 /*----------------------------------------------------------------------------*/
3508
cs_all_to_all_type_t
cs_all_to_all_get_type(void)
{
  /* Simple accessor for the file-scope default algorithm selection. */
  return _all_to_all_type;
}
3514
3515 /*----------------------------------------------------------------------------*/
3516 /*!
3517 * \brief Get current type of hybrid all-to-all distributor parameters.
3518 *
3519 * \param[out] rne_type type of metadata exchange algorithm, or NULL
3520 */
3521 /*----------------------------------------------------------------------------*/
3522
3523 void
cs_all_to_all_get_hybrid_parameters(cs_rank_neighbors_exchange_t * rne_type)3524 cs_all_to_all_get_hybrid_parameters(cs_rank_neighbors_exchange_t *rne_type)
3525 {
3526 if (rne_type != NULL)
3527 *rne_type = _hybrid_meta_type;
3528 }
3529
3530 /*----------------------------------------------------------------------------*/
3531 /*!
 * \brief Set hybrid all-to-all distributor parameters.
3533 *
3534 * \param[in] rne_type type of metadata exchange algorithm
3535 */
3536 /*----------------------------------------------------------------------------*/
3537
void
cs_all_to_all_set_hybrid_parameters(cs_rank_neighbors_exchange_t  rne_type)
{
  /* Simple mutator for the file-scope hybrid metadata exchange setting. */
  _hybrid_meta_type = rne_type;
}
3543
3544 /*----------------------------------------------------------------------------*/
3545 /*!
3546 * \brief Set current type of all-to-all distributor algorithm choice.
3547 *
3548 * \param t type of all-to-all distributor algorithm choice to select
3549 */
3550 /*----------------------------------------------------------------------------*/
3551
void
cs_all_to_all_set_type(cs_all_to_all_type_t  t)
{
  /* Simple mutator for the file-scope default algorithm selection. */
  _all_to_all_type = t;
}
3557
3558 /*----------------------------------------------------------------------------*/
3559 /*!
3560 * \brief Log performance information relative to instrumented all-to-all
3561 * distribution.
3562 */
3563 /*----------------------------------------------------------------------------*/
3564
void
cs_all_to_all_log_finalize(void)
{
  cs_crystal_router_log_finalize();

#if defined(HAVE_MPI)

  if (_all_to_all_calls[0] <= 0)
    return;

  /* Describe the active algorithm; initialize the buffer so an
     unexpected enum value cannot lead to reading uninitialized memory. */

  char method_name[96] = "";
  switch(_all_to_all_type) {
  case CS_ALL_TO_ALL_MPI_DEFAULT:
    snprintf(method_name, 96, N_("MPI_Alltoall and MPI_Alltoallv"));
    break;
  case CS_ALL_TO_ALL_HYBRID:
    snprintf(method_name, 96, N_("Hybrid, %s (metadata), %s (data)"),
             _(cs_rank_neighbors_exchange_name[_hybrid_meta_type]),
             "MPI_Alltoallv");
    break;
  case CS_ALL_TO_ALL_CRYSTAL_ROUTER:
    snprintf(method_name, 96, N_("Crystal Router algorithm"));
    break;
  default:
    break;
  }
  method_name[95] = '\0';

  cs_log_printf(CS_LOG_PERFORMANCE,
                _("\nAll-to-many operations: (%s):\n\n"),
                method_name);

  /* Print times: reduce local timers to mean/min/max over ranks */

  double wtimes[3] = {0, 0, 0};
  double wtimes_mean[3], wtimes_max[3], wtimes_min[3];
  for (int i = 0; i < 3; i++) {
    if (_all_to_all_calls[i] > 0)
      wtimes[i] = (_all_to_all_timers[i]).nsec*1e-9;
    wtimes_mean[i] = wtimes[i];
    wtimes_max[i] = wtimes[i];
    wtimes_min[i] = wtimes[i];
  }
  if (cs_glob_n_ranks > 1) {
    MPI_Allreduce(wtimes, wtimes_mean, 3, MPI_DOUBLE, MPI_SUM, cs_glob_mpi_comm);
    MPI_Allreduce(wtimes, wtimes_max, 3, MPI_DOUBLE, MPI_MAX, cs_glob_mpi_comm);
    MPI_Allreduce(wtimes, wtimes_min, 3, MPI_DOUBLE, MPI_MIN, cs_glob_mpi_comm);
  }
  for (int i = 0; i < 3; i++)
    wtimes_mean[i] /= cs_glob_n_ranks;

  cs_log_printf
    (CS_LOG_PERFORMANCE,
     _("                      mean         minimum      maximum"
       "     calls\n"
       "  Total:        %12.5f s %12.5f %12.5f s %lu\n"
       "  Metadata exchange:  %12.5f s %12.5f %12.5f s %lu\n"
       "  Data exchange:      %12.5f s %12.5f %12.5f s %lu\n\n"),
     wtimes_mean[0], wtimes_min[0], wtimes_max[0],
     (unsigned long)(_all_to_all_calls[0]),
     wtimes_mean[1], wtimes_min[1], wtimes_max[1],
     (unsigned long)(_all_to_all_calls[1]),
     wtimes_mean[2], wtimes_min[2], wtimes_max[2],
     (unsigned long)(_all_to_all_calls[2]));

  cs_log_separator(CS_LOG_PERFORMANCE);

  /* Dump per-call trace samples (9 values per entry) as CSV on rank 0 */

  if (cs_glob_n_ranks > 1 && _n_trace > 0) {
    cs_gnum_t *_all_to_all_sum;
    BFT_MALLOC(_all_to_all_sum, _n_trace*9, uint64_t);
    cs_gnum_t *_all_to_all_max;
    BFT_MALLOC(_all_to_all_max, _n_trace*9, uint64_t);

    /* NOTE(review): allocation uses uint64_t while the pointers are
       cs_gnum_t; this assumes cs_gnum_t is 64-bit (consistent with the
       MPI_UINT64_T reductions below) — confirm for 32-bit gnum builds. */

    MPI_Allreduce(_all_to_all_trace, _all_to_all_sum, _n_trace*9, MPI_UINT64_T,
                  MPI_SUM, cs_glob_mpi_comm);
    MPI_Allreduce(_all_to_all_trace, _all_to_all_max, _n_trace*9, MPI_UINT64_T,
                  MPI_MAX, cs_glob_mpi_comm);

    for (int j = 0; j < _n_trace*9; j++)
      _all_to_all_sum[j] /= cs_glob_n_ranks;

    if (cs_glob_rank_id < 1) {
      FILE *f = fopen("all_to_all_trace.csv", "w");
      if (f != NULL) {  /* skip output rather than crash if open fails */
        fprintf(f, "call, time, dt_mean, dt_max, stage, mem_cur_mean, mem_cur_max, "
                "mem_max_mean, mem_max, send_size_mean, send_size_max, "
                "recv_size_mean, recv_size_max, "
                "send_ranks_mean, send_ranks_max, "
                "recv_ranks_mean, recv_ranks_max\n");
        for (int j = 0; j < _n_trace; j++)
          fprintf(f,
                  "%d, %g, %g, %g, %d,"
                  "%llu, %llu, %llu, %llu, %llu, %llu,"
                  "%llu, %llu, %llu, %llu, %llu, %llu\n",
                  j,
                  (_all_to_all_trace[j*9] - _all_to_all_trace[0])/1.e5,
                  _all_to_all_sum[j*9+1]/1.e5,
                  _all_to_all_max[j*9+1]/1.e5,
                  (int)_all_to_all_trace[j*9+2],
                  (unsigned long long)_all_to_all_sum[j*9+3],
                  (unsigned long long)_all_to_all_max[j*9+3],
                  (unsigned long long)_all_to_all_sum[j*9+4],
                  (unsigned long long)_all_to_all_max[j*9+4],
                  (unsigned long long)_all_to_all_sum[j*9+5]/1000,
                  (unsigned long long)_all_to_all_max[j*9+5]/1000,
                  (unsigned long long)_all_to_all_sum[j*9+6]/1000,
                  (unsigned long long)_all_to_all_max[j*9+6]/1000,
                  (unsigned long long)_all_to_all_sum[j*9+7],
                  (unsigned long long)_all_to_all_max[j*9+7],
                  (unsigned long long)_all_to_all_sum[j*9+8],
                  (unsigned long long)_all_to_all_max[j*9+8]);
        fclose(f);
      }
      if (_all_to_all_trace_bt_log != NULL) {  /* guard: may never be opened */
        fclose(_all_to_all_trace_bt_log);
        _all_to_all_trace_bt_log = NULL;
      }
    }

    BFT_FREE(_all_to_all_sum);
    BFT_FREE(_all_to_all_max);
    BFT_FREE(_all_to_all_trace);
  }

#endif /* defined(HAVE_MPI) */
}
3686
3687
3688 /*----------------------------------------------------------------------------*/
3689
3690 END_C_DECLS
3691