1 /*============================================================================
2  * All-to-all parallel data exchange.
3  *============================================================================*/
4 
5 /*
6   This file is part of Code_Saturne, a general-purpose CFD tool.
7 
8   Copyright (C) 1998-2021 EDF S.A.
9 
10   This program is free software; you can redistribute it and/or modify it under
11   the terms of the GNU General Public License as published by the Free Software
12   Foundation; either version 2 of the License, or (at your option) any later
13   version.
14 
15   This program is distributed in the hope that it will be useful, but WITHOUT
16   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
18   details.
19 
20   You should have received a copy of the GNU General Public License along with
21   this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
22   Street, Fifth Floor, Boston, MA 02110-1301, USA.
23 */
24 
25 /*----------------------------------------------------------------------------*/
26 
27 #include "cs_defs.h"
28 
29 /*----------------------------------------------------------------------------
30  * Standard C library headers
31  *----------------------------------------------------------------------------*/
32 
33 #include <assert.h>
34 #include <errno.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 
39 /*----------------------------------------------------------------------------
40  *  Local headers
41  *----------------------------------------------------------------------------*/
42 
43 #include "bft_mem.h"
44 #include "bft_mem_usage.h"
45 #include "bft_error.h"
46 #include "bft_printf.h"
47 
48 #include "cs_base.h"
49 #include "cs_assert.h"
50 #include "cs_block_dist.h"
51 #include "cs_crystal_router.h"
52 #include "cs_log.h"
53 #include "cs_order.h"
54 #include "cs_rank_neighbors.h"
55 #include "cs_timer.h"
56 
57 /*----------------------------------------------------------------------------
58  *  Header for the current file
59  *----------------------------------------------------------------------------*/
60 
61 #include "cs_all_to_all.h"
62 
63 /*----------------------------------------------------------------------------*/
64 
65 BEGIN_C_DECLS
66 
67 /*=============================================================================
68  * Additional Doxygen documentation
69  *============================================================================*/
70 
71 /*!
72   \file cs_all_to_all.c
73         All-to-all parallel data exchange.
74 
75   \typedef cs_all_to_all_t
76         Opaque all-to-all distribution structure
77 
78   \enum cs_all_to_all_type_t
79 
80   \brief All-to-all algorithm selection
81 
82   \var CS_ALL_TO_ALL_MPI_DEFAULT
83        Use MPI_Alltoall and MPI_Alltoallv sequences
84 
85   \var CS_ALL_TO_ALL_CRYSTAL_ROUTER
86        Use crystal router algorithm
87 
88   \paragraph all_to_all_flags Using flags
89   \parblock
90 
91   Flags are defined as a sum (bitwise or) of constants, which may include
92   \ref CS_ALL_TO_ALL_USE_DEST_ID, \ref CS_ALL_TO_ALL_ORDER_BY_SRC_RANK,
93   \ref CS_ALL_TO_ALL_NO_REVERSE, and \ref CS_ALL_TO_ALL_NEED_SRC_RANK.
94 
95   \endparblock
96 */
97 
98 /*! \cond DOXYGEN_SHOULD_SKIP_THIS */
99 
100 /* Note for future optimization:
101    for cases where data to be sent is sorted by increasing destination rank,
102    using communication schemes requiring ordering by ranks (such as
103    CS_ALL_TO_ALL_MPI_DEFAULT, we should not need to build an extra array,
104    but only to send the correct parts of the indexed list to the correct
105    ranks, so cs_all_to_all_copy_indexed may internally use an extra array;
106    this could be avoided by adding an additional send option or flag,
107    or adding cs_all_to_copy_... variants for cases ordered by rank */
108 
109 /*=============================================================================
110  * Macro definitions
111  *============================================================================*/
112 
113 /*=============================================================================
114  * Local type definitions
115  *============================================================================*/
116 
117 #if defined(HAVE_MPI)
118 
119 typedef enum {
120 
121   CS_ALL_TO_ALL_TIME_TOTAL,
122   CS_ALL_TO_ALL_TIME_METADATA,
123   CS_ALL_TO_ALL_TIME_EXCHANGE
124 
125 } cs_all_to_all_timer_t;
126 
127 /* Base structure for MPI_Alltoall exchanges */
128 
129 typedef struct {
130 
131   cs_datatype_t   datatype;          /* associated datatype */
132   cs_datatype_t   dest_id_datatype;  /* type of destination id (CS_LNUM_TYPE,
133                                         or CS_DATATYPE_NULL) */
134 
135   size_t          stride;            /* stride if strided, 0 otherwise */
136 
137   size_t          elt_shift;         /* starting byte for element data */
138   size_t          comp_size;         /* Composite element size, with padding */
139 
140   size_t          send_size;         /* Send buffer element count */
141   size_t          recv_size;         /* Receive buffer element count */
142 
143   const void     *send_buffer;       /* Send buffer */
144   unsigned char  *_send_buffer;      /* Send buffer */
145 
146   int            *send_count;        /* Send counts for MPI_Alltoall */
147   int            *recv_count;        /* Receive counts for MPI_Alltoall */
148   int            *send_displ;        /* Send displs for MPI_Alltoall */
149   int            *recv_displ;        /* Receive displs for MPI_Alltoall */
150 
151   int            *recv_count_save;   /* Saved (strided) receive counts for
152                                         MPI_Alltoall for indexed exchanges */
153 
154   MPI_Comm        comm;              /* Associated MPI communicator */
155   MPI_Datatype    comp_type;         /* Associated MPI datatype */
156 
157   int             n_ranks;           /* Number of ranks associated with
158                                         communicator */
159 
160 } _mpi_all_to_all_caller_t;
161 
162 /* Base structure for cs_rank_neighbors-based exchanges */
163 
164 typedef struct {
165 
166   cs_rank_neighbors_t   *rn_send;    /* rank neighbors structure for sending */
167   cs_rank_neighbors_t   *rn_recv;    /* rank neighbors structure for receiving */
168 
169   cs_datatype_t   datatype;          /* associated datatype */
170   cs_datatype_t   dest_id_datatype;  /* type of destination id (CS_LNUM_TYPE,
171                                         or CS_DATATYPE_NULL) */
172 
173   size_t          stride;            /* stride if strided, 0 otherwise */
174 
175   size_t          elt_shift;         /* starting byte for element data */
176   size_t          comp_size;         /* Composite element size, with padding */
177 
178   size_t          send_size;         /* Send buffer element count */
179   size_t          recv_size;         /* Receive buffer element count */
180 
181   int            *elt_rank_index;    /* Element rank index matching owner
182                                         rank_index */
183 
184   const void     *send_buffer;       /* Send buffer */
185   unsigned char  *_send_buffer;      /* Send buffer */
186 
187   int            *send_count;        /* Send counts for MPI_Alltoall */
188   int            *recv_count;        /* Receive counts for MPI_Alltoall */
189   int            *send_displ;        /* Send displs for MPI_Alltoall */
190   int            *recv_displ;        /* Receive displs for MPI_Alltoall */
191 
192   int            *recv_count_save;   /* Saved (strided) receive counts for
193                                         MPI_Alltoall for indexed exchanges */
194 
195   MPI_Comm        comm;              /* Associated MPI communicator */
196   MPI_Datatype    comp_type;         /* Associated MPI datatype */
197 
198 } _hybrid_pex_t;
199 
200 #endif /* defined(HAVE_MPI) */
201 
202 /* Structure used to redistribute data */
203 
204 #if defined(HAVE_MPI)
205 
206 struct _cs_all_to_all_t {
207 
208   cs_lnum_t                  n_elts_src;   /* Number of source elements */
209   cs_lnum_t                  n_elts_dest;  /* Number of destination elements
210                                               (-1 before metadata available) */
211   cs_lnum_t                  n_elts_dest_e; /* Number of destination elements
212                                                exchanged, independently of
213                                                 dest_id (-1 before metadata
214                                                 available) */
215 
216   int                        flags;        /* option flags */
217 
218   /* Send metadata */
219 
220   const int                 *dest_rank;    /* optional element destination
221                                               rank (possibly shared) */
222   int                       *_dest_rank;   /* dest_rank if owner, or NULL */
223 
224   const cs_lnum_t           *dest_id;      /* optional element destination id
225                                               (possibly shared) */
226   cs_lnum_t                 *_dest_id;     /* dest_id if owner, or NULL */
227 
228   /* Receive metadata */
229 
230   cs_lnum_t                 *recv_id;      /* received match for dest_id */
231 
232   /* Data needed only for Crystal Router reverse communication */
233 
234   cs_lnum_t                 *src_id;       /* received match for dest_id */
235   int                       *src_rank;     /* received source rank */
236 
237   /* Sub-structures */
238 
239   _mpi_all_to_all_caller_t  *dc;       /* Default MPI_Alltoall(v) caller */
240   _hybrid_pex_t             *hc;       /* Hybrid PEX */
241   cs_crystal_router_t       *cr;       /* associated crystal-router */
242 
243   /* MPI data */
244 
245   int                        n_ranks;      /* Number of associated ranks */
246   MPI_Comm                   comm;         /* associated communicator */
247 
248   /* Other metadata */
249 
250   cs_all_to_all_type_t       type;         /* Communication protocol */
251 
252 };
253 
254 #endif /* defined(HAVE_MPI) */
255 
256 /*============================================================================
257  * Static global variables
258  *============================================================================*/
259 
260 /* Default all-to-all type */
261 
262 static cs_all_to_all_type_t _all_to_all_type = CS_ALL_TO_ALL_MPI_DEFAULT;
263 
264 static cs_rank_neighbors_exchange_t _hybrid_meta_type
265   = CS_RANK_NEIGHBORS_CRYSTAL_ROUTER;
266 
267 #if defined(HAVE_MPI)
268 
269 /* Call counter and timer: 0: total, 1: metadata comm, 2: data comm */
270 
271 static size_t              _all_to_all_calls[3] = {0, 0, 0};
272 static cs_timer_counter_t  _all_to_all_timers[3];
273 
274 /* Instrumentation */
275 
276 static int        _n_trace = 0;
277 static int        _n_trace_max = 0;
278 static uint64_t  *_all_to_all_trace = NULL;
279 static FILE      *_all_to_all_trace_bt_log = NULL;
280 
281 #endif /* defined(HAVE_MPI) */
282 
283 /*============================================================================
284  * Local function defintions
285  *============================================================================*/
286 
287 #if defined(HAVE_MPI)
288 
289 /*----------------------------------------------------------------------------
290  * Common portion of different all-to-all distributor contructors.
291  *
292  * arguments:
293  *   n_elts <-- number of elements
294  *   flags  <-- sum of ordering and metadata flag constants
295  *   comm   <-- associated MPI communicator
296  *
297  * returns:
298  *   pointer to new all-to-all distributor
299  *----------------------------------------------------------------------------*/
300 
301 static cs_all_to_all_t *
_all_to_all_create_base(size_t n_elts,int flags,MPI_Comm comm)302 _all_to_all_create_base(size_t    n_elts,
303                         int       flags,
304                         MPI_Comm  comm)
305 {
306   cs_all_to_all_t *d;
307 
308   /* Initialize timers if required */
309 
310   if (_all_to_all_calls[0] == 0) {
311     int n_timers = sizeof(_all_to_all_timers)/sizeof(_all_to_all_timers[0]);
312     for (int i = 0; i < n_timers; i++)
313       CS_TIMER_COUNTER_INIT(_all_to_all_timers[i]);
314 
315     const char *p = getenv("CS_ALL_TO_ALL_TRACE");
316     if (p != NULL) {
317       if (atoi(p) > 0) {
318         _n_trace_max = atoi(p);
319         bft_printf(_("\n cs_all_2_all_trace: %d.\n\n"), _n_trace_max);
320         BFT_MALLOC(_all_to_all_trace, _n_trace_max*9, uint64_t);
321         _all_to_all_trace_bt_log = fopen("all_to_all_trace_bt.txt", "w");
322       }
323     }
324   }
325 
326   /* Check flags */
327 
328   if (   (flags & CS_ALL_TO_ALL_USE_DEST_ID)
329       && (flags & CS_ALL_TO_ALL_ORDER_BY_SRC_RANK))
330     bft_error(__FILE__, __LINE__, 0,
331               "%s: flags may not match both\n"
332               "CS_ALL_TO_ALL_USE_DEST_ID and\n"
333               "CS_ALL_TO_ALL_ORDER_BY_SRC_RANK.",
334               __func__);
335 
336   /* Allocate structure */
337 
338   BFT_MALLOC(d, 1, cs_all_to_all_t);
339 
340   /* Create associated sub-structure */
341 
342   d->n_elts_src = n_elts;
343   d->n_elts_dest = -1; /* undetermined as yet */
344   d->n_elts_dest_e = -1;
345 
346   d->flags = flags;
347 
348   d->dest_rank = NULL;
349   d->_dest_rank = NULL;
350 
351   d->dest_id = NULL;
352   d->_dest_id = NULL;
353 
354   d->recv_id = NULL;
355 
356   d->src_id = NULL;
357   d->src_rank = NULL;
358 
359   d->cr = NULL;
360   d->hc = NULL;
361   d->dc = NULL;
362 
363   d->comm = comm;
364   MPI_Comm_size(comm, &(d->n_ranks));
365 
366   d->type = _all_to_all_type;
367 
368   return d;
369 }
370 
371 /*----------------------------------------------------------------------------
372  * Compute rank displacement based on count.
373  *
374  * arguments:
375  *   n_ranks <-- number of ranks
376  *   count   <-- number of elements per rank (size: n_ranks)
377  *   displ   --> element displacement in cumulative array (size: n_ranks)
378  *
379  * returns:
380  *   cumulative count for all ranks
381  *----------------------------------------------------------------------------*/
382 
383 static cs_lnum_t
_compute_displ(int n_ranks,const int count[],int displ[])384 _compute_displ(int        n_ranks,
385                const int  count[],
386                int        displ[])
387 {
388   int i;
389   cs_lnum_t total_count = 0;
390 
391   displ[0] = 0;
392 
393   for (i = 0; i < n_ranks; i++)
394     displ[i+1] = displ[i] + count[i];
395 
396   if (n_ranks > 0)
397     total_count = displ[n_ranks-1] + count[n_ranks-1];
398 
399   return total_count;
400 }
401 
402 /*----------------------------------------------------------------------------
403  * First stage of creation for an MPI_Alltoall(v) caller for strided data.
404  *
405  * parameters:
406  *   flags     <-- metadata flags
407  *   comm      <-- associated MPI communicator
408  *---------------------------------------------------------------------------*/
409 
410 static _mpi_all_to_all_caller_t *
_alltoall_caller_create_meta(int flags,MPI_Comm comm)411 _alltoall_caller_create_meta(int        flags,
412                              MPI_Comm   comm)
413 {
414   _mpi_all_to_all_caller_t *dc;
415 
416   /* Allocate structure */
417 
418   BFT_MALLOC(dc, 1, _mpi_all_to_all_caller_t);
419 
420   dc->datatype = CS_DATATYPE_NULL;
421 
422   dc->dest_id_datatype = CS_DATATYPE_NULL;
423 
424   if (flags & CS_ALL_TO_ALL_USE_DEST_ID)
425     dc->dest_id_datatype = CS_LNUM_TYPE;
426 
427   dc->stride = 0;
428 
429   dc->send_size = 0;
430   dc->recv_size = 0;
431 
432   dc->comm = comm;
433 
434   MPI_Comm_size(comm, &(dc->n_ranks));
435 
436   dc->send_buffer = NULL;
437   dc->_send_buffer = NULL;
438 
439   BFT_MALLOC(dc->send_count, dc->n_ranks, int);
440   BFT_MALLOC(dc->recv_count, dc->n_ranks, int);
441   BFT_MALLOC(dc->send_displ, dc->n_ranks + 1, int);
442   BFT_MALLOC(dc->recv_displ, dc->n_ranks + 1, int);
443   dc->recv_count_save = NULL;
444 
445   /* Compute data size and alignment */
446 
447   if (dc->dest_id_datatype == CS_LNUM_TYPE)
448     dc->elt_shift = sizeof(cs_lnum_t);
449   else
450     dc->elt_shift = 0;
451 
452   size_t align_size = sizeof(cs_lnum_t);
453 
454   if (dc->elt_shift % align_size)
455     dc->elt_shift += align_size - (dc->elt_shift % align_size);
456 
457   dc->comp_size = dc->elt_shift;
458 
459   dc->comp_type = MPI_BYTE;
460 
461   /* Return pointer to structure */
462 
463   return dc;
464 }
465 
466 /*----------------------------------------------------------------------------
467  * Destroy a MPI_Alltoall(v) caller.
468  *
469  * parameters:
470  *   dc <-> pointer to pointer to MPI_Alltoall(v) caller structure
471  *---------------------------------------------------------------------------*/
472 
473 static void
_alltoall_caller_destroy(_mpi_all_to_all_caller_t ** dc)474 _alltoall_caller_destroy(_mpi_all_to_all_caller_t **dc)
475 {
476   if (dc != NULL) {
477     _mpi_all_to_all_caller_t *_dc = *dc;
478     if (_dc->comp_type != MPI_BYTE)
479       MPI_Type_free(&(_dc->comp_type));
480     BFT_FREE(_dc->_send_buffer);
481     BFT_FREE(_dc->recv_count_save);
482     BFT_FREE(_dc->recv_displ);
483     BFT_FREE(_dc->send_displ);
484     BFT_FREE(_dc->recv_count);
485     BFT_FREE(_dc->send_count);
486     BFT_FREE(*dc);
487   }
488 }
489 
490 /*----------------------------------------------------------------------------
491  * Update all MPI_Alltoall(v) caller metadata.
492  *
493  * parameters:
494  *   dc        <-> distributor caller
495  *   datatype  <-- associated datatype
496  *   stride    <-- associated stride (0 if indexed)
497  *---------------------------------------------------------------------------*/
498 
499 static void
_alltoall_caller_update_meta(_mpi_all_to_all_caller_t * dc,cs_datatype_t datatype,int stride)500 _alltoall_caller_update_meta(_mpi_all_to_all_caller_t  *dc,
501                              cs_datatype_t              datatype,
502                              int                        stride)
503 {
504   size_t elt_size = cs_datatype_size[datatype]*stride;
505   size_t align_size = sizeof(int);
506 
507   /* Free previous associated datatype if needed */
508 
509   if (dc->comp_type != MPI_BYTE)
510     MPI_Type_free(&(dc->comp_type));
511 
512   /* Now update metadata */
513 
514   dc->datatype = datatype;
515   dc->stride = stride;
516 
517   /* Recompute data size and alignment */
518 
519   if (dc->dest_id_datatype == CS_LNUM_TYPE) {
520     dc->elt_shift = sizeof(cs_lnum_t);
521     if (sizeof(cs_lnum_t) > align_size)
522       align_size = sizeof(cs_lnum_t);
523   }
524   else
525     dc->elt_shift = 0;
526 
527   if (dc->elt_shift % align_size)
528     dc->elt_shift += align_size - (dc->elt_shift % align_size);
529 
530   if (stride > 0) {
531     if (cs_datatype_size[datatype] > align_size)
532       align_size = cs_datatype_size[datatype];
533     dc->comp_size = dc->elt_shift + elt_size;
534   }
535   else
536     dc->comp_size = dc->elt_shift + cs_datatype_size[datatype];
537 
538   if (elt_size % align_size)
539     dc->comp_size += align_size - (elt_size % align_size);
540 
541   /* Update associated MPI datatype */
542 
543   MPI_Type_contiguous(dc->comp_size, MPI_BYTE, &(dc->comp_type));
544   MPI_Type_commit(&(dc->comp_type));
545 }
546 
547 /*----------------------------------------------------------------------------
548  * Save partial metadata before indexed MPI_Alltoall(v) call.
549  *
550  * parameters:
551  *   dc <-> associated MPI_Alltoall(v) caller structure
552  *---------------------------------------------------------------------------*/
553 
554 static  void
_alltoall_caller_save_meta_i(_mpi_all_to_all_caller_t * dc)555 _alltoall_caller_save_meta_i(_mpi_all_to_all_caller_t  *dc)
556 {
557   if (dc->recv_count_save == NULL) {
558     BFT_MALLOC(dc->recv_count_save, dc->n_ranks, int);
559     memcpy(dc->recv_count_save, dc->recv_count, sizeof(int)*(dc->n_ranks));
560   }
561 }
562 
563 /*----------------------------------------------------------------------------
564  * Reset metadate to that used for strided exchanges after indexed
565  * MPI_Alltoall(v) call.
566  *
567  * parameters:
568  *   dc        <-> associated MPI_Alltoall(v) caller structure
569  *   dest_rank <-- destination rank (in direct mode), used here to reorder
570  *                 data in reverse mode.
571  *---------------------------------------------------------------------------*/
572 
573 static  void
_alltoall_caller_reset_meta_i(_mpi_all_to_all_caller_t * dc,const int dest_rank[])574 _alltoall_caller_reset_meta_i(_mpi_all_to_all_caller_t  *dc,
575                               const int                  dest_rank[])
576 {
577   /* Re-count values to send */
578   for (int i = 0; i < dc->n_ranks; i++)
579     dc->send_count[i] = 0;
580   for (size_t j = 0; j < dc->send_size; j++)
581     dc->send_count[dest_rank[j]] += 1;
582 
583   /* Revert to saved values for receive */
584   if (dc->recv_count_save != NULL) {
585     memcpy(dc->recv_count, dc->recv_count_save, sizeof(int)*(dc->n_ranks));
586     BFT_FREE(dc->recv_count_save);
587   }
588 
589   /* Recompute associated displacements */
590   _compute_displ(dc->n_ranks, dc->send_count, dc->send_displ);
591   _compute_displ(dc->n_ranks, dc->recv_count, dc->recv_displ);
592 }
593 
594 /*----------------------------------------------------------------------------
595  * Swap source and destination ranks of all-to-all distributor.
596  *
597  * parameters:
598  *   d <->  pointer to associated all-to-all distributor
599  *----------------------------------------------------------------------------*/
600 
601 static void
_alltoall_caller_swap_src_dest(_mpi_all_to_all_caller_t * dc)602 _alltoall_caller_swap_src_dest(_mpi_all_to_all_caller_t  *dc)
603 {
604   size_t tmp_size[2] = {dc->send_size, dc->recv_size};
605   int *tmp_count = dc->recv_count;
606   int *tmp_displ = dc->recv_displ;
607 
608   dc->send_size = tmp_size[1];
609   dc->recv_size = tmp_size[0];
610 
611   dc->recv_count = dc->send_count;
612   dc->recv_displ = dc->send_displ;
613 
614   dc->send_count = tmp_count;
615   dc->send_displ = tmp_displ;
616 }
617 
618 /*----------------------------------------------------------------------------
619  * Prepare a MPI_Alltoall(v) caller for strided data.
620  *
621  * parameters:
622  *   n_elts           <-- number of elements
623  *   stride           <-- number of values per entity (interlaced)
624  *   datatype         <-- type of data considered
625  *   reverse          <-- true if reverse mode
626  *   data             <-- element values
627  *   dest_id          <-- element destination id, or NULL
628  *   recv_id          <-- element receive id (for reverse mode), or NULL
629  *   dest_rank        <-- destination rank for each element
630  *---------------------------------------------------------------------------*/
631 
632 static void
_alltoall_caller_prepare_s(_mpi_all_to_all_caller_t * dc,size_t n_elts,int stride,cs_datatype_t datatype,bool reverse,const void * data,const cs_lnum_t * dest_id,const cs_lnum_t * recv_id,const int dest_rank[])633 _alltoall_caller_prepare_s(_mpi_all_to_all_caller_t  *dc,
634                            size_t                     n_elts,
635                            int                        stride,
636                            cs_datatype_t              datatype,
637                            bool                       reverse,
638                            const void                *data,
639                            const cs_lnum_t           *dest_id,
640                            const cs_lnum_t           *recv_id,
641                            const int                  dest_rank[])
642 {
643   int i;
644 
645   size_t elt_size = cs_datatype_size[datatype]*stride;
646 
647   unsigned const char *_data = data;
648 
649   assert(data != NULL || (n_elts == 0 || stride == 0));
650 
651   _alltoall_caller_update_meta(dc, datatype, stride);
652 
653   /* Allocate send buffer */
654 
655   if (reverse && recv_id == NULL) {
656     BFT_FREE(dc->_send_buffer);
657     dc->send_buffer = data;
658   }
659   else {
660     BFT_REALLOC(dc->_send_buffer, dc->send_size*dc->comp_size, unsigned char);
661     dc->send_buffer = dc->_send_buffer;
662   }
663 
664   /* Copy metadata */
665 
666   if (dc->dest_id_datatype == CS_LNUM_TYPE) {
667     unsigned const char *_dest_id = (unsigned const char *)dest_id;
668     const size_t id_size = sizeof(cs_lnum_t);
669     for (size_t j = 0; j < n_elts; j++) {
670       size_t w_displ = dc->send_displ[dest_rank[j]]*dc->comp_size;
671       size_t r_displ = j*id_size;
672       dc->send_displ[dest_rank[j]] += 1;
673       for (size_t k = 0; k < id_size; k++)
674         dc->_send_buffer[w_displ + k] = _dest_id[r_displ + k];
675     }
676     /* Reset send_displ */
677     for (i = 0; i < dc->n_ranks; i++)
678       dc->send_displ[i] -= dc->send_count[i];
679   }
680 
681   /* Copy data; in case of reverse send with destination ids, the
682      matching indirection must be applied here. */
683 
684   if (!reverse) {
685     for (size_t j = 0; j < n_elts; j++) {
686       size_t w_displ =   dc->send_displ[dest_rank[j]]*dc->comp_size
687                        + dc->elt_shift;
688       size_t r_displ = j*elt_size;
689       dc->send_displ[dest_rank[j]] += 1;
690       for (size_t k = 0; k < elt_size; k++)
691         dc->_send_buffer[w_displ + k] = _data[r_displ + k];
692     }
693     /* Reset send_displ */
694     for (i = 0; i < dc->n_ranks; i++)
695       dc->send_displ[i] -= dc->send_count[i];
696   }
697   else if (recv_id != NULL) { /* reverse here */
698     for (size_t j = 0; j < dc->send_size; j++) {
699       size_t w_displ = dc->comp_size*j + dc->elt_shift;
700       size_t r_displ = recv_id[j]*elt_size;
701       for (size_t k = 0; k < elt_size; k++)
702         dc->_send_buffer[w_displ + k] = _data[r_displ + k];
703     }
704   }
705   else { /* If revert and no recv_id */
706     assert(dc->send_buffer == data);
707   }
708 }
709 
710 /*----------------------------------------------------------------------------
711  * Prepare a MPI_Alltoall(v) caller for indexed data.
712  *
713  * parameters:
714  *   n_elts           <-- number of elements
715  *   datatype         <-- type of data considered
716  *   reverse          <-- true if reverse mode
717  *   src_index        <-- source index
718  *   dest_index       <-- destination index
719  *   data             <-- element values
720  *   recv_id          <-- element receive id (for reverse mode), or NULL
721  *   dest_rank        <-- destination rank for each element
722  *---------------------------------------------------------------------------*/
723 
724 static void
_alltoall_caller_prepare_i(_mpi_all_to_all_caller_t * dc,size_t n_elts,cs_datatype_t datatype,bool reverse,const cs_lnum_t src_index[],const cs_lnum_t dest_index[],const void * data,const cs_lnum_t * recv_id,const int dest_rank[])725 _alltoall_caller_prepare_i(_mpi_all_to_all_caller_t  *dc,
726                            size_t                     n_elts,
727                            cs_datatype_t              datatype,
728                            bool                       reverse,
729                            const cs_lnum_t            src_index[],
730                            const cs_lnum_t            dest_index[],
731                            const void                *data,
732                            const cs_lnum_t           *recv_id,
733                            const int                  dest_rank[])
734 {
735   int i;
736 
737   size_t elt_size = cs_datatype_size[datatype];
738 
739   unsigned const char *_data = data;
740 
741   _alltoall_caller_update_meta(dc, datatype, 0);
742 
743   /* Allocate send buffer */
744 
745   if (reverse && recv_id == NULL) {
746     BFT_FREE(dc->_send_buffer);
747     dc->send_buffer = data;
748   }
749   else {
750     size_t n_sub_elts = 0;
751     if (reverse == false || recv_id == NULL)
752       n_sub_elts = src_index[n_elts];
753     else {
754       for (size_t j = 0; j < dc->send_size; j++) {
755         cs_lnum_t k = recv_id[j];
756         n_sub_elts += src_index[k+1] - src_index[k];
757       }
758     }
759     BFT_REALLOC(dc->_send_buffer, n_sub_elts*dc->comp_size, unsigned char);
760     dc->send_buffer = dc->_send_buffer;
761   }
762 
763   /* Update send and receive counts and displacements
764      (avoiding additional communication) */
765 
766   for (i = 0; i < dc->n_ranks; i++) {
767     dc->send_count[i] = 0;
768     dc->recv_count[i] = 0;
769   }
770 
771   if (reverse) {
772     if (recv_id != NULL) {
773       for (i = 0; i < dc->n_ranks; i++) {
774         size_t i_s = dc->send_displ[i];
775         size_t i_e = dc->send_displ[i+1];
776         for (size_t j = i_s; j < i_e; j++) {
777           cs_lnum_t k = recv_id[j];
778           cs_lnum_t n_sub_send = src_index[k+1] - src_index[k];
779           dc->send_count[i] += n_sub_send;
780         }
781       }
782     }
783     else {
784       for (i = 0; i < dc->n_ranks; i++) {
785         size_t i_s = dc->send_displ[i];
786         size_t i_e = dc->send_displ[i+1];
787         for (size_t j = i_s; j < i_e; j++) {
788           cs_lnum_t n_sub_send = src_index[j+1] - src_index[j];
789           dc->send_count[i] += n_sub_send;
790         }
791       }
792     }
793   }
794   else { /* !reverse */
795     for (size_t j = 0; j < dc->send_size; j++) {
796       cs_lnum_t n_sub_send = src_index[j+1] - src_index[j];
797       dc->send_count[dest_rank[j]] += n_sub_send;
798     }
799   }
800 
801   if (reverse) {
802     for (size_t j = 0; j < dc->recv_size; j++) {
803       cs_lnum_t n_sub_recv = dest_index[j+1] - dest_index[j];
804       dc->recv_count[dest_rank[j]] += n_sub_recv;
805     }
806   }
807   else {
808     if (recv_id != NULL) {
809       for (i = 0; i < dc->n_ranks; i++) {
810         size_t i_s = dc->recv_displ[i];
811         size_t i_e = dc->recv_displ[i+1];
812         for (size_t j = i_s; j < i_e; j++) {
813           cs_lnum_t k = recv_id[j];
814           cs_lnum_t n_sub_recv = dest_index[k+1] - dest_index[k];
815           dc->recv_count[i] += n_sub_recv;
816         }
817       }
818     }
819     else {
820       for (i = 0; i < dc->n_ranks; i++) {
821         size_t i_s = dc->recv_displ[i];
822         size_t i_e = dc->recv_displ[i+1];
823         for (size_t j = i_s; j < i_e; j++) {
824           cs_lnum_t n_sub_recv = dest_index[j+1] - dest_index[j];
825           dc->recv_count[i] += n_sub_recv;
826         }
827       }
828     }
829   }
830 
831   _compute_displ(dc->n_ranks, dc->send_count, dc->send_displ);
832   _compute_displ(dc->n_ranks, dc->recv_count, dc->recv_displ);
833 
834   /* Copy data; in case of reverse send with destination ids, the
835      matching indirection must be applied here. */
836 
837   if (!reverse) {
838     for (size_t j = 0; j < n_elts; j++) {
839       cs_lnum_t n_sub_send = src_index[j+1] - src_index[j];
840       size_t w_displ = dc->send_displ[dest_rank[j]]*elt_size;
841       size_t r_displ = src_index[j]*elt_size;
842       size_t sub_size = elt_size*n_sub_send;
843       dc->send_displ[dest_rank[j]] += n_sub_send;
844       for (size_t l = 0; l < sub_size; l++)
845         dc->_send_buffer[w_displ + l] = _data[r_displ + l];
846     }
847     /* Reset send_displ */
848     for (i = 0; i < dc->n_ranks; i++)
849       dc->send_displ[i] -= dc->send_count[i];
850   }
851   else if (recv_id != NULL) { /* reverse here */
852     size_t w_displ = 0;
853     for (size_t j = 0; j < dc->send_size; j++) {
854       cs_lnum_t k = recv_id[j];
855       cs_lnum_t n_sub_send = src_index[k+1] - src_index[k];
856       size_t r_displ = src_index[k]*elt_size;
857       size_t sub_size = elt_size*n_sub_send;
858       for (size_t l = 0; l < sub_size; l++)
859         dc->_send_buffer[w_displ + l] = _data[r_displ + l];
860       w_displ += sub_size;
861     }
862 
863     /* Recompute associated displacements */
864     _compute_displ(dc->n_ranks, dc->send_count, dc->send_displ);
865     _compute_displ(dc->n_ranks, dc->recv_count, dc->recv_displ);
866 
867 
868   }
869   else { /* If revert and no recv_id */
870     assert(dc->send_buffer == data);
871   }
872 }
873 
874 /*----------------------------------------------------------------------------
875  * Exchange metadata for an MPI_Alltoall(v) caller.
876  *
877  * Order of data from a same source rank is preserved.
878  *
879  * parameters:
880  *   dc        <-> associated MPI_Alltoall(v) caller structure
881  *   n_elts    <-- number of elements
882  *   dest_rank <-- destination rank for each element
883  *---------------------------------------------------------------------------*/
884 
885 static  void
_alltoall_caller_exchange_meta(_mpi_all_to_all_caller_t * dc,size_t n_elts,const int dest_rank[])886 _alltoall_caller_exchange_meta(_mpi_all_to_all_caller_t  *dc,
887                                size_t                     n_elts,
888                                const int                  dest_rank[])
889 {
890   /* Count values to send and receive */
891 
892   for (int i = 0; i < dc->n_ranks; i++)
893     dc->send_count[i] = 0;
894 
895   for (size_t j = 0; j < n_elts; j++)
896     dc->send_count[dest_rank[j]] += 1;
897 
898   dc->send_size = _compute_displ(dc->n_ranks, dc->send_count, dc->send_displ);
899 
900   /* Exchange counts */
901 
902   cs_timer_t t0 = cs_timer_time();
903 
904   if (_n_trace < _n_trace_max) {
905     /* Time to 1-5 s */
906     _all_to_all_trace[_n_trace*9] = t0.sec*1e5 + t0.nsec/1e4;
907     _all_to_all_trace[_n_trace*9+1] = 0;
908     _all_to_all_trace[_n_trace*9+2] = 0;
909     _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
910     _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
911     _all_to_all_trace[_n_trace*9+5] = 0;
912     _all_to_all_trace[_n_trace*9+6] = 0;
913     _all_to_all_trace[_n_trace*9+7] = 0;
914     _all_to_all_trace[_n_trace*9+8] = 0;
915     _n_trace += 1;
916   }
917 
918   MPI_Alltoall(dc->send_count, 1, MPI_INT,
919                dc->recv_count, 1, MPI_INT,
920                dc->comm);
921 
922   cs_timer_t t1 = cs_timer_time();
923   cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_METADATA,
924                             &t0, &t1);
925 
926   if (_n_trace < _n_trace_max) {
927     /* Time to 1-5 s */
928     _all_to_all_trace[_n_trace*9] = t1.sec*1e5 + t1.nsec/1e4;
929     _all_to_all_trace[_n_trace*9+1] =   _all_to_all_trace[_n_trace*9]
930                                       - _all_to_all_trace[(_n_trace-1)*9];
931     _all_to_all_trace[_n_trace*9+2] = 1;
932     _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
933     _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
934     _all_to_all_trace[_n_trace*9+5] = 0;
935     _all_to_all_trace[_n_trace*9+6] = 0;
936     _all_to_all_trace[_n_trace*9+7] = 0;
937     _all_to_all_trace[_n_trace*9+8] = 0;
938     _n_trace += 1;
939   }
940 
941   _all_to_all_calls[CS_ALL_TO_ALL_TIME_METADATA] += 1;
942 
943   dc->recv_size = _compute_displ(dc->n_ranks, dc->recv_count, dc->recv_displ);
944 }
945 
946 /*----------------------------------------------------------------------------
947  * Exchange strided data with a MPI_Alltoall(v) caller.
948  *
949  * parameters:
950  *   d         <-> pointer to associated all-to-all distributor
951  *   dc        <-> associated MPI_Alltoall(v) caller structure
952  *   reverse   <-- true if reverse mode
953  *   dest_rank <-- destination rank (in direct mode), used here to reorder
954  *                 data in reverse mode.
955  *   dest_data <-> destination data buffer, or NULL
956  *
957  * returns:
958  *   pointer to dest_data, or newly allocated buffer
959  *---------------------------------------------------------------------------*/
960 
961 static  void *
_alltoall_caller_exchange_s(cs_all_to_all_t * d,_mpi_all_to_all_caller_t * dc,bool reverse,const int dest_rank[],void * dest_data)962 _alltoall_caller_exchange_s(cs_all_to_all_t           *d,
963                             _mpi_all_to_all_caller_t  *dc,
964                             bool                       reverse,
965                             const int                  dest_rank[],
966                             void                      *dest_data)
967 {
968   size_t elt_size = cs_datatype_size[dc->datatype]*dc->stride;
969   unsigned char *_dest_data = dest_data, *_recv_data = dest_data;
970 
971   /* Final data buffer */
972 
973   if (_dest_data == NULL && dc->recv_size*elt_size > 0)
974     BFT_MALLOC(_dest_data, dc->recv_size*elt_size, unsigned char);
975 
976   /* Data buffer for MPI exchange (may merge data and metadata) */
977   if (   dc->dest_id_datatype == CS_LNUM_TYPE || d->recv_id != NULL
978       || reverse)
979     BFT_MALLOC(_recv_data, dc->recv_size*dc->comp_size, unsigned char);
980   else
981     _recv_data = _dest_data;
982 
983   cs_timer_t t0 = cs_timer_time();
984 
985   if (_n_trace < _n_trace_max) {
986     int n_r_send = 0, n_r_recv = 0;
987     for (int i = 0; i < dc->n_ranks; i++) {
988       if (dc->send_count[i] > 0)
989         n_r_send += 1;
990       if (dc->recv_count[i] > 0)
991         n_r_recv += 1;
992     }
993     /* Time to 1-5 s */
994     _all_to_all_trace[_n_trace*9] = t0.sec*1e5 + t0.nsec/1e4;
995     _all_to_all_trace[_n_trace*9+1] = 0;
996     _all_to_all_trace[_n_trace*9+2] = 2;
997     _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
998     _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
999     _all_to_all_trace[_n_trace*9+5] = dc->send_size*dc->comp_size;
1000     _all_to_all_trace[_n_trace*9+6] = dc->recv_size*dc->comp_size;
1001     _all_to_all_trace[_n_trace*9+7] = n_r_send;
1002     _all_to_all_trace[_n_trace*9+8] = n_r_recv;
1003     _n_trace += 1;
1004   }
1005 
1006   MPI_Alltoallv(dc->send_buffer, dc->send_count, dc->send_displ, dc->comp_type,
1007                 _recv_data, dc->recv_count, dc->recv_displ, dc->comp_type,
1008                 dc->comm);
1009 
1010   cs_timer_t t1 = cs_timer_time();
1011   cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_EXCHANGE,
1012                             &t0, &t1);
1013   _all_to_all_calls[CS_ALL_TO_ALL_TIME_EXCHANGE] += 1;
1014 
1015   if (_n_trace < _n_trace_max) {
1016     int n_r_send = 0, n_r_recv = 0;
1017     for (int i = 0; i < dc->n_ranks; i++) {
1018       if (dc->send_count[i] > 0)
1019         n_r_send += 1;
1020       if (dc->recv_count[i] > 0)
1021         n_r_recv += 1;
1022     }
1023     /* Time to 1-5 s */
1024     _all_to_all_trace[_n_trace*9] = t1.sec*1e5 + t1.nsec/1e4;
1025     _all_to_all_trace[_n_trace*9+1] =   _all_to_all_trace[_n_trace*9]
1026                                       - _all_to_all_trace[(_n_trace-1)*9];
1027     _all_to_all_trace[_n_trace*9+2] = 0;
1028     _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
1029     _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
1030     _all_to_all_trace[_n_trace*9+5] = dc->send_size*dc->comp_size;
1031     _all_to_all_trace[_n_trace*9+6] = dc->recv_size*dc->comp_size;
1032     _all_to_all_trace[_n_trace*9+7] = n_r_send;
1033     _all_to_all_trace[_n_trace*9+8] = n_r_recv;
1034     _n_trace += 1;
1035   }
1036 
1037   /* dest id datatype only used for first exchange */
1038 
1039   if (dc->dest_id_datatype == CS_LNUM_TYPE) {
1040     assert(d->recv_id == NULL);
1041     BFT_MALLOC(d->recv_id, d->dc->recv_size, cs_lnum_t);
1042     for (size_t i = 0; i < d->dc->recv_size; i++)
1043       d->recv_id[i] = -1;
1044     const unsigned char *sp = _recv_data;
1045     for (size_t i = 0; i < d->dc->recv_size; i++)
1046       memcpy(d->recv_id + i,
1047              sp + d->dc->comp_size*i,
1048              sizeof(cs_lnum_t));
1049     dc->dest_id_datatype = CS_DATATYPE_NULL;
1050     cs_lnum_t dest_id_max = -1;
1051     for (size_t i = 0; i < d->dc->recv_size; i++) {
1052       if (d->recv_id[i] > dest_id_max)
1053         dest_id_max = d->recv_id[i];
1054       d->n_elts_dest = dest_id_max + 1;
1055     }
1056     d->n_elts_dest_e = d->dc->recv_size;
1057   }
1058 
1059   /* Now handle main data buffer (reverse implies reordering data) */
1060 
1061   if (_dest_data != _recv_data) {
1062     const unsigned char *sp = _recv_data + d->dc->elt_shift;
1063     if (d->recv_id != NULL && !reverse) {
1064       for (size_t i = 0; i < d->dc->recv_size; i++) {
1065         size_t w_displ = d->recv_id[i]*elt_size;
1066         size_t r_displ = d->dc->comp_size*i;
1067         for (size_t j = 0; j < elt_size; j++)
1068           _dest_data[w_displ + j] = sp[r_displ + j];
1069       }
1070     }
1071     else if (reverse) {
1072       for (int i = 0; i < dc->n_ranks; i++)
1073         dc->recv_count[i] = 0;
1074       for (size_t i = 0; i < d->dc->recv_size; i++) {
1075         int rank_id = dest_rank[i];
1076         size_t w_displ = i*elt_size;
1077         size_t r_displ = (  dc->recv_displ[rank_id]
1078                           + dc->recv_count[rank_id])*dc->comp_size;
1079         for (size_t j = 0; j < elt_size; j++)
1080           _dest_data[w_displ + j] = sp[r_displ + j];
1081         dc->recv_count[rank_id] += 1;
1082       }
1083     }
1084     else {
1085       for (size_t i = 0; i < d->dc->recv_size; i++) {
1086         size_t w_displ = i*elt_size;
1087         size_t r_displ = dc->comp_size*i;
1088         for (size_t j = 0; j < elt_size; j++)
1089           _dest_data[w_displ + j] = sp[r_displ + j];
1090       }
1091     }
1092     BFT_FREE(_recv_data);
1093   }
1094 
1095   return _dest_data;
1096 }
1097 
1098 /*----------------------------------------------------------------------------
1099  * Exchange indexed data with a MPI_Alltoall(v) caller.
1100  *
1101  * parameters:
1102  *   d          <-> pointer to associated all-to-all distributor
1103  *   dc         <-> associated MPI_Alltoall(v) caller structure
1104  *   reverse    <-- true if reverse mode
1105  *   dest_index <-- destination index
1106  *   dest_rank  <-- destination rank (in direct mode), used here to reorder
1107  *                  data in reverse mode.
1108  *   dest_data  <-> destination data buffer, or NULL
1109  *
1110  * returns:
1111  *   pointer to dest_data, or newly allocated buffer
1112  *---------------------------------------------------------------------------*/
1113 
1114 static  void *
_alltoall_caller_exchange_i(cs_all_to_all_t * d,_mpi_all_to_all_caller_t * dc,bool reverse,const cs_lnum_t dest_index[],const int dest_rank[],void * dest_data)1115 _alltoall_caller_exchange_i(cs_all_to_all_t           *d,
1116                             _mpi_all_to_all_caller_t  *dc,
1117                             bool                       reverse,
1118                             const cs_lnum_t            dest_index[],
1119                             const int                  dest_rank[],
1120                             void                      *dest_data)
1121 {
1122   size_t elt_size = cs_datatype_size[dc->datatype];
1123   unsigned char *_dest_data = dest_data, *_recv_data = dest_data;
1124 
1125   /* Final data buffer */
1126 
1127   size_t n_elts_dest = (reverse) ? d->n_elts_src : d->n_elts_dest;
1128 
1129   size_t _n_dest_sub = dest_index[n_elts_dest];
1130   if (_dest_data == NULL && _n_dest_sub*elt_size > 0)
1131     BFT_MALLOC(_dest_data, _n_dest_sub*elt_size, unsigned char);
1132 
1133   /* Data buffer for MPI exchange (may merge data and metadata) */
1134 
1135   if (d->recv_id != NULL || reverse) {
1136     size_t _n_dest_buf = dc->recv_displ[dc->n_ranks] * dc->comp_size;
1137     BFT_MALLOC(_recv_data, _n_dest_buf, unsigned char);
1138   }
1139   else
1140     _recv_data = _dest_data;
1141 
1142   cs_timer_t t0 = cs_timer_time();
1143 
1144   if (_n_trace < _n_trace_max) {
1145     int n_r_send = 0, n_r_recv = 0;
1146     for (int i = 0; i < dc->n_ranks; i++) {
1147       if (dc->send_count[i] > 0)
1148         n_r_send += 1;
1149       if (dc->recv_count[i] > 0)
1150         n_r_recv += 1;
1151     }
1152     /* Time to 1-5 s */
1153     _all_to_all_trace[_n_trace*9] = t0.sec*1e5 + t0.nsec/1e4;
1154     _all_to_all_trace[_n_trace*9+1] = 0;
1155     _all_to_all_trace[_n_trace*9+2] = 2;
1156     _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
1157     _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
1158     _all_to_all_trace[_n_trace*9+5] = dc->send_size*dc->comp_size;
1159     _all_to_all_trace[_n_trace*9+6] = dc->recv_size*dc->comp_size;
1160     _all_to_all_trace[_n_trace*9+7] = n_r_send;
1161     _all_to_all_trace[_n_trace*9+8] = n_r_recv;
1162     _n_trace += 1;
1163   }
1164 
1165   MPI_Alltoallv(dc->send_buffer, dc->send_count, dc->send_displ, dc->comp_type,
1166                 _recv_data, dc->recv_count, dc->recv_displ, dc->comp_type,
1167                 dc->comm);
1168 
1169   cs_timer_t t1 = cs_timer_time();
1170   cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_EXCHANGE,
1171                             &t0, &t1);
1172   _all_to_all_calls[CS_ALL_TO_ALL_TIME_EXCHANGE] += 1;
1173 
1174   if (_n_trace < _n_trace_max) {
1175     int n_r_send = 0, n_r_recv = 0;
1176     for (int i = 0; i < dc->n_ranks; i++) {
1177       if (dc->send_count[i] > 0)
1178         n_r_send += 1;
1179       if (dc->recv_count[i] > 0)
1180         n_r_recv += 1;
1181     }
1182     /* Time to 1-5 s */
1183     _all_to_all_trace[_n_trace*9] = t1.sec*1e5 + t1.nsec/1e4;
1184     _all_to_all_trace[_n_trace*9+1] =   _all_to_all_trace[_n_trace*9]
1185                                       - _all_to_all_trace[(_n_trace-1)*9];
1186     _all_to_all_trace[_n_trace*9+2] = 0;
1187     _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
1188     _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
1189     _all_to_all_trace[_n_trace*9+5] = dc->send_size*dc->comp_size;
1190     _all_to_all_trace[_n_trace*9+6] = dc->recv_size*dc->comp_size;
1191     _all_to_all_trace[_n_trace*9+7] = n_r_send;
1192     _all_to_all_trace[_n_trace*9+8] = n_r_recv;
1193     _n_trace += 1;
1194   }
1195 
1196   /* Handle main data buffer (reverse implies reordering data) */
1197 
1198   if (_dest_data != _recv_data) {
1199     const unsigned char *sp = _recv_data;
1200     if (d->recv_id != NULL && !reverse) {
1201       size_t r_displ = 0;
1202       for (size_t i = 0; i < dc->recv_size; i++) {
1203         cs_lnum_t k = d->recv_id[i];
1204         cs_lnum_t n_sub_recv = dest_index[k+1] - dest_index[k];
1205         size_t w_displ = dest_index[k]*elt_size;
1206         size_t sub_size = n_sub_recv*elt_size;
1207         for (size_t l = 0; l < sub_size; l++)
1208           _dest_data[w_displ + l] = sp[r_displ + l];
1209         r_displ += sub_size;
1210       }
1211     }
1212     else if (reverse) {
1213       for (int i = 0; i < dc->n_ranks; i++)
1214         dc->recv_count[i] = 0;
1215       for (size_t i = 0; i < d->dc->recv_size; i++) {
1216         int rank_id = dest_rank[i];
1217         size_t w_displ = dest_index[i]*elt_size;
1218         size_t r_displ = (  dc->recv_displ[rank_id]
1219                           + dc->recv_count[rank_id])*elt_size;
1220         cs_lnum_t n_sub_recv = dest_index[i+1] - dest_index[i];
1221         size_t sub_size = n_sub_recv*elt_size;
1222         for (size_t l = 0; l < sub_size; l++)
1223           _dest_data[w_displ + l] = sp[r_displ + l];
1224         dc->recv_count[rank_id] += n_sub_recv;
1225       }
1226     }
1227     else {
1228       assert(_dest_data == _recv_data);
1229     }
1230     BFT_FREE(_recv_data);
1231   }
1232 
1233   return _dest_data;
1234 }
1235 
1236 /*----------------------------------------------------------------------------
1237  * First stage of creation for a hybrid caller for strided data.
1238  *
1239  * parameters:
1240  *   flags     <-- metadata flags
1241  *   comm      <-- associated MPI communicator
1242  *---------------------------------------------------------------------------*/
1243 
1244 static _hybrid_pex_t *
_hybrid_pex_create_meta(int flags,MPI_Comm comm)1245 _hybrid_pex_create_meta(int        flags,
1246                         MPI_Comm   comm)
1247 {
1248   _hybrid_pex_t *hc;
1249 
1250   /* Allocate structure */
1251 
1252   BFT_MALLOC(hc, 1, _hybrid_pex_t);
1253 
1254   hc->rn_send = NULL;
1255   hc->rn_recv = NULL;
1256 
1257   hc->datatype = CS_DATATYPE_NULL;
1258 
1259   hc->dest_id_datatype = CS_DATATYPE_NULL;
1260 
1261   if (flags & CS_ALL_TO_ALL_USE_DEST_ID)
1262     hc->dest_id_datatype = CS_LNUM_TYPE;
1263 
1264   hc->stride = 0;
1265 
1266   hc->send_size = 0;
1267   hc->recv_size = 0;
1268 
1269   hc->comm = comm;
1270 
1271   hc->elt_rank_index = NULL;
1272 
1273   hc->send_buffer = NULL;
1274   hc->_send_buffer = NULL;
1275 
1276   hc->send_count = NULL;
1277   hc->recv_count = NULL;
1278   hc->send_displ = NULL;
1279   hc->recv_displ = NULL;
1280   hc->recv_count_save = NULL;
1281 
1282   /* Compute data size and alignment */
1283 
1284   if (hc->dest_id_datatype == CS_LNUM_TYPE)
1285     hc->elt_shift = sizeof(cs_lnum_t);
1286   else
1287     hc->elt_shift = 0;
1288 
1289   size_t align_size = sizeof(cs_lnum_t);
1290 
1291   if (hc->elt_shift % align_size)
1292     hc->elt_shift += align_size - (hc->elt_shift % align_size);
1293 
1294   hc->comp_size = hc->elt_shift;
1295 
1296   hc->comp_type = MPI_BYTE;
1297 
1298   /* Return pointer to structure */
1299 
1300   return hc;
1301 }
1302 
1303 /*----------------------------------------------------------------------------
1304  * Destroy a hybrid caller.
1305  *
1306  * parameters:
1307  *   hc <-> pointer to pointer to MPI_Alltoall(v) caller structure
1308  *---------------------------------------------------------------------------*/
1309 
1310 static void
_hybrid_pex_destroy(_hybrid_pex_t ** hc)1311 _hybrid_pex_destroy(_hybrid_pex_t **hc)
1312 {
1313   if (hc != NULL) {
1314     _hybrid_pex_t *_hc = *hc;
1315     if (_hc->comp_type != MPI_BYTE)
1316       MPI_Type_free(&(_hc->comp_type));
1317     BFT_FREE(_hc->elt_rank_index);
1318     BFT_FREE(_hc->_send_buffer);
1319     BFT_FREE(_hc->recv_count_save);
1320     BFT_FREE(_hc->recv_displ);
1321     BFT_FREE(_hc->send_displ);
1322     BFT_FREE(_hc->recv_count);
1323     BFT_FREE(_hc->send_count);
1324 
1325     cs_rank_neighbors_destroy(&(_hc->rn_send));
1326     cs_rank_neighbors_destroy(&(_hc->rn_recv));
1327 
1328     BFT_FREE(*hc);
1329   }
1330 }
1331 
1332 /*----------------------------------------------------------------------------
1333  * Update all hybrid caller metadata.
1334  *
1335  * parameters:
1336  *   hc        <-> distributor caller
1337  *   datatype  <-- associated datatype
1338  *   stride    <-- associated stride (0 if indexed)
1339  *---------------------------------------------------------------------------*/
1340 
1341 static void
_hybrid_pex_update_meta(_hybrid_pex_t * hc,cs_datatype_t datatype,int stride)1342 _hybrid_pex_update_meta(_hybrid_pex_t  *hc,
1343                         cs_datatype_t   datatype,
1344                         int             stride)
1345 {
1346   size_t elt_size = cs_datatype_size[datatype]*stride;
1347   size_t align_size = sizeof(int);
1348 
1349   /* Free previous associated datatype if needed */
1350 
1351   if (hc->comp_type != MPI_BYTE)
1352     MPI_Type_free(&(hc->comp_type));
1353 
1354   /* Now update metadata */
1355 
1356   hc->datatype = datatype;
1357   hc->stride = stride;
1358 
1359   /* Recompute data size and alignment */
1360 
1361   if (hc->dest_id_datatype == CS_LNUM_TYPE) {
1362     hc->elt_shift = sizeof(cs_lnum_t);
1363     if (sizeof(cs_lnum_t) > align_size)
1364       align_size = sizeof(cs_lnum_t);
1365   }
1366   else
1367     hc->elt_shift = 0;
1368 
1369   if (hc->elt_shift % align_size)
1370     hc->elt_shift += align_size - (hc->elt_shift % align_size);
1371 
1372   if (stride > 0) {
1373     if (cs_datatype_size[datatype] > align_size)
1374       align_size = cs_datatype_size[datatype];
1375     hc->comp_size = hc->elt_shift + elt_size;
1376   }
1377   else
1378     hc->comp_size = hc->elt_shift + cs_datatype_size[datatype];
1379 
1380   if (elt_size % align_size)
1381     hc->comp_size += align_size - (elt_size % align_size);
1382 
1383   /* Update associated MPI datatype */
1384 
1385   MPI_Type_contiguous(hc->comp_size, MPI_BYTE, &(hc->comp_type));
1386   MPI_Type_commit(&(hc->comp_type));
1387 }
1388 
1389 /*----------------------------------------------------------------------------
1390  * Save partial metadata before indexed hybrid call.
1391  *
1392  * parameters:
1393  *   hc  <-> associated hybrid caller structure
1394  *---------------------------------------------------------------------------*/
1395 
1396 static  void
_hybrid_pex_save_meta_i(_hybrid_pex_t * hc)1397 _hybrid_pex_save_meta_i(_hybrid_pex_t  *hc)
1398 {
1399   if (hc->recv_count_save == NULL) {
1400     int n_n_ranks = hc->rn_recv->size;
1401     BFT_MALLOC(hc->recv_count_save, n_n_ranks, int);
1402     memcpy(hc->recv_count_save, hc->recv_count, sizeof(int)*(n_n_ranks));
1403   }
1404 }
1405 
1406 /*----------------------------------------------------------------------------
1407  * Reset metadate to that used for strided exchanges after indexed
1408  * hybrid call.
1409  *
1410  * parameters:
1411  *   hc              <-> associated hybrid caller structure
1412  *---------------------------------------------------------------------------*/
1413 
1414 static  void
_hybrid_pex_reset_meta_i(_hybrid_pex_t * hc)1415 _hybrid_pex_reset_meta_i(_hybrid_pex_t  *hc)
1416 {
1417   int n_s_ranks = hc->rn_send->size;
1418   int n_r_ranks = hc->rn_recv->size;
1419 
1420   const int  *dest_rank_index = hc->elt_rank_index;
1421 
1422   /* Re-count values to send */
1423   for (int i = 0; i < n_s_ranks; i++)
1424     hc->send_count[i] = 0;
1425   for (size_t j = 0; j < hc->send_size; j++)
1426     hc->send_count[dest_rank_index[j]] += 1;
1427 
1428   /* Revert to saved values for receive */
1429   if (hc->recv_count_save != NULL) {
1430     memcpy(hc->recv_count, hc->recv_count_save, sizeof(int)*(n_r_ranks));
1431     BFT_FREE(hc->recv_count_save);
1432   }
1433 
1434   /* Recompute associated displacements */
1435   _compute_displ(n_s_ranks, hc->send_count, hc->send_displ);
1436   _compute_displ(n_r_ranks, hc->recv_count, hc->recv_displ);
1437 }
1438 
1439 /*----------------------------------------------------------------------------
1440  * Swap source and destination ranks of all-to-all distributor.
1441  *
1442  * parameters:
1443  *   d <->  pointer to associated all-to-all distributor
1444  *----------------------------------------------------------------------------*/
1445 
1446 static void
_hybrid_pex_swap_src_dest(_hybrid_pex_t * hc)1447 _hybrid_pex_swap_src_dest(_hybrid_pex_t  *hc)
1448 {
1449   size_t tmp_size[2] = {hc->send_size, hc->recv_size};
1450   int *tmp_count = hc->recv_count;
1451   int *tmp_displ = hc->recv_displ;
1452   cs_rank_neighbors_t *tmp_rn = hc->rn_send;
1453 
1454   hc->rn_send = hc->rn_recv;
1455   hc->rn_recv = tmp_rn;
1456 
1457   hc->send_size = tmp_size[1];
1458   hc->recv_size = tmp_size[0];
1459 
1460   hc->recv_count = hc->send_count;
1461   hc->recv_displ = hc->send_displ;
1462 
1463   hc->send_count = tmp_count;
1464   hc->send_displ = tmp_displ;
1465 }
1466 
1467 /*----------------------------------------------------------------------------
1468  * Prepare a hybrid caller for strided data.
1469  *
1470  * parameters:
1471  *   n_elts           <-- number of elements
1472  *   stride           <-- number of values per entity (interlaced)
1473  *   datatype         <-- type of data considered
1474  *   reverse          <-- true if reverse mode
1475  *   data             <-- element values
1476  *   dest_id          <-- element destination id, or NULL
1477  *   recv_id          <-- element receive id (for reverse mode), or NULL
1478  *---------------------------------------------------------------------------*/
1479 
1480 static void
_hybrid_pex_prepare_s(_hybrid_pex_t * hc,size_t n_elts,int stride,cs_datatype_t datatype,bool reverse,const void * data,const cs_lnum_t * dest_id,const cs_lnum_t * recv_id)1481 _hybrid_pex_prepare_s(_hybrid_pex_t    *hc,
1482                       size_t            n_elts,
1483                       int               stride,
1484                       cs_datatype_t     datatype,
1485                       bool              reverse,
1486                       const void       *data,
1487                       const cs_lnum_t  *dest_id,
1488                       const cs_lnum_t  *recv_id)
1489 {
1490   int i;
1491 
1492   size_t elt_size = cs_datatype_size[datatype]*stride;
1493 
1494   unsigned const char *_data = data;
1495 
1496   assert(data != NULL || (n_elts == 0 || stride == 0));
1497 
1498   _hybrid_pex_update_meta(hc, datatype, stride);
1499 
1500   /* Allocate send buffer */
1501 
1502   if (reverse && recv_id == NULL) {
1503     BFT_FREE(hc->_send_buffer);
1504     hc->send_buffer = data;
1505   }
1506   else {
1507     BFT_REALLOC(hc->_send_buffer, hc->send_size*hc->comp_size, unsigned char);
1508     hc->send_buffer = hc->_send_buffer;
1509   }
1510 
1511   const int  *dest_rank_index = hc->elt_rank_index;
1512 
1513   /* Copy metadata */
1514 
1515   if (hc->dest_id_datatype == CS_LNUM_TYPE) {
1516     unsigned const char *_dest_id = (unsigned const char *)dest_id;
1517     const size_t id_size = sizeof(cs_lnum_t);
1518     for (size_t j = 0; j < n_elts; j++) {
1519       size_t w_displ = hc->send_displ[dest_rank_index[j]]*hc->comp_size;
1520       size_t r_displ = j*id_size;
1521       hc->send_displ[dest_rank_index[j]] += 1;
1522       for (size_t k = 0; k < id_size; k++)
1523         hc->_send_buffer[w_displ + k] = _dest_id[r_displ + k];
1524     }
1525     /* Reset send_displ */
1526     int n_s_ranks = hc->rn_send->size;
1527     for (i = 0; i < n_s_ranks; i++)
1528       hc->send_displ[i] -= hc->send_count[i];
1529   }
1530 
1531   /* Copy data; in case of reverse send with destination ids, the
1532      matching indirection must be applied here. */
1533 
1534   if (!reverse) {
1535     for (size_t j = 0; j < n_elts; j++) {
1536       size_t w_displ =   hc->send_displ[dest_rank_index[j]]*hc->comp_size
1537                        + hc->elt_shift;
1538       size_t r_displ = j*elt_size;
1539       hc->send_displ[dest_rank_index[j]] += 1;
1540       for (size_t k = 0; k < elt_size; k++)
1541         hc->_send_buffer[w_displ + k] = _data[r_displ + k];
1542     }
1543     /* Reset send_displ */
1544     int n_s_ranks = hc->rn_send->size;
1545     for (i = 0; i < n_s_ranks; i++)
1546       hc->send_displ[i] -= hc->send_count[i];
1547   }
1548   else if (recv_id != NULL) { /* reverse here */
1549     for (size_t j = 0; j < hc->send_size; j++) {
1550       size_t w_displ = hc->comp_size*j + hc->elt_shift;
1551       size_t r_displ = recv_id[j]*elt_size;
1552       for (size_t k = 0; k < elt_size; k++)
1553         hc->_send_buffer[w_displ + k] = _data[r_displ + k];
1554     }
1555   }
1556   else { /* If revert and no recv_id */
1557     assert(hc->send_buffer == data);
1558   }
1559 }
1560 
1561 /*----------------------------------------------------------------------------
1562  * Prepare a hybrid caller for indexed data.
1563  *
1564  * parameters:
1565  *   n_elts           <-- number of elements
1566  *   datatype         <-- type of data considered
1567  *   reverse          <-- true if reverse mode
1568  *   src_index        <-- source index
1569  *   dest_index       <-- destination index
1570  *   data             <-- element values
1571  *   recv_id          <-- element receive id (for reverse mode), or NULL
1572  *---------------------------------------------------------------------------*/
1573 
1574 static void
_hybrid_pex_prepare_i(_hybrid_pex_t * hc,size_t n_elts,cs_datatype_t datatype,bool reverse,const cs_lnum_t src_index[],const cs_lnum_t dest_index[],const void * data,const cs_lnum_t * recv_id)1575 _hybrid_pex_prepare_i(_hybrid_pex_t      *hc,
1576                       size_t              n_elts,
1577                       cs_datatype_t       datatype,
1578                       bool                reverse,
1579                       const cs_lnum_t     src_index[],
1580                       const cs_lnum_t     dest_index[],
1581                       const void         *data,
1582                       const cs_lnum_t    *recv_id)
1583 {
1584   int i;
1585 
1586   size_t elt_size = cs_datatype_size[datatype];
1587 
1588   unsigned const char *_data = data;
1589 
1590   _hybrid_pex_update_meta(hc, datatype, 0);
1591 
1592   /* Allocate send buffer */
1593 
1594   if (reverse && recv_id == NULL) {
1595     BFT_FREE(hc->_send_buffer);
1596     hc->send_buffer = data;
1597   }
1598   else {
1599     size_t n_sub_elts = 0;
1600     if (reverse == false || recv_id == NULL)
1601       n_sub_elts = src_index[n_elts];
1602     else {
1603       for (size_t j = 0; j < hc->send_size; j++) {
1604         cs_lnum_t k = recv_id[j];
1605         n_sub_elts += src_index[k+1] - src_index[k];
1606       }
1607     }
1608     BFT_REALLOC(hc->_send_buffer, n_sub_elts*hc->comp_size, unsigned char);
1609     hc->send_buffer = hc->_send_buffer;
1610   }
1611 
1612   /* Update send and receive counts and displacements
1613      (avoiding additional communication) */
1614 
1615   const int n_s_ranks = hc->rn_send->size;
1616   const int n_r_ranks = hc->rn_recv->size;
1617 
1618   for (i = 0; i < n_s_ranks; i++)
1619     hc->send_count[i] = 0;
1620 
1621   for (i = 0; i < n_r_ranks; i++)
1622     hc->recv_count[i] = 0;
1623 
1624   const int  *dest_rank_index = hc->elt_rank_index;
1625 
1626   if (reverse) {
1627     if (recv_id != NULL) {
1628       for (i = 0; i < n_s_ranks; i++) {
1629         size_t i_s = hc->send_displ[i];
1630         size_t i_e = hc->send_displ[i+1];
1631         for (size_t j = i_s; j < i_e; j++) {
1632           cs_lnum_t k = recv_id[j];
1633           cs_lnum_t n_sub_send = src_index[k+1] - src_index[k];
1634           hc->send_count[i] += n_sub_send;
1635         }
1636       }
1637     }
1638     else {
1639       for (i = 0; i < n_s_ranks; i++) {
1640         size_t i_s = hc->send_displ[i];
1641         size_t i_e = hc->send_displ[i+1];
1642         for (size_t j = i_s; j < i_e; j++) {
1643           cs_lnum_t n_sub_send = src_index[j+1] - src_index[j];
1644           hc->send_count[i] += n_sub_send;
1645         }
1646       }
1647     }
1648   }
1649   else { /* !reverse */
1650     for (size_t j = 0; j < hc->send_size; j++) {
1651       cs_lnum_t n_sub_send = src_index[j+1] - src_index[j];
1652       hc->send_count[dest_rank_index[j]] += n_sub_send;
1653     }
1654   }
1655 
1656   if (reverse) {
1657     for (size_t j = 0; j < hc->recv_size; j++) {
1658       cs_lnum_t n_sub_recv = dest_index[j+1] - dest_index[j];
1659       hc->recv_count[dest_rank_index[j]] += n_sub_recv;
1660     }
1661   }
1662   else {
1663     if (recv_id != NULL) {
1664       for (i = 0; i < n_r_ranks; i++) {
1665         size_t i_s = hc->recv_displ[i];
1666         size_t i_e = hc->recv_displ[i+1];
1667         for (size_t j = i_s; j < i_e; j++) {
1668           cs_lnum_t k = recv_id[j];
1669           cs_lnum_t n_sub_recv = dest_index[k+1] - dest_index[k];
1670           hc->recv_count[i] += n_sub_recv;
1671         }
1672       }
1673     }
1674     else {
1675       for (i = 0; i < n_r_ranks; i++) {
1676         size_t i_s = hc->recv_displ[i];
1677         size_t i_e = hc->recv_displ[i+1];
1678         for (size_t j = i_s; j < i_e; j++) {
1679           cs_lnum_t n_sub_recv = dest_index[j+1] - dest_index[j];
1680           hc->recv_count[i] += n_sub_recv;
1681         }
1682       }
1683     }
1684   }
1685 
1686   _compute_displ(n_s_ranks, hc->send_count, hc->send_displ);
1687   _compute_displ(n_r_ranks, hc->recv_count, hc->recv_displ);
1688 
1689   /* Copy data; in case of reverse send with destination ids, the
1690      matching indirection must be applied here. */
1691 
1692   if (!reverse) {
1693     for (size_t j = 0; j < n_elts; j++) {
1694       cs_lnum_t n_sub_send = src_index[j+1] - src_index[j];
1695       size_t w_displ = hc->send_displ[dest_rank_index[j]]*elt_size;
1696       size_t r_displ = src_index[j]*elt_size;
1697       size_t sub_size = elt_size*n_sub_send;
1698       hc->send_displ[dest_rank_index[j]] += n_sub_send;
1699       for (size_t l = 0; l < sub_size; l++)
1700         hc->_send_buffer[w_displ + l] = _data[r_displ + l];
1701     }
1702     /* Reset send_displ */
1703     for (i = 0; i < n_s_ranks; i++)
1704       hc->send_displ[i] -= hc->send_count[i];
1705   }
1706   else if (recv_id != NULL) { /* reverse here */
1707     size_t w_displ = 0;
1708     for (size_t j = 0; j < hc->send_size; j++) {
1709       cs_lnum_t k = recv_id[j];
1710       cs_lnum_t n_sub_send = src_index[k+1] - src_index[k];
1711       size_t r_displ = src_index[k]*elt_size;
1712       size_t sub_size = elt_size*n_sub_send;
1713       for (size_t l = 0; l < sub_size; l++)
1714         hc->_send_buffer[w_displ + l] = _data[r_displ + l];
1715       w_displ += sub_size;
1716     }
1717 
1718     /* Recompute associated displacements */
1719     _compute_displ(n_s_ranks, hc->send_count, hc->send_displ);
1720     _compute_displ(n_r_ranks, hc->recv_count, hc->recv_displ);
1721 
1722 
1723   }
1724   else { /* If revert and no recv_id */
1725     assert(hc->send_buffer == data);
1726   }
1727 }
1728 
1729 /*----------------------------------------------------------------------------
1730  * Exchange metadata for an hybrid caller.
1731  *
1732  * Order of data from a same source rank is preserved.
1733  *
1734  * parameters:
1735  *   hc        <-> associated hybrid caller structure
1736  *   n_elts    <-- number of elements
1737  *   dest_rank <-- destination rank for each element
1738  *---------------------------------------------------------------------------*/
1739 
1740 static  void
_hybrid_pex_exchange_meta(_hybrid_pex_t * hc,size_t n_elts,const int dest_rank[])1741 _hybrid_pex_exchange_meta(_hybrid_pex_t  *hc,
1742                           size_t          n_elts,
1743                           const int       dest_rank[])
1744 {
1745   if (hc->rn_send == NULL) {
1746     hc->rn_send = cs_rank_neighbors_create(n_elts, dest_rank);
1747 
1748     BFT_MALLOC(hc->elt_rank_index, n_elts, int);
1749     BFT_MALLOC(hc->send_count, hc->rn_send->size, int);
1750     BFT_MALLOC(hc->send_displ, hc->rn_send->size + 1, int);
1751 
1752     cs_rank_neighbors_to_index(hc->rn_send,
1753                                n_elts,
1754                                dest_rank,
1755                                hc->elt_rank_index);
1756 
1757   }
1758 
1759   /* Count values to send and receive */
1760 
1761   cs_rank_neighbors_count(hc->rn_send,
1762                           n_elts,
1763                           hc->elt_rank_index,
1764                           hc->send_count);
1765 
1766   hc->send_size = _compute_displ(hc->rn_send->size,
1767                                  hc->send_count,
1768                                  hc->send_displ);
1769 
1770   /* Exchange counts */
1771 
1772   cs_timer_t t0 = cs_timer_time();
1773 
1774   if (_n_trace < _n_trace_max) {
1775     /* Time to 1-5 s */
1776     _all_to_all_trace[_n_trace*9] = t0.sec*1e5 + t0.nsec/1e4;
1777     _all_to_all_trace[_n_trace*9+1] = 0;
1778     _all_to_all_trace[_n_trace*9+2] = 0;
1779     _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
1780     _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
1781     _all_to_all_trace[_n_trace*9+5] = 0;
1782     _all_to_all_trace[_n_trace*9+6] = 0;
1783     _all_to_all_trace[_n_trace*9+7] = 0;
1784     _all_to_all_trace[_n_trace*9+8] = 0;
1785     _n_trace += 1;
1786   }
1787 
1788   assert(hc->rn_recv == NULL);
1789 
1790   if (hc->rn_recv != NULL) {
1791     cs_rank_neighbors_destroy(&(hc->rn_recv));
1792     BFT_FREE(hc->recv_count);
1793     BFT_FREE(hc->recv_displ);
1794   }
1795 
1796   cs_rank_neighbors_sync_count_m(hc->rn_send,
1797                                  &(hc->rn_recv),
1798                                  hc->send_count,
1799                                  &(hc->recv_count),
1800                                  _hybrid_meta_type,
1801                                  hc->comm);
1802 
1803   cs_timer_t t1 = cs_timer_time();
1804   cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_METADATA,
1805                             &t0, &t1);
1806 
1807   if (_n_trace < _n_trace_max) {
1808     /* Time to 1-5 s */
1809     _all_to_all_trace[_n_trace*9] = t1.sec*1e5 + t1.nsec/1e4;
1810     _all_to_all_trace[_n_trace*9+1] =   _all_to_all_trace[_n_trace*9]
1811                                       - _all_to_all_trace[(_n_trace-1)*9];
1812     _all_to_all_trace[_n_trace*9+2] = 1;
1813     _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
1814     _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
1815     _all_to_all_trace[_n_trace*9+5] = 0;
1816     _all_to_all_trace[_n_trace*9+6] = 0;
1817     _all_to_all_trace[_n_trace*9+7] = 0;
1818     _all_to_all_trace[_n_trace*9+8] = 0;
1819     _n_trace += 1;
1820   }
1821 
1822   _all_to_all_calls[CS_ALL_TO_ALL_TIME_METADATA] += 1;
1823 
1824   BFT_MALLOC(hc->recv_displ, hc->rn_recv->size + 1, int);
1825 
1826   hc->recv_size = _compute_displ(hc->rn_recv->size,
1827                                  hc->recv_count,
1828                                  hc->recv_displ);
1829 }
1830 
1831 /*----------------------------------------------------------------------------*/
1832 /*!
1833  * \brief Exchange indexed data with a hybrid caller.
1834  *
1835  * \param[in]    hc          associated hybrid caller structure
1836  * \param[in]    sendbuf     starting address of send buffer.
1837  * \param[out]   recvbuf     starting address of receive buffer.
1838  */
1839 /*----------------------------------------------------------------------------*/
1840 
1841 static  void
_hybrid_alltoallv(_hybrid_pex_t * hc,const void * sendbuf,void * recvbuf)1842 _hybrid_alltoallv(_hybrid_pex_t   *hc,
1843                   const void      *sendbuf,
1844                   void            *recvbuf)
1845 {
1846   /* Currently available: MPI_Alltoallv */
1847 
1848   if (true) {
1849     int n_ranks;
1850     MPI_Comm_size(hc->comm, &n_ranks);
1851 
1852     int  *send_count, *send_displ, *recv_count, *recv_displ;
1853 
1854     BFT_MALLOC(send_count, n_ranks, int);
1855     BFT_MALLOC(send_displ, n_ranks+1, int);
1856     BFT_MALLOC(recv_count, n_ranks, int);
1857     BFT_MALLOC(recv_displ, n_ranks+1, int);
1858 
1859     const int n_s_ranks = hc->rn_send->size;
1860     const int n_r_ranks = hc->rn_recv->size;
1861     const int *s_rank = hc->rn_send->rank;
1862     const int *r_rank = hc->rn_recv->rank;
1863 
1864     for (int i = 0; i < n_ranks; i++) {
1865       send_count[i] = 0;
1866       recv_count[i] = 0;
1867     }
1868     for (int i = 0; i < n_s_ranks; i++)
1869       send_count[s_rank[i]] = hc->send_displ[i+1] - hc->send_displ[i];
1870     for (int i = 0; i < n_r_ranks; i++)
1871       recv_count[r_rank[i]] = hc->recv_displ[i+1] - hc->recv_displ[i];
1872 
1873     _compute_displ(n_ranks, send_count, send_displ);
1874     _compute_displ(n_ranks, recv_count, recv_displ);
1875 
1876     MPI_Alltoallv(sendbuf, send_count, send_displ, hc->comp_type,
1877                   recvbuf, recv_count, recv_displ, hc->comp_type,
1878                   hc->comm);
1879 
1880     BFT_FREE(recv_displ);
1881     BFT_FREE(recv_count);
1882     BFT_FREE(send_displ);
1883     BFT_FREE(send_count);
1884   }
1885 }
1886 
1887 /*----------------------------------------------------------------------------
1888  * Exchange strided data with a hybrid caller.
1889  *
1890  * parameters:
1891  *   d               <-> pointer to associated all-to-all distributor
1892  *   hc              <-> associated hybrid caller structure
1893  *   reverse         <-- true if reverse mode
1894  *   dest_data       <-> destination data buffer, or NULL
1895  *
1896  * returns:
1897  *   pointer to dest_data, or newly allocated buffer
1898  *---------------------------------------------------------------------------*/
1899 
1900 static  void *
_hybrid_pex_exchange_s(cs_all_to_all_t * d,_hybrid_pex_t * hc,bool reverse,void * dest_data)1901 _hybrid_pex_exchange_s(cs_all_to_all_t  *d,
1902                        _hybrid_pex_t    *hc,
1903                        bool              reverse,
1904                        void             *dest_data)
1905 {
1906   size_t elt_size = cs_datatype_size[hc->datatype]*hc->stride;
1907   unsigned char *_dest_data = dest_data, *_recv_data = dest_data;
1908 
1909   /* Final data buffer */
1910 
1911   if (_dest_data == NULL && hc->recv_size*elt_size > 0)
1912     BFT_MALLOC(_dest_data, hc->recv_size*elt_size, unsigned char);
1913 
1914   /* Data buffer for MPI exchange (may merge data and metadata) */
1915   if (   hc->dest_id_datatype == CS_LNUM_TYPE || d->recv_id != NULL
1916       || reverse)
1917     BFT_MALLOC(_recv_data, hc->recv_size*hc->comp_size, unsigned char);
1918   else
1919     _recv_data = _dest_data;
1920 
1921   cs_timer_t t0 = cs_timer_time();
1922 
1923   if (_n_trace < _n_trace_max) {
1924     /* Time to 1-5 s */
1925     _all_to_all_trace[_n_trace*9] = t0.sec*1e5 + t0.nsec/1e4;
1926     _all_to_all_trace[_n_trace*9+1] = 0;
1927     _all_to_all_trace[_n_trace*9+2] = 2;
1928     _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
1929     _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
1930     _all_to_all_trace[_n_trace*9+5] = hc->send_size*hc->comp_size;
1931     _all_to_all_trace[_n_trace*9+6] = hc->recv_size*hc->comp_size;
1932     _all_to_all_trace[_n_trace*9+7] = hc->rn_send->size;
1933     _all_to_all_trace[_n_trace*9+8] = hc->rn_recv->size;
1934     _n_trace += 1;
1935   }
1936 
1937   _hybrid_alltoallv(hc, hc->send_buffer, _recv_data);
1938 
1939   cs_timer_t t1 = cs_timer_time();
1940   cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_EXCHANGE,
1941                             &t0, &t1);
1942   _all_to_all_calls[CS_ALL_TO_ALL_TIME_EXCHANGE] += 1;
1943 
1944   if (_n_trace < _n_trace_max) {
1945     /* Time to 1-5 s */
1946     _all_to_all_trace[_n_trace*9] = t1.sec*1e5 + t1.nsec/1e4;
1947     _all_to_all_trace[_n_trace*9+1] =   _all_to_all_trace[_n_trace*9]
1948                                       - _all_to_all_trace[(_n_trace-1)*9];
1949     _all_to_all_trace[_n_trace*9+2] = 0;
1950     _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
1951     _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
1952     _all_to_all_trace[_n_trace*9+5] = hc->send_size*hc->comp_size;
1953     _all_to_all_trace[_n_trace*9+6] = hc->recv_size*hc->comp_size;
1954     _all_to_all_trace[_n_trace*9+7] = hc->rn_send->size;
1955     _all_to_all_trace[_n_trace*9+8] = hc->rn_recv->size;
1956     _n_trace += 1;
1957   }
1958 
1959   /* dest id datatype only used for first exchange */
1960 
1961   if (hc->dest_id_datatype == CS_LNUM_TYPE) {
1962     assert(d->recv_id == NULL);
1963     BFT_MALLOC(d->recv_id, d->hc->recv_size, cs_lnum_t);
1964     for (size_t i = 0; i < d->hc->recv_size; i++)
1965       d->recv_id[i] = -1;
1966     const unsigned char *sp = _recv_data;
1967     for (size_t i = 0; i < d->hc->recv_size; i++)
1968       memcpy(d->recv_id + i,
1969              sp + d->hc->comp_size*i,
1970              sizeof(cs_lnum_t));
1971     hc->dest_id_datatype = CS_DATATYPE_NULL;
1972     cs_lnum_t dest_id_max = -1;
1973     for (size_t i = 0; i < d->hc->recv_size; i++) {
1974       if (d->recv_id[i] > dest_id_max)
1975         dest_id_max = d->recv_id[i];
1976       d->n_elts_dest = dest_id_max + 1;
1977     }
1978     d->n_elts_dest_e = d->hc->recv_size;
1979   }
1980 
1981   /* Now handle main data buffer (reverse implies reordering data) */
1982 
1983   if (_dest_data != _recv_data) {
1984     const unsigned char *sp = _recv_data + d->hc->elt_shift;
1985     if (d->recv_id != NULL && !reverse) {
1986       for (size_t i = 0; i < d->hc->recv_size; i++) {
1987         size_t w_displ = d->recv_id[i]*elt_size;
1988         size_t r_displ = d->hc->comp_size*i;
1989         for (size_t j = 0; j < elt_size; j++)
1990           _dest_data[w_displ + j] = sp[r_displ + j];
1991       }
1992     }
1993     else if (reverse) {
1994       const int n_r_ranks = hc->rn_recv->size;
1995       const int *dest_rank_index = hc->elt_rank_index;
1996       for (int i = 0; i < n_r_ranks; i++)
1997         hc->recv_count[i] = 0;
1998       for (size_t i = 0; i < d->hc->recv_size; i++) {
1999         int rank_idx = dest_rank_index[i];
2000         size_t w_displ = i*elt_size;
2001         size_t r_displ = (  hc->recv_displ[rank_idx]
2002                           + hc->recv_count[rank_idx])*hc->comp_size;
2003         for (size_t j = 0; j < elt_size; j++)
2004           _dest_data[w_displ + j] = sp[r_displ + j];
2005         hc->recv_count[rank_idx] += 1;
2006       }
2007     }
2008     else {
2009       for (size_t i = 0; i < d->hc->recv_size; i++) {
2010         size_t w_displ = i*elt_size;
2011         size_t r_displ = hc->comp_size*i;
2012         for (size_t j = 0; j < elt_size; j++)
2013           _dest_data[w_displ + j] = sp[r_displ + j];
2014       }
2015     }
2016     BFT_FREE(_recv_data);
2017   }
2018 
2019   return _dest_data;
2020 }
2021 
2022 /*----------------------------------------------------------------------------
2023  * Exchange indexed data with a hybrid caller.
2024  *
2025  * parameters:
2026  *   d                <-> pointer to associated all-to-all distributor
2027  *   hc               <-> associated hybrid caller structure
2028  *   reverse          <-- true if reverse mode
2029  *   dest_index       <-- destination index
2030  *   dest_data        <-> destination data buffer, or NULL
2031  *
2032  * returns:
2033  *   pointer to dest_data, or newly allocated buffer
2034  *---------------------------------------------------------------------------*/
2035 
2036 static  void *
_hybrid_pex_exchange_i(cs_all_to_all_t * d,_hybrid_pex_t * hc,bool reverse,const cs_lnum_t dest_index[],void * dest_data)2037 _hybrid_pex_exchange_i(cs_all_to_all_t    *d,
2038                        _hybrid_pex_t      *hc,
2039                        bool                reverse,
2040                        const cs_lnum_t     dest_index[],
2041                        void               *dest_data)
2042 {
2043   size_t elt_size = cs_datatype_size[hc->datatype];
2044   unsigned char *_dest_data = dest_data, *_recv_data = dest_data;
2045 
2046   /* Final data buffer */
2047 
2048   size_t n_elts_dest = (reverse) ? d->n_elts_src : d->n_elts_dest;
2049 
2050   size_t _n_dest_sub = dest_index[n_elts_dest];
2051   if (_dest_data == NULL && _n_dest_sub*elt_size > 0)
2052     BFT_MALLOC(_dest_data, _n_dest_sub*elt_size, unsigned char);
2053 
2054   /* Data buffer for MPI exchange (may merge data and metadata) */
2055 
2056   if (d->recv_id != NULL || reverse) {
2057     int n_r_ranks = hc->rn_recv->size;
2058     size_t _n_dest_buf = hc->recv_displ[n_r_ranks] * hc->comp_size;
2059     BFT_MALLOC(_recv_data, _n_dest_buf, unsigned char);
2060   }
2061   else
2062     _recv_data = _dest_data;
2063 
2064   cs_timer_t t0 = cs_timer_time();
2065 
2066   if (_n_trace < _n_trace_max) {
2067     /* Time to 1-5 s */
2068     _all_to_all_trace[_n_trace*9] = t0.sec*1e5 + t0.nsec/1e4;
2069     _all_to_all_trace[_n_trace*9+1] = 0;
2070     _all_to_all_trace[_n_trace*9+2] = 2;
2071     _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
2072     _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
2073     _all_to_all_trace[_n_trace*9+5] = hc->send_size*hc->comp_size;
2074     _all_to_all_trace[_n_trace*9+6] = hc->recv_size*hc->comp_size;
2075     _all_to_all_trace[_n_trace*9+7] = hc->rn_send->size;
2076     _all_to_all_trace[_n_trace*9+8] = hc->rn_recv->size;
2077     _n_trace += 1;
2078   }
2079 
2080   _hybrid_alltoallv(hc, hc->send_buffer, _recv_data);
2081 
2082   cs_timer_t t1 = cs_timer_time();
2083   cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_EXCHANGE,
2084                             &t0, &t1);
2085   _all_to_all_calls[CS_ALL_TO_ALL_TIME_EXCHANGE] += 1;
2086 
2087   if (_n_trace < _n_trace_max) {
2088     /* Time to 1-5 s */
2089     _all_to_all_trace[_n_trace*9] = t1.sec*1e5 + t1.nsec/1e4;
2090     _all_to_all_trace[_n_trace*9+1] =   _all_to_all_trace[_n_trace*9]
2091                                       - _all_to_all_trace[(_n_trace-1)*9];
2092     _all_to_all_trace[_n_trace*9+2] = 0;
2093     _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
2094     _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
2095     _all_to_all_trace[_n_trace*9+5] = hc->send_size*hc->comp_size;
2096     _all_to_all_trace[_n_trace*9+6] = hc->recv_size*hc->comp_size;
2097     _all_to_all_trace[_n_trace*9+7] = hc->rn_send->size;
2098     _all_to_all_trace[_n_trace*9+8] = hc->rn_recv->size;
2099     _n_trace += 1;
2100   }
2101 
2102   /* Handle main data buffer (reverse implies reordering data) */
2103 
2104   if (_dest_data != _recv_data) {
2105     const unsigned char *sp = _recv_data;
2106     if (d->recv_id != NULL && !reverse) {
2107       size_t r_displ = 0;
2108       for (size_t i = 0; i < hc->recv_size; i++) {
2109         cs_lnum_t k = d->recv_id[i];
2110         cs_lnum_t n_sub_recv = dest_index[k+1] - dest_index[k];
2111         size_t w_displ = dest_index[k]*elt_size;
2112         size_t sub_size = n_sub_recv*elt_size;
2113         for (size_t l = 0; l < sub_size; l++)
2114           _dest_data[w_displ + l] = sp[r_displ + l];
2115         r_displ += sub_size;
2116       }
2117     }
2118     else if (reverse) {
2119       const int n_r_ranks = hc->rn_recv->size;
2120       const int *dest_rank_index = hc->elt_rank_index;
2121       for (int i = 0; i < n_r_ranks; i++)
2122         hc->recv_count[i] = 0;
2123       for (size_t i = 0; i < d->hc->recv_size; i++) {
2124         int rank_idx = dest_rank_index[i];
2125         size_t w_displ = dest_index[i]*elt_size;
2126         size_t r_displ = (  hc->recv_displ[rank_idx]
2127                           + hc->recv_count[rank_idx])*elt_size;
2128         cs_lnum_t n_sub_recv = dest_index[i+1] - dest_index[i];
2129         size_t sub_size = n_sub_recv*elt_size;
2130         for (size_t l = 0; l < sub_size; l++)
2131           _dest_data[w_displ + l] = sp[r_displ + l];
2132         hc->recv_count[rank_idx] += n_sub_recv;
2133       }
2134     }
2135     else {
2136       assert(_dest_data == _recv_data);
2137     }
2138     BFT_FREE(_recv_data);
2139   }
2140 
2141   return _dest_data;
2142 }
2143 
2144 /*----------------------------------------------------------------------------*/
2145 /*!
2146  * \brief Compute receive id by ordering based on source rank.
2147  *
2148  * The caller is responsible for freeing the returned array.
2149  *
2150  * As communication schemes (such as Crystal Router) should lead to
2151  * elements being grouped by rank, this is used here to optimize sorting.
2152  * Sorting should also be stable as a result.
2153  *
2154  * \param[in]  d         pointer to all-to-all distributor
2155  * \param[in]  src_rank  associated source rank (size: d->n_elts_dest)
2156  */
2157 /*----------------------------------------------------------------------------*/
2158 
2159 static void
_recv_id_by_src_rank_order(cs_all_to_all_t * d,const int src_rank[])2160 _recv_id_by_src_rank_order(cs_all_to_all_t  *d,
2161                            const int         src_rank[])
2162 {
2163   assert(d->recv_id == NULL);
2164 
2165   const cs_lnum_t n_elts = d->n_elts_dest;
2166 
2167   BFT_MALLOC(d->recv_id, n_elts, cs_lnum_t);
2168 
2169   cs_lnum_t n_rs = 0;
2170   cs_lnum_2_t *rs;
2171 
2172   BFT_MALLOC(rs, n_elts+1, cs_lnum_2_t);
2173 
2174   /* Extract source rank and build rank ranges. */
2175 
2176   int prev_rank = -1;
2177 
2178   for (cs_lnum_t i = 0; i < n_elts; i++) {
2179     if (src_rank[i] != prev_rank) {
2180       prev_rank = src_rank[i];
2181       rs[n_rs][0] = src_rank[i];
2182       rs[n_rs][1] = i;
2183       n_rs++;
2184     }
2185   }
2186 
2187   /* Extend array for future range tests. */
2188 
2189   rs[n_rs][0] = -1;
2190   rs[n_rs][1] = n_elts;
2191 
2192   /* Order ranges. */
2193 
2194   cs_lnum_t *rs_order;
2195   BFT_MALLOC(rs_order, n_rs, cs_lnum_t);
2196 
2197   cs_order_lnum_allocated_s(NULL,
2198                             (const cs_lnum_t  *)rs,
2199                             2,
2200                             rs_order,
2201                             n_rs);
2202 
2203   cs_lnum_t k = 0;
2204   for (cs_lnum_t i = 0; i < n_rs; i++) {
2205     cs_lnum_t j = rs_order[i];
2206     cs_lnum_t s_id = rs[j][1];
2207     cs_lnum_t e_id = rs[j+1][1];
2208     for (cs_lnum_t l = s_id; l < e_id; l++) {
2209       d->recv_id[l] = k;
2210       k++;
2211     }
2212   }
2213   assert(k == n_elts);
2214 
2215   BFT_FREE(rs_order);
2216   BFT_FREE(rs);
2217 }
2218 
2219 /*----------------------------------------------------------------------------*/
2220 /*!
2221  * \brief Determine Crystal Router flags for a distributor.
2222  *
2223  * Some flags are only required for the first exchange.
2224  *
2225  * \param[in]  d        pointer to associated all-to-all distributor
2226  * \param[in]  reverse  if true, communicate in reverse direction
2227  *
2228  * \return flags required for first Crystal Router call
2229  */
2230 /*----------------------------------------------------------------------------*/
2231 
2232 static int
_cr_flags(cs_all_to_all_t * d,bool reverse)2233 _cr_flags(cs_all_to_all_t  *d,
2234           bool              reverse)
2235 {
2236   cs_assert(d != NULL);
2237 
2238   int cr_flags = 0;
2239 
2240   if (!reverse) {
2241     if (d->n_elts_dest < 0) {
2242       if (d->flags & CS_ALL_TO_ALL_USE_DEST_ID)
2243         cr_flags = cr_flags | CS_CRYSTAL_ROUTER_USE_DEST_ID;
2244       if (! (d->flags & CS_ALL_TO_ALL_NO_REVERSE)) {
2245         cr_flags = cr_flags | CS_CRYSTAL_ROUTER_ADD_SRC_ID;
2246         cr_flags = cr_flags | CS_CRYSTAL_ROUTER_ADD_SRC_RANK;
2247       }
2248       if (   d->flags & CS_ALL_TO_ALL_NEED_SRC_RANK
2249           || d->flags & CS_ALL_TO_ALL_ORDER_BY_SRC_RANK)
2250         cr_flags = cr_flags | CS_CRYSTAL_ROUTER_ADD_SRC_RANK;
2251     }
2252   }
2253   else
2254     cr_flags = cr_flags | CS_CRYSTAL_ROUTER_USE_DEST_ID;
2255 
2256   return cr_flags;
2257 }
2258 
2259 /*----------------------------------------------------------------------------*/
2260 /*!
2261  * \brief Compute destination id by ordering based on source rank after
2262  *        the first exchange with a Crystal Router.
2263  *
2264  * \param[in]  d   pointer to associated all-to-all distributor
2265  * \param[in]  cr  pointer to associated Crystal Router
2266  */
2267 /*----------------------------------------------------------------------------*/
2268 
2269 static void
_cr_recv_id_by_src_rank(cs_all_to_all_t * d,cs_crystal_router_t * cr)2270 _cr_recv_id_by_src_rank(cs_all_to_all_t      *d,
2271                         cs_crystal_router_t  *cr)
2272 {
2273   cs_assert(d != NULL);
2274 
2275   assert(d->recv_id == NULL);
2276 
2277   int *src_rank;
2278   BFT_MALLOC(src_rank, d->n_elts_dest_e, int);
2279 
2280   cs_crystal_router_get_data(cr,
2281                              &src_rank,
2282                              NULL,
2283                              NULL,
2284                              NULL, /* dest_index */
2285                              NULL);
2286 
2287   if (d->n_elts_dest < 0)
2288     d->n_elts_dest = cs_crystal_router_n_elts(cr);
2289 
2290   _recv_id_by_src_rank_order(d, src_rank);
2291 
2292   BFT_FREE(src_rank);
2293 }
2294 
2295 /*----------------------------------------------------------------------------*/
2296 /*!
2297  * \brief Indicate if source rank info must be maintained.
2298  *
2299  * The source rank is needed if either the CS_CRYSTAL_ROUTER_ADD_SRC_RANK
2300  * flag is set, or CS_ALL_TO_ALL_NO_REVERSE is not set.
2301  *
2302  * It is recommended that if the CS_ALL_TO_ALL_ORDER_BY_SRC_RANK flag is set,
2303  * for a communication scheme which does not ensure this by default
2304  * (such as for a Crystal Router), the source rank has been converted
2305  * to a recv_id, using \ref _recv_id_by_src_rank_order, so if this array
2306  * is available, the CS_ALL_TO_ALL_ORDER_BY_SRC_RANK flag does imply
2307  * by itself that source rank info is needed anymore.
2308  *
2309  * \param[in]  d   pointer to associated all-to-all distributor
2310  *
2311  * \return true if source rank info is needed, false otherwise
2312  */
2313 /*----------------------------------------------------------------------------*/
2314 
2315 static bool
_is_src_rank_info_needed(cs_all_to_all_t * d)2316 _is_src_rank_info_needed(cs_all_to_all_t  *d)
2317 {
2318   cs_assert(d != NULL);
2319 
2320   bool retval = false;
2321 
2322   if (d->flags & CS_ALL_TO_ALL_NO_REVERSE) {
2323     if (d->flags & CS_ALL_TO_ALL_NEED_SRC_RANK)
2324       retval = true;
2325     else if (d->flags & CS_ALL_TO_ALL_ORDER_BY_SRC_RANK) {
2326       if (d->recv_id == NULL && d->n_elts_dest > 0)
2327         retval = true;
2328     }
2329   }
2330   else
2331     retval = true;
2332 
2333   return retval;
2334 }
2335 
2336 #endif /* defined(HAVE_MPI) */
2337 
2338 /*! (DOXYGEN_SHOULD_SKIP_THIS) \endcond */
2339 
2340 /*============================================================================
2341  * Public function definitions
2342  *============================================================================*/
2343 
2344 #if defined(HAVE_MPI)
2345 
2346 /*----------------------------------------------------------------------------*/
2347 /*!
2348  * \brief Create an all-to-all distributor based on destination rank.
2349  *
2350  * This is a collective operation on communicator comm.
2351  *
2352  * If the flags bit mask matches \ref CS_ALL_TO_ALL_USE_DEST_ID,
2353  * data exchanged will be ordered by the array passed to the
2354  * \c dest_id argument. For \c n total values received on a rank
2355  * (as given by \ref cs_all_to_all_n_elts_dest), those destination ids
2356  * must be in the range [0, \c n[.
2357  *
2358  * If the flags bit mask matches \ref CS_ALL_TO_ALL_ORDER_BY_SRC_RANK,
2359  * data exchanged will be ordered by source rank (this is incompatible
2360  * with \ref CS_ALL_TO_ALL_USE_DEST_ID.
2361  *
2362  * \attention
2363  * The \c dest_rank and \c dest_id arrays are only referenced by
2364  * the distributor, not copied, and must remain available throughout
2365  * the distributor's lifetime. They may be fully transferred to
2366  * the structure if not needed elsewhere using the
2367  * \ref cs_all_to_all_transfer_dest_rank and
2368  * \ref cs_all_to_all_transfer_dest_id functions.
2369  *
2370  * \param[in]  n_elts       number of elements
2371  * \param[in]  flags        sum of ordering and metadata flag constants
2372  * \param[in]  dest_id      element destination id (required if flags
2373  *                          contain \ref CS_ALL_TO_ALL_USE_DEST_ID),
2374  *                          or NULL
2375  * \param[in]  dest_rank    destination rank for each element
2376  * \param[in]  comm         associated MPI communicator
2377  *
2378  * \return  pointer to new all-to-all distributor
2379  */
2380 /*----------------------------------------------------------------------------*/
2381 
2382 cs_all_to_all_t *
cs_all_to_all_create(size_t n_elts,int flags,const cs_lnum_t * dest_id,const int dest_rank[],MPI_Comm comm)2383 cs_all_to_all_create(size_t            n_elts,
2384                      int               flags,
2385                      const cs_lnum_t  *dest_id,
2386                      const int         dest_rank[],
2387                      MPI_Comm          comm)
2388 {
2389   cs_timer_t t0, t1;
2390 
2391   t0 = cs_timer_time();
2392 
2393   cs_all_to_all_t *d = _all_to_all_create_base(n_elts,
2394                                                flags,
2395                                                comm);
2396 
2397   d->dest_id = dest_id;
2398   d->dest_rank = dest_rank;
2399 
2400   /* Create substructures based on info available at this stage
2401      (for Crystal Router, delay creation as data is not passed yet) */
2402 
2403   if (d->type == CS_ALL_TO_ALL_MPI_DEFAULT)
2404     d->dc = _alltoall_caller_create_meta(flags, comm);
2405   else if (d->type == CS_ALL_TO_ALL_HYBRID)
2406     d->hc = _hybrid_pex_create_meta(flags, comm);
2407 
2408   t1 = cs_timer_time();
2409   cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_TOTAL,
2410                             &t0, &t1);
2411   _all_to_all_calls[CS_ALL_TO_ALL_TIME_TOTAL] += 1;
2412 
2413   return d;
2414 }
2415 
2416 /*----------------------------------------------------------------------------*/
2417 /*!
2418  * \brief Create an all-to-all distributor for elements whose destination
2419  * rank is determined from global numbers and block distribution information.
2420  *
2421  * This is a collective operation on communicator comm.
2422  *
2423  * If the flags bit mask matches \ref CS_ALL_TO_ALL_USE_DEST_ID,
2424  * data exchanged will be ordered by global element number.
2425  *
2426  * If the flags bit mask matches \ref CS_ALL_TO_ALL_ORDER_BY_SRC_RANK,
2427  * data exchanged will be ordered by source rank (this is incompatible
2428  * with \ref CS_ALL_TO_ALL_USE_DEST_ID.
2429  *
2430  * \param[in]  n_elts       number of elements
2431  * \param[in]  flags        sum of ordering and metadata flag constants
2432  * \param[in]  src_gnum     global source element numbers
2433  * \param[in]  bi           destination block distribution info
2434  * \param[in]  comm         associated MPI communicator
2435  *
2436  * \return  pointer to new all-to-all distributor
2437  */
2438 /*----------------------------------------------------------------------------*/
2439 
2440 cs_all_to_all_t *
cs_all_to_all_create_from_block(size_t n_elts,int flags,const cs_gnum_t * src_gnum,cs_block_dist_info_t bi,MPI_Comm comm)2441 cs_all_to_all_create_from_block(size_t                 n_elts,
2442                                 int                    flags,
2443                                 const cs_gnum_t       *src_gnum,
2444                                 cs_block_dist_info_t   bi,
2445                                 MPI_Comm               comm)
2446 {
2447   cs_timer_t t0, t1;
2448 
2449   t0 = cs_timer_time();
2450 
2451   cs_all_to_all_t *d = _all_to_all_create_base(n_elts,
2452                                                flags,
2453                                                comm);
2454 
2455   /* Compute arrays based on block distribution
2456      (using global ids and block info at lower levels could
2457      save some memory, but would make things less modular;
2458      in addition, rank is an int, while global ids are usually
2459      64-bit uints, so the overhead is only 50%, and access should
2460      be faster). */
2461 
2462   BFT_MALLOC(d->_dest_rank, n_elts, int);
2463   d->dest_rank = d->_dest_rank;
2464 
2465   if (flags & CS_ALL_TO_ALL_USE_DEST_ID) {
2466     BFT_MALLOC(d->_dest_id, n_elts, cs_lnum_t);
2467     d->dest_id = d->_dest_id;
2468   }
2469 
2470   const int rank_step = bi.rank_step;
2471   const cs_gnum_t block_size = bi.block_size;
2472 
2473   if (d->_dest_id != NULL) {
2474     #pragma omp parallel for if (n_elts > CS_THR_MIN)
2475     for (size_t i = 0; i < n_elts; i++) {
2476       cs_gnum_t g_elt_id = src_gnum[i] -1;
2477       cs_gnum_t _dest_rank = g_elt_id / block_size;
2478       d->_dest_rank[i] = _dest_rank*rank_step;
2479       d->_dest_id[i]   = g_elt_id % block_size;
2480     }
2481   }
2482   else {
2483     #pragma omp parallel for if (n_elts > CS_THR_MIN)
2484     for (size_t i = 0; i < n_elts; i++) {
2485       cs_gnum_t g_elt_id = src_gnum[i] -1;
2486       cs_gnum_t _dest_rank = g_elt_id / block_size;
2487       d->_dest_rank[i] = _dest_rank*rank_step;
2488     }
2489   }
2490 
2491   /* Create substructures based on info available at this stage
2492      (for Crystal Router, delay creation as data is not passed yet) */
2493 
2494   if (d->type == CS_ALL_TO_ALL_MPI_DEFAULT)
2495     d->dc = _alltoall_caller_create_meta(flags, comm);
2496   else if (d->type == CS_ALL_TO_ALL_HYBRID)
2497     d->hc = _hybrid_pex_create_meta(flags, comm);
2498 
2499   t1 = cs_timer_time();
2500   cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_TOTAL,
2501                             &t0, &t1);
2502   _all_to_all_calls[CS_ALL_TO_ALL_TIME_TOTAL] += 1;
2503 
2504   return d;
2505 }
2506 
2507 /*----------------------------------------------------------------------------*/
2508 /*!
2509  * \brief Destroy an all-to-all distributor.
2510  *
2511  * \param[in, out]  d   pointer to associated all-to-all distributor
2512  */
2513 /*----------------------------------------------------------------------------*/
2514 
2515 void
cs_all_to_all_destroy(cs_all_to_all_t ** d)2516 cs_all_to_all_destroy(cs_all_to_all_t **d)
2517 {
2518   if (d != NULL) {
2519 
2520     cs_timer_t t0, t1;
2521 
2522     t0 = cs_timer_time();
2523 
2524     cs_all_to_all_t *_d = *d;
2525     if (_d->cr != NULL)
2526       cs_crystal_router_destroy(&(_d->cr));
2527     else if (_d->hc != NULL)
2528       _hybrid_pex_destroy(&(_d->hc));
2529     else if (_d->dc != NULL)
2530       _alltoall_caller_destroy(&(_d->dc));
2531 
2532     BFT_FREE(_d->src_rank);
2533     BFT_FREE(_d->src_id);
2534 
2535     BFT_FREE(_d->_dest_id);
2536     BFT_FREE(_d->_dest_rank);
2537 
2538     BFT_FREE(_d->recv_id);
2539     BFT_FREE(_d->src_id);
2540     BFT_FREE(_d->src_rank);
2541 
2542     BFT_FREE(_d);
2543 
2544     t1 = cs_timer_time();
2545     cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_TOTAL,
2546                               &t0, &t1);
2547 
2548     /* no increment to _all_to_all_calls[0] as create/destroy are grouped */
2549   }
2550 }
2551 
2552 /*----------------------------------------------------------------------------*/
2553 /*!
2554  * \brief Transfer ownership of destination rank to an all-to-all distributor.
2555  *
2556  * The dest_rank array should be the same as the one used for the creation of
2557  * the distributor.
2558  *
2559  * \param[in, out]  d          pointer to associated all-to-all distributor
2560  * \param[in, out]  dest_rank  pointer to element destination rank
2561  */
2562 /*----------------------------------------------------------------------------*/
2563 
2564 void
cs_all_to_all_transfer_dest_rank(cs_all_to_all_t * d,int ** dest_rank)2565 cs_all_to_all_transfer_dest_rank(cs_all_to_all_t   *d,
2566                                  int              **dest_rank)
2567 {
2568   cs_assert(d != NULL);
2569 
2570   if (d->dest_rank == *dest_rank) {
2571     d->_dest_rank = *dest_rank;
2572     *dest_rank  = NULL;
2573   }
2574   else {
2575     bft_error(__FILE__, __LINE__, 0,
2576               "%s: array transferred (%p)does not match the one used\n"
2577               "for all-to-all distributor creation (%p).",
2578               __func__, (void *)*dest_rank, (const void *)d->dest_rank);
2579   }
2580 }
2581 
2582 /*----------------------------------------------------------------------------*/
2583 /*!
2584  * \brief Transfer ownership of destination ids to an
2585  *        all-to-all distributor.
2586  *
2587  * The dest_id array should be the same as the one used for the creation of
2588  * the distributor.
2589  *
2590  * \param[in, out]  d        pointer to associated all-to-all distributor
2591  * \param[in, out]  dest_id  pointer to element destination id
2592  */
2593 /*----------------------------------------------------------------------------*/
2594 
2595 void
cs_all_to_all_transfer_dest_id(cs_all_to_all_t * d,cs_lnum_t ** dest_id)2596 cs_all_to_all_transfer_dest_id(cs_all_to_all_t   *d,
2597                                cs_lnum_t        **dest_id)
2598 {
2599   cs_assert(d != NULL);
2600 
2601   if (d->dest_id == *dest_id) {
2602     d->_dest_id = *dest_id;
2603     *dest_id  = NULL;
2604   }
2605   else {
2606     bft_error(__FILE__, __LINE__, 0,
2607               "%s: array transferred (%p)does not match the one used\n"
2608               "for all-to-all distributor creation (%p).",
2609               __func__, (void *)*dest_id, (const void *)d->dest_id);
2610   }
2611 }
2612 
2613 /*----------------------------------------------------------------------------*/
2614 /*!
2615  * \brief Get number of elements associated with all-to-all distributor.
2616  *
2617  * The number of elements is the number of elements received after exchange.
2618  *
2619  * If no exchange has been done yet (depending on the communication protocol),
2620  * metadata will be exchanged by this call, so it is a collective operation.
2621  *
2622  * \param[in]  d   pointer to associated all-to-all distributor
2623  *
2624  * \return  number of elements associated with distributor.
2625  */
2626 /*----------------------------------------------------------------------------*/
2627 
2628 cs_lnum_t
cs_all_to_all_n_elts_dest(cs_all_to_all_t * d)2629 cs_all_to_all_n_elts_dest(cs_all_to_all_t  *d)
2630 {
2631   cs_assert(d != NULL);
2632 
2633   /* Obtain count if not available yet */
2634 
2635   if (d->n_elts_dest < 0) {
2636 
2637     cs_timer_t t0, t1;
2638 
2639     t0 = cs_timer_time();
2640 
2641     switch(d->type) {
2642     case CS_ALL_TO_ALL_MPI_DEFAULT:
2643       {
2644         _alltoall_caller_exchange_meta(d->dc,
2645                                        d->n_elts_src,
2646                                        d->dest_rank);
2647         if (d->dc->dest_id_datatype == CS_LNUM_TYPE)
2648           cs_all_to_all_copy_array(d,
2649                                    CS_DATATYPE_NULL,
2650                                    0,
2651                                    false,
2652                                    NULL,
2653                                    NULL);
2654         else {
2655           d->n_elts_dest = d->dc->recv_size;
2656           d->n_elts_dest_e = d->dc->recv_size;
2657         }
2658       }
2659       break;
2660 
2661     case CS_ALL_TO_ALL_HYBRID:
2662       {
2663         _hybrid_pex_exchange_meta(d->hc,
2664                                   d->n_elts_src,
2665                                   d->dest_rank);
2666         BFT_FREE(d->_dest_rank);
2667         if (d->hc->dest_id_datatype == CS_LNUM_TYPE)
2668           cs_all_to_all_copy_array(d,
2669                                    CS_DATATYPE_NULL,
2670                                    0,
2671                                    false,
2672                                    NULL,
2673                                    NULL);
2674         else {
2675           d->n_elts_dest = d->hc->recv_size;
2676           d->n_elts_dest_e = d->hc->recv_size;
2677         }
2678       }
2679       break;
2680 
2681     case CS_ALL_TO_ALL_CRYSTAL_ROUTER:
2682       {
2683         cs_crystal_router_t *cr
2684           = cs_crystal_router_create_s(d->n_elts_src,
2685                                        0,
2686                                        CS_DATATYPE_NULL,
2687                                         _cr_flags(d, false),
2688                                        NULL,
2689                                        NULL,
2690                                        d->dest_id,
2691                                        d->dest_rank,
2692                                        d->comm);
2693 
2694         cs_timer_t tcr0 = cs_timer_time();
2695 
2696         if (_n_trace < _n_trace_max) {
2697           /* Time to 1-5 s */
2698           _all_to_all_trace[_n_trace*9] = tcr0.sec*1e5 + tcr0.nsec/1e4;
2699           _all_to_all_trace[_n_trace*9+1] = 0;
2700           _all_to_all_trace[_n_trace*9+2] = 0;
2701           _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
2702           _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
2703           _all_to_all_trace[_n_trace*9+5] = 0;
2704           _all_to_all_trace[_n_trace*9+6] = 0;
2705           _all_to_all_trace[_n_trace*9+7] = 0;
2706           _all_to_all_trace[_n_trace*9+8] = 0;
2707           _n_trace += 1;
2708         }
2709 
2710         cs_crystal_router_exchange(cr);
2711 
2712         cs_timer_t tcr1 = cs_timer_time();
2713         cs_timer_counter_add_diff
2714           (_all_to_all_timers + CS_ALL_TO_ALL_TIME_METADATA, &tcr0, &tcr1);
2715         _all_to_all_calls[CS_ALL_TO_ALL_TIME_METADATA] += 1;
2716 
2717         if (_n_trace < _n_trace_max) {
2718           /* Time to 1-5 s */
2719           _all_to_all_trace[_n_trace*9] = tcr1.sec*1e5 + tcr1.nsec/1e4;
2720           _all_to_all_trace[_n_trace*9+1] =   _all_to_all_trace[_n_trace*9]
2721                                             - _all_to_all_trace[(_n_trace-1)*9];
2722           _all_to_all_trace[_n_trace*9+2] = 1;
2723           _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
2724           _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
2725           _all_to_all_trace[_n_trace*9+5] = 0;
2726           _all_to_all_trace[_n_trace*9+6] = 0;
2727           _all_to_all_trace[_n_trace*9+7] = 0;
2728           _all_to_all_trace[_n_trace*9+8] = 0;
2729           _n_trace += 1;
2730         }
2731 
2732         d->n_elts_dest = cs_crystal_router_n_elts(cr);
2733         d->n_elts_dest_e = cs_crystal_router_n_recv_elts(cr);
2734 
2735         if (d->flags & CS_ALL_TO_ALL_ORDER_BY_SRC_RANK)
2736           _cr_recv_id_by_src_rank(d, cr);
2737 
2738         int **p_src_rank = _is_src_rank_info_needed(d) ? &(d->src_rank) : NULL;
2739         cs_crystal_router_get_data(cr,
2740                                    p_src_rank,
2741                                    &(d->recv_id),
2742                                    &(d->src_id),
2743                                    NULL, /* dest_index */
2744                                    NULL);
2745 
2746         cs_crystal_router_destroy(&cr);
2747 
2748       }
2749       break;
2750 
2751     }
2752 
2753     t1 = cs_timer_time();
2754     cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_TOTAL,
2755                               &t0, &t1);
2756     _all_to_all_calls[CS_ALL_TO_ALL_TIME_TOTAL] += 1;
2757 
2758   }
2759 
2760   return d->n_elts_dest;
2761 }
2762 
2763 /*----------------------------------------------------------------------------*/
2764 /*!
2765  * \brief Communicate array data using all-to-all distributor.
2766  *
2767  * If a destination buffer is provided, it should be of sufficient size for
2768  * the number of elements returned by \ref cs_all_to_all_n_elts_dest
2769  * (multiplied by stride and datatype size).
2770  *
2771  * If no buffer is provided, one is allocated automatically, and transferred
2772  * to the caller (who is responsible for freeing it when no longer needed).
2773  *
2774  * If used in reverse mode, data is still communicated from src_data
2775  * to dest_buffer or an internal buffer, but communication direction
2776  * (i.e. source and destination ranks) are reversed.
2777  *
2778  * This is obviously a collective operation, and all ranks must provide
2779  * the same datatype, stride, and reverse values.
2780  *
2781  * \param[in, out]  d          pointer to associated all-to-all distributor
2782  * \param[in]       datatype   type of data considered
2783  * \param[in]       stride     number of values per entity (interlaced),
2784  * \param[in]       reverse    if true, communicate in reverse direction
2785  * \param[in]       src_data   source data
2786  * \param[out]      dest_data  pointer to destination data, or NULL
2787  *
2788  * \return pointer to destination data (dest_buffer if non-NULL)
2789  */
2790 /*----------------------------------------------------------------------------*/
2791 
2792 void *
cs_all_to_all_copy_array(cs_all_to_all_t * d,cs_datatype_t datatype,int stride,bool reverse,const void * src_data,void * dest_data)2793 cs_all_to_all_copy_array(cs_all_to_all_t   *d,
2794                          cs_datatype_t      datatype,
2795                          int                stride,
2796                          bool               reverse,
2797                          const void        *src_data,
2798                          void              *dest_data)
2799 {
2800   cs_assert(d != NULL);
2801 
2802   void  *_dest_data = NULL;
2803 
2804   if (_n_trace > 0) {
2805     fprintf(_all_to_all_trace_bt_log, "\ncs_all_to_all_copy_array: %d\n\n",
2806             _n_trace);
2807     cs_base_backtrace_dump(_all_to_all_trace_bt_log, 1);
2808     bft_printf("cs_all_to_all_copy_array: %d\n", _n_trace);
2809   }
2810 
2811   cs_timer_t t0, t1;
2812 
2813   t0 = cs_timer_time();
2814 
2815   /* Reverse can only be called after direct exchange in most cases
2816      (this case should be rare, and requires additional echanges,
2817      but let's play it safe and make sure it works if needed) */
2818 
2819   if (d->n_elts_dest < 0 && reverse)
2820     cs_all_to_all_copy_array(d,
2821                              CS_DATATYPE_NULL,
2822                              0,
2823                              false,
2824                              NULL,
2825                              NULL);
2826 
2827   /* Now do regular exchange */
2828 
2829   switch(d->type) {
2830 
2831   case CS_ALL_TO_ALL_MPI_DEFAULT:
2832     {
2833       if (d->n_elts_dest < 0) { /* Exchange metadata if not done yet */
2834         _alltoall_caller_exchange_meta(d->dc,
2835                                        d->n_elts_src,
2836                                        d->dest_rank);
2837         d->n_elts_dest = d->dc->recv_size;
2838         d->n_elts_dest_e = d->dc->recv_size;
2839       }
2840       size_t n_elts = (reverse) ? d->n_elts_dest : d->n_elts_src;
2841       if (reverse)
2842         _alltoall_caller_swap_src_dest(d->dc);
2843       _alltoall_caller_prepare_s(d->dc,
2844                                  n_elts,
2845                                  stride,
2846                                  datatype,
2847                                  reverse,
2848                                  src_data,
2849                                  d->dest_id,
2850                                  d->recv_id,
2851                                  d->dest_rank);
2852       _dest_data = _alltoall_caller_exchange_s(d,
2853                                                d->dc,
2854                                                reverse,
2855                                                d->dest_rank,
2856                                                dest_data);
2857       if (reverse) {
2858         _alltoall_caller_swap_src_dest(d->dc);
2859         if (d->dc->send_buffer == src_data)
2860           d->dc->send_buffer = NULL;
2861       }
2862     }
2863     break;
2864 
2865   case CS_ALL_TO_ALL_HYBRID:
2866     {
2867       if (d->n_elts_dest < 0) { /* Exchange metadata if not done yet */
2868         _hybrid_pex_exchange_meta(d->hc,
2869                                   d->n_elts_src,
2870                                   d->dest_rank);
2871         BFT_FREE(d->_dest_rank);
2872         d->n_elts_dest = d->hc->recv_size;
2873         d->n_elts_dest_e = d->hc->recv_size;
2874       }
2875       size_t n_elts = (reverse) ? d->n_elts_dest : d->n_elts_src;
2876       if (reverse)
2877         _hybrid_pex_swap_src_dest(d->hc);
2878       _hybrid_pex_prepare_s(d->hc,
2879                             n_elts,
2880                             stride,
2881                             datatype,
2882                             reverse,
2883                             src_data,
2884                             d->dest_id,
2885                             d->recv_id);
2886       _dest_data = _hybrid_pex_exchange_s(d,
2887                                           d->hc,
2888                                           reverse,
2889                                           dest_data);
2890       if (reverse) {
2891         _hybrid_pex_swap_src_dest(d->hc);
2892         if (d->hc->send_buffer == src_data)
2893           d->hc->send_buffer = NULL;
2894       }
2895     }
2896     break;
2897 
2898   case CS_ALL_TO_ALL_CRYSTAL_ROUTER:
2899     {
2900       _dest_data = dest_data;
2901       cs_timer_t tcr0, tcr1;
2902       cs_crystal_router_t *cr;
2903       if (!reverse) {
2904         cr = cs_crystal_router_create_s(d->n_elts_src,
2905                                         stride,
2906                                         datatype,
2907                                         _cr_flags(d, reverse),
2908                                         src_data,
2909                                         NULL,
2910                                         d->dest_id,
2911                                         d->dest_rank,
2912                                         d->comm);
2913         tcr0 = cs_timer_time();
2914 
2915         if (_n_trace < _n_trace_max) {
2916           /* Time to 1-5 s */
2917           _all_to_all_trace[_n_trace*9] = tcr0.sec*1e5 + tcr0.nsec/1e4;
2918           _all_to_all_trace[_n_trace*9+1] = 0;
2919           _all_to_all_trace[_n_trace*9+2] = 0;
2920           _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
2921           _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
2922           _all_to_all_trace[_n_trace*9+5] = 0;
2923           _all_to_all_trace[_n_trace*9+6] = 0;
2924           _all_to_all_trace[_n_trace*9+7] = 0;
2925           _all_to_all_trace[_n_trace*9+8] = 0;
2926           _n_trace += 1;
2927         }
2928 
2929         cs_crystal_router_exchange(cr);
2930         tcr1 = cs_timer_time();
2931 
2932         if (_n_trace < _n_trace_max) {
2933           size_t max_sizes[2];
2934           cs_crystal_router_get_max_sizes(cr, max_sizes);
2935 
2936           /* Time to 1-5 s */
2937           _all_to_all_trace[_n_trace*9] = tcr1.sec*1e5 + tcr1.nsec/1e4;
2938           _all_to_all_trace[_n_trace*9+1] =   _all_to_all_trace[_n_trace*9]
2939                                             - _all_to_all_trace[(_n_trace-1)*9];
2940           _all_to_all_trace[_n_trace*9+2] = 1;
2941           _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
2942           _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
2943           _all_to_all_trace[_n_trace*9+5] = max_sizes[0];
2944           _all_to_all_trace[_n_trace*9+6] = max_sizes[1];
2945           _all_to_all_trace[_n_trace*9+7] = 0;
2946           _all_to_all_trace[_n_trace*9+8] = 0;
2947           _n_trace += 1;
2948         }
2949 
2950         if (d->n_elts_dest_e < 0) {
2951           d->n_elts_dest_e = cs_crystal_router_n_recv_elts(cr);
2952           if (d->flags & CS_ALL_TO_ALL_ORDER_BY_SRC_RANK)
2953             _cr_recv_id_by_src_rank(d, cr);
2954         }
2955         int **p_src_rank = _is_src_rank_info_needed(d) ? &(d->src_rank) : NULL;
2956         cs_crystal_router_get_data(cr,
2957                                    p_src_rank,
2958                                    &(d->recv_id),
2959                                    &(d->src_id),
2960                                    NULL, /* dest_index */
2961                                    &_dest_data);
2962         if (d->n_elts_dest < 0)
2963           d->n_elts_dest = cs_crystal_router_n_elts(cr);
2964       }
2965       else {
2966         const cs_lnum_t *elt_id
2967           = (d->n_elts_dest < d->n_elts_dest_e) ? d->recv_id : NULL;
2968         cr = cs_crystal_router_create_s(d->n_elts_dest_e,
2969                                         stride,
2970                                         datatype,
2971                                         _cr_flags(d, reverse),
2972                                         src_data,
2973                                         elt_id,
2974                                         d->src_id,
2975                                         d->src_rank,
2976                                         d->comm);
2977         tcr0 = cs_timer_time();
2978 
2979         if (_n_trace < _n_trace_max) {
2980           /* Time to 1-5 s */
2981           _all_to_all_trace[_n_trace*9] = tcr0.sec*1e5 + tcr0.nsec/1e4;
2982           _all_to_all_trace[_n_trace*9+1] = 0;
2983           _all_to_all_trace[_n_trace*9+2] = 0;
2984           _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
2985           _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
2986           _all_to_all_trace[_n_trace*9+5] = 0;
2987           _all_to_all_trace[_n_trace*9+6] = 0;
2988           _all_to_all_trace[_n_trace*9+7] = 0;
2989           _all_to_all_trace[_n_trace*9+8] = 0;
2990           _n_trace += 1;
2991         }
2992 
2993         cs_crystal_router_exchange(cr);
2994         tcr1 = cs_timer_time();
2995 
2996         if (_n_trace < _n_trace_max) {
2997           size_t max_sizes[2];
2998           cs_crystal_router_get_max_sizes(cr, max_sizes);
2999 
3000           /* Time to 1-5 s */
3001           _all_to_all_trace[_n_trace*9] = tcr1.sec*1e5 + tcr1.nsec/1e4;
3002           _all_to_all_trace[_n_trace*9+1] =   _all_to_all_trace[_n_trace*9]
3003                                             - _all_to_all_trace[(_n_trace-1)*9];
3004           _all_to_all_trace[_n_trace*9+2] = 1;
3005           _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
3006           _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
3007           _all_to_all_trace[_n_trace*9+5] = max_sizes[0];
3008           _all_to_all_trace[_n_trace*9+6] = max_sizes[1];
3009           _all_to_all_trace[_n_trace*9+7] = 0;
3010           _all_to_all_trace[_n_trace*9+8] = 0;
3011           _n_trace += 1;
3012         }
3013 
3014         cs_crystal_router_get_data(cr,
3015                                    NULL,
3016                                    NULL,
3017                                    NULL,
3018                                    NULL, /* dest_index */
3019                                    &_dest_data);
3020       }
3021       cs_crystal_router_destroy(&cr);
3022       cs_timer_counter_add_diff
3023         (_all_to_all_timers + CS_ALL_TO_ALL_TIME_EXCHANGE, &tcr0, &tcr1);
3024       _all_to_all_calls[CS_ALL_TO_ALL_TIME_EXCHANGE] += 1;
3025     }
3026     break;
3027 
3028   }
3029 
3030   t1 = cs_timer_time();
3031   cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_TOTAL,
3032                             &t0, &t1);
3033   _all_to_all_calls[CS_ALL_TO_ALL_TIME_TOTAL] += 1;
3034 
3035   return _dest_data;
3036 }
3037 
3038 /*----------------------------------------------------------------------------*/
3039 /*!
3040  * \brief Communicate local index using all-to-all distributor.
3041  *
3042  * If a destination buffer is provided, it should be of sufficient size for
3043  * the number of elements returned by \ref cs_all_to_all_n_elts_dest.
3044  *
3045  * If no buffer is provided, one is allocated automatically, and transferred
3046  * to the caller (who is responsible for freeing it when no longer needed).
3047  *
3048  * If used in reverse mode, data is still communicated from src_index
3049  * to dest_index or an internal buffer, but communication direction
3050  * (i.e. source and destination ranks) are reversed.
3051  *
3052  * This is obviously a collective operation, and all ranks must provide
3053  * the same value for the reverse parameter.
3054  *
3055  * \param[in, out]  d           pointer to associated all-to-all distributor
3056  * \param[in]       reverse     if true, communicate in reverse direction
3057  * \param[in]       src_index   source index
3058  * \param[out]      dest_index  pointer to destination index, or NULL
3059  *
3060  * \return pointer to destination data (dest_buffer if non-NULL)
3061  */
3062 /*----------------------------------------------------------------------------*/
3063 
3064 cs_lnum_t *
cs_all_to_all_copy_index(cs_all_to_all_t * d,bool reverse,const cs_lnum_t * src_index,cs_lnum_t * dest_index)3065 cs_all_to_all_copy_index(cs_all_to_all_t  *d,
3066                          bool              reverse,
3067                          const cs_lnum_t  *src_index,
3068                          cs_lnum_t        *dest_index)
3069 {
3070   cs_timer_t t0, t1;
3071 
3072   if (_n_trace > 0) {
3073     fprintf(_all_to_all_trace_bt_log, "\ncs_all_to_all_copy_index: %d\n\n",
3074             _n_trace);
3075     cs_base_backtrace_dump(_all_to_all_trace_bt_log, 1);
3076     bft_printf("cs_all_to_all_copy_index: %d\n", _n_trace);
3077   }
3078 
3079   cs_assert(d != NULL);
3080 
3081   cs_lnum_t *src_count = NULL;
3082   cs_lnum_t *_dest_index = dest_index;
3083 
3084   cs_all_to_all_n_elts_dest(d); /* force sync. if needed */
3085 
3086   cs_lnum_t n_src = (reverse) ? d->n_elts_dest : d->n_elts_src;
3087   cs_lnum_t n_dest = -1;
3088 
3089   if (dest_index == NULL) {
3090     n_dest = (reverse) ? d->n_elts_src : d->n_elts_dest;
3091   }
3092 
3093   t0 = cs_timer_time();
3094 
3095   if (dest_index == NULL)
3096     BFT_MALLOC(_dest_index, n_dest + 1, cs_lnum_t);
3097 
3098   /* Convert send index to count, then exchange */
3099 
3100   BFT_MALLOC(src_count, n_src, cs_lnum_t);
3101 
3102   for (cs_lnum_t i = 0; i < n_src; i++)
3103     src_count[i] = src_index[i+1] - src_index[i];
3104 
3105   t1 = cs_timer_time();
3106   cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_TOTAL,
3107                             &t0, &t1);
3108 
3109   cs_all_to_all_copy_array(d,
3110                            CS_LNUM_TYPE,
3111                            1,
3112                            reverse,
3113                            src_count,
3114                            _dest_index + 1);
3115 
3116   t0 = cs_timer_time();
3117 
3118   BFT_FREE(src_count);
3119 
3120   _dest_index[0] = 0;
3121 
3122   if (n_dest < 1)
3123     n_dest = (reverse) ? d->n_elts_src : d->n_elts_dest;
3124 
3125   for (cs_lnum_t i = 0; i < n_dest; i++)
3126     _dest_index[i+1] += _dest_index[i];
3127 
3128   t1 = cs_timer_time();
3129   cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_TOTAL,
3130                             &t0, &t1);
3131 
3132   return _dest_index;
3133 }
3134 
3135 /*----------------------------------------------------------------------------*/
3136 /*!
3137  * \brief Communicate local index using all-to-all distributor.
3138  *
3139  * If a destination buffer is provided, it should be of sufficient size for
3140  * the number of elements indicated by
3141  * dest_index[\ref cs_all_to_all_n_elts_dest "cs_all_to_all_n_elts_dest(d)"];
3142  *
3143  * If no buffer is provided, one is allocated automatically, and transferred
3144  * to the caller (who is responsible for freeing it when no longer needed).
3145  *
3146  * If used in reverse mode, data is still communicated from src_index
3147  * to dest_index or an internal buffer, but communication direction
3148  * (i.e. source and destination ranks) are reversed.
3149  *
3150  * This is obviously a collective operation, and all ranks must provide
3151  * the same value for the reverse parameter.
3152  *
3153  * \param[in, out]  d           pointer to associated all-to-all distributor
3154  * \param[in]       datatype    type of data considered
3155  * \param[in]       reverse     if true, communicate in reverse direction
3156  * \param[in]       src_index   source index
3157  * \param[in]       src_data    source data
3158  * \param[in]       dest_index  destination index
3159  * \param[out]      dest_data   pointer to destination data, or NULL
3160  *
3161  * \return pointer to destination data (dest_buffer if non-NULL)
3162  */
3163 /*----------------------------------------------------------------------------*/
3164 
3165 void *
cs_all_to_all_copy_indexed(cs_all_to_all_t * d,cs_datatype_t datatype,bool reverse,const cs_lnum_t * src_index,const void * src_data,const cs_lnum_t * dest_index,void * dest_data)3166 cs_all_to_all_copy_indexed(cs_all_to_all_t  *d,
3167                            cs_datatype_t     datatype,
3168                            bool              reverse,
3169                            const cs_lnum_t  *src_index,
3170                            const void       *src_data,
3171                            const cs_lnum_t  *dest_index,
3172                            void             *dest_data)
3173 {
3174   cs_assert(d != NULL);
3175 
3176   if (_n_trace > 0) {
3177     fprintf(_all_to_all_trace_bt_log, "\ncs_all_to_all_copy_indexed: %d\n\n",
3178             _n_trace);
3179     cs_base_backtrace_dump(_all_to_all_trace_bt_log, 1);
3180     bft_printf("cs_all_to_all_copy_indexed: %d\n", _n_trace);
3181   }
3182 
3183   void  *_dest_data = NULL;
3184 
3185   cs_timer_t t0, t1;
3186 
3187   t0 = cs_timer_time();
3188 
3189   /* Reverse can only be called after direct exchange in most cases
3190      (this case should be rare, and requires additional echanges,
3191      but let's play it safe and make sure it works if needed) */
3192 
3193   if (d->n_elts_dest < 0 && reverse)
3194     cs_all_to_all_copy_array(d,
3195                              CS_DATATYPE_NULL,
3196                              0,
3197                              false,
3198                              NULL,
3199                              NULL);
3200 
3201   /* Now do regular exchange */
3202 
3203   switch(d->type) {
3204 
3205   case CS_ALL_TO_ALL_MPI_DEFAULT:
3206     {
3207       if (d->n_elts_dest < 0) { /* Exchange metadata if not done yet */
3208         _alltoall_caller_exchange_meta(d->dc,
3209                                        d->n_elts_src,
3210                                        d->dest_rank);
3211         d->n_elts_dest = d->dc->recv_size;
3212       }
3213       size_t n_elts = (reverse) ? d->n_elts_dest : d->n_elts_src;
3214       _alltoall_caller_save_meta_i(d->dc);
3215       if (reverse)
3216         _alltoall_caller_swap_src_dest(d->dc);
3217       _alltoall_caller_prepare_i(d->dc,
3218                                  n_elts,
3219                                  datatype,
3220                                  reverse,
3221                                  src_index,
3222                                  dest_index,
3223                                  src_data,
3224                                  d->recv_id,
3225                                  d->dest_rank);
3226       _dest_data = _alltoall_caller_exchange_i(d,
3227                                                d->dc,
3228                                                reverse,
3229                                                dest_index,
3230                                                d->dest_rank,
3231                                                dest_data);
3232       if (reverse) {
3233         _alltoall_caller_swap_src_dest(d->dc);
3234         if (d->dc->send_buffer == src_data)
3235           d->dc->send_buffer = NULL;
3236       }
3237       _alltoall_caller_reset_meta_i(d->dc, d->dest_rank);
3238     }
3239     break;
3240 
3241   case CS_ALL_TO_ALL_HYBRID:
3242     {
3243       if (d->n_elts_dest < 0) { /* Exchange metadata if not done yet */
3244         _hybrid_pex_exchange_meta(d->hc,
3245                                   d->n_elts_src,
3246                                   d->dest_rank);
3247         BFT_FREE(d->_dest_rank);
3248         d->n_elts_dest = d->hc->recv_size;
3249       }
3250       size_t n_elts = (reverse) ? d->n_elts_dest : d->n_elts_src;
3251       _hybrid_pex_save_meta_i(d->hc);
3252       if (reverse)
3253         _hybrid_pex_swap_src_dest(d->hc);
3254       _hybrid_pex_prepare_i(d->hc,
3255                             n_elts,
3256                             datatype,
3257                             reverse,
3258                             src_index,
3259                             dest_index,
3260                             src_data,
3261                             d->recv_id);
3262       _dest_data = _hybrid_pex_exchange_i(d,
3263                                           d->hc,
3264                                           reverse,
3265                                           dest_index,
3266                                           dest_data);
3267       if (reverse) {
3268         _hybrid_pex_swap_src_dest(d->hc);
3269         if (d->hc->send_buffer == src_data)
3270           d->hc->send_buffer = NULL;
3271       }
3272       _hybrid_pex_reset_meta_i(d->hc);
3273     }
3274     break;
3275 
3276   case CS_ALL_TO_ALL_CRYSTAL_ROUTER:
3277     {
3278       _dest_data = dest_data;
3279       cs_timer_t tcr0, tcr1;
3280       cs_crystal_router_t *cr;
3281       if (!reverse) {
3282         cr = cs_crystal_router_create_i(d->n_elts_src,
3283                                         datatype,
3284                                         _cr_flags(d, reverse),
3285                                         src_index,
3286                                         src_data,
3287                                         NULL,
3288                                         d->dest_id,
3289                                         d->dest_rank,
3290                                         d->comm);
3291         tcr0 = cs_timer_time();
3292 
3293         if (_n_trace < _n_trace_max) {
3294           /* Time to 1-5 s */
3295           _all_to_all_trace[_n_trace*9] = tcr0.sec*1e5 + tcr0.nsec/1e4;
3296           _all_to_all_trace[_n_trace*9+1] = 0;
3297           _all_to_all_trace[_n_trace*9+2] = 0;
3298           _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
3299           _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
3300           _all_to_all_trace[_n_trace*9+5] = 0;
3301           _all_to_all_trace[_n_trace*9+6] = 0;
3302           _all_to_all_trace[_n_trace*9+7] = 0;
3303           _all_to_all_trace[_n_trace*9+8] = 0;
3304           _n_trace += 1;
3305         }
3306 
3307         cs_crystal_router_exchange(cr);
3308         tcr1 = cs_timer_time();
3309 
3310         if (_n_trace < _n_trace_max) {
3311           size_t max_sizes[2];
3312           cs_crystal_router_get_max_sizes(cr, max_sizes);
3313 
3314           /* Time to 1-5 s */
3315           _all_to_all_trace[_n_trace*9] = tcr1.sec*1e5 + tcr1.nsec/1e4;
3316           _all_to_all_trace[_n_trace*9+1] =   _all_to_all_trace[_n_trace*9]
3317                                             - _all_to_all_trace[(_n_trace-1)*9];
3318           _all_to_all_trace[_n_trace*9+2] = 1;
3319           _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
3320           _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
3321           _all_to_all_trace[_n_trace*9+5] = max_sizes[0];
3322           _all_to_all_trace[_n_trace*9+6] = max_sizes[1];
3323           _all_to_all_trace[_n_trace*9+7] = 0;
3324           _all_to_all_trace[_n_trace*9+8] = 0;
3325           _n_trace += 1;
3326         }
3327 
3328         if (d->n_elts_dest < 0) {
3329           d->n_elts_dest = cs_crystal_router_n_elts(cr);
3330           d->n_elts_dest_e = cs_crystal_router_n_recv_elts(cr);
3331         }
3332         int **p_src_rank = (d->src_rank == NULL) ? &(d->src_rank) : NULL;
3333         cs_crystal_router_get_data(cr,
3334                                    p_src_rank,
3335                                    &(d->recv_id),
3336                                    &(d->src_id),
3337                                    NULL, /* dest_index */
3338                                    &_dest_data);
3339       }
3340       else {
3341         const cs_lnum_t *elt_id
3342           = (d->n_elts_dest < d->n_elts_dest_e) ? d->recv_id : NULL;
3343         cr = cs_crystal_router_create_i(d->n_elts_dest_e,
3344                                         datatype,
3345                                         _cr_flags(d, reverse),
3346                                         src_index,
3347                                         src_data,
3348                                         elt_id,
3349                                         d->src_id,
3350                                         d->src_rank,
3351                                         d->comm);
3352         tcr0 = cs_timer_time();
3353 
3354         if (_n_trace < _n_trace_max) {
3355           /* Time to 1-5 s */
3356           _all_to_all_trace[_n_trace*9] = tcr0.sec*1e5 + tcr0.nsec/1e4;
3357           _all_to_all_trace[_n_trace*9+1] = 0;
3358           _all_to_all_trace[_n_trace*9+2] = 0;
3359           _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
3360           _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
3361           _all_to_all_trace[_n_trace*9+5] = 0;
3362           _all_to_all_trace[_n_trace*9+6] = 0;
3363           _all_to_all_trace[_n_trace*9+7] = 0;
3364           _all_to_all_trace[_n_trace*9+8] = 0;
3365           _n_trace += 1;
3366         }
3367 
3368         cs_crystal_router_exchange(cr);
3369         tcr1 = cs_timer_time();
3370 
3371         if (_n_trace < _n_trace_max) {
3372           size_t max_sizes[2];
3373           cs_crystal_router_get_max_sizes(cr, max_sizes);
3374 
3375           /* Time to 1-5 s */
3376           _all_to_all_trace[_n_trace*9] = tcr1.sec*1e5 + tcr1.nsec/1e4;
3377           _all_to_all_trace[_n_trace*9+1] =   _all_to_all_trace[_n_trace*9]
3378                                             - _all_to_all_trace[(_n_trace-1)*9];
3379           _all_to_all_trace[_n_trace*9+2] = 1;
3380           _all_to_all_trace[_n_trace*9+3] = bft_mem_usage_pr_size();
3381           _all_to_all_trace[_n_trace*9+4] = bft_mem_usage_max_pr_size();
3382           _all_to_all_trace[_n_trace*9+5] = max_sizes[0];
3383           _all_to_all_trace[_n_trace*9+6] = max_sizes[1];
3384           _all_to_all_trace[_n_trace*9+7] = 0;
3385           _all_to_all_trace[_n_trace*9+8] = 0;
3386           _n_trace += 1;
3387         }
3388 
3389         cs_crystal_router_get_data(cr,
3390                                    NULL,
3391                                    NULL,
3392                                    NULL,
3393                                    NULL, /* dest_index */
3394                                    &_dest_data);
3395       }
3396       cs_crystal_router_destroy(&cr);
3397       cs_timer_counter_add_diff
3398         (_all_to_all_timers + CS_ALL_TO_ALL_TIME_EXCHANGE, &tcr0, &tcr1);
3399       _all_to_all_calls[CS_ALL_TO_ALL_TIME_EXCHANGE] += 1;
3400     }
3401     break;
3402 
3403   }
3404 
3405   t1 = cs_timer_time();
3406   cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_TOTAL,
3407                             &t0, &t1);
3408   _all_to_all_calls[CS_ALL_TO_ALL_TIME_TOTAL] += 1;
3409 
3410   return _dest_data;
3411 }
3412 
3413 /*----------------------------------------------------------------------------*/
3414 /*!
3415  * \brief Get array of source element ranks associated with an
3416  *        all-to-all distributor.
3417  *
3418  * This function should be called only after \ref cs_all_to_all_copy_array,
3419  * and allocates and returns an array of source element ranks matching the
3420  * exchanged data elements.
3421  *
3422  * It should also be called only if the distributor creation flags match
3423  * CS_ALL_TO_ALL_NEED_SRC_RANK or CS_ALL_TO_ALL_ORDER_BY_SRC_RANK.
3424  *
3425  * The returned data is owned by the caller, who is responsible for freeing
3426  * it when no longer needed.
3427  *
3428  * If source ranks are not available (depending on the distributor
3429  * creation function and options), the matching pointer will
3430  * be set to NULL.
3431  *
3432  * \param[in]  d   pointer to associated all-to-all distributor
3433  *
3434  * \return  array of source ranks (or NULL)
3435  */
3436 /*----------------------------------------------------------------------------*/
3437 
3438 int *
cs_all_to_all_get_src_rank(cs_all_to_all_t * d)3439 cs_all_to_all_get_src_rank(cs_all_to_all_t  *d)
3440 {
3441   cs_timer_t t0 = cs_timer_time();
3442 
3443   int *src_rank = NULL;
3444 
3445   cs_assert(d != NULL);
3446 
3447   if (! (   d->flags & CS_ALL_TO_ALL_NEED_SRC_RANK
3448          || d->flags & CS_ALL_TO_ALL_ORDER_BY_SRC_RANK))
3449     bft_error(__FILE__, __LINE__, 0,
3450               "%s: is called for a distributor with flags %d, which does not\n"
3451               "match masks CS_ALL_TO_ALL_NEED_SRC_RANK (%d) or "
3452               "CS_ALL_TO_ALL_ORDER_BY_SRC_RANK (%d).",
3453               __func__, d->flags,
3454               CS_ALL_TO_ALL_NEED_SRC_RANK,
3455               CS_ALL_TO_ALL_ORDER_BY_SRC_RANK);
3456 
3457   BFT_MALLOC(src_rank, d->n_elts_dest, int);
3458 
3459   switch(d->type) {
3460 
3461   case CS_ALL_TO_ALL_MPI_DEFAULT:
3462     {
3463       const _mpi_all_to_all_caller_t *dc = d->dc;
3464 
3465       for (int i = 0; i < dc->n_ranks; i++) {
3466         for (cs_lnum_t j = dc->recv_displ[i]; j < dc->recv_displ[i+1]; j++)
3467           src_rank[j] = i;
3468       }
3469     }
3470     break;
3471 
3472   case CS_ALL_TO_ALL_HYBRID:
3473     {
3474       const _hybrid_pex_t *hc = d->hc;
3475 
3476       for (int i = 0; i < hc->rn_recv->size; i++) {
3477         int rank_id = hc->rn_recv->rank[i];
3478         for (cs_lnum_t j = hc->recv_displ[i]; j < hc->recv_displ[i+1]; j++)
3479           src_rank[j] = rank_id;
3480       }
3481     }
3482     break;
3483 
3484   case CS_ALL_TO_ALL_CRYSTAL_ROUTER:
3485     {
3486       if (d->src_rank != NULL)
3487         memcpy(src_rank, d->src_rank, d->n_elts_dest*sizeof(int));
3488     }
3489 
3490   }
3491 
3492   cs_timer_t t1 = cs_timer_time();
3493   cs_timer_counter_add_diff(_all_to_all_timers + CS_ALL_TO_ALL_TIME_TOTAL,
3494                             &t0, &t1);
3495 
3496   return src_rank;
3497 }
3498 
3499 #endif /* defined(HAVE_MPI) */
3500 
3501 /*----------------------------------------------------------------------------*/
3502 /*!
3503  * \brief Get current type of all-to-all distributor algorithm choice.
3504  *
3505  * \return  current type of all-to-all distributor algorithm choice
3506  */
3507 /*----------------------------------------------------------------------------*/
3508 
3509 cs_all_to_all_type_t
cs_all_to_all_get_type(void)3510 cs_all_to_all_get_type(void)
3511 {
3512   return _all_to_all_type;
3513 }
3514 
3515 /*----------------------------------------------------------------------------*/
3516 /*!
3517  * \brief Get current type of hybrid all-to-all distributor parameters.
3518  *
3519  * \param[out]  rne_type  type of metadata exchange algorithm, or NULL
3520  */
3521 /*----------------------------------------------------------------------------*/
3522 
3523 void
cs_all_to_all_get_hybrid_parameters(cs_rank_neighbors_exchange_t * rne_type)3524 cs_all_to_all_get_hybrid_parameters(cs_rank_neighbors_exchange_t  *rne_type)
3525 {
3526   if (rne_type != NULL)
3527     *rne_type = _hybrid_meta_type;
3528 }
3529 
3530 /*----------------------------------------------------------------------------*/
3531 /*!
3532  * \brief Set current type of all-to-all distributor algorithm choice.
3533  *
3534  * \param[in]  rne_type  type of metadata exchange algorithm
3535  */
3536 /*----------------------------------------------------------------------------*/
3537 
3538 void
cs_all_to_all_set_hybrid_parameters(cs_rank_neighbors_exchange_t rne_type)3539 cs_all_to_all_set_hybrid_parameters(cs_rank_neighbors_exchange_t  rne_type)
3540 {
3541   _hybrid_meta_type = rne_type;
3542 }
3543 
3544 /*----------------------------------------------------------------------------*/
3545 /*!
3546  * \brief Set current type of all-to-all distributor algorithm choice.
3547  *
3548  * \param  t  type of all-to-all distributor algorithm choice to select
3549  */
3550 /*----------------------------------------------------------------------------*/
3551 
3552 void
cs_all_to_all_set_type(cs_all_to_all_type_t t)3553 cs_all_to_all_set_type(cs_all_to_all_type_t  t)
3554 {
3555   _all_to_all_type = t;
3556 }
3557 
3558 /*----------------------------------------------------------------------------*/
3559 /*!
3560  * \brief Log performance information relative to instrumented all-to-all
3561  * distribution.
3562  */
3563 /*----------------------------------------------------------------------------*/
3564 
3565 void
cs_all_to_all_log_finalize(void)3566 cs_all_to_all_log_finalize(void)
3567 {
3568   cs_crystal_router_log_finalize();
3569 
3570 #if defined(HAVE_MPI)
3571 
3572   if (_all_to_all_calls[0] <= 0)
3573     return;
3574 
3575   char method_name[96];
3576   switch(_all_to_all_type) {
3577   case CS_ALL_TO_ALL_MPI_DEFAULT:
3578     snprintf(method_name, 96, N_("MPI_Alltoall and MPI_Alltoallv"));
3579     break;
3580   case CS_ALL_TO_ALL_HYBRID:
3581     snprintf(method_name, 96, N_("Hybrid, %s (metadata), %s (data)"),
3582              _(cs_rank_neighbors_exchange_name[_hybrid_meta_type]),
3583              "MPI_Alltoallv");
3584     break;
3585   case CS_ALL_TO_ALL_CRYSTAL_ROUTER:
3586     snprintf(method_name, 96, N_("Crystal Router algorithm"));
3587     break;
3588   }
3589   method_name[95] = '\0';
3590 
3591   cs_log_printf(CS_LOG_PERFORMANCE,
3592                 _("\nAll-to-many operations: (%s):\n\n"),
3593                 method_name);
3594 
3595   /* Print times */
3596 
3597   double wtimes[3] = {0, 0, 0};
3598   double wtimes_mean[3], wtimes_max[3], wtimes_min[3];
3599   for (int i = 0; i < 3; i++) {
3600     if (_all_to_all_calls[i] > 0)
3601       wtimes[i] = (_all_to_all_timers[i]).nsec*1e-9;
3602     wtimes_mean[i] = wtimes[i];
3603     wtimes_max[i] = wtimes[i];
3604     wtimes_min[i] = wtimes[i];
3605   }
3606   if (cs_glob_n_ranks > 1) {
3607     MPI_Allreduce(wtimes, wtimes_mean, 3, MPI_DOUBLE, MPI_SUM, cs_glob_mpi_comm);
3608     MPI_Allreduce(wtimes, wtimes_max, 3, MPI_DOUBLE, MPI_MAX, cs_glob_mpi_comm);
3609     MPI_Allreduce(wtimes, wtimes_min, 3, MPI_DOUBLE, MPI_MIN, cs_glob_mpi_comm);
3610   }
3611   for (int i = 0; i < 3; i++)
3612     wtimes_mean[i] /= cs_glob_n_ranks;
3613 
3614   cs_log_printf
3615     (CS_LOG_PERFORMANCE,
3616      _("                             mean        minimum      maximum"
3617        "     calls\n"
3618        "  Total:             %12.5f s %12.5f %12.5f s   %lu\n"
3619        "  Metadata exchange: %12.5f s %12.5f %12.5f s   %lu\n"
3620        "  Data exchange:     %12.5f s %12.5f %12.5f s   %lu\n\n"),
3621      wtimes_mean[0], wtimes_min[0], wtimes_max[0],
3622      (unsigned long)(_all_to_all_calls[0]),
3623      wtimes_mean[1], wtimes_min[1], wtimes_max[1],
3624      (unsigned long)(_all_to_all_calls[1]),
3625      wtimes_mean[2], wtimes_min[2], wtimes_max[2],
3626      (unsigned long)(_all_to_all_calls[2]));
3627 
3628   cs_log_separator(CS_LOG_PERFORMANCE);
3629 
3630   if (cs_glob_n_ranks > 1 && _n_trace > 0) {
3631     cs_gnum_t *_all_to_all_sum;
3632     BFT_MALLOC(_all_to_all_sum, _n_trace*9, uint64_t);
3633     cs_gnum_t *_all_to_all_max;
3634     BFT_MALLOC(_all_to_all_max, _n_trace*9, uint64_t);
3635 
3636     MPI_Allreduce(_all_to_all_trace, _all_to_all_sum, _n_trace*9, MPI_UINT64_T,
3637                   MPI_SUM, cs_glob_mpi_comm);
3638     MPI_Allreduce(_all_to_all_trace, _all_to_all_max, _n_trace*9, MPI_UINT64_T,
3639                   MPI_MAX, cs_glob_mpi_comm);
3640 
3641     for (int j = 0; j < _n_trace*9; j++)
3642       _all_to_all_sum[j] /= cs_glob_n_ranks;
3643 
3644     if (cs_glob_rank_id < 1) {
3645       FILE *f = fopen("all_to_all_trace.csv", "w");
3646       fprintf(f, "call, time, dt_mean, dt_max, stage, mem_cur_mean, mem_cur_max, "
3647               "mem_max_mean, mem_max, send_size_mean, send_size_max, "
3648               "recv_size_mean, recv_size_max, "
3649               "send_ranks_mean, send_ranks_max, "
3650               "recv_ranks_mean, recv_ranks_max\n");
3651       for (int j = 0; j < _n_trace; j++)
3652         fprintf(f,
3653                 "%d, %g, %g, %g, %d,"
3654                 "%llu, %llu, %llu, %llu, %llu, %llu,"
3655                 "%llu, %llu, %llu, %llu, %llu, %llu\n",
3656                 j,
3657                 (_all_to_all_trace[j*9] - _all_to_all_trace[0])/1.e5,
3658                 _all_to_all_sum[j*9+1]/1.e5,
3659                 _all_to_all_max[j*9+1]/1.e5,
3660                 (int)_all_to_all_trace[j*9+2],
3661                 (unsigned long long)_all_to_all_sum[j*9+3],
3662                 (unsigned long long)_all_to_all_max[j*9+3],
3663                 (unsigned long long)_all_to_all_sum[j*9+4],
3664                 (unsigned long long)_all_to_all_max[j*9+4],
3665                 (unsigned long long)_all_to_all_sum[j*9+5]/1000,
3666                 (unsigned long long)_all_to_all_max[j*9+5]/1000,
3667                 (unsigned long long)_all_to_all_sum[j*9+6]/1000,
3668                 (unsigned long long)_all_to_all_max[j*9+6]/1000,
3669                 (unsigned long long)_all_to_all_sum[j*9+7],
3670                 (unsigned long long)_all_to_all_max[j*9+7],
3671                 (unsigned long long)_all_to_all_sum[j*9+8],
3672                 (unsigned long long)_all_to_all_max[j*9+8]);
3673       fclose(f);
3674       fclose(_all_to_all_trace_bt_log);
3675       _all_to_all_trace_bt_log = NULL;
3676     }
3677 
3678     BFT_FREE(_all_to_all_sum);
3679     BFT_FREE(_all_to_all_max);
3680     BFT_FREE(_all_to_all_trace);
3681   }
3682 
3683 
3684 #endif /* defined(HAVE_MPI) */
3685 }
3686 
3687 
3688 /*----------------------------------------------------------------------------*/
3689 
3690 END_C_DECLS
3691