1 /*============================================================================
2  * Crystal Router parallel exchange algorithm based operations.
3  *============================================================================*/
4 
5 /*
6   This file is part of Code_Saturne, a general-purpose CFD tool.
7 
8   Copyright (C) 1998-2021 EDF S.A.
9 
10   This program is free software; you can redistribute it and/or modify it under
11   the terms of the GNU General Public License as published by the Free Software
12   Foundation; either version 2 of the License, or (at your option) any later
13   version.
14 
15   This program is distributed in the hope that it will be useful, but WITHOUT
16   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
18   details.
19 
20   You should have received a copy of the GNU General Public License along with
21   this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
22   Street, Fifth Floor, Boston, MA 02110-1301, USA.
23 */
24 
25 /*----------------------------------------------------------------------------*/
26 
27 #include "cs_defs.h"
28 
29 /*----------------------------------------------------------------------------
30  * Standard C library headers
31  *----------------------------------------------------------------------------*/
32 
33 #include <assert.h>
34 #include <errno.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 
39 /*----------------------------------------------------------------------------
40  *  Local headers
41  *----------------------------------------------------------------------------*/
42 
43 #include "bft_mem.h"
44 #include "bft_error.h"
45 #include "bft_printf.h"
46 
47 #include "cs_assert.h"
48 #include "cs_block_dist.h"
49 #include "cs_log.h"
50 #include "cs_timer.h"
51 
52 /*----------------------------------------------------------------------------
53  *  Header for the current file
54  *----------------------------------------------------------------------------*/
55 
56 #include "cs_crystal_router.h"
57 
58 /*----------------------------------------------------------------------------*/
59 
60 BEGIN_C_DECLS
61 
62 /*=============================================================================
63  * Additional Doxygen documentation
64  *============================================================================*/
65 
66 /*!
67   \file cs_crystal_router.c
68         Crystal Router parallel exchange algorithm based operations.
69 
70   The Crystal Router is described in \cite Fox:1988 .
71 
72   \typedef cs_crystal_router_t
73         Opaque Crystal Router distribution structure
74 
75   \par Using flags
76   \parblock
77 
78   Flags are defined as a sum (bitwise or) of constants, which may include
79   \ref CS_CRYSTAL_ROUTER_USE_DEST_ID, \ref CS_CRYSTAL_ROUTER_ADD_SRC_ID,
80   and \ref CS_CRYSTAL_ROUTER_ADD_SRC_RANK.
81 
82   \endparblock
83 
84 */
85 
86 /*! \cond DOXYGEN_SHOULD_SKIP_THIS */
87 
88 /*=============================================================================
89  * Macro definitions
90  *============================================================================*/
91 
92 #define _CR_DEBUG_DUMP 0
93 
94 /* Data elements in crystal router */
95 
96 typedef enum {
97 
98   CS_CRYSTAL_ROUTER_DEST_ID,
99   CS_CRYSTAL_ROUTER_SRC_ID,
100   CS_CRYSTAL_ROUTER_DATA
101 
102 } cs_crystal_router_data_t;
103 
104 /*=============================================================================
105  * Local type definitions
106  *============================================================================*/
107 
108 #if defined(HAVE_MPI)
109 
110 /* Crystal router management structure */
111 
112 struct  _cs_crystal_router_t { /* Crystal router information */
113 
114   cs_datatype_t   datatype;           /* associated datatype */
115   int             flags;              /* ordering and metadata flags */
116 
117   size_t          stride;             /* stride if strided, 0 otherwise */
118 
119   size_t          dest_id_shift;      /* starting byte for destination id */
120   size_t          src_id_shift;       /* starting byte for source id */
121   size_t          n_vals_shift;       /* starting byte for element count
122                                          (for indexed cases) */
123   size_t          elt_shift;          /* starting byte for element data */
124 
125   size_t          elt_size;           /* element size */
126   size_t          comp_size;          /* composite metadata + element size if
127                                          strided, metadata size otherwise */
128 
129   size_t          dest_id_end;        /* maximum destination id + 1 */
130   size_t          n_elts[2];          /* number of elements in partition */
131   size_t          n_vals[2];          /* number of data values in partition */
132   size_t          buffer_size[2];     /* buffer size values */
133   size_t          buffer_size_max[2]; /* max buffer size values reached */
134   size_t          alloc_tot_max;      /* max total allocation reached */
135   unsigned char  *buffer[2];
136 
137   MPI_Comm        comm;               /* associated MPI communicator */
138   MPI_Datatype    mpi_type;           /* Associated MPI datatype */
139   size_t          mpi_type_size;      /* Associated MPI datatype size */
140   int             rank_id;            /* local rank id in comm */
141   int             n_ranks;            /* comm size */
142 
143 };
144 
145 #endif /* defined(HAVE_MPI) */
146 
147 /*============================================================================
148  * Static global variables
149  *============================================================================*/
150 
151 /* Call counter and timers: 0: total, 1: communication */
152 
153 static size_t              _cr_calls = 0;
154 static cs_timer_counter_t  _cr_timers[2];
155 
156 const float  _realloc_f_threshold = 0.7;
157 
158 /*============================================================================
159  * Local function defintions
160  *============================================================================*/
161 
162 #if defined(HAVE_MPI)
163 
164 /*----------------------------------------------------------------------------
165  * Return Crystal Router datatype count matching a given number of values.
166  *
167  * parameters:
168  *   cr      <-- associated crystal router structure
169  *   n_elts  <-- number of elements
170  *   n_vals  <-- number of associated values (used only for indexed cases)
171  *
172  * returns:
173  *   count of associated datatype elements for matching buffer size
174  *---------------------------------------------------------------------------*/
175 
176 static  size_t
_comm_type_count(cs_crystal_router_t * cr,cs_lnum_t n_elts,cs_lnum_t n_vals)177 _comm_type_count(cs_crystal_router_t  *cr,
178                  cs_lnum_t             n_elts,
179                  cs_lnum_t             n_vals)
180 {
181   size_t retval;
182 
183   if (cr->n_vals_shift == 0) { /* strided */
184     retval = n_elts;
185   }
186   else { /* indexed */
187     size_t n = n_elts*cr->comp_size + n_vals*cr->elt_size;
188     retval = n / cr->mpi_type_size;
189   }
190 
191   return retval;
192 }
193 
194 /*----------------------------------------------------------------------------
195  * Return number of values matching a given number of elements and MPI
196  * datatype values.
197  *
198  * parameters:
199  *   cr         <-- associated crystal router structure
200  *   n_elts     <-- number of elements
201  *   n_mpi_vals <-- number of MPI element
202  *
203  * returns:
204  *   count of associated datatype elements for matching buffer size
205  *---------------------------------------------------------------------------*/
206 
207 static  size_t
_comm_type_count_to_n_vals(cs_crystal_router_t * cr,cs_lnum_t n_elts,int n_mpi_elts)208 _comm_type_count_to_n_vals(cs_crystal_router_t  *cr,
209                            cs_lnum_t             n_elts,
210                            int                   n_mpi_elts)
211 {
212   size_t retval;
213 
214   if (cr->n_vals_shift == 0) { /* strided */
215     retval = n_elts * cr->stride;
216   }
217   else { /* indexed */
218     size_t n = n_mpi_elts*cr->mpi_type_size - n_elts*cr->comp_size;
219     retval = n / cr->elt_size;
220   }
221 
222   return retval;
223 }
224 
225 /*----------------------------------------------------------------------------
226  * Return Crystal Router datatype count matching a given number of values.
227  *
228  * parameters:
229  *   cr      <-- associated crystal router structure
230  *   n_elts  <-- number of elements
231  *   n_vals  <-- number of associated values (used only for indexed cases)
232  *
233  * returns:
234  *   count of associated datatype elements for matching buffer size
235  *---------------------------------------------------------------------------*/
236 
237 static inline size_t
_data_size(cs_crystal_router_t * cr,cs_lnum_t n_elts,cs_lnum_t n_vals)238 _data_size(cs_crystal_router_t  *cr,
239            cs_lnum_t             n_elts,
240            cs_lnum_t             n_vals)
241 {
242   size_t retval;
243 
244   if (cr->n_vals_shift == 0) /* strided */
245     retval = n_elts * cr->comp_size;
246   else /* indexed */
247     retval = n_elts*cr->comp_size + n_vals*cr->elt_size;
248 
249   return retval;
250 }
251 
252 /*----------------------------------------------------------------------------
253  * Allocation and base settings for a crystal router.
254  *
255  * parameters:
256  *   n_elts           <-- number of elements
257  *   stride           <-- number of values per entity (interlaced)
258  *   datatype         <-- type of data considered
259  *   flags            <-- ordering and metadata flags
260  *   comm             <-- associated MPI communicator
261  *---------------------------------------------------------------------------*/
262 
263 static cs_crystal_router_t *
_crystal_create(size_t n_elts,int flags,MPI_Comm comm)264 _crystal_create(size_t         n_elts,
265                 int            flags,
266                 MPI_Comm       comm)
267 {
268   int rank_id, n_ranks;
269   cs_crystal_router_t *cr = NULL;
270 
271   const size_t align_size = sizeof(cs_lnum_t);
272 
273   /* Communicator info */
274 
275   MPI_Comm_rank(comm, &rank_id);
276   MPI_Comm_size(comm, &n_ranks);
277 
278   /* Allocate structure */
279 
280   BFT_MALLOC(cr, 1, cs_crystal_router_t);
281 
282   cr->flags = flags;
283 
284   cr->stride = 0;
285   cr->elt_size = 0;
286   cr->comp_size = 0;
287   cr->dest_id_end = 0;
288   cr->n_elts[0] = n_elts;
289   cr->n_elts[1] = 0;
290 
291   cr->comm = comm;
292   cr->rank_id = rank_id;
293   cr->n_ranks = n_ranks;
294 
295   /* Compute data size and alignment */
296 
297   cr->dest_id_shift = sizeof(int);
298 
299   if (cr->flags & CS_CRYSTAL_ROUTER_ADD_SRC_RANK)
300     cr->dest_id_shift += sizeof(int);
301 
302   if (cr->flags & CS_CRYSTAL_ROUTER_USE_DEST_ID)
303     cr->src_id_shift = cr->dest_id_shift + sizeof(cs_lnum_t);
304   else
305     cr->src_id_shift = cr->dest_id_shift;
306 
307   if (cr->flags & CS_CRYSTAL_ROUTER_ADD_SRC_ID)
308     cr->elt_shift = cr->src_id_shift + cs_datatype_size[CS_LNUM_TYPE];
309   else
310     cr->elt_shift = cr->src_id_shift;
311 
312   if (cr->elt_shift % align_size)
313     cr->elt_shift += align_size - (cr->elt_shift % align_size);
314 
315   cr->n_vals_shift = 0;
316 
317   for (int i = 0; i < 2; i++) {
318     cr->n_vals[i] = 0;
319     cr->buffer_size[i] = 0;
320     cr->buffer_size_max[i] = 0;
321     cr->buffer[i] = NULL;
322   }
323 
324   cr->alloc_tot_max = 0;
325 
326   return cr;
327 }
328 
329 /*----------------------------------------------------------------------------
330  * First stage of creation for a crystal router for strided data.
331  *
332  * parameters:
333  *   n_elts           <-- number of elements
334  *   stride           <-- number of values per entity (interlaced)
335  *   datatype         <-- type of data considered
336  *   flags            <-- ordering and metadata flags
337  *   comm             <-- associated MPI communicator
338  *---------------------------------------------------------------------------*/
339 
340 static cs_crystal_router_t *
_crystal_create_meta_s(size_t n_elts,int stride,cs_datatype_t datatype,int flags,MPI_Comm comm)341 _crystal_create_meta_s(size_t         n_elts,
342                        int            stride,
343                        cs_datatype_t  datatype,
344                        int            flags,
345                        MPI_Comm       comm)
346 {
347   /* Allocate structure */
348 
349   cs_crystal_router_t *cr = _crystal_create(n_elts, flags, comm);
350 
351   size_t elt_size = cs_datatype_size[datatype]*stride;
352   size_t align_size = sizeof(cs_lnum_t);
353 
354   cr->datatype = (stride > 0) ? datatype : CS_DATATYPE_NULL;
355 
356   cr->stride = (stride > 0) ? stride : 1;
357   cr->elt_size = elt_size;
358 
359   /* Compute data size and alignment */
360 
361   cr->comp_size = cr->elt_shift + elt_size;
362 
363   if (elt_size % align_size)
364     cr->comp_size += align_size - (elt_size % align_size);;
365 
366   /* Create associated MPI datatype */
367 
368   cr->mpi_type_size = cr->comp_size;
369 
370   MPI_Type_contiguous(cr->mpi_type_size, MPI_BYTE, &(cr->mpi_type));
371   MPI_Type_commit(&(cr->mpi_type));
372 
373   /* Allocate buffers */
374 
375   cr->buffer_size[0] = n_elts*cr->comp_size;
376   cr->buffer_size[1] = 0;
377   BFT_MALLOC(cr->buffer[0], cr->buffer_size[0], unsigned char);
378   memset(cr->buffer[0], 0, cr->buffer_size[0]);
379   cr->buffer[1] = NULL;
380 
381   cr->buffer_size_max[0] = cr->buffer_size[0];
382   cr->buffer_size_max[1] = 0;
383 
384   cr->alloc_tot_max = cr->buffer_size_max[0];
385 
386   return cr;
387 }
388 
389 /*----------------------------------------------------------------------------
390  * First stage of creation for a crystal router for indexed data.
391  *
392  * parameters:
393  *   n_elts     <-- number of elements
394  *   datatype   <-- type of data considered
395  *   elt_idx    <-- element values start and past-the-last index
396  *   src_id     <-- optional parent element id (elt_idx indirection), or NULL
397  *   flags      <-- ordering and metadata flags
398  *   comm       <-- associated MPI communicator
399  *---------------------------------------------------------------------------*/
400 
401 static cs_crystal_router_t *
_crystal_create_meta_i(size_t n_elts,cs_datatype_t datatype,const cs_lnum_t * elt_idx,const cs_lnum_t * src_id,int flags,MPI_Comm comm)402 _crystal_create_meta_i(size_t            n_elts,
403                        cs_datatype_t     datatype,
404                        const cs_lnum_t  *elt_idx,
405                        const cs_lnum_t  *src_id,
406                        int               flags,
407                        MPI_Comm          comm)
408 {
409   assert(datatype != CS_DATATYPE_NULL);
410 
411   /* Allocate structure */
412 
413   cs_crystal_router_t *cr = _crystal_create(n_elts, flags, comm);
414 
415   size_t elt_size = cs_datatype_size[datatype];
416   size_t align_size = sizeof(cs_lnum_t);
417 
418   cr->datatype = datatype;
419   cr->elt_size = elt_size;
420 
421   /* Compute data size and alignment */
422 
423   cr->n_vals_shift = cr->elt_shift;
424   cr->elt_shift = cr->n_vals_shift + cs_datatype_size[CS_LNUM_TYPE];
425 
426   if (cr->elt_shift < cr->elt_size)
427     cr->elt_shift = cr->elt_size;
428 
429   if (cr->elt_shift % align_size)
430     cr->elt_shift += align_size - (cr->elt_shift % align_size);
431 
432   cr->comp_size = cr->elt_shift;
433 
434   /* Create associated MPI datatype */
435 
436   cr->mpi_type_size = CS_MIN(cr->comp_size, cr->elt_size);
437   while (cr->comp_size % cr->mpi_type_size || cr->elt_size % cr->mpi_type_size)
438     cr->mpi_type_size--;
439 
440   MPI_Type_contiguous(cr->mpi_type_size, MPI_BYTE, &(cr->mpi_type));
441   MPI_Type_commit(&(cr->mpi_type));
442 
443   /* Allocate buffers */
444 
445   if (src_id == NULL)
446     cr->n_vals[0] = elt_idx[n_elts];
447   else {
448     cr->n_vals[0] = n_elts*cr->comp_size;
449     for (size_t i = 0; i < n_elts; i++) {
450       size_t j = src_id[i];
451       cr->n_vals[0] += elt_idx[j+1] - elt_idx[j];
452     }
453   }
454   cr->n_vals[1] = 0;
455   cr->buffer_size[0] = n_elts*cr->comp_size + cr->n_vals[0]*elt_size;
456   cr->buffer_size[1] = 0;
457   BFT_MALLOC(cr->buffer[0], cr->buffer_size[0], unsigned char);
458   memset(cr->buffer[0], 0, cr->buffer_size[0]);
459   cr->buffer[1] = NULL;
460 
461   cr->buffer_size_max[0] = cr->buffer_size[0];
462   cr->buffer_size_max[1] = 0;
463 
464   cr->alloc_tot_max = cr->buffer_size_max[0];
465 
466   return cr;
467 }
468 
469 #if _CR_DEBUG_DUMP /* _dump functions for debugging */
470 
471 /*----------------------------------------------------------------------------
472  * Dump data element associated with a Crystal Router.
473  *
474  * parameters:
475  *   <-- cr pointer to associated Crystal Router
476  *   <-- e  pointer to element
477  *----------------------------------------------------------------------------*/
478 
479 static void
_dump_e(const cs_crystal_router_t * cr,const void * e)480 _dump_e(const cs_crystal_router_t  *cr,
481         const void                 *e)
482 {
483   switch (cr->datatype) {
484 
485   case CS_FLOAT:
486     {
487       const float *v = (const float *)e;
488       bft_printf(" %g", (double)(*v));
489     }
490     break;
491   case CS_DOUBLE:
492     {
493       const double *v = (const double *)e;
494       bft_printf(" %g", (*v));
495     }
496     break;
497   case CS_LNUM_TYPE:
498     {
499       const cs_lnum_t *v = (const cs_lnum_t *)e;
500       bft_printf(" %d", (int)(*v));
501     }
502     break;
503   case CS_GNUM_TYPE:
504     {
505       const cs_gnum_t *v = (const cs_gnum_t *)e;
506       bft_printf(" %llu", (unsigned long long)(*v));
507     }
508     break;
509   default:
510     {
511       const unsigned char *pe = (const unsigned char *)e;
512       size_t elt_size = cs_datatype_size[cr->datatype];
513       if (cr->elt_size > 0)
514         bft_printf(" %x", (int)pe[0]);
515       for (size_t k = 1; k < elt_size; k++)
516         bft_printf(":%x", (int)pe[k]);
517     }
518   }
519 }
520 
521 /*----------------------------------------------------------------------------
522  * Dump data elements associated with a strided Crystal Router.
523  *
524  * parameters:
525  *   <-- cr       pointer to associated Crystal Router
526  *   <-- comment  associated comment
527  *----------------------------------------------------------------------------*/
528 
529 static void
_dump_s(const cs_crystal_router_t * cr,const char * comment)530 _dump_s(const cs_crystal_router_t  *cr,
531         const char                 *comment)
532 {
533   if (comment != NULL)
534     bft_printf("\n%s\n", comment);
535 
536   size_t elt_size = cr->elt_size;
537   if (cr->stride > 0)
538     elt_size = cr->elt_size/cr->stride;
539 
540   for (int ip = 0; ip < 2; ip++) {
541 
542     const size_t n_elts = cr->n_elts[ip];
543 
544     if (n_elts > 0)
545       bft_printf("crystal router partition %d: %d elements "
546                  "(stride %d, size %d)\n",
547                  ip, (int)n_elts, (int)cr->stride, (int)elt_size);
548 
549     /* Extract data */
550 
551     for (size_t i = 0; i < n_elts; i++) {
552 
553       unsigned const char *p_s = cr->buffer[ip] + i*cr->comp_size;
554 
555       const int *cr_dest_rank = (const int *)(p_s);
556 
557       bft_printf("  %d\n"
558                  "    dest_rank:  %d\n", (int)i, *cr_dest_rank);
559 
560       if (cr->flags & CS_CRYSTAL_ROUTER_ADD_SRC_RANK) {
561         const int *cr_src_rank = (const int *)(p_s + sizeof(int));
562         bft_printf("    src_rank:   %d\n", (int)(*cr_src_rank));
563       }
564 
565       if (cr->flags & CS_CRYSTAL_ROUTER_USE_DEST_ID) {
566         const cs_lnum_t *cr_dest_id
567           = (const cs_lnum_t *)(p_s + cr->dest_id_shift);
568         bft_printf("    dest_id:    %d\n", (int)(*cr_dest_id));
569       }
570 
571       if (cr->flags & CS_CRYSTAL_ROUTER_ADD_SRC_ID) {
572         const cs_lnum_t *cr_src_id
573           = (const cs_lnum_t *)(p_s + cr->src_id_shift);
574         bft_printf("    src_id:     %d\n", (int)(*cr_src_id));
575       }
576 
577       /* Data */
578 
579       if (cr->elt_size > 0) {
580         unsigned const char *pe = p_s + cr->elt_shift;
581         bft_printf("    data      :");
582         for (size_t j = 0; j < cr->stride; j++)
583           _dump_e(cr, pe + elt_size*j);
584       }
585       bft_printf("\n");
586       bft_printf_flush();
587 
588     }
589   }
590 }
591 
592 /*----------------------------------------------------------------------------
593  * Dump data elements associated with an indexed Crystal Router.
594  *
595  * parameters:
596  *   <-- cr       pointer to associated Crystal Router
597  *   <-- comment  associated comment
598  *----------------------------------------------------------------------------*/
599 
600 static void
_dump_i(const cs_crystal_router_t * cr,const char * comment)601 _dump_i(const cs_crystal_router_t  *cr,
602         const char                 *comment)
603 {
604   if (comment != NULL)
605     bft_printf("\n%s\n", comment);
606 
607   for (int ip = 0; ip < 2; ip++) {
608 
609     const size_t n_elts = cr->n_elts[ip];
610 
611     if (n_elts > 0)
612       bft_printf("crystal router partition %d: %d elements\n",
613                  ip, (int)n_elts);
614 
615     unsigned const char *p_s = cr->buffer[ip];
616 
617     /* Extract data */
618 
619     cs_lnum_t  s_idx = 0;
620 
621     for (size_t i = 0; i < n_elts; i++) {
622 
623       const int *cr_dest_rank = (const int *)(p_s);
624 
625       bft_printf("  %d\n"
626                  "    dest_rank:  %d\n", (int)i, *cr_dest_rank);
627 
628       if (cr->flags & CS_CRYSTAL_ROUTER_ADD_SRC_RANK) {
629         const int *cr_src_rank = (const int *)(p_s + sizeof(int));
630         bft_printf("    src_rank:   %d\n", (int)(*cr_src_rank));
631       }
632 
633       if (cr->flags & CS_CRYSTAL_ROUTER_USE_DEST_ID) {
634         const cs_lnum_t *cr_dest_id
635           = (const cs_lnum_t *)(p_s + cr->dest_id_shift);
636         bft_printf("    dest_id:    %d\n", (int)(*cr_dest_id));
637       }
638 
639       if (cr->flags & CS_CRYSTAL_ROUTER_ADD_SRC_ID) {
640         const cs_lnum_t *cr_src_id
641           = (const cs_lnum_t *)(p_s + cr->src_id_shift);
642         bft_printf("    src_id:     %d\n", (int)(*cr_src_id));
643       }
644 
645       /* Number of elements and optional data index */
646 
647       const cs_lnum_t *pn = (const cs_lnum_t *)(p_s + cr->n_vals_shift);
648       const cs_lnum_t n_sub = *pn;
649       const size_t _sub_size = cr->elt_size*n_sub;
650 
651       bft_printf("    data index: %d %d\n"
652                  "    data      : ", (int)s_idx, (int)(s_idx+n_sub));
653 
654       /* Data */
655 
656       unsigned const char *pe = p_s + cr->elt_shift;
657       for (cs_lnum_t j = 0; j < n_sub; j++)
658         _dump_e(cr, pe + cr->elt_size*j);
659       bft_printf("\n");
660       bft_printf_flush();
661 
662       s_idx += n_sub;
663       p_s += cr->comp_size + _sub_size;
664 
665     }
666   }
667 }
668 
669 /*----------------------------------------------------------------------------
670  * Dump data elements associated with a Crystal Router.
671  *
672  * parameters:
673  *   <-- cr       pointer to associated Crystal Router
674  *   <-- comment  associated comment
675  *----------------------------------------------------------------------------*/
676 
677 static void
_dump(const cs_crystal_router_t * cr,const char * comment)678 _dump(const cs_crystal_router_t  *cr,
679       const char                 *comment)
680 {
681   if (cr->n_vals_shift == 0)
682     _dump_s(cr, comment);
683   else
684     _dump_i(cr, comment);
685 }
686 
687 #endif /* _dump functions for debugging */
688 
689 /*----------------------------------------------------------------------------
690  * Partition strided data for exchange with a crystal router.
691  *
692  * parameters:
693  *   cr      <-> associated crystal router structure
694  *   send_id <-> id of "send" buffer" (1 if low = buf0, 0 if low = buf1)
695  *   cutoff  <-- cutoff rank
696  *---------------------------------------------------------------------------*/
697 
698 static void
_crystal_partition_strided(cs_crystal_router_t * cr,int send_id,int cutoff)699 _crystal_partition_strided(cs_crystal_router_t  *cr,
700                            int                   send_id,
701                            int                   cutoff)
702 {
703   cs_lnum_t i;
704 
705   cs_lnum_t n0 = 0, n1 = 0;
706 
707   const cs_lnum_t n = cr->n_elts[0];
708   const int id0 = (send_id + 1) % 2;
709   const int id1 = send_id;
710   const size_t comp_size = cr->comp_size;
711 
712   assert(send_id == 0 || send_id == 1);
713 
714   if (   cr->buffer_size[1] < cr->buffer_size[0]
715       || cr->buffer_size[1] > cr->buffer_size[0]*_realloc_f_threshold) {
716     cr->buffer_size[1] = cr->buffer_size[0];
717     BFT_REALLOC(cr->buffer[1], cr->buffer_size[1], unsigned char);
718     if (cr->buffer_size[1] > cr->buffer_size_max[1])
719       cr->buffer_size_max[1] = cr->buffer_size[1];
720     size_t alloc_tot = cr->buffer_size[0] + cr->buffer_size[1];
721     if (alloc_tot > cr->alloc_tot_max)
722       cr->alloc_tot_max = alloc_tot;
723   }
724 
725   if (id0 == 0) {
726     for (i = 0; i < n; i++) {
727       unsigned char *src = cr->buffer[0] + i*comp_size;
728       int *r = (int *)src;
729       if (r[0] >= cutoff)
730         break;
731     }
732     n0 = i;
733   }
734   else {
735     for (i = 0; i < n; i++) {
736       unsigned char *src = cr->buffer[0] + i*comp_size;
737       int *r = (int *)src;
738       if (r[0] < cutoff)
739         break;
740     }
741     n1 = i;
742   }
743 
744   while (i < n) {
745     unsigned char *src = cr->buffer[0] + i*comp_size;
746     int *r = (int *)src;
747     if (r[0] < cutoff) {
748       unsigned char *dest = cr->buffer[id0] + n0*comp_size;
749       memcpy(dest, src, comp_size);
750       n0++;
751     }
752     else {
753       unsigned char *dest = cr->buffer[id1] + n1*comp_size;
754       memcpy(dest, src, comp_size);
755       n1++;
756     }
757     i++;
758   }
759 
760   assert(n0 + n1 == n);
761 
762   cr->n_elts[id0] = n0;
763   cr->n_elts[id1] = n1;
764 
765   cr->n_vals[id0] = n0*cr->stride;
766   cr->n_vals[id1] = n1*cr->stride;
767 }
768 
769 /*----------------------------------------------------------------------------
770  * Partition indexed data for exchange with a crystal router.
771  *
772  * parameters:
773  *   cr      <-> associated crystal router structure
774  *   send_id <-> id of "send" buffer" (1 if low = buf0, 0 if low = buf1)
775  *   cutoff  <-- cutoff rank
776  *---------------------------------------------------------------------------*/
777 
778 static void
_crystal_partition_indexed(cs_crystal_router_t * cr,int send_id,int cutoff)779 _crystal_partition_indexed(cs_crystal_router_t  *cr,
780                            int                   send_id,
781                            int                   cutoff)
782 {
783   cs_lnum_t i;
784 
785   cs_lnum_t n0 = 0, n1 = 0;
786 
787   const cs_lnum_t n = cr->n_elts[0];
788   const int id0 = (send_id + 1) % 2;
789   const int id1 = send_id;
790   const size_t comp_size = cr->comp_size;
791   const size_t elt_size = cr->elt_size;
792   const size_t n_vals_shift = cr->n_vals_shift;
793 
794   assert(send_id == 0 || send_id == 1);
795 
796   unsigned char *src = cr->buffer[0];
797 
798   size_t r0_shift = 0;
799   size_t r1_shift = 0;
800 
801   if (id0 == 0) {
802     for (i = 0; i < n; i++) {
803       int *r = (int *)src;
804       cs_lnum_t *n_sub = (cs_lnum_t *)(src + n_vals_shift);
805       size_t sub_size = comp_size + n_sub[0]*elt_size;
806       if (r[0] >= cutoff)
807         break;
808       r0_shift += sub_size;
809       src += sub_size;
810     }
811     n0 = i;
812   }
813   else {
814     for (i = 0; i < n; i++) {
815       int *r = (int *)src;
816       cs_lnum_t *n_sub = (cs_lnum_t *)(src + n_vals_shift);
817       size_t sub_size = comp_size + n_sub[0]*elt_size;
818       if (r[0] < cutoff)
819         break;
820       r1_shift += sub_size;
821       src += sub_size;
822     }
823     n1 = i;
824   }
825 
826   size_t r0_end = r0_shift;
827   size_t r1_end = r1_shift;
828   cs_lnum_t i_start = i;
829   unsigned char *src_start = src;
830 
831   /* Count to minimize memory allocation
832      (extra reads, but hopefully a good tradeoff) */
833 
834   while (i < n) {
835     int *r = (int *)src;
836     cs_lnum_t *n_sub = (cs_lnum_t *)(src + n_vals_shift);
837     size_t sub_size = comp_size + n_sub[0]*elt_size;
838     if (r[0] < cutoff) {
839       r0_end += sub_size;
840     }
841     else {
842       r1_end += sub_size;
843     }
844     i++;
845     src += sub_size;
846   }
847 
848   cr->buffer_size[1] = (id0 == 0) ? r1_end : r0_end;
849   BFT_REALLOC(cr->buffer[1], cr->buffer_size[1], unsigned char);
850   if (cr->buffer_size[1] > cr->buffer_size_max[1])
851     cr->buffer_size_max[1] = cr->buffer_size[1];
852   size_t alloc_tot = cr->buffer_size[0] + cr->buffer_size[1];
853   if (alloc_tot > cr->alloc_tot_max)
854     cr->alloc_tot_max = alloc_tot;
855 
856   i = i_start;
857   src = src_start;
858 
859   while (i < n) {
860     int *r = (int *)src;
861     cs_lnum_t *n_sub = (cs_lnum_t *)(src + n_vals_shift);
862     size_t sub_size = comp_size + n_sub[0]*elt_size;
863     if (r[0] < cutoff) {
864       unsigned char *dest = cr->buffer[id0] + r0_shift;
865       for (size_t j = 0; j < sub_size; j++)
866         dest[j] = src[j];
867       r0_shift += sub_size;
868       n0++;
869     }
870     else {
871       unsigned char *dest = cr->buffer[id1] + r1_shift;
872       for (size_t j = 0; j < sub_size; j++)
873         dest[j] = src[j];
874       r1_shift += sub_size;
875       n1++;
876     }
877     i++;
878     src += sub_size;
879   }
880 
881   assert(n0 + n1 == n);
882 
883   cr->n_elts[id0] = n0;
884   cr->n_elts[id1] = n1;
885 
886   cr->n_vals[id0] = (r0_shift - n0*comp_size)/elt_size;
887   cr->n_vals[id1] = (r1_shift - n1*comp_size)/elt_size;
888 }
889 
890 /*----------------------------------------------------------------------------
891  * Send and receive data with a crystal router for one stage.
892  *
893  * parameters:
894  *   cr        <-> associated crystal router structure
895  *   target    <-- base target rank to send/receive from
896  *   n_recv    <-- number of ranks to receive from
897  *---------------------------------------------------------------------------*/
898 
899 static  void
_crystal_sendrecv(cs_crystal_router_t * cr,int target,int n_recv)900 _crystal_sendrecv(cs_crystal_router_t  *cr,
901                   int                   target,
902                   int                   n_recv)
903 {
904   cs_timer_t t0 = cs_timer_time();
905 
906   assert(n_recv <= 2);
907 
908   cs_lnum_t send_size[2];
909   uint64_t test_size;
910 
911   MPI_Status status[3];
912   MPI_Request request[3] = {MPI_REQUEST_NULL,
913                             MPI_REQUEST_NULL,
914                             MPI_REQUEST_NULL};
915   cs_lnum_t recv_size[4] = {0, 0, 0, 0};
916 
917   /* Send message to target process */
918 
919   test_size = _comm_type_count(cr, cr->n_elts[1], cr->n_vals[1]);
920 
921   send_size[0] = cr->n_elts[1];
922   send_size[1] = test_size;
923 
924   if ((uint64_t)send_size[1] != test_size)
925     bft_error(__FILE__, __LINE__, 0,
926               "Crystal router:"
927               "  Message to send would have size too large for C int: %llu",
928               (unsigned long long)test_size);
929 
930   MPI_Isend(&send_size, 2, CS_MPI_LNUM, target, cr->rank_id,
931             cr->comm, &request[0]);
932 
933   for (int i = 0; i < n_recv; i++)
934     MPI_Irecv(recv_size+i*2, 2, CS_MPI_LNUM, target+i, target+i,
935               cr->comm, request+i+1);
936 
937   MPI_Waitall(n_recv + 1, request, status);
938 
939   size_t loc_size = _data_size(cr, cr->n_elts[0], cr->n_vals[0]);
940   for (int i = 0; i < n_recv; i++)
941     loc_size += _data_size(cr, recv_size[i*2], recv_size[i*2+1]);
942   if (   loc_size > cr->buffer_size[0]
943       || loc_size < cr->buffer_size[0]*_realloc_f_threshold) {
944     cr->buffer_size[0] = loc_size;
945     BFT_REALLOC(cr->buffer[0], cr->buffer_size[0], unsigned char);
946     if (cr->buffer_size[0] > cr->buffer_size_max[0])
947       cr->buffer_size_max[0] = cr->buffer_size[0];
948     size_t alloc_tot = cr->buffer_size[0] + cr->buffer_size[1];
949     if (alloc_tot > cr->alloc_tot_max)
950       cr->alloc_tot_max = alloc_tot;
951   }
952 
953   MPI_Isend(cr->buffer[1], send_size[1], cr->mpi_type,
954             target, cr->rank_id, cr->comm, request);
955 
956   cr->n_elts[1] = 0;
957 
958   for (int i = 0; i < n_recv; i++) {
959 
960     unsigned char *r_ptr =   cr->buffer[0]
961                            + _data_size(cr, cr->n_elts[0], cr->n_vals[0]);
962 
963     MPI_Irecv(r_ptr, recv_size[i*2 + 1], cr->mpi_type,
964               target+i, target+i, cr->comm, request+1+i);
965 
966     cr->n_elts[0] += recv_size[i*2];
967     cr->n_vals[0] += _comm_type_count_to_n_vals(cr,
968                                                 recv_size[i*2],
969                                                 recv_size[i*2 + 1]);
970 
971   }
972 
973   MPI_Waitall(n_recv+1, request, status);
974 
975   cs_timer_t t1 = cs_timer_time();
976   cs_timer_counter_add_diff(_cr_timers + 1, &t0, &t1);
977 }
978 
979 /*----------------------------------------------------------------------------
980  * Get destination id and associated data index for an indexed Crystal Router.
981  *
982  * If the destination id info is not provided in the Crystal Router,
983  * it is assumed the given array is already valid, so only the index is
984  * computed. If it is provided, the dest_id array is optional.
985  *
986  * parameters:
987  *   <-- cr        pointer to associated Crystal Router
988  *   <-> dest_id   pointer to destination id array, or NULL
989  *   --> data_idx  pointer to data index
990  *----------------------------------------------------------------------------*/
991 
992 static void
_get_data_index_with_dest_id(cs_crystal_router_t * cr,cs_lnum_t dest_id[],cs_lnum_t data_idx[])993 _get_data_index_with_dest_id(cs_crystal_router_t  *cr,
994                              cs_lnum_t             dest_id[],
995                              cs_lnum_t             data_idx[])
996 {
997   const size_t n_elts = cr->n_elts[0];
998 
999   const unsigned char *p_s = cr->buffer[0];
1000 
1001   /* Extract _dest_id first, to use in other loops */
1002 
1003   if (cr->flags & CS_CRYSTAL_ROUTER_USE_DEST_ID) {
1004 
1005     if (dest_id != NULL) {
1006 
1007       cs_lnum_t max_id = -1;  /* id for added elements */
1008 
1009       for (size_t i = 0; i < n_elts; i++) {
1010         const cs_lnum_t *cr_dest_id_p
1011           = (const cs_lnum_t *)(p_s + cr->dest_id_shift);
1012         dest_id[i] = *cr_dest_id_p;
1013         const cs_lnum_t *pn = (const cs_lnum_t *)(p_s + cr->n_vals_shift);
1014         data_idx[*cr_dest_id_p + 1] = *pn;
1015         p_s += cr->comp_size + cr->elt_size*(*pn);
1016         if (*cr_dest_id_p > max_id)
1017           max_id = *cr_dest_id_p;
1018       }
1019 
1020       max_id += 1;
1021 
1022       cr->dest_id_end = max_id;
1023 
1024     }
1025     else {
1026 
1027       for (size_t i = 0; i < n_elts; i++) {
1028         const cs_lnum_t *cr_dest_id_p
1029           = (const cs_lnum_t *)(p_s + cr->dest_id_shift);
1030         const cs_lnum_t *pn = (const cs_lnum_t *)(p_s + cr->n_vals_shift);
1031         data_idx[*cr_dest_id_p + 1] = *pn;
1032         p_s += cr->comp_size + cr->elt_size*(*pn);
1033       }
1034 
1035     }
1036 
1037   }
1038 
1039   else {
1040 
1041     for (size_t i = 0; i < n_elts; i++) {
1042       const cs_lnum_t *pn = (const cs_lnum_t *)(p_s + cr->n_vals_shift);
1043       data_idx[dest_id[i] + 1] = *pn;
1044       p_s += cr->comp_size + cr->elt_size*(*pn);
1045     }
1046 
1047   }
1048 
1049   /* Transform count to index */
1050 
1051   data_idx[0] = 0;
1052   for (size_t i = 0; i < n_elts; i++)
1053     data_idx[i+1] += data_idx[i];
1054 }
1055 
1056 /*----------------------------------------------------------------------------
1057  * Get data elements associated with a strided Crystal Router.
1058  *
1059  * This variant assumes no destination id is provided, so data is
1060  * copied in Crystal Router buffer order.
1061  *
1062  * parameters:
1063  *   <-- cr         pointer to associated Crystal Router
1064  *   --> src_rank   pointer to source rank array, or NULL
1065  *   --> src_id     pointer to source id array, or NULL
1066  *   --> data  pointer to (pointer to) destination data, or NULL
1067  *----------------------------------------------------------------------------*/
1068 
1069 static void
_get_data_s(cs_crystal_router_t * cr,int * src_rank,cs_lnum_t * src_id,void * data)1070 _get_data_s(cs_crystal_router_t   *cr,
1071             int                   *src_rank,
1072             cs_lnum_t             *src_id,
1073             void                  *data)
1074 {
1075   const size_t n_elts = cr->n_elts[0];
1076 
1077   /* Now extract data; work with chunks to amortize some tests
1078      while not multiplying variants */
1079 
1080   const size_t chunk_size = 64;
1081   const size_t n_chunks
1082     = (n_elts % chunk_size) ? n_elts/chunk_size + 1 : n_elts/chunk_size;
1083 
1084   unsigned char *cr_src_rank_p = cr->buffer[0] + sizeof(int);
1085   unsigned char *cr_src_id_p = cr->buffer[0] + cr->src_id_shift;
1086   unsigned char *cr_data_p = cr->buffer[0] + cr->elt_shift;
1087 
1088   unsigned char *src_rank_p = (unsigned char *)src_rank;
1089   unsigned char *src_id_p = (unsigned char *)src_id;
1090   unsigned char *data_p = (unsigned char *)data;
1091 
1092   for (size_t c_id = 0; c_id < n_chunks; c_id++) {
1093 
1094     size_t i0 = c_id*chunk_size;
1095     size_t i1 = (c_id+1)*chunk_size;
1096     if (i1 > n_elts)
1097       i1 = n_elts;
1098 
1099     /* Optional source rank */
1100 
1101     if (src_rank != NULL) {
1102       for (size_t i = i0; i < i1; i++) {
1103         for (size_t j = 0; j < sizeof(int); j++)
1104           src_rank_p[i*sizeof(int) + j]
1105             = cr_src_rank_p[i*cr->comp_size + j];
1106       }
1107     }
1108 
1109     /* Optional source id */
1110 
1111     if (src_id != NULL) {
1112       for (size_t i = i0; i < i1; i++) {
1113         for (size_t j = 0; j < sizeof(cs_lnum_t); j++)
1114           src_id_p[i*sizeof(cs_lnum_t) + j]
1115             = cr_src_id_p[i*cr->comp_size + j];
1116       }
1117     }
1118 
1119     /* data */
1120 
1121     if (data != NULL) {
1122       for (size_t i = i0; i < i1; i++) {
1123         for (size_t j = 0; j < cr->elt_size; j++)
1124           data_p[i*cr->elt_size + j]
1125             = cr_data_p[i*cr->comp_size + j];
1126       }
1127     }
1128 
1129   }
1130 }
1131 
1132 /*----------------------------------------------------------------------------
1133  * Get data elements associated with a strided Crystal Router.
1134  *
1135  * This variant assumes a destination id provided in the Crystal Router.
1136  *
1137  * The dest_id array is always in buffer order.
1138  *
1139  * If the general case, dest_id is applied to all the other
1140  * extracted arrays. If some ids appear multiple times, and dest_id, src_rank,
1141  * and src_id are all non-NULL (with the matching flags
1142  * CS_CRYSTAL_ROUTER_USE_DEST_ID, CS_CRYSTAL_ROUTER_ADD_SRC_ID, and
1143  * CS_CRYSTAL_ROUTER_ADD_SRC_RANK set), it is applied only to the data, and
1144  * src_rank and src_id are kept in buffer order.
1145  *
1146  * With this behavior, for reverse exchange, src_id and src_rank can be
1147  * used as dest_id and dest_rank respectively in the call to
1148  * \ref cs_crystal_router_create_s, while NULL is passed to elt_id
1149  * with single ids, and dest_id passed in case of multiple ids.
1150  *
1151  * parameters:
1152  *   <-- cr         pointer to associated Crystal Router
1153  *   <-> dest_id    pointer to destination id array, or NULL
1154  *   --> src_rank   pointer to source rank array, or NULL
1155  *   --> src_id     pointer to source id array, or NULL
1156  *   --> data  pointer to (pointer to) destination data, or NULL
1157  *----------------------------------------------------------------------------*/
1158 
1159 static void
_get_data_s_with_dest_id(cs_crystal_router_t * cr,cs_lnum_t * dest_id,int * src_rank,cs_lnum_t * src_id,void * data)1160 _get_data_s_with_dest_id(cs_crystal_router_t   *cr,
1161                          cs_lnum_t             *dest_id,
1162                          int                   *src_rank,
1163                          cs_lnum_t             *src_id,
1164                          void                  *data)
1165 {
1166   cs_lnum_t *_dest_id = dest_id;
1167 
1168   const size_t n_elts = cr->n_elts[0];
1169 
1170   /* Now extract data; work with chunks to amortize some tests
1171      while not multiplying variants */
1172 
1173   const size_t chunk_size = 64;
1174   const size_t n_chunks
1175     = (n_elts % chunk_size) ? n_elts/chunk_size + 1 : n_elts/chunk_size;
1176 
1177   const unsigned char *cr_data_p = cr->buffer[0] + cr->elt_shift;
1178   unsigned char *data_p = (unsigned char *)data;
1179 
1180   /* If we do not have a dest_id buffer, use a local one */
1181 
1182   if (_dest_id == NULL) {
1183     BFT_MALLOC(_dest_id, n_elts, cs_lnum_t);
1184     cs_assert(cr->flags & CS_CRYSTAL_ROUTER_USE_DEST_ID);
1185   }
1186 
1187   /* Extract _dest_id first, to use in other loops */
1188 
1189   if (cr->flags & CS_CRYSTAL_ROUTER_USE_DEST_ID) {
1190 
1191     cs_lnum_t max_id = -1;  /* id for added elements */
1192 
1193     for (size_t i = 0; i < n_elts; i++) {
1194       const cs_lnum_t *cr_dest_id_p
1195         = (const cs_lnum_t *)(  cr->buffer[0] + i*cr->comp_size
1196                               + cr->dest_id_shift);
1197       _dest_id[i] = *cr_dest_id_p;
1198       if (*cr_dest_id_p > max_id)
1199         max_id = *cr_dest_id_p;
1200     }
1201     max_id += 1;
1202 
1203     cr->dest_id_end = max_id;
1204 
1205   }
1206 
1207   /* If we have less destination ids than elements, dest_id will
1208      not be applied to src id and src rank, to avoid losing
1209      information necessary for reverse exchanges */
1210 
1211   int  reverse_flags =   CS_CRYSTAL_ROUTER_USE_DEST_ID
1212                        | CS_CRYSTAL_ROUTER_ADD_SRC_ID
1213                        | CS_CRYSTAL_ROUTER_ADD_SRC_RANK;
1214 
1215   bool reverse_multiple = (   ((cr->flags & reverse_flags) == reverse_flags)
1216                            && cr->dest_id_end < n_elts
1217                            && dest_id != NULL
1218                            && src_id != NULL
1219                            && src_rank != NULL);
1220 
1221   for (size_t c_id = 0; c_id < n_chunks; c_id++) {
1222 
1223     size_t i0 = c_id*chunk_size;
1224     size_t i1 = (c_id+1)*chunk_size;
1225     if (i1 > n_elts)
1226       i1 = n_elts;
1227 
1228     /* Optional source rank */
1229 
1230     if (src_rank != NULL) {
1231 
1232       if (reverse_multiple) {
1233         for (size_t i = i0; i < i1; i++) {
1234           const int *cr_src_rank_p
1235             = (const int *)(cr->buffer[0] + i*cr->comp_size + sizeof(int));
1236           src_rank[i] = *cr_src_rank_p;
1237         }
1238       }
1239       else {
1240         for (size_t i = i0; i < i1; i++) {
1241           const int *cr_src_rank_p
1242             = (const int *)(cr->buffer[0] + i*cr->comp_size + sizeof(int));
1243           src_rank[_dest_id[i]] = *cr_src_rank_p;
1244         }
1245       }
1246 
1247     }
1248 
1249     /* Optional source id */
1250 
1251     if (src_id != NULL) {
1252 
1253       if (reverse_multiple) {
1254         for (size_t i = i0; i < i1; i++) {
1255           const cs_lnum_t *cr_src_id_p
1256             = (const cs_lnum_t *)(  cr->buffer[0] + i*cr->comp_size
1257                                   + cr->src_id_shift);
1258           src_id[i] = *cr_src_id_p;
1259         }
1260       }
1261       else {
1262         for (size_t i = i0; i < i1; i++) {
1263           const cs_lnum_t *cr_src_id_p
1264             = (const cs_lnum_t *)(  cr->buffer[0] + i*cr->comp_size
1265                                   + cr->src_id_shift);
1266           src_id[_dest_id[i]] = *cr_src_id_p;
1267         }
1268       }
1269 
1270     }
1271 
1272     /* data */
1273 
1274     if (data != NULL) {
1275       for (size_t i = i0; i < i1; i++) {
1276         cs_lnum_t e_id = _dest_id[i];
1277         for (size_t j = 0; j < cr->elt_size; j++)
1278           data_p[e_id*cr->elt_size + j]
1279             = cr_data_p[i*cr->comp_size + j];
1280       }
1281     }
1282 
1283   }
1284 
1285   if (dest_id == NULL)
1286     BFT_FREE(_dest_id);
1287 }
1288 
1289 /*----------------------------------------------------------------------------
1290  * Get data elements associated with an indexed Crystal Router.
1291  *
1292  * This variant assumes no destination id is provided, so data is
1293  * copied in Crystal Router buffer order.
1294  *
1295  * parameters:
1296  *   <-- cr        pointer to associated Crystal Router
1297  *   --> src_rank  pointer to source rank array, or NULL
1298  *   --> src_id    pointer to source id array, or NULL
1299  *   --> data_idx  pointer to data index, or NULL
1300  *   --> data      pointer to destination data, or NULL
1301  *----------------------------------------------------------------------------*/
1302 
1303 static void
_get_data_i(cs_crystal_router_t * cr,int * src_rank,cs_lnum_t * src_id,cs_lnum_t * data_idx,void * data)1304 _get_data_i(cs_crystal_router_t   *cr,
1305             int                   *src_rank,
1306             cs_lnum_t             *src_id,
1307             cs_lnum_t             *data_idx,
1308             void                  *data)
1309 {
1310   const size_t n_elts = cr->n_elts[0];
1311 
1312   if (data_idx != NULL)
1313     data_idx[0] = 0;
1314 
1315   cs_lnum_t      s_idx = 0;
1316   unsigned char *data_p = (unsigned char *)data;
1317   unsigned const char *p_s = cr->buffer[0];
1318 
1319   /* Extract data */
1320 
1321   for (size_t i = 0; i < n_elts; i++) {
1322 
1323     /* Optional source rank */
1324 
1325     if (src_rank != NULL) {
1326       const int *cr_src_rank = (const int *)(p_s + sizeof(int));
1327       src_rank[i] = *cr_src_rank;
1328     }
1329 
1330     /* Optional source id */
1331 
1332     if (src_id != NULL) {
1333       const cs_lnum_t *cr_src_id = (const cs_lnum_t *)(p_s + cr->src_id_shift);
1334       src_id[i] = *cr_src_id;
1335     }
1336 
1337     /* Number of elements and optional data index */
1338 
1339     const cs_lnum_t *pn = (const cs_lnum_t *)(p_s + cr->n_vals_shift);
1340     const cs_lnum_t n_sub = *pn;
1341     const size_t _sub_size = cr->elt_size*n_sub;
1342 
1343     if (data_idx != NULL)
1344       data_idx[i+1] = data_idx[i] + n_sub;
1345 
1346     /* Data */
1347 
1348     if (data_p != NULL) {
1349       unsigned const char *pe = p_s + cr->elt_shift;
1350       unsigned char *_data_p = data_p + s_idx*cr->elt_size;
1351       for (size_t j = 0; j < _sub_size; j++)
1352         _data_p[j] = pe[j];
1353     }
1354 
1355     s_idx += n_sub;
1356     p_s += cr->comp_size + _sub_size;
1357 
1358   }
1359 }
1360 
1361 /*----------------------------------------------------------------------------
1362  * Get data elements associated with an indexed Crystal Router.
1363  *
1364  * This variant assumes a destination id is provided, along with a matching
1365  * index. The destination id may either be provided or extracted from the
1366  * data if present; but the data index should always be provided (i.e.
1367  * precomputed), as it is required for indirect writes. If the destination
1368  * ids are both provided and present in the Crystal Router, they are assumed
1369  * to agree.
1370  *
1371  * If the general case, dest_id is applied to all the other
1372  * extracted arrays. If some ids appear multiple times, and dest_id, src_rank,
1373  * and src_id are all non-NULL (with the matching flags
1374  * CS_CRYSTAL_ROUTER_USE_DEST_ID, CS_CRYSTAL_ROUTER_ADD_SRC_ID, and
1375  * CS_CRYSTAL_ROUTER_ADD_SRC_RANK set), it is applied only to the data and
1376  * index, while src_rank and src_id are kept in buffer order.
1377  *
1378  * With this behavior, for reverse exchange, src_id and src_rank can be
1379  * used as dest_id and dest_rank respectively in the call to
1380  * \ref cs_crystal_router_create_s, while NULL is passed to elt_id
1381  * with single ids, and dest_id passed in case of multiple ids.
1382  *
1383  * parameters:
1384  *   <-- cr        pointer to associated Crystal Router
1385  *   <-- dest_id   pointer to destination id array, or NULL
1386  *   <-- data_idx  pointer to data index
1387  *   --> src_rank  pointer to source rank array, or NULL
1388  *   --> src_id    pointer to source id array, or NULL
1389  *   --> data      pointer to destination data, or NULL
1390  *----------------------------------------------------------------------------*/
1391 
1392 static void
_get_data_i_with_dest_id(cs_crystal_router_t * cr,const cs_lnum_t dest_id[],const cs_lnum_t data_idx[],int * src_rank,cs_lnum_t * src_id,void * data)1393 _get_data_i_with_dest_id(cs_crystal_router_t   *cr,
1394                          const cs_lnum_t        dest_id[],
1395                          const cs_lnum_t        data_idx[],
1396                          int                   *src_rank,
1397                          cs_lnum_t             *src_id,
1398                          void                  *data)
1399 {
1400   const size_t n_elts = cr->n_elts[0];
1401 
1402   unsigned char *data_p = (unsigned char *)data;
1403   unsigned const char *p_s = cr->buffer[0];
1404 
1405   /* If we have less destination ids than elements, dest_id will
1406      not be applied to src id and src rank, to avoid losing
1407      information necessary for reverse exchanges */
1408 
1409   int reverse_flags =   CS_CRYSTAL_ROUTER_USE_DEST_ID
1410                       | CS_CRYSTAL_ROUTER_ADD_SRC_ID
1411                       | CS_CRYSTAL_ROUTER_ADD_SRC_RANK;
1412 
1413   if ((cr->flags & reverse_flags) && cr->dest_id_end < n_elts
1414       && dest_id != NULL && src_id != NULL && src_rank != NULL) {
1415 
1416     for (size_t i = 0; i < n_elts; i++) {
1417 
1418       cs_lnum_t id = dest_id[i];
1419 
1420       /* Source rank and id */
1421       const int *cr_src_rank = (const int *)(p_s + sizeof(int));
1422       src_rank[i] = *cr_src_rank;
1423 
1424       const cs_lnum_t *cr_src_id = (const cs_lnum_t *)(p_s + cr->src_id_shift);
1425       src_id[i] = *cr_src_id;
1426 
1427       /* Number of elements and optional data index */
1428       const cs_lnum_t *pn = (const cs_lnum_t *)(p_s + cr->n_vals_shift);
1429       const cs_lnum_t n_sub = *pn;
1430       const size_t _sub_size = cr->elt_size*n_sub;
1431       assert(n_sub == data_idx[id+1] - data_idx[id]);
1432 
1433       /* Data */
1434       if (data_p != NULL) {
1435         unsigned const char *pe = p_s + cr->elt_shift;
1436         unsigned char *_data_p = data_p + data_idx[id]*cr->elt_size;
1437         for (size_t j = 0; j < _sub_size; j++)
1438           _data_p[j] = pe[j];
1439       }
1440 
1441       p_s += cr->comp_size + _sub_size;
1442 
1443     }
1444 
1445   }
1446 
1447   /* Extract data in the general case */
1448 
1449   else {
1450 
1451     for (size_t i = 0; i < n_elts; i++) {
1452 
1453       cs_lnum_t id;
1454       if (dest_id != NULL)
1455         id = dest_id[i];
1456       else {
1457         const cs_lnum_t *cr_dest_id
1458           = (const cs_lnum_t *)(p_s + cr->dest_id_shift);
1459         id = *cr_dest_id;
1460       }
1461 
1462       /* Optional source rank */
1463       if (src_rank != NULL) {
1464         const int *cr_src_rank = (const int *)(p_s + sizeof(int));
1465         src_rank[id] = *cr_src_rank;
1466       }
1467 
1468       /* Optional source id */
1469       if (src_id != NULL) {
1470         const cs_lnum_t *cr_src_id = (const cs_lnum_t *)(p_s + cr->src_id_shift);
1471         src_id[id] = *cr_src_id;
1472       }
1473 
1474       /* Number of elements and optional data index */
1475       const cs_lnum_t *pn = (const cs_lnum_t *)(p_s + cr->n_vals_shift);
1476       const cs_lnum_t n_sub = *pn;
1477       const size_t _sub_size = cr->elt_size*n_sub;
1478       assert(n_sub == data_idx[id+1] - data_idx[id]);
1479 
1480       /* Data */
1481       if (data_p != NULL) {
1482         unsigned const char *pe = p_s + cr->elt_shift;
1483         unsigned char *_data_p = data_p + data_idx[id]*cr->elt_size;
1484         for (size_t j = 0; j < _sub_size; j++)
1485           _data_p[j] = pe[j];
1486       }
1487 
1488       p_s += cr->comp_size + _sub_size;
1489 
1490     }
1491 
1492   }
1493 }
1494 
1495 #endif /* defined(HAVE_MPI) */
1496 
1497 /*! (DOXYGEN_SHOULD_SKIP_THIS) \endcond */
1498 
1499 /*============================================================================
1500  * Public function definitions
1501  *============================================================================*/
1502 
1503 #if defined(HAVE_MPI)
1504 
1505 /*----------------------------------------------------------------------------*/
1506 /*!
1507  * \brief Create a Crystal Router for strided data.
1508  *
1509  * If the flags constant contains \ref CS_CRYSTAL_ROUTER_USE_DEST_ID,
1510  * data exchanged will be ordered by the array passed to the
1511  * \c dest_id argument. For \c n total values received on a rank
1512  * (as given by \ref cs_crystal_router_n_elts), those destination ids
1513  * must be in the range [0, \c n[.
1514  *
1515  * If the flags bit mask matches \ref CS_CRYSTAL_ROUTER_ADD_SRC_ID,
1516  * source ids are added. If it matches \ref CS_CRYSTAL_ROUTER_ADD_SRC_RANK,
1517  * source rank metadata is added.
1518  *
1519  * \param[in]  n_elts            number of elements
1520  * \param[in]  stride            number of values per entity (interlaced)
1521  * \param[in]  datatype          type of data considered
1522  * \param[in]  flags             add destination id ?
1523  * \param[in]  elt               element values
1524  * \param[in]  src_id            optional parent element id (indirection
1525  *                               for elt array), or NULL
1526  * \param[in]  dest_id           element destination id, or NULL
1527  * \param[in]  dest_rank         destination rank for each element
1528  * \param[in]  comm              associated MPI communicator
1529  *
1530  * \return  pointer to new Crystal Router structure.
1531  */
1532 /*----------------------------------------------------------------------------*/
1533 
1534 cs_crystal_router_t *
cs_crystal_router_create_s(size_t n_elts,int stride,cs_datatype_t datatype,int flags,const void * elt,const cs_lnum_t * src_id,const cs_lnum_t * dest_id,const int dest_rank[],MPI_Comm comm)1535 cs_crystal_router_create_s(size_t            n_elts,
1536                            int               stride,
1537                            cs_datatype_t     datatype,
1538                            int               flags,
1539                            const void       *elt,
1540                            const cs_lnum_t  *src_id,
1541                            const cs_lnum_t  *dest_id,
1542                            const int         dest_rank[],
1543                            MPI_Comm          comm)
1544 {
1545   cs_timer_t t0 = cs_timer_time();
1546 
1547   /* Initialize timers if required */
1548 
1549   if (_cr_calls == 0) {
1550     for (int i = 0; i < 2; i++)
1551       CS_TIMER_COUNTER_INIT(_cr_timers[i]);
1552   }
1553   _cr_calls += 1;
1554 
1555   unsigned const char *_elt = elt;
1556 
1557   /* Allocate structure */
1558 
1559   cs_crystal_router_t *cr = _crystal_create_meta_s(n_elts,
1560                                                    stride,
1561                                                    datatype,
1562                                                    flags,
1563                                                    comm);
1564   /* Copy data and some metadata */
1565 
1566   const bool add_src_rank
1567     = (cr->flags & CS_CRYSTAL_ROUTER_ADD_SRC_RANK) ? true : false;
1568   const bool add_dest_id
1569     = (cr->flags & CS_CRYSTAL_ROUTER_USE_DEST_ID) ? true : false;
1570   const bool add_src_id
1571     = (cr->flags & CS_CRYSTAL_ROUTER_ADD_SRC_ID) ? true : false;
1572 
1573   if (add_dest_id)
1574     cs_assert(dest_id != NULL || n_elts == 0);
1575 
1576   for (size_t i = 0; i < n_elts; i++) {
1577 
1578     unsigned char *p_s = cr->buffer[0] + i*cr->comp_size;
1579     unsigned char *pe = p_s + cr->elt_shift;
1580 
1581     int *pr = (int *)p_s;
1582     pr[0] = dest_rank[i];
1583     if (add_src_rank) {
1584       pr[1] = cr->rank_id;
1585     }
1586 
1587     if (add_dest_id) {
1588       cs_lnum_t *_cr_dest_id = (cs_lnum_t *)(p_s + cr->dest_id_shift);
1589       *_cr_dest_id = dest_id[i];
1590     }
1591 
1592     size_t j = (src_id == NULL) ? i : (size_t)src_id[i];
1593 
1594     if (add_src_id) {
1595       cs_lnum_t *_cr_src_id = (cs_lnum_t *)(p_s + cr->src_id_shift);
1596       *_cr_src_id = (cs_lnum_t)j;
1597     }
1598 
1599     unsigned const char *_psrc = _elt + j*cr->elt_size;
1600     for (size_t k = 0; k < cr->elt_size; k++)
1601       pe[k] = _psrc[k];
1602 
1603   }
1604 
1605   cs_timer_t t1 = cs_timer_time();
1606   cs_timer_counter_add_diff(_cr_timers, &t0, &t1);
1607 
1608 #if _CR_DEBUG_DUMP
1609   _dump(cr, "Crystal Router after creation.");
1610 #endif
1611 
1612   return cr;
1613 }
1614 
1615 /*----------------------------------------------------------------------------*/
1616 /*!
1617  * \brief Create a Crystal Router for indexed data.
1618  *
1619  * If the flags constant contains \ref CS_CRYSTAL_ROUTER_USE_DEST_ID,
1620  * data exchanged will be ordered by the array passed to the
1621  * \c dest_id argument. For \c n total values received on a rank
1622  * (as given by \ref cs_crystal_router_n_elts), those destination ids
1623  * must be in the range [0, \c n[.
1624  *
1625  * If the flags bit mask matches \ref CS_CRYSTAL_ROUTER_ADD_SRC_ID,
1626  * source ids are added. If it matches \ref CS_CRYSTAL_ROUTER_ADD_SRC_RANK,
1627  * source rank metadata is added.
1628  *
1629  * \param[in]  n_elts            number of elements
1630  * \param[in]  datatype          type of data considered
1631  * \param[in]  flags             add destination id ?
1632  * \param[in]  elt_idx           element values start and past-the-last index
1633  * \param[in]  elt               element values
1634  * \param[in]  src_id            optional parent element id (indirection
1635  *                               for elt and elt_idx arrays), or NULL
1636  * \param[in]  dest_id           element destination id, or NULL
1637  * \param[in]  dest_rank         destination rank for each element
1638  * \param[in]  comm              associated MPI communicator
1639  *
1640  * \return  pointer to new Crystal Router structure.
1641  */
1642 /*----------------------------------------------------------------------------*/
1643 
1644 cs_crystal_router_t *
cs_crystal_router_create_i(size_t n_elts,cs_datatype_t datatype,int flags,const cs_lnum_t * elt_idx,const void * elt,const cs_lnum_t * src_id,const cs_lnum_t * dest_id,const int dest_rank[],MPI_Comm comm)1645 cs_crystal_router_create_i(size_t            n_elts,
1646                            cs_datatype_t     datatype,
1647                            int               flags,
1648                            const cs_lnum_t  *elt_idx,
1649                            const void       *elt,
1650                            const cs_lnum_t  *src_id,
1651                            const cs_lnum_t  *dest_id,
1652                            const int         dest_rank[],
1653                            MPI_Comm          comm)
1654 {
1655   cs_timer_t t0 = cs_timer_time();
1656 
1657   /* Initialize timers if required */
1658 
1659   if (_cr_calls == 0) {
1660     for (int i = 0; i < 2; i++)
1661       CS_TIMER_COUNTER_INIT(_cr_timers[i]);
1662   }
1663   _cr_calls += 1;
1664 
1665   unsigned const char *_elt = elt;
1666 
1667   /* Allocate structure */
1668 
1669   cs_crystal_router_t *cr = _crystal_create_meta_i(n_elts,
1670                                                    datatype,
1671                                                    elt_idx,
1672                                                    src_id,
1673                                                    flags,
1674                                                    comm);
1675 
1676   /* Copy data and some metadata */
1677 
1678   const bool add_src_rank
1679     = (cr->flags & CS_CRYSTAL_ROUTER_ADD_SRC_RANK) ? true : false;
1680   const bool add_dest_id
1681     = (cr->flags & CS_CRYSTAL_ROUTER_USE_DEST_ID) ? true : false;
1682   const bool add_src_id
1683     = (cr->flags & CS_CRYSTAL_ROUTER_ADD_SRC_ID) ? true : false;
1684 
1685   if (add_dest_id)
1686     cs_assert(dest_id != NULL || n_elts == 0);
1687 
1688   /* Case with no src id indirection */
1689 
1690   if (src_id == NULL) {
1691 
1692     for (size_t i = 0; i < n_elts; i++) {
1693 
1694       unsigned char *p_s =   cr->buffer[0] + i*cr->comp_size
1695                            + elt_idx[i]*cr->elt_size;
1696       int *pr = (int *)p_s;
1697       cs_lnum_t *pn = (cs_lnum_t *)(p_s + cr->n_vals_shift);
1698       unsigned char *pe = p_s + cr->elt_shift;
1699       unsigned const char *_psrc = _elt + elt_idx[i]*cr->elt_size;
1700 
1701       pr[0] = dest_rank[i];
1702 
1703       if (add_src_rank)
1704         pr[1] = cr->rank_id;
1705 
1706       if (add_dest_id) {
1707         unsigned char *pi = p_s + cr->dest_id_shift;
1708         unsigned const char *_p_dest_id = (const unsigned char *)(dest_id + i);
1709         for (size_t j = 0; j < sizeof(cs_lnum_t); j++)
1710           pi[j] = _p_dest_id[j];
1711       }
1712 
1713       if (add_src_id) {
1714         const cs_lnum_t s_id = i;
1715         const unsigned char *_src_id = (const unsigned char *)(&s_id);
1716         unsigned char *pi = p_s + cr->src_id_shift;
1717         for (size_t j = 0; j < sizeof(cs_lnum_t); j++)
1718           pi[j] = _src_id[j];
1719       }
1720 
1721       cs_lnum_t n_sub = elt_idx[i+1] - elt_idx[i];
1722       pn[0] = n_sub;
1723 
1724       size_t sub_size = n_sub * cr->elt_size;
1725       for (size_t j = 0; j < sub_size; j++)
1726         pe[j] = _psrc[j];
1727     }
1728 
1729   }
1730 
1731   /* Case with src id indirection */
1732 
1733   else {
1734 
1735     cs_lnum_t c_idx = 0;
1736 
1737     for (size_t i = 0; i < n_elts; i++) {
1738 
1739       size_t j = src_id[i];
1740 
1741       unsigned char *p_s =   cr->buffer[0] + i*cr->comp_size
1742                            + c_idx*cr->elt_size;
1743       int *pr = (int *)p_s;
1744       cs_lnum_t *pn = (cs_lnum_t *)(p_s + cr->n_vals_shift);
1745       unsigned char *pe = p_s + cr->elt_shift;
1746       unsigned const char *_psrc = _elt + elt_idx[j]*cr->elt_size;
1747 
1748       pr[0] = dest_rank[i];
1749 
1750       if (add_src_rank)
1751         pr[1] = cr->rank_id;
1752 
1753       if (add_dest_id) {
1754         unsigned char *pi = p_s + cr->dest_id_shift;
1755         unsigned const char *_p_dest_id = (const unsigned char *)(dest_id + i);
1756         for (size_t k = 0; k < sizeof(cs_lnum_t); k++)
1757           pi[k] = _p_dest_id[k];
1758       }
1759 
1760       if (add_src_id) {
1761         const cs_lnum_t s_id = j;
1762         const unsigned char *_src_id = (const unsigned char *)(&s_id);
1763         unsigned char *pi = p_s + cr->src_id_shift;
1764         for (size_t k = 0; k < sizeof(cs_lnum_t); k++)
1765           pi[k] = _src_id[k];
1766       }
1767 
1768       cs_lnum_t n_sub = elt_idx[j+1] - elt_idx[j];
1769       pn[0] = n_sub;
1770 
1771       size_t sub_size = n_sub * cr->elt_size;
1772       for (size_t k = 0; k < sub_size; k++)
1773         pe[k] = _psrc[k];
1774 
1775       c_idx += n_sub;
1776 
1777     }
1778 
1779   }
1780 
1781   cs_timer_t t1 = cs_timer_time();
1782   cs_timer_counter_add_diff(_cr_timers, &t0, &t1);
1783 
1784 #if _CR_DEBUG_DUMP
1785   _dump(cr, "Crystal Router after creation.");
1786 #endif
1787 
1788   return cr;
1789 }
1790 
1791 /*----------------------------------------------------------------------------*/
1792 /*!
1793  * \brief Destroy a Crystal Router.
1794  *
1795  * \param[in, out]  cr   pointer to associated Crystal Router
1796  */
1797 /*----------------------------------------------------------------------------*/
1798 
1799 void
cs_crystal_router_destroy(cs_crystal_router_t ** cr)1800 cs_crystal_router_destroy(cs_crystal_router_t  **cr)
1801 {
1802   if (cr != NULL) {
1803 
1804     cs_timer_t t0 = cs_timer_time();
1805 
1806     if (*cr != NULL) {
1807       cs_crystal_router_t *_cr = *cr;
1808       if (_cr->mpi_type != MPI_BYTE)
1809         MPI_Type_free(&(_cr->mpi_type));
1810       BFT_FREE(_cr->buffer[1]);
1811       BFT_FREE(_cr->buffer[0]);
1812       BFT_FREE(*cr);
1813     }
1814 
1815     cs_timer_t t1 = cs_timer_time();
1816     cs_timer_counter_add_diff(_cr_timers, &t0, &t1);
1817   }
1818 }
1819 
1820 /*----------------------------------------------------------------------------*/
1821 /*!
1822  * \brief Exchange data with a Crystal Router.
1823  *
1824  * Order of data from a same source rank is preserved.
1825  *
1826  * \param[in, out]  cr   pointer to associated Crystal Router
1827  */
1828 /*----------------------------------------------------------------------------*/
1829 
1830 void
cs_crystal_router_exchange(cs_crystal_router_t * cr)1831 cs_crystal_router_exchange(cs_crystal_router_t  *cr)
1832 {
1833   cs_assert(cr != NULL);
1834 
1835   cs_timer_t t0 = cs_timer_time();
1836 
1837   int target = -1;
1838   int send_part = 0;
1839   int b_low = 0;
1840   int n_sub_ranks = cr->n_ranks;
1841 
1842   while (n_sub_ranks > 1) {
1843 
1844     int n_recv = 1;
1845     int n_low = n_sub_ranks / 2;
1846     int b_high = b_low + n_low;
1847 
1848     if (cr->rank_id < b_high) {
1849       target = cr->rank_id + n_low;
1850       if ((n_sub_ranks & 1) && (cr->rank_id == b_high - 1))
1851         n_recv = 2;
1852       send_part = 1;
1853     }
1854     else {
1855       target = cr->rank_id - n_low;
1856       if (target == b_high) {
1857         target--;
1858         n_recv = 0;
1859       }
1860       send_part = 0;
1861     }
1862 
1863     /* Partition data */
1864 
1865     if (cr->n_vals_shift == 0)
1866       _crystal_partition_strided(cr, send_part, b_high);
1867     else
1868       _crystal_partition_indexed(cr, send_part, b_high);
1869 
1870 #if _CR_DEBUG_DUMP
1871     {
1872       char comment[80];
1873       snprintf(comment, 79, "Crystal Router after partition, n_sub = %d",
1874                n_sub_ranks);
1875       _dump(cr, comment);
1876     }
1877 #endif
1878 
1879     /* Send message to target process */
1880 
1881     _crystal_sendrecv(cr, target, n_recv);
1882 
1883     /* Ready for next exchange */
1884 
1885     if (cr->rank_id < b_high)
1886       n_sub_ranks = n_low;
1887     else {
1888       n_sub_ranks -= n_low;
1889       b_low = b_high;
1890     }
1891 
1892 #if _CR_DEBUG_DUMP
1893     {
1894       char comment[80];
1895       if (n_sub_ranks > 1) {
1896         snprintf(comment, 79, "Crystal Router after sendrecv, n_sub = %d",
1897                  n_sub_ranks);
1898       }
1899       else {
1900         snprintf(comment, 79, "Crystal Router after exchange");
1901       }
1902       _dump(cr, comment);
1903     }
1904 #endif
1905 
1906   }
1907 
1908   cr->n_elts[1] = 0;
1909   cr->buffer_size[1] = 0;
1910   BFT_FREE(cr->buffer[1]);
1911 
1912   cs_timer_t t1 = cs_timer_time();
1913   cs_timer_counter_add_diff(_cr_timers, &t0, &t1);
1914 }
1915 
1916 /*----------------------------------------------------------------------------*/
1917 /*!
1918  * \brief Get number of elements associated with Crystal Router.
1919  *
1920  * The number of elements is the number of elements received after exchange.
1921  *
1922  * \param[in]  cr  pointer to associated Crystal Router
1923  *
1924  * \return  number of elements associated with distributor.
1925  */
1926 /*----------------------------------------------------------------------------*/
1927 
1928 cs_lnum_t
cs_crystal_router_n_elts(const cs_crystal_router_t * cr)1929 cs_crystal_router_n_elts(const cs_crystal_router_t  *cr)
1930 {
1931   cs_lnum_t retval = 0;
1932 
1933   if (cr != NULL) {
1934 
1935     if (cr->flags & CS_CRYSTAL_ROUTER_USE_DEST_ID) {
1936 
1937       const size_t n_elts = cr->n_elts[0];
1938 
1939       if (cr->dest_id_end == 0 && n_elts > 0) {
1940 
1941         cs_lnum_t dest_id_max = -1;
1942 
1943         if (cr->n_vals_shift == 0) {
1944           for (size_t i = 0; i < n_elts; i++) {
1945             unsigned const char *p_s = cr->buffer[0] + i*cr->comp_size;
1946             const cs_lnum_t *cr_dest_id
1947               = (const cs_lnum_t *)(p_s + cr->dest_id_shift);
1948             if (cr_dest_id[0] > dest_id_max)
1949               dest_id_max = cr_dest_id[0];
1950           }
1951         }
1952 
1953         else {
1954           unsigned const char *p_s = cr->buffer[0];
1955           for (size_t i = 0; i < n_elts; i++) {
1956             const cs_lnum_t *cr_dest_id
1957               = (const cs_lnum_t *)(p_s + cr->dest_id_shift);
1958             if (cr_dest_id[0] > dest_id_max)
1959               dest_id_max = cr_dest_id[0];
1960             const cs_lnum_t *pn = (const cs_lnum_t *)(p_s + cr->n_vals_shift);
1961             p_s += cr->comp_size + cr->elt_size*pn[0];
1962           }
1963         }
1964 
1965         retval = dest_id_max + 1;
1966 
1967       }
1968       else
1969         retval = cr->dest_id_end;
1970 
1971     }
1972     else
1973       retval = cr->n_elts[0];
1974 
1975   }
1976 
1977   return retval;
1978 }
1979 
1980 /*----------------------------------------------------------------------------*/
1981 /*!
1982  * \brief Get number of elements received with Crystal Router.
1983  *
1984  * The number of elements is the number of elements received in the exchange.
1985  * In most cases this is the same as that returned using
1986  * \ref cs_crystal_router_n_elts, but may be larger when a destination id is
1987  * used and multiple elements point to a same destination (for example when
1988  * distributing from partitioned data with elements duplicated on
1989  * partition boundaries to blocks with a single occurence of elements).
1990  *
1991  * \param[in]  cr  pointer to associated Crystal Router
1992  *
1993  * \return  number of elements associated with distributor.
1994  */
1995 /*----------------------------------------------------------------------------*/
1996 
1997 cs_lnum_t
cs_crystal_router_n_recv_elts(const cs_crystal_router_t * cr)1998 cs_crystal_router_n_recv_elts(const cs_crystal_router_t  *cr)
1999 {
2000   cs_lnum_t retval = 0;
2001 
2002   if (cr != NULL)
2003     retval = cr->n_elts[0];
2004 
2005   return retval;
2006 }
2007 
2008 /*----------------------------------------------------------------------------*/
2009 /*!
2010  * \brief Get data elements associated with a Crystal Router.
2011  *
2012  * Depending on the creation options, some elements may be available or not.
2013  *
2014  * For each element type, a pointer to a pointer to a buffer is provided.
2015  * If the pointer to the buffer is non-null, the buffer must be of sufficient
2016  * size for the number of elements returned by \ref cs_crystal_router_n_elts.
2017  *
2018  * If no buffer is provided, one is allocated automatically, and transferred
2019  * to the caller (who is responsible for freeing it when no longer needed).
2020  *
2021  * Note also that if the destination id is provided in the Crystal Router,
2022  * it will be applied automatically to the data and data_index arrays. The
2023  * dest_id array itself is always in receive order. If the Crystal
2024  * Router does not contain destination id info but the \c dest_id
2025  * argument points to a non-NULL value, the provided id will be used to
2026  * order extracted data. This allows saving the destination id on the receive
2027  * side, and not re-sending it (saving bandwidth) for subsequent calls
2028  * with a similar Crystal Router.
2029  *
2030  * If all destination ids appear only once, or the
2031  * CS_CRYSTAL_ROUTER_USE_DEST_ID is not set, dest_id is also applied to
2032  * src_rank and src_id. Otherwise, those arrays are kept in buffer order.
2033  * This can be checked by comparing the output of
2034  * \ref cs_crystal_router_n_elts and \ref cs_crystal_router_n_recv_elts.
2035  *
2036  * With this behavior, for reverse exchange, src_id and src_rank can be
2037  * used as dest_id and dest_rank respectively in the call to
2038  * \ref cs_crystal_router_create_s, while NULL is passed to elt_id
2039  * with single ids, and dest_id passed in case of duplicate ids.
2040  *
2041  * \param[in]       cr          pointer to associated Crystal Router
2042  * \param[out]      src_rank    pointer to (pointer to) source rank array,
2043  *                              or NULL
2044  * \param[in, out]  dest_id     pointer to (pointer to) destination id array,
2045  *                              or NULL
2046  * \param[out]      src_id      pointer to (pointer to) source id array, or NULL
2047  * \param[out]      data_index  pointer to (pointer to) destination index,
2048  *                              or NULL
2049  * \param[out]      data        pointer to (pointer to) destination data,
2050  *                              or NULL
2051  */
2052 /*----------------------------------------------------------------------------*/
2053 
2054 void
cs_crystal_router_get_data(cs_crystal_router_t * cr,int ** src_rank,cs_lnum_t ** dest_id,cs_lnum_t ** src_id,cs_lnum_t ** data_index,void ** data)2055 cs_crystal_router_get_data(cs_crystal_router_t   *cr,
2056                            int                  **src_rank,
2057                            cs_lnum_t            **dest_id,
2058                            cs_lnum_t            **src_id,
2059                            cs_lnum_t            **data_index,
2060                            void                 **data)
2061 {
2062   cs_timer_t t0 = cs_timer_time();
2063 
2064   int *_src_rank = NULL;
2065   cs_lnum_t *_dest_id = NULL;
2066   cs_lnum_t *_src_id = NULL;
2067   cs_lnum_t *_data_index = NULL;
2068   unsigned char *_data = NULL;
2069 
2070   size_t n_elts = cr->n_elts[0];
2071 
2072   /* Map and allocate arrays if needed */
2073 
2074   if (src_rank != NULL && cr->flags & CS_CRYSTAL_ROUTER_ADD_SRC_RANK) {
2075     _src_rank = *src_rank;
2076     if (_src_rank == NULL) {
2077       BFT_MALLOC(_src_rank, n_elts, int);
2078       *src_rank = _src_rank;
2079     }
2080   }
2081 
2082   if (dest_id != NULL) { /* May be extracted or provided from previous call */
2083     _dest_id = *dest_id;
2084     if (_dest_id == NULL && cr->flags & CS_CRYSTAL_ROUTER_USE_DEST_ID) {
2085       BFT_MALLOC(_dest_id, n_elts, cs_lnum_t);
2086       *dest_id = _dest_id;
2087     }
2088   }
2089 
2090   if (src_id != NULL && cr->flags & CS_CRYSTAL_ROUTER_ADD_SRC_ID) {
2091     _src_id = *src_id;
2092     if (_src_id == NULL) {
2093       BFT_MALLOC(_src_id, n_elts, cs_lnum_t);
2094       *src_id = _src_id;
2095     }
2096   }
2097 
2098   if (data_index != NULL && cr->n_vals_shift > 0) {
2099     _data_index = *data_index;
2100     if (_data_index == NULL) {
2101       BFT_MALLOC(_data_index, n_elts + 1, cs_lnum_t);
2102       *data_index = _data_index;
2103     }
2104   }
2105 
2106   if (data != NULL) {
2107     _data = *data;
2108     if (_data == NULL) {
2109       size_t data_size;
2110       if (cr->stride > 0)
2111         data_size = n_elts*cr->elt_size;
2112       else
2113         data_size = cr->n_vals[0]*cr->elt_size;
2114       BFT_MALLOC(_data, data_size, unsigned char);
2115       *data = _data;
2116     }
2117   }
2118 
2119   if (data != NULL && cr->stride > 0) {
2120     _data = *data;
2121     if (_data == NULL) {
2122       BFT_MALLOC(_data, n_elts*cr->elt_size, unsigned char);
2123       *data = _data;
2124     }
2125   }
2126 
2127   /* Now extract data */
2128 
2129   if (_dest_id != NULL || cr->flags & CS_CRYSTAL_ROUTER_USE_DEST_ID) {
2130     if (cr->n_vals_shift == 0)
2131       _get_data_s_with_dest_id(cr,
2132                                _dest_id,
2133                                _src_rank,
2134                                _src_id,
2135                                _data);
2136     else {
2137       if (cr->n_vals_shift > 0 && _data_index == NULL)
2138         BFT_MALLOC(_data_index, n_elts + 1, cs_lnum_t);
2139 
2140       _get_data_index_with_dest_id(cr, _dest_id, _data_index);
2141 
2142       _get_data_i_with_dest_id(cr,
2143                                _dest_id,
2144                                _data_index,
2145                                _src_rank,
2146                                _src_id,
2147                                _data);
2148     }
2149   }
2150 
2151   else {
2152 
2153     if (cr->n_vals_shift == 0)
2154       _get_data_s(cr, _src_rank, _src_id, _data);
2155     else
2156       _get_data_i(cr, _src_rank, _src_id, _data_index, _data);
2157 
2158   }
2159 
2160   if (dest_id == NULL && _dest_id != NULL)
2161     BFT_FREE(_dest_id);
2162   if (data_index == NULL && _data_index != NULL)
2163     BFT_FREE(_data_index);
2164 
2165   cs_timer_t t1 = cs_timer_time();
2166   cs_timer_counter_add_diff(_cr_timers, &t0, &t1);
2167 }
2168 
2169 /*----------------------------------------------------------------------------*/
2170 /*!
2171  * \brief Query maximum buffer sizes reached by a Crystal Router.
2172  *
2173  * Order of data from a same source rank is preserved.
2174  *
2175  * \param[in]   cr      pointer to associated Crystal Router
2176  * \param[out]  max_sizes   pointer to maximum local/receive (max_sizes[0])
2177  *                          and send (max_sizes[1]) sizes, or NULL
2178  *
2179  * \return  maximum total allocated buffer memory
2180  */
2181 /*----------------------------------------------------------------------------*/
2182 
2183 size_t
cs_crystal_router_get_max_sizes(cs_crystal_router_t * cr,size_t * max_sizes)2184 cs_crystal_router_get_max_sizes(cs_crystal_router_t  *cr,
2185                                 size_t               *max_sizes)
2186 {
2187   cs_assert(cr != NULL);
2188 
2189   if (max_sizes != NULL) {
2190     max_sizes[0] = cr->buffer_size_max[0];
2191     max_sizes[1] = cr->buffer_size_max[1];
2192   }
2193 
2194   return cr->alloc_tot_max;
2195 }
2196 
2197 #endif /* defined(HAVE_MPI) */
2198 
2199 /*----------------------------------------------------------------------------*/
2200 /*!
2201  * \brief Log performance information relative to Crystal Router exchange.
2202  *
2203  * Call count is based on Crystal router creation calls, as the number
2204  * of exchange and data access calls should be identical.
2205  */
2206 /*----------------------------------------------------------------------------*/
2207 
2208 void
cs_crystal_router_log_finalize(void)2209 cs_crystal_router_log_finalize(void)
2210 {
2211   if (_cr_calls <= 0 || cs_glob_n_ranks < 2)
2212     return;
2213 
2214   cs_log_printf(CS_LOG_PERFORMANCE,
2215                 _("\nCrystal router: %llu %s\n"),
2216                 (unsigned long long) _cr_calls, _("calls"));
2217 
2218 #if defined(HAVE_MPI)
2219   double wtimes[2] = {_cr_timers[0].nsec*1e-9,
2220                       _cr_timers[1].nsec*1e-9};
2221 
2222   double mntimes[2], mxtimes[2], stimes[2];
2223 
2224   MPI_Reduce(wtimes, mntimes, 2, MPI_DOUBLE, MPI_MIN,
2225              0, cs_glob_mpi_comm);
2226   MPI_Reduce(wtimes, mxtimes, 2, MPI_DOUBLE, MPI_MAX,
2227              0, cs_glob_mpi_comm);
2228   MPI_Reduce(wtimes, stimes, 2, MPI_DOUBLE, MPI_SUM,
2229              0, cs_glob_mpi_comm);
2230   if (cs_glob_rank_id == 0) {
2231     cs_log_printf
2232       (CS_LOG_PERFORMANCE,
2233        _("                      mean           minimum        maximum\n"
2234          "  wall clock:    %12.5f s %12.5f s %12.5f s\n"
2235          "  communication: %12.5f s %12.5f s %12.5f s\n"),
2236        stimes[0]/cs_glob_n_ranks, mntimes[0], mxtimes[0],
2237        stimes[1]/cs_glob_n_ranks, mntimes[1], mxtimes[1]);
2238   }
2239 #endif
2240 }
2241 
2242 /*----------------------------------------------------------------------------*/
2243 
2244 END_C_DECLS
2245