1 /*============================================================================
2 * Crystal Router parallel exchange algorithm based operations.
3 *============================================================================*/
4
5 /*
6 This file is part of Code_Saturne, a general-purpose CFD tool.
7
8 Copyright (C) 1998-2021 EDF S.A.
9
10 This program is free software; you can redistribute it and/or modify it under
11 the terms of the GNU General Public License as published by the Free Software
12 Foundation; either version 2 of the License, or (at your option) any later
13 version.
14
15 This program is distributed in the hope that it will be useful, but WITHOUT
16 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
18 details.
19
20 You should have received a copy of the GNU General Public License along with
21 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
22 Street, Fifth Floor, Boston, MA 02110-1301, USA.
23 */
24
25 /*----------------------------------------------------------------------------*/
26
27 #include "cs_defs.h"
28
29 /*----------------------------------------------------------------------------
30 * Standard C library headers
31 *----------------------------------------------------------------------------*/
32
33 #include <assert.h>
34 #include <errno.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38
39 /*----------------------------------------------------------------------------
40 * Local headers
41 *----------------------------------------------------------------------------*/
42
43 #include "bft_mem.h"
44 #include "bft_error.h"
45 #include "bft_printf.h"
46
47 #include "cs_assert.h"
48 #include "cs_block_dist.h"
49 #include "cs_log.h"
50 #include "cs_timer.h"
51
52 /*----------------------------------------------------------------------------
53 * Header for the current file
54 *----------------------------------------------------------------------------*/
55
56 #include "cs_crystal_router.h"
57
58 /*----------------------------------------------------------------------------*/
59
60 BEGIN_C_DECLS
61
62 /*=============================================================================
63 * Additional Doxygen documentation
64 *============================================================================*/
65
66 /*!
67 \file cs_crystal_router.c
68 Crystal Router parallel exchange algorithm based operations.
69
70 The Crystal Router is described in \cite Fox:1988 .
71
72 \typedef cs_crystal_router_t
73 Opaque Crystal Router distribution structure
74
75 \par Using flags
76 \parblock
77
78 Flags are defined as a sum (bitwise or) of constants, which may include
79 \ref CS_CRYSTAL_ROUTER_USE_DEST_ID, \ref CS_CRYSTAL_ROUTER_ADD_SRC_ID,
80 and \ref CS_CRYSTAL_ROUTER_ADD_SRC_RANK.
81
82 \endparblock
83
84 */
85
86 /*! \cond DOXYGEN_SHOULD_SKIP_THIS */
87
88 /*=============================================================================
89 * Macro definitions
90 *============================================================================*/
91
92 #define _CR_DEBUG_DUMP 0
93
94 /* Data elements in crystal router */
95
96 typedef enum {
97
98 CS_CRYSTAL_ROUTER_DEST_ID,
99 CS_CRYSTAL_ROUTER_SRC_ID,
100 CS_CRYSTAL_ROUTER_DATA
101
102 } cs_crystal_router_data_t;
103
104 /*=============================================================================
105 * Local type definitions
106 *============================================================================*/
107
108 #if defined(HAVE_MPI)
109
110 /* Crystal router management structure */
111
112 struct _cs_crystal_router_t { /* Crystal router information */
113
114 cs_datatype_t datatype; /* associated datatype */
115 int flags; /* ordering and metadata flags */
116
117 size_t stride; /* stride if strided, 0 otherwise */
118
119 size_t dest_id_shift; /* starting byte for destination id */
120 size_t src_id_shift; /* starting byte for source id */
121 size_t n_vals_shift; /* starting byte for element count
122 (for indexed cases) */
123 size_t elt_shift; /* starting byte for element data */
124
125 size_t elt_size; /* element size */
126 size_t comp_size; /* composite metadata + element size if
127 strided, metadata size otherwise */
128
129 size_t dest_id_end; /* maximum destination id + 1 */
130 size_t n_elts[2]; /* number of elements in partition */
131 size_t n_vals[2]; /* number of data values in partition */
132 size_t buffer_size[2]; /* buffer size values */
133 size_t buffer_size_max[2]; /* max buffer size values reached */
134 size_t alloc_tot_max; /* max total allocation reached */
135 unsigned char *buffer[2];
136
137 MPI_Comm comm; /* associated MPI communicator */
138 MPI_Datatype mpi_type; /* Associated MPI datatype */
139 size_t mpi_type_size; /* Associated MPI datatype size */
140 int rank_id; /* local rank id in comm */
141 int n_ranks; /* comm size */
142
143 };
144
145 #endif /* defined(HAVE_MPI) */
146
147 /*============================================================================
148 * Static global variables
149 *============================================================================*/
150
151 /* Call counter and timers: 0: total, 1: communication */
152
153 static size_t _cr_calls = 0;
154 static cs_timer_counter_t _cr_timers[2];
155
156 const float _realloc_f_threshold = 0.7;
157
158 /*============================================================================
159 * Local function defintions
160 *============================================================================*/
161
162 #if defined(HAVE_MPI)
163
164 /*----------------------------------------------------------------------------
165 * Return Crystal Router datatype count matching a given number of values.
166 *
167 * parameters:
168 * cr <-- associated crystal router structure
169 * n_elts <-- number of elements
170 * n_vals <-- number of associated values (used only for indexed cases)
171 *
172 * returns:
173 * count of associated datatype elements for matching buffer size
174 *---------------------------------------------------------------------------*/
175
176 static size_t
_comm_type_count(cs_crystal_router_t * cr,cs_lnum_t n_elts,cs_lnum_t n_vals)177 _comm_type_count(cs_crystal_router_t *cr,
178 cs_lnum_t n_elts,
179 cs_lnum_t n_vals)
180 {
181 size_t retval;
182
183 if (cr->n_vals_shift == 0) { /* strided */
184 retval = n_elts;
185 }
186 else { /* indexed */
187 size_t n = n_elts*cr->comp_size + n_vals*cr->elt_size;
188 retval = n / cr->mpi_type_size;
189 }
190
191 return retval;
192 }
193
194 /*----------------------------------------------------------------------------
195 * Return number of values matching a given number of elements and MPI
196 * datatype values.
197 *
198 * parameters:
199 * cr <-- associated crystal router structure
200 * n_elts <-- number of elements
201 * n_mpi_vals <-- number of MPI element
202 *
203 * returns:
204 * count of associated datatype elements for matching buffer size
205 *---------------------------------------------------------------------------*/
206
207 static size_t
_comm_type_count_to_n_vals(cs_crystal_router_t * cr,cs_lnum_t n_elts,int n_mpi_elts)208 _comm_type_count_to_n_vals(cs_crystal_router_t *cr,
209 cs_lnum_t n_elts,
210 int n_mpi_elts)
211 {
212 size_t retval;
213
214 if (cr->n_vals_shift == 0) { /* strided */
215 retval = n_elts * cr->stride;
216 }
217 else { /* indexed */
218 size_t n = n_mpi_elts*cr->mpi_type_size - n_elts*cr->comp_size;
219 retval = n / cr->elt_size;
220 }
221
222 return retval;
223 }
224
225 /*----------------------------------------------------------------------------
226 * Return Crystal Router datatype count matching a given number of values.
227 *
228 * parameters:
229 * cr <-- associated crystal router structure
230 * n_elts <-- number of elements
231 * n_vals <-- number of associated values (used only for indexed cases)
232 *
233 * returns:
234 * count of associated datatype elements for matching buffer size
235 *---------------------------------------------------------------------------*/
236
237 static inline size_t
_data_size(cs_crystal_router_t * cr,cs_lnum_t n_elts,cs_lnum_t n_vals)238 _data_size(cs_crystal_router_t *cr,
239 cs_lnum_t n_elts,
240 cs_lnum_t n_vals)
241 {
242 size_t retval;
243
244 if (cr->n_vals_shift == 0) /* strided */
245 retval = n_elts * cr->comp_size;
246 else /* indexed */
247 retval = n_elts*cr->comp_size + n_vals*cr->elt_size;
248
249 return retval;
250 }
251
252 /*----------------------------------------------------------------------------
253 * Allocation and base settings for a crystal router.
254 *
255 * parameters:
256 * n_elts <-- number of elements
257 * stride <-- number of values per entity (interlaced)
258 * datatype <-- type of data considered
259 * flags <-- ordering and metadata flags
260 * comm <-- associated MPI communicator
261 *---------------------------------------------------------------------------*/
262
263 static cs_crystal_router_t *
_crystal_create(size_t n_elts,int flags,MPI_Comm comm)264 _crystal_create(size_t n_elts,
265 int flags,
266 MPI_Comm comm)
267 {
268 int rank_id, n_ranks;
269 cs_crystal_router_t *cr = NULL;
270
271 const size_t align_size = sizeof(cs_lnum_t);
272
273 /* Communicator info */
274
275 MPI_Comm_rank(comm, &rank_id);
276 MPI_Comm_size(comm, &n_ranks);
277
278 /* Allocate structure */
279
280 BFT_MALLOC(cr, 1, cs_crystal_router_t);
281
282 cr->flags = flags;
283
284 cr->stride = 0;
285 cr->elt_size = 0;
286 cr->comp_size = 0;
287 cr->dest_id_end = 0;
288 cr->n_elts[0] = n_elts;
289 cr->n_elts[1] = 0;
290
291 cr->comm = comm;
292 cr->rank_id = rank_id;
293 cr->n_ranks = n_ranks;
294
295 /* Compute data size and alignment */
296
297 cr->dest_id_shift = sizeof(int);
298
299 if (cr->flags & CS_CRYSTAL_ROUTER_ADD_SRC_RANK)
300 cr->dest_id_shift += sizeof(int);
301
302 if (cr->flags & CS_CRYSTAL_ROUTER_USE_DEST_ID)
303 cr->src_id_shift = cr->dest_id_shift + sizeof(cs_lnum_t);
304 else
305 cr->src_id_shift = cr->dest_id_shift;
306
307 if (cr->flags & CS_CRYSTAL_ROUTER_ADD_SRC_ID)
308 cr->elt_shift = cr->src_id_shift + cs_datatype_size[CS_LNUM_TYPE];
309 else
310 cr->elt_shift = cr->src_id_shift;
311
312 if (cr->elt_shift % align_size)
313 cr->elt_shift += align_size - (cr->elt_shift % align_size);
314
315 cr->n_vals_shift = 0;
316
317 for (int i = 0; i < 2; i++) {
318 cr->n_vals[i] = 0;
319 cr->buffer_size[i] = 0;
320 cr->buffer_size_max[i] = 0;
321 cr->buffer[i] = NULL;
322 }
323
324 cr->alloc_tot_max = 0;
325
326 return cr;
327 }
328
329 /*----------------------------------------------------------------------------
330 * First stage of creation for a crystal router for strided data.
331 *
332 * parameters:
333 * n_elts <-- number of elements
334 * stride <-- number of values per entity (interlaced)
335 * datatype <-- type of data considered
336 * flags <-- ordering and metadata flags
337 * comm <-- associated MPI communicator
338 *---------------------------------------------------------------------------*/
339
340 static cs_crystal_router_t *
_crystal_create_meta_s(size_t n_elts,int stride,cs_datatype_t datatype,int flags,MPI_Comm comm)341 _crystal_create_meta_s(size_t n_elts,
342 int stride,
343 cs_datatype_t datatype,
344 int flags,
345 MPI_Comm comm)
346 {
347 /* Allocate structure */
348
349 cs_crystal_router_t *cr = _crystal_create(n_elts, flags, comm);
350
351 size_t elt_size = cs_datatype_size[datatype]*stride;
352 size_t align_size = sizeof(cs_lnum_t);
353
354 cr->datatype = (stride > 0) ? datatype : CS_DATATYPE_NULL;
355
356 cr->stride = (stride > 0) ? stride : 1;
357 cr->elt_size = elt_size;
358
359 /* Compute data size and alignment */
360
361 cr->comp_size = cr->elt_shift + elt_size;
362
363 if (elt_size % align_size)
364 cr->comp_size += align_size - (elt_size % align_size);;
365
366 /* Create associated MPI datatype */
367
368 cr->mpi_type_size = cr->comp_size;
369
370 MPI_Type_contiguous(cr->mpi_type_size, MPI_BYTE, &(cr->mpi_type));
371 MPI_Type_commit(&(cr->mpi_type));
372
373 /* Allocate buffers */
374
375 cr->buffer_size[0] = n_elts*cr->comp_size;
376 cr->buffer_size[1] = 0;
377 BFT_MALLOC(cr->buffer[0], cr->buffer_size[0], unsigned char);
378 memset(cr->buffer[0], 0, cr->buffer_size[0]);
379 cr->buffer[1] = NULL;
380
381 cr->buffer_size_max[0] = cr->buffer_size[0];
382 cr->buffer_size_max[1] = 0;
383
384 cr->alloc_tot_max = cr->buffer_size_max[0];
385
386 return cr;
387 }
388
389 /*----------------------------------------------------------------------------
390 * First stage of creation for a crystal router for indexed data.
391 *
392 * parameters:
393 * n_elts <-- number of elements
394 * datatype <-- type of data considered
395 * elt_idx <-- element values start and past-the-last index
396 * src_id <-- optional parent element id (elt_idx indirection), or NULL
397 * flags <-- ordering and metadata flags
398 * comm <-- associated MPI communicator
399 *---------------------------------------------------------------------------*/
400
401 static cs_crystal_router_t *
_crystal_create_meta_i(size_t n_elts,cs_datatype_t datatype,const cs_lnum_t * elt_idx,const cs_lnum_t * src_id,int flags,MPI_Comm comm)402 _crystal_create_meta_i(size_t n_elts,
403 cs_datatype_t datatype,
404 const cs_lnum_t *elt_idx,
405 const cs_lnum_t *src_id,
406 int flags,
407 MPI_Comm comm)
408 {
409 assert(datatype != CS_DATATYPE_NULL);
410
411 /* Allocate structure */
412
413 cs_crystal_router_t *cr = _crystal_create(n_elts, flags, comm);
414
415 size_t elt_size = cs_datatype_size[datatype];
416 size_t align_size = sizeof(cs_lnum_t);
417
418 cr->datatype = datatype;
419 cr->elt_size = elt_size;
420
421 /* Compute data size and alignment */
422
423 cr->n_vals_shift = cr->elt_shift;
424 cr->elt_shift = cr->n_vals_shift + cs_datatype_size[CS_LNUM_TYPE];
425
426 if (cr->elt_shift < cr->elt_size)
427 cr->elt_shift = cr->elt_size;
428
429 if (cr->elt_shift % align_size)
430 cr->elt_shift += align_size - (cr->elt_shift % align_size);
431
432 cr->comp_size = cr->elt_shift;
433
434 /* Create associated MPI datatype */
435
436 cr->mpi_type_size = CS_MIN(cr->comp_size, cr->elt_size);
437 while (cr->comp_size % cr->mpi_type_size || cr->elt_size % cr->mpi_type_size)
438 cr->mpi_type_size--;
439
440 MPI_Type_contiguous(cr->mpi_type_size, MPI_BYTE, &(cr->mpi_type));
441 MPI_Type_commit(&(cr->mpi_type));
442
443 /* Allocate buffers */
444
445 if (src_id == NULL)
446 cr->n_vals[0] = elt_idx[n_elts];
447 else {
448 cr->n_vals[0] = n_elts*cr->comp_size;
449 for (size_t i = 0; i < n_elts; i++) {
450 size_t j = src_id[i];
451 cr->n_vals[0] += elt_idx[j+1] - elt_idx[j];
452 }
453 }
454 cr->n_vals[1] = 0;
455 cr->buffer_size[0] = n_elts*cr->comp_size + cr->n_vals[0]*elt_size;
456 cr->buffer_size[1] = 0;
457 BFT_MALLOC(cr->buffer[0], cr->buffer_size[0], unsigned char);
458 memset(cr->buffer[0], 0, cr->buffer_size[0]);
459 cr->buffer[1] = NULL;
460
461 cr->buffer_size_max[0] = cr->buffer_size[0];
462 cr->buffer_size_max[1] = 0;
463
464 cr->alloc_tot_max = cr->buffer_size_max[0];
465
466 return cr;
467 }
468
469 #if _CR_DEBUG_DUMP /* _dump functions for debugging */
470
471 /*----------------------------------------------------------------------------
472 * Dump data element associated with a Crystal Router.
473 *
474 * parameters:
475 * <-- cr pointer to associated Crystal Router
476 * <-- e pointer to element
477 *----------------------------------------------------------------------------*/
478
479 static void
_dump_e(const cs_crystal_router_t * cr,const void * e)480 _dump_e(const cs_crystal_router_t *cr,
481 const void *e)
482 {
483 switch (cr->datatype) {
484
485 case CS_FLOAT:
486 {
487 const float *v = (const float *)e;
488 bft_printf(" %g", (double)(*v));
489 }
490 break;
491 case CS_DOUBLE:
492 {
493 const double *v = (const double *)e;
494 bft_printf(" %g", (*v));
495 }
496 break;
497 case CS_LNUM_TYPE:
498 {
499 const cs_lnum_t *v = (const cs_lnum_t *)e;
500 bft_printf(" %d", (int)(*v));
501 }
502 break;
503 case CS_GNUM_TYPE:
504 {
505 const cs_gnum_t *v = (const cs_gnum_t *)e;
506 bft_printf(" %llu", (unsigned long long)(*v));
507 }
508 break;
509 default:
510 {
511 const unsigned char *pe = (const unsigned char *)e;
512 size_t elt_size = cs_datatype_size[cr->datatype];
513 if (cr->elt_size > 0)
514 bft_printf(" %x", (int)pe[0]);
515 for (size_t k = 1; k < elt_size; k++)
516 bft_printf(":%x", (int)pe[k]);
517 }
518 }
519 }
520
521 /*----------------------------------------------------------------------------
522 * Dump data elements associated with a strided Crystal Router.
523 *
524 * parameters:
525 * <-- cr pointer to associated Crystal Router
526 * <-- comment associated comment
527 *----------------------------------------------------------------------------*/
528
529 static void
_dump_s(const cs_crystal_router_t * cr,const char * comment)530 _dump_s(const cs_crystal_router_t *cr,
531 const char *comment)
532 {
533 if (comment != NULL)
534 bft_printf("\n%s\n", comment);
535
536 size_t elt_size = cr->elt_size;
537 if (cr->stride > 0)
538 elt_size = cr->elt_size/cr->stride;
539
540 for (int ip = 0; ip < 2; ip++) {
541
542 const size_t n_elts = cr->n_elts[ip];
543
544 if (n_elts > 0)
545 bft_printf("crystal router partition %d: %d elements "
546 "(stride %d, size %d)\n",
547 ip, (int)n_elts, (int)cr->stride, (int)elt_size);
548
549 /* Extract data */
550
551 for (size_t i = 0; i < n_elts; i++) {
552
553 unsigned const char *p_s = cr->buffer[ip] + i*cr->comp_size;
554
555 const int *cr_dest_rank = (const int *)(p_s);
556
557 bft_printf(" %d\n"
558 " dest_rank: %d\n", (int)i, *cr_dest_rank);
559
560 if (cr->flags & CS_CRYSTAL_ROUTER_ADD_SRC_RANK) {
561 const int *cr_src_rank = (const int *)(p_s + sizeof(int));
562 bft_printf(" src_rank: %d\n", (int)(*cr_src_rank));
563 }
564
565 if (cr->flags & CS_CRYSTAL_ROUTER_USE_DEST_ID) {
566 const cs_lnum_t *cr_dest_id
567 = (const cs_lnum_t *)(p_s + cr->dest_id_shift);
568 bft_printf(" dest_id: %d\n", (int)(*cr_dest_id));
569 }
570
571 if (cr->flags & CS_CRYSTAL_ROUTER_ADD_SRC_ID) {
572 const cs_lnum_t *cr_src_id
573 = (const cs_lnum_t *)(p_s + cr->src_id_shift);
574 bft_printf(" src_id: %d\n", (int)(*cr_src_id));
575 }
576
577 /* Data */
578
579 if (cr->elt_size > 0) {
580 unsigned const char *pe = p_s + cr->elt_shift;
581 bft_printf(" data :");
582 for (size_t j = 0; j < cr->stride; j++)
583 _dump_e(cr, pe + elt_size*j);
584 }
585 bft_printf("\n");
586 bft_printf_flush();
587
588 }
589 }
590 }
591
592 /*----------------------------------------------------------------------------
593 * Dump data elements associated with an indexed Crystal Router.
594 *
595 * parameters:
596 * <-- cr pointer to associated Crystal Router
597 * <-- comment associated comment
598 *----------------------------------------------------------------------------*/
599
600 static void
_dump_i(const cs_crystal_router_t * cr,const char * comment)601 _dump_i(const cs_crystal_router_t *cr,
602 const char *comment)
603 {
604 if (comment != NULL)
605 bft_printf("\n%s\n", comment);
606
607 for (int ip = 0; ip < 2; ip++) {
608
609 const size_t n_elts = cr->n_elts[ip];
610
611 if (n_elts > 0)
612 bft_printf("crystal router partition %d: %d elements\n",
613 ip, (int)n_elts);
614
615 unsigned const char *p_s = cr->buffer[ip];
616
617 /* Extract data */
618
619 cs_lnum_t s_idx = 0;
620
621 for (size_t i = 0; i < n_elts; i++) {
622
623 const int *cr_dest_rank = (const int *)(p_s);
624
625 bft_printf(" %d\n"
626 " dest_rank: %d\n", (int)i, *cr_dest_rank);
627
628 if (cr->flags & CS_CRYSTAL_ROUTER_ADD_SRC_RANK) {
629 const int *cr_src_rank = (const int *)(p_s + sizeof(int));
630 bft_printf(" src_rank: %d\n", (int)(*cr_src_rank));
631 }
632
633 if (cr->flags & CS_CRYSTAL_ROUTER_USE_DEST_ID) {
634 const cs_lnum_t *cr_dest_id
635 = (const cs_lnum_t *)(p_s + cr->dest_id_shift);
636 bft_printf(" dest_id: %d\n", (int)(*cr_dest_id));
637 }
638
639 if (cr->flags & CS_CRYSTAL_ROUTER_ADD_SRC_ID) {
640 const cs_lnum_t *cr_src_id
641 = (const cs_lnum_t *)(p_s + cr->src_id_shift);
642 bft_printf(" src_id: %d\n", (int)(*cr_src_id));
643 }
644
645 /* Number of elements and optional data index */
646
647 const cs_lnum_t *pn = (const cs_lnum_t *)(p_s + cr->n_vals_shift);
648 const cs_lnum_t n_sub = *pn;
649 const size_t _sub_size = cr->elt_size*n_sub;
650
651 bft_printf(" data index: %d %d\n"
652 " data : ", (int)s_idx, (int)(s_idx+n_sub));
653
654 /* Data */
655
656 unsigned const char *pe = p_s + cr->elt_shift;
657 for (cs_lnum_t j = 0; j < n_sub; j++)
658 _dump_e(cr, pe + cr->elt_size*j);
659 bft_printf("\n");
660 bft_printf_flush();
661
662 s_idx += n_sub;
663 p_s += cr->comp_size + _sub_size;
664
665 }
666 }
667 }
668
669 /*----------------------------------------------------------------------------
670 * Dump data elements associated with a Crystal Router.
671 *
672 * parameters:
673 * <-- cr pointer to associated Crystal Router
674 * <-- comment associated comment
675 *----------------------------------------------------------------------------*/
676
677 static void
_dump(const cs_crystal_router_t * cr,const char * comment)678 _dump(const cs_crystal_router_t *cr,
679 const char *comment)
680 {
681 if (cr->n_vals_shift == 0)
682 _dump_s(cr, comment);
683 else
684 _dump_i(cr, comment);
685 }
686
687 #endif /* _dump functions for debugging */
688
689 /*----------------------------------------------------------------------------
690 * Partition strided data for exchange with a crystal router.
691 *
692 * parameters:
693 * cr <-> associated crystal router structure
694 * send_id <-> id of "send" buffer" (1 if low = buf0, 0 if low = buf1)
695 * cutoff <-- cutoff rank
696 *---------------------------------------------------------------------------*/
697
698 static void
_crystal_partition_strided(cs_crystal_router_t * cr,int send_id,int cutoff)699 _crystal_partition_strided(cs_crystal_router_t *cr,
700 int send_id,
701 int cutoff)
702 {
703 cs_lnum_t i;
704
705 cs_lnum_t n0 = 0, n1 = 0;
706
707 const cs_lnum_t n = cr->n_elts[0];
708 const int id0 = (send_id + 1) % 2;
709 const int id1 = send_id;
710 const size_t comp_size = cr->comp_size;
711
712 assert(send_id == 0 || send_id == 1);
713
714 if ( cr->buffer_size[1] < cr->buffer_size[0]
715 || cr->buffer_size[1] > cr->buffer_size[0]*_realloc_f_threshold) {
716 cr->buffer_size[1] = cr->buffer_size[0];
717 BFT_REALLOC(cr->buffer[1], cr->buffer_size[1], unsigned char);
718 if (cr->buffer_size[1] > cr->buffer_size_max[1])
719 cr->buffer_size_max[1] = cr->buffer_size[1];
720 size_t alloc_tot = cr->buffer_size[0] + cr->buffer_size[1];
721 if (alloc_tot > cr->alloc_tot_max)
722 cr->alloc_tot_max = alloc_tot;
723 }
724
725 if (id0 == 0) {
726 for (i = 0; i < n; i++) {
727 unsigned char *src = cr->buffer[0] + i*comp_size;
728 int *r = (int *)src;
729 if (r[0] >= cutoff)
730 break;
731 }
732 n0 = i;
733 }
734 else {
735 for (i = 0; i < n; i++) {
736 unsigned char *src = cr->buffer[0] + i*comp_size;
737 int *r = (int *)src;
738 if (r[0] < cutoff)
739 break;
740 }
741 n1 = i;
742 }
743
744 while (i < n) {
745 unsigned char *src = cr->buffer[0] + i*comp_size;
746 int *r = (int *)src;
747 if (r[0] < cutoff) {
748 unsigned char *dest = cr->buffer[id0] + n0*comp_size;
749 memcpy(dest, src, comp_size);
750 n0++;
751 }
752 else {
753 unsigned char *dest = cr->buffer[id1] + n1*comp_size;
754 memcpy(dest, src, comp_size);
755 n1++;
756 }
757 i++;
758 }
759
760 assert(n0 + n1 == n);
761
762 cr->n_elts[id0] = n0;
763 cr->n_elts[id1] = n1;
764
765 cr->n_vals[id0] = n0*cr->stride;
766 cr->n_vals[id1] = n1*cr->stride;
767 }
768
769 /*----------------------------------------------------------------------------
770 * Partition indexed data for exchange with a crystal router.
771 *
772 * parameters:
773 * cr <-> associated crystal router structure
774 * send_id <-> id of "send" buffer" (1 if low = buf0, 0 if low = buf1)
775 * cutoff <-- cutoff rank
776 *---------------------------------------------------------------------------*/
777
778 static void
_crystal_partition_indexed(cs_crystal_router_t * cr,int send_id,int cutoff)779 _crystal_partition_indexed(cs_crystal_router_t *cr,
780 int send_id,
781 int cutoff)
782 {
783 cs_lnum_t i;
784
785 cs_lnum_t n0 = 0, n1 = 0;
786
787 const cs_lnum_t n = cr->n_elts[0];
788 const int id0 = (send_id + 1) % 2;
789 const int id1 = send_id;
790 const size_t comp_size = cr->comp_size;
791 const size_t elt_size = cr->elt_size;
792 const size_t n_vals_shift = cr->n_vals_shift;
793
794 assert(send_id == 0 || send_id == 1);
795
796 unsigned char *src = cr->buffer[0];
797
798 size_t r0_shift = 0;
799 size_t r1_shift = 0;
800
801 if (id0 == 0) {
802 for (i = 0; i < n; i++) {
803 int *r = (int *)src;
804 cs_lnum_t *n_sub = (cs_lnum_t *)(src + n_vals_shift);
805 size_t sub_size = comp_size + n_sub[0]*elt_size;
806 if (r[0] >= cutoff)
807 break;
808 r0_shift += sub_size;
809 src += sub_size;
810 }
811 n0 = i;
812 }
813 else {
814 for (i = 0; i < n; i++) {
815 int *r = (int *)src;
816 cs_lnum_t *n_sub = (cs_lnum_t *)(src + n_vals_shift);
817 size_t sub_size = comp_size + n_sub[0]*elt_size;
818 if (r[0] < cutoff)
819 break;
820 r1_shift += sub_size;
821 src += sub_size;
822 }
823 n1 = i;
824 }
825
826 size_t r0_end = r0_shift;
827 size_t r1_end = r1_shift;
828 cs_lnum_t i_start = i;
829 unsigned char *src_start = src;
830
831 /* Count to minimize memory allocation
832 (extra reads, but hopefully a good tradeoff) */
833
834 while (i < n) {
835 int *r = (int *)src;
836 cs_lnum_t *n_sub = (cs_lnum_t *)(src + n_vals_shift);
837 size_t sub_size = comp_size + n_sub[0]*elt_size;
838 if (r[0] < cutoff) {
839 r0_end += sub_size;
840 }
841 else {
842 r1_end += sub_size;
843 }
844 i++;
845 src += sub_size;
846 }
847
848 cr->buffer_size[1] = (id0 == 0) ? r1_end : r0_end;
849 BFT_REALLOC(cr->buffer[1], cr->buffer_size[1], unsigned char);
850 if (cr->buffer_size[1] > cr->buffer_size_max[1])
851 cr->buffer_size_max[1] = cr->buffer_size[1];
852 size_t alloc_tot = cr->buffer_size[0] + cr->buffer_size[1];
853 if (alloc_tot > cr->alloc_tot_max)
854 cr->alloc_tot_max = alloc_tot;
855
856 i = i_start;
857 src = src_start;
858
859 while (i < n) {
860 int *r = (int *)src;
861 cs_lnum_t *n_sub = (cs_lnum_t *)(src + n_vals_shift);
862 size_t sub_size = comp_size + n_sub[0]*elt_size;
863 if (r[0] < cutoff) {
864 unsigned char *dest = cr->buffer[id0] + r0_shift;
865 for (size_t j = 0; j < sub_size; j++)
866 dest[j] = src[j];
867 r0_shift += sub_size;
868 n0++;
869 }
870 else {
871 unsigned char *dest = cr->buffer[id1] + r1_shift;
872 for (size_t j = 0; j < sub_size; j++)
873 dest[j] = src[j];
874 r1_shift += sub_size;
875 n1++;
876 }
877 i++;
878 src += sub_size;
879 }
880
881 assert(n0 + n1 == n);
882
883 cr->n_elts[id0] = n0;
884 cr->n_elts[id1] = n1;
885
886 cr->n_vals[id0] = (r0_shift - n0*comp_size)/elt_size;
887 cr->n_vals[id1] = (r1_shift - n1*comp_size)/elt_size;
888 }
889
890 /*----------------------------------------------------------------------------
891 * Send and receive data with a crystal router for one stage.
892 *
893 * parameters:
894 * cr <-> associated crystal router structure
895 * target <-- base target rank to send/receive from
896 * n_recv <-- number of ranks to receive from
897 *---------------------------------------------------------------------------*/
898
899 static void
_crystal_sendrecv(cs_crystal_router_t * cr,int target,int n_recv)900 _crystal_sendrecv(cs_crystal_router_t *cr,
901 int target,
902 int n_recv)
903 {
904 cs_timer_t t0 = cs_timer_time();
905
906 assert(n_recv <= 2);
907
908 cs_lnum_t send_size[2];
909 uint64_t test_size;
910
911 MPI_Status status[3];
912 MPI_Request request[3] = {MPI_REQUEST_NULL,
913 MPI_REQUEST_NULL,
914 MPI_REQUEST_NULL};
915 cs_lnum_t recv_size[4] = {0, 0, 0, 0};
916
917 /* Send message to target process */
918
919 test_size = _comm_type_count(cr, cr->n_elts[1], cr->n_vals[1]);
920
921 send_size[0] = cr->n_elts[1];
922 send_size[1] = test_size;
923
924 if ((uint64_t)send_size[1] != test_size)
925 bft_error(__FILE__, __LINE__, 0,
926 "Crystal router:"
927 " Message to send would have size too large for C int: %llu",
928 (unsigned long long)test_size);
929
930 MPI_Isend(&send_size, 2, CS_MPI_LNUM, target, cr->rank_id,
931 cr->comm, &request[0]);
932
933 for (int i = 0; i < n_recv; i++)
934 MPI_Irecv(recv_size+i*2, 2, CS_MPI_LNUM, target+i, target+i,
935 cr->comm, request+i+1);
936
937 MPI_Waitall(n_recv + 1, request, status);
938
939 size_t loc_size = _data_size(cr, cr->n_elts[0], cr->n_vals[0]);
940 for (int i = 0; i < n_recv; i++)
941 loc_size += _data_size(cr, recv_size[i*2], recv_size[i*2+1]);
942 if ( loc_size > cr->buffer_size[0]
943 || loc_size < cr->buffer_size[0]*_realloc_f_threshold) {
944 cr->buffer_size[0] = loc_size;
945 BFT_REALLOC(cr->buffer[0], cr->buffer_size[0], unsigned char);
946 if (cr->buffer_size[0] > cr->buffer_size_max[0])
947 cr->buffer_size_max[0] = cr->buffer_size[0];
948 size_t alloc_tot = cr->buffer_size[0] + cr->buffer_size[1];
949 if (alloc_tot > cr->alloc_tot_max)
950 cr->alloc_tot_max = alloc_tot;
951 }
952
953 MPI_Isend(cr->buffer[1], send_size[1], cr->mpi_type,
954 target, cr->rank_id, cr->comm, request);
955
956 cr->n_elts[1] = 0;
957
958 for (int i = 0; i < n_recv; i++) {
959
960 unsigned char *r_ptr = cr->buffer[0]
961 + _data_size(cr, cr->n_elts[0], cr->n_vals[0]);
962
963 MPI_Irecv(r_ptr, recv_size[i*2 + 1], cr->mpi_type,
964 target+i, target+i, cr->comm, request+1+i);
965
966 cr->n_elts[0] += recv_size[i*2];
967 cr->n_vals[0] += _comm_type_count_to_n_vals(cr,
968 recv_size[i*2],
969 recv_size[i*2 + 1]);
970
971 }
972
973 MPI_Waitall(n_recv+1, request, status);
974
975 cs_timer_t t1 = cs_timer_time();
976 cs_timer_counter_add_diff(_cr_timers + 1, &t0, &t1);
977 }
978
979 /*----------------------------------------------------------------------------
980 * Get destination id and associated data index for an indexed Crystal Router.
981 *
982 * If the destination id info is not provided in the Crystal Router,
983 * it is assumed the given array is already valid, so only the index is
984 * computed. If it is provided, the dest_id array is optional.
985 *
986 * parameters:
987 * <-- cr pointer to associated Crystal Router
988 * <-> dest_id pointer to destination id array, or NULL
989 * --> data_idx pointer to data index
990 *----------------------------------------------------------------------------*/
991
992 static void
_get_data_index_with_dest_id(cs_crystal_router_t * cr,cs_lnum_t dest_id[],cs_lnum_t data_idx[])993 _get_data_index_with_dest_id(cs_crystal_router_t *cr,
994 cs_lnum_t dest_id[],
995 cs_lnum_t data_idx[])
996 {
997 const size_t n_elts = cr->n_elts[0];
998
999 const unsigned char *p_s = cr->buffer[0];
1000
1001 /* Extract _dest_id first, to use in other loops */
1002
1003 if (cr->flags & CS_CRYSTAL_ROUTER_USE_DEST_ID) {
1004
1005 if (dest_id != NULL) {
1006
1007 cs_lnum_t max_id = -1; /* id for added elements */
1008
1009 for (size_t i = 0; i < n_elts; i++) {
1010 const cs_lnum_t *cr_dest_id_p
1011 = (const cs_lnum_t *)(p_s + cr->dest_id_shift);
1012 dest_id[i] = *cr_dest_id_p;
1013 const cs_lnum_t *pn = (const cs_lnum_t *)(p_s + cr->n_vals_shift);
1014 data_idx[*cr_dest_id_p + 1] = *pn;
1015 p_s += cr->comp_size + cr->elt_size*(*pn);
1016 if (*cr_dest_id_p > max_id)
1017 max_id = *cr_dest_id_p;
1018 }
1019
1020 max_id += 1;
1021
1022 cr->dest_id_end = max_id;
1023
1024 }
1025 else {
1026
1027 for (size_t i = 0; i < n_elts; i++) {
1028 const cs_lnum_t *cr_dest_id_p
1029 = (const cs_lnum_t *)(p_s + cr->dest_id_shift);
1030 const cs_lnum_t *pn = (const cs_lnum_t *)(p_s + cr->n_vals_shift);
1031 data_idx[*cr_dest_id_p + 1] = *pn;
1032 p_s += cr->comp_size + cr->elt_size*(*pn);
1033 }
1034
1035 }
1036
1037 }
1038
1039 else {
1040
1041 for (size_t i = 0; i < n_elts; i++) {
1042 const cs_lnum_t *pn = (const cs_lnum_t *)(p_s + cr->n_vals_shift);
1043 data_idx[dest_id[i] + 1] = *pn;
1044 p_s += cr->comp_size + cr->elt_size*(*pn);
1045 }
1046
1047 }
1048
1049 /* Transform count to index */
1050
1051 data_idx[0] = 0;
1052 for (size_t i = 0; i < n_elts; i++)
1053 data_idx[i+1] += data_idx[i];
1054 }
1055
1056 /*----------------------------------------------------------------------------
1057 * Get data elements associated with a strided Crystal Router.
1058 *
1059 * This variant assumes no destination id is provided, so data is
1060 * copied in Crystal Router buffer order.
1061 *
1062 * parameters:
1063 * <-- cr pointer to associated Crystal Router
1064 * --> src_rank pointer to source rank array, or NULL
1065 * --> src_id pointer to source id array, or NULL
1066 * --> data pointer to (pointer to) destination data, or NULL
1067 *----------------------------------------------------------------------------*/
1068
1069 static void
_get_data_s(cs_crystal_router_t * cr,int * src_rank,cs_lnum_t * src_id,void * data)1070 _get_data_s(cs_crystal_router_t *cr,
1071 int *src_rank,
1072 cs_lnum_t *src_id,
1073 void *data)
1074 {
1075 const size_t n_elts = cr->n_elts[0];
1076
1077 /* Now extract data; work with chunks to amortize some tests
1078 while not multiplying variants */
1079
1080 const size_t chunk_size = 64;
1081 const size_t n_chunks
1082 = (n_elts % chunk_size) ? n_elts/chunk_size + 1 : n_elts/chunk_size;
1083
1084 unsigned char *cr_src_rank_p = cr->buffer[0] + sizeof(int);
1085 unsigned char *cr_src_id_p = cr->buffer[0] + cr->src_id_shift;
1086 unsigned char *cr_data_p = cr->buffer[0] + cr->elt_shift;
1087
1088 unsigned char *src_rank_p = (unsigned char *)src_rank;
1089 unsigned char *src_id_p = (unsigned char *)src_id;
1090 unsigned char *data_p = (unsigned char *)data;
1091
1092 for (size_t c_id = 0; c_id < n_chunks; c_id++) {
1093
1094 size_t i0 = c_id*chunk_size;
1095 size_t i1 = (c_id+1)*chunk_size;
1096 if (i1 > n_elts)
1097 i1 = n_elts;
1098
1099 /* Optional source rank */
1100
1101 if (src_rank != NULL) {
1102 for (size_t i = i0; i < i1; i++) {
1103 for (size_t j = 0; j < sizeof(int); j++)
1104 src_rank_p[i*sizeof(int) + j]
1105 = cr_src_rank_p[i*cr->comp_size + j];
1106 }
1107 }
1108
1109 /* Optional source id */
1110
1111 if (src_id != NULL) {
1112 for (size_t i = i0; i < i1; i++) {
1113 for (size_t j = 0; j < sizeof(cs_lnum_t); j++)
1114 src_id_p[i*sizeof(cs_lnum_t) + j]
1115 = cr_src_id_p[i*cr->comp_size + j];
1116 }
1117 }
1118
1119 /* data */
1120
1121 if (data != NULL) {
1122 for (size_t i = i0; i < i1; i++) {
1123 for (size_t j = 0; j < cr->elt_size; j++)
1124 data_p[i*cr->elt_size + j]
1125 = cr_data_p[i*cr->comp_size + j];
1126 }
1127 }
1128
1129 }
1130 }
1131
1132 /*----------------------------------------------------------------------------
1133 * Get data elements associated with a strided Crystal Router.
1134 *
1135 * This variant assumes a destination id provided in the Crystal Router.
1136 *
1137 * The dest_id array is always in buffer order.
1138 *
1139 * If the general case, dest_id is applied to all the other
1140 * extracted arrays. If some ids appear multiple times, and dest_id, src_rank,
1141 * and src_id are all non-NULL (with the matching flags
1142 * CS_CRYSTAL_ROUTER_USE_DEST_ID, CS_CRYSTAL_ROUTER_ADD_SRC_ID, and
1143 * CS_CRYSTAL_ROUTER_ADD_SRC_RANK set), it is applied only to the data, and
1144 * src_rank and src_id are kept in buffer order.
1145 *
1146 * With this behavior, for reverse exchange, src_id and src_rank can be
1147 * used as dest_id and dest_rank respectively in the call to
1148 * \ref cs_crystal_router_create_s, while NULL is passed to elt_id
1149 * with single ids, and dest_id passed in case of multiple ids.
1150 *
1151 * parameters:
1152 * <-- cr pointer to associated Crystal Router
1153 * <-> dest_id pointer to destination id array, or NULL
1154 * --> src_rank pointer to source rank array, or NULL
1155 * --> src_id pointer to source id array, or NULL
1156 * --> data pointer to (pointer to) destination data, or NULL
1157 *----------------------------------------------------------------------------*/
1158
1159 static void
_get_data_s_with_dest_id(cs_crystal_router_t * cr,cs_lnum_t * dest_id,int * src_rank,cs_lnum_t * src_id,void * data)1160 _get_data_s_with_dest_id(cs_crystal_router_t *cr,
1161 cs_lnum_t *dest_id,
1162 int *src_rank,
1163 cs_lnum_t *src_id,
1164 void *data)
1165 {
1166 cs_lnum_t *_dest_id = dest_id;
1167
1168 const size_t n_elts = cr->n_elts[0];
1169
1170 /* Now extract data; work with chunks to amortize some tests
1171 while not multiplying variants */
1172
1173 const size_t chunk_size = 64;
1174 const size_t n_chunks
1175 = (n_elts % chunk_size) ? n_elts/chunk_size + 1 : n_elts/chunk_size;
1176
1177 const unsigned char *cr_data_p = cr->buffer[0] + cr->elt_shift;
1178 unsigned char *data_p = (unsigned char *)data;
1179
1180 /* If we do not have a dest_id buffer, use a local one */
1181
1182 if (_dest_id == NULL) {
1183 BFT_MALLOC(_dest_id, n_elts, cs_lnum_t);
1184 cs_assert(cr->flags & CS_CRYSTAL_ROUTER_USE_DEST_ID);
1185 }
1186
1187 /* Extract _dest_id first, to use in other loops */
1188
1189 if (cr->flags & CS_CRYSTAL_ROUTER_USE_DEST_ID) {
1190
1191 cs_lnum_t max_id = -1; /* id for added elements */
1192
1193 for (size_t i = 0; i < n_elts; i++) {
1194 const cs_lnum_t *cr_dest_id_p
1195 = (const cs_lnum_t *)( cr->buffer[0] + i*cr->comp_size
1196 + cr->dest_id_shift);
1197 _dest_id[i] = *cr_dest_id_p;
1198 if (*cr_dest_id_p > max_id)
1199 max_id = *cr_dest_id_p;
1200 }
1201 max_id += 1;
1202
1203 cr->dest_id_end = max_id;
1204
1205 }
1206
1207 /* If we have less destination ids than elements, dest_id will
1208 not be applied to src id and src rank, to avoid losing
1209 information necessary for reverse exchanges */
1210
1211 int reverse_flags = CS_CRYSTAL_ROUTER_USE_DEST_ID
1212 | CS_CRYSTAL_ROUTER_ADD_SRC_ID
1213 | CS_CRYSTAL_ROUTER_ADD_SRC_RANK;
1214
1215 bool reverse_multiple = ( ((cr->flags & reverse_flags) == reverse_flags)
1216 && cr->dest_id_end < n_elts
1217 && dest_id != NULL
1218 && src_id != NULL
1219 && src_rank != NULL);
1220
1221 for (size_t c_id = 0; c_id < n_chunks; c_id++) {
1222
1223 size_t i0 = c_id*chunk_size;
1224 size_t i1 = (c_id+1)*chunk_size;
1225 if (i1 > n_elts)
1226 i1 = n_elts;
1227
1228 /* Optional source rank */
1229
1230 if (src_rank != NULL) {
1231
1232 if (reverse_multiple) {
1233 for (size_t i = i0; i < i1; i++) {
1234 const int *cr_src_rank_p
1235 = (const int *)(cr->buffer[0] + i*cr->comp_size + sizeof(int));
1236 src_rank[i] = *cr_src_rank_p;
1237 }
1238 }
1239 else {
1240 for (size_t i = i0; i < i1; i++) {
1241 const int *cr_src_rank_p
1242 = (const int *)(cr->buffer[0] + i*cr->comp_size + sizeof(int));
1243 src_rank[_dest_id[i]] = *cr_src_rank_p;
1244 }
1245 }
1246
1247 }
1248
1249 /* Optional source id */
1250
1251 if (src_id != NULL) {
1252
1253 if (reverse_multiple) {
1254 for (size_t i = i0; i < i1; i++) {
1255 const cs_lnum_t *cr_src_id_p
1256 = (const cs_lnum_t *)( cr->buffer[0] + i*cr->comp_size
1257 + cr->src_id_shift);
1258 src_id[i] = *cr_src_id_p;
1259 }
1260 }
1261 else {
1262 for (size_t i = i0; i < i1; i++) {
1263 const cs_lnum_t *cr_src_id_p
1264 = (const cs_lnum_t *)( cr->buffer[0] + i*cr->comp_size
1265 + cr->src_id_shift);
1266 src_id[_dest_id[i]] = *cr_src_id_p;
1267 }
1268 }
1269
1270 }
1271
1272 /* data */
1273
1274 if (data != NULL) {
1275 for (size_t i = i0; i < i1; i++) {
1276 cs_lnum_t e_id = _dest_id[i];
1277 for (size_t j = 0; j < cr->elt_size; j++)
1278 data_p[e_id*cr->elt_size + j]
1279 = cr_data_p[i*cr->comp_size + j];
1280 }
1281 }
1282
1283 }
1284
1285 if (dest_id == NULL)
1286 BFT_FREE(_dest_id);
1287 }
1288
1289 /*----------------------------------------------------------------------------
1290 * Get data elements associated with an indexed Crystal Router.
1291 *
1292 * This variant assumes no destination id is provided, so data is
1293 * copied in Crystal Router buffer order.
1294 *
1295 * parameters:
1296 * <-- cr pointer to associated Crystal Router
1297 * --> src_rank pointer to source rank array, or NULL
1298 * --> src_id pointer to source id array, or NULL
1299 * --> data_idx pointer to data index, or NULL
1300 * --> data pointer to destination data, or NULL
1301 *----------------------------------------------------------------------------*/
1302
1303 static void
_get_data_i(cs_crystal_router_t * cr,int * src_rank,cs_lnum_t * src_id,cs_lnum_t * data_idx,void * data)1304 _get_data_i(cs_crystal_router_t *cr,
1305 int *src_rank,
1306 cs_lnum_t *src_id,
1307 cs_lnum_t *data_idx,
1308 void *data)
1309 {
1310 const size_t n_elts = cr->n_elts[0];
1311
1312 if (data_idx != NULL)
1313 data_idx[0] = 0;
1314
1315 cs_lnum_t s_idx = 0;
1316 unsigned char *data_p = (unsigned char *)data;
1317 unsigned const char *p_s = cr->buffer[0];
1318
1319 /* Extract data */
1320
1321 for (size_t i = 0; i < n_elts; i++) {
1322
1323 /* Optional source rank */
1324
1325 if (src_rank != NULL) {
1326 const int *cr_src_rank = (const int *)(p_s + sizeof(int));
1327 src_rank[i] = *cr_src_rank;
1328 }
1329
1330 /* Optional source id */
1331
1332 if (src_id != NULL) {
1333 const cs_lnum_t *cr_src_id = (const cs_lnum_t *)(p_s + cr->src_id_shift);
1334 src_id[i] = *cr_src_id;
1335 }
1336
1337 /* Number of elements and optional data index */
1338
1339 const cs_lnum_t *pn = (const cs_lnum_t *)(p_s + cr->n_vals_shift);
1340 const cs_lnum_t n_sub = *pn;
1341 const size_t _sub_size = cr->elt_size*n_sub;
1342
1343 if (data_idx != NULL)
1344 data_idx[i+1] = data_idx[i] + n_sub;
1345
1346 /* Data */
1347
1348 if (data_p != NULL) {
1349 unsigned const char *pe = p_s + cr->elt_shift;
1350 unsigned char *_data_p = data_p + s_idx*cr->elt_size;
1351 for (size_t j = 0; j < _sub_size; j++)
1352 _data_p[j] = pe[j];
1353 }
1354
1355 s_idx += n_sub;
1356 p_s += cr->comp_size + _sub_size;
1357
1358 }
1359 }
1360
1361 /*----------------------------------------------------------------------------
1362 * Get data elements associated with an indexed Crystal Router.
1363 *
1364 * This variant assumes a destination id is provided, along with a matching
1365 * index. The destination id may either be provided or extracted from the
1366 * data if present; but the data index should always be provided (i.e.
1367 * precomputed), as it is required for indirect writes. If the destination
1368 * ids are both provided and present in the Crystal Router, they are assumed
1369 * to agree.
1370 *
1371 * If the general case, dest_id is applied to all the other
1372 * extracted arrays. If some ids appear multiple times, and dest_id, src_rank,
1373 * and src_id are all non-NULL (with the matching flags
1374 * CS_CRYSTAL_ROUTER_USE_DEST_ID, CS_CRYSTAL_ROUTER_ADD_SRC_ID, and
1375 * CS_CRYSTAL_ROUTER_ADD_SRC_RANK set), it is applied only to the data and
1376 * index, while src_rank and src_id are kept in buffer order.
1377 *
1378 * With this behavior, for reverse exchange, src_id and src_rank can be
1379 * used as dest_id and dest_rank respectively in the call to
1380 * \ref cs_crystal_router_create_s, while NULL is passed to elt_id
1381 * with single ids, and dest_id passed in case of multiple ids.
1382 *
1383 * parameters:
1384 * <-- cr pointer to associated Crystal Router
1385 * <-- dest_id pointer to destination id array, or NULL
1386 * <-- data_idx pointer to data index
1387 * --> src_rank pointer to source rank array, or NULL
1388 * --> src_id pointer to source id array, or NULL
1389 * --> data pointer to destination data, or NULL
1390 *----------------------------------------------------------------------------*/
1391
1392 static void
_get_data_i_with_dest_id(cs_crystal_router_t * cr,const cs_lnum_t dest_id[],const cs_lnum_t data_idx[],int * src_rank,cs_lnum_t * src_id,void * data)1393 _get_data_i_with_dest_id(cs_crystal_router_t *cr,
1394 const cs_lnum_t dest_id[],
1395 const cs_lnum_t data_idx[],
1396 int *src_rank,
1397 cs_lnum_t *src_id,
1398 void *data)
1399 {
1400 const size_t n_elts = cr->n_elts[0];
1401
1402 unsigned char *data_p = (unsigned char *)data;
1403 unsigned const char *p_s = cr->buffer[0];
1404
1405 /* If we have less destination ids than elements, dest_id will
1406 not be applied to src id and src rank, to avoid losing
1407 information necessary for reverse exchanges */
1408
1409 int reverse_flags = CS_CRYSTAL_ROUTER_USE_DEST_ID
1410 | CS_CRYSTAL_ROUTER_ADD_SRC_ID
1411 | CS_CRYSTAL_ROUTER_ADD_SRC_RANK;
1412
1413 if ((cr->flags & reverse_flags) && cr->dest_id_end < n_elts
1414 && dest_id != NULL && src_id != NULL && src_rank != NULL) {
1415
1416 for (size_t i = 0; i < n_elts; i++) {
1417
1418 cs_lnum_t id = dest_id[i];
1419
1420 /* Source rank and id */
1421 const int *cr_src_rank = (const int *)(p_s + sizeof(int));
1422 src_rank[i] = *cr_src_rank;
1423
1424 const cs_lnum_t *cr_src_id = (const cs_lnum_t *)(p_s + cr->src_id_shift);
1425 src_id[i] = *cr_src_id;
1426
1427 /* Number of elements and optional data index */
1428 const cs_lnum_t *pn = (const cs_lnum_t *)(p_s + cr->n_vals_shift);
1429 const cs_lnum_t n_sub = *pn;
1430 const size_t _sub_size = cr->elt_size*n_sub;
1431 assert(n_sub == data_idx[id+1] - data_idx[id]);
1432
1433 /* Data */
1434 if (data_p != NULL) {
1435 unsigned const char *pe = p_s + cr->elt_shift;
1436 unsigned char *_data_p = data_p + data_idx[id]*cr->elt_size;
1437 for (size_t j = 0; j < _sub_size; j++)
1438 _data_p[j] = pe[j];
1439 }
1440
1441 p_s += cr->comp_size + _sub_size;
1442
1443 }
1444
1445 }
1446
1447 /* Extract data in the general case */
1448
1449 else {
1450
1451 for (size_t i = 0; i < n_elts; i++) {
1452
1453 cs_lnum_t id;
1454 if (dest_id != NULL)
1455 id = dest_id[i];
1456 else {
1457 const cs_lnum_t *cr_dest_id
1458 = (const cs_lnum_t *)(p_s + cr->dest_id_shift);
1459 id = *cr_dest_id;
1460 }
1461
1462 /* Optional source rank */
1463 if (src_rank != NULL) {
1464 const int *cr_src_rank = (const int *)(p_s + sizeof(int));
1465 src_rank[id] = *cr_src_rank;
1466 }
1467
1468 /* Optional source id */
1469 if (src_id != NULL) {
1470 const cs_lnum_t *cr_src_id = (const cs_lnum_t *)(p_s + cr->src_id_shift);
1471 src_id[id] = *cr_src_id;
1472 }
1473
1474 /* Number of elements and optional data index */
1475 const cs_lnum_t *pn = (const cs_lnum_t *)(p_s + cr->n_vals_shift);
1476 const cs_lnum_t n_sub = *pn;
1477 const size_t _sub_size = cr->elt_size*n_sub;
1478 assert(n_sub == data_idx[id+1] - data_idx[id]);
1479
1480 /* Data */
1481 if (data_p != NULL) {
1482 unsigned const char *pe = p_s + cr->elt_shift;
1483 unsigned char *_data_p = data_p + data_idx[id]*cr->elt_size;
1484 for (size_t j = 0; j < _sub_size; j++)
1485 _data_p[j] = pe[j];
1486 }
1487
1488 p_s += cr->comp_size + _sub_size;
1489
1490 }
1491
1492 }
1493 }
1494
1495 #endif /* defined(HAVE_MPI) */
1496
1497 /*! (DOXYGEN_SHOULD_SKIP_THIS) \endcond */
1498
1499 /*============================================================================
1500 * Public function definitions
1501 *============================================================================*/
1502
1503 #if defined(HAVE_MPI)
1504
1505 /*----------------------------------------------------------------------------*/
1506 /*!
1507 * \brief Create a Crystal Router for strided data.
1508 *
1509 * If the flags constant contains \ref CS_CRYSTAL_ROUTER_USE_DEST_ID,
1510 * data exchanged will be ordered by the array passed to the
1511 * \c dest_id argument. For \c n total values received on a rank
1512 * (as given by \ref cs_crystal_router_n_elts), those destination ids
1513 * must be in the range [0, \c n[.
1514 *
1515 * If the flags bit mask matches \ref CS_CRYSTAL_ROUTER_ADD_SRC_ID,
1516 * source ids are added. If it matches \ref CS_CRYSTAL_ROUTER_ADD_SRC_RANK,
1517 * source rank metadata is added.
1518 *
1519 * \param[in] n_elts number of elements
1520 * \param[in] stride number of values per entity (interlaced)
1521 * \param[in] datatype type of data considered
1522 * \param[in] flags add destination id ?
1523 * \param[in] elt element values
1524 * \param[in] src_id optional parent element id (indirection
1525 * for elt array), or NULL
1526 * \param[in] dest_id element destination id, or NULL
1527 * \param[in] dest_rank destination rank for each element
1528 * \param[in] comm associated MPI communicator
1529 *
1530 * \return pointer to new Crystal Router structure.
1531 */
1532 /*----------------------------------------------------------------------------*/
1533
1534 cs_crystal_router_t *
cs_crystal_router_create_s(size_t n_elts,int stride,cs_datatype_t datatype,int flags,const void * elt,const cs_lnum_t * src_id,const cs_lnum_t * dest_id,const int dest_rank[],MPI_Comm comm)1535 cs_crystal_router_create_s(size_t n_elts,
1536 int stride,
1537 cs_datatype_t datatype,
1538 int flags,
1539 const void *elt,
1540 const cs_lnum_t *src_id,
1541 const cs_lnum_t *dest_id,
1542 const int dest_rank[],
1543 MPI_Comm comm)
1544 {
1545 cs_timer_t t0 = cs_timer_time();
1546
1547 /* Initialize timers if required */
1548
1549 if (_cr_calls == 0) {
1550 for (int i = 0; i < 2; i++)
1551 CS_TIMER_COUNTER_INIT(_cr_timers[i]);
1552 }
1553 _cr_calls += 1;
1554
1555 unsigned const char *_elt = elt;
1556
1557 /* Allocate structure */
1558
1559 cs_crystal_router_t *cr = _crystal_create_meta_s(n_elts,
1560 stride,
1561 datatype,
1562 flags,
1563 comm);
1564 /* Copy data and some metadata */
1565
1566 const bool add_src_rank
1567 = (cr->flags & CS_CRYSTAL_ROUTER_ADD_SRC_RANK) ? true : false;
1568 const bool add_dest_id
1569 = (cr->flags & CS_CRYSTAL_ROUTER_USE_DEST_ID) ? true : false;
1570 const bool add_src_id
1571 = (cr->flags & CS_CRYSTAL_ROUTER_ADD_SRC_ID) ? true : false;
1572
1573 if (add_dest_id)
1574 cs_assert(dest_id != NULL || n_elts == 0);
1575
1576 for (size_t i = 0; i < n_elts; i++) {
1577
1578 unsigned char *p_s = cr->buffer[0] + i*cr->comp_size;
1579 unsigned char *pe = p_s + cr->elt_shift;
1580
1581 int *pr = (int *)p_s;
1582 pr[0] = dest_rank[i];
1583 if (add_src_rank) {
1584 pr[1] = cr->rank_id;
1585 }
1586
1587 if (add_dest_id) {
1588 cs_lnum_t *_cr_dest_id = (cs_lnum_t *)(p_s + cr->dest_id_shift);
1589 *_cr_dest_id = dest_id[i];
1590 }
1591
1592 size_t j = (src_id == NULL) ? i : (size_t)src_id[i];
1593
1594 if (add_src_id) {
1595 cs_lnum_t *_cr_src_id = (cs_lnum_t *)(p_s + cr->src_id_shift);
1596 *_cr_src_id = (cs_lnum_t)j;
1597 }
1598
1599 unsigned const char *_psrc = _elt + j*cr->elt_size;
1600 for (size_t k = 0; k < cr->elt_size; k++)
1601 pe[k] = _psrc[k];
1602
1603 }
1604
1605 cs_timer_t t1 = cs_timer_time();
1606 cs_timer_counter_add_diff(_cr_timers, &t0, &t1);
1607
1608 #if _CR_DEBUG_DUMP
1609 _dump(cr, "Crystal Router after creation.");
1610 #endif
1611
1612 return cr;
1613 }
1614
1615 /*----------------------------------------------------------------------------*/
1616 /*!
1617 * \brief Create a Crystal Router for indexed data.
1618 *
1619 * If the flags constant contains \ref CS_CRYSTAL_ROUTER_USE_DEST_ID,
1620 * data exchanged will be ordered by the array passed to the
1621 * \c dest_id argument. For \c n total values received on a rank
1622 * (as given by \ref cs_crystal_router_n_elts), those destination ids
1623 * must be in the range [0, \c n[.
1624 *
1625 * If the flags bit mask matches \ref CS_CRYSTAL_ROUTER_ADD_SRC_ID,
1626 * source ids are added. If it matches \ref CS_CRYSTAL_ROUTER_ADD_SRC_RANK,
1627 * source rank metadata is added.
1628 *
1629 * \param[in] n_elts number of elements
1630 * \param[in] datatype type of data considered
1631 * \param[in] flags add destination id ?
1632 * \param[in] elt_idx element values start and past-the-last index
1633 * \param[in] elt element values
1634 * \param[in] src_id optional parent element id (indirection
1635 * for elt and elt_idx arrays), or NULL
1636 * \param[in] dest_id element destination id, or NULL
1637 * \param[in] dest_rank destination rank for each element
1638 * \param[in] comm associated MPI communicator
1639 *
1640 * \return pointer to new Crystal Router structure.
1641 */
1642 /*----------------------------------------------------------------------------*/
1643
1644 cs_crystal_router_t *
cs_crystal_router_create_i(size_t n_elts,cs_datatype_t datatype,int flags,const cs_lnum_t * elt_idx,const void * elt,const cs_lnum_t * src_id,const cs_lnum_t * dest_id,const int dest_rank[],MPI_Comm comm)1645 cs_crystal_router_create_i(size_t n_elts,
1646 cs_datatype_t datatype,
1647 int flags,
1648 const cs_lnum_t *elt_idx,
1649 const void *elt,
1650 const cs_lnum_t *src_id,
1651 const cs_lnum_t *dest_id,
1652 const int dest_rank[],
1653 MPI_Comm comm)
1654 {
1655 cs_timer_t t0 = cs_timer_time();
1656
1657 /* Initialize timers if required */
1658
1659 if (_cr_calls == 0) {
1660 for (int i = 0; i < 2; i++)
1661 CS_TIMER_COUNTER_INIT(_cr_timers[i]);
1662 }
1663 _cr_calls += 1;
1664
1665 unsigned const char *_elt = elt;
1666
1667 /* Allocate structure */
1668
1669 cs_crystal_router_t *cr = _crystal_create_meta_i(n_elts,
1670 datatype,
1671 elt_idx,
1672 src_id,
1673 flags,
1674 comm);
1675
1676 /* Copy data and some metadata */
1677
1678 const bool add_src_rank
1679 = (cr->flags & CS_CRYSTAL_ROUTER_ADD_SRC_RANK) ? true : false;
1680 const bool add_dest_id
1681 = (cr->flags & CS_CRYSTAL_ROUTER_USE_DEST_ID) ? true : false;
1682 const bool add_src_id
1683 = (cr->flags & CS_CRYSTAL_ROUTER_ADD_SRC_ID) ? true : false;
1684
1685 if (add_dest_id)
1686 cs_assert(dest_id != NULL || n_elts == 0);
1687
1688 /* Case with no src id indirection */
1689
1690 if (src_id == NULL) {
1691
1692 for (size_t i = 0; i < n_elts; i++) {
1693
1694 unsigned char *p_s = cr->buffer[0] + i*cr->comp_size
1695 + elt_idx[i]*cr->elt_size;
1696 int *pr = (int *)p_s;
1697 cs_lnum_t *pn = (cs_lnum_t *)(p_s + cr->n_vals_shift);
1698 unsigned char *pe = p_s + cr->elt_shift;
1699 unsigned const char *_psrc = _elt + elt_idx[i]*cr->elt_size;
1700
1701 pr[0] = dest_rank[i];
1702
1703 if (add_src_rank)
1704 pr[1] = cr->rank_id;
1705
1706 if (add_dest_id) {
1707 unsigned char *pi = p_s + cr->dest_id_shift;
1708 unsigned const char *_p_dest_id = (const unsigned char *)(dest_id + i);
1709 for (size_t j = 0; j < sizeof(cs_lnum_t); j++)
1710 pi[j] = _p_dest_id[j];
1711 }
1712
1713 if (add_src_id) {
1714 const cs_lnum_t s_id = i;
1715 const unsigned char *_src_id = (const unsigned char *)(&s_id);
1716 unsigned char *pi = p_s + cr->src_id_shift;
1717 for (size_t j = 0; j < sizeof(cs_lnum_t); j++)
1718 pi[j] = _src_id[j];
1719 }
1720
1721 cs_lnum_t n_sub = elt_idx[i+1] - elt_idx[i];
1722 pn[0] = n_sub;
1723
1724 size_t sub_size = n_sub * cr->elt_size;
1725 for (size_t j = 0; j < sub_size; j++)
1726 pe[j] = _psrc[j];
1727 }
1728
1729 }
1730
1731 /* Case with src id indirection */
1732
1733 else {
1734
1735 cs_lnum_t c_idx = 0;
1736
1737 for (size_t i = 0; i < n_elts; i++) {
1738
1739 size_t j = src_id[i];
1740
1741 unsigned char *p_s = cr->buffer[0] + i*cr->comp_size
1742 + c_idx*cr->elt_size;
1743 int *pr = (int *)p_s;
1744 cs_lnum_t *pn = (cs_lnum_t *)(p_s + cr->n_vals_shift);
1745 unsigned char *pe = p_s + cr->elt_shift;
1746 unsigned const char *_psrc = _elt + elt_idx[j]*cr->elt_size;
1747
1748 pr[0] = dest_rank[i];
1749
1750 if (add_src_rank)
1751 pr[1] = cr->rank_id;
1752
1753 if (add_dest_id) {
1754 unsigned char *pi = p_s + cr->dest_id_shift;
1755 unsigned const char *_p_dest_id = (const unsigned char *)(dest_id + i);
1756 for (size_t k = 0; k < sizeof(cs_lnum_t); k++)
1757 pi[k] = _p_dest_id[k];
1758 }
1759
1760 if (add_src_id) {
1761 const cs_lnum_t s_id = j;
1762 const unsigned char *_src_id = (const unsigned char *)(&s_id);
1763 unsigned char *pi = p_s + cr->src_id_shift;
1764 for (size_t k = 0; k < sizeof(cs_lnum_t); k++)
1765 pi[k] = _src_id[k];
1766 }
1767
1768 cs_lnum_t n_sub = elt_idx[j+1] - elt_idx[j];
1769 pn[0] = n_sub;
1770
1771 size_t sub_size = n_sub * cr->elt_size;
1772 for (size_t k = 0; k < sub_size; k++)
1773 pe[k] = _psrc[k];
1774
1775 c_idx += n_sub;
1776
1777 }
1778
1779 }
1780
1781 cs_timer_t t1 = cs_timer_time();
1782 cs_timer_counter_add_diff(_cr_timers, &t0, &t1);
1783
1784 #if _CR_DEBUG_DUMP
1785 _dump(cr, "Crystal Router after creation.");
1786 #endif
1787
1788 return cr;
1789 }
1790
1791 /*----------------------------------------------------------------------------*/
1792 /*!
1793 * \brief Destroy a Crystal Router.
1794 *
1795 * \param[in, out] cr pointer to associated Crystal Router
1796 */
1797 /*----------------------------------------------------------------------------*/
1798
1799 void
cs_crystal_router_destroy(cs_crystal_router_t ** cr)1800 cs_crystal_router_destroy(cs_crystal_router_t **cr)
1801 {
1802 if (cr != NULL) {
1803
1804 cs_timer_t t0 = cs_timer_time();
1805
1806 if (*cr != NULL) {
1807 cs_crystal_router_t *_cr = *cr;
1808 if (_cr->mpi_type != MPI_BYTE)
1809 MPI_Type_free(&(_cr->mpi_type));
1810 BFT_FREE(_cr->buffer[1]);
1811 BFT_FREE(_cr->buffer[0]);
1812 BFT_FREE(*cr);
1813 }
1814
1815 cs_timer_t t1 = cs_timer_time();
1816 cs_timer_counter_add_diff(_cr_timers, &t0, &t1);
1817 }
1818 }
1819
1820 /*----------------------------------------------------------------------------*/
1821 /*!
1822 * \brief Exchange data with a Crystal Router.
1823 *
1824 * Order of data from a same source rank is preserved.
1825 *
1826 * \param[in, out] cr pointer to associated Crystal Router
1827 */
1828 /*----------------------------------------------------------------------------*/
1829
1830 void
cs_crystal_router_exchange(cs_crystal_router_t * cr)1831 cs_crystal_router_exchange(cs_crystal_router_t *cr)
1832 {
1833 cs_assert(cr != NULL);
1834
1835 cs_timer_t t0 = cs_timer_time();
1836
1837 int target = -1;
1838 int send_part = 0;
1839 int b_low = 0;
1840 int n_sub_ranks = cr->n_ranks;
1841
1842 while (n_sub_ranks > 1) {
1843
1844 int n_recv = 1;
1845 int n_low = n_sub_ranks / 2;
1846 int b_high = b_low + n_low;
1847
1848 if (cr->rank_id < b_high) {
1849 target = cr->rank_id + n_low;
1850 if ((n_sub_ranks & 1) && (cr->rank_id == b_high - 1))
1851 n_recv = 2;
1852 send_part = 1;
1853 }
1854 else {
1855 target = cr->rank_id - n_low;
1856 if (target == b_high) {
1857 target--;
1858 n_recv = 0;
1859 }
1860 send_part = 0;
1861 }
1862
1863 /* Partition data */
1864
1865 if (cr->n_vals_shift == 0)
1866 _crystal_partition_strided(cr, send_part, b_high);
1867 else
1868 _crystal_partition_indexed(cr, send_part, b_high);
1869
1870 #if _CR_DEBUG_DUMP
1871 {
1872 char comment[80];
1873 snprintf(comment, 79, "Crystal Router after partition, n_sub = %d",
1874 n_sub_ranks);
1875 _dump(cr, comment);
1876 }
1877 #endif
1878
1879 /* Send message to target process */
1880
1881 _crystal_sendrecv(cr, target, n_recv);
1882
1883 /* Ready for next exchange */
1884
1885 if (cr->rank_id < b_high)
1886 n_sub_ranks = n_low;
1887 else {
1888 n_sub_ranks -= n_low;
1889 b_low = b_high;
1890 }
1891
1892 #if _CR_DEBUG_DUMP
1893 {
1894 char comment[80];
1895 if (n_sub_ranks > 1) {
1896 snprintf(comment, 79, "Crystal Router after sendrecv, n_sub = %d",
1897 n_sub_ranks);
1898 }
1899 else {
1900 snprintf(comment, 79, "Crystal Router after exchange");
1901 }
1902 _dump(cr, comment);
1903 }
1904 #endif
1905
1906 }
1907
1908 cr->n_elts[1] = 0;
1909 cr->buffer_size[1] = 0;
1910 BFT_FREE(cr->buffer[1]);
1911
1912 cs_timer_t t1 = cs_timer_time();
1913 cs_timer_counter_add_diff(_cr_timers, &t0, &t1);
1914 }
1915
1916 /*----------------------------------------------------------------------------*/
1917 /*!
1918 * \brief Get number of elements associated with Crystal Router.
1919 *
1920 * The number of elements is the number of elements received after exchange.
1921 *
1922 * \param[in] cr pointer to associated Crystal Router
1923 *
1924 * \return number of elements associated with distributor.
1925 */
1926 /*----------------------------------------------------------------------------*/
1927
1928 cs_lnum_t
cs_crystal_router_n_elts(const cs_crystal_router_t * cr)1929 cs_crystal_router_n_elts(const cs_crystal_router_t *cr)
1930 {
1931 cs_lnum_t retval = 0;
1932
1933 if (cr != NULL) {
1934
1935 if (cr->flags & CS_CRYSTAL_ROUTER_USE_DEST_ID) {
1936
1937 const size_t n_elts = cr->n_elts[0];
1938
1939 if (cr->dest_id_end == 0 && n_elts > 0) {
1940
1941 cs_lnum_t dest_id_max = -1;
1942
1943 if (cr->n_vals_shift == 0) {
1944 for (size_t i = 0; i < n_elts; i++) {
1945 unsigned const char *p_s = cr->buffer[0] + i*cr->comp_size;
1946 const cs_lnum_t *cr_dest_id
1947 = (const cs_lnum_t *)(p_s + cr->dest_id_shift);
1948 if (cr_dest_id[0] > dest_id_max)
1949 dest_id_max = cr_dest_id[0];
1950 }
1951 }
1952
1953 else {
1954 unsigned const char *p_s = cr->buffer[0];
1955 for (size_t i = 0; i < n_elts; i++) {
1956 const cs_lnum_t *cr_dest_id
1957 = (const cs_lnum_t *)(p_s + cr->dest_id_shift);
1958 if (cr_dest_id[0] > dest_id_max)
1959 dest_id_max = cr_dest_id[0];
1960 const cs_lnum_t *pn = (const cs_lnum_t *)(p_s + cr->n_vals_shift);
1961 p_s += cr->comp_size + cr->elt_size*pn[0];
1962 }
1963 }
1964
1965 retval = dest_id_max + 1;
1966
1967 }
1968 else
1969 retval = cr->dest_id_end;
1970
1971 }
1972 else
1973 retval = cr->n_elts[0];
1974
1975 }
1976
1977 return retval;
1978 }
1979
1980 /*----------------------------------------------------------------------------*/
1981 /*!
1982 * \brief Get number of elements received with Crystal Router.
1983 *
1984 * The number of elements is the number of elements received in the exchange.
1985 * In most cases this is the same as that returned using
1986 * \ref cs_crystal_router_n_elts, but may be larger when a destination id is
1987 * used and multiple elements point to a same destination (for example when
1988 * distributing from partitioned data with elements duplicated on
1989 * partition boundaries to blocks with a single occurence of elements).
1990 *
1991 * \param[in] cr pointer to associated Crystal Router
1992 *
1993 * \return number of elements associated with distributor.
1994 */
1995 /*----------------------------------------------------------------------------*/
1996
1997 cs_lnum_t
cs_crystal_router_n_recv_elts(const cs_crystal_router_t * cr)1998 cs_crystal_router_n_recv_elts(const cs_crystal_router_t *cr)
1999 {
2000 cs_lnum_t retval = 0;
2001
2002 if (cr != NULL)
2003 retval = cr->n_elts[0];
2004
2005 return retval;
2006 }
2007
2008 /*----------------------------------------------------------------------------*/
2009 /*!
2010 * \brief Get data elements associated with a Crystal Router.
2011 *
2012 * Depending on the creation options, some elements may be available or not.
2013 *
2014 * For each element type, a pointer to a pointer to a buffer is provided.
2015 * If the pointer to the buffer is non-null, the buffer must be of sufficient
2016 * size for the number of elements returned by \ref cs_crystal_router_n_elts.
2017 *
2018 * If no buffer is provided, one is allocated automatically, and transferred
2019 * to the caller (who is responsible for freeing it when no longer needed).
2020 *
2021 * Note also that if the destination id is provided in the Crystal Router,
2022 * it will be applied automatically to the data and data_index arrays. The
2023 * dest_id array itself is always in receive order. If the Crystal
2024 * Router does not contain destination id info but the \c dest_id
2025 * argument points to a non-NULL value, the provided id will be used to
2026 * order extracted data. This allows saving the destination id on the receive
2027 * side, and not re-sending it (saving bandwidth) for subsequent calls
2028 * with a similar Crystal Router.
2029 *
2030 * If all destination ids appear only once, or the
2031 * CS_CRYSTAL_ROUTER_USE_DEST_ID is not set, dest_id is also applied to
2032 * src_rank and src_id. Otherwise, those arrays are kept in buffer order.
2033 * This can be checked by comparing the output of
2034 * \ref cs_crystal_router_n_elts and \ref cs_crystal_router_n_recv_elts.
2035 *
2036 * With this behavior, for reverse exchange, src_id and src_rank can be
2037 * used as dest_id and dest_rank respectively in the call to
2038 * \ref cs_crystal_router_create_s, while NULL is passed to elt_id
2039 * with single ids, and dest_id passed in case of duplicate ids.
2040 *
2041 * \param[in] cr pointer to associated Crystal Router
2042 * \param[out] src_rank pointer to (pointer to) source rank array,
2043 * or NULL
2044 * \param[in, out] dest_id pointer to (pointer to) destination id array,
2045 * or NULL
2046 * \param[out] src_id pointer to (pointer to) source id array, or NULL
2047 * \param[out] data_index pointer to (pointer to) destination index,
2048 * or NULL
2049 * \param[out] data pointer to (pointer to) destination data,
2050 * or NULL
2051 */
2052 /*----------------------------------------------------------------------------*/
2053
2054 void
cs_crystal_router_get_data(cs_crystal_router_t * cr,int ** src_rank,cs_lnum_t ** dest_id,cs_lnum_t ** src_id,cs_lnum_t ** data_index,void ** data)2055 cs_crystal_router_get_data(cs_crystal_router_t *cr,
2056 int **src_rank,
2057 cs_lnum_t **dest_id,
2058 cs_lnum_t **src_id,
2059 cs_lnum_t **data_index,
2060 void **data)
2061 {
2062 cs_timer_t t0 = cs_timer_time();
2063
2064 int *_src_rank = NULL;
2065 cs_lnum_t *_dest_id = NULL;
2066 cs_lnum_t *_src_id = NULL;
2067 cs_lnum_t *_data_index = NULL;
2068 unsigned char *_data = NULL;
2069
2070 size_t n_elts = cr->n_elts[0];
2071
2072 /* Map and allocate arrays if needed */
2073
2074 if (src_rank != NULL && cr->flags & CS_CRYSTAL_ROUTER_ADD_SRC_RANK) {
2075 _src_rank = *src_rank;
2076 if (_src_rank == NULL) {
2077 BFT_MALLOC(_src_rank, n_elts, int);
2078 *src_rank = _src_rank;
2079 }
2080 }
2081
2082 if (dest_id != NULL) { /* May be extracted or provided from previous call */
2083 _dest_id = *dest_id;
2084 if (_dest_id == NULL && cr->flags & CS_CRYSTAL_ROUTER_USE_DEST_ID) {
2085 BFT_MALLOC(_dest_id, n_elts, cs_lnum_t);
2086 *dest_id = _dest_id;
2087 }
2088 }
2089
2090 if (src_id != NULL && cr->flags & CS_CRYSTAL_ROUTER_ADD_SRC_ID) {
2091 _src_id = *src_id;
2092 if (_src_id == NULL) {
2093 BFT_MALLOC(_src_id, n_elts, cs_lnum_t);
2094 *src_id = _src_id;
2095 }
2096 }
2097
2098 if (data_index != NULL && cr->n_vals_shift > 0) {
2099 _data_index = *data_index;
2100 if (_data_index == NULL) {
2101 BFT_MALLOC(_data_index, n_elts + 1, cs_lnum_t);
2102 *data_index = _data_index;
2103 }
2104 }
2105
2106 if (data != NULL) {
2107 _data = *data;
2108 if (_data == NULL) {
2109 size_t data_size;
2110 if (cr->stride > 0)
2111 data_size = n_elts*cr->elt_size;
2112 else
2113 data_size = cr->n_vals[0]*cr->elt_size;
2114 BFT_MALLOC(_data, data_size, unsigned char);
2115 *data = _data;
2116 }
2117 }
2118
2119 if (data != NULL && cr->stride > 0) {
2120 _data = *data;
2121 if (_data == NULL) {
2122 BFT_MALLOC(_data, n_elts*cr->elt_size, unsigned char);
2123 *data = _data;
2124 }
2125 }
2126
2127 /* Now extract data */
2128
2129 if (_dest_id != NULL || cr->flags & CS_CRYSTAL_ROUTER_USE_DEST_ID) {
2130 if (cr->n_vals_shift == 0)
2131 _get_data_s_with_dest_id(cr,
2132 _dest_id,
2133 _src_rank,
2134 _src_id,
2135 _data);
2136 else {
2137 if (cr->n_vals_shift > 0 && _data_index == NULL)
2138 BFT_MALLOC(_data_index, n_elts + 1, cs_lnum_t);
2139
2140 _get_data_index_with_dest_id(cr, _dest_id, _data_index);
2141
2142 _get_data_i_with_dest_id(cr,
2143 _dest_id,
2144 _data_index,
2145 _src_rank,
2146 _src_id,
2147 _data);
2148 }
2149 }
2150
2151 else {
2152
2153 if (cr->n_vals_shift == 0)
2154 _get_data_s(cr, _src_rank, _src_id, _data);
2155 else
2156 _get_data_i(cr, _src_rank, _src_id, _data_index, _data);
2157
2158 }
2159
2160 if (dest_id == NULL && _dest_id != NULL)
2161 BFT_FREE(_dest_id);
2162 if (data_index == NULL && _data_index != NULL)
2163 BFT_FREE(_data_index);
2164
2165 cs_timer_t t1 = cs_timer_time();
2166 cs_timer_counter_add_diff(_cr_timers, &t0, &t1);
2167 }
2168
2169 /*----------------------------------------------------------------------------*/
2170 /*!
2171 * \brief Query maximum buffer sizes reached by a Crystal Router.
2172 *
2173 * Order of data from a same source rank is preserved.
2174 *
2175 * \param[in] cr pointer to associated Crystal Router
2176 * \param[out] max_sizes pointer to maximum local/receive (max_sizes[0])
2177 * and send (max_sizes[1]) sizes, or NULL
2178 *
2179 * \return maximum total allocated buffer memory
2180 */
2181 /*----------------------------------------------------------------------------*/
2182
2183 size_t
cs_crystal_router_get_max_sizes(cs_crystal_router_t * cr,size_t * max_sizes)2184 cs_crystal_router_get_max_sizes(cs_crystal_router_t *cr,
2185 size_t *max_sizes)
2186 {
2187 cs_assert(cr != NULL);
2188
2189 if (max_sizes != NULL) {
2190 max_sizes[0] = cr->buffer_size_max[0];
2191 max_sizes[1] = cr->buffer_size_max[1];
2192 }
2193
2194 return cr->alloc_tot_max;
2195 }
2196
2197 #endif /* defined(HAVE_MPI) */
2198
2199 /*----------------------------------------------------------------------------*/
2200 /*!
2201 * \brief Log performance information relative to Crystal Router exchange.
2202 *
2203 * Call count is based on Crystal router creation calls, as the number
2204 * of exchange and data access calls should be identical.
2205 */
2206 /*----------------------------------------------------------------------------*/
2207
2208 void
cs_crystal_router_log_finalize(void)2209 cs_crystal_router_log_finalize(void)
2210 {
2211 if (_cr_calls <= 0 || cs_glob_n_ranks < 2)
2212 return;
2213
2214 cs_log_printf(CS_LOG_PERFORMANCE,
2215 _("\nCrystal router: %llu %s\n"),
2216 (unsigned long long) _cr_calls, _("calls"));
2217
2218 #if defined(HAVE_MPI)
2219 double wtimes[2] = {_cr_timers[0].nsec*1e-9,
2220 _cr_timers[1].nsec*1e-9};
2221
2222 double mntimes[2], mxtimes[2], stimes[2];
2223
2224 MPI_Reduce(wtimes, mntimes, 2, MPI_DOUBLE, MPI_MIN,
2225 0, cs_glob_mpi_comm);
2226 MPI_Reduce(wtimes, mxtimes, 2, MPI_DOUBLE, MPI_MAX,
2227 0, cs_glob_mpi_comm);
2228 MPI_Reduce(wtimes, stimes, 2, MPI_DOUBLE, MPI_SUM,
2229 0, cs_glob_mpi_comm);
2230 if (cs_glob_rank_id == 0) {
2231 cs_log_printf
2232 (CS_LOG_PERFORMANCE,
2233 _(" mean minimum maximum\n"
2234 " wall clock: %12.5f s %12.5f s %12.5f s\n"
2235 " communication: %12.5f s %12.5f s %12.5f s\n"),
2236 stimes[0]/cs_glob_n_ranks, mntimes[0], mxtimes[0],
2237 stimes[1]/cs_glob_n_ranks, mntimes[1], mxtimes[1]);
2238 }
2239 #endif
2240 }
2241
2242 /*----------------------------------------------------------------------------*/
2243
2244 END_C_DECLS
2245