1 /*============================================================================
2 * Functions dealing with ghost cells
3 *============================================================================*/
4
5 /*
6 This file is part of Code_Saturne, a general-purpose CFD tool.
7
8 Copyright (C) 1998-2021 EDF S.A.
9
10 This program is free software; you can redistribute it and/or modify it under
11 the terms of the GNU General Public License as published by the Free Software
12 Foundation; either version 2 of the License, or (at your option) any later
13 version.
14
15 This program is distributed in the hope that it will be useful, but WITHOUT
16 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
18 details.
19
20 You should have received a copy of the GNU General Public License along with
21 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
22 Street, Fifth Floor, Boston, MA 02110-1301, USA.
23 */
24
25 /*----------------------------------------------------------------------------*/
26
27 #include "cs_defs.h"
28
29 /*----------------------------------------------------------------------------
30 * Standard C library headers
31 *----------------------------------------------------------------------------*/
32
33 #include <stdio.h>
34 #include <string.h>
35 #include <assert.h>
36
37 /*----------------------------------------------------------------------------
38 * Local headers
39 *----------------------------------------------------------------------------*/
40
41 #include "bft_mem.h"
42 #include "bft_error.h"
43 #include "bft_printf.h"
44
45 #include "cs_base.h"
46 #include "cs_base_accel.h"
47 #include "cs_order.h"
48
49 #include "cs_interface.h"
50 #include "cs_rank_neighbors.h"
51
52 #include "fvm_periodicity.h"
53
54 /*----------------------------------------------------------------------------
55 * Header for the current file
56 *----------------------------------------------------------------------------*/
57
58 #include "cs_halo.h"
59
60 #if defined(HAVE_CUDA)
61 #include "cs_halo_cuda.h"
62 #endif
63
64 /*----------------------------------------------------------------------------*/
65
66 BEGIN_C_DECLS
67
68 /*! \cond DOXYGEN_SHOULD_SKIP_THIS */
69
70 /* Remarks:
71 *
72 * The current available mode for MPI-3 RMA uses "get" semantics.
73 * A "put" semantics variant could easily be added, either:
74 * - Using MPI_Win_create_dynamic and attaching the halo section of the
75 * current array to that window (which would seem cumbersome as this also
76 * requires exchanging base addresses obtained with MPI_Get_Address for
77 * each array, but could be amortized for iterative algorithms).
78 * - Using a fixed receive buffer, and copying data to the tail section of
79 * the array afterwards (as is done for accelerators when the MPI library
 *     used is not accelerator-aware). This would add an extra copy, but
81 * be much simpler.
82 *
83 * It may also be useful to allow this setting on a "per halo" basis, as
84 * some publications report better performance with RMA for large data,
85 * and better performance with P2P for small data, so in uses such
86 * as multigrid solvers, either may be preferred for different levels.
87 */
88
89 /*=============================================================================
90 * Local macro definitions
91 *============================================================================*/
92
93 #if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
94 #define _CS_MPI_DEVICE_SUPPORT 1
95 #else
96 #if defined(_CS_MPI_DEVICE_SUPPORT)
97 #undef _CS_MPI_DEVICE_SUPPORT
98 #endif
99 #endif
100
101 /*=============================================================================
102 * Local type definitions
103 *============================================================================*/
104
/* Structure to maintain halo exchange state.
   A state object groups everything needed by one in-flight halo
   synchronization: the description of the exchanged variable, the
   send/receive buffers, and the MPI request/status (or RMA window)
   bookkeeping. A default global state is used unless the caller
   provides its own. */

struct _cs_halo_state_t {

  /* Current synchronization state */

  cs_halo_type_t  sync_mode;      /* Standard or extended */
  cs_datatype_t   data_type;      /* Datatype */
  int             stride;         /* Number of values per location */

  cs_alloc_mode_t var_location;   /* Allocation info for exchanged variable */

  void  *send_buffer_cur;         /* Send buffer used for current progress
                                     (either _send_buffer or passed by
                                     caller) */

  int n_requests;                 /* Number of MPI requests */
  int local_rank_id;              /* Id of halo for own rank (periodicity),
                                     -1 if not present */

  /* Buffers for synchronization;
     receive buffers only needed for some communication modes */

  size_t  send_buffer_size;       /* Size of send buffer, in bytes */
  size_t  recv_buffer_size;       /* Size of receive buffer, in bytes */

  void  *send_buffer;             /* Send buffer (maintained by this object) */
  void  *recv_buffer;             /* Recv. buffer (maintained by this object) */

#if defined(HAVE_MPI)

  int  request_size;              /* Size of requests and status arrays */

  MPI_Request  *request;          /* Array of MPI requests */
  MPI_Status   *status;           /* Array of MPI status */

  MPI_Win       win;              /* MPI-3 RMA window */

#endif

};
144
145 /*============================================================================
146 * Static global variables
147 *============================================================================*/
148
149 /* Number of defined halos */
150 static int _n_halos = 0;
151
152 /* Allocation mode for arrays which might be used on accelerator device
153 Note that an alternative option would be to use shared memory with
154 prefetching. We will need to do performance comparisons first, but
155 in the case of similar performance, going for the shared approach
156 would be preferred for its "more generic" aspect. */
157 static cs_alloc_mode_t _halo_buffer_alloc_mode = CS_ALLOC_HOST_DEVICE_PINNED;
158
159 /* Should we use barriers after posting receives ? */
160 static int _halo_use_barrier = false;
161
162 /* Default halo state handler */
163 static cs_halo_state_t *_halo_state = NULL;
164
165 /* Halo communications mode */
166 static int _halo_comm_mode = CS_HALO_COMM_P2P;
167
168 /*============================================================================
169 * Private function definitions
170 *============================================================================*/
171
172 /*----------------------------------------------------------------------------*/
173 /*!
174 * \brief Test if an array of global numbers is ordered.
175 *
176 * \param[in] list optional list (1 to n numbering) of selected entities
177 * (or NULL if all nb_ent are selected). This list may
178 * contain element numbers in any order
179 * \param[in] nb_ent number of entities considered
180 *
181 * \return 1 if ordered, 0 otherwise.
182 */
183 /*----------------------------------------------------------------------------*/
184
static int
_order_int_test(const int  list[],
                size_t     nb_ent)
{
  /* An implicit (NULL) numbering, an empty set, or a single entity
     is ordered by definition. */

  if (list == NULL || nb_ent < 2)
    return 1;

  /* Otherwise, scan for the first strictly-descending adjacent pair;
     finding none means the list is ordered. */

  for (size_t j = 1; j < nb_ent; j++) {
    if (list[j] < list[j-1])
      return 0;
  }

  return 1;
}
207
208 #if defined(HAVE_MPI)
209
210 /*----------------------------------------------------------------------------
211 * Update state request arrays so as to be usable with a given halo.
212 *
213 * This function should be called at the end of any halo creation,
214 * so that buffer sizes are increased if necessary.
215 *
216 * parameters:
217 * halo <-- pointer to cs_halo_t structure.
218 * halo_state <-> pointer to halo state structure.
219 *---------------------------------------------------------------------------*/
220
221 static void
_update_requests(const cs_halo_t * halo,cs_halo_state_t * hs)222 _update_requests(const cs_halo_t *halo,
223 cs_halo_state_t *hs)
224 {
225 if (halo == NULL)
226 return;
227
228 int n_requests = halo->n_c_domains*2;
229
230 if (n_requests > hs->request_size) {
231 hs->request_size = n_requests;
232 BFT_REALLOC(hs->request, hs->request_size, MPI_Request);
233 BFT_REALLOC(hs->status, hs->request_size, MPI_Status);
234 }
235
236 }
237
238 /*----------------------------------------------------------------------------*/
239 /*!
240 * \brief Exchange send shift in send buffer for one-sided get.
241 *
242 * \param[in] halo halo structure to update
243 */
244 /*----------------------------------------------------------------------------*/
245
static void
_exchange_send_shift(cs_halo_t  *halo)
{
  MPI_Comm comm = cs_glob_mpi_comm;
  MPI_Request *request = NULL;
  MPI_Status *status = NULL;

  BFT_MALLOC(request, halo->n_c_domains*2, MPI_Request);
  BFT_MALLOC(status, halo->n_c_domains*2, MPI_Status);

  BFT_REALLOC(halo->c_domain_s_shift, halo->n_c_domains, cs_lnum_t);

  /* Exchange local range with neighbor ranks */

  const int local_rank = CS_MAX(cs_glob_rank_id, 0);

  /* Each rank receives, from every communicating neighbor, the shift of
     its section in that neighbor's send buffer; this is later used as the
     target displacement for MPI_Get. Message tags are the receiving
     rank's id, so the Irecv tag here is local_rank and the matching
     Isend tag below is the destination rank's id. */

  for (int i = 0; i < halo->n_c_domains; i++) {
    int rank_id = halo->c_domain_rank[i];
    MPI_Irecv(halo->c_domain_s_shift + i,
              1,
              CS_MPI_LNUM,
              rank_id,
              local_rank,   /* tag: receiver's rank */
              comm,
              &(request[i]));
  }

  /* Symmetrically, send each neighbor the start of its section in the
     local send buffer (send_index has 2 entries per rank, hence 2*i). */

  for (int i = 0; i < halo->n_c_domains; i++) {
    int rank_id = halo->c_domain_rank[i];
    MPI_Isend(halo->send_index + 2*i,
              1,
              CS_MPI_LNUM,
              rank_id,
              rank_id,      /* tag: receiver's rank */
              comm,
              &(request[halo->n_c_domains + i]));
  }

  MPI_Waitall(halo->n_c_domains*2, request, status);

  BFT_FREE(request);
  BFT_FREE(status);
}
289
290 #endif /* HAVE_MPI */
291
292 /*----------------------------------------------------------------------------
293 * Local copy from halo send buffer to destination array.
294 *
 * This allows periodicity data which may be present on the local rank to
296 * be exchanged without any MPI call.
297 *
298 * Data is untyped; only its size is given, so this function may also
 * be used to synchronize interleaved multidimensional data, using
300 * size = element_size*dim (assuming a homogeneous environment, at least
301 * as far as data encoding goes).
302 *
303 * parameters:
304 * halo <-- pointer to halo structure
305 * local_rank_id <-- id of local rank
306 * sync_mode <-- synchronization mode (standard or extended)
307 * size <-- datatype size
308 * var_location <-- Allocation info for exchanged variable (host/device)
309 * send_buf <-> pointer to send data
310 * val <-> pointer to destination data
311 *----------------------------------------------------------------------------*/
312
313 static void
_sync_local(const cs_halo_t * halo,int local_rank_id,cs_halo_type_t sync_mode,size_t size,cs_alloc_mode_t var_location,const void * send_buf,void * val)314 _sync_local(const cs_halo_t *halo,
315 int local_rank_id,
316 cs_halo_type_t sync_mode,
317 size_t size,
318 cs_alloc_mode_t var_location,
319 const void *send_buf,
320 void *val)
321 {
322 cs_lnum_t end_shift = (sync_mode == CS_HALO_EXTENDED) ? 2 : 1;
323
324 unsigned char *_val = val;
325 unsigned char *recv
326 = _val + (halo->n_local_elts + halo->index[2*local_rank_id]) * size;
327
328 cs_lnum_t start = halo->send_index[2*local_rank_id]*size;
329 cs_lnum_t length = ( halo->send_index[2*local_rank_id + end_shift]
330 - halo->send_index[2*local_rank_id]);
331
332 size_t count = length * size;
333
334 if (var_location == CS_ALLOC_HOST) {
335 const unsigned char *buffer = (const unsigned char *)send_buf;
336 const unsigned char *_buffer = buffer + start;
337 memcpy(recv, _buffer, count);
338 }
339
340 #if defined(HAVE_ACCEL)
341 else {
342 const unsigned char *buffer
343 = (const unsigned char *)cs_get_device_ptr_const(send_buf);
344 const unsigned char *_buffer = buffer + start;
345 cs_copy_d2d(recv, _buffer, count);
346 }
347 #endif
348 }
349
350 #if defined(HAVE_MPI)
351 #if (MPI_VERSION >= 3)
352
353 /*----------------------------------------------------------------------------*/
354 /*!
355 * \brief Launch update of ghost values in case of parallelism
356 * for one-sided communication.
357 *
358 * The cs_halo_sync_pack function should have been called before this function,
359 * using the same hs argument.
360 *
361 * \param[in] halo pointer to halo structure
362 * \param[in] val pointer to variable value array
363 * \param[in, out] hs pointer to halo state, NULL for global state
364 */
365 /*----------------------------------------------------------------------------*/
366
static void
_halo_sync_start_one_sided(const cs_halo_t  *halo,
                           void             *val,
                           cs_halo_state_t  *hs)
{
  cs_lnum_t end_shift = (hs->sync_mode == CS_HALO_EXTENDED) ? 2 : 1;
  cs_lnum_t stride = hs->stride;
  size_t elt_size = cs_datatype_size[hs->data_type] * stride;
  size_t n_loc_elts = halo->n_local_elts;

  /* Ghost values are appended after the n_loc_elts local values of val. */

  unsigned char *restrict _val = val;
  unsigned char *restrict _val_dest = _val + n_loc_elts*elt_size;

  MPI_Datatype mpi_datatype = cs_datatype_to_mpi[hs->data_type];

  const int local_rank = CS_MAX(cs_glob_rank_id, 0);

  /* Get data from distant ranks */

  if (_halo_comm_mode == CS_HALO_COMM_RMA_GET) {

    /* Use active target synchronization */

    if (halo->c_domain_group != MPI_GROUP_NULL) {
      /* Start RMA exposure epoch */
      MPI_Win_post(halo->c_domain_group,
                   MPI_MODE_NOPUT,   /* program assertion */
                   hs->win);

      /* Access Epoch */
      MPI_Win_start(halo->c_domain_group,
                    0,               /* program assertion */
                    hs->win);
    }
    else {
      /* No communicating-rank group available; fall back to a
         (more synchronous) fence over the whole window. */
      MPI_Win_fence(0, hs->win);
    }

    for (int rank_id = 0; rank_id < halo->n_c_domains; rank_id++) {

      /* Number of values to read from this neighbor (standard section
         only, or standard + extended when in extended mode). */

      int length = (  halo->index[2*rank_id + end_shift]
                    - halo->index[2*rank_id]) * stride;

      if (halo->c_domain_rank[rank_id] != local_rank) {

        if (length > 0) {
          cs_lnum_t start = halo->index[2*rank_id]*elt_size;
          unsigned char *dest = _val_dest + start;

          /* Target displacement obtained from the prior exchange of
             send-buffer shifts (see _exchange_send_shift). */
          MPI_Aint displacement = halo->c_domain_s_shift[rank_id]*elt_size;

          MPI_Get(dest,
                  length,                        /* origin count */
                  mpi_datatype,                  /* origin datatype */
                  halo->c_domain_rank[rank_id],  /* target rank */
                  displacement,                  /* target displacement */
                  length,                        /* target count */
                  mpi_datatype,                  /* target datatype */
                  hs->win);
        }

      }
      else
        /* Data for the local rank (periodicity) is copied locally at
           completion (see _halo_sync_complete_one_sided). */
        hs->local_rank_id = rank_id;
    }

  }
}
434
435 /*----------------------------------------------------------------------------*/
436 /*!
437 * \brief Finalize update of ghost values in case of parallelism
438 * for one-sided communication.
439 *
440 * The cs_halo_sync_pack function should have been called before this function,
441 * using the same hs argument.
442 *
443 * \param[in] halo pointer to halo structure
444 * \param[in] val pointer to variable value array
445 * \param[in, out] hs pointer to halo state, NULL for global state
446 */
447 /*----------------------------------------------------------------------------*/
448
static void
_halo_sync_complete_one_sided(const cs_halo_t  *halo,
                              void             *val,
                              cs_halo_state_t  *hs)
{
  /* Use active target synchronization */

  /* Access Epoch */
  if (halo->c_domain_group != MPI_GROUP_NULL) {
    MPI_Win_complete(hs->win);

    /* Complete RMA exposure epoch */
    MPI_Win_wait(hs->win);
  }
  else {
    /* Fence matching the one issued in _halo_sync_start_one_sided. */
    MPI_Win_fence(0, hs->win);
  }

  /* Copy local values in case of periodicity */

  if (hs->local_rank_id > -1) {
    size_t elt_size = cs_datatype_size[hs->data_type] * hs->stride;
    _sync_local(halo, hs->local_rank_id, hs->sync_mode, elt_size,
                hs->var_location, hs->send_buffer_cur, val);
  }
}
475
476 #endif /* (MPI_VERSION >= 3) */
477 #endif /* defined(HAVE_MPI) */
478
479 /*! (DOXYGEN_SHOULD_SKIP_THIS) \endcond */
480
481 /*============================================================================
482 * Public function definitions
483 *============================================================================*/
484
485 /*----------------------------------------------------------------------------*/
486 /*!
487 * \brief Create a halo structure given an interface set.
488 *
489 * \param[in] ifs pointer to a cs_interface_set structure
490 *
491 * \return pointer to created cs_halo_t structure
492 */
493 /*----------------------------------------------------------------------------*/
494
cs_halo_t *
cs_halo_create(const cs_interface_set_t  *ifs)
{
  cs_lnum_t i, tmp_id, perio_lst_size;

  cs_lnum_t loc_id = -1;

  cs_halo_t *halo = NULL;

  const cs_interface_t *interface = NULL;

  BFT_MALLOC(halo, 1, cs_halo_t);

  halo->n_c_domains = cs_interface_set_size(ifs);
  halo->n_transforms = 0;

  halo->periodicity = cs_interface_set_periodicity(ifs);
  halo->n_rotations = 0;

  halo->n_local_elts = 0;

  for (i = 0; i < CS_HALO_N_TYPES; i++) {
    halo->n_send_elts[i] = 0;
    halo->n_elts [i] = 0;
  }

  BFT_MALLOC(halo->c_domain_rank, halo->n_c_domains, int);

  /* Check if cs_glob_rank_id belongs to interface set in order to
     order ranks with local rank at first place */

  for (i = 0; i < halo->n_c_domains; i++) {

    interface = cs_interface_set_get(ifs, i);
    halo->c_domain_rank[i] = cs_interface_rank(interface);

    if (cs_glob_rank_id == cs_interface_rank(interface))
      loc_id = i;

  } /* End of loop on ranks */

  if (loc_id > 0) {

    /* Swap so the local rank (self-communication, i.e. periodicity)
       comes first; loc_id == 0 needs no swap. */

    tmp_id = halo->c_domain_rank[loc_id];
    halo->c_domain_rank[loc_id] = halo->c_domain_rank[0];
    halo->c_domain_rank[0] = tmp_id;

  }

  /* Order ranks */

  if (   halo->n_c_domains > 2
      && _order_int_test(&(halo->c_domain_rank[1]),
                         halo->n_c_domains-1) == 0) {

    /* Sort the distant ranks (entry 0 is left in place, as it may be
       the local rank); ordering is done on a cs_gnum_t copy. */

    cs_lnum_t  *order = NULL;
    cs_gnum_t  *buffer = NULL;

    BFT_MALLOC(order, halo->n_c_domains - 1, cs_lnum_t);
    BFT_MALLOC(buffer, halo->n_c_domains - 1, cs_gnum_t);

    for (i = 1; i < halo->n_c_domains; i++)
      buffer[i-1] = (cs_gnum_t)halo->c_domain_rank[i];

    cs_order_gnum_allocated(NULL,
                            buffer,
                            order,
                            halo->n_c_domains - 1);

    for (i = 0; i < halo->n_c_domains - 1; i++)
      halo->c_domain_rank[i+1] = (cs_lnum_t)buffer[order[i]];

    BFT_FREE(buffer);
    BFT_FREE(order);

  } /* End of ordering ranks */

  /* Per-rank indexes: 2 entries per rank (standard and extended
     sections) plus a closing entry; zero-initialized here and filled
     in later construction stages. */

  CS_MALLOC_HD(halo->send_index, 2*halo->n_c_domains + 1, cs_lnum_t,
               _halo_buffer_alloc_mode);
  BFT_MALLOC(halo->index, 2*halo->n_c_domains + 1, cs_lnum_t);

  for (i = 0; i < 2*halo->n_c_domains + 1; i++) {
    halo->send_index[i] = 0;
    halo->index[i] = 0;
  }

  halo->send_perio_lst = NULL;
  halo->perio_lst = NULL;

  if (halo->periodicity != NULL) {

    halo->n_transforms = fvm_periodicity_get_n_transforms(halo->periodicity);

    for (i = 0; i < halo->n_transforms; i++) {
      if (   fvm_periodicity_get_type(halo->periodicity, i)
          >= FVM_PERIODICITY_ROTATION)
        halo->n_rotations += 1;
    }

    /* We need 2 values per transformation and there are n_transforms
       transformations. For each rank, we need a value for standard and
       extended halo. */

    perio_lst_size = 2*halo->n_transforms * 2*halo->n_c_domains;

    BFT_MALLOC(halo->send_perio_lst, perio_lst_size, cs_lnum_t);
    BFT_MALLOC(halo->perio_lst, perio_lst_size, cs_lnum_t);

    for (i = 0; i < perio_lst_size; i++) {
      halo->send_perio_lst[i] = 0;
      halo->perio_lst[i] = 0;
    }

  }

  halo->send_list = NULL;

#if defined(HAVE_MPI)
  halo->c_domain_group = MPI_GROUP_NULL;
  halo->c_domain_s_shift = NULL;
#endif

  _n_halos += 1;

  return halo;
}
621
622 /*----------------------------------------------------------------------------*/
623 /*!
624 * \brief Ready halo for use.
625 *
626 * This function should be called after building a halo using the
 * cs_halo_create function and defined locally.
628 * It is called automatically by cs_halo_create_from_ref and
629 * cs_halo_create_from_rank_neighbors so does not need to be called again
630 * using these functions.
631 *
632 * \param[in] halo pointer to halo structure
633 */
634 /*----------------------------------------------------------------------------*/
635
void
cs_halo_create_complete(cs_halo_t  *halo)
{
#if defined(HAVE_MPI)

  /* Make buffer available on device if relevant */
  cs_sync_h2d(halo->send_index);
  cs_sync_h2d(halo->send_list);

  /* Create group for one-sided communication */
  if (_halo_comm_mode > CS_HALO_COMM_P2P) {
    const int local_rank = CS_MAX(cs_glob_rank_id, 0);
    int n_group_ranks = 0;
    int *group_ranks = NULL;
    BFT_MALLOC(group_ranks, halo->n_c_domains + 1, int);

    /* Build an ordered rank list, inserting the local rank at its
       sorted position between smaller and larger neighbor ranks. */
    for (int i = 0; i < halo->n_c_domains; i++) {
      if (halo->c_domain_rank[i] < local_rank)
        group_ranks[n_group_ranks++] = halo->c_domain_rank[i];
    }
    group_ranks[n_group_ranks++] = local_rank;
    for (int i = 0; i < halo->n_c_domains; i++) {
      if (halo->c_domain_rank[i] > local_rank)
        group_ranks[n_group_ranks++] = halo->c_domain_rank[i];
    }

    /* Only build the group if the list is ordered; otherwise
       c_domain_group stays MPI_GROUP_NULL, and the one-sided code
       paths fall back to MPI_Win_fence synchronization. */
    if (_order_int_test(group_ranks, n_group_ranks)) {

      MPI_Group glob_group;
      MPI_Comm_group(cs_glob_mpi_comm, &glob_group);
      MPI_Group_incl(glob_group,
                     n_group_ranks,
                     group_ranks,
                     &(halo->c_domain_group));
      MPI_Group_free(&glob_group);

    }

    BFT_FREE(group_ranks);
  }

  /* Exchange shifts for one-sided communication */
  if (_halo_comm_mode == CS_HALO_COMM_RMA_GET)
    _exchange_send_shift(halo);

  if (_halo_state == NULL)
    _halo_state = cs_halo_state_create();

#endif /* defined(HAVE_MPI) */
}
685
686 /*----------------------------------------------------------------------------*/
687 /*!
688 * \brief Create a halo structure, given a reference halo.
689 *
690 * \param[in] ref pointer to reference halo
691 *
692 * \return pointer to created cs_halo_t structure
693 */
694 /*----------------------------------------------------------------------------*/
695
696 cs_halo_t *
cs_halo_create_from_ref(const cs_halo_t * ref)697 cs_halo_create_from_ref(const cs_halo_t *ref)
698 {
699 cs_lnum_t i;
700
701 cs_halo_t *halo = NULL;
702
703 BFT_MALLOC(halo, 1, cs_halo_t);
704
705 halo->n_c_domains = ref->n_c_domains;
706 halo->n_transforms = ref->n_transforms;
707
708 halo->periodicity = ref->periodicity;
709 halo->n_rotations = ref->n_rotations;
710
711 halo->n_local_elts = 0;
712
713 BFT_MALLOC(halo->c_domain_rank, halo->n_c_domains, int);
714
715 for (i = 0; i < halo->n_c_domains; i++)
716 halo->c_domain_rank[i] = ref->c_domain_rank[i];
717
718 CS_MALLOC_HD(halo->send_index, 2*halo->n_c_domains + 1, cs_lnum_t,
719 _halo_buffer_alloc_mode);
720 BFT_MALLOC(halo->index, 2*halo->n_c_domains + 1, cs_lnum_t);
721
722 for (i = 0; i < 2*halo->n_c_domains + 1; i++) {
723 halo->send_index[i] = 0;
724 halo->index[i] = 0;
725 }
726
727 halo->send_perio_lst = NULL;
728 halo->perio_lst = NULL;
729
730 if (halo->n_transforms > 0) {
731
732 cs_lnum_t perio_lst_size = 2*halo->n_transforms * 2*halo->n_c_domains;
733
734 BFT_MALLOC(halo->send_perio_lst, perio_lst_size, cs_lnum_t);
735 BFT_MALLOC(halo->perio_lst, perio_lst_size, cs_lnum_t);
736
737 for (i = 0; i < perio_lst_size; i++) {
738 halo->send_perio_lst[i] = 0;
739 halo->perio_lst[i] = 0;
740 }
741
742 }
743
744 halo->send_list = NULL;
745
746 #if defined(HAVE_MPI)
747 halo->c_domain_group = MPI_GROUP_NULL;
748 halo->c_domain_s_shift = NULL;
749 #endif
750
751 _n_halos += 1;
752
753 cs_halo_create_complete(halo);
754
755 return halo;
756 }
757
758 #if defined(HAVE_MPI)
759
760 /*----------------------------------------------------------------------------*/
761 /*!
762 * \brief Create a halo structure from distant element distant ranks and ids.
763 *
764 * \remark This function does not handle periodicity. For most matrix-vector,
765 * products and similar operations, periodicity of translation an
766 * even rotation could be handled with no specific halo information,
767 * simply by assigning an equivalence between two periodic elements.
768 * For rotation, this would require also applying a rotation through
769 * the matrix coefficients (this would have the advantage of being
770 * compatible with external libraries). An alternative would be
771 * to add rotation information to a given halo as a second stage,
772 * through a specialized operator which can be added in the future.
773 *
774 * \param[in] rn associated rank neighbors info
775 * \param[in] n_local_elts number of elements for local rank
776 * \param[in] n_distant_elts number of distant elements for local rank
777 * \param[in] elt_rank_id distant element rank index in rank neighbors,
778 * ordered by rank (size: n_distant_elts)
779 * \param[in] elt_id distant element id (at distant rank),
780 * ordered by rank (size: n_distant_elts)
781 *
782 * \return pointer to created cs_halo_t structure
783 */
784 /*----------------------------------------------------------------------------*/
785
cs_halo_t *
cs_halo_create_from_rank_neighbors(const cs_rank_neighbors_t  *rn,
                                   cs_lnum_t                   n_local_elts,
                                   cs_lnum_t                   n_distant_elts,
                                   const int                   elt_rank_id[],
                                   const cs_lnum_t             elt_id[])
{
  cs_halo_t *halo = NULL;

  BFT_MALLOC(halo, 1, cs_halo_t);

  /* No periodicity is handled here (see function documentation). */

  halo->n_c_domains = 0;
  halo->n_transforms = 0;

  halo->n_rotations = 0;

  halo->periodicity = NULL;
  halo->send_perio_lst = NULL;
  halo->perio_lst = NULL;

#if defined(HAVE_MPI)
  halo->c_domain_group = MPI_GROUP_NULL;
  halo->c_domain_s_shift = NULL;
#endif

  halo->n_local_elts = n_local_elts;

  for (int i = 0; i < CS_HALO_N_TYPES; i++) {
    halo->n_send_elts[i] = 0;
    halo->n_elts [i] = n_distant_elts;
  }

  /* Count elements for each rank;
     check they are ordered lexicographically */

  cs_lnum_t *rank_count;
  BFT_MALLOC(rank_count, rn->size*2, cs_lnum_t);
  for (int i = 0; i < rn->size; i++)
    rank_count[i] = 0;

  int rank_prev = -1;
  int elt_prev = -1;
  for (cs_lnum_t i = 0; i < n_distant_elts; i++) {
    int rank_id = elt_rank_id[i];
    if (   rank_id < rank_prev
        || (rank_id == rank_prev && elt_id[i] <= elt_prev))
      bft_error
        (__FILE__, __LINE__, 0,
         "%s:\n"
         "  Rank and distant element ids passed to this function must\n"
         "  be lexicographically ordered; this is not the case here.",
         __func__);
    rank_count[rank_id] += 1;
    rank_prev = rank_id;
    elt_prev = elt_id[i];
  }

  /* Now exchange counts with neighboring elements */

  MPI_Comm comm = cs_glob_mpi_comm;
  MPI_Request *request = NULL;
  MPI_Status *status = NULL;

  BFT_MALLOC(request, rn->size*2, MPI_Request);
  BFT_MALLOC(status, rn->size*2, MPI_Status);

  /* Exchange local range with neighbor ranks */

  int request_count = 0;
  const int local_rank = CS_MAX(cs_glob_rank_id, 0);

  /* Counts received from each neighbor are stored in the second half
     of rank_count (rank_count[rn->size + i]). */

  for (int i = 0; i < rn->size; i++) {
    MPI_Irecv(rank_count + rn->size + i,
              1,
              CS_MPI_LNUM,
              rn->rank[i],
              local_rank,
              comm,
              &(request[request_count++]));
  }

  for (int i = 0; i < rn->size; i++) {
    MPI_Isend(rank_count + i,
              1,
              CS_MPI_LNUM,
              rn->rank[i],
              rn->rank[i],
              comm,
              &(request[request_count++]));
  }

  MPI_Waitall(request_count, request, status);

  /* Now build send and receive indexes to exchange data;
     the receive index can be directly assigned to the halo;
     also check if cs_glob_rank_id belongs to interface set in order to
     order ranks with local rank at first place */

  int loc_r_index = -1;
  cs_lnum_t r_displ = 0, loc_r_displ = 0;
  cs_lnum_t recv_count = 0, send_count = 0;

  halo->n_c_domains = 0;
  for (int i = 0; i < rn->size; i++) {
    /* A rank is a communicating domain if data flows in either direction. */
    if (rank_count[i] + rank_count[rn->size + i] > 0) {
      halo->n_c_domains += 1;
      if (rn->rank[i] == local_rank) {
        loc_r_index = i;
        loc_r_displ = r_displ;
        assert(rank_count[i] == rank_count[rn->size + i]);
      }
      r_displ += rank_count[i];
      recv_count += rank_count[rn->size + i];
    }
  }

  BFT_MALLOC(halo->c_domain_rank, halo->n_c_domains, int);

  /* send_list is sized by the sum of counts received from neighbors,
     i.e. the number of local elements that distant ranks will request
     (it is filled by the Irecv calls below). */

  CS_MALLOC_HD(halo->send_list, recv_count, cs_lnum_t,
               _halo_buffer_alloc_mode);
  CS_MALLOC_HD(halo->send_index, 2*halo->n_c_domains + 1, cs_lnum_t,
               _halo_buffer_alloc_mode);
  BFT_MALLOC(halo->index, halo->n_c_domains*2+1, cs_lnum_t);

  halo->n_c_domains = 0;
  send_count = 0;
  recv_count = 0;

  halo->index[0] = 0;
  halo->send_index[0] = 0;

  /* Local rank first if present: its section is copied directly from
     elt_id, with no MPI exchange needed. */

  if (loc_r_index > -1) {
    halo->c_domain_rank[0] = local_rank;
    cs_lnum_t l_count = rank_count[loc_r_index];
    for (cs_lnum_t i = 0; i < l_count; i++)
      halo->send_list[i] = elt_id[loc_r_displ + i];
    send_count += l_count;
    recv_count += l_count;
    halo->n_c_domains = 1;
    for (int j = 1; j < 3; j++) {
      halo->index[j] = recv_count;
      halo->send_index[j] = send_count;
    }
  }

  /* Remaining (distant) communicating ranks; standard and extended
     index entries (j = 1, 2) are set identically, as no extended halo
     is built here. */

  for (int i = 0; i < rn->size; i++) {
    if (   rank_count[i] + rank_count[rn->size + i] > 0
        && rn->rank[i] != local_rank) {
      halo->c_domain_rank[halo->n_c_domains] = rn->rank[i];
      recv_count += rank_count[i];
      send_count += rank_count[rn->size + i];
      for (int j = 1; j < 3; j++) {
        halo->index[halo->n_c_domains*2 + j] = recv_count;
        halo->send_index[halo->n_c_domains*2 + j] = send_count;
      }
      halo->n_c_domains += 1;
    }
  }

  BFT_FREE(rank_count);

  for (int i = 0; i < CS_HALO_N_TYPES; i++)
    halo->n_send_elts[i] = send_count;

  /* Now send lists to matching ranks (reverse send and receive) */

  request_count = 0;

  for (int i = 0; i < halo->n_c_domains; i++) {
    int rank_id = halo->c_domain_rank[i];
    if (rank_id == local_rank) continue;
    cs_lnum_t r_shift = halo->send_index[2*i];
    cs_lnum_t r_size = halo->send_index[2*i+1] - r_shift;
    if (r_size > 0)
      MPI_Irecv(halo->send_list + r_shift,
                r_size,
                CS_MPI_LNUM,
                rank_id,
                local_rank,
                comm,
                &(request[request_count++]));
  }

  for (int i = 0; i < halo->n_c_domains; i++) {
    int rank_id = halo->c_domain_rank[i];
    if (rank_id == local_rank) continue;
    cs_lnum_t s_shift = halo->index[2*i];
    cs_lnum_t s_size = halo->index[2*i+1] - s_shift;
    if (s_shift < loc_r_displ) { /* case with local rank first */
      assert(halo->c_domain_rank[0] == local_rank);
      s_shift -= halo->index[2];
    }
    if (s_size > 0)
      MPI_Isend(elt_id + s_shift,
                s_size,
                CS_MPI_LNUM,
                rank_id,
                rank_id,
                comm,
                &(request[request_count++]));
  }

  MPI_Waitall(request_count, request, status);

  BFT_FREE(request);
  BFT_FREE(status);

  _n_halos += 1;

  cs_halo_create_complete(halo);

  return halo;
}
999
1000 #endif /* HAVE_MPI */
1001
1002 /*----------------------------------------------------------------------------*/
1003 /*!
 * \brief Destroy a halo structure.
1005 *
1006 * \param[in, out] halo pointer to pointer to cs_halo structure to destroy.
1007 */
1008 /*----------------------------------------------------------------------------*/
1009
void
cs_halo_destroy(cs_halo_t  **halo)
{
  /* Accept a NULL pointer, or a pointer to NULL, as a no-op,
     so callers may destroy unconditionally. */

  if (halo == NULL)
    return;

  if (*halo == NULL)
    return;

  cs_halo_t  *_halo = *halo;

#if defined(HAVE_MPI)
  if (_halo->c_domain_group != MPI_GROUP_NULL)
    MPI_Group_free(&(_halo->c_domain_group));

  BFT_FREE(_halo->c_domain_s_shift);
#endif

  BFT_FREE(_halo->c_domain_rank);

  /* send_list and send_index are allocated with CS_MALLOC_HD (possibly
     with a device-side mirror), so they must be freed with CS_FREE_HD */

  CS_FREE_HD(_halo->send_list);
  CS_FREE_HD(_halo->send_index);
  BFT_FREE(_halo->index);

  BFT_FREE(_halo->send_perio_lst);
  BFT_FREE(_halo->perio_lst);

  BFT_FREE(*halo);

  _n_halos -= 1;

  /* Delete default state if no halo remains */

  if (_n_halos == 0)
    cs_halo_state_destroy(&_halo_state);
}
1046
1047 /*----------------------------------------------------------------------------*/
1048 /*!
1049 * \brief Create a halo state structure.
1050 *
1051 * \return pointer to created cs_halo_state_t structure.
1052 */
1053 /*----------------------------------------------------------------------------*/
1054
1055 cs_halo_state_t *
cs_halo_state_create(void)1056 cs_halo_state_create(void)
1057 {
1058 cs_halo_state_t *hs;
1059 BFT_MALLOC(hs, 1, cs_halo_state_t);
1060
1061 cs_halo_state_t hs_ini = {
1062 .sync_mode = CS_HALO_STANDARD,
1063 .data_type = CS_DATATYPE_NULL,
1064 .stride = 0,
1065 .var_location = CS_ALLOC_HOST,
1066 .send_buffer_cur = NULL,
1067 .n_requests = 0,
1068 .local_rank_id = -1,
1069 .send_buffer_size = 0,
1070 .recv_buffer_size = 0,
1071 .send_buffer = NULL,
1072 .recv_buffer = NULL
1073 #if defined(HAVE_MPI)
1074 ,
1075 .request_size = 0,
1076 .request = NULL,
1077 .status = NULL,
1078 .win = MPI_WIN_NULL
1079
1080 #endif
1081 };
1082
1083 *hs = hs_ini;
1084
1085 return hs;
1086 }
1087
1088 /*----------------------------------------------------------------------------*/
1089 /*!
1090 * \brief Destroy a halo state structure.
1091 *
1092 * \param[in, out] halo_state pointer to pointer to cs_halo_state
1093 * structure to destroy.
1094 */
1095 /*----------------------------------------------------------------------------*/
1096
1097 void
cs_halo_state_destroy(cs_halo_state_t ** halo_state)1098 cs_halo_state_destroy(cs_halo_state_t **halo_state)
1099 {
1100 if (halo_state != NULL) {
1101 cs_halo_state_t *hs = *halo_state;
1102
1103 #if defined(HAVE_MPI)
1104 #if (MPI_VERSION >= 3)
1105 if (hs->win != MPI_WIN_NULL) {
1106 MPI_Win_free(&(hs->win));
1107 hs->win = MPI_WIN_NULL;
1108 }
1109 #endif
1110 #endif
1111
1112 CS_FREE_HD(hs->send_buffer);
1113
1114 #if defined(HAVE_MPI)
1115 BFT_FREE(hs->request);
1116 BFT_FREE(hs->status);
1117 #endif
1118
1119 BFT_FREE(*halo_state);
1120 }
1121 }
1122
1123 /*----------------------------------------------------------------------------*/
1124 /*!
1125 * \brief Get pointer to default halo state structure.
1126 *
 * \return  pointer to the default halo state structure.
1128 */
1129 /*----------------------------------------------------------------------------*/
1130
1131 cs_halo_state_t *
cs_halo_state_get_default(void)1132 cs_halo_state_get_default(void)
1133 {
1134 return _halo_state;
1135 }
1136
1137 /*----------------------------------------------------------------------------
1138 * Apply local cells renumbering to a halo
1139 *
1140 * parameters:
1141 * halo <-- pointer to halo structure
1142 * new_cell_id <-- array indicating old -> new cell id (0 to n-1)
1143 *---------------------------------------------------------------------------*/
1144
1145 void
cs_halo_renumber_cells(cs_halo_t * halo,const cs_lnum_t new_cell_id[])1146 cs_halo_renumber_cells(cs_halo_t *halo,
1147 const cs_lnum_t new_cell_id[])
1148 {
1149 if (halo != NULL) {
1150
1151 const cs_lnum_t n_elts = halo->n_send_elts[CS_HALO_EXTENDED];
1152
1153 for (cs_lnum_t j = 0; j < n_elts; j++)
1154 halo->send_list[j] = new_cell_id[halo->send_list[j]];
1155
1156 cs_sync_h2d(halo->send_list);
1157
1158 }
1159 }
1160
1161 /*----------------------------------------------------------------------------
1162 * Apply ghost cells renumbering to a halo
1163 *
1164 * parameters:
1165 * halo <-- pointer to halo structure
1166 * old_cell_id <-- array indicating new -> old cell id (0 to n-1)
1167 *---------------------------------------------------------------------------*/
1168
void
cs_halo_renumber_ghost_cells(cs_halo_t        *halo,
                             const cs_lnum_t   old_cell_id[])
{
  if (halo == NULL)
    return;

  /* Reverse update from distant cells: for each neighbor rank section,
     compute the new relative position of each ghost cell within that
     section, then send those positions back to the owning rank so it
     can reorder its matching send-list section. */

  cs_lnum_t *send_buf, *recv_buf;

  BFT_MALLOC(send_buf, halo->n_send_elts[1], cs_lnum_t);
  BFT_MALLOC(recv_buf, halo->n_elts[1], cs_lnum_t);

  for (int i = 0; i < halo->n_c_domains; i++) {
    cs_lnum_t start = halo->index[2*i];
    cs_lnum_t end = halo->index[2*i+2];
    cs_lnum_t shift = halo->n_local_elts + halo->index[2*i];
    for (cs_lnum_t j = start; j < end; j++) {
      /* position of ghost cell j relative to its rank section start */
      recv_buf[j] = old_cell_id[halo->n_local_elts + j] - shift;
      assert(recv_buf[j] >= 0 && recv_buf[j] < (end - start));
    }
  }

  int local_rank_id = (cs_glob_n_ranks == 1) ? 0 : -1;

#if defined(HAVE_MPI)

  if (cs_glob_n_ranks > 1) {

    int rank_id;
    int request_count = 0;
    const int local_rank = cs_glob_rank_id;

    MPI_Request *request;
    MPI_Status *status;

    BFT_MALLOC(request, halo->n_c_domains*2, MPI_Request);
    BFT_MALLOC(status, halo->n_c_domains*2, MPI_Status);

    /* Receive data from distant ranks; note the exchange direction is
       reversed here, so receives target the send-side buffer/index. */

    for (rank_id = 0; rank_id < halo->n_c_domains; rank_id++) {

      cs_lnum_t start = halo->send_index[2*rank_id];
      cs_lnum_t length = (  halo->send_index[2*rank_id + 2]
                          - halo->send_index[2*rank_id]);

      if (halo->c_domain_rank[rank_id] != local_rank) {
        if (length > 0)
          MPI_Irecv(send_buf + start,
                    length,
                    CS_MPI_LNUM,
                    halo->c_domain_rank[rank_id],
                    local_rank,       /* tag: receiver's rank */
                    cs_glob_mpi_comm,
                    &(request[request_count++]));
      }
      else
        local_rank_id = rank_id;  /* self section handled locally below */

    }

    /* We wait for posting all receives (often recommended) */

    if (_halo_use_barrier)
      MPI_Barrier(cs_glob_mpi_comm);

    /* Send data to distant ranks */

    for (rank_id = 0; rank_id < halo->n_c_domains; rank_id++) {

      /* If this is not the local rank */

      if (halo->c_domain_rank[rank_id] != local_rank) {

        cs_lnum_t start = halo->index[2*rank_id];
        cs_lnum_t length = (  halo->index[2*rank_id + 2]
                            - halo->index[2*rank_id]);

        if (length > 0)
          MPI_Isend(recv_buf + start,
                    length,
                    CS_MPI_LNUM,
                    halo->c_domain_rank[rank_id],
                    halo->c_domain_rank[rank_id],  /* tag: receiver's rank */
                    cs_glob_mpi_comm,
                    &(request[request_count++]));

      }

    }

    /* Wait for all exchanges */

    MPI_Waitall(request_count, request, status);

    BFT_FREE(request);
    BFT_FREE(status);

  }

#endif /* defined(HAVE_MPI) */

  /* Copy local values if present */

  if (local_rank_id > -1) {

    cs_lnum_t *recv = recv_buf + halo->index[2*local_rank_id];

    cs_lnum_t start = halo->send_index[2*local_rank_id];
    cs_lnum_t length = (  halo->send_index[2*local_rank_id + 2]
                        - halo->send_index[2*local_rank_id]);

    for (cs_lnum_t j = 0; j < length; j++)
      send_buf[j+start] = recv[j];

  }

  BFT_FREE(recv_buf);

  /* Now apply renumbering to send list: each received value is the new
     position, within its rank section, of a previously listed element */

  for (int i = 0; i < halo->n_c_domains; i++) {
    cs_lnum_t start = halo->send_index[2*i];
    cs_lnum_t end = halo->send_index[2*i+2];
    for (cs_lnum_t j = start; j < end; j++)
      send_buf[j] = halo->send_list[start + send_buf[j]];
    for (cs_lnum_t j = start; j < end; j++)
      halo->send_list[j] = send_buf[j];
  }

  /* Keep the device copy of the send list coherent, if present */

  cs_sync_h2d(halo->send_list);

  BFT_FREE(send_buf);
}
1305
1306 /*----------------------------------------------------------------------------*/
1307 /*!
1308 * \brief Initialize halo state prior to packing halo data to send.
1309 *
1310 * A local state handler may be provided, or the default state handler will
1311 * be used.
1312 *
1313 * This function is included in \ref cs_halo_sync_pack, but may be called
1314 * separately for specific implementations, such as for accelerator devices.
1315 *
1316 * A local state and/or buffer may be provided, or the default (global) state
1317 * and buffer will be used. If provided explicitely,
1318 * the buffer must be of sufficient size.
1319 *
1320 * \param[in] halo pointer to halo structure
1321 * \param[in] sync_mode synchronization mode (standard or extended)
1322 * \param[in] data_type data type
1323 * \param[in] stride number of (interlaced) values by entity
1324 * \param[out] send_buf pointer to send buffer, NULL for global buffer
1325 * \param[in, out] hs pointer to halo state, NULL for global state
1326 *
1327 * \return pointer to halo send buffer
1328 */
1329 /*----------------------------------------------------------------------------*/
1330
void *
cs_halo_sync_pack_init_state(const cs_halo_t  *halo,
                             cs_halo_type_t    sync_mode,
                             cs_datatype_t     data_type,
                             int               stride,
                             void             *send_buf,
                             cs_halo_state_t  *hs)
{
  void *_send_buffer = send_buf;

  if (halo == NULL)
    return _send_buffer;

  /* Use the provided state, or the default (global) one */

  cs_halo_state_t *_hs = (hs != NULL) ? hs : _halo_state;

  /* When no explicit buffer is given, use (and grow if needed) the
     state's internal send buffer; it never shrinks. */

  if (_send_buffer == NULL) {
    size_t send_buffer_size = cs_halo_pack_size(halo, data_type, stride);

    if (send_buffer_size > _hs->send_buffer_size) {
      /* Match the allocation mode of the halo's send list, so packing
         may occur on the same side (host or device) */
      cs_alloc_mode_t alloc_mode = cs_check_device_ptr(halo->send_list);

      _hs->send_buffer_size = send_buffer_size;

#if defined(HAVE_MPI)
#if (MPI_VERSION >= 3)
      /* An RMA window refers to the old buffer; free it before
         reallocating, and recreate it below if in RMA mode */
      if (_hs->win != MPI_WIN_NULL) {
        MPI_Win_free(&(_hs->win));
        _hs->win = MPI_WIN_NULL;
      }
#endif
#endif

      CS_FREE_HD(_hs->send_buffer);
      CS_MALLOC_HD(_hs->send_buffer,
                   _hs->send_buffer_size,
                   char,
                   alloc_mode);

#if defined(HAVE_MPI)
#if (MPI_VERSION >= 3)
      if (_halo_comm_mode == CS_HALO_COMM_RMA_GET)
        MPI_Win_create(_hs->send_buffer,
                       _hs->send_buffer_size,
                       1, /* displacement unit */
                       MPI_INFO_NULL,
                       MPI_COMM_WORLD,
                       &(_hs->win));
#endif
#endif
    }

    _send_buffer = _hs->send_buffer;
  }

  /* Record exchange parameters for cs_halo_sync_start/wait.
     NOTE(review): the variable location is always marked host here;
     device-side packing (cs_halo_sync_pack_d) is expected to override
     this afterwards — confirm callers rely on that. */

  _hs->var_location = CS_ALLOC_HOST;
  _hs->send_buffer_cur = _send_buffer;

  _hs->sync_mode = sync_mode;
  _hs->data_type = data_type;
  _hs->stride = stride;

  return _send_buffer;
}
1394
1395 /*----------------------------------------------------------------------------*/
1396 /*!
1397 * \brief Pack halo data to send into dense buffer.
1398 *
1399 * A local state handler may be provided, or the default state handler will
1400 * be used.
1401 *
1402 * A local state and/or buffer may be provided, or the default (global) state
1403 * and buffer will be used. If provided explicitely,
1404 * the buffer must be of sufficient size.
1405 *
1406 * \param[in] halo pointer to halo structure
1407 * \param[in] sync_mode synchronization mode (standard or extended)
1408 * \param[in] data_type data type
1409 * \param[in] stride number of (interlaced) values by entity
1410 * \param[in] val pointer to variable value array
1411 * \param[out] send_buf pointer to send buffer, NULL for global buffer
1412 * \param[in, out] hs pointer to halo state, NULL for global state
1413 */
1414 /*----------------------------------------------------------------------------*/
1415
1416 void
cs_halo_sync_pack(const cs_halo_t * halo,cs_halo_type_t sync_mode,cs_datatype_t data_type,int stride,void * val,void * send_buf,cs_halo_state_t * hs)1417 cs_halo_sync_pack(const cs_halo_t *halo,
1418 cs_halo_type_t sync_mode,
1419 cs_datatype_t data_type,
1420 int stride,
1421 void *val,
1422 void *send_buf,
1423 cs_halo_state_t *hs)
1424 {
1425 if (halo == NULL)
1426 return;
1427
1428 void *_send_buffer = cs_halo_sync_pack_init_state(halo,
1429 sync_mode,
1430 data_type,
1431 stride,
1432 send_buf,
1433 hs);
1434
1435 cs_lnum_t end_shift = 0;
1436
1437 if (sync_mode == CS_HALO_STANDARD)
1438 end_shift = 1;
1439
1440 else if (sync_mode == CS_HALO_EXTENDED)
1441 end_shift = 2;
1442
1443 /* Assemble buffers for halo exchange; avoid threading for now, as dynamic
1444 scheduling led to slightly higher cost here in some tests,
1445 and even static scheduling might lead to false sharing for small
1446 halos. */
1447
1448 if (data_type == CS_REAL_TYPE) {
1449
1450 cs_real_t *buffer = (cs_real_t *)_send_buffer;
1451 cs_real_t *var = val;
1452
1453 for (int rank_id = 0; rank_id < halo->n_c_domains; rank_id++) {
1454
1455 cs_lnum_t p_start = halo->send_index[2*rank_id]*stride;
1456 size_t start = halo->send_index[2*rank_id];
1457 size_t length = ( halo->send_index[2*rank_id + end_shift]
1458 - halo->send_index[2*rank_id]);
1459
1460 if (stride == 3) { /* Unroll loop for this case */
1461 for (size_t i = 0; i < length; i++) {
1462 buffer[p_start + i*3]
1463 = var[(halo->send_list[start + i])*3];
1464 buffer[p_start + i*3 + 1]
1465 = var[(halo->send_list[start + i])*3 + 1];
1466 buffer[p_start + i*3 + 2]
1467 = var[(halo->send_list[start + i])*3 + 2];
1468 }
1469 }
1470 else {
1471 size_t _stride = stride;
1472 for (size_t i = 0; i < length; i++) {
1473 size_t r_start = halo->send_list[start + i] * stride;
1474 for (size_t j = 0; j < _stride; j++)
1475 buffer[p_start + i*_stride + j] = var[r_start + j];
1476 }
1477 }
1478
1479 }
1480
1481 }
1482
1483 else {
1484
1485 unsigned char *buffer = (unsigned char *)_send_buffer;
1486
1487 size_t elt_size = cs_datatype_size[data_type] * stride;
1488
1489 for (int rank_id = 0; rank_id < halo->n_c_domains; rank_id++) {
1490
1491 cs_lnum_t p_start = halo->send_index[2*rank_id]*elt_size;
1492 size_t start = halo->send_index[2*rank_id];
1493 size_t length = ( halo->send_index[2*rank_id + end_shift]
1494 - halo->send_index[2*rank_id]);
1495
1496 unsigned char *restrict _val = val;
1497 unsigned char *_buffer = buffer + p_start;
1498
1499 for (size_t i = 0; i < length; i++) {
1500 size_t r_start = halo->send_list[start + i] * elt_size;
1501 for (size_t j = 0; j < elt_size; j++)
1502 _buffer[i*elt_size + j] = _val[r_start + j];
1503 }
1504
1505 }
1506
1507 }
1508 }
1509
1510 #if defined(HAVE_ACCEL)
1511
1512 /*----------------------------------------------------------------------------*/
1513 /*!
1514 * \brief Pack halo data to send into dense buffer on accelerator device.
1515 *
1516 * A local state handler may be provided, or the default state handler will
1517 * be used.
1518 *
1519 * A local state and/or buffer may be provided, or the default (global) state
1520 * and buffer will be used. If provided explicitely,
1521 * the buffer must be of sufficient size.
1522 *
1523 * \param[in] halo pointer to halo structure
1524 * \param[in] sync_mode synchronization mode (standard or extended)
1525 * \param[in] data_type data type
1526 * \param[in] stride number of (interlaced) values by entity
1527 * \param[in] val pointer to variable value array (on device)
1528 * \param[out] send_buf pointer to send buffer (on device),
1529 * NULL for global buffer
1530 * \param[in, out] hs pointer to halo state, NULL for global state
1531 */
1532 /*----------------------------------------------------------------------------*/
1533
1534 void
cs_halo_sync_pack_d(const cs_halo_t * halo,cs_halo_type_t sync_mode,cs_datatype_t data_type,int stride,void * val,void * send_buf,cs_halo_state_t * hs)1535 cs_halo_sync_pack_d(const cs_halo_t *halo,
1536 cs_halo_type_t sync_mode,
1537 cs_datatype_t data_type,
1538 int stride,
1539 void *val,
1540 void *send_buf,
1541 cs_halo_state_t *hs)
1542 {
1543 if (halo == NULL)
1544 return;
1545
1546 cs_halo_state_t *_hs = (hs != NULL) ? hs : _halo_state;
1547
1548 void *_send_buf = cs_halo_sync_pack_init_state(halo,
1549 sync_mode,
1550 data_type,
1551 stride,
1552 send_buf,
1553 _hs);
1554
1555 void *_send_buf_d = cs_get_device_ptr(_send_buf);
1556
1557 #if defined(HAVE_CUDA)
1558
1559 cs_halo_cuda_pack_send_buffer_real(halo,
1560 sync_mode,
1561 stride,
1562 val,
1563 _send_buf_d);
1564
1565 #else
1566
1567 cs_halo_sync_pack(halo,
1568 sync_mode,
1569 data_type,
1570 stride,
1571 val,
1572 send_buf,
1573 _hs);
1574
1575 #endif
1576
1577 _hs->var_location = CS_ALLOC_HOST;
1578 }
1579
1580 #endif /* defined(HAVE_ACCEL) */
1581
1582 /*----------------------------------------------------------------------------*/
1583 /*!
1584 * \brief Launch update array of values in case of parallelism or periodicity.
1585 *
1586 * This function aims at copying main values from local elements
1587 * (id between 1 and n_local_elements) to ghost elements on distant ranks
1588 * (id between n_local_elements + 1 to n_local_elements_with_halo).
1589 *
1590 * The cs_halo_sync_pack function should have been called before this function,
1591 * using the same hs argument.
1592 *
1593 * \param[in] halo pointer to halo structure
1594 * \param[in] val pointer to variable value array
1595 * \param[in, out] hs pointer to halo state, NULL for global state
1596 */
1597 /*----------------------------------------------------------------------------*/
1598
1599 void
cs_halo_sync_start(const cs_halo_t * halo,void * val,cs_halo_state_t * hs)1600 cs_halo_sync_start(const cs_halo_t *halo,
1601 void *val,
1602 cs_halo_state_t *hs)
1603 {
1604 if (halo == NULL)
1605 return;
1606
1607 cs_halo_state_t *_hs = (hs != NULL) ? hs : _halo_state;
1608
1609 #if (MPI_VERSION >= 3)
1610 if (_halo_comm_mode > CS_HALO_COMM_P2P) {
1611 _halo_sync_start_one_sided(halo, val, _hs);
1612 return;
1613 }
1614 #endif
1615
1616 cs_lnum_t end_shift = (_hs->sync_mode == CS_HALO_EXTENDED) ? 2 : 1;
1617 cs_lnum_t stride = _hs->stride;
1618 size_t elt_size = cs_datatype_size[_hs->data_type] * stride;
1619 size_t n_loc_elts = halo->n_local_elts;
1620
1621 unsigned char *restrict _val = val;
1622 unsigned char *restrict _val_dest = _val + n_loc_elts*elt_size;
1623
1624 unsigned char *buffer = (unsigned char *)(_hs->send_buffer_cur);
1625
1626 if (_hs->var_location > CS_ALLOC_HOST) {
1627 # if defined(_CS_MPI_DEVICE_SUPPORT)
1628 /* For CUDA-aware MPI, directly work with buffer on device */
1629 buffer = cs_get_device_ptr(buffer);
1630 # else
1631 /* For host-based MPI, copy or prefetch buffer */
1632 cs_sync_d2h(buffer);
1633
1634 /* When array passed is defined on device but is not shared, use
1635 separate (smaller) CPU for receive (as we cannot know whether
1636 a matching host beffer without complexifying the API);
1637 this will be copied back to device at the next step */
1638 if (_hs->var_location != CS_ALLOC_HOST_DEVICE_SHARED) {
1639 if (_hs->recv_buffer_size < _hs->send_buffer_size) {
1640 _hs->recv_buffer_size = _hs->send_buffer_size;
1641 CS_FREE_HD(_hs->recv_buffer);
1642 CS_MALLOC_HD(_hs->recv_buffer, _hs->recv_buffer_size, unsigned char,
1643 CS_ALLOC_HOST_DEVICE_PINNED);
1644 }
1645 _val_dest = _hs->recv_buffer;
1646 }
1647 #endif
1648 }
1649
1650 #if defined(HAVE_MPI)
1651
1652 _update_requests(halo, _hs);
1653
1654 MPI_Datatype mpi_datatype = cs_datatype_to_mpi[_hs->data_type];
1655
1656 int request_count = 0;
1657 const int local_rank = CS_MAX(cs_glob_rank_id, 0);
1658
1659 /* Receive data from distant ranks */
1660
1661 for (int rank_id = 0; rank_id < halo->n_c_domains; rank_id++) {
1662
1663 cs_lnum_t length = ( halo->index[2*rank_id + end_shift]
1664 - halo->index[2*rank_id]) * stride;
1665
1666 if (halo->c_domain_rank[rank_id] != local_rank) {
1667
1668 if (length > 0) {
1669 size_t start = (size_t)(halo->index[2*rank_id]);
1670 unsigned char *dest = _val_dest + start*elt_size;
1671
1672 MPI_Irecv(dest,
1673 length*_hs->stride,
1674 mpi_datatype,
1675 halo->c_domain_rank[rank_id],
1676 halo->c_domain_rank[rank_id],
1677 cs_glob_mpi_comm,
1678 &(_hs->request[request_count++]));
1679 }
1680
1681 }
1682 else
1683 _hs->local_rank_id = rank_id;
1684 }
1685
1686 /* We may wait for posting all receives (sometimes recommended) */
1687
1688 if (_halo_use_barrier)
1689 MPI_Barrier(cs_glob_mpi_comm);
1690
1691 /* Send data to distant ranks */
1692
1693 for (int rank_id = 0; rank_id < halo->n_c_domains; rank_id++) {
1694
1695 cs_lnum_t start = halo->send_index[2*rank_id]*elt_size;
1696 cs_lnum_t length = ( halo->send_index[2*rank_id + end_shift]
1697 - halo->send_index[2*rank_id]);
1698
1699 if (halo->c_domain_rank[rank_id] != local_rank && length > 0)
1700 MPI_Isend(buffer + start,
1701 length*stride,
1702 mpi_datatype,
1703 halo->c_domain_rank[rank_id],
1704 local_rank,
1705 cs_glob_mpi_comm,
1706 &(_hs->request[request_count++]));
1707
1708 }
1709
1710 _hs->n_requests = request_count;
1711
1712 #endif /* defined(HAVE_MPI) */
1713 }
1714
1715 /*----------------------------------------------------------------------------*/
1716 /*!
1717 * \brief Wait for completion of update array of values in case of
1718 * parallelism or periodicity.
1719 *
1720 * This function aims at copying main values from local elements
1721 * (id between 1 and n_local_elements) to ghost elements on distant ranks
1722 * (id between n_local_elements + 1 to n_local_elements_with_halo).
1723 *
1724 * The cs_halo_sync_start function should have been called before this function,
1725 * using the same hs argument.
1726 *
1727 * \param[in] halo pointer to halo structure
1728 * \param[in] val pointer to variable value array
1729 * \param[in, out] hs pointer to halo state, NULL for global state
1730 */
1731 /*----------------------------------------------------------------------------*/
1732
void
cs_halo_sync_wait(const cs_halo_t  *halo,
                  void             *val,
                  cs_halo_state_t  *hs)
{
  if (halo == NULL)
    return;

  /* Use the provided state, or the default (global) one */

  cs_halo_state_t *_hs = (hs != NULL) ? hs : _halo_state;

#if (MPI_VERSION >= 3)
  if (_halo_comm_mode > CS_HALO_COMM_P2P) {
    _halo_sync_complete_one_sided(halo, val, _hs);
    return;
  }
#endif

#if defined(HAVE_MPI)

  /* Wait for all exchanges */

  if (_hs->n_requests > 0)
    MPI_Waitall(_hs->n_requests, _hs->request, _hs->status);

#endif /* defined(HAVE_MPI) */

#if defined(HAVE_ACCEL)
#if !defined(_CS_MPI_DEVICE_SUPPORT)

  /* Without device-aware MPI, ghost values arrived on the host;
     prefetch (shared memory) or copy (separate receive buffer)
     them back to the device. */

  if (_hs->var_location > CS_ALLOC_HOST) {

    size_t n_loc_elts = halo->n_local_elts;
    size_t n_elts = (   _hs->sync_mode
                     == CS_HALO_EXTENDED) ? halo->n_elts[1] : halo->n_elts[0];
    size_t elt_size = cs_datatype_size[_hs->data_type] * _hs->stride;
    size_t n_bytes = n_elts*elt_size;

    /* Ghost section starts just after the local elements */

    unsigned char *restrict _val = val;
    unsigned char *restrict _val_dest = _val + n_loc_elts*elt_size;

    if (_hs->var_location == CS_ALLOC_HOST_DEVICE_SHARED)
      cs_prefetch_h2d(_val_dest, n_bytes);
    else
      cs_copy_h2d(_val_dest, _hs->recv_buffer, n_bytes);

  }
#endif
#endif /* defined(HAVE_ACCEL) */

  /* Copy local values in case of periodicity */

  if (_hs->local_rank_id > -1) {
    size_t elt_size = cs_datatype_size[_hs->data_type] * _hs->stride;
    _sync_local(halo, _hs->local_rank_id, _hs->sync_mode, elt_size,
                _hs->var_location, _hs->send_buffer_cur, val);
  }

  /* Cleanup: reset state so a stale configuration cannot leak
     into the next exchange */

  _hs->sync_mode = CS_HALO_STANDARD;
  _hs->data_type = CS_DATATYPE_NULL;
  _hs->stride = 0;
  _hs->send_buffer_cur = NULL;
  _hs->n_requests = 0;
  _hs->local_rank_id = -1;
}
1798
1799 /*----------------------------------------------------------------------------*/
1800 /*!
1801 * \brief Update array of values in case of parallelism or periodicity.
1802 *
1803 * This function aims at copying main values from local elements
1804 * (id between 1 and n_local_elements) to ghost elements on distant ranks
1805 * (id between n_local_elements + 1 to n_local_elements_with_halo).
1806 *
1807 * \param[in] halo pointer to halo structure
1808 * \param[in] sync_mode synchronization mode (standard or extended)
1809 * \param[in] data_type data type
1810 * \param[in] stride number of (interlaced) values by entity
1811 * \param[in] val pointer to variable value array
1812 */
1813 /*----------------------------------------------------------------------------*/
1814
1815 void
cs_halo_sync(const cs_halo_t * halo,cs_halo_type_t sync_mode,cs_datatype_t data_type,int stride,void * val)1816 cs_halo_sync(const cs_halo_t *halo,
1817 cs_halo_type_t sync_mode,
1818 cs_datatype_t data_type,
1819 int stride,
1820 void *val)
1821 {
1822 if (halo == NULL)
1823 return;
1824
1825 cs_halo_sync_pack(halo,
1826 sync_mode,
1827 data_type,
1828 stride,
1829 val,
1830 NULL,
1831 NULL);
1832
1833 cs_halo_sync_start(halo, val, NULL);
1834
1835 cs_halo_sync_wait(halo, val, NULL);
1836 }
1837
1838 /*----------------------------------------------------------------------------
1839 * Update array of any type of halo values in case of parallelism or
1840 * periodicity.
1841 *
1842 * Data is untyped; only its size is given, so this function may also
 * be used to synchronize interleaved multidimensional data, using
1844 * size = element_size*dim (assuming a homogeneous environment, at least
1845 * as far as data encoding goes).
1846 *
1847 * This function aims at copying main values from local elements
1848 * (id between 1 and n_local_elements) to ghost elements on distant ranks
1849 * (id between n_local_elements + 1 to n_local_elements_with_halo).
1850 *
1851 * parameters:
1852 * halo <-- pointer to halo structure
1853 * sync_mode <-- synchronization mode (standard or extended)
1854 * size <-- datatype size
1855 * num <-> pointer to local number value array
1856 *----------------------------------------------------------------------------*/
1857
1858 void
cs_halo_sync_untyped(const cs_halo_t * halo,cs_halo_type_t sync_mode,size_t size,void * val)1859 cs_halo_sync_untyped(const cs_halo_t *halo,
1860 cs_halo_type_t sync_mode,
1861 size_t size,
1862 void *val)
1863 {
1864 cs_halo_sync(halo, sync_mode, CS_CHAR, size, val);
1865 }
1866
1867 /*----------------------------------------------------------------------------
1868 * Update array of integer halo values in case of parallelism or periodicity.
1869 *
1870 * This function aims at copying main values from local elements
1871 * (id between 1 and n_local_elements) to ghost elements on distant ranks
1872 * (id between n_local_elements + 1 to n_local_elements_with_halo).
1873 *
1874 * parameters:
1875 * halo <-- pointer to halo structure
1876 * sync_mode <-- synchronization mode (standard or extended)
1877 * num <-> pointer to local number value array
1878 *----------------------------------------------------------------------------*/
1879
1880 void
cs_halo_sync_num(const cs_halo_t * halo,cs_halo_type_t sync_mode,cs_lnum_t num[])1881 cs_halo_sync_num(const cs_halo_t *halo,
1882 cs_halo_type_t sync_mode,
1883 cs_lnum_t num[])
1884 {
1885 cs_halo_sync(halo, sync_mode, CS_LNUM_TYPE, 1, num);
1886 }
1887
1888 /*----------------------------------------------------------------------------
1889 * Update array of variable (floating-point) halo values in case of
1890 * parallelism or periodicity.
1891 *
1892 * This function aims at copying main values from local elements
1893 * (id between 1 and n_local_elements) to ghost elements on distant ranks
1894 * (id between n_local_elements + 1 to n_local_elements_with_halo).
1895 *
1896 * parameters:
1897 * halo <-- pointer to halo structure
1898 * sync_mode <-- synchronization mode (standard or extended)
1899 * var <-> pointer to variable value array
1900 *----------------------------------------------------------------------------*/
1901
1902 void
cs_halo_sync_var(const cs_halo_t * halo,cs_halo_type_t sync_mode,cs_real_t var[])1903 cs_halo_sync_var(const cs_halo_t *halo,
1904 cs_halo_type_t sync_mode,
1905 cs_real_t var[])
1906 {
1907 cs_halo_sync(halo, sync_mode, CS_REAL_TYPE, 1, var);
1908 }
1909
1910 /*----------------------------------------------------------------------------
1911 * Update array of strided variable (floating-point) values in case
1912 * of parallelism or periodicity.
1913 *
1914 * This function aims at copying main values from local elements
1915 * (id between 1 and n_local_elements) to ghost elements on distant ranks
1916 * (id between n_local_elements + 1 to n_local_elements_with_halo).
1917 *
1918 * parameters:
1919 * halo <-- pointer to halo structure
1920 * sync_mode <-- synchronization mode (standard or extended)
1921 * var <-> pointer to variable value array
1922 * stride <-- number of (interlaced) values by entity
1923 *----------------------------------------------------------------------------*/
1924
1925 void
cs_halo_sync_var_strided(const cs_halo_t * halo,cs_halo_type_t sync_mode,cs_real_t var[],int stride)1926 cs_halo_sync_var_strided(const cs_halo_t *halo,
1927 cs_halo_type_t sync_mode,
1928 cs_real_t var[],
1929 int stride)
1930 {
1931 cs_halo_sync(halo, sync_mode, CS_REAL_TYPE, stride, var);
1932 }
1933
1934 /*----------------------------------------------------------------------------
1935 * Return MPI_Barrier usage flag.
1936 *
1937 * returns:
1938 * true if MPI barriers are used after posting receives and before posting
1939 * sends, false otherwise
1940 *---------------------------------------------------------------------------*/
1941
1942 bool
cs_halo_get_use_barrier(void)1943 cs_halo_get_use_barrier(void)
1944 {
1945 return _halo_use_barrier;
1946 }
1947
1948 /*----------------------------------------------------------------------------
1949 * Set MPI_Barrier usage flag.
1950 *
1951 * parameters:
1952 * use_barrier <-- true if MPI barriers should be used after posting
1953 * receives and before posting sends, false otherwise.
1954 *---------------------------------------------------------------------------*/
1955
1956 void
cs_halo_set_use_barrier(bool use_barrier)1957 cs_halo_set_use_barrier(bool use_barrier)
1958 {
1959 _halo_use_barrier = use_barrier;
1960 }
1961
1962 /*----------------------------------------------------------------------------*/
1963 /*!
1964 * \brief Get default communication mode for halo exchange.
1965 *
1966 * \return allocation mode
1967 */
1968 /*----------------------------------------------------------------------------*/
1969
1970 cs_halo_comm_mode_t
cs_halo_get_comm_mode(void)1971 cs_halo_get_comm_mode(void)
1972 {
1973 return _halo_comm_mode;
1974 }
1975
1976 /*----------------------------------------------------------------------------*/
1977 /*!
1978 * \brief Set default communication mode for halo exchange.
1979 *
1980 * \param[in] mode allocation mode to set
1981 */
1982 /*----------------------------------------------------------------------------*/
1983
1984 void
cs_halo_set_comm_mode(cs_halo_comm_mode_t mode)1985 cs_halo_set_comm_mode(cs_halo_comm_mode_t mode)
1986 {
1987 if (mode >= CS_HALO_COMM_P2P && mode <= CS_HALO_COMM_RMA_GET)
1988 _halo_comm_mode = mode;
1989 }
1990
1991 /*----------------------------------------------------------------------------*/
1992 /*!
1993 * \brief Get default host/device allocation mode for message packing arrays.
1994 *
1995 * \return allocation mode
1996 */
1997 /*----------------------------------------------------------------------------*/
1998
1999 cs_alloc_mode_t
cs_halo_get_buffer_alloc_mode(void)2000 cs_halo_get_buffer_alloc_mode(void)
2001 {
2002 return _halo_buffer_alloc_mode;
2003 }
2004
2005 /*----------------------------------------------------------------------------*/
2006 /*!
2007 * \brief Set default host/device allocation mode for message packing arrays.
2008 *
2009 * \param[in] mode allocation mode to set
2010 */
2011 /*----------------------------------------------------------------------------*/
2012
2013 void
cs_halo_set_buffer_alloc_mode(cs_alloc_mode_t mode)2014 cs_halo_set_buffer_alloc_mode(cs_alloc_mode_t mode)
2015 {
2016 _halo_buffer_alloc_mode = mode;
2017 }
2018
2019 /*----------------------------------------------------------------------------
2020 * Dump a cs_halo_t structure.
2021 *
2022 * parameters:
2023 * halo <-- pointer to cs_halo_t struture
2024 * print_level <-- 0 only dimensions and indexes are printed, else (1)
2025 * everything is printed
2026 *---------------------------------------------------------------------------*/
2027
2028 void
cs_halo_dump(const cs_halo_t * halo,int print_level)2029 cs_halo_dump(const cs_halo_t *halo,
2030 int print_level)
2031 {
2032 if (halo == NULL) {
2033 bft_printf("\n\n halo: nil\n");
2034 return;
2035 }
2036
2037 bft_printf("\n halo: %p\n"
2038 " n_transforms: %d\n"
2039 " n_c_domains: %d\n"
2040 " periodicity: %p\n"
2041 " n_rotations: %d\n"
2042 " n_local_elts: %ld\n",
2043 (const void *)halo,
2044 halo->n_transforms, halo->n_c_domains,
2045 (const void *)halo->periodicity,
2046 halo->n_rotations, (long)halo->n_local_elts);
2047
2048 bft_printf("\nRanks on halo frontier:\n");
2049 for (int i = 0; i < halo->n_c_domains; i++)
2050 bft_printf("%5d", halo->c_domain_rank[i]);
2051
2052 for (int halo_id = 0; halo_id < 2; halo_id++) {
2053
2054 cs_lnum_t n_elts[2];
2055 cs_lnum_t *index = NULL, *list = NULL, *perio_lst = NULL;
2056
2057 bft_printf("\n ---------\n");
2058
2059 if (halo_id == 0) {
2060
2061 bft_printf(" send_list:\n");
2062 n_elts[0] = halo->n_send_elts[0];
2063 n_elts[1] = halo->n_send_elts[1];
2064 index = halo->send_index;
2065 list = halo->send_list;
2066 perio_lst = halo->send_perio_lst;
2067
2068 }
2069 else if (halo_id == 1) {
2070
2071 bft_printf(" halo:\n");
2072 n_elts[0] = halo->n_elts[0];
2073 n_elts[1] = halo->n_elts[1];
2074 index = halo->index;
2075 list = NULL;
2076 perio_lst = halo->perio_lst;
2077
2078 }
2079
2080 bft_printf(" ---------\n\n");
2081 bft_printf(" n_ghost_cells: %ld\n"
2082 " n_std_ghost_cells: %ld\n", (long)n_elts[1], (long)n_elts[0]);
2083
2084 if (index == NULL)
2085 return;
2086
2087 if (halo->n_transforms > 0) {
2088
2089 const cs_lnum_t stride = 4*halo->n_c_domains;
2090
2091 for (int i = 0; i < halo->n_transforms; i++) {
2092
2093 bft_printf("\nTransformation number: %d\n", i+1);
2094
2095 for (int j = 0; j < halo->n_c_domains; j++) {
2096
2097 bft_printf(" rank %3d <STD> %5ld %5ld <EXT> %5ld %5ld\n",
2098 halo->c_domain_rank[j],
2099 (long)perio_lst[i*stride + 4*j],
2100 (long)perio_lst[i*stride + 4*j+1],
2101 (long)perio_lst[i*stride + 4*j+2],
2102 (long)perio_lst[i*stride + 4*j+3]);
2103 }
2104
2105 } /* End of loop on perio */
2106
2107 } /* End if n_perio > 0 */
2108
2109 for (int i = 0; i < halo->n_c_domains; i++) {
2110
2111 bft_printf("\n rank %d:\n", halo->c_domain_rank[i]);
2112
2113 if (index[2*i+1] - index[2*i] > 0) {
2114
2115 bft_printf("\n Standard halo\n");
2116 bft_printf(" idx start %ld: idx end %ld:\n",
2117 (long)index[2*i], (long)index[2*i+1]);
2118
2119 if (print_level > 0 && list != NULL) {
2120 bft_printf("\n idx elt id\n");
2121 for (cs_lnum_t j = index[2*i]; j < index[2*i+1]; j++)
2122 bft_printf(" %10ld %10ld\n", (long)j, (long)list[j]);
2123 }
2124
2125 } /* there are elements on standard neighborhood */
2126
2127 if (index[2*i+2] - index[2*i+1] > 0) {
2128
2129 bft_printf("\n Extended halo\n");
2130 bft_printf(" idx start %ld: idx end %ld:\n",
2131 (long)index[2*i+1], (long)index[2*i+2]);
2132
2133 if (print_level > 0 && list != NULL) {
2134 bft_printf("\n idx elt id\n");
2135 for (long j = index[2*i+1]; j < index[2*i+2]; j++)
2136 bft_printf(" %10ld %10ld %10ld\n",
2137 (long)j, (long)list[j], (long)halo->n_local_elts+j);
2138 }
2139
2140 } /* If there are elements on extended neighborhood */
2141
2142 } /* End of loop on involved ranks */
2143
2144 } /* End of loop on halos (send_halo/halo) */
2145
2146 bft_printf("\n\n");
2147 bft_printf_flush();
2148 }
2149
2150 /*----------------------------------------------------------------------------*/
2151
2152 END_C_DECLS
2153