1 /*============================================================================
2  * Functions dealing with ghost cells
3  *============================================================================*/
4 
5 /*
6   This file is part of Code_Saturne, a general-purpose CFD tool.
7 
8   Copyright (C) 1998-2021 EDF S.A.
9 
10   This program is free software; you can redistribute it and/or modify it under
11   the terms of the GNU General Public License as published by the Free Software
12   Foundation; either version 2 of the License, or (at your option) any later
13   version.
14 
15   This program is distributed in the hope that it will be useful, but WITHOUT
16   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
18   details.
19 
20   You should have received a copy of the GNU General Public License along with
21   this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
22   Street, Fifth Floor, Boston, MA 02110-1301, USA.
23 */
24 
25 /*----------------------------------------------------------------------------*/
26 
27 #include "cs_defs.h"
28 
29 /*----------------------------------------------------------------------------
30  * Standard C library headers
31  *----------------------------------------------------------------------------*/
32 
33 #include <stdio.h>
34 #include <string.h>
35 #include <assert.h>
36 
37 /*----------------------------------------------------------------------------
38  *  Local headers
39  *----------------------------------------------------------------------------*/
40 
41 #include "bft_mem.h"
42 #include "bft_error.h"
43 #include "bft_printf.h"
44 
45 #include "cs_base.h"
46 #include "cs_base_accel.h"
47 #include "cs_order.h"
48 
49 #include "cs_interface.h"
50 #include "cs_rank_neighbors.h"
51 
52 #include "fvm_periodicity.h"
53 
54 /*----------------------------------------------------------------------------
55  *  Header for the current file
56  *----------------------------------------------------------------------------*/
57 
58 #include "cs_halo.h"
59 
60 #if defined(HAVE_CUDA)
61 #include "cs_halo_cuda.h"
62 #endif
63 
64 /*----------------------------------------------------------------------------*/
65 
66 BEGIN_C_DECLS
67 
68 /*! \cond DOXYGEN_SHOULD_SKIP_THIS */
69 
70 /* Remarks:
71  *
72  * The current available mode for MPI-3 RMA uses "get" semantics.
73  * A "put" semantics variant could easily be added, either:
74  * - Using MPI_Win_create_dynamic and attaching the halo section of the
75  *   current array to that window (which would seem cumbersome as this also
76  *   requires exchanging base addresses obtained with MPI_Get_Address for
77  *   each array, but could be amortized for iterative algorithms).
78  * - Using a fixed receive buffer, and copying data to the tail section of
79  *   the array afterwards (as is done for accelerators when the MPI library
80  *   used is not no accelerator-aware). This would add an extra copy, but
81  *   be much simpler.
82  *
83  * It may also be useful to allow this setting on a "per halo" basis, as
84  * some publications report better performance with RMA for large data,
85  * and better performance with P2P for small data, so in uses such
86  * as multigrid solvers, either may be preferred for different levels.
87 */
88 
89 /*=============================================================================
90  * Local macro definitions
91  *============================================================================*/
92 
93 #if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
94   #define _CS_MPI_DEVICE_SUPPORT 1
95 #else
96   #if defined(_CS_MPI_DEVICE_SUPPORT)
97     #undef _CS_MPI_DEVICE_SUPPORT
98   #endif
99 #endif
100 
101 /*=============================================================================
102  * Local type definitions
103  *============================================================================*/
104 
105 /* Structure to maintain halo exchange state */
106 
struct _cs_halo_state_t {

  /* Current synchronization state
     (describes the exchange in progress, set at pack/start time) */

  cs_halo_type_t  sync_mode;      /* Standard or extended */
  cs_datatype_t   data_type;      /* Datatype */
  int             stride;         /* Number of values per location */

  cs_alloc_mode_t var_location;   /* Allocation info for exchanged variable
                                     (host/device); selects the copy path */

  void        *send_buffer_cur;   /* Send buffer used for current progress
                                     (either _send_buffer or passed by caller) */

  int       n_requests;        /* Number of MPI requests */
  int       local_rank_id;     /* Id of halo for own rank, -1 if not present
                                  (local section is copied, not exchanged) */

  /* Buffers for synchronization;
     receive buffers only needed for some communication modes */

  size_t       send_buffer_size;  /* Size of send buffer, in bytes */
  size_t       recv_buffer_size;  /* Size of receive buffer, in bytes */

  void        *send_buffer;       /* Send buffer (maintained by this object) */
  void        *recv_buffer;       /* Recv. buffer (maintained by this object) */

#if defined(HAVE_MPI)

  int          request_size;      /* Size of requests and status arrays */

  MPI_Request  *request;          /* Array of MPI requests */
  MPI_Status   *status;           /* Array of MPI status */

  MPI_Win       win;              /* MPI-3 RMA window (one-sided modes) */

#endif

};
144 
145 /*============================================================================
146  * Static global variables
147  *============================================================================*/
148 
/* Number of defined halos (used to decide when shared state may be freed) */
static int _n_halos = 0;

/* Allocation mode for arrays which might be used on accelerator device
   Note that an alternative option would be to use shared memory with
   prefetching. We will need to do performance comparisons first, but
   in the case of similar performance, going for the shared approach
   would be preferred for its "more generic" aspect. */
static cs_alloc_mode_t _halo_buffer_alloc_mode = CS_ALLOC_HOST_DEVICE_PINNED;

/* Should we use barriers after posting receives ?
   (0/false by default; may help some MPI implementations) */
static int _halo_use_barrier = false;

/* Default halo state handler, used when the caller passes NULL */
static cs_halo_state_t *_halo_state = NULL;

/* Halo communications mode (point-to-point or one-sided RMA) */
static int _halo_comm_mode = CS_HALO_COMM_P2P;
167 
168 /*============================================================================
169  * Private function definitions
170  *============================================================================*/
171 
172 /*----------------------------------------------------------------------------*/
173 /*!
174  * \brief Test if an array of global numbers is ordered.
175  *
176  * \param[in]  list    optional list (1 to n numbering) of selected entities
177  *                     (or NULL if all nb_ent are selected). This list may
178  *                     contain element numbers in any order
179  * \param[in]  nb_ent  number of entities considered
180  *
181  * \return  1 if ordered, 0 otherwise.
182  */
183 /*----------------------------------------------------------------------------*/
184 
static int
_order_int_test(const int  list[],
                size_t     nb_ent)
{
  /* An implicit (NULL) numbering or an empty list is trivially ordered. */

  if (list == NULL || nb_ent == 0)
    return 1;

  /* Explicit numbering: ordered means non-decreasing (ties allowed). */

  for (size_t j = 1; j < nb_ent; j++) {
    if (list[j] < list[j-1])
      return 0;
  }

  return 1;
}
207 
208 #if defined(HAVE_MPI)
209 
210 /*----------------------------------------------------------------------------
211  * Update state request arrays so as to be usable with a given halo.
212  *
213  * This function should be called at the end of any halo creation,
214  * so that buffer sizes are increased if necessary.
215  *
216  * parameters:
217  *   halo       <-- pointer to cs_halo_t structure.
218  *   halo_state <-> pointer to halo state structure.
219  *---------------------------------------------------------------------------*/
220 
221 static void
_update_requests(const cs_halo_t * halo,cs_halo_state_t * hs)222 _update_requests(const cs_halo_t  *halo,
223                  cs_halo_state_t  *hs)
224 {
225   if (halo == NULL)
226     return;
227 
228   int n_requests = halo->n_c_domains*2;
229 
230   if (n_requests > hs->request_size) {
231     hs->request_size = n_requests;
232     BFT_REALLOC(hs->request, hs->request_size, MPI_Request);
233     BFT_REALLOC(hs->status, hs->request_size,  MPI_Status);
234   }
235 
236 }
237 
238 /*----------------------------------------------------------------------------*/
239 /*!
240  * \brief Exchange send shift in send buffer for one-sided get.
241  *
242  * \param[in]  halo  halo structure to update
243  */
244 /*----------------------------------------------------------------------------*/
245 
246 static void
_exchange_send_shift(cs_halo_t * halo)247 _exchange_send_shift(cs_halo_t  *halo)
248 {
249   MPI_Comm comm = cs_glob_mpi_comm;
250   MPI_Request *request = NULL;
251   MPI_Status *status = NULL;
252 
253   BFT_MALLOC(request, halo->n_c_domains*2, MPI_Request);
254   BFT_MALLOC(status, halo->n_c_domains*2, MPI_Status);
255 
256   BFT_REALLOC(halo->c_domain_s_shift, halo->n_c_domains, cs_lnum_t);
257 
258   /* Exchange local range with neighbor ranks */
259 
260   const int local_rank = CS_MAX(cs_glob_rank_id, 0);
261 
262   for (int i = 0; i < halo->n_c_domains; i++) {
263     int rank_id = halo->c_domain_rank[i];
264     MPI_Irecv(halo->c_domain_s_shift + i,
265               1,
266               CS_MPI_LNUM,
267               rank_id,
268               local_rank,
269               comm,
270               &(request[i]));
271   }
272 
273   for (int i = 0; i < halo->n_c_domains; i++) {
274     int rank_id = halo->c_domain_rank[i];
275     MPI_Isend(halo->send_index + 2*i,
276               1,
277               CS_MPI_LNUM,
278               rank_id,
279               rank_id,
280               comm,
281               &(request[halo->n_c_domains + i]));
282   }
283 
284   MPI_Waitall(halo->n_c_domains*2, request, status);
285 
286   BFT_FREE(request);
287   BFT_FREE(status);
288 }
289 
290 #endif /* HAVE_MPI */
291 
292 /*----------------------------------------------------------------------------
293  * Local copy from halo send buffer to destination array.
294  *
 * This allows periodicity data which may be present on the local rank to
296  * be exchanged without any MPI call.
297  *
298  * Data is untyped; only its size is given, so this function may also
 * be used to synchronize interleaved multidimensional data, using
300  * size = element_size*dim (assuming a homogeneous environment, at least
301  * as far as data encoding goes).
302  *
303  * parameters:
304  *   halo          <-- pointer to halo structure
305  *   local_rank_id <-- id of local rank
306  *   sync_mode     <-- synchronization mode (standard or extended)
307  *   size          <-- datatype size
308  *   var_location  <-- Allocation info for exchanged variable (host/device)
309  *   send_buf      <-> pointer to send data
310  *   val           <-> pointer to destination data
311  *----------------------------------------------------------------------------*/
312 
313 static void
_sync_local(const cs_halo_t * halo,int local_rank_id,cs_halo_type_t sync_mode,size_t size,cs_alloc_mode_t var_location,const void * send_buf,void * val)314 _sync_local(const cs_halo_t  *halo,
315             int               local_rank_id,
316             cs_halo_type_t    sync_mode,
317             size_t            size,
318             cs_alloc_mode_t   var_location,
319             const void       *send_buf,
320             void             *val)
321 {
322   cs_lnum_t end_shift = (sync_mode == CS_HALO_EXTENDED) ? 2 : 1;
323 
324   unsigned char *_val = val;
325   unsigned char *recv
326     = _val + (halo->n_local_elts + halo->index[2*local_rank_id]) * size;
327 
328   cs_lnum_t start = halo->send_index[2*local_rank_id]*size;
329   cs_lnum_t length = (  halo->send_index[2*local_rank_id + end_shift]
330                       - halo->send_index[2*local_rank_id]);
331 
332   size_t count = length * size;
333 
334   if (var_location == CS_ALLOC_HOST) {
335     const unsigned char *buffer = (const unsigned char *)send_buf;
336     const unsigned char *_buffer = buffer + start;
337     memcpy(recv, _buffer, count);
338   }
339 
340 #if defined(HAVE_ACCEL)
341   else {
342     const unsigned char *buffer
343       = (const unsigned char *)cs_get_device_ptr_const(send_buf);
344     const unsigned char *_buffer = buffer + start;
345     cs_copy_d2d(recv, _buffer, count);
346   }
347 #endif
348 }
349 
350 #if defined(HAVE_MPI)
351 #if (MPI_VERSION >= 3)
352 
353 /*----------------------------------------------------------------------------*/
354 /*!
355  * \brief Launch update of ghost values in case of parallelism
356  *        for one-sided communication.
357  *
358  * The cs_halo_sync_pack function should have been called before this function,
359  * using the same hs argument.
360  *
361  * \param[in]       halo        pointer to halo structure
362  * \param[in]       val         pointer to variable value array
363  * \param[in, out]  hs          pointer to halo state, NULL for global state
364  */
365 /*----------------------------------------------------------------------------*/
366 
static void
_halo_sync_start_one_sided(const cs_halo_t  *halo,
                           void             *val,
                           cs_halo_state_t  *hs)
{
  /* Extended halos use 2 index sub-sections per rank, standard halos 1. */
  cs_lnum_t end_shift = (hs->sync_mode == CS_HALO_EXTENDED) ? 2 : 1;
  cs_lnum_t stride = hs->stride;
  size_t elt_size = cs_datatype_size[hs->data_type] * stride;
  size_t n_loc_elts = halo->n_local_elts;

  unsigned char *restrict _val = val;
  /* Ghost values are appended after the n_local_elts local values. */
  unsigned char *restrict _val_dest = _val + n_loc_elts*elt_size;

  MPI_Datatype mpi_datatype = cs_datatype_to_mpi[hs->data_type];

  const int local_rank = CS_MAX(cs_glob_rank_id, 0);

  /* Get data from distant ranks */

  if (_halo_comm_mode == CS_HALO_COMM_RMA_GET) {

    /* Use active target synchronization */

    if (halo->c_domain_group != MPI_GROUP_NULL) {
      /* Start RMA exposure epoch */
      MPI_Win_post(halo->c_domain_group,
                   MPI_MODE_NOPUT,          /* program assertion */
                   hs->win);

      /* Access Epoch */
      MPI_Win_start(halo->c_domain_group,
                    0,                      /* program assertion */
                    hs->win);
    }
    else {
      /* No restricted group available: fall back to collective fence
         over the whole window. */
      MPI_Win_fence(0, hs->win);
    }

    for (int rank_id = 0; rank_id < halo->n_c_domains; rank_id++) {

      int  length = (  halo->index[2*rank_id + end_shift]
                     - halo->index[2*rank_id]) * stride;

      if (halo->c_domain_rank[rank_id] != local_rank) {

        if (length > 0) {
          cs_lnum_t start = halo->index[2*rank_id]*elt_size;
          unsigned char *dest = _val_dest + start;
          /* Offset into the target's send buffer, exchanged previously
             by _exchange_send_shift. */
          MPI_Aint displacement = halo->c_domain_s_shift[rank_id]*elt_size;

          MPI_Get(dest,
                  length,                        /* origin count */
                  mpi_datatype,                  /* origin datatype */
                  halo->c_domain_rank[rank_id],  /* target rank */
                  displacement,                  /* target displacement */
                  length,                        /* target count */
                  mpi_datatype,                  /* target datatype */
                  hs->win);
        }

      }
      else
        /* Local section handled by direct copy at completion time. */
        hs->local_rank_id = rank_id;
    }

  }
}
434 
435 /*----------------------------------------------------------------------------*/
436 /*!
437  * \brief Finalize update of ghost values in case of parallelism
438  *        for one-sided communication.
439  *
440  * The cs_halo_sync_pack function should have been called before this function,
441  * using the same hs argument.
442  *
443  * \param[in]       halo        pointer to halo structure
444  * \param[in]       val         pointer to variable value array
445  * \param[in, out]  hs          pointer to halo state, NULL for global state
446  */
447 /*----------------------------------------------------------------------------*/
448 
static void
_halo_sync_complete_one_sided(const cs_halo_t  *halo,
                              void             *val,
                              cs_halo_state_t  *hs)
{
  /* Use active target synchronization;
     must mirror the epoch opened in _halo_sync_start_one_sided. */

  /* Access Epoch */
  if (halo->c_domain_group != MPI_GROUP_NULL) {
    MPI_Win_complete(hs->win);

    /* Complete RMA exposure epoch */
    MPI_Win_wait(hs->win);
  }
  else {
    /* Fence variant, matching the fence used at start. */
    MPI_Win_fence(0, hs->win);
  }

  /* Copy local values in case of periodicity
     (local rank data is not exchanged through MPI) */

  if (hs->local_rank_id > -1) {
    size_t elt_size = cs_datatype_size[hs->data_type] * hs->stride;
    _sync_local(halo, hs->local_rank_id, hs->sync_mode, elt_size,
                hs->var_location, hs->send_buffer_cur, val);
  }
}
475 
476 #endif /* (MPI_VERSION >= 3) */
477 #endif /* defined(HAVE_MPI) */
478 
479 /*! (DOXYGEN_SHOULD_SKIP_THIS) \endcond */
480 
481 /*============================================================================
482  * Public function definitions
483  *============================================================================*/
484 
485 /*----------------------------------------------------------------------------*/
486 /*!
487  * \brief Create a halo structure given an interface set.
488  *
489  * \param[in]  ifs  pointer to a cs_interface_set structure
490  *
491  * \return  pointer to created cs_halo_t structure
492  */
493 /*----------------------------------------------------------------------------*/
494 
cs_halo_t *
cs_halo_create(const cs_interface_set_t  *ifs)
{
  cs_lnum_t  i, tmp_id, perio_lst_size;

  cs_lnum_t  loc_id = -1;

  cs_halo_t  *halo = NULL;

  const cs_interface_t  *interface = NULL;

  BFT_MALLOC(halo, 1, cs_halo_t);

  /* One communicating domain per interface in the set. */

  halo->n_c_domains = cs_interface_set_size(ifs);
  halo->n_transforms = 0;

  halo->periodicity = cs_interface_set_periodicity(ifs);
  halo->n_rotations = 0;

  halo->n_local_elts = 0;

  for (i = 0; i < CS_HALO_N_TYPES; i++) {
    halo->n_send_elts[i] = 0;
    halo->n_elts [i] = 0;
  }

  BFT_MALLOC(halo->c_domain_rank, halo->n_c_domains, int);

  /* Check if cs_glob_rank_id belongs to interface set in order to
     order ranks with local rank at first place */

  for (i = 0; i < halo->n_c_domains; i++) {

    interface = cs_interface_set_get(ifs, i);
    halo->c_domain_rank[i] = cs_interface_rank(interface);

    if (cs_glob_rank_id == cs_interface_rank(interface))
      loc_id = i;

  } /* End of loop on ranks */

  /* Swap the local rank (periodicity with self) to position 0. */

  if (loc_id > 0) {

    tmp_id = halo->c_domain_rank[loc_id];
    halo->c_domain_rank[loc_id] = halo->c_domain_rank[0];
    halo->c_domain_rank[0] = tmp_id;

  }

  /* Order ranks (all entries past the local one, only if needed) */

  if (   halo->n_c_domains > 2
      && _order_int_test(&(halo->c_domain_rank[1]),
                         halo->n_c_domains-1) == 0) {

    cs_lnum_t  *order = NULL;
    cs_gnum_t  *buffer = NULL;

    BFT_MALLOC(order, halo->n_c_domains - 1, cs_lnum_t);
    BFT_MALLOC(buffer, halo->n_c_domains - 1, cs_gnum_t);

    for (i = 1; i < halo->n_c_domains; i++)
      buffer[i-1] = (cs_gnum_t)halo->c_domain_rank[i];

    cs_order_gnum_allocated(NULL,
                            buffer,
                            order,
                            halo->n_c_domains - 1);

    for (i = 0; i < halo->n_c_domains - 1; i++)
      halo->c_domain_rank[i+1] = (cs_lnum_t)buffer[order[i]];

    BFT_FREE(buffer);
    BFT_FREE(order);

  } /* End of ordering ranks */

  /* Index arrays: 2 entries per rank (standard/extended sections) plus
     a closing entry; send_index may be used on an accelerator device. */

  CS_MALLOC_HD(halo->send_index, 2*halo->n_c_domains + 1, cs_lnum_t,
               _halo_buffer_alloc_mode);
  BFT_MALLOC(halo->index, 2*halo->n_c_domains + 1, cs_lnum_t);

  for (i = 0; i < 2*halo->n_c_domains + 1; i++) {
    halo->send_index[i] = 0;
    halo->index[i] = 0;
  }

  halo->send_perio_lst = NULL;
  halo->perio_lst = NULL;

  if (halo->periodicity != NULL) {

    halo->n_transforms = fvm_periodicity_get_n_transforms(halo->periodicity);

    for (i = 0; i < halo->n_transforms; i++) {
      if (   fvm_periodicity_get_type(halo->periodicity, i)
          >= FVM_PERIODICITY_ROTATION)
        halo->n_rotations += 1;
    }

    /* We need 2 values per transformation and there are n_transforms
       transformations. For each rank, we need a value for standard and
       extended halo. */

    perio_lst_size = 2*halo->n_transforms * 2*halo->n_c_domains;

    BFT_MALLOC(halo->send_perio_lst, perio_lst_size, cs_lnum_t);
    BFT_MALLOC(halo->perio_lst, perio_lst_size, cs_lnum_t);

    for (i = 0; i < perio_lst_size; i++) {
      halo->send_perio_lst[i] = 0;
      halo->perio_lst[i] = 0;
    }

  }

  halo->send_list = NULL;

#if defined(HAVE_MPI)
  halo->c_domain_group = MPI_GROUP_NULL;
  halo->c_domain_s_shift = NULL;
#endif

  _n_halos += 1;

  return halo;
}
621 
622 /*----------------------------------------------------------------------------*/
623 /*!
624  * \brief Ready halo for use.
625  *
 * This function should be called after building a halo using
 * cs_halo_create and defining its contents locally.
628  * It is called automatically by cs_halo_create_from_ref and
629  * cs_halo_create_from_rank_neighbors so does not need to be called again
630  * using these functions.
631  *
632  * \param[in]  halo  pointer to halo structure
633  */
634 /*----------------------------------------------------------------------------*/
635 
void
cs_halo_create_complete(cs_halo_t  *halo)
{
#if defined(HAVE_MPI)

  /* Make buffer available on device if relevant */
  cs_sync_h2d(halo->send_index);
  cs_sync_h2d(halo->send_list);

  /* Create group for one-sided communication:
     neighbor ranks plus the local rank, built in increasing order
     (ranks below local, local, ranks above local). */
  if (_halo_comm_mode > CS_HALO_COMM_P2P) {
    const int local_rank = CS_MAX(cs_glob_rank_id, 0);
    int n_group_ranks = 0;
    int *group_ranks = NULL;
    BFT_MALLOC(group_ranks, halo->n_c_domains + 1, int);
    for (int i = 0; i < halo->n_c_domains; i++) {
      if (halo->c_domain_rank[i] < local_rank)
        group_ranks[n_group_ranks++] = halo->c_domain_rank[i];
    }
    group_ranks[n_group_ranks++] = local_rank;
    for (int i = 0; i < halo->n_c_domains; i++) {
      if (halo->c_domain_rank[i] > local_rank)
        group_ranks[n_group_ranks++] = halo->c_domain_rank[i];
    }

    /* Only build the group if the assembled rank list is ordered;
       otherwise c_domain_group remains MPI_GROUP_NULL and the
       fence-based synchronization path is used instead. */
    if (_order_int_test(group_ranks, n_group_ranks)) {

      MPI_Group glob_group;
      MPI_Comm_group(cs_glob_mpi_comm, &glob_group);
      MPI_Group_incl(glob_group,
                     n_group_ranks,
                     group_ranks,
                     &(halo->c_domain_group));
      MPI_Group_free(&glob_group);

    }

    BFT_FREE(group_ranks);
  }

  /* Exchange shifts for one-sided communication */
  if (_halo_comm_mode == CS_HALO_COMM_RMA_GET)
    _exchange_send_shift(halo);

  /* Lazily create the default (global) halo state handler. */
  if (_halo_state == NULL)
    _halo_state = cs_halo_state_create();

#endif /* defined(HAVE_MPI) */
}
685 
686 /*----------------------------------------------------------------------------*/
687 /*!
688  * \brief Create a halo structure, given a reference halo.
689  *
690  * \param[in]  ref  pointer to reference halo
691  *
692  * \return  pointer to created cs_halo_t structure
693  */
694 /*----------------------------------------------------------------------------*/
695 
696 cs_halo_t *
cs_halo_create_from_ref(const cs_halo_t * ref)697 cs_halo_create_from_ref(const cs_halo_t  *ref)
698 {
699   cs_lnum_t  i;
700 
701   cs_halo_t  *halo = NULL;
702 
703   BFT_MALLOC(halo, 1, cs_halo_t);
704 
705   halo->n_c_domains = ref->n_c_domains;
706   halo->n_transforms = ref->n_transforms;
707 
708   halo->periodicity = ref->periodicity;
709   halo->n_rotations = ref->n_rotations;
710 
711   halo->n_local_elts = 0;
712 
713   BFT_MALLOC(halo->c_domain_rank, halo->n_c_domains, int);
714 
715   for (i = 0; i < halo->n_c_domains; i++)
716     halo->c_domain_rank[i] = ref->c_domain_rank[i];
717 
718   CS_MALLOC_HD(halo->send_index, 2*halo->n_c_domains + 1, cs_lnum_t,
719                _halo_buffer_alloc_mode);
720   BFT_MALLOC(halo->index, 2*halo->n_c_domains + 1, cs_lnum_t);
721 
722   for (i = 0; i < 2*halo->n_c_domains + 1; i++) {
723     halo->send_index[i] = 0;
724     halo->index[i] = 0;
725   }
726 
727   halo->send_perio_lst = NULL;
728   halo->perio_lst = NULL;
729 
730   if (halo->n_transforms > 0) {
731 
732     cs_lnum_t  perio_lst_size = 2*halo->n_transforms * 2*halo->n_c_domains;
733 
734     BFT_MALLOC(halo->send_perio_lst, perio_lst_size, cs_lnum_t);
735     BFT_MALLOC(halo->perio_lst, perio_lst_size, cs_lnum_t);
736 
737     for (i = 0; i < perio_lst_size; i++) {
738       halo->send_perio_lst[i] = 0;
739       halo->perio_lst[i] = 0;
740     }
741 
742   }
743 
744   halo->send_list = NULL;
745 
746 #if defined(HAVE_MPI)
747   halo->c_domain_group = MPI_GROUP_NULL;
748   halo->c_domain_s_shift = NULL;
749 #endif
750 
751   _n_halos += 1;
752 
753   cs_halo_create_complete(halo);
754 
755   return halo;
756 }
757 
758 #if defined(HAVE_MPI)
759 
760 /*----------------------------------------------------------------------------*/
761 /*!
762  * \brief Create a halo structure from distant element distant ranks and ids.
763  *
 * \remark  This function does not handle periodicity. For most matrix-vector
 *          products and similar operations, periodicity of translation and
 *          even rotation could be handled with no specific halo information,
767  *          simply by assigning an equivalence between two periodic elements.
768  *          For rotation, this would require also applying a rotation through
769  *          the matrix coefficients (this would have the advantage of being
770  *          compatible with external libraries). An alternative would be
771  *          to add rotation information to a given halo as a second stage,
772  *          through a specialized operator which can be added in the future.
773  *
774  * \param[in]  rn              associated rank neighbors info
775  * \param[in]  n_local_elts    number of elements for local rank
776  * \param[in]  n_distant_elts  number of distant elements for local rank
777  * \param[in]  elt_rank_id     distant element rank index in rank neighbors,
778  *                             ordered by rank (size: n_distant_elts)
779  * \param[in]  elt_id          distant element id (at distant rank),
780  *                             ordered by rank (size: n_distant_elts)
781  *
782  * \return  pointer to created cs_halo_t structure
783  */
784 /*----------------------------------------------------------------------------*/
785 
786 cs_halo_t *
cs_halo_create_from_rank_neighbors(const cs_rank_neighbors_t * rn,cs_lnum_t n_local_elts,cs_lnum_t n_distant_elts,const int elt_rank_id[],const cs_lnum_t elt_id[])787 cs_halo_create_from_rank_neighbors(const cs_rank_neighbors_t  *rn,
788                                    cs_lnum_t                   n_local_elts,
789                                    cs_lnum_t                   n_distant_elts,
790                                    const int                   elt_rank_id[],
791                                    const cs_lnum_t             elt_id[])
792 {
793   cs_halo_t  *halo = NULL;
794 
795   BFT_MALLOC(halo, 1, cs_halo_t);
796 
797   halo->n_c_domains = 0;
798   halo->n_transforms = 0;
799 
800   halo->n_rotations = 0;
801 
802   halo->periodicity = NULL;
803   halo->send_perio_lst = NULL;
804   halo->perio_lst = NULL;
805 
806 #if defined(HAVE_MPI)
807   halo->c_domain_group = MPI_GROUP_NULL;
808   halo->c_domain_s_shift = NULL;
809 #endif
810 
811   halo->n_local_elts = n_local_elts;
812 
813   for (int i = 0; i < CS_HALO_N_TYPES; i++) {
814     halo->n_send_elts[i] = 0;
815     halo->n_elts [i] = n_distant_elts;
816   }
817 
818   /* Count elements for each rank;
     check they are ordered lexicographically */
820 
821   cs_lnum_t *rank_count;
822   BFT_MALLOC(rank_count, rn->size*2, cs_lnum_t);
823   for (int i = 0; i < rn->size; i++)
824     rank_count[i] = 0;
825 
826   int rank_prev = -1;
827   int elt_prev = -1;
828   for (cs_lnum_t i = 0; i < n_distant_elts; i++) {
829     int rank_id = elt_rank_id[i];
830     if (   rank_id < rank_prev
831         || (rank_id == rank_prev && elt_id[i] <= elt_prev))
832       bft_error
833         (__FILE__, __LINE__, 0,
834          "%s:\n"
835          "  Rank and distant element ids passed to this function must\n"
836          "  be lexicographically ordered; this is not the case here.",
837          __func__);
838     rank_count[rank_id] += 1;
839     rank_prev = rank_id;
840     elt_prev = elt_id[i];
841   }
842 
843   /* Now exchange counts with neighboring elements */
844 
845   MPI_Comm comm = cs_glob_mpi_comm;
846   MPI_Request *request = NULL;
847   MPI_Status *status = NULL;
848 
849   BFT_MALLOC(request, rn->size*2, MPI_Request);
850   BFT_MALLOC(status, rn->size*2, MPI_Status);
851 
852   /* Exchange local range with neighbor ranks */
853 
854   int request_count = 0;
855   const int local_rank = CS_MAX(cs_glob_rank_id, 0);
856 
857   for (int i = 0; i < rn->size; i++) {
858     MPI_Irecv(rank_count + rn->size + i,
859               1,
860               CS_MPI_LNUM,
861               rn->rank[i],
862               local_rank,
863               comm,
864               &(request[request_count++]));
865   }
866 
867   for (int i = 0; i < rn->size; i++) {
868     MPI_Isend(rank_count + i,
869               1,
870               CS_MPI_LNUM,
871               rn->rank[i],
872               rn->rank[i],
873               comm,
874               &(request[request_count++]));
875   }
876 
877   MPI_Waitall(request_count, request, status);
878 
879   /* Now build send and receive indexes to exchange data;
880      the receive index can be directly assigned to the halo;
881      also check if cs_glob_rank_id belongs to interface set in order to
882      order ranks with local rank at first place */
883 
884   int        loc_r_index = -1;
885   cs_lnum_t  r_displ = 0, loc_r_displ = 0;
886   cs_lnum_t  recv_count = 0, send_count = 0;
887 
888   halo->n_c_domains = 0;
889   for (int i = 0; i < rn->size; i++) {
890     if (rank_count[i] + rank_count[rn->size + i] > 0) {
891       halo->n_c_domains += 1;
892       if (rn->rank[i] == local_rank) {
893         loc_r_index = i;
894         loc_r_displ = r_displ;
895         assert(rank_count[i] == rank_count[rn->size + i]);
896       }
897       r_displ += rank_count[i];
898       recv_count += rank_count[rn->size + i];
899     }
900   }
901 
902   BFT_MALLOC(halo->c_domain_rank, halo->n_c_domains, int);
903 
904   CS_MALLOC_HD(halo->send_list, recv_count, cs_lnum_t,
905                _halo_buffer_alloc_mode);
906   CS_MALLOC_HD(halo->send_index, 2*halo->n_c_domains + 1, cs_lnum_t,
907                _halo_buffer_alloc_mode);
908   BFT_MALLOC(halo->index, halo->n_c_domains*2+1, cs_lnum_t);
909 
910   halo->n_c_domains = 0;
911   send_count = 0;
912   recv_count = 0;
913 
914   halo->index[0] = 0;
915   halo->send_index[0] = 0;
916 
917   if (loc_r_index > -1) {
918     halo->c_domain_rank[0] = local_rank;
919     cs_lnum_t  l_count = rank_count[loc_r_index];
920     for (cs_lnum_t i = 0; i < l_count; i++)
921       halo->send_list[i] = elt_id[loc_r_displ + i];
922     send_count += l_count;
923     recv_count += l_count;
924     halo->n_c_domains = 1;
925     for (int j = 1; j < 3; j++) {
926       halo->index[j] = recv_count;
927       halo->send_index[j] = send_count;
928     }
929   }
930 
931   for (int i = 0; i < rn->size; i++) {
932     if (   rank_count[i] + rank_count[rn->size + i] > 0
933         && rn->rank[i] != local_rank) {
934       halo->c_domain_rank[halo->n_c_domains] = rn->rank[i];
935       recv_count += rank_count[i];
936       send_count += rank_count[rn->size + i];
937       for (int j = 1; j < 3; j++) {
938         halo->index[halo->n_c_domains*2 + j] = recv_count;
939         halo->send_index[halo->n_c_domains*2 + j] = send_count;
940       }
941       halo->n_c_domains += 1;
942     }
943   }
944 
945   BFT_FREE(rank_count);
946 
947   for (int i = 0; i < CS_HALO_N_TYPES; i++)
948     halo->n_send_elts[i] = send_count;
949 
950   /* Now send lists to matching ranks (reverse send and receive) */
951 
952   request_count = 0;
953 
954   for (int i = 0; i < halo->n_c_domains; i++) {
955     int rank_id = halo->c_domain_rank[i];
956     if (rank_id == local_rank) continue;
957     cs_lnum_t r_shift = halo->send_index[2*i];
958     cs_lnum_t r_size  = halo->send_index[2*i+1] - r_shift;
959     if (r_size > 0)
960       MPI_Irecv(halo->send_list + r_shift,
961                 r_size,
962                 CS_MPI_LNUM,
963                 rank_id,
964                 local_rank,
965                 comm,
966                 &(request[request_count++]));
967   }
968 
969   for (int i = 0; i < halo->n_c_domains; i++) {
970     int rank_id = halo->c_domain_rank[i];
971     if (rank_id == local_rank) continue;
972     cs_lnum_t s_shift = halo->index[2*i];
973     cs_lnum_t s_size  = halo->index[2*i+1] - s_shift;
974     if (s_shift < loc_r_displ) { /* case with local rank first */
975       assert(halo->c_domain_rank[0] == local_rank);
976       s_shift -= halo->index[2];
977     }
978     if (s_size > 0)
979       MPI_Isend(elt_id + s_shift,
980                 s_size,
981                 CS_MPI_LNUM,
982                 rank_id,
983                 rank_id,
984                 comm,
985                 &(request[request_count++]));
986   }
987 
988   MPI_Waitall(request_count, request, status);
989 
990   BFT_FREE(request);
991   BFT_FREE(status);
992 
993   _n_halos += 1;
994 
995   cs_halo_create_complete(halo);
996 
997   return halo;
998 }
999 
1000 #endif /* HAVE_MPI */
1001 
1002 /*----------------------------------------------------------------------------*/
1003 /*!
 * \brief Destroy a halo structure.
1005  *
1006  * \param[in, out]  halo  pointer to pointer to cs_halo structure to destroy.
1007  */
1008 /*----------------------------------------------------------------------------*/
1009 
1010 void
cs_halo_destroy(cs_halo_t ** halo)1011 cs_halo_destroy(cs_halo_t  **halo)
1012 {
1013   if (halo == NULL)
1014     return;
1015 
1016   if (*halo == NULL)
1017     return;
1018 
1019   cs_halo_t  *_halo = *halo;
1020 
1021 #if defined(HAVE_MPI)
1022   if (_halo->c_domain_group != MPI_GROUP_NULL)
1023     MPI_Group_free(&(_halo->c_domain_group));
1024 
1025   BFT_FREE(_halo->c_domain_s_shift);
1026 #endif
1027 
1028   BFT_FREE(_halo->c_domain_rank);
1029 
1030   CS_FREE_HD(_halo->send_list);
1031   CS_FREE_HD(_halo->send_index);
1032   BFT_FREE(_halo->index);
1033 
1034   BFT_FREE(_halo->send_perio_lst);
1035   BFT_FREE(_halo->perio_lst);
1036 
1037   BFT_FREE(*halo);
1038 
1039   _n_halos -= 1;
1040 
1041   /* Delete default state if no halo remains */
1042 
1043   if (_n_halos == 0)
1044     cs_halo_state_destroy(&_halo_state);
1045 }
1046 
1047 /*----------------------------------------------------------------------------*/
1048 /*!
1049  * \brief Create a halo state structure.
1050  *
1051  * \return  pointer to created cs_halo_state_t structure.
1052  */
1053 /*----------------------------------------------------------------------------*/
1054 
1055 cs_halo_state_t *
cs_halo_state_create(void)1056 cs_halo_state_create(void)
1057 {
1058   cs_halo_state_t *hs;
1059   BFT_MALLOC(hs, 1, cs_halo_state_t);
1060 
1061   cs_halo_state_t hs_ini = {
1062     .sync_mode = CS_HALO_STANDARD,
1063     .data_type = CS_DATATYPE_NULL,
1064     .stride = 0,
1065     .var_location = CS_ALLOC_HOST,
1066     .send_buffer_cur = NULL,
1067     .n_requests = 0,
1068     .local_rank_id = -1,
1069     .send_buffer_size = 0,
1070     .recv_buffer_size = 0,
1071     .send_buffer = NULL,
1072     .recv_buffer = NULL
1073 #if defined(HAVE_MPI)
1074     ,
1075     .request_size = 0,
1076     .request = NULL,
1077     .status = NULL,
1078     .win = MPI_WIN_NULL
1079 
1080 #endif
1081   };
1082 
1083   *hs = hs_ini;
1084 
1085   return hs;
1086 }
1087 
1088 /*----------------------------------------------------------------------------*/
1089 /*!
1090  * \brief Destroy a halo state structure.
1091  *
1092  * \param[in, out]  halo_state  pointer to pointer to cs_halo_state
1093  *                              structure to destroy.
1094  */
1095 /*----------------------------------------------------------------------------*/
1096 
1097 void
cs_halo_state_destroy(cs_halo_state_t ** halo_state)1098 cs_halo_state_destroy(cs_halo_state_t  **halo_state)
1099 {
1100   if (halo_state != NULL) {
1101     cs_halo_state_t *hs = *halo_state;
1102 
1103 #if defined(HAVE_MPI)
1104 #if (MPI_VERSION >= 3)
1105     if (hs->win != MPI_WIN_NULL) {
1106       MPI_Win_free(&(hs->win));
1107       hs->win = MPI_WIN_NULL;
1108     }
1109 #endif
1110 #endif
1111 
1112     CS_FREE_HD(hs->send_buffer);
1113 
1114 #if defined(HAVE_MPI)
1115     BFT_FREE(hs->request);
1116     BFT_FREE(hs->status);
1117 #endif
1118 
1119     BFT_FREE(*halo_state);
1120   }
1121 }
1122 
1123 /*----------------------------------------------------------------------------*/
1124 /*!
1125  * \brief Get pointer to default halo state structure.
1126  *
 * \return  pointer to the default cs_halo_state structure.
1128  */
1129 /*----------------------------------------------------------------------------*/
1130 
cs_halo_state_t *
cs_halo_state_get_default(void)
{
  /* File-scope default state, used by sync functions when hs == NULL. */
  return _halo_state;
}
1136 
1137 /*----------------------------------------------------------------------------
1138  * Apply local cells renumbering to a halo
1139  *
1140  * parameters:
1141  *   halo        <-- pointer to halo structure
1142  *   new_cell_id <-- array indicating old -> new cell id (0 to n-1)
1143  *---------------------------------------------------------------------------*/
1144 
1145 void
cs_halo_renumber_cells(cs_halo_t * halo,const cs_lnum_t new_cell_id[])1146 cs_halo_renumber_cells(cs_halo_t        *halo,
1147                        const cs_lnum_t   new_cell_id[])
1148 {
1149   if (halo != NULL) {
1150 
1151     const cs_lnum_t n_elts = halo->n_send_elts[CS_HALO_EXTENDED];
1152 
1153     for (cs_lnum_t j = 0; j < n_elts; j++)
1154       halo->send_list[j] = new_cell_id[halo->send_list[j]];
1155 
1156     cs_sync_h2d(halo->send_list);
1157 
1158   }
1159 }
1160 
1161 /*----------------------------------------------------------------------------
1162  * Apply ghost cells renumbering to a halo
1163  *
1164  * parameters:
1165  *   halo        <-- pointer to halo structure
1166  *   old_cell_id <-- array indicating new -> old cell id (0 to n-1)
1167  *---------------------------------------------------------------------------*/
1168 
void
cs_halo_renumber_ghost_cells(cs_halo_t        *halo,
                             const cs_lnum_t   old_cell_id[])
{
  if (halo == NULL)
    return;

  /* Reverse update from distant cells:
     each rank computes, for its own ghost cells, the permutation applied
     to each rank section (positions relative to the section start), then
     sends that permutation back to the rank owning the matching
     send-list entries, which reorders its send list accordingly. */

  cs_lnum_t *send_buf, *recv_buf;

  BFT_MALLOC(send_buf, halo->n_send_elts[1], cs_lnum_t);
  BFT_MALLOC(recv_buf, halo->n_elts[1], cs_lnum_t);

  for (int i = 0; i < halo->n_c_domains; i++) {
    cs_lnum_t start = halo->index[2*i];
    cs_lnum_t end = halo->index[2*i+2];
    cs_lnum_t shift = halo->n_local_elts + halo->index[2*i];
    /* recv_buf[j] = old position of ghost cell j relative to the start of
       the section for communicating domain i (values in [0, end-start)) */
    for (cs_lnum_t j = start; j < end; j++) {
      recv_buf[j] = old_cell_id[halo->n_local_elts + j] - shift;
      assert(recv_buf[j] >= 0 && recv_buf[j] < (end - start));
    }
  }

  /* In serial mode the only "neighbor" is the local rank itself
     (periodicity); in parallel it is detected in the loop below. */

  int local_rank_id = (cs_glob_n_ranks == 1) ? 0 : -1;

#if defined(HAVE_MPI)

  if (cs_glob_n_ranks > 1) {

    int rank_id;
    int request_count = 0;
    const int local_rank = cs_glob_rank_id;

    MPI_Request  *request;
    MPI_Status   *status;

    BFT_MALLOC(request, halo->n_c_domains*2, MPI_Request);
    BFT_MALLOC(status, halo->n_c_domains*2, MPI_Status);

    /* Receive data from distant ranks
       (note send/recv roles are reversed relative to a normal halo
       exchange: permutations for our send list arrive from the ranks
       owning the matching ghost cells; tag = receiving rank) */

    for (rank_id = 0; rank_id < halo->n_c_domains; rank_id++) {

      cs_lnum_t start = halo->send_index[2*rank_id];
      cs_lnum_t length = (  halo->send_index[2*rank_id + 2]
                          - halo->send_index[2*rank_id]);

      if (halo->c_domain_rank[rank_id] != local_rank) {
        if (length > 0)
          MPI_Irecv(send_buf + start,
                    length,
                    CS_MPI_LNUM,
                    halo->c_domain_rank[rank_id],
                    local_rank,
                    cs_glob_mpi_comm,
                    &(request[request_count++]));
      }
      else
        local_rank_id = rank_id;

    }

    /* We wait for posting all receives (often recommended) */

    if (_halo_use_barrier)
      MPI_Barrier(cs_glob_mpi_comm);

    /* Send data to distant ranks */

    for (rank_id = 0; rank_id < halo->n_c_domains; rank_id++) {

      /* If this is not the local rank */

      if (halo->c_domain_rank[rank_id] != local_rank) {

        cs_lnum_t start = halo->index[2*rank_id];
        cs_lnum_t length = (  halo->index[2*rank_id + 2]
                            - halo->index[2*rank_id]);

        if (length > 0)
          MPI_Isend(recv_buf + start,
                    length,
                    CS_MPI_LNUM,
                    halo->c_domain_rank[rank_id],
                    halo->c_domain_rank[rank_id],
                    cs_glob_mpi_comm,
                    &(request[request_count++]));

      }

    }

    /* Wait for all exchanges */

    MPI_Waitall(request_count, request, status);

    BFT_FREE(request);
    BFT_FREE(status);

  }

#endif /* defined(HAVE_MPI) */

  /* Copy local values if present (periodicity with self) */

  if (local_rank_id > -1) {

    cs_lnum_t *recv = recv_buf + halo->index[2*local_rank_id];

    cs_lnum_t start = halo->send_index[2*local_rank_id];
    cs_lnum_t length = (  halo->send_index[2*local_rank_id + 2]
                        - halo->send_index[2*local_rank_id]);

    for (cs_lnum_t j = 0; j < length; j++)
      send_buf[j+start] = recv[j];

  }

  BFT_FREE(recv_buf);

  /* Now apply renumbering to send list:
     send_buf holds, per section, the old relative position of each
     entry, so the reorder is a gather followed by a copy-back. */

  for (int i = 0; i < halo->n_c_domains; i++) {
    cs_lnum_t start = halo->send_index[2*i];
    cs_lnum_t end = halo->send_index[2*i+2];
    for (cs_lnum_t j = start; j < end; j++)
      send_buf[j] = halo->send_list[start + send_buf[j]];
    for (cs_lnum_t j = start; j < end; j++)
      halo->send_list[j] = send_buf[j];
  }

  /* Keep the device copy (if any) coherent with the host array. */

  cs_sync_h2d(halo->send_list);

  BFT_FREE(send_buf);
}
1305 
1306 /*----------------------------------------------------------------------------*/
1307 /*!
1308  * \brief Initialize halo state prior to packing halo data to send.
1309  *
1310  * A local state handler may be provided, or the default state handler will
1311  * be used.
1312  *
1313  * This function is included in \ref cs_halo_sync_pack, but may be called
1314  * separately for specific implementations, such as for accelerator devices.
1315  *
1316  * A local state and/or buffer may be provided, or the default (global) state
 * and buffer will be used. If provided explicitly,
1318  * the buffer must be of sufficient size.
1319  *
1320  * \param[in]       halo        pointer to halo structure
1321  * \param[in]       sync_mode   synchronization mode (standard or extended)
1322  * \param[in]       data_type   data type
1323  * \param[in]       stride      number of (interlaced) values by entity
1324  * \param[out]      send_buf    pointer to send buffer, NULL for global buffer
1325  * \param[in, out]  hs          pointer to halo state, NULL for global state
1326  *
1327  * \return  pointer to halo send buffer
1328  */
1329 /*----------------------------------------------------------------------------*/
1330 
1331 void *
cs_halo_sync_pack_init_state(const cs_halo_t * halo,cs_halo_type_t sync_mode,cs_datatype_t data_type,int stride,void * send_buf,cs_halo_state_t * hs)1332 cs_halo_sync_pack_init_state(const cs_halo_t  *halo,
1333                              cs_halo_type_t    sync_mode,
1334                              cs_datatype_t     data_type,
1335                              int               stride,
1336                              void             *send_buf,
1337                              cs_halo_state_t  *hs)
1338 {
1339   void *_send_buffer = send_buf;
1340 
1341   if (halo == NULL)
1342     return _send_buffer;
1343 
1344   cs_halo_state_t  *_hs = (hs != NULL) ? hs : _halo_state;
1345 
1346   if (_send_buffer == NULL) {
1347     size_t send_buffer_size = cs_halo_pack_size(halo, data_type, stride);
1348 
1349     if (send_buffer_size > _hs->send_buffer_size) {
1350       cs_alloc_mode_t alloc_mode = cs_check_device_ptr(halo->send_list);
1351 
1352       _hs->send_buffer_size = send_buffer_size;
1353 
1354 #if defined(HAVE_MPI)
1355 #if (MPI_VERSION >= 3)
1356       if (_hs->win != MPI_WIN_NULL) {
1357         MPI_Win_free(&(_hs->win));
1358         _hs->win = MPI_WIN_NULL;
1359       }
1360 #endif
1361 #endif
1362 
1363       CS_FREE_HD(_hs->send_buffer);
1364       CS_MALLOC_HD(_hs->send_buffer,
1365                    _hs->send_buffer_size,
1366                    char,
1367                    alloc_mode);
1368 
1369 #if defined(HAVE_MPI)
1370 #if (MPI_VERSION >= 3)
1371       if (_halo_comm_mode == CS_HALO_COMM_RMA_GET)
1372         MPI_Win_create(_hs->send_buffer,
1373                        _hs->send_buffer_size,
1374                        1,   /* displacement unit */
1375                        MPI_INFO_NULL,
1376                        MPI_COMM_WORLD,
1377                        &(_hs->win));
1378 #endif
1379 #endif
1380     }
1381 
1382     _send_buffer = _hs->send_buffer;
1383   }
1384 
1385   _hs->var_location = CS_ALLOC_HOST;
1386   _hs->send_buffer_cur = _send_buffer;
1387 
1388   _hs->sync_mode = sync_mode;
1389   _hs->data_type = data_type;
1390   _hs->stride = stride;
1391 
1392   return _send_buffer;
1393 }
1394 
1395 /*----------------------------------------------------------------------------*/
1396 /*!
1397  * \brief Pack halo data to send into dense buffer.
1398  *
1399  * A local state handler may be provided, or the default state handler will
1400  * be used.
1401  *
1402  * A local state and/or buffer may be provided, or the default (global) state
 * and buffer will be used. If provided explicitly,
1404  * the buffer must be of sufficient size.
1405  *
1406  * \param[in]       halo        pointer to halo structure
1407  * \param[in]       sync_mode   synchronization mode (standard or extended)
1408  * \param[in]       data_type   data type
1409  * \param[in]       stride      number of (interlaced) values by entity
1410  * \param[in]       val         pointer to variable value array
1411  * \param[out]      send_buf    pointer to send buffer, NULL for global buffer
1412  * \param[in, out]  hs          pointer to halo state, NULL for global state
1413  */
1414 /*----------------------------------------------------------------------------*/
1415 
void
cs_halo_sync_pack(const cs_halo_t  *halo,
                  cs_halo_type_t    sync_mode,
                  cs_datatype_t     data_type,
                  int               stride,
                  void             *val,
                  void             *send_buf,
                  cs_halo_state_t  *hs)
{
  if (halo == NULL)
    return;

  /* Record the exchange parameters in the state and obtain the
     (user-provided or internal) send buffer. */

  void *_send_buffer = cs_halo_sync_pack_init_state(halo,
                                                    sync_mode,
                                                    data_type,
                                                    stride,
                                                    send_buf,
                                                    hs);

  /* end_shift selects how much of each rank section is packed:
     1 -> standard halo only, 2 -> standard + extended. */

  cs_lnum_t end_shift = 0;

  if (sync_mode == CS_HALO_STANDARD)
    end_shift = 1;

  else if (sync_mode == CS_HALO_EXTENDED)
    end_shift = 2;

  /* Assemble buffers for halo exchange; avoid threading for now, as dynamic
     scheduling led to slightly higher cost here in some tests,
     and even static scheduling might lead to false sharing for small
     halos. */

  if (data_type == CS_REAL_TYPE) {

    /* Typed (cs_real_t) gather path. */

    cs_real_t *buffer = (cs_real_t *)_send_buffer;
    cs_real_t *var = val;

    for (int rank_id = 0; rank_id < halo->n_c_domains; rank_id++) {

      /* p_start: offset in buffer (values); start/length: element range
         of this rank's section in the send list. */

      cs_lnum_t p_start = halo->send_index[2*rank_id]*stride;
      size_t start = halo->send_index[2*rank_id];
      size_t length = (  halo->send_index[2*rank_id + end_shift]
                       - halo->send_index[2*rank_id]);

      if (stride == 3) { /* Unroll loop for this case */
        for (size_t i = 0; i < length; i++) {
          buffer[p_start + i*3]
            = var[(halo->send_list[start + i])*3];
          buffer[p_start + i*3 + 1]
            = var[(halo->send_list[start + i])*3 + 1];
          buffer[p_start + i*3 + 2]
            = var[(halo->send_list[start + i])*3 + 2];
        }
      }
      else {
        size_t _stride = stride;
        for (size_t i = 0; i < length; i++) {
          size_t r_start = halo->send_list[start + i] * stride;
          for (size_t j = 0; j < _stride; j++)
            buffer[p_start + i*_stride + j] = var[r_start + j];
        }
      }

    }

  }

  else {

    /* Generic path: gather raw bytes, elt_size bytes per element. */

    unsigned char *buffer = (unsigned char *)_send_buffer;

    size_t elt_size = cs_datatype_size[data_type] * stride;

    for (int rank_id = 0; rank_id < halo->n_c_domains; rank_id++) {

      /* p_start is a byte offset here (elt_size already includes stride). */

      cs_lnum_t p_start = halo->send_index[2*rank_id]*elt_size;
      size_t start = halo->send_index[2*rank_id];
      size_t length = (  halo->send_index[2*rank_id + end_shift]
                       - halo->send_index[2*rank_id]);

      unsigned char *restrict _val = val;
      unsigned char *_buffer = buffer + p_start;

      for (size_t i = 0; i < length; i++) {
        size_t r_start = halo->send_list[start + i] * elt_size;
        for (size_t j = 0; j < elt_size; j++)
          _buffer[i*elt_size + j] = _val[r_start + j];
      }

    }

  }
}
1509 
1510 #if defined(HAVE_ACCEL)
1511 
1512 /*----------------------------------------------------------------------------*/
1513 /*!
1514  * \brief Pack halo data to send into dense buffer on accelerator device.
1515  *
1516  * A local state handler may be provided, or the default state handler will
1517  * be used.
1518  *
1519  * A local state and/or buffer may be provided, or the default (global) state
 * and buffer will be used. If provided explicitly,
1521  * the buffer must be of sufficient size.
1522  *
1523  * \param[in]       halo        pointer to halo structure
1524  * \param[in]       sync_mode   synchronization mode (standard or extended)
1525  * \param[in]       data_type   data type
1526  * \param[in]       stride      number of (interlaced) values by entity
1527  * \param[in]       val         pointer to variable value array (on device)
1528  * \param[out]      send_buf    pointer to send buffer (on device),
1529  *                              NULL for global buffer
1530  * \param[in, out]  hs          pointer to halo state, NULL for global state
1531  */
1532 /*----------------------------------------------------------------------------*/
1533 
1534 void
cs_halo_sync_pack_d(const cs_halo_t * halo,cs_halo_type_t sync_mode,cs_datatype_t data_type,int stride,void * val,void * send_buf,cs_halo_state_t * hs)1535 cs_halo_sync_pack_d(const cs_halo_t  *halo,
1536                     cs_halo_type_t    sync_mode,
1537                     cs_datatype_t     data_type,
1538                     int               stride,
1539                     void             *val,
1540                     void             *send_buf,
1541                     cs_halo_state_t  *hs)
1542 {
1543   if (halo == NULL)
1544     return;
1545 
1546   cs_halo_state_t  *_hs = (hs != NULL) ? hs : _halo_state;
1547 
1548   void *_send_buf = cs_halo_sync_pack_init_state(halo,
1549                                                  sync_mode,
1550                                                  data_type,
1551                                                  stride,
1552                                                  send_buf,
1553                                                  _hs);
1554 
1555   void *_send_buf_d = cs_get_device_ptr(_send_buf);
1556 
1557 #if defined(HAVE_CUDA)
1558 
1559   cs_halo_cuda_pack_send_buffer_real(halo,
1560                                      sync_mode,
1561                                      stride,
1562                                      val,
1563                                      _send_buf_d);
1564 
1565 #else
1566 
1567   cs_halo_sync_pack(halo,
1568                     sync_mode,
1569                     data_type,
1570                     stride,
1571                     val,
1572                     send_buf,
1573                     _hs);
1574 
1575 #endif
1576 
1577   _hs->var_location = CS_ALLOC_HOST;
1578 }
1579 
1580 #endif /* defined(HAVE_ACCEL) */
1581 
1582 /*----------------------------------------------------------------------------*/
1583 /*!
1584  * \brief Launch update array of values in case of parallelism or periodicity.
1585  *
1586  * This function aims at copying main values from local elements
1587  * (id between 1 and n_local_elements) to ghost elements on distant ranks
1588  * (id between n_local_elements + 1 to n_local_elements_with_halo).
1589  *
1590  * The cs_halo_sync_pack function should have been called before this function,
1591  * using the same hs argument.
1592  *
1593  * \param[in]       halo        pointer to halo structure
1594  * \param[in]       val         pointer to variable value array
1595  * \param[in, out]  hs          pointer to halo state, NULL for global state
1596  */
1597 /*----------------------------------------------------------------------------*/
1598 
1599 void
cs_halo_sync_start(const cs_halo_t * halo,void * val,cs_halo_state_t * hs)1600 cs_halo_sync_start(const cs_halo_t  *halo,
1601                    void             *val,
1602                    cs_halo_state_t  *hs)
1603 {
1604   if (halo == NULL)
1605     return;
1606 
1607   cs_halo_state_t  *_hs = (hs != NULL) ? hs : _halo_state;
1608 
1609 #if (MPI_VERSION >= 3)
1610   if (_halo_comm_mode > CS_HALO_COMM_P2P) {
1611     _halo_sync_start_one_sided(halo, val, _hs);
1612     return;
1613   }
1614 #endif
1615 
1616   cs_lnum_t end_shift = (_hs->sync_mode == CS_HALO_EXTENDED) ? 2 : 1;
1617   cs_lnum_t stride = _hs->stride;
1618   size_t elt_size = cs_datatype_size[_hs->data_type] * stride;
1619   size_t n_loc_elts = halo->n_local_elts;
1620 
1621   unsigned char *restrict _val = val;
1622   unsigned char *restrict _val_dest = _val + n_loc_elts*elt_size;
1623 
1624   unsigned char *buffer = (unsigned char *)(_hs->send_buffer_cur);
1625 
1626   if (_hs->var_location > CS_ALLOC_HOST) {
1627 #   if defined(_CS_MPI_DEVICE_SUPPORT)
1628     /* For CUDA-aware MPI, directly work with buffer on device */
1629     buffer = cs_get_device_ptr(buffer);
1630 # else
1631     /* For host-based MPI, copy or prefetch buffer */
1632     cs_sync_d2h(buffer);
1633 
1634     /* When array passed is defined on device but is not shared, use
1635        separate (smaller) CPU for receive (as we cannot know whether
1636        a matching host beffer without complexifying the API);
1637        this will be copied back to device at the next step */
1638     if (_hs->var_location != CS_ALLOC_HOST_DEVICE_SHARED) {
1639       if (_hs->recv_buffer_size < _hs->send_buffer_size) {
1640         _hs->recv_buffer_size = _hs->send_buffer_size;
1641         CS_FREE_HD(_hs->recv_buffer);
1642         CS_MALLOC_HD(_hs->recv_buffer, _hs->recv_buffer_size, unsigned char,
1643                      CS_ALLOC_HOST_DEVICE_PINNED);
1644       }
1645       _val_dest = _hs->recv_buffer;
1646     }
1647 #endif
1648   }
1649 
1650 #if defined(HAVE_MPI)
1651 
1652   _update_requests(halo, _hs);
1653 
1654   MPI_Datatype mpi_datatype = cs_datatype_to_mpi[_hs->data_type];
1655 
1656   int request_count = 0;
1657   const int local_rank = CS_MAX(cs_glob_rank_id, 0);
1658 
1659   /* Receive data from distant ranks */
1660 
1661   for (int rank_id = 0; rank_id < halo->n_c_domains; rank_id++) {
1662 
1663     cs_lnum_t length = (  halo->index[2*rank_id + end_shift]
1664                         - halo->index[2*rank_id]) * stride;
1665 
1666     if (halo->c_domain_rank[rank_id] != local_rank) {
1667 
1668       if (length > 0) {
1669         size_t start = (size_t)(halo->index[2*rank_id]);
1670         unsigned char *dest = _val_dest + start*elt_size;
1671 
1672         MPI_Irecv(dest,
1673                   length*_hs->stride,
1674                   mpi_datatype,
1675                   halo->c_domain_rank[rank_id],
1676                   halo->c_domain_rank[rank_id],
1677                   cs_glob_mpi_comm,
1678                   &(_hs->request[request_count++]));
1679       }
1680 
1681     }
1682     else
1683       _hs->local_rank_id = rank_id;
1684   }
1685 
1686   /* We may wait for posting all receives (sometimes recommended) */
1687 
1688   if (_halo_use_barrier)
1689     MPI_Barrier(cs_glob_mpi_comm);
1690 
1691   /* Send data to distant ranks */
1692 
1693   for (int rank_id = 0; rank_id < halo->n_c_domains; rank_id++) {
1694 
1695     cs_lnum_t start = halo->send_index[2*rank_id]*elt_size;
1696     cs_lnum_t length = (  halo->send_index[2*rank_id + end_shift]
1697                         - halo->send_index[2*rank_id]);
1698 
1699     if (halo->c_domain_rank[rank_id] != local_rank && length > 0)
1700       MPI_Isend(buffer + start,
1701                 length*stride,
1702                 mpi_datatype,
1703                 halo->c_domain_rank[rank_id],
1704                 local_rank,
1705                 cs_glob_mpi_comm,
1706                 &(_hs->request[request_count++]));
1707 
1708   }
1709 
1710   _hs->n_requests = request_count;
1711 
1712 #endif /* defined(HAVE_MPI) */
1713 }
1714 
1715 /*----------------------------------------------------------------------------*/
1716 /*!
1717  * \brief Wait for completion of update array of values in case of
1718  *  parallelism or periodicity.
1719  *
1720  * This function aims at copying main values from local elements
1721  * (id between 1 and n_local_elements) to ghost elements on distant ranks
1722  * (id between n_local_elements + 1 to n_local_elements_with_halo).
1723  *
1724  * The cs_halo_sync_start function should have been called before this function,
1725  * using the same hs argument.
1726  *
1727  * \param[in]       halo        pointer to halo structure
1728  * \param[in]       val         pointer to variable value array
1729  * \param[in, out]  hs          pointer to halo state, NULL for global state
1730  */
1731 /*----------------------------------------------------------------------------*/
1732 
void
cs_halo_sync_wait(const cs_halo_t  *halo,
                  void             *val,
                  cs_halo_state_t  *hs)
{
  if (halo == NULL)
    return;

  cs_halo_state_t  *_hs = (hs != NULL) ? hs : _halo_state;

#if (MPI_VERSION >= 3)
  /* One-sided (RMA) mode: completion handled by its dedicated path. */
  if (_halo_comm_mode > CS_HALO_COMM_P2P) {
    _halo_sync_complete_one_sided(halo, val, _hs);
    return;
  }
#endif

#if defined(HAVE_MPI)

  /* Wait for all exchanges */

  if (_hs->n_requests > 0)
    MPI_Waitall(_hs->n_requests, _hs->request, _hs->status);

#endif /* defined(HAVE_MPI) */

#if defined(HAVE_ACCEL)
#if !defined(_CS_MPI_DEVICE_SUPPORT)
  /* With host-based MPI and device-resident data, move the received
     ghost values back to the device (prefetch when memory is shared,
     explicit copy from the host receive buffer otherwise). */
  if (_hs->var_location > CS_ALLOC_HOST) {

    size_t n_loc_elts = halo->n_local_elts;
    size_t n_elts = (   _hs->sync_mode
                     == CS_HALO_EXTENDED) ? halo->n_elts[1] : halo->n_elts[0];
    size_t elt_size = cs_datatype_size[_hs->data_type] * _hs->stride;
    size_t n_bytes = n_elts*elt_size;

    unsigned char *restrict _val = val;
    unsigned char *restrict _val_dest = _val + n_loc_elts*elt_size;

    if (_hs->var_location == CS_ALLOC_HOST_DEVICE_SHARED)
      cs_prefetch_h2d(_val_dest, n_bytes);
    else
      cs_copy_h2d(_val_dest, _hs->recv_buffer, n_bytes);

  }
#endif
#endif /* defined(HAVE_ACCEL) */

  /* Copy local values in case of periodicity */

  if (_hs->local_rank_id > -1) {
    size_t elt_size = cs_datatype_size[_hs->data_type] * _hs->stride;
    _sync_local(halo, _hs->local_rank_id, _hs->sync_mode, elt_size,
                _hs->var_location, _hs->send_buffer_cur, val);
  }

  /* Cleanup: reset state so stale parameters cannot leak into the
     next exchange. */

  _hs->sync_mode = CS_HALO_STANDARD;
  _hs->data_type = CS_DATATYPE_NULL;
  _hs->stride = 0;
  _hs->send_buffer_cur = NULL;
  _hs->n_requests = 0;
  _hs->local_rank_id  = -1;
}
1798 
1799 /*----------------------------------------------------------------------------*/
1800 /*!
1801  * \brief Update array of values in case of parallelism or periodicity.
1802  *
1803  * This function aims at copying main values from local elements
1804  * (id between 1 and n_local_elements) to ghost elements on distant ranks
1805  * (id between n_local_elements + 1 to n_local_elements_with_halo).
1806  *
1807  * \param[in]   halo        pointer to halo structure
1808  * \param[in]   sync_mode   synchronization mode (standard or extended)
1809  * \param[in]   data_type   data type
1810  * \param[in]   stride      number of (interlaced) values by entity
1811  * \param[in]   val         pointer to variable value array
1812  */
1813 /*----------------------------------------------------------------------------*/
1814 
1815 void
cs_halo_sync(const cs_halo_t * halo,cs_halo_type_t sync_mode,cs_datatype_t data_type,int stride,void * val)1816 cs_halo_sync(const cs_halo_t  *halo,
1817              cs_halo_type_t    sync_mode,
1818              cs_datatype_t     data_type,
1819              int               stride,
1820              void             *val)
1821 {
1822   if (halo == NULL)
1823     return;
1824 
1825   cs_halo_sync_pack(halo,
1826                     sync_mode,
1827                     data_type,
1828                     stride,
1829                     val,
1830                     NULL,
1831                     NULL);
1832 
1833   cs_halo_sync_start(halo, val, NULL);
1834 
1835   cs_halo_sync_wait(halo, val, NULL);
1836 }
1837 
1838 /*----------------------------------------------------------------------------
1839  * Update array of any type of halo values in case of parallelism or
1840  * periodicity.
1841  *
1842  * Data is untyped; only its size is given, so this function may also
1843  * be used to synchronize interleaved multidimendsional data, using
1844  * size = element_size*dim (assuming a homogeneous environment, at least
1845  * as far as data encoding goes).
1846  *
1847  * This function aims at copying main values from local elements
1848  * (id between 1 and n_local_elements) to ghost elements on distant ranks
1849  * (id between n_local_elements + 1 to n_local_elements_with_halo).
1850  *
1851  * parameters:
1852  *   halo      <-- pointer to halo structure
1853  *   sync_mode <-- synchronization mode (standard or extended)
1854  *   size      <-- datatype size
1855  *   num       <-> pointer to local number value array
1856  *----------------------------------------------------------------------------*/
1857 
1858 void
cs_halo_sync_untyped(const cs_halo_t * halo,cs_halo_type_t sync_mode,size_t size,void * val)1859 cs_halo_sync_untyped(const cs_halo_t  *halo,
1860                      cs_halo_type_t    sync_mode,
1861                      size_t            size,
1862                      void             *val)
1863 {
1864   cs_halo_sync(halo, sync_mode, CS_CHAR, size, val);
1865 }
1866 
1867 /*----------------------------------------------------------------------------
1868  * Update array of integer halo values in case of parallelism or periodicity.
1869  *
1870  * This function aims at copying main values from local elements
1871  * (id between 1 and n_local_elements) to ghost elements on distant ranks
1872  * (id between n_local_elements + 1 to n_local_elements_with_halo).
1873  *
1874  * parameters:
1875  *   halo      <-- pointer to halo structure
1876  *   sync_mode <-- synchronization mode (standard or extended)
1877  *   num       <-> pointer to local number value array
1878  *----------------------------------------------------------------------------*/
1879 
1880 void
cs_halo_sync_num(const cs_halo_t * halo,cs_halo_type_t sync_mode,cs_lnum_t num[])1881 cs_halo_sync_num(const cs_halo_t  *halo,
1882                  cs_halo_type_t    sync_mode,
1883                  cs_lnum_t         num[])
1884 {
1885   cs_halo_sync(halo, sync_mode, CS_LNUM_TYPE, 1, num);
1886 }
1887 
1888 /*----------------------------------------------------------------------------
1889  * Update array of variable (floating-point) halo values in case of
1890  * parallelism or periodicity.
1891  *
1892  * This function aims at copying main values from local elements
1893  * (id between 1 and n_local_elements) to ghost elements on distant ranks
1894  * (id between n_local_elements + 1 to n_local_elements_with_halo).
1895  *
1896  * parameters:
1897  *   halo      <-- pointer to halo structure
1898  *   sync_mode <-- synchronization mode (standard or extended)
1899  *   var       <-> pointer to variable value array
1900  *----------------------------------------------------------------------------*/
1901 
1902 void
cs_halo_sync_var(const cs_halo_t * halo,cs_halo_type_t sync_mode,cs_real_t var[])1903 cs_halo_sync_var(const cs_halo_t  *halo,
1904                  cs_halo_type_t    sync_mode,
1905                  cs_real_t         var[])
1906 {
1907   cs_halo_sync(halo, sync_mode, CS_REAL_TYPE, 1, var);
1908 }
1909 
1910 /*----------------------------------------------------------------------------
1911  * Update array of strided variable (floating-point) values in case
1912  * of parallelism or periodicity.
1913  *
1914  * This function aims at copying main values from local elements
1915  * (id between 1 and n_local_elements) to ghost elements on distant ranks
1916  * (id between n_local_elements + 1 to n_local_elements_with_halo).
1917  *
1918  * parameters:
1919  *   halo      <-- pointer to halo structure
1920  *   sync_mode <-- synchronization mode (standard or extended)
1921  *   var       <-> pointer to variable value array
1922  *   stride    <-- number of (interlaced) values by entity
1923  *----------------------------------------------------------------------------*/
1924 
1925 void
cs_halo_sync_var_strided(const cs_halo_t * halo,cs_halo_type_t sync_mode,cs_real_t var[],int stride)1926 cs_halo_sync_var_strided(const cs_halo_t  *halo,
1927                          cs_halo_type_t    sync_mode,
1928                          cs_real_t         var[],
1929                          int               stride)
1930 {
1931   cs_halo_sync(halo, sync_mode, CS_REAL_TYPE, stride, var);
1932 }
1933 
1934 /*----------------------------------------------------------------------------
1935  * Return MPI_Barrier usage flag.
1936  *
1937  * returns:
1938  *   true if MPI barriers are used after posting receives and before posting
1939  *   sends, false otherwise
1940  *---------------------------------------------------------------------------*/
1941 
1942 bool
cs_halo_get_use_barrier(void)1943 cs_halo_get_use_barrier(void)
1944 {
1945   return _halo_use_barrier;
1946 }
1947 
1948 /*----------------------------------------------------------------------------
1949  * Set MPI_Barrier usage flag.
1950  *
1951  * parameters:
1952  *   use_barrier <-- true if MPI barriers should be used after posting
1953  *                   receives and before posting sends, false otherwise.
1954  *---------------------------------------------------------------------------*/
1955 
1956 void
cs_halo_set_use_barrier(bool use_barrier)1957 cs_halo_set_use_barrier(bool use_barrier)
1958 {
1959   _halo_use_barrier = use_barrier;
1960 }
1961 
1962 /*----------------------------------------------------------------------------*/
1963 /*!
1964  * \brief Get default communication mode for halo exchange.
1965  *
1966  * \return  allocation mode
1967  */
1968 /*----------------------------------------------------------------------------*/
1969 
1970 cs_halo_comm_mode_t
cs_halo_get_comm_mode(void)1971 cs_halo_get_comm_mode(void)
1972 {
1973   return _halo_comm_mode;
1974 }
1975 
1976 /*----------------------------------------------------------------------------*/
1977 /*!
1978  * \brief Set default communication mode for halo exchange.
1979  *
1980  * \param[in]  mode  allocation mode to set
1981  */
1982 /*----------------------------------------------------------------------------*/
1983 
1984 void
cs_halo_set_comm_mode(cs_halo_comm_mode_t mode)1985 cs_halo_set_comm_mode(cs_halo_comm_mode_t  mode)
1986 {
1987   if (mode >= CS_HALO_COMM_P2P && mode <= CS_HALO_COMM_RMA_GET)
1988     _halo_comm_mode = mode;
1989 }
1990 
1991 /*----------------------------------------------------------------------------*/
1992 /*!
1993  * \brief Get default host/device allocation mode for message packing arrays.
1994  *
1995  * \return  allocation mode
1996  */
1997 /*----------------------------------------------------------------------------*/
1998 
1999 cs_alloc_mode_t
cs_halo_get_buffer_alloc_mode(void)2000 cs_halo_get_buffer_alloc_mode(void)
2001 {
2002   return _halo_buffer_alloc_mode;
2003 }
2004 
2005 /*----------------------------------------------------------------------------*/
2006 /*!
2007  * \brief Set default host/device allocation mode for message packing arrays.
2008  *
2009  * \param[in]  mode  allocation mode to set
2010  */
2011 /*----------------------------------------------------------------------------*/
2012 
2013 void
cs_halo_set_buffer_alloc_mode(cs_alloc_mode_t mode)2014 cs_halo_set_buffer_alloc_mode(cs_alloc_mode_t  mode)
2015 {
2016   _halo_buffer_alloc_mode = mode;
2017 }
2018 
2019 /*----------------------------------------------------------------------------
2020  * Dump a cs_halo_t structure.
2021  *
2022  * parameters:
2023  *   halo           <-- pointer to cs_halo_t struture
2024  *   print_level    <--  0 only dimensions and indexes are printed, else (1)
2025  *                       everything is printed
2026  *---------------------------------------------------------------------------*/
2027 
2028 void
cs_halo_dump(const cs_halo_t * halo,int print_level)2029 cs_halo_dump(const cs_halo_t  *halo,
2030              int               print_level)
2031 {
2032   if (halo == NULL) {
2033     bft_printf("\n\n  halo: nil\n");
2034     return;
2035   }
2036 
2037   bft_printf("\n  halo:         %p\n"
2038              "  n_transforms:   %d\n"
2039              "  n_c_domains:    %d\n"
2040              "  periodicity:    %p\n"
2041              "  n_rotations:    %d\n"
2042              "  n_local_elts:   %ld\n",
2043              (const void *)halo,
2044              halo->n_transforms, halo->n_c_domains,
2045              (const void *)halo->periodicity,
2046              halo->n_rotations, (long)halo->n_local_elts);
2047 
2048   bft_printf("\nRanks on halo frontier:\n");
2049   for (int i = 0; i < halo->n_c_domains; i++)
2050     bft_printf("%5d", halo->c_domain_rank[i]);
2051 
2052   for (int halo_id = 0; halo_id < 2; halo_id++) {
2053 
2054     cs_lnum_t  n_elts[2];
2055     cs_lnum_t  *index = NULL, *list = NULL, *perio_lst = NULL;
2056 
2057     bft_printf("\n    ---------\n");
2058 
2059     if (halo_id == 0) {
2060 
2061       bft_printf("    send_list:\n");
2062       n_elts[0] = halo->n_send_elts[0];
2063       n_elts[1] = halo->n_send_elts[1];
2064       index = halo->send_index;
2065       list = halo->send_list;
2066       perio_lst = halo->send_perio_lst;
2067 
2068     }
2069     else if (halo_id == 1) {
2070 
2071       bft_printf("    halo:\n");
2072       n_elts[0] = halo->n_elts[0];
2073       n_elts[1] = halo->n_elts[1];
2074       index = halo->index;
2075       list = NULL;
2076       perio_lst = halo->perio_lst;
2077 
2078     }
2079 
2080     bft_printf("    ---------\n\n");
2081     bft_printf("  n_ghost_cells:        %ld\n"
2082                "  n_std_ghost_cells:    %ld\n", (long)n_elts[1], (long)n_elts[0]);
2083 
2084     if (index == NULL)
2085       return;
2086 
2087     if (halo->n_transforms > 0) {
2088 
2089       const cs_lnum_t  stride = 4*halo->n_c_domains;
2090 
2091       for (int i = 0; i < halo->n_transforms; i++) {
2092 
2093         bft_printf("\nTransformation number: %d\n", i+1);
2094 
2095         for (int j = 0; j < halo->n_c_domains; j++) {
2096 
2097           bft_printf("    rank %3d <STD> %5ld %5ld <EXT> %5ld %5ld\n",
2098                      halo->c_domain_rank[j],
2099                      (long)perio_lst[i*stride + 4*j],
2100                      (long)perio_lst[i*stride + 4*j+1],
2101                      (long)perio_lst[i*stride + 4*j+2],
2102                      (long)perio_lst[i*stride + 4*j+3]);
2103         }
2104 
2105       } /* End of loop on perio */
2106 
2107     } /* End if n_perio > 0 */
2108 
2109     for (int i = 0; i < halo->n_c_domains; i++) {
2110 
2111       bft_printf("\n  rank      %d:\n", halo->c_domain_rank[i]);
2112 
2113       if (index[2*i+1] - index[2*i] > 0) {
2114 
2115         bft_printf("\n  Standard halo\n");
2116         bft_printf("  idx start %ld:          idx end   %ld:\n",
2117                    (long)index[2*i], (long)index[2*i+1]);
2118 
2119         if (print_level > 0 && list != NULL) {
2120           bft_printf("\n            idx     elt id\n");
2121           for (cs_lnum_t j = index[2*i]; j < index[2*i+1]; j++)
2122             bft_printf("    %10ld %10ld\n", (long)j, (long)list[j]);
2123         }
2124 
2125       } /* there are elements on standard neighborhood */
2126 
2127       if (index[2*i+2] - index[2*i+1] > 0) {
2128 
2129         bft_printf("\n  Extended halo\n");
2130         bft_printf("  idx start %ld:          idx end   %ld:\n",
2131                    (long)index[2*i+1], (long)index[2*i+2]);
2132 
2133         if (print_level > 0 && list != NULL) {
2134           bft_printf("\n            idx     elt id\n");
2135           for (long j = index[2*i+1]; j < index[2*i+2]; j++)
2136             bft_printf("    %10ld %10ld %10ld\n",
2137                        (long)j, (long)list[j], (long)halo->n_local_elts+j);
2138         }
2139 
2140       } /* If there are elements on extended neighborhood */
2141 
2142     } /* End of loop on involved ranks */
2143 
2144   } /* End of loop on halos (send_halo/halo) */
2145 
2146   bft_printf("\n\n");
2147   bft_printf_flush();
2148 }
2149 
2150 /*----------------------------------------------------------------------------*/
2151 
2152 END_C_DECLS
2153