1 /*============================================================================
2  * Incremental or general construction of matrix.
3  *============================================================================*/
4 
5 /*
6   This file is part of Code_Saturne, a general-purpose CFD tool.
7 
8   Copyright (C) 1998-2021 EDF S.A.
9 
10   This program is free software; you can redistribute it and/or modify it under
11   the terms of the GNU General Public License as published by the Free Software
12   Foundation; either version 2 of the License, or (at your option) any later
13   version.
14 
15   This program is distributed in the hope that it will be useful, but WITHOUT
16   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
18   details.
19 
20   You should have received a copy of the GNU General Public License along with
21   this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
22   Street, Fifth Floor, Boston, MA 02110-1301, USA.
23 */
24 
25 /*----------------------------------------------------------------------------*/
26 
27 #include "cs_defs.h"
28 
29 /*----------------------------------------------------------------------------
30  * Standard C library headers
31  *----------------------------------------------------------------------------*/
32 
33 #include <assert.h>
34 #include <math.h>
35 #include <stdlib.h>
36 #include <stdarg.h>
37 #include <stdio.h>
38 #include <string.h>
39 
40 /*----------------------------------------------------------------------------
41  * Local headers
42  *----------------------------------------------------------------------------*/
43 
44 #include "bft_error.h"
45 #include "bft_mem.h"
46 #include "bft_mem_usage.h"
47 
48 #include "cs_defs.h"
49 #include "cs_log.h"
50 #include "cs_rank_neighbors.h"
51 #include "cs_sort.h"
52 #include "cs_timer.h"
53 
54 #include "cs_matrix.h"
55 
56 /*----------------------------------------------------------------------------
57  *  Header for the current file
58  *----------------------------------------------------------------------------*/
59 
60 #include "cs_matrix_assembler_priv.h"
61 #include "cs_matrix_assembler.h"
62 
63 /*----------------------------------------------------------------------------*/
64 
65 BEGIN_C_DECLS
66 
67 /*=============================================================================
68  * Additional doxygen documentation
69  *============================================================================*/
70 
71 /*!
72   \file cs_matrix_assembler.c
73 
74   \brief Incremental or general construction of matrix structure.
75 
76   The matrix assembler is intended to assist the building of matrices
77   in general parallel conditions, assuming each parallel rank is assigned a
78   given number of rows, in increasing order. Column elements may refer to rows
79   assigned to other ranks. This allows a good mapping to external libraries
80   such as PETSc and Hypre.
81 
82   Moreover, in some cases, some global matrix elements may be computed on ranks
83   to whom neither the row nor the column is assigned. This may happen for
84   example when rows are assigned to mesh vertices, and a given cell
85   provides a contribution to two vertices laying on parallel rank boundaries,
86   and assigned to two different ranks, which are different from the cell's
87   rank. Few such contributions are expected with a good partitionning, but
88   they cannot always be avoided, or this would require complex partitioning
89   constraints. libraries such as PETSc do not handle these cases, at least
90   not efficiently (as the recommended preallocation required for local rows
91   cannot be computed without the knowledge of those contributions), so
92   the assembler should help manage these contributions in any case.
93 
94   The addition of a given non-zero to a matrix structure may be done
95   multiple times (for example when looping on cells with an internal loop on
96   vertices), but for better performance (and memory usage), it is recommended
97   to avoid the same info multiple times, as duplicates may be removed only
98   during the computation stage).
99  */
100 
101 /*! \cond DOXYGEN_SHOULD_SKIP_THIS */
102 
103 /*=============================================================================
104  * Local Macro Definitions
105  *============================================================================*/
106 
107 /* Minimum size for OpenMP loops (needs benchmarking to adjust) */
108 #define THR_MIN 128
109 
110 /* Fixed coefficient buffer size for accumulation
111    (a reasonably small fixed size has the advantage of being easily usable
112    on the stack and in a threading context, and that size should still
113    be large enough to amortize calls to lower-level functions */
114 
115 #undef  COEFF_GROUP_SIZE
116 #define COEFF_GROUP_SIZE 256
117 
118 #undef  HISTOGRAM_SUBS
119 #define HISTOGRAM_SUBS  5
120 
121 /*=============================================================================
122  * Local Structure Definitions
123  *============================================================================*/
124 
125 /*============================================================================
126  * Private function definitions
127  *============================================================================*/
128 
129 /*----------------------------------------------------------------------------*/
130 /*!
131  * \brief Print distribution of counter over ranks info to a given log type.
132  *
133  * \param[in]  log_id  log file type
134  * \param[in]  count   counter value for current rank
135  */
136 /*----------------------------------------------------------------------------*/
137 
138 static void
_display_rank_histogram(cs_log_t log_id,int count)139 _display_rank_histogram(cs_log_t  log_id,
140                         int       count)
141 {
142   int  i, j, k, count_max, count_min;
143   double step;
144 
145   int h_count[HISTOGRAM_SUBS];
146   int n_steps = HISTOGRAM_SUBS;
147 
148   const int n_counts = cs_glob_n_ranks;
149 
150   int *r_count;
151   BFT_MALLOC(r_count, cs_glob_n_ranks, int);
152 
153   r_count[0] = count;
154 
155 #if defined(HAVE_MPI)
156   if (cs_glob_n_ranks > 1)
157     MPI_Allgather(&count, 1, MPI_INT,
158                   r_count, 1, MPI_INT, cs_glob_mpi_comm);
159 #endif
160 
161   /* Compute local min and max */
162 
163   count_max = r_count[0];
164   count_min = r_count[0];
165   for (i = 1; i < n_counts; i++) {
166     count_min = CS_MIN(count_min, r_count[i]);
167     count_max = CS_MAX(count_max, r_count[i]);
168   }
169 
170   cs_log_printf(log_id, _("    minimum count =         %10d\n"), count_min);
171   cs_log_printf(log_id, _("    maximum count =         %10d\n\n"), count_max);
172 
173   /* Define axis subdivisions */
174 
175   for (j = 0; j < n_steps; j++)
176     h_count[j] = 0;
177 
178   if (count_max - count_min > 0) {
179 
180     if (count_max-count_min < n_steps)
181       n_steps = CS_MAX(1, floor(count_max-count_min));
182 
183     step = (double)(count_max - count_min) / n_steps;
184 
185     /* Loop on counts */
186 
187     for (i = 0; i < n_counts; i++) {
188 
189       /* Associated subdivision */
190 
191       for (j = 0, k = 1; k < n_steps; j++, k++) {
192         if (r_count[i] < count_min + k*step)
193           break;
194       }
195       h_count[j] += 1;
196 
197     }
198 
199     for (i = 0, j = 1; i < n_steps - 1; i++, j++)
200       cs_log_printf(log_id, "    %3d : [ %10d ; %10d [ = %10d\n",
201                     i+1,
202                     (int)(count_min + i*step),
203                     (int)(count_min + j*step),
204                     h_count[i]);
205 
206     cs_log_printf(log_id, "    %3d : [ %10d ; %10d ] = %10d\n",
207                   n_steps,
208                   (int)(count_min + (n_steps - 1)*step),
209                   count_max,
210                   h_count[n_steps - 1]);
211 
212   }
213 
214   else { /* if (count_max == count_min) */
215     cs_log_printf(log_id, "    %3d : [ %10d ; %10d ] = %10d\n",
216                   1, count_min, count_max, n_counts);
217   }
218 
219   BFT_FREE(r_count);
220 }
221 
222 /*----------------------------------------------------------------------------*/
223 /*!
224  * \brief Sort and compact local matrix elements.
225  *
226  * \param[in, out]  ma   pointer to matrix assembler structure
227  *
228  * This function should be called by a single thread for a given assembler.
229  */
230 /*----------------------------------------------------------------------------*/
231 
232 static void
_sort_and_compact_local(cs_matrix_assembler_t * ma)233 _sort_and_compact_local(cs_matrix_assembler_t  *ma)
234 {
235   /* Check if we are not already sorted */
236 
237   cs_lnum_t n_rows = ma->n_rows;
238   bool ordered = true;
239 
240   for (cs_lnum_t i = 0; i < n_rows && ordered; i++) {
241     cs_lnum_t *col_id = ma->_c_id + ma->_r_idx[i];
242     cs_lnum_t n_cols = ma->_r_idx[i+1] - ma->_r_idx[i];
243     for (cs_lnum_t j = 1; j < n_cols; j++) {
244       if (col_id[j] <= col_id[j-1])
245         ordered = false;
246     }
247   }
248 
249   if (ordered)
250     return;
251 
252   /* Sort by row */
253 
254   bool direct_assembly = cs_sort_indexed(n_rows, ma->_r_idx, ma->_c_id);
255 
256   /* Compact elements if necessary */
257 
258   if (direct_assembly == false) {
259 
260     cs_lnum_t *tmpr_idx = NULL;
261 
262     BFT_MALLOC(tmpr_idx, n_rows+1, cs_lnum_t);
263     memcpy(tmpr_idx, ma->_r_idx, (n_rows+1)*sizeof(cs_lnum_t));
264 
265     cs_lnum_t  k = 0;
266 
267     for (cs_lnum_t i = 0; i < n_rows; i++) {
268       cs_lnum_t *col_id = ma->_c_id + ma->_r_idx[i];
269       cs_lnum_t n_cols = ma->_r_idx[i+1] - ma->_r_idx[i];
270       cs_lnum_t col_id_prev = -1;
271       ma->_r_idx[i] = k;
272       for (cs_lnum_t j = 0; j < n_cols; j++) {
273         if (col_id_prev != col_id[j]) {
274           ma->_c_id[k++] = col_id[j];
275           col_id_prev = col_id[j];
276         }
277       }
278     }
279     ma->_r_idx[n_rows] = k;
280 
281     assert(ma->_r_idx[n_rows] < tmpr_idx[n_rows]);
282 
283     BFT_FREE(tmpr_idx);
284     BFT_REALLOC(ma->_c_id, (ma->_r_idx[n_rows]), cs_lnum_t);
285     ma->c_id = ma->_c_id;
286 
287   }
288 }
289 
290 #if defined(HAVE_MPI)
291 
292 /*----------------------------------------------------------------------------*/
293 /*!
294  * \brief Binary search for a given global id in a given array of
295  *        ordered global ids, when the id might not be present
296  *
297  * We assume the id is present in the array.
298  *
299  * \param[in]  g_id_array size  array_size
300  * \param[in]  g_id             global id to search for
301  * \param[in]  g_id_array       ordered unique global ids array
302  *
303  * \return  index of g_id in g_id_array, or -1 if not found
304  */
305 /*----------------------------------------------------------------------------*/
306 
307 static inline cs_lnum_t
_g_id_binary_search(cs_lnum_t g_id_array_size,cs_gnum_t g_id,const cs_gnum_t g_id_array[])308 _g_id_binary_search(cs_lnum_t        g_id_array_size,
309                     cs_gnum_t        g_id,
310                     const cs_gnum_t  g_id_array[])
311 {
312   cs_lnum_t start_id = 0;
313   cs_lnum_t end_id = g_id_array_size - 1;
314   cs_lnum_t mid_id = (end_id -start_id) / 2;
315   while (start_id < end_id) {
316     if (g_id_array[mid_id] < g_id)
317       start_id = mid_id + 1;
318     else if (g_id_array[mid_id] > g_id)
319       end_id = mid_id - 1;
320     else
321       break;
322     mid_id = start_id + ((end_id -start_id) / 2);
323   }
324   if (g_id_array[mid_id] != g_id)
325     mid_id = -1;
326 
327   return mid_id;
328 }
329 
330 /*----------------------------------------------------------------------------*/
331 /*!
332  * \brief Use binary search to determine to which range a global id belongs
333  *
334  * \param[in]   n_ranges  number of distant ranges
335  * \param[in]   d_ranges  global id ranges [min, max[ for neighboring ranks
336  * \param[in ]  g_id      global id to search for
337  *
338  * \return  id of matching distant range, or -2 if not found
339  */
340 /*----------------------------------------------------------------------------*/
341 
342 static inline int
_g_id_rank_index(int n_ranges,const cs_gnum_t d_ranges[],cs_gnum_t g_id)343 _g_id_rank_index(int              n_ranges,
344                  const cs_gnum_t  d_ranges[],
345                  cs_gnum_t        g_id)
346 {
347   cs_lnum_t start_id = 0;
348   cs_lnum_t end_id = n_ranges - 1;
349   cs_lnum_t mid_id = (end_id -start_id) / 2;
350 
351   while (start_id < end_id) {
352     if (d_ranges[mid_id*2+1] <= g_id)
353       start_id = mid_id + 1;
354     else if (d_ranges[mid_id*2] > g_id)
355       end_id = mid_id - 1;
356     else
357       break;
358     mid_id = start_id + ((end_id -start_id) / 2);
359   }
360 
361   if (g_id < d_ranges[mid_id*2] || g_id >= d_ranges[mid_id*2+1])
362     mid_id = -2;
363 
364   return mid_id;
365 }
366 
367 /*----------------------------------------------------------------------------*/
368 /*!
369  * \brief Sort and compact distant matrix elements.
370  *
371  * \param[in, out]  ma   pointer to matrix assembler structure
372  *
373  * This function should be called by a single thread for a given assembler.
374  */
375 /*----------------------------------------------------------------------------*/
376 
377 static void
_sort_and_compact_distant(cs_matrix_assembler_t * ma)378 _sort_and_compact_distant(cs_matrix_assembler_t  *ma)
379 {
380   /* Check if we are not already sorted */
381 
382   cs_lnum_t n_rows = ma->n_rows;
383   bool ordered = true;
384 
385   for (cs_lnum_t i = 0; i < n_rows && ordered; i++) {
386     cs_gnum_t *col_id = ma->d_g_c_id + ma->d_r_idx[i];
387     cs_lnum_t n_cols = ma->d_r_idx[i+1] - ma->d_r_idx[i];
388     for (cs_lnum_t j = 1; j < n_cols; j++) {
389       if (col_id[j] <= col_id[j-1])
390         ordered = false;
391     }
392   }
393 
394   if (ordered)
395     return;
396 
397   /* Sort by row */
398 
399   bool direct_assembly
400     = cs_sort_indexed_gnum(n_rows, ma->d_r_idx, ma->d_g_c_id);
401 
402   /* Compact elements if necessary */
403 
404   if (direct_assembly == false) {
405 
406     cs_lnum_t *tmpr_idx = NULL;
407 
408     BFT_MALLOC(tmpr_idx, n_rows+1, cs_lnum_t);
409     memcpy(tmpr_idx, ma->d_r_idx, (n_rows+1)*sizeof(cs_lnum_t));
410 
411     cs_lnum_t  k = 0;
412 
413     for (cs_lnum_t i = 0; i < n_rows; i++) {
414       cs_gnum_t *col_g_id = ma->d_g_c_id + ma->d_r_idx[i];
415       cs_lnum_t n_cols = ma->d_r_idx[i+1] - ma->d_r_idx[i];
416       ma->d_r_idx[i] = k;
417       if (n_cols > 0)
418         ma->d_g_c_id[k++] = col_g_id[0];
419       for (cs_lnum_t j = 1; j < n_cols; j++) {
420         if (col_g_id[j] != col_g_id[j-1])
421           ma->d_g_c_id[k++] = col_g_id[j];
422       }
423     }
424     ma->d_r_idx[n_rows] = k;
425 
426     assert(ma->d_r_idx[n_rows] < tmpr_idx[n_rows]);
427 
428     BFT_FREE(tmpr_idx);
429     BFT_REALLOC(ma->d_g_c_id, (ma->d_r_idx[n_rows]), cs_gnum_t);
430 
431   }
432 }
433 
434 /*----------------------------------------------------------------------------*/
435 /*!
436  * \brief Add terms to local matrix elements defined by distant ranks
437  *
438  * \param[in, out]  ma    pointer to matrix assembler structure
439  * \param[in]       n     number of added column and row couples
440  * \param[in]       l_ij  added couples
441  *
442  * This function should be called by a single thread for a given assembler.
443  */
444 /*----------------------------------------------------------------------------*/
445 
446 static void
_complete_local(cs_matrix_assembler_t * ma,cs_lnum_t n,cs_lnum_2_t l_ij[])447 _complete_local(cs_matrix_assembler_t  *ma,
448                 cs_lnum_t               n,
449                 cs_lnum_2_t             l_ij[])
450 {
451   /* Count maximum terms to add for each row */
452 
453   cs_lnum_t n_rows = ma->n_rows;
454 
455   cs_lnum_t *l_c_count, *l_r_idx;
456   BFT_MALLOC(l_c_count, n_rows, cs_lnum_t);
457 
458   for (cs_lnum_t i = 0; i < n_rows; i++)
459     l_c_count[i] = 0;
460 
461   for (cs_lnum_t i = 0; i < n; i++)
462     l_c_count[l_ij[i][0]] += 1;
463 
464   BFT_MALLOC(l_r_idx, n_rows+1, cs_lnum_t);
465 
466   l_r_idx[0] = 0;
467   for (cs_lnum_t i = 0; i < n_rows; i++)
468     l_r_idx[i+1] = l_r_idx[i] + l_c_count[i];
469 
470   /* Expand matrix, starting copies from end to avoid overwrites
471      (first line untouched) */
472 
473   BFT_REALLOC(ma->_c_id, ma->r_idx[n_rows] + l_r_idx[n_rows], cs_lnum_t);
474   ma->c_id = ma->_c_id;
475 
476   for (cs_lnum_t i = n_rows-1; i > 0; i--) {
477     cs_lnum_t n_cols = ma->_r_idx[i+1] - ma->_r_idx[i];
478     cs_lnum_t *col_id = ma->_c_id + ma->_r_idx[i] + l_r_idx[i];
479     const cs_lnum_t *col_id_s = ma->_c_id + ma->_r_idx[i];
480     l_c_count[i] = n_cols;
481     ma->_r_idx[i+1] += l_r_idx[i+1];
482     for (cs_lnum_t j = n_cols-1; j >= 0; j--)
483       col_id[j] = col_id_s[j];
484   }
485   l_c_count[0] = ma->_r_idx[1];
486   ma->_r_idx[1] += l_r_idx[1];
487 
488   BFT_FREE(l_r_idx);
489 
490   /* Now add terms */
491 
492   for (cs_lnum_t i = 0; i < n; i++) {
493     cs_lnum_t r_id = l_ij[i][0];
494     cs_gnum_t c_id = l_ij[i][1];
495     ma->_c_id[ma->_r_idx[r_id] + l_c_count[r_id]] = c_id;
496     l_c_count[r_id] += 1;
497   }
498   BFT_FREE(l_c_count);
499 
500   /* Sort and remove duplicates */
501 
502   _sort_and_compact_local(ma);
503 }
504 
505 /*----------------------------------------------------------------------------*/
506 /*!
507  * \brief Add terms to distant matrix elements.
508  *
509  * \param[in, out]  ma    pointer to matrix assembler structure
510  * \param[in]       n     number of added column and row couples
511  * \param[in]       g_ij  added couples
512  *
513  * This function should be called by a single thread for a given assembler.
514  */
515 /*----------------------------------------------------------------------------*/
516 
517 static void
_complete_distant(cs_matrix_assembler_t * ma,cs_lnum_t n,cs_gnum_t g_ij[])518 _complete_distant(cs_matrix_assembler_t  *ma,
519                   cs_lnum_t               n,
520                   cs_gnum_t               g_ij[])
521 {
522   /* Count maximum terms to add for each row */
523 
524   cs_lnum_t n_rows = ma->n_rows;
525 
526   cs_lnum_t *d_c_count, *d_r_idx;
527   BFT_MALLOC(d_c_count, n_rows, cs_lnum_t);
528   BFT_MALLOC(d_r_idx, n_rows+1, cs_lnum_t);
529 
530   for (cs_lnum_t i = 0; i < n_rows; i++)
531     d_c_count[i] = 0;
532 
533   /* Only local rows should be encountered here; local column elements
534      are ignored here (we will later check they do not require a local
535      insertion, which is not expected, as we assume the local rank
536      should already have provided a contribution to a local entry) */
537 
538   for (cs_lnum_t i = 0; i < n; i++) {
539     cs_gnum_t g_r_id = g_ij[i*2];
540     cs_gnum_t g_c_id = g_ij[i*2 + 1];
541     assert (g_r_id >=  ma->l_range[0] && g_r_id < ma->l_range[1]);
542     if (g_c_id <  ma->l_range[0] || g_c_id >= ma->l_range[1])
543       d_c_count[g_r_id - ma->l_range[0]] += 1;
544   }
545 
546   d_r_idx[0] = 0;
547   for (cs_lnum_t i = 0; i < n_rows; i++)
548     d_r_idx[i+1] = d_r_idx[i] + d_c_count[i];
549 
550   /* Expand matrix, starting copies from end to avoid overwrites
551      (first line untouched) */
552 
553   BFT_REALLOC(ma->d_g_c_id, ma->d_r_idx[n_rows] + d_r_idx[n_rows], cs_gnum_t);
554 
555   for (cs_lnum_t i = n_rows-1; i > 0; i--) {
556     cs_lnum_t n_cols = ma->d_r_idx[i+1] - ma->d_r_idx[i];
557     cs_gnum_t *col_g_id_d = ma->d_g_c_id + ma->d_r_idx[i] + d_r_idx[i];
558     const cs_gnum_t *col_g_id_s = ma->d_g_c_id + ma->d_r_idx[i];
559     d_c_count[i] = n_cols;
560     ma->d_r_idx[i+1] += d_r_idx[i+1];
561     for (cs_lnum_t j = n_cols-1; j >= 0; j--)
562       col_g_id_d[j] = col_g_id_s[j];
563   }
564   if (n_rows > 0) {
565     d_c_count[0] = ma->d_r_idx[1];
566     ma->d_r_idx[1] += d_r_idx[1];
567   }
568 
569   BFT_FREE(d_r_idx);
570 
571   /* Now add terms */
572 
573   for (cs_lnum_t i = 0; i < n; i++) {
574     cs_gnum_t g_r_id = g_ij[i*2];
575     cs_gnum_t g_c_id = g_ij[i*2 + 1];
576     if (g_c_id <  ma->l_range[0] || g_c_id >= ma->l_range[1]) {
577       cs_lnum_t l_r_id = g_r_id - ma->l_range[0];
578       ma->d_g_c_id[ma->d_r_idx[l_r_id] + d_c_count[l_r_id]] = g_c_id;
579       d_c_count[l_r_id] += 1;
580     }
581   }
582   BFT_FREE(d_c_count);
583 
584   /* Sort and remove duplicates */
585 
586   _sort_and_compact_distant(ma);
587 }
588 
589 /*----------------------------------------------------------------------------*/
590 /*!
591  * \brief Append distant row ids to local row ids.
592  *
593  * As column ids remain ordered in each row, local columns appear first,
594  * and ids matching distant columns second.
595  *
596  * The resulting structure is expected to be best suited for matrix-vector
597  * products, or row extractions, as we do not need to handle separate
598  * matrices, and multiple loops.
599  *
600  * For assembly, the extended part is also maintained separately. For a given
601  * column appearing on row i at extended position j (i.e. ma->d_r_idx[i] + j),
602  * its matching position in the extended matrix will be:
603  * ma->r_idx[i+1] - (ma->d_r_idx[i+1] - ma->d_r_idx[i] + j.
604  *
605  * \param[in, out]  ma         pointer to matrix assembler structure
606  * \param[in]       n_e_g_ids  number of unique external global ids
607  * \param[in]       e_g_id     ordered unique external global ids
608  *
609  * This function should be called by a single thread for a given assembler.
610  */
611 /*----------------------------------------------------------------------------*/
612 
613 static void
_append_local_and_distant(cs_matrix_assembler_t * ma,cs_lnum_t n_e_g_ids,const cs_gnum_t e_g_id[])614 _append_local_and_distant(cs_matrix_assembler_t  *ma,
615                           cs_lnum_t               n_e_g_ids,
616                           const cs_gnum_t         e_g_id[])
617 {
618   /* Count terms to add for each row */
619 
620   cs_lnum_t n_rows = ma->n_rows;
621 
622   cs_lnum_t *c_count;
623   BFT_MALLOC(c_count, n_rows, cs_lnum_t);
624 
625   /* Expand matrix, starting copies from end to avoid overwrites
626      (first line untouched) */
627 
628   BFT_REALLOC(ma->_c_id, ma->_r_idx[n_rows] + ma->d_r_idx[n_rows], cs_lnum_t);
629   ma->c_id = ma->_c_id;
630 
631   for (cs_lnum_t i = n_rows-1; i > 0; i--) {
632     cs_lnum_t n_cols = ma->_r_idx[i+1] - ma->_r_idx[i];
633     cs_lnum_t *col_id_d = ma->_c_id + ma->_r_idx[i] + ma->d_r_idx[i];
634     const cs_lnum_t *col_id_s = ma->_c_id + ma->_r_idx[i];
635     c_count[i] = n_cols;
636     ma->_r_idx[i+1] += ma->d_r_idx[i+1];
637     for (cs_lnum_t j = n_cols-1; j >= 0; j--)
638       col_id_d[j] = col_id_s[j];
639   }
640   if (n_rows > 0) {
641     c_count[0] = ma->_r_idx[1];
642     ma->_r_idx[1] += ma->d_r_idx[1];
643   }
644 
645   /* Now insert terms */
646 
647   for (cs_lnum_t i = 0; i < n_rows; i++) {
648     cs_lnum_t n_d_cols = ma->d_r_idx[i+1] - ma->d_r_idx[i];
649     cs_lnum_t *col_id_d = ma->_c_id + ma->_r_idx[i+1] - n_d_cols;
650     const cs_gnum_t *col_g_id_s = ma->d_g_c_id + ma->d_r_idx[i];
651     for (cs_lnum_t j = 0; j < n_d_cols; j++) {
652       cs_gnum_t g_c_id = col_g_id_s[j];
653       cs_lnum_t k = _g_id_binary_find(n_e_g_ids, g_c_id, e_g_id);
654       col_id_d[j] = n_rows + k;
655     }
656   }
657 
658   BFT_FREE(c_count);
659 }
660 
661 /*----------------------------------------------------------------------------
662  * Determine assumed rank associated with a given global id.
663  *
664  * parameters:
665  *   n_ranks <-- number of ranks
666  *   n_g     <-- global number of ids
667  *   g_id    <-- array of global ids for local rank (size: n)
668  *
669  * returns:
670  *   assumed rank id for g_id
671  *----------------------------------------------------------------------------*/
672 
673 static inline int
_assumed_rank(int n_ranks,cs_gnum_t n_g,const cs_gnum_t g_id)674 _assumed_rank(int              n_ranks,
675               cs_gnum_t        n_g,
676               const cs_gnum_t  g_id)
677 {
678   int rank_id;
679 
680   cs_gnum_t n_per_rank = n_g / n_ranks;
681   cs_lnum_t rmdr = n_g - n_per_rank * (cs_gnum_t)n_ranks;
682 
683   assert(g_id < n_g);
684 
685   if (rmdr == 0)
686     rank_id = g_id / n_per_rank;
687   else {
688     cs_gnum_t n_ranks_rmdr = n_ranks - rmdr;
689     cs_gnum_t n_ranks_n_per_rank = n_ranks_rmdr * n_per_rank;
690     if (g_id  <  n_ranks_n_per_rank)
691       rank_id = g_id / n_per_rank;
692     else
693       rank_id = (g_id + n_ranks_rmdr) / (n_per_rank + 1);
694   }
695 
696   return rank_id;
697 }
698 
699 /*----------------------------------------------------------------------------*/
700 /*!
701  * \brief Build Assumed rank neighbors info
702  *
703  * This operation is collective on communicator comm.
704  *
705  * \param[in]  n        number of global ids for local rank
706  * \param[in]  n_g      global number of ids
707  * \param[in]  l_range  global id range [min, max[ for local rank
708  * \param[in]  g_id     array of global ids for local rank (size: n*stride)
709  * \param[in]  comm     associated communicator
710  *
711  * \return  assumed rank neighbors structure
712  */
713 /*----------------------------------------------------------------------------*/
714 
715 static cs_rank_neighbors_t *
_assumed_rank_neighbors(cs_lnum_t n,cs_gnum_t n_g,cs_gnum_t l_range[2],const cs_gnum_t g_id[],MPI_Comm comm)716 _assumed_rank_neighbors(cs_lnum_t        n,
717                         cs_gnum_t        n_g,
718                         cs_gnum_t        l_range[2],
719                         const cs_gnum_t  g_id[],
720                         MPI_Comm         comm)
721 {
722   /* Prepare for determination of assumed rank;
723      we will send both a partial local range description and global ids to the
724      assumed partition, using a common array, so as to minimize
725      all-to-all communication. */
726 
727   int n_ranks, l_rank;
728   int *a_rank;
729 
730   MPI_Comm_size(comm, &n_ranks);
731   MPI_Comm_rank(comm, &l_rank);
732 
733   /* Build real to assumed partition mapping */
734 
735   cs_lnum_t l_range_size = l_range[1] - l_range[0];
736 
737   BFT_MALLOC(a_rank, l_range_size + n, int);
738 
739   cs_lnum_t n_a_neighbors = 0, count = 0;
740 
741   /* Ranks to send real partition info to */
742 
743   int r_prev = -1;
744 
745   for (cs_lnum_t i = 0; i < l_range_size; i++) {
746     cs_gnum_t _g_id = l_range[0] + (cs_gnum_t)i;
747     int r_dest = _assumed_rank(n_ranks, n_g, _g_id);
748     if (r_dest != r_prev && r_dest != l_rank) {
749       a_rank[n_a_neighbors++] = r_dest;
750       r_prev = r_dest;
751     }
752   }
753 
754   count = n_a_neighbors;
755 
756   /* Ranks to send real data to; assuming ids are mostly
757      ordered, we limit the size of sent data. */
758 
759   for (cs_lnum_t i = 0; i < n; i++) {
760     int r_dest = _assumed_rank(n_ranks, n_g, g_id[i]);
761     if (r_dest != r_prev && r_dest != l_rank) {
762       a_rank[count++] = r_dest;
763       r_prev = r_dest;
764     }
765   }
766 
767   /* Build communicating rank neighbors structure;
768      symmetrize the structure to tell ranks from which other
769      ranks they may receive info */
770 
771   cs_rank_neighbors_t *arn = cs_rank_neighbors_create(count, a_rank);
772 
773   BFT_FREE(a_rank);
774 
775   cs_rank_neighbors_symmetrize(arn, comm);
776 
777   return arn;
778 }
779 
780 /*----------------------------------------------------------------------------*/
781 /*!
782  * \brief Exchange range info for building assumed rank neighbors info
783  *
784  * This operation is collective on communicator comm. The caller is
785  * responsible for freeng the returned array.
786  *
787  * \param[in]  arn      info on ranks with which we will communicate
788  * \param[in]  l_range  global id range [min, max[ for local rank
789  * \param[in]  comm     associated communicator
790  *
791  * \return  true rank neighbors ranges array (size: arn->size*2)
792  */
793 /*----------------------------------------------------------------------------*/
794 
795 static cs_gnum_t *
_rank_ranges_exchange(cs_rank_neighbors_t * arn,cs_gnum_t l_range[2],MPI_Comm comm)796 _rank_ranges_exchange(cs_rank_neighbors_t  *arn,
797                       cs_gnum_t             l_range[2],
798                       MPI_Comm              comm)
799 {
800   MPI_Request *request = NULL;
801   MPI_Status *status = NULL;
802   cs_gnum_t *d_ranges = NULL;
803 
804   BFT_MALLOC(d_ranges, arn->size*2, cs_gnum_t);
805   BFT_MALLOC(request, arn->size*2, MPI_Request);
806   BFT_MALLOC(status, arn->size*2, MPI_Status);
807 
808   /* Prepare for determination of assumed rank */
809 
810   /* Exchange local range with neighbor ranks */
811 
812   int request_count = 0;
813   const int local_rank = cs_glob_rank_id;
814 
815   for (int i = 0; i < arn->size; i++) {
816     MPI_Irecv(d_ranges + i*2,
817               2,
818               CS_MPI_GNUM,
819               arn->rank[i],
820               local_rank,
821               comm,
822               &(request[request_count++]));
823   }
824 
825   for (int i = 0; i < arn->size; i++) {
826     MPI_Isend(l_range,
827               2,
828               CS_MPI_GNUM,
829               arn->rank[i],
830               arn->rank[i],
831               comm,
832               &(request[request_count++]));
833   }
834 
835   MPI_Waitall(request_count, request, status);
836 
837   BFT_FREE(request);
838   BFT_FREE(status);
839 
840   return d_ranges;
841 }
842 
843 /*----------------------------------------------------------------------------*/
844 /*!
845  * \brief Build true rank info based on assumed rank info
846  *
847  * This operation is collective on communicator comm.
848  *
849  * The caller is responsible for freeing the returned array
850  *
851  * \param[in]  arn       info on ranks with which we initially communicate
852  * \param[in]  n         number of global ids for local rank
853  * \param[in]  n_g       global number of ids
854  * \param[in]  l_range   global id range [min, max[ for local rank
855  * \param[in]  d_ranges  global id ranges [min, max[ for neighboring ranks
856  * \param[in]  g_id      sorted array of global ids for local rank
857  *                       (size: n)
858  * \param[in]  comm      associated communicator
859  *
860  * \return  effective ranks matching given global ids (size n)
861  */
862 /*----------------------------------------------------------------------------*/
863 
864 static int *
_assumed_to_true_rank(cs_rank_neighbors_t * arn,cs_lnum_t n,cs_gnum_t n_g,const cs_gnum_t l_range[2],const cs_gnum_t d_ranges[],const cs_gnum_t g_id[],MPI_Comm comm)865 _assumed_to_true_rank(cs_rank_neighbors_t  *arn,
866                       cs_lnum_t             n,
867                       cs_gnum_t             n_g,
868                       const cs_gnum_t       l_range[2],
869                       const cs_gnum_t       d_ranges[],
870                       const cs_gnum_t       g_id[],
871                       MPI_Comm              comm)
872 {
873   int *d_rank;
874   BFT_MALLOC(d_rank, n, int);
875 
876   MPI_Request *request = NULL;
877   MPI_Status *status = NULL;
878 
879   const int local_rank = cs_glob_rank_id;
880 
881   /* Prepare for determination of assumed rank;
882      we will send both a partial local range description and global ids to the
883      assumed partition, using a common array, so as to minimize
884      all-to-all communication. */
885 
886   int n_ranks, l_rank;
887   int *a_rank;
888 
889   MPI_Comm_size(comm, &n_ranks);
890   MPI_Comm_rank(comm, &l_rank);
891 
892   BFT_MALLOC(request, arn->size*2, MPI_Request);
893   BFT_MALLOC(status, arn->size*2, MPI_Status);
894 
895   /* Determine ranks with which we will first exchange;
896      we filter and immediately handle elements whose assumed
897      rank matches the current rank */
898 
899   BFT_MALLOC(a_rank, n, int);
900 
901   cs_lnum_t n_range_match = 0;
902   cs_lnum_t n_range_dist = 0;
903 
904   int a_rank_prev = -1;
905   for (cs_lnum_t i = 0; i < n; i++) {
906     int r = _assumed_rank(n_ranks, n_g, g_id[i]);
907     if (r == local_rank) {
908       int r_idx = _g_id_rank_index(arn->size, d_ranges, g_id[i]);
909       assert(r_idx >= 0);
910       d_rank[i] = arn->rank[r_idx];
911       n_range_match += 1;
912     }
913     else {
914       d_rank[i] = -1; /* flag for later */
915       a_rank[n_range_dist] = r;
916       n_range_dist += 1;
917     }
918     assert(r >= a_rank_prev);
919     a_rank_prev = r;
920   }
921 
922   cs_rank_neighbors_to_index(arn, n_range_dist, a_rank, a_rank);
923 
924   /* Count number of exchanges with each assumed rank
925      and build index, first handled as counts;
926      local values have index -1 */
927 
928   cs_lnum_t *send_index;
929   BFT_MALLOC(send_index, arn->size+1, cs_lnum_t);
930 
931   send_index[0] = 0;
932   cs_rank_neighbors_count(arn, n_range_dist, a_rank, send_index + 1);
933 
934   BFT_FREE(a_rank);
935 
936   /* Exchange counts */
937 
938   cs_lnum_t *recv_index;
939   BFT_MALLOC(recv_index, arn->size + 1, cs_lnum_t);
940 
941   recv_index[0] = 0;
942 
943   int request_count = 0;
944 
945   for (int i = 0; i < arn->size; i++) {
946     MPI_Irecv(recv_index + i + 1,
947               1,
948               CS_MPI_LNUM,
949               arn->rank[i],
950               local_rank,
951               comm,
952               &(request[request_count++]));
953   }
954 
955   for (int i = 0; i < arn->size; i++) {
956     MPI_Isend(send_index + i + 1,
957               1,
958               CS_MPI_LNUM,
959               arn->rank[i],
960               arn->rank[i],
961               comm,
962               &(request[request_count++]));
963   }
964 
965   MPI_Waitall(request_count, request, status);
966 
967   /* Now transform counts to indexes */
968 
969   for (int i = 0; i < arn->size; i++)
970     send_index[i+1] += send_index[i];
971 
972   for (int i = 0; i < arn->size; i++)
973     recv_index[i+1] += recv_index[i];
974 
975   /* Now exchange global ids */
976 
977   const cs_lnum_t recv_size = recv_index[arn->size];
978 
979   cs_gnum_t *recv_g_id;
980   BFT_MALLOC(recv_g_id, recv_size, cs_gnum_t);
981 
982   request_count = 0;
983 
984   for (int i = 0; i < arn->size; i++) {
985     if (recv_index[i+1] > recv_index[i])
986       MPI_Irecv(recv_g_id + recv_index[i],
987                 recv_index[i+1] - recv_index[i],
988                 CS_MPI_GNUM,
989                 arn->rank[i],
990                 local_rank,
991                 comm,
992                 &(request[request_count++]));
993   }
994 
995   for (int i = 0; i < arn->size; i++) {
996     if (send_index[i+1] > send_index[i]) {
997       cs_lnum_t shift = (arn->rank[i] > local_rank) ? n_range_match : 0;
998       MPI_Isend(g_id + send_index[i] + shift,
999                 send_index[i+1] - send_index[i],
1000                 CS_MPI_GNUM,
1001                 arn->rank[i],
1002                 arn->rank[i],
1003                 comm,
1004                 &(request[request_count++]));
1005     }
1006   }
1007 
1008   MPI_Waitall(request_count, request, status);
1009 
1010   /* Now for each global id, determine its rank */
1011 
1012   int *d_rank_ar;
1013   BFT_MALLOC(d_rank_ar, recv_size, int);
1014 
1015   for (cs_lnum_t i = 0; i < recv_size; i++) {
1016     cs_gnum_t _g_id = recv_g_id[i];
1017     if (l_range[0] <= _g_id && l_range[1] > _g_id)
1018       d_rank_ar[i] = local_rank;
1019     else {
1020       int r_idx = _g_id_rank_index(arn->size, d_ranges, recv_g_id[i]);
1021       assert(r_idx >= 0);
1022       d_rank_ar[i] = arn->rank[r_idx];
1023       assert(d_rank_ar[i] >= 0);
1024     }
1025   }
1026 
1027   BFT_FREE(recv_g_id);
1028 
1029   /* Send data back to original rank;
1030      remember part of the data is already known */
1031 
1032   request_count = 0;
1033 
1034   for (int i = 0; i < arn->size; i++) {
1035     if (send_index[i+1] > send_index[i]) {
1036       cs_lnum_t shift = (arn->rank[i] > local_rank) ? n_range_match : 0;
1037       MPI_Irecv(d_rank + send_index[i] + shift,
1038                 send_index[i+1] - send_index[i],
1039                 MPI_INT,
1040                 arn->rank[i],
1041                 arn->rank[i],
1042                 comm,
1043                 &(request[request_count++]));
1044     }
1045   }
1046 
1047   for (int i = 0; i < arn->size; i++) {
1048     if (recv_index[i+1] > recv_index[i])
1049       MPI_Isend(d_rank_ar + recv_index[i],
1050                 recv_index[i+1] - recv_index[i],
1051                 MPI_INT,
1052                 arn->rank[i],
1053                 local_rank,
1054                 comm,
1055                 &(request[request_count++]));
1056   }
1057 
1058   MPI_Waitall(request_count, request, status);
1059 
1060   /* Note: we could add an additional exchange here, for each
1061      connected rank, to indicate which ranks may need
1062      to communicate with it; this would avoid the need for a
1063      symmetrization of the rank neighbors structure built from
1064      the d_rank[] array, as ranks would also know who needs to
1065      send some info. An all-to-all communication with just
1066      1 value per rank could thus be replaced by a point-to-point
1067      communication, but with slightly larger messages.
1068      We are not sure the added complexity would be worth it,
1069      so at this stage, we just add this placeholder. */
1070 
1071   BFT_FREE(d_rank_ar);
1072 
1073   BFT_FREE(recv_index);
1074   BFT_FREE(send_index);
1075 
1076   BFT_FREE(request);
1077   BFT_FREE(status);
1078 
1079   return d_rank;
1080 }
1081 
1082 /*----------------------------------------------------------------------------*/
1083 /*!
1084  * \brief Determine rank neighbors based on global id info
1085  *
1086  * This assumes the ordered and compact list of external global ids
1087  * has been built.
1088  *
1089  * \param[in]   n_e_g_ids     number of unique external global ids
1090  * \param[in]   n_g           global number of ids
1091  * \param[in]   l_range       global id range [min, max[ for local rank
1092  * \param[in]   e_g_id        ordered unique external global ids
1093  * \param[out]  n_init_ranks  number of ranks in initial assumed
1094  *                            neighborhood (for logging), or NULL
1095  * \param[in]   comm          associated communicator
1096  *
1097  * This function should be called by a single thread for a given assembler.
1098  */
1099 /*----------------------------------------------------------------------------*/
1100 
1101 static cs_rank_neighbors_t *
_rank_neighbors(cs_lnum_t n_e_g_ids,cs_gnum_t n_g,cs_gnum_t l_range[2],const cs_gnum_t e_g_id[],int * n_init_ranks,MPI_Comm comm)1102 _rank_neighbors(cs_lnum_t          n_e_g_ids,
1103                 cs_gnum_t          n_g,
1104                 cs_gnum_t          l_range[2],
1105                 const cs_gnum_t    e_g_id[],
1106                 int               *n_init_ranks,
1107                 MPI_Comm           comm)
1108 {
1109   /* Determine assumed rank neighbors based on compact info */
1110 
1111   cs_rank_neighbors_t *arn = _assumed_rank_neighbors(n_e_g_ids,
1112                                                      n_g,
1113                                                      l_range,
1114                                                      e_g_id,
1115                                                      comm);
1116 
1117   if (n_init_ranks != NULL)
1118     *n_init_ranks = arn->size;
1119 
1120   cs_gnum_t *d_ranges = _rank_ranges_exchange(arn,
1121                                               l_range,
1122                                               comm);
1123 
1124   int *e_rank_id = _assumed_to_true_rank(arn,
1125                                          n_e_g_ids,
1126                                          n_g,
1127                                          l_range,
1128                                          d_ranges,
1129                                          e_g_id,
1130                                          comm);
1131 
1132   BFT_FREE(d_ranges);
1133 
1134   /* Now rebuild rank neighbors */
1135 
1136   cs_rank_neighbors_destroy(&arn);
1137 
1138   arn = cs_rank_neighbors_create(n_e_g_ids, e_rank_id);
1139 
1140   BFT_FREE(e_rank_id);
1141 
1142   cs_rank_neighbors_symmetrize(arn, comm);
1143 
1144   return arn;
1145 }
1146 
1147 /*----------------------------------------------------------------------------*/
1148 /*!
1149  * \brief Process and exchange info relative to data that will
1150  *        be sent to neighboring ranks.
1151  *
1152  * At this stage, ma->rank will contain the index of the rank in the
1153  * rank neighborhood structure, not the true MPI rank.
1154  *
1155  * \param[in, out]  ma           pointer to matrix assembler structure
1156  * \param[in]       e_g_ij_size  size of coefficient data to send
1157  * \param[in]       e_g_ij       coefficient data (row_g_id, col_g_id couples)
1158  *                               to send
1159  */
1160 /*----------------------------------------------------------------------------*/
1161 
1162 static void
_process_assembly_data(cs_matrix_assembler_t * ma,cs_gnum_t e_g_ij_size,const cs_gnum_t * e_g_ij)1163 _process_assembly_data(cs_matrix_assembler_t  *ma,
1164                        cs_gnum_t               e_g_ij_size,
1165                        const cs_gnum_t        *e_g_ij)
1166 {
1167   ma->coeff_send_size = e_g_ij_size;
1168 
1169   /* Count rows */
1170 
1171   ma->coeff_send_n_rows = 0;
1172 
1173   cs_gnum_t g_r_id_prev = ma->l_range[0]; /* impossible value here */
1174 
1175   cs_lnum_t  n_e_g_ids = 0;
1176   cs_gnum_t *e_g_id = NULL;
1177 
1178   for (cs_lnum_t i = 0; i < ma->coeff_send_size; i++) {
1179     cs_gnum_t g_r_id = e_g_ij[i*2];
1180     if (g_r_id != g_r_id_prev) {
1181       n_e_g_ids += 1;
1182       ma->coeff_send_n_rows += 1;
1183       g_r_id_prev = g_r_id;
1184     }
1185     cs_gnum_t g_c_id = e_g_ij[i*2+1];
1186     if (g_c_id < ma->l_range[0] || g_c_id >= ma->l_range[1])
1187       n_e_g_ids += 1;
1188   }
1189 
1190   if (ma->coeff_send_size > 0) {
1191 
1192     BFT_MALLOC(e_g_id, n_e_g_ids, cs_gnum_t);
1193 
1194     n_e_g_ids = 0;
1195 
1196     cs_lnum_t row_count = 0;
1197 
1198     BFT_MALLOC(ma->coeff_send_index, ma->coeff_send_n_rows+1, cs_lnum_t);
1199     BFT_MALLOC(ma->coeff_send_row_g_id, ma->coeff_send_n_rows, cs_gnum_t);
1200     BFT_MALLOC(ma->coeff_send_col_g_id, ma->coeff_send_size, cs_gnum_t);
1201 
1202     g_r_id_prev = ma->l_range[0]; /* impossible value here */
1203 
1204     for (cs_lnum_t i = 0; i < ma->coeff_send_size; i++) {
1205       cs_gnum_t g_r_id = e_g_ij[i*2];
1206       if (g_r_id != g_r_id_prev) {
1207         ma->coeff_send_index[row_count] = i;
1208         ma->coeff_send_row_g_id[row_count] = g_r_id;
1209         e_g_id[n_e_g_ids++] = g_r_id;
1210         row_count++;
1211         g_r_id_prev = g_r_id;
1212       }
1213       cs_gnum_t g_c_id = e_g_ij[i*2+1];
1214       ma->coeff_send_col_g_id[i] = g_c_id;
1215       if (g_c_id < ma->l_range[0] || g_c_id >= ma->l_range[1]) {
1216         e_g_id[n_e_g_ids++] = g_c_id;
1217       }
1218     }
1219     ma->coeff_send_index[row_count] = ma->coeff_send_size;
1220 
1221     n_e_g_ids = cs_sort_and_compact_gnum(n_e_g_ids, e_g_id);
1222 
1223   }
1224 
1225   /* Determine ranks we may be communicating with for this stage */
1226 
1227   cs_rank_neighbors_t *arn = _rank_neighbors(n_e_g_ids,
1228                                              ma->n_g_rows,
1229                                              ma->l_range,
1230                                              e_g_id,
1231                                              &(ma->n_ranks_init[0]),
1232                                              ma->comm);
1233 
1234   BFT_FREE(e_g_id);
1235 
1236   cs_gnum_t *d_ranges = _rank_ranges_exchange(arn, ma->l_range, ma->comm);
1237 
1238   /* Prepare counts */
1239 
1240   cs_lnum_t *counts;
1241   BFT_MALLOC(counts, arn->size*2, cs_lnum_t);
1242 
1243   for (int i = 0; i < arn->size*2; i++)
1244     counts[i] = 0;
1245 
1246   for (cs_lnum_t i = 0; i < ma->coeff_send_n_rows; i++) {
1247     cs_gnum_t g_r_id = ma->coeff_send_row_g_id[i];
1248     int r_rank_id = _g_id_rank_index(arn->size, d_ranges, g_r_id);
1249     assert(r_rank_id >= 0 && r_rank_id < arn->size);
1250     assert(   g_r_id >= d_ranges[2*r_rank_id]
1251            && g_r_id < d_ranges[2*r_rank_id+1]);
1252     counts[r_rank_id] += ma->coeff_send_index[i+1] - ma->coeff_send_index[i];
1253   }
1254 
1255   BFT_FREE(d_ranges);
1256 
1257   /* Exchange counts */
1258 
1259   MPI_Request *request = NULL;
1260   MPI_Status *status = NULL;
1261 
1262   BFT_MALLOC(request, arn->size*2, MPI_Request);
1263   BFT_MALLOC(status, arn->size*2, MPI_Status);
1264 
1265   const int local_rank = cs_glob_rank_id;
1266 
1267   int request_count = 0;
1268 
1269   for (int i = 0; i < arn->size; i++) {
1270     MPI_Irecv(counts + arn->size + i,
1271               1,
1272               CS_MPI_LNUM,
1273               arn->rank[i],
1274               local_rank,
1275               ma->comm,
1276               &(request[request_count++]));
1277   }
1278 
1279   for (int i = 0; i < arn->size; i++) {
1280     MPI_Isend(counts + i,
1281               1,
1282               CS_MPI_LNUM,
1283               arn->rank[i],
1284               arn->rank[i],
1285               ma->comm,
1286               &(request[request_count++]));
1287   }
1288 
1289   MPI_Waitall(request_count, request, status);
1290 
1291   /* Save exchange rank info */
1292 
1293   ma->n_coeff_ranks = 0;
1294   for (int i = 0; i < arn->size; i++) {
1295     if (counts[i] + counts[arn->size + i] > 0)
1296       ma->n_coeff_ranks += 1;
1297   }
1298 
1299   BFT_MALLOC(ma->coeff_rank, ma->n_coeff_ranks, int);
1300 
1301   if (ma->n_coeff_ranks > 0) {
1302 
1303     BFT_MALLOC(ma->coeff_rank_send_index, ma->n_coeff_ranks+1, cs_lnum_t);
1304     BFT_MALLOC(ma->coeff_rank_recv_index, ma->n_coeff_ranks+1, cs_lnum_t);
1305 
1306     ma->coeff_rank_send_index[0] = 0;
1307     ma->coeff_rank_recv_index[0] = 0;
1308 
1309     ma->n_coeff_ranks = 0;
1310     for (int i = 0; i < arn->size; i++) {
1311       if (counts[i] + counts[arn->size + i] > 0) {
1312         int j = ma->n_coeff_ranks;
1313         ma->coeff_rank[j] = arn->rank[i];
1314         ma->coeff_rank_send_index[j+1]
1315           = ma->coeff_rank_send_index[j] + counts[i];
1316         ma->coeff_rank_recv_index[j+1]
1317           = ma->coeff_rank_recv_index[j] + counts[arn->size + i];
1318         ma->n_coeff_ranks += 1;
1319       }
1320     }
1321 
1322   }
1323 
1324   BFT_FREE(counts);
1325 
1326   /* We don't need the ranks neighbors structure anymore
1327      (we'll use the saved and compacted info from now on) */
1328 
1329   cs_rank_neighbors_destroy(&arn);
1330 
1331   if (ma->n_coeff_ranks > 0) {
1332 
1333     /* Exchange data */
1334 
1335     ma->coeff_recv_size = ma->coeff_rank_recv_index[ma->n_coeff_ranks];
1336 
1337     cs_gnum_t  *recv_data;
1338     BFT_MALLOC(recv_data, ma->coeff_recv_size*2, cs_gnum_t);
1339 
1340     request_count = 0;
1341 
1342     for (int i = 0; i < ma->n_coeff_ranks; i++) {
1343       cs_lnum_t l_size =   ma->coeff_rank_recv_index[i+1]
1344                          - ma->coeff_rank_recv_index[i];
1345       if (l_size > 0) {
1346         cs_lnum_t recv_shift = ma->coeff_rank_recv_index[i]*2;
1347         MPI_Irecv(recv_data + recv_shift,
1348                   l_size*2,
1349                   CS_MPI_GNUM,
1350                   ma->coeff_rank[i],
1351                   local_rank,
1352                   ma->comm,
1353                   &(request[request_count++]));
1354       }
1355     }
1356 
1357     for (int i = 0; i < ma->n_coeff_ranks; i++) {
1358       cs_lnum_t l_size =   ma->coeff_rank_send_index[i+1]
1359                          - ma->coeff_rank_send_index[i];
1360       if (l_size > 0) {
1361         cs_lnum_t send_shift = ma->coeff_rank_send_index[i]*2;
1362         MPI_Isend(e_g_ij + send_shift,
1363                   l_size*2,
1364                   CS_MPI_GNUM,
1365                   ma->coeff_rank[i],
1366                   ma->coeff_rank[i],
1367                   ma->comm,
1368                   &(request[request_count++]));
1369       }
1370     }
1371 
1372     MPI_Waitall(request_count, request, status);
1373 
1374     /* Insert terms at local rows */
1375 
1376     if (ma->coeff_recv_size > 0)
1377       _complete_distant(ma, ma->coeff_recv_size, recv_data);
1378 
1379     /* Now we build the mapping from received terms to their positions
1380        in the array. For indexing, we implicitely consider that each row
1381        is built of 2 sub-rows, with the local columns first, and distant
1382        columns second. */
1383 
1384     BFT_MALLOC(ma->coeff_recv_row_id, ma->coeff_recv_size, cs_lnum_t);
1385 
1386     if (ma->flags & CS_MATRIX_DISTANT_ROW_USE_COL_IDX) {
1387 
1388       cs_lnum_t    n_l_insert = 0, n_l_insert_max = 0;
1389       cs_lnum_2_t *l_insert = NULL;
1390 
1391       BFT_MALLOC(ma->coeff_recv_col_idx, ma->coeff_recv_size, cs_lnum_t);
1392 
1393       /* First pass: determine local row ids, and check if insertion of
1394          previously unknown entries is required;
1395          column ids are set for local entries */
1396 
1397       for (cs_lnum_t i = 0; i < ma->coeff_recv_size; i++) {
1398 
1399         cs_lnum_t l_r_id = recv_data[i*2] - ma->l_range[0];
1400         cs_gnum_t g_c_id = recv_data[i*2+1];
1401 
1402         ma->coeff_recv_row_id[i] = l_r_id;
1403 
1404         /* Local part */
1405 
1406         if (g_c_id >= ma->l_range[0] && g_c_id < ma->l_range[1]) {
1407 
1408           cs_lnum_t n_cols = ma->r_idx[l_r_id+1] - ma->r_idx[l_r_id];
1409           cs_lnum_t l_c_id = g_c_id - ma->l_range[0];
1410           const cs_lnum_t *col_id = ma->c_id + ma->r_idx[l_r_id];
1411 
1412           ma->coeff_recv_col_idx[i] = _l_id_binary_search(n_cols,
1413                                                           l_c_id,
1414                                                           col_id);
1415 
1416           /* special case for separate diagonal, other cases require insertion */
1417 
1418           if (   ma->coeff_recv_col_idx[i] < 0
1419               && (!ma->separate_diag || l_c_id != l_r_id)) {
1420 
1421             if (n_l_insert >= n_l_insert_max) {
1422               n_l_insert_max = CS_MAX(n_l_insert_max*2, 16);
1423               BFT_REALLOC(l_insert, n_l_insert_max, cs_lnum_2_t);
1424             }
1425 
1426             l_insert[n_l_insert][0] = l_r_id;
1427             l_insert[n_l_insert][1] = l_c_id;
1428             n_l_insert++;
1429 
1430           }
1431 
1432         }
1433 
1434       }
1435 
1436       /* Insert additional local terms if required */
1437 
1438       if (n_l_insert > 0) {
1439 
1440         _complete_local(ma, n_l_insert, l_insert);
1441 
1442         BFT_FREE(l_insert);
1443         n_l_insert_max = 0;
1444 
1445       }
1446 
1447       /* Second pass: determine local column ids, and check if insertion of
1448          previously unknown entries is required */
1449 
1450       for (cs_lnum_t i = 0; i < ma->coeff_recv_size; i++) {
1451 
1452         cs_lnum_t l_r_id = recv_data[i*2] - ma->l_range[0];
1453         cs_gnum_t g_c_id = recv_data[i*2+1];
1454 
1455         /* Local part */
1456 
1457         if (g_c_id >= ma->l_range[0] && g_c_id < ma->l_range[1]) {
1458 
1459           if (n_l_insert == 0)  /* Already up-to-date if no insertion */
1460             continue;
1461 
1462           cs_lnum_t n_cols = ma->r_idx[l_r_id+1] - ma->r_idx[l_r_id];
1463           cs_lnum_t l_c_id = g_c_id - ma->l_range[0];
1464           const cs_lnum_t *col_id = ma->c_id + ma->r_idx[l_r_id];
1465 
1466           ma->coeff_recv_col_idx[i] = _l_id_binary_search(n_cols,
1467                                                           l_c_id,
1468                                                           col_id);
1469 
1470           assert(   ma->coeff_recv_col_idx[i] >= 0
1471                  || (ma->separate_diag && l_c_id == l_r_id));
1472 
1473         }
1474 
1475         /* Distant part */
1476 
1477         else {
1478 
1479           cs_lnum_t n_d_cols = ma->d_r_idx[l_r_id+1] - ma->d_r_idx[l_r_id];
1480           cs_lnum_t n_cols = ma->r_idx[l_r_id+1] - ma->r_idx[l_r_id];
1481           const cs_gnum_t *col_g_id = ma->d_g_c_id + ma->d_r_idx[l_r_id];
1482 
1483           cs_lnum_t d_c_idx = _g_id_binary_find(n_d_cols,
1484                                                 g_c_id,
1485                                                 col_g_id);
1486 
1487           /* column ids start and end of local row, so add n_cols
1488              (local part only, matrices are not appened at this stage) */
1489           ma->coeff_recv_col_idx[i] = d_c_idx + n_cols;
1490         }
1491 
1492       }
1493 
1494     }
1495 
1496     if (ma->flags & CS_MATRIX_DISTANT_ROW_USE_COL_G_ID) {
1497 
1498       BFT_MALLOC(ma->coeff_recv_col_g_id, ma->coeff_recv_size, cs_gnum_t);
1499 
1500       for (cs_lnum_t i = 0; i < ma->coeff_recv_size; i++) {
1501         ma->coeff_recv_row_id[i] = recv_data[i*2] - ma->l_range[0];
1502         ma->coeff_recv_col_g_id[i] = recv_data[i*2+1];
1503       }
1504 
1505     }
1506 
1507     BFT_FREE(recv_data);
1508   }
1509 
1510   BFT_FREE(request);
1511   BFT_FREE(status);
1512 }
1513 
1514 /*----------------------------------------------------------------------------*/
1515 /*!
1516  * \brief Build vector halo structure associated with a matrix assembler.
1517  *
1518  * This assumes the ordered and compact list of external global ids
1519  * has been built.
1520 
1521  * \param[in, out]  ma         pointer to matrix assembler structure
1522  * \param[in]       n_e_g_ids  number of unique external global ids
1523  * \param[in]       e_g_id     ordered unique external global ids
1524  *
1525  * This function should be called by a single thread for a given assembler.
1526  */
1527 /*----------------------------------------------------------------------------*/
1528 
1529 static void
_matrix_assembler_compute_halo(cs_matrix_assembler_t * ma,cs_lnum_t n_e_g_ids,const cs_gnum_t e_g_id[])1530 _matrix_assembler_compute_halo(cs_matrix_assembler_t  *ma,
1531                                cs_lnum_t               n_e_g_ids,
1532                                const cs_gnum_t         e_g_id[])
1533 {
1534   /* Determine assumed rank neighbors based on compact info */
1535 
1536   cs_rank_neighbors_t *arn = _rank_neighbors(n_e_g_ids,
1537                                              ma->n_g_rows,
1538                                              ma->l_range,
1539                                              e_g_id,
1540                                              &(ma->n_ranks_init[1]),
1541                                              ma->comm);
1542 
1543   cs_gnum_t *d_ranges = _rank_ranges_exchange(arn, ma->l_range, ma->comm);
1544 
1545   /* Identifiy rank and local id associated with each element */
1546 
1547   int        *rank_id;
1548   cs_lnum_t  *r_loc_id;
1549   BFT_MALLOC(rank_id, n_e_g_ids, int);
1550   BFT_MALLOC(r_loc_id, n_e_g_ids, cs_lnum_t);
1551 
1552 # pragma omp parallel if(n_e_g_ids > CS_THR_MIN)
1553   for (cs_lnum_t i = 0; i < n_e_g_ids; i++) {
1554     cs_gnum_t g_id = e_g_id[i];
1555     int r_id = _g_id_rank_index(arn->size, d_ranges, g_id);
1556     assert(r_id > -1);
1557     assert(g_id >= d_ranges[r_id*2]);
1558     r_loc_id[i] = g_id - d_ranges[r_id*2];
1559     rank_id[i] = r_id;
1560   }
1561 
1562   BFT_FREE(d_ranges);
1563 
1564   ma->_halo = cs_halo_create_from_rank_neighbors(arn,
1565                                                  ma->n_rows,
1566                                                  n_e_g_ids,
1567                                                  rank_id,
1568                                                  r_loc_id);
1569   ma->halo = ma->_halo;
1570 
1571   BFT_FREE(r_loc_id);
1572   BFT_FREE(rank_id);
1573 
1574   cs_rank_neighbors_destroy(&arn);
1575 }
1576 
1577 /*----------------------------------------------------------------------------*/
1578 /*!
1579  * \brief Compute internal structures required by a matrix assembler
1580  *        in parallel mode.
1581  *
1582  * \param[in, out]  ma   pointer to matrix assembler structure
1583  *
1584  * This function should be called by a single thread for a given assembler.
1585  */
1586 /*----------------------------------------------------------------------------*/
1587 
1588 static void
_matrix_assembler_compute_mpi(cs_matrix_assembler_t * ma)1589 _matrix_assembler_compute_mpi(cs_matrix_assembler_t  *ma)
1590 {
1591   /* Compute number of rows */
1592 
1593   cs_lnum_t  n_rows = 0;
1594   if (ma->l_range[1] > ma->l_range[0])
1595     n_rows = ma->l_range[1] - ma->l_range[0];
1596 
1597   MPI_Allreduce(ma->l_range+1, &(ma->n_g_rows), 1, CS_MPI_GNUM,
1598                 MPI_MAX, ma->comm);
1599 
1600   ma->n_rows = n_rows;
1601 
1602   /* Separate local fom distant rows: counting step */
1603 
1604   cs_lnum_t  e_size = 0;
1605 
1606   cs_lnum_t  *l_c_count, *d_c_count;
1607 
1608   BFT_MALLOC(ma->_r_idx, n_rows+1, cs_lnum_t);
1609   BFT_MALLOC(ma->d_r_idx, n_rows+1, cs_lnum_t);
1610   ma->r_idx = ma->_r_idx;
1611 
1612   BFT_MALLOC(l_c_count, n_rows, cs_lnum_t);
1613   BFT_MALLOC(d_c_count, n_rows, cs_lnum_t);
1614 
1615   for (cs_lnum_t i = 0; i < n_rows; i++) {
1616     l_c_count[i] = 0;
1617     d_c_count[i] = 0;
1618   }
1619 
1620   if (ma->separate_diag) {
1621 
1622     for (cs_lnum_t i = 0; i < ma->size; i++) {
1623       cs_gnum_t g_r_id = ma->g_rc_id[i*2];
1624       cs_gnum_t g_c_id = ma->g_rc_id[i*2 + 1];
1625       if (g_r_id >= ma->l_range[0] && g_r_id < ma->l_range[1]) {
1626         cs_lnum_t l_r_id = g_r_id - ma->l_range[0];
1627         if (g_c_id >= ma->l_range[0] && g_c_id < ma->l_range[1]) {
1628           if (g_c_id != g_r_id)
1629             l_c_count[l_r_id] += 1;
1630         }
1631         else
1632           d_c_count[l_r_id] += 1;
1633       }
1634       else
1635         e_size += 1;
1636     }
1637 
1638   }
1639   else {
1640 
1641     for (cs_lnum_t i = 0; i < ma->size; i++) {
1642       cs_gnum_t g_r_id = ma->g_rc_id[i*2];
1643       cs_gnum_t g_c_id = ma->g_rc_id[i*2 + 1];
1644       if (g_r_id >= ma->l_range[0] && g_r_id < ma->l_range[1]) {
1645         cs_lnum_t l_r_id = g_r_id - ma->l_range[0];
1646         if (g_c_id >= ma->l_range[0] && g_c_id < ma->l_range[1]) {
1647           l_c_count[l_r_id] += 1;
1648         }
1649         else
1650           d_c_count[l_r_id] += 1;
1651       }
1652       else
1653         e_size += 1;
1654     }
1655 
1656   }
1657 
1658   /* Build index and reset count */
1659 
1660   ma->_r_idx[0] = 0;
1661   ma->d_r_idx[0] = 0;
1662 
1663   for (cs_lnum_t i = 0; i < n_rows; i++) {
1664     ma->_r_idx[i+1] = ma->_r_idx[i] + l_c_count[i];
1665     ma->d_r_idx[i+1] = ma->d_r_idx[i] + d_c_count[i];
1666     l_c_count[i] = 0;
1667     d_c_count[i] = 0;
1668   }
1669 
1670   /* Allocate structures holding data */
1671 
1672   BFT_MALLOC(ma->_c_id, ma->_r_idx[n_rows], cs_lnum_t);
1673   BFT_MALLOC(ma->d_g_c_id, ma->d_r_idx[n_rows], cs_gnum_t);
1674   ma->c_id = ma->_c_id;
1675 
1676   cs_gnum_t *e_g_ij;
1677   BFT_MALLOC(e_g_ij, e_size*2, cs_gnum_t);
1678 
1679   /* Now fill data: local part is determined (will be cleaned),
1680      and list of distant rows defined; in cases where sizeof(cs_lnum_t),
1681      is smaller than sizeof(cs_gnum_t), this data, though
1682      unsorted, will already be smaller than the initial couples. */
1683 
1684   cs_lnum_t k = 0;
1685 
1686   if (ma->separate_diag) {
1687 
1688     for (cs_lnum_t i = 0; i < ma->size; i++) {
1689       cs_gnum_t g_r_id = ma->g_rc_id[i*2];
1690       cs_gnum_t g_c_id = ma->g_rc_id[i*2 + 1];
1691       if (g_r_id >= ma->l_range[0] && g_r_id < ma->l_range[1]) {
1692         cs_lnum_t l_r_id = g_r_id - ma->l_range[0];
1693         if (   g_c_id >= ma->l_range[0] && g_c_id < ma->l_range[1]
1694             && g_c_id != g_r_id) {
1695           cs_lnum_t l_c_id = g_c_id - ma->l_range[0];
1696           ma->_c_id[ma->_r_idx[l_r_id] + l_c_count[l_r_id]] = l_c_id;
1697           l_c_count[l_r_id] += 1;
1698         }
1699         else {
1700           ma->d_g_c_id[ma->d_r_idx[l_r_id] + d_c_count[l_r_id]] = g_c_id;
1701           d_c_count[l_r_id] += 1;
1702         }
1703       }
1704       else {
1705         e_g_ij[k++] = g_r_id;
1706         e_g_ij[k++] = g_c_id;
1707       }
1708     }
1709 
1710   }
1711   else {
1712 
1713     for (cs_lnum_t i = 0; i < ma->size; i++) {
1714       cs_gnum_t g_r_id = ma->g_rc_id[i*2];
1715       cs_gnum_t g_c_id = ma->g_rc_id[i*2 + 1];
1716       if (g_r_id >= ma->l_range[0] && g_r_id < ma->l_range[1]) {
1717         cs_lnum_t l_r_id = g_r_id - ma->l_range[0];
1718         if (g_c_id >= ma->l_range[0] && g_c_id < ma->l_range[1]) {
1719           cs_lnum_t l_c_id = g_c_id - ma->l_range[0];
1720           ma->_c_id[ma->_r_idx[l_r_id] + l_c_count[l_r_id]] = l_c_id;
1721           l_c_count[l_r_id] += 1;
1722         }
1723         else {
1724           ma->d_g_c_id[ma->d_r_idx[l_r_id] + d_c_count[l_r_id]] = g_c_id;
1725           d_c_count[l_r_id] += 1;
1726         }
1727       }
1728       else {
1729         e_g_ij[k++] = g_r_id;
1730         e_g_ij[k++] = g_c_id;
1731       }
1732     }
1733 
1734   }
1735 
1736   BFT_FREE(d_c_count);
1737   BFT_FREE(l_c_count);
1738 
1739   /* We can free the initial insertion structure */
1740 
1741   BFT_FREE(ma->g_rc_id);
1742 
1743   assert(k == e_size*2);
1744 
1745   /* Rows and column ids in the local range can be finalized now;
1746      for most matrix structures, we will need to expand the
1747      rows with columns referencing distant rows (except when storing
1748      those terms separatly or passing them to an external library),
1749      but updating it now allows optimizing for space, and will make
1750      future updates simpler. */
1751 
1752   _sort_and_compact_local(ma);
1753   _sort_and_compact_distant(ma);
1754 
1755   /* We can also order and compact data that will be sent to external
1756      ranks now, as it will be needed at some point, and is simpler
1757      to do on global id couples than on local ids with higher stride.
1758      This may also reduce the size of the data we work on for the
1759      following steps. */
1760 
1761   e_size = cs_sort_and_compact_gnum_2(e_size, e_g_ij);
1762 
1763   _process_assembly_data(ma, e_size, e_g_ij);
1764 
1765   BFT_FREE(e_g_ij);
1766 
1767   /* Build compact external global id array */
1768 
1769   cs_lnum_t   n_e_g_ids = 0;
1770   cs_gnum_t  *e_g_id = NULL;
1771 
1772   cs_lnum_t d_size = ma->d_r_idx[ma->n_rows];
1773 
1774   BFT_MALLOC(e_g_id, d_size, cs_gnum_t);
1775 
1776   /* Initial pass to reduce data a bit before sorting */
1777 
1778   cs_gnum_t g_id_prev = ma->l_range[0];
1779 
1780   for (cs_lnum_t i = 0; i < d_size; i++) {
1781     cs_gnum_t g_id = ma->d_g_c_id[i];
1782     if (   (g_id < ma->l_range[0] || g_id >= ma->l_range[1])
1783         && g_id != g_id_prev) {
1784       e_g_id[n_e_g_ids++] = g_id;
1785       g_id_prev = g_id;
1786     }
1787   }
1788 
1789   n_e_g_ids = cs_sort_and_compact_gnum(n_e_g_ids, e_g_id);
1790   BFT_REALLOC(e_g_id, n_e_g_ids, cs_gnum_t);
1791 
1792   /* Finally, append distant row to local rows, and build matching halo */
1793 
1794   _append_local_and_distant(ma, n_e_g_ids, e_g_id);
1795 
1796   if (! (ma->flags & CS_MATRIX_EXTERNAL_HALO)) {
1797     _matrix_assembler_compute_halo(ma, n_e_g_ids, e_g_id);
1798     assert(ma->halo->n_elts[0] == n_e_g_ids);
1799   }
1800 
1801   /* Assign array to structure */
1802 
1803   ma->n_e_g_ids = n_e_g_ids;
1804   ma->e_g_id = e_g_id; e_g_id = NULL;
1805 }
1806 
1807 #endif /* HAVE_MPI */
1808 
1809 /*----------------------------------------------------------------------------*/
1810 /*!
1811  * \brief Compute internal structures required by a matrix assembler
1812  *        in serial mode.
1813  *
1814  * \param[in, out]  ma   pointer to matrix assembler structure
1815  *
1816  * This function should be called by a single thread for a given assembler.
1817  */
1818 /*----------------------------------------------------------------------------*/
1819 
1820 static void
_matrix_assembler_compute_local(cs_matrix_assembler_t * ma)1821 _matrix_assembler_compute_local(cs_matrix_assembler_t  *ma)
1822 {
1823   /* Separate local fom distant rows: counting step */
1824 
1825   cs_lnum_t n_rows = 0;
1826   if (ma->l_range[1] > ma->l_range[0])
1827     n_rows = ma->l_range[1] - ma->l_range[0];
1828 
1829   cs_lnum_t  *c_count;
1830   BFT_MALLOC(ma->_r_idx, n_rows+1, cs_lnum_t);
1831   ma->r_idx = ma->_r_idx;
1832 
1833   BFT_MALLOC(c_count, n_rows, cs_lnum_t);
1834 
1835   for (cs_lnum_t i = 0; i < n_rows; i++)
1836     c_count[i] = 0;
1837 
1838   if (ma->separate_diag) {
1839 
1840     for (cs_lnum_t i = 0; i < ma->size; i++) {
1841       cs_gnum_t g_r_id = ma->g_rc_id[i*2];
1842       cs_gnum_t g_c_id = ma->g_rc_id[i*2 + 1];
1843       assert(g_r_id >= ma->l_range[0] && g_r_id < ma->l_range[1]);
1844       cs_lnum_t l_r_id = g_r_id - ma->l_range[0];
1845       assert(g_c_id >= ma->l_range[0] && g_c_id < ma->l_range[1]);
1846       if (g_c_id != g_r_id)
1847         c_count[l_r_id] += 1;
1848     }
1849 
1850   }
1851   else {
1852 
1853     for (cs_lnum_t i = 0; i < ma->size; i++) {
1854       cs_gnum_t g_r_id = ma->g_rc_id[i*2];
1855       cs_gnum_t g_c_id = ma->g_rc_id[i*2 + 1];
1856       assert(g_r_id >= ma->l_range[0] && g_r_id < ma->l_range[1]);
1857       cs_lnum_t l_r_id = g_r_id - ma->l_range[0];
1858       assert(g_c_id >= ma->l_range[0] && g_c_id < ma->l_range[1]);
1859       c_count[l_r_id] += 1;
1860     }
1861 
1862   }
1863 
1864   /* Build index and reset count */
1865 
1866   ma->_r_idx[0] = 0;
1867 
1868   for (cs_lnum_t i = 0; i < n_rows; i++) {
1869     ma->_r_idx[i+1] = ma->_r_idx[i] + c_count[i];
1870     c_count[i] = 0;
1871   }
1872 
1873   /* Allocate structures holding data */
1874 
1875   BFT_MALLOC(ma->_c_id, ma->_r_idx[n_rows], cs_lnum_t);
1876   ma->c_id = ma->_c_id;
1877 
1878   /* Now fill data: local part is determined (will be cleaned),
1879      and list of distant rows defined */
1880 
1881   if (ma->separate_diag) {
1882 
1883     for (cs_lnum_t i = 0; i < ma->size; i++) {
1884       cs_lnum_t l_r_id = ma->g_rc_id[i*2] - ma->l_range[0];
1885       if (ma->g_rc_id[i*2+1] != ma->g_rc_id[i*2]) {
1886         ma->_c_id[ma->_r_idx[l_r_id] + c_count[l_r_id]]
1887           = ma->g_rc_id[i*2+1] - ma->l_range[0];
1888         c_count[l_r_id] += 1;
1889       }
1890     }
1891 
1892   }
1893   else {
1894 
1895     for (cs_lnum_t i = 0; i < ma->size; i++) {
1896       cs_lnum_t l_r_id = ma->g_rc_id[i*2] - ma->l_range[0];
1897       ma->_c_id[ma->_r_idx[l_r_id] + c_count[l_r_id]]
1898         = ma->g_rc_id[i*2+1] - ma->l_range[0];
1899       c_count[l_r_id] += 1;
1900     }
1901 
1902   }
1903 
1904   BFT_FREE(c_count);
1905 
1906   /* Set global number of rows (will be updated in parallel) */
1907 
1908   ma->n_g_rows = n_rows;
1909   ma->n_rows = n_rows;
1910 
1911   /* Sort and compact elements for final structure */
1912 
1913   _sort_and_compact_local(ma);
1914 }
1915 
1916 /*----------------------------------------------------------------------------*/
1917 /*!
1918  * \brief Build diagonal index array when conversion between included
1919  *        and separate diagonal is required between a values assembler
1920  *        and its matching matrix assembler.
1921  *
1922  * \param[in, out]  pointer to matrix values assembler structure
1923  */
1924 /*----------------------------------------------------------------------------*/
1925 
1926 static void
_matrix_assembler_values_diag_idx(cs_matrix_assembler_values_t * mav)1927 _matrix_assembler_values_diag_idx(cs_matrix_assembler_values_t  *mav)
1928 {
1929   if (mav->diag_idx != NULL)
1930     return;
1931 
1932   const cs_matrix_assembler_t *ma = mav->ma;
1933 
1934   if (ma->separate_diag == mav->separate_diag)
1935     return;
1936 
1937   BFT_MALLOC(mav->diag_idx, ma->n_rows, cs_lnum_t);
1938 
1939   /*
1940     Two cases:
1941     - assembler has separate diagonal but values assembler does not:
1942       determine after which index rows should be shifted
1943     - assembler has included diagonal but values assembler does not:
1944       determine after which index rows should be shifted
1945   */
1946 
1947   if (ma->separate_diag) {
1948     for (cs_lnum_t i = 0; i < ma->n_rows; i++) {
1949       cs_lnum_t j = ma->r_idx[i], k = ma->r_idx[i+1];
1950       while (j < k) {
1951         if (ma->c_id[j] > i)
1952           k = j;
1953         j++;
1954       }
1955       mav->diag_idx[i] = k - ma->r_idx[i];
1956     }
1957   }
1958   else if (mav->separate_diag) {
1959     for (cs_lnum_t i = 0; i < ma->n_rows; i++) {
1960       cs_lnum_t j = ma->r_idx[i], k = ma->r_idx[i+1];
1961       while (j < k) {
1962         if (ma->c_id[j] == i)
1963           k = j;
1964         j++;
1965       }
1966       mav->diag_idx[i] = k - ma->r_idx[i];
1967     }
1968   }
1969 }
1970 
1971 /*----------------------------------------------------------------------------*/
1972 /*!
1973  * \brief Add coefficient values based on row indexes when the diagonal
1974  *        must be shifted between the assembler and the values assembler.
1975  *
1976  * This function may be called by different threads if the caller
1977  * ensures different threads will not write to the same final parts of
1978  * the matrix coefficients.
1979  *
1980  * \param[in, out]  mav      pointer to matrix assembler values structure
1981  * \param[in]       n        number of values to add
1982  * \param[in]       stride   matrix block stride
1983  * \param[in]       row_id   matrix row ids
1984  * \param[in]       col_idx  matrix assembler column indexes
1985  * \param[in]       val      values to add
1986  */
1987 /*----------------------------------------------------------------------------*/
1988 
1989 static void
_matrix_assembler_values_add_cnv_idx(cs_matrix_assembler_values_t * mav,cs_lnum_t n,cs_lnum_t stride,const cs_lnum_t row_id[],const cs_lnum_t col_idx[],const cs_real_t val[])1990 _matrix_assembler_values_add_cnv_idx(cs_matrix_assembler_values_t  *mav,
1991                                      cs_lnum_t                      n,
1992                                      cs_lnum_t                      stride,
1993                                      const cs_lnum_t                row_id[],
1994                                      const cs_lnum_t                col_idx[],
1995                                      const cs_real_t                val[])
1996 {
1997   const cs_matrix_assembler_t  *ma = mav->ma;
1998 
1999   /* Case where the assembler has a separate diagonal, and values have
2000      an included diagonal */
2001 
2002   if (ma->separate_diag) {
2003 
2004     assert(mav->separate_diag == false);
2005 
2006     cs_lnum_t s_col_idx[COEFF_GROUP_SIZE];
2007 
2008     for (cs_lnum_t i = 0; i < n; i+= COEFF_GROUP_SIZE) {
2009 
2010       cs_lnum_t b_size = COEFF_GROUP_SIZE;
2011       if (i + COEFF_GROUP_SIZE > n)
2012         b_size = n - i;
2013 
2014       for (cs_lnum_t j = 0; j < b_size; j++) {
2015         cs_lnum_t r_id = row_id[i+j];
2016         cs_lnum_t c_idx = col_idx[i+j];
2017         if (r_id < 0)
2018           continue;
2019         if (c_idx == -1)
2020           s_col_idx[j] = mav->diag_idx[r_id];
2021         else if (c_idx < mav->diag_idx[r_id])
2022           s_col_idx[j] = c_idx;
2023         else
2024           s_col_idx[j] = c_idx + 1;
2025       }
2026 
2027       mav->add_values(mav->matrix,
2028                       b_size,
2029                       stride,
2030                       row_id + i,
2031                       s_col_idx,
2032                       val + (i*stride));
2033 
2034     }
2035 
2036   }
2037 
2038   /* Case where the assembler has an included diagonal, and values have
2039      a separate diagonal */
2040 
2041   else {
2042 
2043     assert(ma->separate_diag == false);
2044     assert(mav->separate_diag == true);
2045 
2046     cs_lnum_t s_col_idx[COEFF_GROUP_SIZE];
2047 
2048     for (cs_lnum_t i = 0; i < n; i+= COEFF_GROUP_SIZE) {
2049 
2050       cs_lnum_t b_size = COEFF_GROUP_SIZE;
2051       if (i + COEFF_GROUP_SIZE > n)
2052         b_size = n - i;
2053 
2054       for (cs_lnum_t j = 0; j < b_size; j++) {
2055         cs_lnum_t r_id = row_id[i+j];
2056         cs_lnum_t c_idx = col_idx[i+j];
2057         if (r_id < 0)
2058           continue;
2059         if (c_idx < mav->diag_idx[r_id])
2060           s_col_idx[j] = c_idx;
2061         else if (c_idx == mav->diag_idx[r_id])
2062           s_col_idx[j] = -1;
2063         else
2064           s_col_idx[j] = c_idx - 1;
2065       }
2066 
2067       mav->add_values(mav->matrix,
2068                       b_size,
2069                       stride,
2070                       row_id + i,
2071                       s_col_idx,
2072                       val + (i*stride));
2073 
2074     }
2075 
2076   }
2077 }
2078 
2079 #if defined(HAVE_MPI)
2080 
2081 /*----------------------------------------------------------------------------*/
2082 /*!
2083  * \brief Add coefficient values based on local row ids and global column ids,
2084  *        using a local addition function.
2085  *
2086  * This function may be called by different threads if the caller
2087  * ensures different threads will not write to the same final parts of
2088  * the matrix coefficients.
2089  *
2090  * \param[in, out]  mav       pointer to matrix assembler values structure
2091  * \param[in]       n         number of values to add
2092  * \param[in]       stride    matrix block stride
2093  * \param[in]       row_id    matrix row ids
2094  * \param[in]       col_g_id  global column ids
2095  * \param[in]       val       values to add
2096  */
2097 /*----------------------------------------------------------------------------*/
2098 
2099 static void
_matrix_assembler_values_add_lg(cs_matrix_assembler_values_t * mav,cs_lnum_t n,cs_lnum_t stride,const cs_lnum_t row_id[],const cs_gnum_t col_g_id[],const cs_real_t val[])2100 _matrix_assembler_values_add_lg(cs_matrix_assembler_values_t  *mav,
2101                                 cs_lnum_t                      n,
2102                                 cs_lnum_t                      stride,
2103                                 const cs_lnum_t                row_id[],
2104                                 const cs_gnum_t                col_g_id[],
2105                                 const cs_real_t                val[])
2106 {
2107   const cs_matrix_assembler_t  *ma = mav->ma;
2108 
2109   assert(mav->add_values != NULL);
2110 
2111   cs_lnum_t s_col_idx[COEFF_GROUP_SIZE];
2112 
2113   for (cs_lnum_t i = 0; i < n; i+= COEFF_GROUP_SIZE) {
2114 
2115     cs_lnum_t b_size = COEFF_GROUP_SIZE;
2116     if (i + COEFF_GROUP_SIZE > n)
2117       b_size = n - i;
2118 
2119     for (cs_lnum_t j = 0; j < b_size; j++) {
2120 
2121       cs_lnum_t k = i+j;
2122 
2123       cs_lnum_t l_r_id = row_id[k];
2124       cs_gnum_t g_c_id = col_g_id[k];
2125 
2126       /* Local part */
2127 
2128       if (g_c_id >= ma->l_range[0] && g_c_id < ma->l_range[1]) {
2129 
2130         cs_lnum_t l_c_id = g_c_id - ma->l_range[0];
2131         cs_lnum_t n_cols =  ma->r_idx[l_r_id+1] - ma->r_idx[l_r_id];
2132 
2133         /* remark: we could slightly reduce the binary search by subtracting
2134            the last (ma->d_r_idx[l_r_id+1] - ma->d_r_idx[l_r_id]) from
2135            the column count, but this would require a separate function
2136            for the serial case where that array may not exist */
2137 
2138         s_col_idx[j] = _l_id_binary_search(n_cols,
2139                                            l_c_id,
2140                                            ma->c_id + ma->r_idx[l_r_id]);
2141 
2142         assert(s_col_idx[j] > -1 || (ma->separate_diag && l_c_id == l_r_id));
2143 
2144       }
2145 
2146       /* Distant part */
2147 
2148 #if defined(HAVE_MPI)
2149 
2150       else {
2151 
2152         cs_lnum_t n_cols = ma->d_r_idx[l_r_id+1] - ma->d_r_idx[l_r_id];
2153 
2154         cs_lnum_t d_c_idx = _g_id_binary_find(n_cols,
2155                                               g_c_id,
2156                                               ma->d_g_c_id + ma->d_r_idx[l_r_id]);
2157 
2158         /* column ids start and end of local row, so add r_idx[row_id+1] */
2159         s_col_idx[j] = d_c_idx + ma->r_idx[l_r_id] + 1;
2160 
2161       }
2162 
2163 #endif /* HAVE_MPI */
2164 
2165     }
2166 
2167     /* Case where assembler and values assembler have the same
2168        "separate diagonal" behavior */
2169 
2170     if (ma->separate_diag == mav->separate_diag)
2171       mav->add_values(mav->matrix,
2172                       b_size,
2173                       stride,
2174                       row_id + i,
2175                       s_col_idx,
2176                       val + (i*stride));
2177 
2178     else
2179       _matrix_assembler_values_add_cnv_idx(mav,
2180                                            b_size,
2181                                            stride,
2182                                            row_id + i,
2183                                            s_col_idx,
2184                                            val + (i*stride));
2185 
2186   }
2187 }
2188 
2189 #endif /* HAVE_MPI */
2190 
2191 /*----------------------------------------------------------------------------*/
2192 /*!
2193  * \brief Add coefficient values based on local row ids and column ids,
2194  *        using a global addition function.
2195  *
2196  * This function may be called by different threads if the caller
2197  * ensures different threads will not write to the same final parts of
2198  * the matrix coefficients.
2199  *
2200  * \param[in, out]  mav     pointer to matrix assembler values structure
2201  * \param[in]       n       number of values to add
2202  * \param[in]       stride  matrix block stride
2203  * \param[in]       row_id  matrix row ids
2204  * \param[in]       col_id  matrix assembler column ids
2205  * \param[in]       val     values to add
2206  */
2207 /*----------------------------------------------------------------------------*/
2208 
2209 static void
_matrix_assembler_values_add_ll_g(cs_matrix_assembler_values_t * mav,cs_lnum_t n,cs_lnum_t stride,const cs_lnum_t row_id[],const cs_lnum_t col_id[],const cs_real_t val[])2210 _matrix_assembler_values_add_ll_g(cs_matrix_assembler_values_t  *mav,
2211                                   cs_lnum_t                      n,
2212                                   cs_lnum_t                      stride,
2213                                   const cs_lnum_t                row_id[],
2214                                   const cs_lnum_t                col_id[],
2215                                   const cs_real_t                val[])
2216 
2217 {
2218   const cs_matrix_assembler_t  *ma = mav->ma;
2219 
2220   cs_gnum_t row_g_id[COEFF_GROUP_SIZE];
2221   cs_gnum_t col_g_id[COEFF_GROUP_SIZE];
2222 
2223   for (cs_lnum_t i = 0; i < n; i+= COEFF_GROUP_SIZE) {
2224 
2225     cs_lnum_t b_size = COEFF_GROUP_SIZE;
2226     if (i + COEFF_GROUP_SIZE > n)
2227       b_size = n - i;
2228 
2229     for (cs_lnum_t j = 0; j < b_size; j++) {
2230 
2231       cs_lnum_t k = i+j;
2232 
2233       row_g_id[j] = row_id[k] + ma->l_range[0];
2234 
2235       cs_lnum_t l_c_id = col_id[k];
2236       if (l_c_id < ma->n_rows)
2237         col_g_id[j] = row_id[i+j] + ma->l_range[0];
2238       else
2239         col_g_id[j] = ma->e_g_id[l_c_id - ma->n_rows];
2240 
2241     }
2242 
2243     mav->add_values_g(mav->matrix,
2244                       b_size,
2245                       stride,
2246                       row_g_id,
2247                       col_g_id,
2248                       val + (i*stride));
2249 
2250   }
2251 
2252 }
2253 
2254 #if defined(HAVE_MPI)
2255 
2256 /*----------------------------------------------------------------------------*/
2257 /*!
2258  * \brief Add coefficient values based on local row ids and column indexes,
2259  *        using a global addition function.
2260  *
2261  * This function may be called by different threads if the caller
2262  * ensures different threads will not write to the same final parts of
2263  * the matrix coefficients.
2264  *
2265  * \param[in, out]  mav      pointer to matrix assembler values structure
2266  * \param[in]       n        number of values to add
2267  * \param[in]       stride   matrix block stride
2268  * \param[in]       row_id   matrix row ids
2269  * \param[in]       col_idx  matrix assembler column indexes
2270  * \param[in]       val      values to add
2271  */
2272 /*----------------------------------------------------------------------------*/
2273 
2274 static void
_matrix_assembler_values_add_llx_g(cs_matrix_assembler_values_t * mav,cs_lnum_t n,cs_lnum_t stride,const cs_lnum_t row_id[],const cs_lnum_t col_idx[],const cs_real_t val[])2275 _matrix_assembler_values_add_llx_g(cs_matrix_assembler_values_t  *mav,
2276                                    cs_lnum_t                      n,
2277                                    cs_lnum_t                      stride,
2278                                    const cs_lnum_t                row_id[],
2279                                    const cs_lnum_t                col_idx[],
2280                                    const cs_real_t                val[])
2281 
2282 {
2283   const cs_matrix_assembler_t  *ma = mav->ma;
2284 
2285   cs_gnum_t row_g_id[COEFF_GROUP_SIZE];
2286   cs_gnum_t col_g_id[COEFF_GROUP_SIZE];
2287 
2288   for (cs_lnum_t i = 0; i < n; i+= COEFF_GROUP_SIZE) {
2289 
2290     cs_lnum_t b_size = COEFF_GROUP_SIZE;
2291     if (i + COEFF_GROUP_SIZE > n)
2292       b_size = n - i;
2293 
2294     for (cs_lnum_t j = 0; j < b_size; j++) {
2295 
2296       cs_lnum_t k = i+j;
2297 
2298       cs_lnum_t l_r_id = row_id[k];
2299       cs_lnum_t l_c_id;
2300       if (ma->c_id[k] != -1)
2301         l_c_id = ma->c_id[ma->r_idx[l_r_id] + col_idx[k]];
2302       else
2303         l_c_id = l_r_id;
2304 
2305       row_g_id[j] = row_id[k] + ma->l_range[0];
2306       if (l_c_id < ma->n_rows)
2307         col_g_id[j] = row_id[k] + ma->l_range[0];
2308       else
2309         col_g_id[j] = ma->e_g_id[l_c_id - ma->n_rows];
2310 
2311     }
2312 
2313     mav->add_values_g(mav->matrix,
2314                       b_size,
2315                       stride,
2316                       row_g_id,
2317                       col_g_id,
2318                       val + (i*stride));
2319 
2320   }
2321 
2322 }
2323 
2324 /*----------------------------------------------------------------------------*/
2325 /*!
2326  * \brief Add coefficient values based on local row ids and global column ids,
2327  *        using a global addition function.
2328  *
2329  * This function may be called by different threads if the caller
2330  * ensures different threads will not write to the same final parts of
2331  * the matrix coefficients.
2332  *
2333  * \param[in, out]  mav       pointer to matrix assembler values structure
2334  * \param[in]       n         number of values to add
2335  * \param[in]       stride    matrix block stride
2336  * \param[in]       row_id    matrix row ids
2337  * \param[in]       col_g_id  matrix global column ids
2338  * \param[in]       val       values to add
2339  */
2340 /*----------------------------------------------------------------------------*/
2341 
2342 static void
_matrix_assembler_values_add_lg_g(cs_matrix_assembler_values_t * mav,cs_lnum_t n,cs_lnum_t stride,const cs_lnum_t row_id[],const cs_gnum_t col_g_id[],const cs_real_t val[])2343 _matrix_assembler_values_add_lg_g(cs_matrix_assembler_values_t  *mav,
2344                                   cs_lnum_t                      n,
2345                                   cs_lnum_t                      stride,
2346                                   const cs_lnum_t                row_id[],
2347                                   const cs_gnum_t                col_g_id[],
2348                                   const cs_real_t                val[])
2349 {
2350   const cs_matrix_assembler_t  *ma = mav->ma;
2351 
2352   cs_gnum_t row_g_id[COEFF_GROUP_SIZE];
2353 
2354   for (cs_lnum_t i = 0; i < n; i+= COEFF_GROUP_SIZE) {
2355 
2356     cs_lnum_t b_size = COEFF_GROUP_SIZE;
2357     if (i + COEFF_GROUP_SIZE > n)
2358       b_size = n - i;
2359 
2360     for (cs_lnum_t j = 0; j < b_size; j++)
2361       row_g_id[j] = row_id[i+j] + ma->l_range[0];
2362 
2363     mav->add_values_g(mav->matrix,
2364                       b_size,
2365                       stride,
2366                       row_g_id,
2367                       col_g_id + i,
2368                       val + (i*stride));
2369 
2370   }
2371 }
2372 
2373 #endif /* HAVE_MPI */
2374 
2375 /*! (DOXYGEN_SHOULD_SKIP_THIS) \endcond */
2376 
2377 /*============================================================================
2378  * Public function definitions
2379  *============================================================================*/
2380 
2381 /*----------------------------------------------------------------------------*/
2382 /*!
2383  * \brief Create a matrix assembler structure.
2384  *
2385  * The associated matrix structure data will initially be empty, though
2386  * the range of rows associated with the current rank must be defined
2387  * immediately.
2388  *
2389  * \param[in]  l_range        global id range [min, max[ for local rank
2390  * \param[in]  separate_diag  if true, diagonal terms are handled separately
2391  *
2392  * \return  pointer to created matrix assembler structure
2393  */
2394 /*----------------------------------------------------------------------------*/
2395 
2396 cs_matrix_assembler_t *
cs_matrix_assembler_create(const cs_gnum_t l_range[2],bool separate_diag)2397 cs_matrix_assembler_create(const cs_gnum_t  l_range[2],
2398                            bool             separate_diag)
2399 {
2400   cs_matrix_assembler_t *ma = NULL;
2401 
2402   BFT_MALLOC(ma, 1, cs_matrix_assembler_t);
2403 
2404   ma->separate_diag = separate_diag;
2405 
2406   ma->flags = CS_MATRIX_DISTANT_ROW_USE_COL_IDX;
2407 
2408   ma->l_range[0] = l_range[0];
2409   ma->l_range[1] = l_range[1];
2410 
2411   ma->n_g_rows = 0;
2412   ma->n_rows = 0;
2413 
2414   ma->size = 0;
2415   ma->max_size = 0;
2416 
2417   ma->r_idx = NULL;
2418   ma->c_id = NULL;
2419   ma->_r_idx = NULL;
2420   ma->_c_id = NULL;
2421 
2422   ma->d_r_idx = NULL;
2423   ma->d_g_c_id = NULL;
2424 
2425   ma->g_rc_id = NULL;
2426 
2427 #if defined(HAVE_MPI)
2428 
2429   ma->n_coeff_ranks = 0;
2430   ma->coeff_rank = NULL;
2431 
2432   ma->coeff_send_size = 0;
2433   ma->coeff_recv_size = 0;
2434 
2435   ma->coeff_send_n_rows = 0;
2436   ma->coeff_send_index = NULL;
2437   ma->coeff_send_row_g_id = NULL;
2438   ma->coeff_send_col_g_id = NULL;
2439 
2440   ma->coeff_rank_send_index = NULL;
2441   ma->coeff_rank_recv_index = NULL;
2442 
2443   ma->coeff_recv_row_id = NULL;
2444   ma->coeff_recv_col_idx = NULL;
2445   ma->coeff_recv_col_g_id = NULL;
2446 
2447   ma->comm = cs_glob_mpi_comm;
2448 
2449   ma->n_ranks_init[0] = 0;
2450   ma->n_ranks_init[1] = 0;
2451 
2452 #endif /* HAVE_MPI */
2453 
2454   ma->halo = NULL;
2455   ma->_halo = NULL;
2456 
2457   ma->n_e_g_ids = 0;
2458   ma->e_g_id = NULL;
2459 
2460   return ma;
2461 }
2462 
2463 /*----------------------------------------------------------------------------*/
2464 /*!
2465  * \brief Create a matrix assembler structure based on a given connectivity
2466  *        and associated halo structure.
2467  *
2468  * The assembler may not be later modified when built this way.
2469  *
2470  * The connectivity arrays and halo will be shared by the caller, so their
2471  * life cycle must be at least as long as the matrix assembler structure.
2472  *
2473  * \param[in]  n_rows         number fo local rows
2474  * \param[in]  separate_diag  if true, diagonal terms are handled separately
2475  * \param[in]  row_idx        matrix row index
2476  * \param[in]  col_id         matrix column indexes
2477  * \param[in]  halo           associated halo structure
2478  *
2479  * \return  pointer to created matrix assembler structure
2480  */
2481 /*----------------------------------------------------------------------------*/
2482 
2483 cs_matrix_assembler_t *
cs_matrix_assembler_create_from_shared(cs_lnum_t n_rows,bool separate_diag,const cs_lnum_t row_idx[],const cs_lnum_t col_id[],const cs_halo_t * halo)2484 cs_matrix_assembler_create_from_shared(cs_lnum_t         n_rows,
2485                                        bool              separate_diag,
2486                                        const cs_lnum_t   row_idx[],
2487                                        const cs_lnum_t   col_id[],
2488                                        const cs_halo_t  *halo)
2489 {
2490   cs_gnum_t l_range[2] = {0, n_rows};
2491   cs_gnum_t n_g_rows = n_rows;
2492 
2493 #if defined(HAVE_MPI)
2494   if (cs_glob_n_ranks > 1) {
2495     cs_gnum_t g_shift;
2496     cs_gnum_t l_shift = n_rows;
2497     MPI_Scan(&l_shift, &g_shift, 1, CS_MPI_GNUM, MPI_SUM, cs_glob_mpi_comm);
2498     MPI_Allreduce(&l_shift, &n_g_rows, 1, CS_MPI_GNUM, MPI_SUM,
2499                   cs_glob_mpi_comm);
2500     l_range[0] = g_shift - l_shift;
2501     l_range[1] = g_shift;
2502   }
2503 #endif
2504 
2505   /* Base creation */
2506 
2507   cs_matrix_assembler_t *ma = cs_matrix_assembler_create(l_range,
2508                                                          separate_diag);
2509 
2510   ma->n_g_rows = n_g_rows;
2511   ma->n_rows = n_rows;
2512 
2513   ma->r_idx = row_idx;
2514   ma->c_id = col_id;
2515 
2516   ma->halo = halo;
2517 
2518   /* Additional required parallel data */
2519 
2520   if (ma->halo != NULL) {
2521 
2522     cs_gnum_t *t_g_id;
2523     BFT_MALLOC(ma->e_g_id, ma->halo->n_elts[0], cs_gnum_t);
2524     BFT_MALLOC(t_g_id, ma->n_rows + ma->halo->n_elts[0], cs_gnum_t);
2525 
2526     for (cs_lnum_t i = 0; i < ma->n_rows; i++)
2527       t_g_id[i] = (cs_gnum_t)i + ma->l_range[0];
2528 
2529     cs_halo_sync_untyped(ma->halo,
2530                          CS_HALO_STANDARD,
2531                          sizeof(cs_gnum_t),
2532                          t_g_id);
2533 
2534     ma->n_e_g_ids = ma->halo->n_elts[0];
2535     for (cs_lnum_t i = 0; i < ma->n_e_g_ids; i++)
2536       ma->e_g_id[i] = t_g_id[ma->n_rows + i];
2537 
2538     BFT_FREE(t_g_id);
2539 
2540     /* Build distant columns index and global ids */
2541 
2542     BFT_MALLOC(ma->d_r_idx, ma->n_rows+1, cs_lnum_t);
2543 
2544     ma->d_r_idx[0] = 0;
2545     for (cs_lnum_t i = 0; i < ma->n_rows; i++) {
2546       cs_lnum_t n_e_cols = 0;
2547       cs_lnum_t n_cols = ma->r_idx[i+1] - ma->r_idx[i];
2548       const cs_lnum_t *c_id = ma->c_id + ma->r_idx[i];
2549       for (cs_lnum_t j = 0; j < n_cols; j++) {
2550         if (c_id[j] >= ma->n_rows)
2551           n_e_cols++;
2552       }
2553       ma->d_r_idx[i+1] = n_e_cols;
2554     }
2555 
2556     for (cs_lnum_t i = 0; i < ma->n_rows; i++)
2557       ma->d_r_idx[i+1] += ma->d_r_idx[i];
2558 
2559     BFT_MALLOC(ma->d_g_c_id, ma->d_r_idx[ma->n_rows], cs_gnum_t);
2560     for (cs_lnum_t i = 0; i < ma->n_rows; i++) {
2561       cs_lnum_t offset = ma->d_r_idx[i];
2562       cs_lnum_t n_cols = ma->r_idx[i+1] - ma->r_idx[i];
2563       const cs_lnum_t *c_id = ma->c_id + ma->r_idx[i];
2564       for (cs_lnum_t j = 0; j < n_cols; j++) {
2565         if (c_id[j] >= ma->n_rows) {
2566           assert(c_id[j] - ma->n_rows < ma->n_e_g_ids);
2567           ma->d_g_c_id[offset++] = ma->e_g_id[c_id[j] - ma->n_rows];
2568         }
2569       }
2570     }
2571 
2572   }
2573 
2574   return ma;
2575 }
2576 
2577 /*----------------------------------------------------------------------------*/
2578 /*!
2579  * \brief Destroy a matrix assembler structure.
2580  *
2581  * \param[in, out]  ma  pointer to matrix assembler structure pointer
2582  */
2583 /*----------------------------------------------------------------------------*/
2584 
2585 void
cs_matrix_assembler_destroy(cs_matrix_assembler_t ** ma)2586 cs_matrix_assembler_destroy(cs_matrix_assembler_t  **ma)
2587 {
2588   if (ma != NULL && *ma != NULL) {
2589     cs_matrix_assembler_t *_ma = *ma;
2590 
2591     BFT_FREE(_ma->e_g_id);
2592 
2593     if (_ma->_halo != NULL)
2594       cs_halo_destroy(&(_ma->_halo));
2595 
2596 #if defined(HAVE_MPI)
2597 
2598     BFT_FREE(_ma->coeff_recv_col_g_id);
2599     BFT_FREE(_ma->coeff_recv_col_idx);
2600     BFT_FREE(_ma->coeff_recv_row_id);
2601 
2602     BFT_FREE(_ma->coeff_rank_recv_index);
2603     BFT_FREE(_ma->coeff_rank_send_index);
2604 
2605     BFT_FREE(_ma->coeff_send_col_g_id);
2606     BFT_FREE(_ma->coeff_send_row_g_id);
2607     BFT_FREE(_ma->coeff_send_index);
2608     BFT_FREE(_ma->coeff_rank);
2609 
2610 #endif /* HAVE_MPI */
2611 
2612     BFT_FREE(_ma->g_rc_id);
2613 
2614     BFT_FREE(_ma->d_g_c_id);
2615     BFT_FREE(_ma->d_r_idx);
2616 
2617     BFT_FREE(_ma->_c_id);
2618     BFT_FREE(_ma->_r_idx);
2619 
2620     BFT_FREE(*ma);
2621   }
2622 }
2623 
2624 /*----------------------------------------------------------------------------*/
2625 /*!
2626  * \brief Set option flags for a given matrix assembler structure.
2627  *
2628  * When used, this function should be called before defining entries
2629  * (in case of future option additions) and in any case before computing
2630  * the final structure.
2631  *
2632  * Flags are defined as a sum (bitwise or) of constants, described below:
2633  * - \ref CS_MATRIX_DISTANT_ROW_USE_COL_IDX indicates that column indexes
2634  *   matching distant row data are computed and maintained, so the
2635  *   associated coefficient contributions can be added more efficiently.
2636  * - \ref CS_MATRIX_DISTANT_ROW_USE_COL_G_ID  indicates that columns
2637  *   global ids matching distant row data are maintained directly,
2638  *   so no lookup to a global id description of matrix columns is needed;
2639  *   this option is useful only when using an external library allowing
2640  *   incremental setting or accumulation of coefficients using global
2641  *   column ids, such as with PETSc's MatSetValues. When building matrices
2642  *   locally first (which includes most cases, whether internal matrices,
2643  *   or using library conversion or import functions such as PETSc's
2644  *   MatCreateMPIAIJWithArrays) this is slightly less efficient, as
2645  *   column ids will need to be matched to each row's columns a second
2646  *   time for those (exchanged) coefficients.
2647  * - \ref CS_MATRIX_EXTERNAL_HALO indicates that we do not need to
2648  *   build an associated halo structure, as it will be built externally
2649  *   (i.e. by an external library such as PETSc, HYPRE, ...) using its
2650  *   own equivalent structures.
2651  *
2652  * \param[in, out]  ma     pointer to matrix assembler structure
2653  * \param[in]       flags  sum of matrix assembler flag constants
2654  *                         (bitwise or)
2655  */
2656 /*----------------------------------------------------------------------------*/
2657 
2658 void
cs_matrix_assembler_set_options(cs_matrix_assembler_t * ma,int flags)2659 cs_matrix_assembler_set_options(cs_matrix_assembler_t  *ma,
2660                                 int                     flags)
2661 {
2662   ma->flags = flags;
2663 
2664   /* Ensure at least one distant row exchange option is set */
2665 
2666   if (   !(ma->flags & CS_MATRIX_DISTANT_ROW_USE_COL_IDX)
2667       && !(ma->flags & CS_MATRIX_DISTANT_ROW_USE_COL_G_ID))
2668     ma->flags = ma->flags | CS_MATRIX_DISTANT_ROW_USE_COL_IDX;
2669 }
2670 
2671 /*----------------------------------------------------------------------------*/
2672 /*!
2673  * \brief Return the option flags for a given matrix assembler structure.
2674  *
2675  * Flags are defined as a sum (bitwise or) of constants, described in
2676  * \ref cs_matrix_assembler_set_options.
2677  *
2678  * \param[in]  ma  pointer to matrix assembler structure
2679  *
2680  * \return  option flags (sum of integer constants) used y this structure
2681  */
2682 /*----------------------------------------------------------------------------*/
2683 
2684 int
cs_matrix_assembler_get_options(const cs_matrix_assembler_t * ma)2685 cs_matrix_assembler_get_options(const cs_matrix_assembler_t  *ma)
2686 {
2687   return ma->flags;
2688 }
2689 
2690 /*----------------------------------------------------------------------------*/
2691 /*!
2692  * \brief Add entries to a matrix assembler structure.
2693  *
2694  * This function should be called by a single thread for a given assembler.
2695  *
2696  * \param[in, out]  ma        pointer to matrix assembler structure
2697  * \param[in]       n         number of entries
2698  * \param[in]       col_g_id  global column ids associated with entries
2699  * \param[in]       row_g_id  global row ids associated with entries
2700  */
2701 /*----------------------------------------------------------------------------*/
2702 
2703 void
cs_matrix_assembler_add_g_ids(cs_matrix_assembler_t * ma,cs_lnum_t n,const cs_gnum_t row_g_id[],const cs_gnum_t col_g_id[])2704 cs_matrix_assembler_add_g_ids(cs_matrix_assembler_t  *ma,
2705                               cs_lnum_t               n,
2706                               const cs_gnum_t         row_g_id[],
2707                               const cs_gnum_t         col_g_id[])
2708 {
2709   /* Reallocate if needed;
2710      note that we could use optimized structures to avoid
2711      inserting duplicate values; A simple and relatively
2712      efficient variant may be to preallocate a local matrix
2713      with a fixed estimated number of columns, so as to
2714      handle the bulk of data, and add additional bins for
2715      the rest; sorting and removal of duplicates could occur
2716      when storage of preallocated sections is full.
2717      This may be done at a future stage, depending on timing
2718      and memory consumption measures for the current implementation. */
2719 
2720   if (ma->size + n >= ma->max_size) {
2721     if (ma->size == 0)
2722       ma->max_size = 4;
2723     while (ma->size + n >= ma->max_size)
2724       ma->max_size *= 2;
2725     BFT_REALLOC(ma->g_rc_id, ma->max_size*2, cs_gnum_t);
2726   }
2727 
2728   cs_gnum_t *_g_rc_id = ma->g_rc_id + ma->size*2;
2729 
2730   if (ma->separate_diag == false) {
2731     for (cs_lnum_t i = 0; i < n; i++) {
2732       _g_rc_id[i*2]   = row_g_id[i];
2733       _g_rc_id[i*2+1] = col_g_id[i];
2734     }
2735     ma->size += n;
2736   }
2737   else {
2738     cs_lnum_t j = 0;
2739     for (cs_lnum_t i = 0; i < n; i++) {
2740       if (   row_g_id[i] != col_g_id[i]
2741           || row_g_id[i] <  ma->l_range[0]
2742           || row_g_id[i] >= ma->l_range[1]) {
2743         _g_rc_id[j*2]   = row_g_id[i];
2744         _g_rc_id[j*2+1] = col_g_id[i];
2745         j += 1;
2746       }
2747     }
2748     ma->size += j;
2749   }
2750 }
2751 
2752 /*----------------------------------------------------------------------------*/
2753 /*!
2754  * \brief Compute internal structures required by a matrix assembler.
2755  *
2756   * The associated vector halo is also computed at this stage.
2757  *
2758 * \param[in, out]  ma    pointer to matrix assembler structure
2759  *
2760  * This function should be called by a single thread for a given assembler.
2761  */
2762 /*----------------------------------------------------------------------------*/
2763 
2764 void
cs_matrix_assembler_compute(cs_matrix_assembler_t * ma)2765 cs_matrix_assembler_compute(cs_matrix_assembler_t  *ma)
2766 {
2767 #if defined(HAVE_MPI)
2768   if (ma->comm != MPI_COMM_NULL && ma->comm != MPI_COMM_SELF) {
2769     _matrix_assembler_compute_mpi(ma);
2770     return;
2771   }
2772 #endif /* HAVE_MPI */
2773 
2774   _matrix_assembler_compute_local(ma);
2775 
2776   /* Non-null array to simplify binary search (rare diagonal-only case) */
2777 
2778   if (ma->c_id == NULL) {
2779     BFT_MALLOC(ma->_c_id, 1, cs_lnum_t);
2780     ma->c_id = ma->_c_id;
2781     ma->_c_id[0] = -1;
2782   }
2783 }
2784 
2785 #if defined(HAVE_MPI)
2786 
2787 /*----------------------------------------------------------------------------*/
2788 /*!
2789  * \brief Assign specific MPI communicator to matrix assembler.
2790  *
2791  * This must be called before \ref cs_matrix_assembler_compute.
2792  *
2793  * \param[in, out]  ma    pointer to matrix assembler structure
2794  * \param[in]       comm  assigned MPI communicator
2795  */
2796 /*----------------------------------------------------------------------------*/
2797 
2798 void
cs_matrix_assembler_set_comm(cs_matrix_assembler_t * ma,MPI_Comm comm)2799 cs_matrix_assembler_set_comm(cs_matrix_assembler_t  *ma,
2800                              MPI_Comm                comm)
2801 {
2802   assert(ma->n_ranks_init[1] == 0);
2803 
2804   ma->comm = comm;
2805 }
2806 
2807 #endif /* HAVE_MPI */
2808 
2809 /*----------------------------------------------------------------------------*/
2810 /*!
2811  * \brief Return a pointer to local global row range associated with a
2812  *        matrix assembler.
2813  *
2814  * \param[in]  ma   pointer to matrix assembler structure
2815  *
2816  * \return  pointer to local range
2817  */
2818 /*----------------------------------------------------------------------------*/
2819 
2820 const cs_gnum_t *
cs_matrix_assembler_get_l_range(const cs_matrix_assembler_t * ma)2821 cs_matrix_assembler_get_l_range(const cs_matrix_assembler_t  *ma)
2822 {
2823   return ma->l_range;
2824 }
2825 
2826 /*----------------------------------------------------------------------------*/
2827 /*!
2828  * \brief Return a pointer to the halo structure associated with a
2829  *        matrix assembler.
2830  *
2831  * \param[in]  ma   pointer to matrix assembler structure
2832  *
2833  * \return  pointer to halo structure
2834  */
2835 /*----------------------------------------------------------------------------*/
2836 
2837 const cs_halo_t *
cs_matrix_assembler_get_halo(const cs_matrix_assembler_t * ma)2838 cs_matrix_assembler_get_halo(const cs_matrix_assembler_t  *ma)
2839 {
2840   return ma->halo;
2841 }
2842 
2843 /*----------------------------------------------------------------------------*/
2844 /*!
2845  * \brief Indicate if the matrix assembler is based on a separate diagonal.
2846  *
2847  * \param[in]  ma  pointer to matrix assembler structure
2848  *
2849  * \return  true if the associated structure has a separate diagonal,
2850  *          false otherwise
2851  */
2852 /*----------------------------------------------------------------------------*/
2853 
2854 bool
cs_matrix_assembler_get_separate_diag(const cs_matrix_assembler_t * ma)2855 cs_matrix_assembler_get_separate_diag(const cs_matrix_assembler_t  *ma)
2856 {
2857   return ma->separate_diag;
2858 }
2859 
2860 /*----------------------------------------------------------------------------*/
2861 /*!
2862  * \brief Return the number of rows associated with a matrix assembler.
2863  *
2864  * \param[in]  ma  pointer to matrix assembler structure
2865  *
2866  * \return  number of rows associated with matrix assembler
2867  */
2868 /*----------------------------------------------------------------------------*/
2869 
2870 cs_lnum_t
cs_matrix_assembler_get_n_rows(const cs_matrix_assembler_t * ma)2871 cs_matrix_assembler_get_n_rows(const cs_matrix_assembler_t  *ma)
2872 {
2873   return ma->n_rows;
2874 }
2875 
2876 /*----------------------------------------------------------------------------*/
2877 /*!
2878  * \brief Return the number of columns associated with a matrix assembler.
2879  *
2880  * \param[in]  ma  pointer to matrix assembler structure
2881  *
2882  * \return  number of columns associated with matrix assembler
2883  */
2884 /*----------------------------------------------------------------------------*/
2885 
2886 cs_lnum_t
cs_matrix_assembler_get_n_columns(const cs_matrix_assembler_t * ma)2887 cs_matrix_assembler_get_n_columns(const cs_matrix_assembler_t  *ma)
2888 {
2889 #if defined(HAVE_MPI)
2890   return (ma->n_rows + ma->n_e_g_ids);
2891 #else
2892   return ma->n_rows;
2893 #endif
2894 }
2895 
2896 /*----------------------------------------------------------------------------*/
2897 /*!
2898  * \brief Return a row index associated with a matrix assembler.
2899  *
2900  * This index is a CSR type index relative to all columns, including or
2901  * excluding the diagonal depending on the \c separate_diag option used
2902  * when creating the matrix assembler. The matching column ids can be
2903  * obtained using \ref cs_matrix_assembler_get_col_ids.
2904  *
2905  * \warning the returned index is valid only as long as the matrix assembly
2906  * structure exists.
2907  *
2908  * \param[in]  ma  pointer to matrix assembler structure
2909  *
2910  * \return  pointer to matrix structure row index
2911  */
2912 /*----------------------------------------------------------------------------*/
2913 
2914 const cs_lnum_t *
cs_matrix_assembler_get_row_index(const cs_matrix_assembler_t * ma)2915 cs_matrix_assembler_get_row_index(const cs_matrix_assembler_t  *ma)
2916 {
2917   return ma->r_idx;
2918 }
2919 
2920 /*----------------------------------------------------------------------------*/
2921 /*!
2922  * \brief Return a column ids associated with a matrix assembler.
2923  *
2924  * These ids relative to all columns of successive rows, with columns
2925  * ordered by local id, so local columns appear first, distant columns
2926  * after. Depending on the \c separate_diag option used when creating the
2927  * matrix assembler, the columns may include the diagonal or not.
2928  * The matching index can be obtained using
2929  * \ref cs_matrix_assembler_get_row_index.
2930  *
2931  * \warning the returned index is valid only as long as the matrix assembly
2932  * structure exists.
2933  *
2934  * \param[in]  ma  pointer to matrix assembler structure
2935  *
2936  * \return  pointer to matrix structure row index
2937  */
2938 /*----------------------------------------------------------------------------*/
2939 
2940 const cs_lnum_t *
cs_matrix_assembler_get_col_ids(const cs_matrix_assembler_t * ma)2941 cs_matrix_assembler_get_col_ids(const cs_matrix_assembler_t  *ma)
2942 {
2943   return ma->c_id;
2944 }
2945 
2946 /*----------------------------------------------------------------------------*/
2947 /*!
2948  * \brief Return info on the number of neighbor ranks a matrix assembler
2949  *        may communicate with.
2950  *
2951  * \param[in]   ma   pointer to matrix assembler structure
2952  * \param[out]  rc   rank counts; the 4 values are:
2953  *                   - 0 number of communicating ranks for vector update (halo)
2954  *                   - 1 number of communicating ranks for matrix assembly
2955  *                   - 2 maximum number of communicating ranks for halo
2956  *                       construction (assumed ranks algorithm)
2957  *                   - 3 maximum number of communicating ranks for matrix
2958  *                       assembly determination (assumed ranks algorithm)
2959  */
2960 /*----------------------------------------------------------------------------*/
2961 
2962 void
cs_matrix_assembler_get_rank_counts(const cs_matrix_assembler_t * ma,int rc[4])2963 cs_matrix_assembler_get_rank_counts(const cs_matrix_assembler_t  *ma,
2964                                     int                           rc[4])
2965 {
2966   rc[0] = 0;
2967   if (ma->halo != NULL)
2968     rc[0] = ma->halo->n_c_domains;
2969 #if defined(HAVE_MPI)
2970   rc[1] = ma->n_coeff_ranks;
2971   rc[2] = ma->n_ranks_init[1];
2972   rc[3] = ma->n_ranks_init[0];
2973 #else
2974   rc[1] = 0;
2975   rc[2] = 0;
2976   rc[3] = 0;
2977 #endif
2978 }
2979 
2980 /*----------------------------------------------------------------------------*/
2981 /*!
2982  * \brief Log rank counts for a given matrix assembler.
2983  *
2984  * \param[in]  ma      pointer to matrix assembler structure
2985  * \param[in]  log_id  log type
2986  * \param[in]  name    name of this assembler
2987  */
2988 /*----------------------------------------------------------------------------*/
2989 
2990 void
cs_matrix_assembler_log_rank_counts(const cs_matrix_assembler_t * ma,cs_log_t log_id,const char * name)2991 cs_matrix_assembler_log_rank_counts(const cs_matrix_assembler_t  *ma,
2992                                     cs_log_t                      log_id,
2993                                     const char                   *name)
2994 {
2995   cs_log_printf(log_id,
2996                 _("\nNeighbor rank counts for matrix assembler: %s\n"
2997                   "-----------------------------------------\n"),
2998                 name);
2999 
3000   const char *count_name[]
3001     = {N_("Neighbor ranks for vector update (halo)"),
3002        N_("Neighbor ranks for matrix assembly"),
3003        N_("Maximum neighbor ranks during halo construction"),
3004        N_("Maximum neighbor ranks during assembly determination")};
3005 
3006   int counts[4];
3007 
3008   cs_matrix_assembler_get_rank_counts(ma, counts);
3009 
3010   for (int i = 0; i < 4; i++) {
3011 
3012     char ul[120];
3013     size_t j = 0;
3014     size_t u_count = cs_log_strlen(_(count_name[i]));
3015     for (j = 0; j < u_count && j < 119; j++)
3016       ul[j] = '-';
3017     ul[j] = '\0';
3018 
3019     cs_log_printf(log_id, "\n  %s:\n  %s\n\n", _(count_name[i]), ul);
3020 
3021     _display_rank_histogram(log_id, counts[i]);
3022 
3023   }
3024 }
3025 
3026 /*----------------------------------------------------------------------------*/
3027 /*!
3028  * \brief Create and initialize a matrix assembler values structure.
3029  *
3030  * The associated values will initially be set to zero.
3031  *
3032  * Block sizes are defined by an optional array of 4 values:
3033  *   0: useful block size, 1: vector block extents,
3034  *   2: matrix line extents,  3: matrix line*column extents
3035  *
3036  * This is a low-level function, which should be called by a simpler
3037  * function (\ref cs_matrix_assembler_values_init) which provides
3038  * the necessary function pointers.
3039  *
3040  * \warning  The matrix pointer must point to valid data when the selection
3041  *           function is called, so the life cycle of the data pointed to
3042  *           should be at least as long as that of the assembler values
3043  *           structure. In a similar manner, the life cycle of the associated
3044  *           matrix assembler must also be at least as long as that
3045  *           of the assembler values structure.
3046  *
3047  * \param[in]       ma        associated matrix assembly structure
3048  * \param[in]       sep_diag  true if diagonal terms are stored separately
3049  * \param[in]       db_size   optional diagonal block sizes
3050  * \param[in]       eb_size   optional extra-diagonal block sizes
3051  * \param[in, out]  matrix    untyped pointer to matrix description structure
3052  * \param[in]       init      pointer to matrix coefficients
3053  *                            initialization function
3054  * \param[in]       add       pointer to matrix coefficients addition from
3055  *                            local ids function
3056  * \param[in]       add_g     pointer to matrix coefficients addition from
3057  *                            global ids function
3058  * \param[in]       begin     pointer to matrix coefficients assembly
3059  *                            start function (optional)
3060  * \param[in]       end       pointer to matrix coefficients assembly
3061  *                            end function (optional)
3062  *
3063  * \return  pointer to initialized matrix assembler structure;
3064  */
3065 /*----------------------------------------------------------------------------*/
3066 
3067 cs_matrix_assembler_values_t *
cs_matrix_assembler_values_create(const cs_matrix_assembler_t * ma,bool sep_diag,const cs_lnum_t * db_size,const cs_lnum_t * eb_size,void * matrix,cs_matrix_assembler_values_init_t * init,cs_matrix_assembler_values_add_t * add,cs_matrix_assembler_values_add_g_t * add_g,cs_matrix_assembler_values_begin_t * begin,cs_matrix_assembler_values_end_t * end)3068 cs_matrix_assembler_values_create(const cs_matrix_assembler_t          *ma,
3069                                   bool                                  sep_diag,
3070                                   const cs_lnum_t                      *db_size,
3071                                   const cs_lnum_t                      *eb_size,
3072                                   void                                 *matrix,
3073                                   cs_matrix_assembler_values_init_t    *init,
3074                                   cs_matrix_assembler_values_add_t     *add,
3075                                   cs_matrix_assembler_values_add_g_t   *add_g,
3076                                   cs_matrix_assembler_values_begin_t   *begin,
3077                                   cs_matrix_assembler_values_end_t     *end)
3078 {
3079   cs_matrix_assembler_values_t *mav;
3080 
3081   BFT_MALLOC(mav, 1, cs_matrix_assembler_values_t);
3082 
3083   mav->ma = ma;
3084 
3085   mav->separate_diag = sep_diag;
3086   mav->final_assembly = false;
3087 
3088   for (int i = 0; i < 4; i++) {
3089     mav->db_size[i] = 1;
3090     mav->eb_size[i] = 1;
3091   }
3092 
3093   if (db_size != NULL)
3094     memcpy(mav->db_size, db_size, 4*sizeof(int));
3095   if (eb_size != NULL)
3096     memcpy(mav->eb_size, eb_size, 4*sizeof(int));
3097 
3098   mav->diag_idx = NULL;
3099 
3100   mav->matrix = matrix;
3101 
3102   mav->init = init;
3103   mav->add_values = add;
3104   mav->add_values_g = add_g;
3105   mav->assembly_begin = begin;
3106   mav->assembly_end = end;
3107 
3108 #if defined(HAVE_MPI)
3109 
3110   cs_lnum_t  alloc_size = ma->coeff_send_size * mav->eb_size[3];
3111 
3112   BFT_MALLOC(mav->coeff_send, alloc_size, cs_real_t);
3113 
3114   for (cs_lnum_t i = 0; i < alloc_size; i++)
3115     mav->coeff_send[i] = 0;
3116 
3117 #endif /* HAVE_MPI */
3118 
3119   /* Build diagonal index if it will be needed */
3120 
3121   if (mav->separate_diag != ma->separate_diag)
3122     _matrix_assembler_values_diag_idx(mav);
3123 
3124   if (mav->init != NULL)
3125     mav->init(mav->matrix, mav->db_size, mav->eb_size);
3126 
3127   return mav;
3128 }
3129 
3130 /*----------------------------------------------------------------------------*/
3131 /*!
3132  * \brief Finalize matrix assembler values structure.
3133  *
3134  * When this function returns, the assembler values structure has been
3135  * destroyed, and the associated matrix is fully assembled (i.e. ready to
3136  * use).
3137  *
3138  * \param[in, out]  mav  pointer to matrix assembler values structure pointer
3139  */
3140 /*----------------------------------------------------------------------------*/
3141 
3142 void
cs_matrix_assembler_values_finalize(cs_matrix_assembler_values_t ** mav)3143 cs_matrix_assembler_values_finalize(cs_matrix_assembler_values_t  **mav)
3144 {
3145   if (mav != NULL) {
3146 
3147     cs_matrix_assembler_values_t *_mav = *mav;
3148 
3149     if (_mav->final_assembly == false)
3150       cs_matrix_assembler_values_done(_mav);
3151 
3152     if (_mav->assembly_end != NULL)
3153       _mav->assembly_end(_mav->matrix);
3154 
3155     BFT_FREE(*mav);
3156 
3157   }
3158 }
3159 
3160 /*----------------------------------------------------------------------------*/
3161 /*!
3162  * \brief Add values to a matrix assembler values structure using local
3163  *        row and column ids.
3164  *
3165  * If the matching matrix coefficients use a block structure, the
3166  * values passed to this function should use the same block size.
3167  *
3168  * Note also that if the matrix has different diagonal and extradiagonal
3169  * block sizes, separate calls to this function should be used for both
3170  * types of coefficients; in addition, in this case, diagonal coefficients
3171  * should only be provided by the owning rank (this also impacts how
3172  * the associated matrix assembler structure is defined).
3173  *
3174  * This function may be called by different threads, as long those threads
3175  * do not add contributions to the same rows (assuming caution is taken
3176  * in the case of external libraries so that their builder functions
3177  * have tht same property).
3178  *
3179  * \param[in, out]  mav     pointer to matrix assembler values structure
3180  * \param[in]       n       number of entries
3181  * \param[in]       col_id  local column ids associated with entries
3182  * \param[in]       row_id  local row ids associated with entries
3183  * \param[in]       val     values associated with entries
3184  */
3185 /*----------------------------------------------------------------------------*/
3186 
3187 void
cs_matrix_assembler_values_add(cs_matrix_assembler_values_t * mav,cs_lnum_t n,const cs_lnum_t row_id[],const cs_lnum_t col_id[],const cs_real_t val[])3188 cs_matrix_assembler_values_add(cs_matrix_assembler_values_t  *mav,
3189                                cs_lnum_t                      n,
3190                                const cs_lnum_t                row_id[],
3191                                const cs_lnum_t                col_id[],
3192                                const cs_real_t                val[])
3193 {
3194   const cs_matrix_assembler_t  *ma = mav->ma;
3195 
3196   cs_lnum_t stride = 0;
3197 
3198   if (n < 1)
3199     return;
3200 
3201   /* Base stride on first type of value encountered */
3202 
3203   if (row_id[0] == col_id[0])
3204     stride = mav->db_size[3];
3205   else
3206     stride = mav->eb_size[3];
3207 
3208   /* Case where we compute a column index first */
3209 
3210   if (mav->add_values != NULL) {
3211 
3212     cs_lnum_t s_col_idx[COEFF_GROUP_SIZE];
3213 
3214     for (cs_lnum_t i = 0; i < n; i+= COEFF_GROUP_SIZE) {
3215 
3216       cs_lnum_t b_size = COEFF_GROUP_SIZE;
3217       if (i + COEFF_GROUP_SIZE > n)
3218         b_size = n - i;
3219 
3220       for (cs_lnum_t j = 0; j < b_size; j++) {
3221 
3222         cs_lnum_t k = i+j;
3223 
3224         cs_lnum_t l_r_id = row_id[k];
3225         cs_lnum_t l_c_id = col_id[k];
3226 
3227         cs_lnum_t n_cols = ma->r_idx[l_r_id+1] - ma->r_idx[l_r_id];
3228 
3229         s_col_idx[j] = _l_id_binary_search(n_cols,
3230                                            col_id[k],
3231                                            ma->c_id + ma->r_idx[l_r_id]);
3232 
3233         assert(s_col_idx[j] > -1 || (ma->separate_diag && l_c_id == l_r_id));
3234 
3235       }
3236 
3237       if (ma->separate_diag == mav->separate_diag)
3238         mav->add_values(mav->matrix,
3239                         b_size,
3240                         stride,
3241                         row_id + i,
3242                         s_col_idx,
3243                         val + (i*stride));
3244 
3245       else
3246         _matrix_assembler_values_add_cnv_idx(mav,
3247                                              b_size,
3248                                              stride,
3249                                              row_id + i,
3250                                              s_col_idx,
3251                                              val + (i*stride));
3252 
3253     }
3254 
3255   }
3256 
3257   else { /* use a global id-based function */
3258 
3259     assert(mav->add_values_g != NULL);
3260 
3261     _matrix_assembler_values_add_ll_g(mav,
3262                                       n,
3263                                       stride,
3264                                       row_id,
3265                                       col_id,
3266                                       val);
3267   }
3268 }
3269 
3270 /*----------------------------------------------------------------------------*/
3271 /*!
3272  * \brief Add values to a matrix assembler values structure using global
3273  *        row and column ids.
3274  *
3275  * If the matching matrix coefficients use a block structure, the
3276  * values passed to this function should use the same block size.
3277  *
3278  * Note also that if the matrix has different diagonal and extradiagonal
3279  * block sizes, separate calls to this function should be used for both
3280  * types of coefficients; in addition, in this case, diagonal coefficients
3281  * should only be provided by the owning rank (this also impacts how
3282  * the associated matrix assembler structure is defined).
3283  *
3284  * This function may be called by different threads, as long those threads
3285  * do not add contributions to the same rows (assuming caution is taken
3286  * in the case of external libraries so that their builder functions
3287  * have tht same property).
3288  *
3289  * \param[in, out]  mav       pointer to matrix assembler values structure
3290  * \param[in]       n         number of entries
3291  * \param[in]       col_g_id  global column ids associated with entries
3292  * \param[in]       row_g_id  global row ids associated with entries
3293  * \param[in]       val       values associated with entries
3294  */
3295 /*----------------------------------------------------------------------------*/
3296 
3297 void
cs_matrix_assembler_values_add_g(cs_matrix_assembler_values_t * mav,cs_lnum_t n,const cs_gnum_t row_g_id[],const cs_gnum_t col_g_id[],const cs_real_t val[])3298 cs_matrix_assembler_values_add_g(cs_matrix_assembler_values_t  *mav,
3299                                  cs_lnum_t                      n,
3300                                  const cs_gnum_t                row_g_id[],
3301                                  const cs_gnum_t                col_g_id[],
3302                                  const cs_real_t                val[])
3303 {
3304   const cs_matrix_assembler_t  *ma = mav->ma;
3305 
3306   cs_lnum_t stride = 0;
3307 
3308   if (n < 1)
3309     return;
3310 
3311   /* Base stride on first type of value encountered */
3312 
3313   if (row_g_id[0] == col_g_id[0])
3314     stride = mav->db_size[3];
3315   else
3316     stride = mav->eb_size[3];
3317 
3318   cs_gnum_t s_g_row_id[COEFF_GROUP_SIZE];
3319   cs_gnum_t s_g_col_id[COEFF_GROUP_SIZE];
3320 
3321   for (cs_lnum_t i = 0; i < n; i+= COEFF_GROUP_SIZE) {
3322 
3323     cs_lnum_t b_size = COEFF_GROUP_SIZE;
3324     if (i + COEFF_GROUP_SIZE > n)
3325       b_size = n - i;
3326 
3327     for (cs_lnum_t j = 0; j < b_size; j++) {
3328 
3329       cs_lnum_t k = i+j;
3330 
3331       cs_gnum_t g_r_id = row_g_id[k];
3332 
3333 #if defined(HAVE_MPI)
3334 
3335       /* Case where coefficient is handled by other rank */
3336 
3337       if (g_r_id < ma->l_range[0] || g_r_id >= ma->l_range[1]) {
3338 
3339         s_g_row_id[j] = ma->l_range[1];
3340         s_g_col_id[j] = 0;
3341 
3342         cs_lnum_t e_r_id = _g_id_binary_find(ma->coeff_send_n_rows,
3343                                              g_r_id,
3344                                              ma->coeff_send_row_g_id);
3345 
3346         cs_lnum_t r_start = ma->coeff_send_index[e_r_id];
3347         cs_lnum_t n_e_rows = ma->coeff_send_index[e_r_id+1] - r_start;
3348 
3349         cs_lnum_t e_id =   r_start
3350                          +_g_id_binary_find(n_e_rows,
3351                                             col_g_id[k],
3352                                             ma->coeff_send_col_g_id + r_start);
3353 
3354         /* Now add values to send coefficients */
3355 
3356         for (cs_lnum_t l = 0; l < stride; l++)
3357 #         pragma omp atomic
3358           mav->coeff_send[e_id*stride + l] += val[k*stride + l];
3359 
3360       }
3361 
3362       /* Standard case */
3363 
3364       else {
3365 
3366         s_g_row_id[j] = g_r_id;
3367         s_g_col_id[j] = col_g_id[k];
3368 
3369       }
3370 
3371 #else
3372 
3373       s_g_row_id[j] = g_r_id;
3374       s_g_col_id[j] = col_g_id[k];
3375 
3376 #endif /* HAVE_MPI */
3377 
3378     }
3379 
3380     if (mav->add_values_g != NULL) /* global id-based assembler function */
3381       mav->add_values_g(mav->matrix,
3382                         b_size,
3383                         stride,
3384                         s_g_row_id,
3385                         s_g_col_id,
3386                         val + (i*stride));
3387 
3388     else if (ma->d_r_idx != NULL) { /* local-id-based function, need to adapt */
3389 
3390       cs_lnum_t s_row_id[COEFF_GROUP_SIZE];
3391       cs_lnum_t s_col_idx[COEFF_GROUP_SIZE];
3392 
3393       for (cs_lnum_t j = 0; j < b_size; j++) {
3394 
3395         if (s_g_row_id[j] == ma->l_range[1]) { /* filter other-rank values */
3396           s_row_id[j] = -1;
3397           s_col_idx[j] = -1;
3398         }
3399 
3400         else { /* Handle values assigned to local rank */
3401 
3402           cs_lnum_t l_r_id = s_g_row_id[j] - ma->l_range[0];
3403           cs_gnum_t g_c_id = s_g_col_id[j];
3404 
3405           s_row_id[j] = l_r_id;
3406 
3407           cs_lnum_t n_l_cols = (  ma->r_idx[l_r_id+1] - ma->r_idx[l_r_id]
3408                                 - ma->d_r_idx[l_r_id+1] + ma->d_r_idx[l_r_id]);
3409 
3410           /* Local part */
3411 
3412           if (g_c_id >= ma->l_range[0] && g_c_id < ma->l_range[1]) {
3413 
3414             cs_lnum_t l_c_id = g_c_id - ma->l_range[0];
3415 
3416             s_col_idx[j] = _l_id_binary_search(n_l_cols,
3417                                                l_c_id,
3418                                                ma->c_id + ma->r_idx[l_r_id]);
3419 
3420             assert(s_col_idx[j] > -1 || (ma->separate_diag && l_c_id == l_r_id));
3421 
3422           }
3423 
3424           /* Distant part */
3425 
3426           else {
3427 
3428             cs_lnum_t n_cols = ma->d_r_idx[l_r_id+1] - ma->d_r_idx[l_r_id];
3429 
3430             cs_lnum_t d_c_idx = _g_id_binary_find(n_cols,
3431                                                   g_c_id,
3432                                                   ma->d_g_c_id + ma->d_r_idx[l_r_id]);
3433 
3434             /* column ids start and end of local row, so add n_l_cols */
3435             s_col_idx[j] = d_c_idx + n_l_cols;
3436 
3437           }
3438 
3439         }
3440 
3441       }
3442 
3443       if (ma->separate_diag == mav->separate_diag) {
3444         mav->add_values(mav->matrix,
3445                         b_size,
3446                         stride,
3447                         s_row_id,
3448                         s_col_idx,
3449                         val + (i*stride));
3450       }
3451 
3452       else
3453         _matrix_assembler_values_add_cnv_idx(mav,
3454                                              b_size,
3455                                              stride,
3456                                              s_row_id,
3457                                              s_col_idx,
3458                                              val + (i*stride));
3459 
3460     }
3461 
3462     else { /* local-only case or matrix part */
3463 
3464       assert(ma->d_r_idx == NULL);
3465 
3466       cs_lnum_t s_row_id[COEFF_GROUP_SIZE];
3467       cs_lnum_t s_col_idx[COEFF_GROUP_SIZE];
3468 
3469       for (cs_lnum_t j = 0; j < b_size; j++) {
3470 
3471         assert(s_g_row_id[j] < ma->l_range[1]);
3472         assert(s_g_col_id[j] < ma->l_range[1]);
3473 
3474         cs_lnum_t l_r_id = s_g_row_id[j] - ma->l_range[0];
3475         cs_lnum_t l_c_id = s_g_col_id[j] - ma->l_range[0];
3476 
3477         assert(   s_g_col_id[j] >= ma->l_range[0]
3478                && s_g_col_id[j] < ma->l_range[1]);
3479 
3480         s_row_id[j] = l_r_id;
3481 
3482         cs_lnum_t n_cols = ma->r_idx[l_r_id+1] - ma->r_idx[l_r_id];
3483 
3484         s_col_idx[j] = _l_id_binary_search(n_cols,
3485                                            l_c_id,
3486                                            ma->c_id + ma->r_idx[l_r_id]);
3487 
3488         assert(s_col_idx[j] > -1 || (ma->separate_diag && l_c_id == l_r_id));
3489 
3490       }
3491 
3492       if (ma->separate_diag == mav->separate_diag)
3493         mav->add_values(mav->matrix,
3494                         b_size,
3495                         stride,
3496                         s_row_id,
3497                         s_col_idx,
3498                         val + (i*stride));
3499       else
3500         _matrix_assembler_values_add_cnv_idx(mav,
3501                                              b_size,
3502                                              stride,
3503                                              s_row_id,
3504                                              s_col_idx,
3505                                              val + (i*stride));
3506 
3507     }
3508   }
3509 }
3510 
3511 /*----------------------------------------------------------------------------*/
3512 /*!
3513  * \brief Start assembly of matrix values structure.
3514  *
3515  * The call to this function is always optional, and indicates that assembly
3516  * may begin. Calling it before \ref cs_matrix_assembler_values_finalize
3517  * may be useful when the underlying libraries can overlap communication
3518  * (or other operations such as matrix trasformation) and computation.
3519  *
3520  * \remark When values have been added to rows belonging to another rank,
3521  *         communication will occur here. Splitting this function could
3522  *         add another opportunity for overlap of computation and
3523  *         communication, but is not done as of 2016, as it would add
3524  *         complexity, with probably limited returns, as the effective
3525  *         asynchonous progress of MPI libraries is usually limited.
3526  *
3527  * \param[in, out]  mav  pointer to matrix assembler data structure
3528  */
3529 /*----------------------------------------------------------------------------*/
3530 
3531 void
cs_matrix_assembler_values_done(cs_matrix_assembler_values_t * mav)3532 cs_matrix_assembler_values_done(cs_matrix_assembler_values_t  *mav)
3533 {
3534   /* Exchange row data with other ranks if required */
3535 
3536 #if defined(HAVE_MPI)
3537 
3538   const cs_matrix_assembler_t  *ma = mav->ma;
3539 
3540   if (ma->n_coeff_ranks > 0) {
3541 
3542     cs_real_t  *recv_coeffs = NULL;
3543 
3544     cs_lnum_t stride = mav->eb_size[3];
3545 
3546     MPI_Request *request = NULL;
3547     MPI_Status *status = NULL;
3548 
3549     BFT_MALLOC(recv_coeffs, ma->coeff_recv_size*stride, cs_real_t);
3550 
3551     BFT_MALLOC(request, ma->n_coeff_ranks*2, MPI_Request);
3552     BFT_MALLOC(status, ma->n_coeff_ranks*2, MPI_Status);
3553 
3554     int request_count = 0;
3555     int local_rank = cs_glob_rank_id;
3556 
3557     for (int i = 0; i < ma->n_coeff_ranks; i++) {
3558       cs_lnum_t l_size = (  ma->coeff_rank_recv_index[i+1]
3559                           - ma->coeff_rank_recv_index[i]) * stride;
3560       if (l_size > 0) {
3561         cs_lnum_t recv_shift = ma->coeff_rank_recv_index[i]*stride;
3562         MPI_Irecv(recv_coeffs + recv_shift,
3563                   l_size,
3564                   CS_MPI_REAL,
3565                   ma->coeff_rank[i],
3566                   local_rank,
3567                   ma->comm,
3568                   &(request[request_count++]));
3569       }
3570     }
3571 
3572     for (int i = 0; i < ma->n_coeff_ranks; i++) {
3573       cs_lnum_t l_size = (  ma->coeff_rank_send_index[i+1]
3574                           - ma->coeff_rank_send_index[i]) * stride;
3575       if (l_size > 0) {
3576         cs_lnum_t send_shift = ma->coeff_rank_send_index[i]*stride;
3577         MPI_Isend(mav->coeff_send + send_shift,
3578                   l_size,
3579                   CS_MPI_REAL,
3580                   ma->coeff_rank[i],
3581                   ma->coeff_rank[i],
3582                   ma->comm,
3583                   &(request[request_count++]));
3584       }
3585     }
3586 
3587     MPI_Waitall(request_count, request, status);
3588 
3589     BFT_FREE(request);
3590     BFT_FREE(status);
3591 
3592     /* Now add coefficients to local rows */
3593 
3594     if (ma->coeff_recv_size > 0) {
3595 
3596       /* Using local id-based function */
3597 
3598       if (mav->add_values != NULL) {
3599 
3600         assert(ma->coeff_recv_row_id != NULL);
3601         assert(ma->coeff_recv_col_idx != NULL);
3602 
3603         /* Case where columns are already pre-indexed */
3604 
3605         if (ma->coeff_recv_col_idx != NULL) {
3606           if (ma->separate_diag == mav->separate_diag)
3607             mav->add_values(mav->matrix,
3608                             ma->coeff_recv_size,
3609                             stride,
3610                             ma->coeff_recv_row_id,
3611                             ma->coeff_recv_col_idx,
3612                             recv_coeffs);
3613           else
3614             _matrix_assembler_values_add_cnv_idx(mav,
3615                                                  ma->coeff_recv_size,
3616                                                  stride,
3617                                                  ma->coeff_recv_row_id,
3618                                                  ma->coeff_recv_col_idx,
3619                                                  recv_coeffs);
3620         }
3621         else {
3622           assert(ma->coeff_recv_col_g_id != NULL);
3623           _matrix_assembler_values_add_lg(mav,
3624                                           ma->coeff_recv_size,
3625                                           stride,
3626                                           ma->coeff_recv_row_id,
3627                                           ma->coeff_recv_col_g_id,
3628                                           recv_coeffs);
3629         }
3630 
3631       }
3632 
3633       /* Using global id-based function */
3634 
3635       else {
3636         assert(mav->add_values_g != NULL);
3637 
3638         if (ma->coeff_recv_col_g_id != NULL)
3639           _matrix_assembler_values_add_lg_g(mav,
3640                                             ma->coeff_recv_size,
3641                                             stride,
3642                                             ma->coeff_recv_row_id,
3643                                             ma->coeff_recv_col_g_id,
3644                                             recv_coeffs);
3645 
3646         else {
3647           assert(ma->coeff_recv_col_idx);
3648           _matrix_assembler_values_add_llx_g(mav,
3649                                              ma->coeff_recv_size,
3650                                              stride,
3651                                              ma->coeff_recv_row_id,
3652                                              ma->coeff_recv_col_idx,
3653                                              recv_coeffs);
3654         }
3655       }
3656 
3657     }
3658 
3659     BFT_FREE(recv_coeffs);
3660 
3661   }
3662 
3663   BFT_FREE(mav->coeff_send);
3664 
3665 #endif /* HAVE_MPI */
3666 
3667   BFT_FREE(mav->diag_idx);
3668 
3669   mav->final_assembly = true;
3670 
3671   if (mav->assembly_begin != NULL)
3672     mav->assembly_begin(mav->matrix);
3673 }
3674 
3675 /*----------------------------------------------------------------------------*/
3676 
3677 END_C_DECLS
3678