1 /*============================================================================
2 * Incremental or general construction of matrix structure.
3 *============================================================================*/
4
5 /*
6 This file is part of Code_Saturne, a general-purpose CFD tool.
7
8 Copyright (C) 1998-2021 EDF S.A.
9
10 This program is free software; you can redistribute it and/or modify it under
11 the terms of the GNU General Public License as published by the Free Software
12 Foundation; either version 2 of the License, or (at your option) any later
13 version.
14
15 This program is distributed in the hope that it will be useful, but WITHOUT
16 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
18 details.
19
20 You should have received a copy of the GNU General Public License along with
21 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
22 Street, Fifth Floor, Boston, MA 02110-1301, USA.
23 */
24
25 /*----------------------------------------------------------------------------*/
26
27 #include "cs_defs.h"
28
29 /*----------------------------------------------------------------------------
30 * Standard C library headers
31 *----------------------------------------------------------------------------*/
32
33 #include <assert.h>
34 #include <math.h>
35 #include <stdlib.h>
36 #include <stdarg.h>
37 #include <stdio.h>
38 #include <string.h>
39
40 /*----------------------------------------------------------------------------
41 * Local headers
42 *----------------------------------------------------------------------------*/
43
44 #include "bft_error.h"
45 #include "bft_mem.h"
46 #include "bft_mem_usage.h"
47
48 #include "cs_defs.h"
49 #include "cs_log.h"
50 #include "cs_rank_neighbors.h"
51 #include "cs_sort.h"
52 #include "cs_timer.h"
53
54 #include "cs_matrix.h"
55
56 /*----------------------------------------------------------------------------
57 * Header for the current file
58 *----------------------------------------------------------------------------*/
59
60 #include "cs_matrix_assembler_priv.h"
61 #include "cs_matrix_assembler.h"
62
63 /*----------------------------------------------------------------------------*/
64
65 BEGIN_C_DECLS
66
67 /*=============================================================================
68 * Additional doxygen documentation
69 *============================================================================*/
70
71 /*!
72 \file cs_matrix_assembler.c
73
74 \brief Incremental or general construction of matrix structure.
75
76 The matrix assembler is intended to assist the building of matrices
77 in general parallel conditions, assuming each parallel rank is assigned a
78 given number of rows, in increasing order. Column elements may refer to rows
79 assigned to other ranks. This allows a good mapping to external libraries
80 such as PETSc and Hypre.
81
82 Moreover, in some cases, some global matrix elements may be computed on ranks
83 to which neither the row nor the column is assigned. This may happen for
84 example when rows are assigned to mesh vertices, and a given cell
85 provides a contribution to two vertices lying on parallel rank boundaries,
86 assigned to two different ranks, both of which differ from the cell's
87 rank. Few such contributions are expected with a good partitioning, but
88 they cannot always be avoided without complex partitioning
89 constraints. Libraries such as PETSc do not handle these cases, at least
90 not efficiently (as the recommended preallocation required for local rows
91 cannot be computed without the knowledge of those contributions), so
92 the assembler should help manage these contributions in any case.
93
94 The addition of a given non-zero to a matrix structure may be done
95 multiple times (for example when looping on cells with an internal loop on
96 vertices), but for better performance (and memory usage), it is recommended
97 to avoid adding the same entry multiple times, as duplicates are only removed
98 during the computation stage.
99 */
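
/*
  Example (illustrative sketch only, not part of the implementation): a
  typical construction sequence using the public entry points declared in
  cs_matrix_assembler.h; the row range and the (row, column) couples below
  are arbitrary values chosen for illustration.

    cs_gnum_t l_range[2] = {0, 100};    // rows 0..99 owned by this rank

    cs_matrix_assembler_t  *ma = cs_matrix_assembler_create(l_range, true);

    // Declare structural non-zeros as global (row, column) id couples;
    // declaring the same couple several times is allowed, though duplicates
    // are only removed at the computation stage.
    cs_gnum_t row_g_id[] = {12, 12, 57};
    cs_gnum_t col_g_id[] = {12, 57, 12};
    cs_matrix_assembler_add_g_ids(ma, 3, row_g_id, col_g_id);

    // Build the final structure (collective on the associated communicator
    // in parallel runs).
    cs_matrix_assembler_compute(ma);

    // ... use the assembler to add matrix coefficients ...

    cs_matrix_assembler_destroy(&ma);
*/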
100
101 /*! \cond DOXYGEN_SHOULD_SKIP_THIS */
102
103 /*=============================================================================
104 * Local Macro Definitions
105 *============================================================================*/
106
107 /* Minimum size for OpenMP loops (needs benchmarking to adjust) */
108 #define THR_MIN 128
109
110 /* Fixed coefficient buffer size for accumulation
111 (a reasonably small fixed size has the advantage of being easily usable
112 on the stack and in a threading context, and that size should still
113 be large enough to amortize calls to lower-level functions) */
114
115 #undef COEFF_GROUP_SIZE
116 #define COEFF_GROUP_SIZE 256
117
118 #undef HISTOGRAM_SUBS
119 #define HISTOGRAM_SUBS 5
120
121 /*=============================================================================
122 * Local Structure Definitions
123 *============================================================================*/
124
125 /*============================================================================
126 * Private function definitions
127 *============================================================================*/
128
129 /*----------------------------------------------------------------------------*/
130 /*!
131  * \brief Print the distribution of a counter over ranks to a given log type.
132 *
133 * \param[in] log_id log file type
134 * \param[in] count counter value for current rank
135 */
136 /*----------------------------------------------------------------------------*/
137
138 static void
139 _display_rank_histogram(cs_log_t  log_id,
140 int count)
141 {
142 int i, j, k, count_max, count_min;
143 double step;
144
145 int h_count[HISTOGRAM_SUBS];
146 int n_steps = HISTOGRAM_SUBS;
147
148 const int n_counts = cs_glob_n_ranks;
149
150 int *r_count;
151 BFT_MALLOC(r_count, cs_glob_n_ranks, int);
152
153 r_count[0] = count;
154
155 #if defined(HAVE_MPI)
156 if (cs_glob_n_ranks > 1)
157 MPI_Allgather(&count, 1, MPI_INT,
158 r_count, 1, MPI_INT, cs_glob_mpi_comm);
159 #endif
160
161 /* Compute local min and max */
162
163 count_max = r_count[0];
164 count_min = r_count[0];
165 for (i = 1; i < n_counts; i++) {
166 count_min = CS_MIN(count_min, r_count[i]);
167 count_max = CS_MAX(count_max, r_count[i]);
168 }
169
170 cs_log_printf(log_id, _(" minimum count = %10d\n"), count_min);
171 cs_log_printf(log_id, _(" maximum count = %10d\n\n"), count_max);
172
173 /* Define axis subdivisions */
174
175 for (j = 0; j < n_steps; j++)
176 h_count[j] = 0;
177
178 if (count_max - count_min > 0) {
179
180 if (count_max-count_min < n_steps)
181 n_steps = CS_MAX(1, floor(count_max-count_min));
182
183 step = (double)(count_max - count_min) / n_steps;
184
185 /* Loop on counts */
186
187 for (i = 0; i < n_counts; i++) {
188
189 /* Associated subdivision */
190
191 for (j = 0, k = 1; k < n_steps; j++, k++) {
192 if (r_count[i] < count_min + k*step)
193 break;
194 }
195 h_count[j] += 1;
196
197 }
198
199 for (i = 0, j = 1; i < n_steps - 1; i++, j++)
200 cs_log_printf(log_id, " %3d : [ %10d ; %10d [ = %10d\n",
201 i+1,
202 (int)(count_min + i*step),
203 (int)(count_min + j*step),
204 h_count[i]);
205
206 cs_log_printf(log_id, " %3d : [ %10d ; %10d ] = %10d\n",
207 n_steps,
208 (int)(count_min + (n_steps - 1)*step),
209 count_max,
210 h_count[n_steps - 1]);
211
212 }
213
214 else { /* if (count_max == count_min) */
215 cs_log_printf(log_id, " %3d : [ %10d ; %10d ] = %10d\n",
216 1, count_min, count_max, n_counts);
217 }
218
219 BFT_FREE(r_count);
220 }
221
222 /*----------------------------------------------------------------------------*/
223 /*!
224 * \brief Sort and compact local matrix elements.
225 *
226 * \param[in, out] ma pointer to matrix assembler structure
227 *
228 * This function should be called by a single thread for a given assembler.
229 */
230 /*----------------------------------------------------------------------------*/
231
232 static void
233 _sort_and_compact_local(cs_matrix_assembler_t  *ma)
234 {
235 /* Check if we are not already sorted */
236
237 cs_lnum_t n_rows = ma->n_rows;
238 bool ordered = true;
239
240 for (cs_lnum_t i = 0; i < n_rows && ordered; i++) {
241 cs_lnum_t *col_id = ma->_c_id + ma->_r_idx[i];
242 cs_lnum_t n_cols = ma->_r_idx[i+1] - ma->_r_idx[i];
243 for (cs_lnum_t j = 1; j < n_cols; j++) {
244 if (col_id[j] <= col_id[j-1])
245 ordered = false;
246 }
247 }
248
249 if (ordered)
250 return;
251
252 /* Sort by row */
253
254 bool direct_assembly = cs_sort_indexed(n_rows, ma->_r_idx, ma->_c_id);
255
256 /* Compact elements if necessary */
257
258 if (direct_assembly == false) {
259
260 cs_lnum_t *tmpr_idx = NULL;
261
262 BFT_MALLOC(tmpr_idx, n_rows+1, cs_lnum_t);
263 memcpy(tmpr_idx, ma->_r_idx, (n_rows+1)*sizeof(cs_lnum_t));
264
265 cs_lnum_t k = 0;
266
267 for (cs_lnum_t i = 0; i < n_rows; i++) {
268 cs_lnum_t *col_id = ma->_c_id + ma->_r_idx[i];
269 cs_lnum_t n_cols = ma->_r_idx[i+1] - ma->_r_idx[i];
270 cs_lnum_t col_id_prev = -1;
271 ma->_r_idx[i] = k;
272 for (cs_lnum_t j = 0; j < n_cols; j++) {
273 if (col_id_prev != col_id[j]) {
274 ma->_c_id[k++] = col_id[j];
275 col_id_prev = col_id[j];
276 }
277 }
278 }
279 ma->_r_idx[n_rows] = k;
280
281 assert(ma->_r_idx[n_rows] < tmpr_idx[n_rows]);
282
283 BFT_FREE(tmpr_idx);
284 BFT_REALLOC(ma->_c_id, (ma->_r_idx[n_rows]), cs_lnum_t);
285 ma->c_id = ma->_c_id;
286
287 }
288 }
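
/*
  Illustration (hypothetical data): with two rows containing unsorted and
  duplicate insertions,

    _r_idx = {0, 3, 5}    _c_id = {2, 0, 2,  1, 1}

  _sort_and_compact_local() leads to

    _r_idx = {0, 2, 3}    _c_id = {0, 2,  1}

  i.e. each row keeps its unique column ids in increasing order, and the
  index is updated accordingly.
*/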
289
290 #if defined(HAVE_MPI)
291
292 /*----------------------------------------------------------------------------*/
293 /*!
294 * \brief Binary search for a given global id in a given array of
295 * ordered global ids, when the id might not be present
296 *
297  * The id may not be present in the array, in which case -1 is returned.
298  *
299  * \param[in]  g_id_array_size  size of g_id_array
300  * \param[in]  g_id             global id to search for
301  * \param[in]  g_id_array       ordered unique global ids array
302 *
303 * \return index of g_id in g_id_array, or -1 if not found
304 */
305 /*----------------------------------------------------------------------------*/
306
307 static inline cs_lnum_t
308 _g_id_binary_search(cs_lnum_t        g_id_array_size,
309 cs_gnum_t g_id,
310 const cs_gnum_t g_id_array[])
311 {
312 cs_lnum_t start_id = 0;
313 cs_lnum_t end_id = g_id_array_size - 1;
314 cs_lnum_t mid_id = (end_id -start_id) / 2;
315 while (start_id < end_id) {
316 if (g_id_array[mid_id] < g_id)
317 start_id = mid_id + 1;
318 else if (g_id_array[mid_id] > g_id)
319 end_id = mid_id - 1;
320 else
321 break;
322 mid_id = start_id + ((end_id -start_id) / 2);
323 }
324 if (g_id_array[mid_id] != g_id)
325 mid_id = -1;
326
327 return mid_id;
328 }
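
/*
  Illustration (hypothetical data): with g_id_array = {3, 8, 15, 21},
  _g_id_binary_search(4, 15, g_id_array) returns 2, while
  _g_id_binary_search(4, 10, g_id_array) returns -1, since 10 is absent.
*/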
329
330 /*----------------------------------------------------------------------------*/
331 /*!
332 * \brief Use binary search to determine to which range a global id belongs
333 *
334 * \param[in] n_ranges number of distant ranges
335 * \param[in] d_ranges global id ranges [min, max[ for neighboring ranks
336  * \param[in]  g_id      global id to search for
337 *
338 * \return id of matching distant range, or -2 if not found
339 */
340 /*----------------------------------------------------------------------------*/
341
342 static inline int
343 _g_id_rank_index(int              n_ranges,
344 const cs_gnum_t d_ranges[],
345 cs_gnum_t g_id)
346 {
347 cs_lnum_t start_id = 0;
348 cs_lnum_t end_id = n_ranges - 1;
349 cs_lnum_t mid_id = (end_id -start_id) / 2;
350
351 while (start_id < end_id) {
352 if (d_ranges[mid_id*2+1] <= g_id)
353 start_id = mid_id + 1;
354 else if (d_ranges[mid_id*2] > g_id)
355 end_id = mid_id - 1;
356 else
357 break;
358 mid_id = start_id + ((end_id -start_id) / 2);
359 }
360
361 if (g_id < d_ranges[mid_id*2] || g_id >= d_ranges[mid_id*2+1])
362 mid_id = -2;
363
364 return mid_id;
365 }
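
/*
  Illustration (hypothetical data): with 3 neighbor ranks whose ranges are
  d_ranges = {0,4, 4,9, 9,12}, _g_id_rank_index(3, d_ranges, 5) returns 1
  (since 4 <= 5 < 9), while _g_id_rank_index(3, d_ranges, 12) returns -2,
  as 12 falls outside all listed ranges.
*/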
366
367 /*----------------------------------------------------------------------------*/
368 /*!
369 * \brief Sort and compact distant matrix elements.
370 *
371 * \param[in, out] ma pointer to matrix assembler structure
372 *
373 * This function should be called by a single thread for a given assembler.
374 */
375 /*----------------------------------------------------------------------------*/
376
377 static void
378 _sort_and_compact_distant(cs_matrix_assembler_t  *ma)
379 {
380 /* Check if we are not already sorted */
381
382 cs_lnum_t n_rows = ma->n_rows;
383 bool ordered = true;
384
385 for (cs_lnum_t i = 0; i < n_rows && ordered; i++) {
386 cs_gnum_t *col_id = ma->d_g_c_id + ma->d_r_idx[i];
387 cs_lnum_t n_cols = ma->d_r_idx[i+1] - ma->d_r_idx[i];
388 for (cs_lnum_t j = 1; j < n_cols; j++) {
389 if (col_id[j] <= col_id[j-1])
390 ordered = false;
391 }
392 }
393
394 if (ordered)
395 return;
396
397 /* Sort by row */
398
399 bool direct_assembly
400 = cs_sort_indexed_gnum(n_rows, ma->d_r_idx, ma->d_g_c_id);
401
402 /* Compact elements if necessary */
403
404 if (direct_assembly == false) {
405
406 cs_lnum_t *tmpr_idx = NULL;
407
408 BFT_MALLOC(tmpr_idx, n_rows+1, cs_lnum_t);
409 memcpy(tmpr_idx, ma->d_r_idx, (n_rows+1)*sizeof(cs_lnum_t));
410
411 cs_lnum_t k = 0;
412
413 for (cs_lnum_t i = 0; i < n_rows; i++) {
414 cs_gnum_t *col_g_id = ma->d_g_c_id + ma->d_r_idx[i];
415 cs_lnum_t n_cols = ma->d_r_idx[i+1] - ma->d_r_idx[i];
416 ma->d_r_idx[i] = k;
417 if (n_cols > 0)
418 ma->d_g_c_id[k++] = col_g_id[0];
419 for (cs_lnum_t j = 1; j < n_cols; j++) {
420 if (col_g_id[j] != col_g_id[j-1])
421 ma->d_g_c_id[k++] = col_g_id[j];
422 }
423 }
424 ma->d_r_idx[n_rows] = k;
425
426 assert(ma->d_r_idx[n_rows] < tmpr_idx[n_rows]);
427
428 BFT_FREE(tmpr_idx);
429 BFT_REALLOC(ma->d_g_c_id, (ma->d_r_idx[n_rows]), cs_gnum_t);
430
431 }
432 }
433
434 /*----------------------------------------------------------------------------*/
435 /*!
436 * \brief Add terms to local matrix elements defined by distant ranks
437 *
438 * \param[in, out] ma pointer to matrix assembler structure
439 * \param[in] n number of added column and row couples
440 * \param[in] l_ij added couples
441 *
442 * This function should be called by a single thread for a given assembler.
443 */
444 /*----------------------------------------------------------------------------*/
445
446 static void
447 _complete_local(cs_matrix_assembler_t  *ma,
448 cs_lnum_t n,
449 cs_lnum_2_t l_ij[])
450 {
451 /* Count maximum terms to add for each row */
452
453 cs_lnum_t n_rows = ma->n_rows;
454
455 cs_lnum_t *l_c_count, *l_r_idx;
456 BFT_MALLOC(l_c_count, n_rows, cs_lnum_t);
457
458 for (cs_lnum_t i = 0; i < n_rows; i++)
459 l_c_count[i] = 0;
460
461 for (cs_lnum_t i = 0; i < n; i++)
462 l_c_count[l_ij[i][0]] += 1;
463
464 BFT_MALLOC(l_r_idx, n_rows+1, cs_lnum_t);
465
466 l_r_idx[0] = 0;
467 for (cs_lnum_t i = 0; i < n_rows; i++)
468 l_r_idx[i+1] = l_r_idx[i] + l_c_count[i];
469
470 /* Expand matrix, starting copies from end to avoid overwrites
471 (first line untouched) */
472
473 BFT_REALLOC(ma->_c_id, ma->r_idx[n_rows] + l_r_idx[n_rows], cs_lnum_t);
474 ma->c_id = ma->_c_id;
475
476 for (cs_lnum_t i = n_rows-1; i > 0; i--) {
477 cs_lnum_t n_cols = ma->_r_idx[i+1] - ma->_r_idx[i];
478 cs_lnum_t *col_id = ma->_c_id + ma->_r_idx[i] + l_r_idx[i];
479 const cs_lnum_t *col_id_s = ma->_c_id + ma->_r_idx[i];
480 l_c_count[i] = n_cols;
481 ma->_r_idx[i+1] += l_r_idx[i+1];
482 for (cs_lnum_t j = n_cols-1; j >= 0; j--)
483 col_id[j] = col_id_s[j];
484 }
485 l_c_count[0] = ma->_r_idx[1];
486 ma->_r_idx[1] += l_r_idx[1];
487
488 BFT_FREE(l_r_idx);
489
490 /* Now add terms */
491
492 for (cs_lnum_t i = 0; i < n; i++) {
493 cs_lnum_t r_id = l_ij[i][0];
494 cs_gnum_t c_id = l_ij[i][1];
495 ma->_c_id[ma->_r_idx[r_id] + l_c_count[r_id]] = c_id;
496 l_c_count[r_id] += 1;
497 }
498 BFT_FREE(l_c_count);
499
500 /* Sort and remove duplicates */
501
502 _sort_and_compact_local(ma);
503 }
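
/*
  Illustration (hypothetical data): inserting the single couple (1, 5) into

    _r_idx = {0, 2, 4}    _c_id = {0, 3,  1, 4}

  expands row 1 in place, giving

    _r_idx = {0, 2, 5}    _c_id = {0, 3,  1, 4, 5}

  before _sort_and_compact_local() restores ordering and removes any
  duplicates introduced by the insertion.
*/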
504
505 /*----------------------------------------------------------------------------*/
506 /*!
507 * \brief Add terms to distant matrix elements.
508 *
509 * \param[in, out] ma pointer to matrix assembler structure
510 * \param[in] n number of added column and row couples
511 * \param[in] g_ij added couples
512 *
513 * This function should be called by a single thread for a given assembler.
514 */
515 /*----------------------------------------------------------------------------*/
516
517 static void
518 _complete_distant(cs_matrix_assembler_t  *ma,
519 cs_lnum_t n,
520 cs_gnum_t g_ij[])
521 {
522 /* Count maximum terms to add for each row */
523
524 cs_lnum_t n_rows = ma->n_rows;
525
526 cs_lnum_t *d_c_count, *d_r_idx;
527 BFT_MALLOC(d_c_count, n_rows, cs_lnum_t);
528 BFT_MALLOC(d_r_idx, n_rows+1, cs_lnum_t);
529
530 for (cs_lnum_t i = 0; i < n_rows; i++)
531 d_c_count[i] = 0;
532
533 /* Only local rows should be encountered here; local column elements
534 are ignored here (we will later check they do not require a local
535 insertion, which is not expected, as we assume the local rank
536 should already have provided a contribution to a local entry) */
537
538 for (cs_lnum_t i = 0; i < n; i++) {
539 cs_gnum_t g_r_id = g_ij[i*2];
540 cs_gnum_t g_c_id = g_ij[i*2 + 1];
541 assert (g_r_id >= ma->l_range[0] && g_r_id < ma->l_range[1]);
542 if (g_c_id < ma->l_range[0] || g_c_id >= ma->l_range[1])
543 d_c_count[g_r_id - ma->l_range[0]] += 1;
544 }
545
546 d_r_idx[0] = 0;
547 for (cs_lnum_t i = 0; i < n_rows; i++)
548 d_r_idx[i+1] = d_r_idx[i] + d_c_count[i];
549
550 /* Expand matrix, starting copies from end to avoid overwrites
551 (first line untouched) */
552
553 BFT_REALLOC(ma->d_g_c_id, ma->d_r_idx[n_rows] + d_r_idx[n_rows], cs_gnum_t);
554
555 for (cs_lnum_t i = n_rows-1; i > 0; i--) {
556 cs_lnum_t n_cols = ma->d_r_idx[i+1] - ma->d_r_idx[i];
557 cs_gnum_t *col_g_id_d = ma->d_g_c_id + ma->d_r_idx[i] + d_r_idx[i];
558 const cs_gnum_t *col_g_id_s = ma->d_g_c_id + ma->d_r_idx[i];
559 d_c_count[i] = n_cols;
560 ma->d_r_idx[i+1] += d_r_idx[i+1];
561 for (cs_lnum_t j = n_cols-1; j >= 0; j--)
562 col_g_id_d[j] = col_g_id_s[j];
563 }
564 if (n_rows > 0) {
565 d_c_count[0] = ma->d_r_idx[1];
566 ma->d_r_idx[1] += d_r_idx[1];
567 }
568
569 BFT_FREE(d_r_idx);
570
571 /* Now add terms */
572
573 for (cs_lnum_t i = 0; i < n; i++) {
574 cs_gnum_t g_r_id = g_ij[i*2];
575 cs_gnum_t g_c_id = g_ij[i*2 + 1];
576 if (g_c_id < ma->l_range[0] || g_c_id >= ma->l_range[1]) {
577 cs_lnum_t l_r_id = g_r_id - ma->l_range[0];
578 ma->d_g_c_id[ma->d_r_idx[l_r_id] + d_c_count[l_r_id]] = g_c_id;
579 d_c_count[l_r_id] += 1;
580 }
581 }
582 BFT_FREE(d_c_count);
583
584 /* Sort and remove duplicates */
585
586 _sort_and_compact_distant(ma);
587 }
588
589 /*----------------------------------------------------------------------------*/
590 /*!
591 * \brief Append distant row ids to local row ids.
592 *
593 * As column ids remain ordered in each row, local columns appear first,
594 * and ids matching distant columns second.
595 *
596 * The resulting structure is expected to be best suited for matrix-vector
597 * products, or row extractions, as we do not need to handle separate
598  * matrices or multiple loops.
599 *
600 * For assembly, the extended part is also maintained separately. For a given
601 * column appearing on row i at extended position j (i.e. ma->d_r_idx[i] + j),
602 * its matching position in the extended matrix will be:
603  * ma->r_idx[i+1] - (ma->d_r_idx[i+1] - ma->d_r_idx[i]) + j.
604 *
605 * \param[in, out] ma pointer to matrix assembler structure
606 * \param[in] n_e_g_ids number of unique external global ids
607 * \param[in] e_g_id ordered unique external global ids
608 *
609 * This function should be called by a single thread for a given assembler.
610 */
611 /*----------------------------------------------------------------------------*/
612
613 static void
614 _append_local_and_distant(cs_matrix_assembler_t  *ma,
615 cs_lnum_t n_e_g_ids,
616 const cs_gnum_t e_g_id[])
617 {
618 /* Count terms to add for each row */
619
620 cs_lnum_t n_rows = ma->n_rows;
621
622 cs_lnum_t *c_count;
623 BFT_MALLOC(c_count, n_rows, cs_lnum_t);
624
625 /* Expand matrix, starting copies from end to avoid overwrites
626 (first line untouched) */
627
628 BFT_REALLOC(ma->_c_id, ma->_r_idx[n_rows] + ma->d_r_idx[n_rows], cs_lnum_t);
629 ma->c_id = ma->_c_id;
630
631 for (cs_lnum_t i = n_rows-1; i > 0; i--) {
632 cs_lnum_t n_cols = ma->_r_idx[i+1] - ma->_r_idx[i];
633 cs_lnum_t *col_id_d = ma->_c_id + ma->_r_idx[i] + ma->d_r_idx[i];
634 const cs_lnum_t *col_id_s = ma->_c_id + ma->_r_idx[i];
635 c_count[i] = n_cols;
636 ma->_r_idx[i+1] += ma->d_r_idx[i+1];
637 for (cs_lnum_t j = n_cols-1; j >= 0; j--)
638 col_id_d[j] = col_id_s[j];
639 }
640 if (n_rows > 0) {
641 c_count[0] = ma->_r_idx[1];
642 ma->_r_idx[1] += ma->d_r_idx[1];
643 }
644
645 /* Now insert terms */
646
647 for (cs_lnum_t i = 0; i < n_rows; i++) {
648 cs_lnum_t n_d_cols = ma->d_r_idx[i+1] - ma->d_r_idx[i];
649 cs_lnum_t *col_id_d = ma->_c_id + ma->_r_idx[i+1] - n_d_cols;
650 const cs_gnum_t *col_g_id_s = ma->d_g_c_id + ma->d_r_idx[i];
651 for (cs_lnum_t j = 0; j < n_d_cols; j++) {
652 cs_gnum_t g_c_id = col_g_id_s[j];
653 cs_lnum_t k = _g_id_binary_find(n_e_g_ids, g_c_id, e_g_id);
654 col_id_d[j] = n_rows + k;
655 }
656 }
657
658 BFT_FREE(c_count);
659 }
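
/*
  Illustration: if row i has 3 local columns and 2 distant ones, then after
  this append step r_idx[i+1] - r_idx[i] = 5, and the distant column at
  extended position j (0 or 1) is found at column index
  r_idx[i+1] - (d_r_idx[i+1] - d_r_idx[i]) + j, i.e. at positions 3 and 4
  within the row.
*/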
660
661 /*----------------------------------------------------------------------------
662 * Determine assumed rank associated with a given global id.
663 *
664 * parameters:
665 * n_ranks <-- number of ranks
666 * n_g <-- global number of ids
667  *   g_id    <-- global id for which the assumed rank is sought
668 *
669 * returns:
670 * assumed rank id for g_id
671 *----------------------------------------------------------------------------*/
672
673 static inline int
674 _assumed_rank(int              n_ranks,
675 cs_gnum_t n_g,
676 const cs_gnum_t g_id)
677 {
678 int rank_id;
679
680 cs_gnum_t n_per_rank = n_g / n_ranks;
681 cs_lnum_t rmdr = n_g - n_per_rank * (cs_gnum_t)n_ranks;
682
683 assert(g_id < n_g);
684
685 if (rmdr == 0)
686 rank_id = g_id / n_per_rank;
687 else {
688 cs_gnum_t n_ranks_rmdr = n_ranks - rmdr;
689 cs_gnum_t n_ranks_n_per_rank = n_ranks_rmdr * n_per_rank;
690 if (g_id < n_ranks_n_per_rank)
691 rank_id = g_id / n_per_rank;
692 else
693 rank_id = (g_id + n_ranks_rmdr) / (n_per_rank + 1);
694 }
695
696 return rank_id;
697 }
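
/*
  Illustration: with n_ranks = 4 and n_g = 10, n_per_rank = 2 and rmdr = 2,
  so ids 0-3 are assumed to belong to ranks 0-1 (2 ids each) and ids 4-9 to
  ranks 2-3 (3 ids each); for example, _assumed_rank(4, 10, 7) returns 3.
*/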
698
699 /*----------------------------------------------------------------------------*/
700 /*!
701 * \brief Build Assumed rank neighbors info
702 *
703 * This operation is collective on communicator comm.
704 *
705 * \param[in] n number of global ids for local rank
706 * \param[in] n_g global number of ids
707 * \param[in] l_range global id range [min, max[ for local rank
708  * \param[in]  g_id     array of global ids for local rank (size: n)
709 * \param[in] comm associated communicator
710 *
711 * \return assumed rank neighbors structure
712 */
713 /*----------------------------------------------------------------------------*/
714
715 static cs_rank_neighbors_t *
716 _assumed_rank_neighbors(cs_lnum_t        n,
717 cs_gnum_t n_g,
718 cs_gnum_t l_range[2],
719 const cs_gnum_t g_id[],
720 MPI_Comm comm)
721 {
722 /* Prepare for determination of assumed rank;
723 we will send both a partial local range description and global ids to the
724 assumed partition, using a common array, so as to minimize
725 all-to-all communication. */
726
727 int n_ranks, l_rank;
728 int *a_rank;
729
730 MPI_Comm_size(comm, &n_ranks);
731 MPI_Comm_rank(comm, &l_rank);
732
733 /* Build real to assumed partition mapping */
734
735 cs_lnum_t l_range_size = l_range[1] - l_range[0];
736
737 BFT_MALLOC(a_rank, l_range_size + n, int);
738
739 cs_lnum_t n_a_neighbors = 0, count = 0;
740
741 /* Ranks to send real partition info to */
742
743 int r_prev = -1;
744
745 for (cs_lnum_t i = 0; i < l_range_size; i++) {
746 cs_gnum_t _g_id = l_range[0] + (cs_gnum_t)i;
747 int r_dest = _assumed_rank(n_ranks, n_g, _g_id);
748 if (r_dest != r_prev && r_dest != l_rank) {
749 a_rank[n_a_neighbors++] = r_dest;
750 r_prev = r_dest;
751 }
752 }
753
754 count = n_a_neighbors;
755
756 /* Ranks to send real data to; assuming ids are mostly
757 ordered, we limit the size of sent data. */
758
759 for (cs_lnum_t i = 0; i < n; i++) {
760 int r_dest = _assumed_rank(n_ranks, n_g, g_id[i]);
761 if (r_dest != r_prev && r_dest != l_rank) {
762 a_rank[count++] = r_dest;
763 r_prev = r_dest;
764 }
765 }
766
767 /* Build communicating rank neighbors structure;
768 symmetrize the structure to tell ranks from which other
769 ranks they may receive info */
770
771 cs_rank_neighbors_t *arn = cs_rank_neighbors_create(count, a_rank);
772
773 BFT_FREE(a_rank);
774
775 cs_rank_neighbors_symmetrize(arn, comm);
776
777 return arn;
778 }
779
780 /*----------------------------------------------------------------------------*/
781 /*!
782 * \brief Exchange range info for building assumed rank neighbors info
783 *
784 * This operation is collective on communicator comm. The caller is
785  * responsible for freeing the returned array.
786 *
787 * \param[in] arn info on ranks with which we will communicate
788 * \param[in] l_range global id range [min, max[ for local rank
789 * \param[in] comm associated communicator
790 *
791  * \return  global id ranges [min, max[ for each rank neighbor (size: arn->size*2)
792 */
793 /*----------------------------------------------------------------------------*/
794
795 static cs_gnum_t *
796 _rank_ranges_exchange(cs_rank_neighbors_t  *arn,
797 cs_gnum_t l_range[2],
798 MPI_Comm comm)
799 {
800 MPI_Request *request = NULL;
801 MPI_Status *status = NULL;
802 cs_gnum_t *d_ranges = NULL;
803
804 BFT_MALLOC(d_ranges, arn->size*2, cs_gnum_t);
805 BFT_MALLOC(request, arn->size*2, MPI_Request);
806 BFT_MALLOC(status, arn->size*2, MPI_Status);
807
808 /* Prepare for determination of assumed rank */
809
810 /* Exchange local range with neighbor ranks */
811
812 int request_count = 0;
813 const int local_rank = cs_glob_rank_id;
814
815 for (int i = 0; i < arn->size; i++) {
816 MPI_Irecv(d_ranges + i*2,
817 2,
818 CS_MPI_GNUM,
819 arn->rank[i],
820 local_rank,
821 comm,
822 &(request[request_count++]));
823 }
824
825 for (int i = 0; i < arn->size; i++) {
826 MPI_Isend(l_range,
827 2,
828 CS_MPI_GNUM,
829 arn->rank[i],
830 arn->rank[i],
831 comm,
832 &(request[request_count++]));
833 }
834
835 MPI_Waitall(request_count, request, status);
836
837 BFT_FREE(request);
838 BFT_FREE(status);
839
840 return d_ranges;
841 }
842
843 /*----------------------------------------------------------------------------*/
844 /*!
845 * \brief Build true rank info based on assumed rank info
846 *
847 * This operation is collective on communicator comm.
848 *
849 * The caller is responsible for freeing the returned array
850 *
851 * \param[in] arn info on ranks with which we initially communicate
852 * \param[in] n number of global ids for local rank
853 * \param[in] n_g global number of ids
854 * \param[in] l_range global id range [min, max[ for local rank
855 * \param[in] d_ranges global id ranges [min, max[ for neighboring ranks
856 * \param[in] g_id sorted array of global ids for local rank
857 * (size: n)
858 * \param[in] comm associated communicator
859 *
860 * \return effective ranks matching given global ids (size n)
861 */
862 /*----------------------------------------------------------------------------*/
863
864 static int *
865 _assumed_to_true_rank(cs_rank_neighbors_t  *arn,
866 cs_lnum_t n,
867 cs_gnum_t n_g,
868 const cs_gnum_t l_range[2],
869 const cs_gnum_t d_ranges[],
870 const cs_gnum_t g_id[],
871 MPI_Comm comm)
872 {
873 int *d_rank;
874 BFT_MALLOC(d_rank, n, int);
875
876 MPI_Request *request = NULL;
877 MPI_Status *status = NULL;
878
879 const int local_rank = cs_glob_rank_id;
880
881 /* Prepare for determination of assumed rank;
882 we will send both a partial local range description and global ids to the
883 assumed partition, using a common array, so as to minimize
884 all-to-all communication. */
885
886 int n_ranks, l_rank;
887 int *a_rank;
888
889 MPI_Comm_size(comm, &n_ranks);
890 MPI_Comm_rank(comm, &l_rank);
891
892 BFT_MALLOC(request, arn->size*2, MPI_Request);
893 BFT_MALLOC(status, arn->size*2, MPI_Status);
894
895 /* Determine ranks with which we will first exchange;
896 we filter and immediately handle elements whose assumed
897 rank matches the current rank */
898
899 BFT_MALLOC(a_rank, n, int);
900
901 cs_lnum_t n_range_match = 0;
902 cs_lnum_t n_range_dist = 0;
903
904 int a_rank_prev = -1;
905 for (cs_lnum_t i = 0; i < n; i++) {
906 int r = _assumed_rank(n_ranks, n_g, g_id[i]);
907 if (r == local_rank) {
908 int r_idx = _g_id_rank_index(arn->size, d_ranges, g_id[i]);
909 assert(r_idx >= 0);
910 d_rank[i] = arn->rank[r_idx];
911 n_range_match += 1;
912 }
913 else {
914 d_rank[i] = -1; /* flag for later */
915 a_rank[n_range_dist] = r;
916 n_range_dist += 1;
917 }
918 assert(r >= a_rank_prev);
919 a_rank_prev = r;
920 }
921
922 cs_rank_neighbors_to_index(arn, n_range_dist, a_rank, a_rank);
923
924 /* Count number of exchanges with each assumed rank
925 and build index, first handled as counts;
926 local values have index -1 */
927
928 cs_lnum_t *send_index;
929 BFT_MALLOC(send_index, arn->size+1, cs_lnum_t);
930
931 send_index[0] = 0;
932 cs_rank_neighbors_count(arn, n_range_dist, a_rank, send_index + 1);
933
934 BFT_FREE(a_rank);
935
936 /* Exchange counts */
937
938 cs_lnum_t *recv_index;
939 BFT_MALLOC(recv_index, arn->size + 1, cs_lnum_t);
940
941 recv_index[0] = 0;
942
943 int request_count = 0;
944
945 for (int i = 0; i < arn->size; i++) {
946 MPI_Irecv(recv_index + i + 1,
947 1,
948 CS_MPI_LNUM,
949 arn->rank[i],
950 local_rank,
951 comm,
952 &(request[request_count++]));
953 }
954
955 for (int i = 0; i < arn->size; i++) {
956 MPI_Isend(send_index + i + 1,
957 1,
958 CS_MPI_LNUM,
959 arn->rank[i],
960 arn->rank[i],
961 comm,
962 &(request[request_count++]));
963 }
964
965 MPI_Waitall(request_count, request, status);
966
967 /* Now transform counts to indexes */
968
969 for (int i = 0; i < arn->size; i++)
970 send_index[i+1] += send_index[i];
971
972 for (int i = 0; i < arn->size; i++)
973 recv_index[i+1] += recv_index[i];
974
975 /* Now exchange global ids */
976
977 const cs_lnum_t recv_size = recv_index[arn->size];
978
979 cs_gnum_t *recv_g_id;
980 BFT_MALLOC(recv_g_id, recv_size, cs_gnum_t);
981
982 request_count = 0;
983
984 for (int i = 0; i < arn->size; i++) {
985 if (recv_index[i+1] > recv_index[i])
986 MPI_Irecv(recv_g_id + recv_index[i],
987 recv_index[i+1] - recv_index[i],
988 CS_MPI_GNUM,
989 arn->rank[i],
990 local_rank,
991 comm,
992 &(request[request_count++]));
993 }
994
995 for (int i = 0; i < arn->size; i++) {
996 if (send_index[i+1] > send_index[i]) {
997 cs_lnum_t shift = (arn->rank[i] > local_rank) ? n_range_match : 0;
998 MPI_Isend(g_id + send_index[i] + shift,
999 send_index[i+1] - send_index[i],
1000 CS_MPI_GNUM,
1001 arn->rank[i],
1002 arn->rank[i],
1003 comm,
1004 &(request[request_count++]));
1005 }
1006 }
1007
1008 MPI_Waitall(request_count, request, status);
1009
1010 /* Now for each global id, determine its rank */
1011
1012 int *d_rank_ar;
1013 BFT_MALLOC(d_rank_ar, recv_size, int);
1014
1015 for (cs_lnum_t i = 0; i < recv_size; i++) {
1016 cs_gnum_t _g_id = recv_g_id[i];
1017 if (l_range[0] <= _g_id && l_range[1] > _g_id)
1018 d_rank_ar[i] = local_rank;
1019 else {
1020 int r_idx = _g_id_rank_index(arn->size, d_ranges, recv_g_id[i]);
1021 assert(r_idx >= 0);
1022 d_rank_ar[i] = arn->rank[r_idx];
1023 assert(d_rank_ar[i] >= 0);
1024 }
1025 }
1026
1027 BFT_FREE(recv_g_id);
1028
1029 /* Send data back to original rank;
1030 remember part of the data is already known */
1031
1032 request_count = 0;
1033
1034 for (int i = 0; i < arn->size; i++) {
1035 if (send_index[i+1] > send_index[i]) {
1036 cs_lnum_t shift = (arn->rank[i] > local_rank) ? n_range_match : 0;
1037 MPI_Irecv(d_rank + send_index[i] + shift,
1038 send_index[i+1] - send_index[i],
1039 MPI_INT,
1040 arn->rank[i],
1041 arn->rank[i],
1042 comm,
1043 &(request[request_count++]));
1044 }
1045 }
1046
1047 for (int i = 0; i < arn->size; i++) {
1048 if (recv_index[i+1] > recv_index[i])
1049 MPI_Isend(d_rank_ar + recv_index[i],
1050 recv_index[i+1] - recv_index[i],
1051 MPI_INT,
1052 arn->rank[i],
1053 local_rank,
1054 comm,
1055 &(request[request_count++]));
1056 }
1057
1058 MPI_Waitall(request_count, request, status);
1059
1060 /* Note: we could add an additional exchange here, for each
1061 connected rank, to indicate which ranks may need
1062 to communicate with it; this would avoid the need for a
1063 symmetrization of the rank neighbors structure built from
1064 the d_rank[] array, as ranks would also know who needs to
1065 send some info. An all-to-all communication with just
1066 1 value per rank could thus be replaced by a point-to-point
1067 communication, but with slightly larger messages.
1068 We are not sure the added complexity would be worth it,
1069 so at this stage, we just add this placeholder. */
1070
1071 BFT_FREE(d_rank_ar);
1072
1073 BFT_FREE(recv_index);
1074 BFT_FREE(send_index);
1075
1076 BFT_FREE(request);
1077 BFT_FREE(status);
1078
1079 return d_rank;
1080 }
1081
1082 /*----------------------------------------------------------------------------*/
1083 /*!
1084 * \brief Determine rank neighbors based on global id info
1085 *
1086 * This assumes the ordered and compact list of external global ids
1087 * has been built.
1088 *
1089 * \param[in] n_e_g_ids number of unique external global ids
1090 * \param[in] n_g global number of ids
1091 * \param[in] l_range global id range [min, max[ for local rank
1092 * \param[in] e_g_id ordered unique external global ids
1093 * \param[out] n_init_ranks number of ranks in initial assumed
1094 * neighborhood (for logging), or NULL
1095 * \param[in] comm associated communicator
1096 *
1097 * This function should be called by a single thread for a given assembler.
1098 */
1099 /*----------------------------------------------------------------------------*/
1100
1101 static cs_rank_neighbors_t *
1102 _rank_neighbors(cs_lnum_t        n_e_g_ids,
1103 cs_gnum_t n_g,
1104 cs_gnum_t l_range[2],
1105 const cs_gnum_t e_g_id[],
1106 int *n_init_ranks,
1107 MPI_Comm comm)
1108 {
1109 /* Determine assumed rank neighbors based on compact info */
1110
1111 cs_rank_neighbors_t *arn = _assumed_rank_neighbors(n_e_g_ids,
1112 n_g,
1113 l_range,
1114 e_g_id,
1115 comm);
1116
1117 if (n_init_ranks != NULL)
1118 *n_init_ranks = arn->size;
1119
1120 cs_gnum_t *d_ranges = _rank_ranges_exchange(arn,
1121 l_range,
1122 comm);
1123
1124 int *e_rank_id = _assumed_to_true_rank(arn,
1125 n_e_g_ids,
1126 n_g,
1127 l_range,
1128 d_ranges,
1129 e_g_id,
1130 comm);
1131
1132 BFT_FREE(d_ranges);
1133
1134 /* Now rebuild rank neighbors */
1135
1136 cs_rank_neighbors_destroy(&arn);
1137
1138 arn = cs_rank_neighbors_create(n_e_g_ids, e_rank_id);
1139
1140 BFT_FREE(e_rank_id);
1141
1142 cs_rank_neighbors_symmetrize(arn, comm);
1143
1144 return arn;
1145 }
1146
1147 /*----------------------------------------------------------------------------*/
1148 /*!
1149 * \brief Process and exchange info relative to data that will
1150 * be sent to neighboring ranks.
1151 *
1152 * At this stage, ma->rank will contain the index of the rank in the
1153 * rank neighborhood structure, not the true MPI rank.
1154 *
1155 * \param[in, out] ma pointer to matrix assembler structure
1156 * \param[in] e_g_ij_size size of coefficient data to send
1157 * \param[in] e_g_ij coefficient data (row_g_id, col_g_id couples)
1158 * to send
1159 */
1160 /*----------------------------------------------------------------------------*/
1161
1162 static void
1163 _process_assembly_data(cs_matrix_assembler_t  *ma,
1164 cs_gnum_t e_g_ij_size,
1165 const cs_gnum_t *e_g_ij)
1166 {
1167 ma->coeff_send_size = e_g_ij_size;
1168
1169 /* Count rows */
1170
1171 ma->coeff_send_n_rows = 0;
1172
1173 cs_gnum_t g_r_id_prev = ma->l_range[0]; /* impossible value here */
1174
1175 cs_lnum_t n_e_g_ids = 0;
1176 cs_gnum_t *e_g_id = NULL;
1177
1178 for (cs_lnum_t i = 0; i < ma->coeff_send_size; i++) {
1179 cs_gnum_t g_r_id = e_g_ij[i*2];
1180 if (g_r_id != g_r_id_prev) {
1181 n_e_g_ids += 1;
1182 ma->coeff_send_n_rows += 1;
1183 g_r_id_prev = g_r_id;
1184 }
1185 cs_gnum_t g_c_id = e_g_ij[i*2+1];
1186 if (g_c_id < ma->l_range[0] || g_c_id >= ma->l_range[1])
1187 n_e_g_ids += 1;
1188 }
1189
1190 if (ma->coeff_send_size > 0) {
1191
1192 BFT_MALLOC(e_g_id, n_e_g_ids, cs_gnum_t);
1193
1194 n_e_g_ids = 0;
1195
1196 cs_lnum_t row_count = 0;
1197
1198 BFT_MALLOC(ma->coeff_send_index, ma->coeff_send_n_rows+1, cs_lnum_t);
1199 BFT_MALLOC(ma->coeff_send_row_g_id, ma->coeff_send_n_rows, cs_gnum_t);
1200 BFT_MALLOC(ma->coeff_send_col_g_id, ma->coeff_send_size, cs_gnum_t);
1201
1202 g_r_id_prev = ma->l_range[0]; /* impossible value here */
1203
1204 for (cs_lnum_t i = 0; i < ma->coeff_send_size; i++) {
1205 cs_gnum_t g_r_id = e_g_ij[i*2];
1206 if (g_r_id != g_r_id_prev) {
1207 ma->coeff_send_index[row_count] = i;
1208 ma->coeff_send_row_g_id[row_count] = g_r_id;
1209 e_g_id[n_e_g_ids++] = g_r_id;
1210 row_count++;
1211 g_r_id_prev = g_r_id;
1212 }
1213 cs_gnum_t g_c_id = e_g_ij[i*2+1];
1214 ma->coeff_send_col_g_id[i] = g_c_id;
1215 if (g_c_id < ma->l_range[0] || g_c_id >= ma->l_range[1]) {
1216 e_g_id[n_e_g_ids++] = g_c_id;
1217 }
1218 }
1219 ma->coeff_send_index[row_count] = ma->coeff_send_size;
1220
1221 n_e_g_ids = cs_sort_and_compact_gnum(n_e_g_ids, e_g_id);
1222
1223 }
1224
1225 /* Determine ranks we may be communicating with for this stage */
1226
1227 cs_rank_neighbors_t *arn = _rank_neighbors(n_e_g_ids,
1228 ma->n_g_rows,
1229 ma->l_range,
1230 e_g_id,
1231 &(ma->n_ranks_init[0]),
1232 ma->comm);
1233
1234 BFT_FREE(e_g_id);
1235
1236 cs_gnum_t *d_ranges = _rank_ranges_exchange(arn, ma->l_range, ma->comm);
1237
1238 /* Prepare counts */
1239
1240 cs_lnum_t *counts;
1241 BFT_MALLOC(counts, arn->size*2, cs_lnum_t);
1242
1243 for (int i = 0; i < arn->size*2; i++)
1244 counts[i] = 0;
1245
1246 for (cs_lnum_t i = 0; i < ma->coeff_send_n_rows; i++) {
1247 cs_gnum_t g_r_id = ma->coeff_send_row_g_id[i];
1248 int r_rank_id = _g_id_rank_index(arn->size, d_ranges, g_r_id);
1249 assert(r_rank_id >= 0 && r_rank_id < arn->size);
1250 assert( g_r_id >= d_ranges[2*r_rank_id]
1251 && g_r_id < d_ranges[2*r_rank_id+1]);
1252 counts[r_rank_id] += ma->coeff_send_index[i+1] - ma->coeff_send_index[i];
1253 }
1254
1255 BFT_FREE(d_ranges);
1256
1257 /* Exchange counts */
1258
1259 MPI_Request *request = NULL;
1260 MPI_Status *status = NULL;
1261
1262 BFT_MALLOC(request, arn->size*2, MPI_Request);
1263 BFT_MALLOC(status, arn->size*2, MPI_Status);
1264
1265 const int local_rank = cs_glob_rank_id;
1266
1267 int request_count = 0;
1268
1269 for (int i = 0; i < arn->size; i++) {
1270 MPI_Irecv(counts + arn->size + i,
1271 1,
1272 CS_MPI_LNUM,
1273 arn->rank[i],
1274 local_rank,
1275 ma->comm,
1276 &(request[request_count++]));
1277 }
1278
1279 for (int i = 0; i < arn->size; i++) {
1280 MPI_Isend(counts + i,
1281 1,
1282 CS_MPI_LNUM,
1283 arn->rank[i],
1284 arn->rank[i],
1285 ma->comm,
1286 &(request[request_count++]));
1287 }
1288
1289 MPI_Waitall(request_count, request, status);
1290
1291 /* Save exchange rank info */
1292
1293 ma->n_coeff_ranks = 0;
1294 for (int i = 0; i < arn->size; i++) {
1295 if (counts[i] + counts[arn->size + i] > 0)
1296 ma->n_coeff_ranks += 1;
1297 }
1298
1299 BFT_MALLOC(ma->coeff_rank, ma->n_coeff_ranks, int);
1300
1301 if (ma->n_coeff_ranks > 0) {
1302
1303 BFT_MALLOC(ma->coeff_rank_send_index, ma->n_coeff_ranks+1, cs_lnum_t);
1304 BFT_MALLOC(ma->coeff_rank_recv_index, ma->n_coeff_ranks+1, cs_lnum_t);
1305
1306 ma->coeff_rank_send_index[0] = 0;
1307 ma->coeff_rank_recv_index[0] = 0;
1308
1309 ma->n_coeff_ranks = 0;
1310 for (int i = 0; i < arn->size; i++) {
1311 if (counts[i] + counts[arn->size + i] > 0) {
1312 int j = ma->n_coeff_ranks;
1313 ma->coeff_rank[j] = arn->rank[i];
1314 ma->coeff_rank_send_index[j+1]
1315 = ma->coeff_rank_send_index[j] + counts[i];
1316 ma->coeff_rank_recv_index[j+1]
1317 = ma->coeff_rank_recv_index[j] + counts[arn->size + i];
1318 ma->n_coeff_ranks += 1;
1319 }
1320 }
1321
1322 }
1323
1324 BFT_FREE(counts);
1325
1326   /* We don't need the rank neighbors structure anymore
1327 (we'll use the saved and compacted info from now on) */
1328
1329 cs_rank_neighbors_destroy(&arn);
1330
1331 if (ma->n_coeff_ranks > 0) {
1332
1333 /* Exchange data */
1334
1335 ma->coeff_recv_size = ma->coeff_rank_recv_index[ma->n_coeff_ranks];
1336
1337 cs_gnum_t *recv_data;
1338 BFT_MALLOC(recv_data, ma->coeff_recv_size*2, cs_gnum_t);
1339
1340 request_count = 0;
1341
1342 for (int i = 0; i < ma->n_coeff_ranks; i++) {
1343 cs_lnum_t l_size = ma->coeff_rank_recv_index[i+1]
1344 - ma->coeff_rank_recv_index[i];
1345 if (l_size > 0) {
1346 cs_lnum_t recv_shift = ma->coeff_rank_recv_index[i]*2;
1347 MPI_Irecv(recv_data + recv_shift,
1348 l_size*2,
1349 CS_MPI_GNUM,
1350 ma->coeff_rank[i],
1351 local_rank,
1352 ma->comm,
1353 &(request[request_count++]));
1354 }
1355 }
1356
1357 for (int i = 0; i < ma->n_coeff_ranks; i++) {
1358 cs_lnum_t l_size = ma->coeff_rank_send_index[i+1]
1359 - ma->coeff_rank_send_index[i];
1360 if (l_size > 0) {
1361 cs_lnum_t send_shift = ma->coeff_rank_send_index[i]*2;
1362 MPI_Isend(e_g_ij + send_shift,
1363 l_size*2,
1364 CS_MPI_GNUM,
1365 ma->coeff_rank[i],
1366 ma->coeff_rank[i],
1367 ma->comm,
1368 &(request[request_count++]));
1369 }
1370 }
1371
1372 MPI_Waitall(request_count, request, status);
1373
1374 /* Insert terms at local rows */
1375
1376 if (ma->coeff_recv_size > 0)
1377 _complete_distant(ma, ma->coeff_recv_size, recv_data);
1378
1379 /* Now we build the mapping from received terms to their positions
1380        in the array. For indexing, we implicitly consider that each row
1381 is built of 2 sub-rows, with the local columns first, and distant
1382 columns second. */
1383
1384 BFT_MALLOC(ma->coeff_recv_row_id, ma->coeff_recv_size, cs_lnum_t);
1385
1386 if (ma->flags & CS_MATRIX_DISTANT_ROW_USE_COL_IDX) {
1387
1388 cs_lnum_t n_l_insert = 0, n_l_insert_max = 0;
1389 cs_lnum_2_t *l_insert = NULL;
1390
1391 BFT_MALLOC(ma->coeff_recv_col_idx, ma->coeff_recv_size, cs_lnum_t);
1392
1393 /* First pass: determine local row ids, and check if insertion of
1394 previously unknown entries is required;
1395 column ids are set for local entries */
1396
1397 for (cs_lnum_t i = 0; i < ma->coeff_recv_size; i++) {
1398
1399 cs_lnum_t l_r_id = recv_data[i*2] - ma->l_range[0];
1400 cs_gnum_t g_c_id = recv_data[i*2+1];
1401
1402 ma->coeff_recv_row_id[i] = l_r_id;
1403
1404 /* Local part */
1405
1406 if (g_c_id >= ma->l_range[0] && g_c_id < ma->l_range[1]) {
1407
1408 cs_lnum_t n_cols = ma->r_idx[l_r_id+1] - ma->r_idx[l_r_id];
1409 cs_lnum_t l_c_id = g_c_id - ma->l_range[0];
1410 const cs_lnum_t *col_id = ma->c_id + ma->r_idx[l_r_id];
1411
1412 ma->coeff_recv_col_idx[i] = _l_id_binary_search(n_cols,
1413 l_c_id,
1414 col_id);
1415
1416 /* special case for separate diagonal, other cases require insertion */
1417
1418 if ( ma->coeff_recv_col_idx[i] < 0
1419 && (!ma->separate_diag || l_c_id != l_r_id)) {
1420
1421 if (n_l_insert >= n_l_insert_max) {
1422 n_l_insert_max = CS_MAX(n_l_insert_max*2, 16);
1423 BFT_REALLOC(l_insert, n_l_insert_max, cs_lnum_2_t);
1424 }
1425
1426 l_insert[n_l_insert][0] = l_r_id;
1427 l_insert[n_l_insert][1] = l_c_id;
1428 n_l_insert++;
1429
1430 }
1431
1432 }
1433
1434 }
1435
1436 /* Insert additional local terms if required */
1437
1438 if (n_l_insert > 0) {
1439
1440 _complete_local(ma, n_l_insert, l_insert);
1441
1442 BFT_FREE(l_insert);
1443 n_l_insert_max = 0;
1444
1445 }
1446
1447 /* Second pass: determine local column ids, and check if insertion of
1448 previously unknown entries is required */
1449
1450 for (cs_lnum_t i = 0; i < ma->coeff_recv_size; i++) {
1451
1452 cs_lnum_t l_r_id = recv_data[i*2] - ma->l_range[0];
1453 cs_gnum_t g_c_id = recv_data[i*2+1];
1454
1455 /* Local part */
1456
1457 if (g_c_id >= ma->l_range[0] && g_c_id < ma->l_range[1]) {
1458
1459 if (n_l_insert == 0) /* Already up-to-date if no insertion */
1460 continue;
1461
1462 cs_lnum_t n_cols = ma->r_idx[l_r_id+1] - ma->r_idx[l_r_id];
1463 cs_lnum_t l_c_id = g_c_id - ma->l_range[0];
1464 const cs_lnum_t *col_id = ma->c_id + ma->r_idx[l_r_id];
1465
1466 ma->coeff_recv_col_idx[i] = _l_id_binary_search(n_cols,
1467 l_c_id,
1468 col_id);
1469
1470 assert( ma->coeff_recv_col_idx[i] >= 0
1471 || (ma->separate_diag && l_c_id == l_r_id));
1472
1473 }
1474
1475 /* Distant part */
1476
1477 else {
1478
1479 cs_lnum_t n_d_cols = ma->d_r_idx[l_r_id+1] - ma->d_r_idx[l_r_id];
1480 cs_lnum_t n_cols = ma->r_idx[l_r_id+1] - ma->r_idx[l_r_id];
1481 const cs_gnum_t *col_g_id = ma->d_g_c_id + ma->d_r_idx[l_r_id];
1482
1483 cs_lnum_t d_c_idx = _g_id_binary_find(n_d_cols,
1484 g_c_id,
1485 col_g_id);
1486
1487         /* distant column ids start after the end of the local row, so add
1488            n_cols (local part only, matrices are not appended at this stage) */
1489 ma->coeff_recv_col_idx[i] = d_c_idx + n_cols;
1490 }
1491
1492 }
1493
1494 }
1495
1496 if (ma->flags & CS_MATRIX_DISTANT_ROW_USE_COL_G_ID) {
1497
1498 BFT_MALLOC(ma->coeff_recv_col_g_id, ma->coeff_recv_size, cs_gnum_t);
1499
1500 for (cs_lnum_t i = 0; i < ma->coeff_recv_size; i++) {
1501 ma->coeff_recv_row_id[i] = recv_data[i*2] - ma->l_range[0];
1502 ma->coeff_recv_col_g_id[i] = recv_data[i*2+1];
1503 }
1504
1505 }
1506
1507 BFT_FREE(recv_data);
1508 }
1509
1510 BFT_FREE(request);
1511 BFT_FREE(status);
1512 }
1513
1514 /*----------------------------------------------------------------------------*/
1515 /*!
1516 * \brief Build vector halo structure associated with a matrix assembler.
1517 *
1518 * This assumes the ordered and compact list of external global ids
1519 * has been built.
1520  *
1521 * \param[in, out] ma pointer to matrix assembler structure
1522 * \param[in] n_e_g_ids number of unique external global ids
1523 * \param[in] e_g_id ordered unique external global ids
1524 *
1525 * This function should be called by a single thread for a given assembler.
1526 */
1527 /*----------------------------------------------------------------------------*/
1528
1529 static void
1530 _matrix_assembler_compute_halo(cs_matrix_assembler_t  *ma,
1531 cs_lnum_t n_e_g_ids,
1532 const cs_gnum_t e_g_id[])
1533 {
1534 /* Determine assumed rank neighbors based on compact info */
1535
1536 cs_rank_neighbors_t *arn = _rank_neighbors(n_e_g_ids,
1537 ma->n_g_rows,
1538 ma->l_range,
1539 e_g_id,
1540 &(ma->n_ranks_init[1]),
1541 ma->comm);
1542
1543 cs_gnum_t *d_ranges = _rank_ranges_exchange(arn, ma->l_range, ma->comm);
1544
1545   /* Identify rank and local id associated with each element */
1546
1547 int *rank_id;
1548 cs_lnum_t *r_loc_id;
1549 BFT_MALLOC(rank_id, n_e_g_ids, int);
1550 BFT_MALLOC(r_loc_id, n_e_g_ids, cs_lnum_t);
1551
1552 # pragma omp parallel for if(n_e_g_ids > CS_THR_MIN)
1553 for (cs_lnum_t i = 0; i < n_e_g_ids; i++) {
1554 cs_gnum_t g_id = e_g_id[i];
1555 int r_id = _g_id_rank_index(arn->size, d_ranges, g_id);
1556 assert(r_id > -1);
1557 assert(g_id >= d_ranges[r_id*2]);
1558 r_loc_id[i] = g_id - d_ranges[r_id*2];
1559 rank_id[i] = r_id;
1560 }
1561
1562 BFT_FREE(d_ranges);
1563
1564 ma->_halo = cs_halo_create_from_rank_neighbors(arn,
1565 ma->n_rows,
1566 n_e_g_ids,
1567 rank_id,
1568 r_loc_id);
1569 ma->halo = ma->_halo;
1570
1571 BFT_FREE(r_loc_id);
1572 BFT_FREE(rank_id);
1573
1574 cs_rank_neighbors_destroy(&arn);
1575 }
1576
1577 /*----------------------------------------------------------------------------*/
1578 /*!
1579 * \brief Compute internal structures required by a matrix assembler
1580 * in parallel mode.
1581 *
1582 * \param[in, out] ma pointer to matrix assembler structure
1583 *
1584 * This function should be called by a single thread for a given assembler.
1585 */
1586 /*----------------------------------------------------------------------------*/
1587
1588 static void
1589 _matrix_assembler_compute_mpi(cs_matrix_assembler_t  *ma)
1590 {
1591 /* Compute number of rows */
1592
1593 cs_lnum_t n_rows = 0;
1594 if (ma->l_range[1] > ma->l_range[0])
1595 n_rows = ma->l_range[1] - ma->l_range[0];
1596
1597 MPI_Allreduce(ma->l_range+1, &(ma->n_g_rows), 1, CS_MPI_GNUM,
1598 MPI_MAX, ma->comm);
1599
1600 ma->n_rows = n_rows;
1601
1602   /* Separate local from distant rows: counting step */
1603
1604 cs_lnum_t e_size = 0;
1605
1606 cs_lnum_t *l_c_count, *d_c_count;
1607
1608 BFT_MALLOC(ma->_r_idx, n_rows+1, cs_lnum_t);
1609 BFT_MALLOC(ma->d_r_idx, n_rows+1, cs_lnum_t);
1610 ma->r_idx = ma->_r_idx;
1611
1612 BFT_MALLOC(l_c_count, n_rows, cs_lnum_t);
1613 BFT_MALLOC(d_c_count, n_rows, cs_lnum_t);
1614
1615 for (cs_lnum_t i = 0; i < n_rows; i++) {
1616 l_c_count[i] = 0;
1617 d_c_count[i] = 0;
1618 }
1619
1620 if (ma->separate_diag) {
1621
1622 for (cs_lnum_t i = 0; i < ma->size; i++) {
1623 cs_gnum_t g_r_id = ma->g_rc_id[i*2];
1624 cs_gnum_t g_c_id = ma->g_rc_id[i*2 + 1];
1625 if (g_r_id >= ma->l_range[0] && g_r_id < ma->l_range[1]) {
1626 cs_lnum_t l_r_id = g_r_id - ma->l_range[0];
1627 if (g_c_id >= ma->l_range[0] && g_c_id < ma->l_range[1]) {
1628 if (g_c_id != g_r_id)
1629 l_c_count[l_r_id] += 1;
1630 }
1631 else
1632 d_c_count[l_r_id] += 1;
1633 }
1634 else
1635 e_size += 1;
1636 }
1637
1638 }
1639 else {
1640
1641 for (cs_lnum_t i = 0; i < ma->size; i++) {
1642 cs_gnum_t g_r_id = ma->g_rc_id[i*2];
1643 cs_gnum_t g_c_id = ma->g_rc_id[i*2 + 1];
1644 if (g_r_id >= ma->l_range[0] && g_r_id < ma->l_range[1]) {
1645 cs_lnum_t l_r_id = g_r_id - ma->l_range[0];
1646 if (g_c_id >= ma->l_range[0] && g_c_id < ma->l_range[1]) {
1647 l_c_count[l_r_id] += 1;
1648 }
1649 else
1650 d_c_count[l_r_id] += 1;
1651 }
1652 else
1653 e_size += 1;
1654 }
1655
1656 }
1657
1658 /* Build index and reset count */
1659
1660 ma->_r_idx[0] = 0;
1661 ma->d_r_idx[0] = 0;
1662
1663 for (cs_lnum_t i = 0; i < n_rows; i++) {
1664 ma->_r_idx[i+1] = ma->_r_idx[i] + l_c_count[i];
1665 ma->d_r_idx[i+1] = ma->d_r_idx[i] + d_c_count[i];
1666 l_c_count[i] = 0;
1667 d_c_count[i] = 0;
1668 }
1669
1670 /* Allocate structures holding data */
1671
1672 BFT_MALLOC(ma->_c_id, ma->_r_idx[n_rows], cs_lnum_t);
1673 BFT_MALLOC(ma->d_g_c_id, ma->d_r_idx[n_rows], cs_gnum_t);
1674 ma->c_id = ma->_c_id;
1675
1676 cs_gnum_t *e_g_ij;
1677 BFT_MALLOC(e_g_ij, e_size*2, cs_gnum_t);
1678
1679 /* Now fill data: local part is determined (will be cleaned),
1680      and list of distant rows defined; in cases where sizeof(cs_lnum_t)
1681 is smaller than sizeof(cs_gnum_t), this data, though
1682 unsorted, will already be smaller than the initial couples. */
1683
1684 cs_lnum_t k = 0;
1685
1686 if (ma->separate_diag) {
1687
1688 for (cs_lnum_t i = 0; i < ma->size; i++) {
1689 cs_gnum_t g_r_id = ma->g_rc_id[i*2];
1690 cs_gnum_t g_c_id = ma->g_rc_id[i*2 + 1];
1691 if (g_r_id >= ma->l_range[0] && g_r_id < ma->l_range[1]) {
1692 cs_lnum_t l_r_id = g_r_id - ma->l_range[0];
1693 if ( g_c_id >= ma->l_range[0] && g_c_id < ma->l_range[1]
1694 && g_c_id != g_r_id) {
1695 cs_lnum_t l_c_id = g_c_id - ma->l_range[0];
1696 ma->_c_id[ma->_r_idx[l_r_id] + l_c_count[l_r_id]] = l_c_id;
1697 l_c_count[l_r_id] += 1;
1698 }
1699 else {
1700 ma->d_g_c_id[ma->d_r_idx[l_r_id] + d_c_count[l_r_id]] = g_c_id;
1701 d_c_count[l_r_id] += 1;
1702 }
1703 }
1704 else {
1705 e_g_ij[k++] = g_r_id;
1706 e_g_ij[k++] = g_c_id;
1707 }
1708 }
1709
1710 }
1711 else {
1712
1713 for (cs_lnum_t i = 0; i < ma->size; i++) {
1714 cs_gnum_t g_r_id = ma->g_rc_id[i*2];
1715 cs_gnum_t g_c_id = ma->g_rc_id[i*2 + 1];
1716 if (g_r_id >= ma->l_range[0] && g_r_id < ma->l_range[1]) {
1717 cs_lnum_t l_r_id = g_r_id - ma->l_range[0];
1718 if (g_c_id >= ma->l_range[0] && g_c_id < ma->l_range[1]) {
1719 cs_lnum_t l_c_id = g_c_id - ma->l_range[0];
1720 ma->_c_id[ma->_r_idx[l_r_id] + l_c_count[l_r_id]] = l_c_id;
1721 l_c_count[l_r_id] += 1;
1722 }
1723 else {
1724 ma->d_g_c_id[ma->d_r_idx[l_r_id] + d_c_count[l_r_id]] = g_c_id;
1725 d_c_count[l_r_id] += 1;
1726 }
1727 }
1728 else {
1729 e_g_ij[k++] = g_r_id;
1730 e_g_ij[k++] = g_c_id;
1731 }
1732 }
1733
1734 }
1735
1736 BFT_FREE(d_c_count);
1737 BFT_FREE(l_c_count);
1738
1739 /* We can free the initial insertion structure */
1740
1741 BFT_FREE(ma->g_rc_id);
1742
1743 assert(k == e_size*2);
1744
1745 /* Rows and column ids in the local range can be finalized now;
1746 for most matrix structures, we will need to expand the
1747 rows with columns referencing distant rows (except when storing
1748      those terms separately or passing them to an external library),
1749 but updating it now allows optimizing for space, and will make
1750 future updates simpler. */
1751
1752 _sort_and_compact_local(ma);
1753 _sort_and_compact_distant(ma);
1754
1755 /* We can also order and compact data that will be sent to external
1756 ranks now, as it will be needed at some point, and is simpler
1757 to do on global id couples than on local ids with higher stride.
1758 This may also reduce the size of the data we work on for the
1759 following steps. */
1760
1761 e_size = cs_sort_and_compact_gnum_2(e_size, e_g_ij);
1762
1763 _process_assembly_data(ma, e_size, e_g_ij);
1764
1765 BFT_FREE(e_g_ij);
1766
1767 /* Build compact external global id array */
1768
1769 cs_lnum_t n_e_g_ids = 0;
1770 cs_gnum_t *e_g_id = NULL;
1771
1772 cs_lnum_t d_size = ma->d_r_idx[ma->n_rows];
1773
1774 BFT_MALLOC(e_g_id, d_size, cs_gnum_t);
1775
1776 /* Initial pass to reduce data a bit before sorting */
1777
1778 cs_gnum_t g_id_prev = ma->l_range[0];
1779
1780 for (cs_lnum_t i = 0; i < d_size; i++) {
1781 cs_gnum_t g_id = ma->d_g_c_id[i];
1782 if ( (g_id < ma->l_range[0] || g_id >= ma->l_range[1])
1783 && g_id != g_id_prev) {
1784 e_g_id[n_e_g_ids++] = g_id;
1785 g_id_prev = g_id;
1786 }
1787 }
1788
1789 n_e_g_ids = cs_sort_and_compact_gnum(n_e_g_ids, e_g_id);
1790 BFT_REALLOC(e_g_id, n_e_g_ids, cs_gnum_t);
1791
1792 /* Finally, append distant row to local rows, and build matching halo */
1793
1794 _append_local_and_distant(ma, n_e_g_ids, e_g_id);
1795
1796 if (! (ma->flags & CS_MATRIX_EXTERNAL_HALO)) {
1797 _matrix_assembler_compute_halo(ma, n_e_g_ids, e_g_id);
1798 assert(ma->halo->n_elts[0] == n_e_g_ids);
1799 }
1800
1801 /* Assign array to structure */
1802
1803 ma->n_e_g_ids = n_e_g_ids;
1804 ma->e_g_id = e_g_id; e_g_id = NULL;
1805 }
1806
1807 #endif /* HAVE_MPI */
1808
1809 /*----------------------------------------------------------------------------*/
1810 /*!
1811 * \brief Compute internal structures required by a matrix assembler
1812 * in serial mode.
1813 *
1814 * \param[in, out] ma pointer to matrix assembler structure
1815 *
1816 * This function should be called by a single thread for a given assembler.
1817 */
1818 /*----------------------------------------------------------------------------*/
1819
1820 static void
1821 _matrix_assembler_compute_local(cs_matrix_assembler_t  *ma)
1822 {
1823   /* Separate local from distant rows: counting step */
1824
1825 cs_lnum_t n_rows = 0;
1826 if (ma->l_range[1] > ma->l_range[0])
1827 n_rows = ma->l_range[1] - ma->l_range[0];
1828
1829 cs_lnum_t *c_count;
1830 BFT_MALLOC(ma->_r_idx, n_rows+1, cs_lnum_t);
1831 ma->r_idx = ma->_r_idx;
1832
1833 BFT_MALLOC(c_count, n_rows, cs_lnum_t);
1834
1835 for (cs_lnum_t i = 0; i < n_rows; i++)
1836 c_count[i] = 0;
1837
1838 if (ma->separate_diag) {
1839
1840 for (cs_lnum_t i = 0; i < ma->size; i++) {
1841 cs_gnum_t g_r_id = ma->g_rc_id[i*2];
1842 cs_gnum_t g_c_id = ma->g_rc_id[i*2 + 1];
1843 assert(g_r_id >= ma->l_range[0] && g_r_id < ma->l_range[1]);
1844 cs_lnum_t l_r_id = g_r_id - ma->l_range[0];
1845 assert(g_c_id >= ma->l_range[0] && g_c_id < ma->l_range[1]);
1846 if (g_c_id != g_r_id)
1847 c_count[l_r_id] += 1;
1848 }
1849
1850 }
1851 else {
1852
1853 for (cs_lnum_t i = 0; i < ma->size; i++) {
1854 cs_gnum_t g_r_id = ma->g_rc_id[i*2];
1855 cs_gnum_t g_c_id = ma->g_rc_id[i*2 + 1];
1856 assert(g_r_id >= ma->l_range[0] && g_r_id < ma->l_range[1]);
1857 cs_lnum_t l_r_id = g_r_id - ma->l_range[0];
1858 assert(g_c_id >= ma->l_range[0] && g_c_id < ma->l_range[1]);
1859 c_count[l_r_id] += 1;
1860 }
1861
1862 }
1863
1864 /* Build index and reset count */
1865
1866 ma->_r_idx[0] = 0;
1867
1868 for (cs_lnum_t i = 0; i < n_rows; i++) {
1869 ma->_r_idx[i+1] = ma->_r_idx[i] + c_count[i];
1870 c_count[i] = 0;
1871 }
1872
1873 /* Allocate structures holding data */
1874
1875 BFT_MALLOC(ma->_c_id, ma->_r_idx[n_rows], cs_lnum_t);
1876 ma->c_id = ma->_c_id;
1877
1878 /* Now fill data: local part is determined (will be cleaned),
1879 and list of distant rows defined */
1880
1881 if (ma->separate_diag) {
1882
1883 for (cs_lnum_t i = 0; i < ma->size; i++) {
1884 cs_lnum_t l_r_id = ma->g_rc_id[i*2] - ma->l_range[0];
1885 if (ma->g_rc_id[i*2+1] != ma->g_rc_id[i*2]) {
1886 ma->_c_id[ma->_r_idx[l_r_id] + c_count[l_r_id]]
1887 = ma->g_rc_id[i*2+1] - ma->l_range[0];
1888 c_count[l_r_id] += 1;
1889 }
1890 }
1891
1892 }
1893 else {
1894
1895 for (cs_lnum_t i = 0; i < ma->size; i++) {
1896 cs_lnum_t l_r_id = ma->g_rc_id[i*2] - ma->l_range[0];
1897 ma->_c_id[ma->_r_idx[l_r_id] + c_count[l_r_id]]
1898 = ma->g_rc_id[i*2+1] - ma->l_range[0];
1899 c_count[l_r_id] += 1;
1900 }
1901
1902 }
1903
1904 BFT_FREE(c_count);
1905
1906 /* Set global number of rows (will be updated in parallel) */
1907
1908 ma->n_g_rows = n_rows;
1909 ma->n_rows = n_rows;
1910
1911 /* Sort and compact elements for final structure */
1912
1913 _sort_and_compact_local(ma);
1914 }
1915
1916 /*----------------------------------------------------------------------------*/
1917 /*!
1918 * \brief Build diagonal index array when conversion between included
1919 * and separate diagonal is required between a values assembler
1920 * and its matching matrix assembler.
1921 *
1922 * \param[in, out] mav pointer to matrix assembler values structure
1923 */
1924 /*----------------------------------------------------------------------------*/
1925
1926 static void
1927 _matrix_assembler_values_diag_idx(cs_matrix_assembler_values_t  *mav)
1928 {
1929 if (mav->diag_idx != NULL)
1930 return;
1931
1932 const cs_matrix_assembler_t *ma = mav->ma;
1933
1934 if (ma->separate_diag == mav->separate_diag)
1935 return;
1936
1937 BFT_MALLOC(mav->diag_idx, ma->n_rows, cs_lnum_t);
1938
1939 /*
1940 Two cases:
1941 - assembler has a separate diagonal but the values assembler does not:
1942 determine the position at which the diagonal term must be inserted
1943 - assembler has an included diagonal but the values assembler does not:
1944 determine the position of the diagonal term within each row
1945 */
1946
1947 if (ma->separate_diag) {
1948 for (cs_lnum_t i = 0; i < ma->n_rows; i++) {
1949 cs_lnum_t j = ma->r_idx[i], k = ma->r_idx[i+1];
1950 while (j < k) {
1951 if (ma->c_id[j] > i)
1952 k = j;
1953 j++;
1954 }
1955 mav->diag_idx[i] = k - ma->r_idx[i];
1956 }
1957 }
1958 else if (mav->separate_diag) {
1959 for (cs_lnum_t i = 0; i < ma->n_rows; i++) {
1960 cs_lnum_t j = ma->r_idx[i], k = ma->r_idx[i+1];
1961 while (j < k) {
1962 if (ma->c_id[j] == i)
1963 k = j;
1964 j++;
1965 }
1966 mav->diag_idx[i] = k - ma->r_idx[i];
1967 }
1968 }
1969 }
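/* Illustrative example (not from the original code): for row i = 2 whose
   included-diagonal column ids are {0, 2, 5}, the matching separate-diagonal
   column ids are {0, 5}, and diag_idx[2] = 1 in both conversion directions:
   it is the position at which the diagonal is found (included case) or must
   be re-inserted (separate case). */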
1970
1971 /*----------------------------------------------------------------------------*/
1972 /*!
1973 * \brief Add coefficient values based on row indexes when the diagonal
1974 * must be shifted between the assembler and the values assembler.
1975 *
1976 * This function may be called by different threads if the caller
1977 * ensures different threads will not write to the same final parts of
1978 * the matrix coefficients.
1979 *
1980 * \param[in, out] mav pointer to matrix assembler values structure
1981 * \param[in] n number of values to add
1982 * \param[in] stride matrix block stride
1983 * \param[in] row_id matrix row ids
1984 * \param[in] col_idx matrix assembler column indexes
1985 * \param[in] val values to add
1986 */
1987 /*----------------------------------------------------------------------------*/
1988
1989 static void
1990 _matrix_assembler_values_add_cnv_idx(cs_matrix_assembler_values_t  *mav,
1991 cs_lnum_t n,
1992 cs_lnum_t stride,
1993 const cs_lnum_t row_id[],
1994 const cs_lnum_t col_idx[],
1995 const cs_real_t val[])
1996 {
1997 const cs_matrix_assembler_t *ma = mav->ma;
1998
1999 /* Case where the assembler has a separate diagonal, and values have
2000 an included diagonal */
2001
2002 if (ma->separate_diag) {
2003
2004 assert(mav->separate_diag == false);
2005
2006 cs_lnum_t s_col_idx[COEFF_GROUP_SIZE];
2007
2008 for (cs_lnum_t i = 0; i < n; i+= COEFF_GROUP_SIZE) {
2009
2010 cs_lnum_t b_size = COEFF_GROUP_SIZE;
2011 if (i + COEFF_GROUP_SIZE > n)
2012 b_size = n - i;
2013
2014 for (cs_lnum_t j = 0; j < b_size; j++) {
2015 cs_lnum_t r_id = row_id[i+j];
2016 cs_lnum_t c_idx = col_idx[i+j];
2017 if (r_id < 0)
2018 continue;
2019 if (c_idx == -1)
2020 s_col_idx[j] = mav->diag_idx[r_id];
2021 else if (c_idx < mav->diag_idx[r_id])
2022 s_col_idx[j] = c_idx;
2023 else
2024 s_col_idx[j] = c_idx + 1;
2025 }
2026
2027 mav->add_values(mav->matrix,
2028 b_size,
2029 stride,
2030 row_id + i,
2031 s_col_idx,
2032 val + (i*stride));
2033
2034 }
2035
2036 }
2037
2038 /* Case where the assembler has an included diagonal, and values have
2039 a separate diagonal */
2040
2041 else {
2042
2043 assert(ma->separate_diag == false);
2044 assert(mav->separate_diag == true);
2045
2046 cs_lnum_t s_col_idx[COEFF_GROUP_SIZE];
2047
2048 for (cs_lnum_t i = 0; i < n; i+= COEFF_GROUP_SIZE) {
2049
2050 cs_lnum_t b_size = COEFF_GROUP_SIZE;
2051 if (i + COEFF_GROUP_SIZE > n)
2052 b_size = n - i;
2053
2054 for (cs_lnum_t j = 0; j < b_size; j++) {
2055 cs_lnum_t r_id = row_id[i+j];
2056 cs_lnum_t c_idx = col_idx[i+j];
2057 if (r_id < 0)
2058 continue;
2059 if (c_idx < mav->diag_idx[r_id])
2060 s_col_idx[j] = c_idx;
2061 else if (c_idx == mav->diag_idx[r_id])
2062 s_col_idx[j] = -1;
2063 else
2064 s_col_idx[j] = c_idx - 1;
2065 }
2066
2067 mav->add_values(mav->matrix,
2068 b_size,
2069 stride,
2070 row_id + i,
2071 s_col_idx,
2072 val + (i*stride));
2073
2074 }
2075
2076 }
2077 }
2078
2079 #if defined(HAVE_MPI)
2080
2081 /*----------------------------------------------------------------------------*/
2082 /*!
2083 * \brief Add coefficient values based on local row ids and global column ids,
2084 * using a local addition function.
2085 *
2086 * This function may be called by different threads if the caller
2087 * ensures different threads will not write to the same final parts of
2088 * the matrix coefficients.
2089 *
2090 * \param[in, out] mav pointer to matrix assembler values structure
2091 * \param[in] n number of values to add
2092 * \param[in] stride matrix block stride
2093 * \param[in] row_id matrix row ids
2094 * \param[in] col_g_id global column ids
2095 * \param[in] val values to add
2096 */
2097 /*----------------------------------------------------------------------------*/
2098
2099 static void
2100 _matrix_assembler_values_add_lg(cs_matrix_assembler_values_t  *mav,
2101 cs_lnum_t n,
2102 cs_lnum_t stride,
2103 const cs_lnum_t row_id[],
2104 const cs_gnum_t col_g_id[],
2105 const cs_real_t val[])
2106 {
2107 const cs_matrix_assembler_t *ma = mav->ma;
2108
2109 assert(mav->add_values != NULL);
2110
2111 cs_lnum_t s_col_idx[COEFF_GROUP_SIZE];
2112
2113 for (cs_lnum_t i = 0; i < n; i+= COEFF_GROUP_SIZE) {
2114
2115 cs_lnum_t b_size = COEFF_GROUP_SIZE;
2116 if (i + COEFF_GROUP_SIZE > n)
2117 b_size = n - i;
2118
2119 for (cs_lnum_t j = 0; j < b_size; j++) {
2120
2121 cs_lnum_t k = i+j;
2122
2123 cs_lnum_t l_r_id = row_id[k];
2124 cs_gnum_t g_c_id = col_g_id[k];
2125
2126 /* Local part */
2127
2128 if (g_c_id >= ma->l_range[0] && g_c_id < ma->l_range[1]) {
2129
2130 cs_lnum_t l_c_id = g_c_id - ma->l_range[0];
2131 cs_lnum_t n_cols = ma->r_idx[l_r_id+1] - ma->r_idx[l_r_id];
2132
2133 /* remark: we could slightly reduce the binary search by subtracting
2134 the last (ma->d_r_idx[l_r_id+1] - ma->d_r_idx[l_r_id]) from
2135 the column count, but this would require a separate function
2136 for the serial case where that array may not exist */
2137
2138 s_col_idx[j] = _l_id_binary_search(n_cols,
2139 l_c_id,
2140 ma->c_id + ma->r_idx[l_r_id]);
2141
2142 assert(s_col_idx[j] > -1 || (ma->separate_diag && l_c_id == l_r_id));
2143
2144 }
2145
2146 /* Distant part */
2147
2148 #if defined(HAVE_MPI)
2149
2150 else {
2151
2152 cs_lnum_t n_cols = ma->d_r_idx[l_r_id+1] - ma->d_r_idx[l_r_id];
2153
2154 cs_lnum_t d_c_idx = _g_id_binary_find(n_cols,
2155 g_c_id,
2156 ma->d_g_c_id + ma->d_r_idx[l_r_id]);
2157
2158 /* column ids start at end of local row, so add r_idx[row_id+1] */
2159 s_col_idx[j] = d_c_idx + ma->r_idx[l_r_id] + 1;
2160
2161 }
2162
2163 #endif /* HAVE_MPI */
2164
2165 }
2166
2167 /* Case where assembler and values assembler have the same
2168 "separate diagonal" behavior */
2169
2170 if (ma->separate_diag == mav->separate_diag)
2171 mav->add_values(mav->matrix,
2172 b_size,
2173 stride,
2174 row_id + i,
2175 s_col_idx,
2176 val + (i*stride));
2177
2178 else
2179 _matrix_assembler_values_add_cnv_idx(mav,
2180 b_size,
2181 stride,
2182 row_id + i,
2183 s_col_idx,
2184 val + (i*stride));
2185
2186 }
2187 }
2188
2189 #endif /* HAVE_MPI */
2190
2191 /*----------------------------------------------------------------------------*/
2192 /*!
2193 * \brief Add coefficient values based on local row ids and column ids,
2194 * using a global addition function.
2195 *
2196 * This function may be called by different threads if the caller
2197 * ensures different threads will not write to the same final parts of
2198 * the matrix coefficients.
2199 *
2200 * \param[in, out] mav pointer to matrix assembler values structure
2201 * \param[in] n number of values to add
2202 * \param[in] stride matrix block stride
2203 * \param[in] row_id matrix row ids
2204 * \param[in] col_id matrix assembler column ids
2205 * \param[in] val values to add
2206 */
2207 /*----------------------------------------------------------------------------*/
2208
2209 static void
2210 _matrix_assembler_values_add_ll_g(cs_matrix_assembler_values_t  *mav,
2211 cs_lnum_t n,
2212 cs_lnum_t stride,
2213 const cs_lnum_t row_id[],
2214 const cs_lnum_t col_id[],
2215 const cs_real_t val[])
2216
2217 {
2218 const cs_matrix_assembler_t *ma = mav->ma;
2219
2220 cs_gnum_t row_g_id[COEFF_GROUP_SIZE];
2221 cs_gnum_t col_g_id[COEFF_GROUP_SIZE];
2222
2223 for (cs_lnum_t i = 0; i < n; i+= COEFF_GROUP_SIZE) {
2224
2225 cs_lnum_t b_size = COEFF_GROUP_SIZE;
2226 if (i + COEFF_GROUP_SIZE > n)
2227 b_size = n - i;
2228
2229 for (cs_lnum_t j = 0; j < b_size; j++) {
2230
2231 cs_lnum_t k = i+j;
2232
2233 row_g_id[j] = row_id[k] + ma->l_range[0];
2234
2235 cs_lnum_t l_c_id = col_id[k];
2236 if (l_c_id < ma->n_rows)
2237 col_g_id[j] = l_c_id + ma->l_range[0];
2238 else
2239 col_g_id[j] = ma->e_g_id[l_c_id - ma->n_rows];
2240
2241 }
2242
2243 mav->add_values_g(mav->matrix,
2244 b_size,
2245 stride,
2246 row_g_id,
2247 col_g_id,
2248 val + (i*stride));
2249
2250 }
2251
2252 }
2253
2254 #if defined(HAVE_MPI)
2255
2256 /*----------------------------------------------------------------------------*/
2257 /*!
2258 * \brief Add coefficient values based on local row ids and column indexes,
2259 * using a global addition function.
2260 *
2261 * This function may be called by different threads if the caller
2262 * ensures different threads will not write to the same final parts of
2263 * the matrix coefficients.
2264 *
2265 * \param[in, out] mav pointer to matrix assembler values structure
2266 * \param[in] n number of values to add
2267 * \param[in] stride matrix block stride
2268 * \param[in] row_id matrix row ids
2269 * \param[in] col_idx matrix assembler column indexes
2270 * \param[in] val values to add
2271 */
2272 /*----------------------------------------------------------------------------*/
2273
2274 static void
2275 _matrix_assembler_values_add_llx_g(cs_matrix_assembler_values_t  *mav,
2276 cs_lnum_t n,
2277 cs_lnum_t stride,
2278 const cs_lnum_t row_id[],
2279 const cs_lnum_t col_idx[],
2280 const cs_real_t val[])
2281
2282 {
2283 const cs_matrix_assembler_t *ma = mav->ma;
2284
2285 cs_gnum_t row_g_id[COEFF_GROUP_SIZE];
2286 cs_gnum_t col_g_id[COEFF_GROUP_SIZE];
2287
2288 for (cs_lnum_t i = 0; i < n; i+= COEFF_GROUP_SIZE) {
2289
2290 cs_lnum_t b_size = COEFF_GROUP_SIZE;
2291 if (i + COEFF_GROUP_SIZE > n)
2292 b_size = n - i;
2293
2294 for (cs_lnum_t j = 0; j < b_size; j++) {
2295
2296 cs_lnum_t k = i+j;
2297
2298 cs_lnum_t l_r_id = row_id[k];
2299 cs_lnum_t l_c_id;
2300 if (col_idx[k] != -1)
2301 l_c_id = ma->c_id[ma->r_idx[l_r_id] + col_idx[k]];
2302 else
2303 l_c_id = l_r_id;
2304
2305 row_g_id[j] = row_id[k] + ma->l_range[0];
2306 if (l_c_id < ma->n_rows)
2307 col_g_id[j] = l_c_id + ma->l_range[0];
2308 else
2309 col_g_id[j] = ma->e_g_id[l_c_id - ma->n_rows];
2310
2311 }
2312
2313 mav->add_values_g(mav->matrix,
2314 b_size,
2315 stride,
2316 row_g_id,
2317 col_g_id,
2318 val + (i*stride));
2319
2320 }
2321
2322 }
2323
2324 /*----------------------------------------------------------------------------*/
2325 /*!
2326 * \brief Add coefficient values based on local row ids and global column ids,
2327 * using a global addition function.
2328 *
2329 * This function may be called by different threads if the caller
2330 * ensures different threads will not write to the same final parts of
2331 * the matrix coefficients.
2332 *
2333 * \param[in, out] mav pointer to matrix assembler values structure
2334 * \param[in] n number of values to add
2335 * \param[in] stride matrix block stride
2336 * \param[in] row_id matrix row ids
2337 * \param[in] col_g_id matrix global column ids
2338 * \param[in] val values to add
2339 */
2340 /*----------------------------------------------------------------------------*/
2341
2342 static void
2343 _matrix_assembler_values_add_lg_g(cs_matrix_assembler_values_t  *mav,
2344 cs_lnum_t n,
2345 cs_lnum_t stride,
2346 const cs_lnum_t row_id[],
2347 const cs_gnum_t col_g_id[],
2348 const cs_real_t val[])
2349 {
2350 const cs_matrix_assembler_t *ma = mav->ma;
2351
2352 cs_gnum_t row_g_id[COEFF_GROUP_SIZE];
2353
2354 for (cs_lnum_t i = 0; i < n; i+= COEFF_GROUP_SIZE) {
2355
2356 cs_lnum_t b_size = COEFF_GROUP_SIZE;
2357 if (i + COEFF_GROUP_SIZE > n)
2358 b_size = n - i;
2359
2360 for (cs_lnum_t j = 0; j < b_size; j++)
2361 row_g_id[j] = row_id[i+j] + ma->l_range[0];
2362
2363 mav->add_values_g(mav->matrix,
2364 b_size,
2365 stride,
2366 row_g_id,
2367 col_g_id + i,
2368 val + (i*stride));
2369
2370 }
2371 }
2372
2373 #endif /* HAVE_MPI */
2374
2375 /*! (DOXYGEN_SHOULD_SKIP_THIS) \endcond */
2376
2377 /*============================================================================
2378 * Public function definitions
2379 *============================================================================*/
2380
2381 /*----------------------------------------------------------------------------*/
2382 /*!
2383 * \brief Create a matrix assembler structure.
2384 *
2385 * The associated matrix structure data will initially be empty, though
2386 * the range of rows associated with the current rank must be defined
2387 * immediately.
2388 *
2389 * \param[in] l_range global id range [min, max[ for local rank
2390 * \param[in] separate_diag if true, diagonal terms are handled separately
2391 *
2392 * \return pointer to created matrix assembler structure
2393 */
2394 /*----------------------------------------------------------------------------*/
2395
2396 cs_matrix_assembler_t *
2397 cs_matrix_assembler_create(const cs_gnum_t  l_range[2],
2398 bool separate_diag)
2399 {
2400 cs_matrix_assembler_t *ma = NULL;
2401
2402 BFT_MALLOC(ma, 1, cs_matrix_assembler_t);
2403
2404 ma->separate_diag = separate_diag;
2405
2406 ma->flags = CS_MATRIX_DISTANT_ROW_USE_COL_IDX;
2407
2408 ma->l_range[0] = l_range[0];
2409 ma->l_range[1] = l_range[1];
2410
2411 ma->n_g_rows = 0;
2412 ma->n_rows = 0;
2413
2414 ma->size = 0;
2415 ma->max_size = 0;
2416
2417 ma->r_idx = NULL;
2418 ma->c_id = NULL;
2419 ma->_r_idx = NULL;
2420 ma->_c_id = NULL;
2421
2422 ma->d_r_idx = NULL;
2423 ma->d_g_c_id = NULL;
2424
2425 ma->g_rc_id = NULL;
2426
2427 #if defined(HAVE_MPI)
2428
2429 ma->n_coeff_ranks = 0;
2430 ma->coeff_rank = NULL;
2431
2432 ma->coeff_send_size = 0;
2433 ma->coeff_recv_size = 0;
2434
2435 ma->coeff_send_n_rows = 0;
2436 ma->coeff_send_index = NULL;
2437 ma->coeff_send_row_g_id = NULL;
2438 ma->coeff_send_col_g_id = NULL;
2439
2440 ma->coeff_rank_send_index = NULL;
2441 ma->coeff_rank_recv_index = NULL;
2442
2443 ma->coeff_recv_row_id = NULL;
2444 ma->coeff_recv_col_idx = NULL;
2445 ma->coeff_recv_col_g_id = NULL;
2446
2447 ma->comm = cs_glob_mpi_comm;
2448
2449 ma->n_ranks_init[0] = 0;
2450 ma->n_ranks_init[1] = 0;
2451
2452 #endif /* HAVE_MPI */
2453
2454 ma->halo = NULL;
2455 ma->_halo = NULL;
2456
2457 ma->n_e_g_ids = 0;
2458 ma->e_g_id = NULL;
2459
2460 return ma;
2461 }
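/* Typical usage sketch (illustrative only; n_entries, row_g_id, and col_g_id
   are placeholders for user data, and rows 100..199 are assumed owned by the
   current rank):

     cs_gnum_t l_range[2] = {100, 200};

     cs_matrix_assembler_t *ma = cs_matrix_assembler_create(l_range, false);
     cs_matrix_assembler_add_g_ids(ma, n_entries, row_g_id, col_g_id);
     cs_matrix_assembler_compute(ma);
     ...
     cs_matrix_assembler_destroy(&ma);
*/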
2462
2463 /*----------------------------------------------------------------------------*/
2464 /*!
2465 * \brief Create a matrix assembler structure based on a given connectivity
2466 * and associated halo structure.
2467 *
2468 * The assembler may not be modified later when built this way.
2469 *
2470 * The connectivity arrays and halo will be shared by the caller, so their
2471 * life cycle must be at least as long as the matrix assembler structure.
2472 *
2473 * \param[in] n_rows number of local rows
2474 * \param[in] separate_diag if true, diagonal terms are handled separately
2475 * \param[in] row_idx matrix row index
2476 * \param[in] col_id matrix column indexes
2477 * \param[in] halo associated halo structure
2478 *
2479 * \return pointer to created matrix assembler structure
2480 */
2481 /*----------------------------------------------------------------------------*/
2482
2483 cs_matrix_assembler_t *
2484 cs_matrix_assembler_create_from_shared(cs_lnum_t         n_rows,
2485 bool separate_diag,
2486 const cs_lnum_t row_idx[],
2487 const cs_lnum_t col_id[],
2488 const cs_halo_t *halo)
2489 {
2490 cs_gnum_t l_range[2] = {0, n_rows};
2491 cs_gnum_t n_g_rows = n_rows;
2492
2493 #if defined(HAVE_MPI)
2494 if (cs_glob_n_ranks > 1) {
2495 cs_gnum_t g_shift;
2496 cs_gnum_t l_shift = n_rows;
2497 MPI_Scan(&l_shift, &g_shift, 1, CS_MPI_GNUM, MPI_SUM, cs_glob_mpi_comm);
2498 MPI_Allreduce(&l_shift, &n_g_rows, 1, CS_MPI_GNUM, MPI_SUM,
2499 cs_glob_mpi_comm);
2500 l_range[0] = g_shift - l_shift;
2501 l_range[1] = g_shift;
2502 }
2503 #endif
2504
2505 /* Base creation */
2506
2507 cs_matrix_assembler_t *ma = cs_matrix_assembler_create(l_range,
2508 separate_diag);
2509
2510 ma->n_g_rows = n_g_rows;
2511 ma->n_rows = n_rows;
2512
2513 ma->r_idx = row_idx;
2514 ma->c_id = col_id;
2515
2516 ma->halo = halo;
2517
2518 /* Additional required parallel data */
2519
2520 if (ma->halo != NULL) {
2521
2522 cs_gnum_t *t_g_id;
2523 BFT_MALLOC(ma->e_g_id, ma->halo->n_elts[0], cs_gnum_t);
2524 BFT_MALLOC(t_g_id, ma->n_rows + ma->halo->n_elts[0], cs_gnum_t);
2525
2526 for (cs_lnum_t i = 0; i < ma->n_rows; i++)
2527 t_g_id[i] = (cs_gnum_t)i + ma->l_range[0];
2528
2529 cs_halo_sync_untyped(ma->halo,
2530 CS_HALO_STANDARD,
2531 sizeof(cs_gnum_t),
2532 t_g_id);
2533
2534 ma->n_e_g_ids = ma->halo->n_elts[0];
2535 for (cs_lnum_t i = 0; i < ma->n_e_g_ids; i++)
2536 ma->e_g_id[i] = t_g_id[ma->n_rows + i];
2537
2538 BFT_FREE(t_g_id);
2539
2540 /* Build distant columns index and global ids */
2541
2542 BFT_MALLOC(ma->d_r_idx, ma->n_rows+1, cs_lnum_t);
2543
2544 ma->d_r_idx[0] = 0;
2545 for (cs_lnum_t i = 0; i < ma->n_rows; i++) {
2546 cs_lnum_t n_e_cols = 0;
2547 cs_lnum_t n_cols = ma->r_idx[i+1] - ma->r_idx[i];
2548 const cs_lnum_t *c_id = ma->c_id + ma->r_idx[i];
2549 for (cs_lnum_t j = 0; j < n_cols; j++) {
2550 if (c_id[j] >= ma->n_rows)
2551 n_e_cols++;
2552 }
2553 ma->d_r_idx[i+1] = n_e_cols;
2554 }
2555
2556 for (cs_lnum_t i = 0; i < ma->n_rows; i++)
2557 ma->d_r_idx[i+1] += ma->d_r_idx[i];
2558
2559 BFT_MALLOC(ma->d_g_c_id, ma->d_r_idx[ma->n_rows], cs_gnum_t);
2560 for (cs_lnum_t i = 0; i < ma->n_rows; i++) {
2561 cs_lnum_t offset = ma->d_r_idx[i];
2562 cs_lnum_t n_cols = ma->r_idx[i+1] - ma->r_idx[i];
2563 const cs_lnum_t *c_id = ma->c_id + ma->r_idx[i];
2564 for (cs_lnum_t j = 0; j < n_cols; j++) {
2565 if (c_id[j] >= ma->n_rows) {
2566 assert(c_id[j] - ma->n_rows < ma->n_e_g_ids);
2567 ma->d_g_c_id[offset++] = ma->e_g_id[c_id[j] - ma->n_rows];
2568 }
2569 }
2570 }
2571
2572 }
2573
2574 return ma;
2575 }
2576
2577 /*----------------------------------------------------------------------------*/
2578 /*!
2579 * \brief Destroy a matrix assembler structure.
2580 *
2581 * \param[in, out] ma pointer to matrix assembler structure pointer
2582 */
2583 /*----------------------------------------------------------------------------*/
2584
2585 void
2586 cs_matrix_assembler_destroy(cs_matrix_assembler_t  **ma)
2587 {
2588 if (ma != NULL && *ma != NULL) {
2589 cs_matrix_assembler_t *_ma = *ma;
2590
2591 BFT_FREE(_ma->e_g_id);
2592
2593 if (_ma->_halo != NULL)
2594 cs_halo_destroy(&(_ma->_halo));
2595
2596 #if defined(HAVE_MPI)
2597
2598 BFT_FREE(_ma->coeff_recv_col_g_id);
2599 BFT_FREE(_ma->coeff_recv_col_idx);
2600 BFT_FREE(_ma->coeff_recv_row_id);
2601
2602 BFT_FREE(_ma->coeff_rank_recv_index);
2603 BFT_FREE(_ma->coeff_rank_send_index);
2604
2605 BFT_FREE(_ma->coeff_send_col_g_id);
2606 BFT_FREE(_ma->coeff_send_row_g_id);
2607 BFT_FREE(_ma->coeff_send_index);
2608 BFT_FREE(_ma->coeff_rank);
2609
2610 #endif /* HAVE_MPI */
2611
2612 BFT_FREE(_ma->g_rc_id);
2613
2614 BFT_FREE(_ma->d_g_c_id);
2615 BFT_FREE(_ma->d_r_idx);
2616
2617 BFT_FREE(_ma->_c_id);
2618 BFT_FREE(_ma->_r_idx);
2619
2620 BFT_FREE(*ma);
2621 }
2622 }
2623
2624 /*----------------------------------------------------------------------------*/
2625 /*!
2626 * \brief Set option flags for a given matrix assembler structure.
2627 *
2628 * When used, this function should be called before defining entries
2629 * (in case of future option additions) and in any case before computing
2630 * the final structure.
2631 *
2632 * Flags are defined as a sum (bitwise or) of constants, described below:
2633 * - \ref CS_MATRIX_DISTANT_ROW_USE_COL_IDX indicates that column indexes
2634 * matching distant row data are computed and maintained, so the
2635 * associated coefficient contributions can be added more efficiently.
2636 * - \ref CS_MATRIX_DISTANT_ROW_USE_COL_G_ID indicates that column
2637 * global ids matching distant row data are maintained directly,
2638 * so no lookup to a global id description of matrix columns is needed;
2639 * this option is useful only when using an external library allowing
2640 * incremental setting or accumulation of coefficients using global
2641 * column ids, such as with PETSc's MatSetValues. When building matrices
2642 * locally first (which includes most cases, whether internal matrices,
2643 * or using library conversion or import functions such as PETSc's
2644 * MatCreateMPIAIJWithArrays) this is slightly less efficient, as
2645 * column ids will need to be matched to each row's columns a second
2646 * time for those (exchanged) coefficients.
2647 * - \ref CS_MATRIX_EXTERNAL_HALO indicates that we do not need to
2648 * build an associated halo structure, as it will be built externally
2649 * (i.e. by an external library such as PETSc, HYPRE, ...) using its
2650 * own equivalent structures.
2651 *
2652 * \param[in, out] ma pointer to matrix assembler structure
2653 * \param[in] flags sum of matrix assembler flag constants
2654 * (bitwise or)
2655 */
2656 /*----------------------------------------------------------------------------*/
2657
2658 void
2659 cs_matrix_assembler_set_options(cs_matrix_assembler_t  *ma,
2660 int flags)
2661 {
2662 ma->flags = flags;
2663
2664 /* Ensure at least one distant row exchange option is set */
2665
2666 if ( !(ma->flags & CS_MATRIX_DISTANT_ROW_USE_COL_IDX)
2667 && !(ma->flags & CS_MATRIX_DISTANT_ROW_USE_COL_G_ID))
2668 ma->flags = ma->flags | CS_MATRIX_DISTANT_ROW_USE_COL_IDX;
2669 }
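/* Example (illustrative): when the final matrix is handled by an external
   library which uses global column ids and builds its own communication
   scheme, one might select:

     cs_matrix_assembler_set_options(ma,
                                     CS_MATRIX_DISTANT_ROW_USE_COL_G_ID
                                     | CS_MATRIX_EXTERNAL_HALO);
*/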
2670
2671 /*----------------------------------------------------------------------------*/
2672 /*!
2673 * \brief Return the option flags for a given matrix assembler structure.
2674 *
2675 * Flags are defined as a sum (bitwise or) of constants, described in
2676 * \ref cs_matrix_assembler_set_options.
2677 *
2678 * \param[in] ma pointer to matrix assembler structure
2679 *
2680 * \return option flags (sum of integer constants) used by this structure
2681 */
2682 /*----------------------------------------------------------------------------*/
2683
2684 int
2685 cs_matrix_assembler_get_options(const cs_matrix_assembler_t  *ma)
2686 {
2687 return ma->flags;
2688 }
2689
2690 /*----------------------------------------------------------------------------*/
2691 /*!
2692 * \brief Add entries to a matrix assembler structure.
2693 *
2694 * This function should be called by a single thread for a given assembler.
2695 *
2696 * \param[in, out] ma pointer to matrix assembler structure
2697 * \param[in] n number of entries
2698 * \param[in] row_g_id global row ids associated with entries
2699 * \param[in] col_g_id global column ids associated with entries
2700 */
2701 /*----------------------------------------------------------------------------*/
2702
2703 void
2704 cs_matrix_assembler_add_g_ids(cs_matrix_assembler_t  *ma,
2705 cs_lnum_t n,
2706 const cs_gnum_t row_g_id[],
2707 const cs_gnum_t col_g_id[])
2708 {
2709 /* Reallocate if needed;
2710 note that we could use optimized structures to avoid
2711 inserting duplicate values; A simple and relatively
2712 efficient variant may be to preallocate a local matrix
2713 with a fixed estimated number of columns, so as to
2714 handle the bulk of data, and add additional bins for
2715 the rest; sorting and removal of duplicates could occur
2716 when storage of preallocated sections is full.
2717 This may be done at a future stage, depending on timing
2718 and memory consumption measures for the current implementation. */
2719
2720 if (ma->size + n >= ma->max_size) {
2721 if (ma->size == 0)
2722 ma->max_size = 4;
2723 while (ma->size + n >= ma->max_size)
2724 ma->max_size *= 2;
2725 BFT_REALLOC(ma->g_rc_id, ma->max_size*2, cs_gnum_t);
2726 }
2727
2728 cs_gnum_t *_g_rc_id = ma->g_rc_id + ma->size*2;
2729
2730 if (ma->separate_diag == false) {
2731 for (cs_lnum_t i = 0; i < n; i++) {
2732 _g_rc_id[i*2] = row_g_id[i];
2733 _g_rc_id[i*2+1] = col_g_id[i];
2734 }
2735 ma->size += n;
2736 }
2737 else {
2738 cs_lnum_t j = 0;
2739 for (cs_lnum_t i = 0; i < n; i++) {
2740 if ( row_g_id[i] != col_g_id[i]
2741 || row_g_id[i] < ma->l_range[0]
2742 || row_g_id[i] >= ma->l_range[1]) {
2743 _g_rc_id[j*2] = row_g_id[i];
2744 _g_rc_id[j*2+1] = col_g_id[i];
2745 j += 1;
2746 }
2747 }
2748 ma->size += j;
2749 }
2750 }
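/* Example (illustrative): a local element coupling global rows 12 and 57
   contributes 4 entries (2 diagonal, 2 extra-diagonal):

     cs_gnum_t row_g_id[4] = {12, 12, 57, 57};
     cs_gnum_t col_g_id[4] = {12, 57, 12, 57};

     cs_matrix_assembler_add_g_ids(ma, 4, row_g_id, col_g_id);

   Duplicate entries may be added freely; they are merged when the final
   structure is computed. */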
2751
2752 /*----------------------------------------------------------------------------*/
2753 /*!
2754 * \brief Compute internal structures required by a matrix assembler.
2755 *
2756 * The associated vector halo is also computed at this stage.
2757 *
2758 * \param[in, out] ma pointer to matrix assembler structure
2759 *
2760 * This function should be called by a single thread for a given assembler.
2761 */
2762 /*----------------------------------------------------------------------------*/
2763
2764 void
2765 cs_matrix_assembler_compute(cs_matrix_assembler_t  *ma)
2766 {
2767 #if defined(HAVE_MPI)
2768 if (ma->comm != MPI_COMM_NULL && ma->comm != MPI_COMM_SELF) {
2769 _matrix_assembler_compute_mpi(ma);
2770 return;
2771 }
2772 #endif /* HAVE_MPI */
2773
2774 _matrix_assembler_compute_local(ma);
2775
2776 /* Non-null array to simplify binary search (rare diagonal-only case) */
2777
2778 if (ma->c_id == NULL) {
2779 BFT_MALLOC(ma->_c_id, 1, cs_lnum_t);
2780 ma->c_id = ma->_c_id;
2781 ma->_c_id[0] = -1;
2782 }
2783 }
2784
2785 #if defined(HAVE_MPI)
2786
2787 /*----------------------------------------------------------------------------*/
2788 /*!
2789 * \brief Assign specific MPI communicator to matrix assembler.
2790 *
2791 * This must be called before \ref cs_matrix_assembler_compute.
2792 *
2793 * \param[in, out] ma pointer to matrix assembler structure
2794 * \param[in] comm assigned MPI communicator
2795 */
2796 /*----------------------------------------------------------------------------*/
2797
2798 void
2799 cs_matrix_assembler_set_comm(cs_matrix_assembler_t  *ma,
2800 MPI_Comm comm)
2801 {
2802 assert(ma->n_ranks_init[1] == 0);
2803
2804 ma->comm = comm;
2805 }
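/* Example (illustrative, sub_comm being a user-defined communicator):

     cs_matrix_assembler_set_comm(ma, sub_comm);
     cs_matrix_assembler_compute(ma);
*/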
2806
2807 #endif /* HAVE_MPI */
2808
2809 /*----------------------------------------------------------------------------*/
2810 /*!
2811 * \brief Return a pointer to the global row range assigned to the local
2812 * rank for a given matrix assembler.
2813 *
2814 * \param[in] ma pointer to matrix assembler structure
2815 *
2816 * \return pointer to local range
2817 */
2818 /*----------------------------------------------------------------------------*/
2819
2820 const cs_gnum_t *
2821 cs_matrix_assembler_get_l_range(const cs_matrix_assembler_t  *ma)
2822 {
2823 return ma->l_range;
2824 }
2825
2826 /*----------------------------------------------------------------------------*/
2827 /*!
2828 * \brief Return a pointer to the halo structure associated with a
2829 * matrix assembler.
2830 *
2831 * \param[in] ma pointer to matrix assembler structure
2832 *
2833 * \return pointer to halo structure
2834 */
2835 /*----------------------------------------------------------------------------*/
2836
2837 const cs_halo_t *
2838 cs_matrix_assembler_get_halo(const cs_matrix_assembler_t  *ma)
2839 {
2840 return ma->halo;
2841 }
2842
2843 /*----------------------------------------------------------------------------*/
2844 /*!
2845 * \brief Indicate if the matrix assembler is based on a separate diagonal.
2846 *
2847 * \param[in] ma pointer to matrix assembler structure
2848 *
2849 * \return true if the associated structure has a separate diagonal,
2850 * false otherwise
2851 */
2852 /*----------------------------------------------------------------------------*/
2853
2854 bool
2855 cs_matrix_assembler_get_separate_diag(const cs_matrix_assembler_t  *ma)
2856 {
2857 return ma->separate_diag;
2858 }
2859
2860 /*----------------------------------------------------------------------------*/
2861 /*!
2862 * \brief Return the number of rows associated with a matrix assembler.
2863 *
2864 * \param[in] ma pointer to matrix assembler structure
2865 *
2866 * \return number of rows associated with matrix assembler
2867 */
2868 /*----------------------------------------------------------------------------*/
2869
2870 cs_lnum_t
2871 cs_matrix_assembler_get_n_rows(const cs_matrix_assembler_t  *ma)
2872 {
2873 return ma->n_rows;
2874 }
2875
2876 /*----------------------------------------------------------------------------*/
2877 /*!
2878 * \brief Return the number of columns associated with a matrix assembler.
2879 *
2880 * \param[in] ma pointer to matrix assembler structure
2881 *
2882 * \return number of columns associated with matrix assembler
2883 */
2884 /*----------------------------------------------------------------------------*/
2885
2886 cs_lnum_t
2887 cs_matrix_assembler_get_n_columns(const cs_matrix_assembler_t  *ma)
2888 {
2889 #if defined(HAVE_MPI)
2890 return (ma->n_rows + ma->n_e_g_ids);
2891 #else
2892 return ma->n_rows;
2893 #endif
2894 }
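/* Note (illustrative): in parallel, this is the number of local rows plus the
   number of distinct distant global ids referenced by local rows; for example,
   1000 local rows referencing 85 distinct off-rank columns yield 1085 columns. */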
2895
2896 /*----------------------------------------------------------------------------*/
2897 /*!
2898 * \brief Return a row index associated with a matrix assembler.
2899 *
2900 * This index is a CSR type index relative to all columns, including or
2901 * excluding the diagonal depending on the \c separate_diag option used
2902 * when creating the matrix assembler. The matching column ids can be
2903 * obtained using \ref cs_matrix_assembler_get_col_ids.
2904 *
2905 * \warning the returned index is valid only as long as the matrix assembly
2906 * structure exists.
2907 *
2908 * \param[in] ma pointer to matrix assembler structure
2909 *
2910 * \return pointer to matrix structure row index
2911 */
2912 /*----------------------------------------------------------------------------*/
2913
2914 const cs_lnum_t *
2915 cs_matrix_assembler_get_row_index(const cs_matrix_assembler_t  *ma)
2916 {
2917 return ma->r_idx;
2918 }
2919
2920 /*----------------------------------------------------------------------------*/
2921 /*!
2922 * \brief Return the column ids associated with a matrix assembler.
2923 *
2924 * These ids are relative to all columns of successive rows, with columns
2925 * ordered by local id, so local columns appear first and distant columns
2926 * after. Depending on the \c separate_diag option used when creating the
2927 * matrix assembler, the columns may include the diagonal or not.
2928 * The matching index can be obtained using
2929 * \ref cs_matrix_assembler_get_row_index.
2930 *
2931 * \warning the returned column ids are valid only as long as the matrix
2932 * assembler structure exists.
2933 *
2934 * \param[in] ma pointer to matrix assembler structure
2935 *
2936 * \return pointer to matrix structure column ids
2937 */
2938 /*----------------------------------------------------------------------------*/
2939
2940 const cs_lnum_t *
2941 cs_matrix_assembler_get_col_ids(const cs_matrix_assembler_t  *ma)
2942 {
2943 return ma->c_id;
2944 }
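/* Example (illustrative) of traversing the assembled structure; column ids
   greater than or equal to the local row count refer to distant columns:

     const cs_lnum_t *r_idx = cs_matrix_assembler_get_row_index(ma);
     const cs_lnum_t *c_id = cs_matrix_assembler_get_col_ids(ma);
     cs_lnum_t n_rows = cs_matrix_assembler_get_n_rows(ma);

     for (cs_lnum_t i = 0; i < n_rows; i++) {
       for (cs_lnum_t j = r_idx[i]; j < r_idx[i+1]; j++) {
         cs_lnum_t col = c_id[j];
       }
     }
*/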
2945
2946 /*----------------------------------------------------------------------------*/
2947 /*!
2948 * \brief Return info on the number of neighbor ranks a matrix assembler
2949 * may communicate with.
2950 *
2951 * \param[in] ma pointer to matrix assembler structure
2952 * \param[out] rc rank counts; the 4 values are:
2953 * - 0 number of communicating ranks for vector update (halo)
2954 * - 1 number of communicating ranks for matrix assembly
2955 * - 2 maximum number of communicating ranks for halo
2956 * construction (assumed ranks algorithm)
2957 * - 3 maximum number of communicating ranks for matrix
2958 * assembly determination (assumed ranks algorithm)
2959 */
2960 /*----------------------------------------------------------------------------*/
2961
2962 void
2963 cs_matrix_assembler_get_rank_counts(const cs_matrix_assembler_t  *ma,
2964 int rc[4])
2965 {
2966 rc[0] = 0;
2967 if (ma->halo != NULL)
2968 rc[0] = ma->halo->n_c_domains;
2969 #if defined(HAVE_MPI)
2970 rc[1] = ma->n_coeff_ranks;
2971 rc[2] = ma->n_ranks_init[1];
2972 rc[3] = ma->n_ranks_init[0];
2973 #else
2974 rc[1] = 0;
2975 rc[2] = 0;
2976 rc[3] = 0;
2977 #endif
2978 }
2979
2980 /*----------------------------------------------------------------------------*/
2981 /*!
2982 * \brief Log rank counts for a given matrix assembler.
2983 *
2984 * \param[in] ma pointer to matrix assembler structure
2985 * \param[in] log_id log type
2986 * \param[in] name name of this assembler
2987 */
2988 /*----------------------------------------------------------------------------*/
2989
2990 void
2991 cs_matrix_assembler_log_rank_counts(const cs_matrix_assembler_t  *ma,
2992 cs_log_t log_id,
2993 const char *name)
2994 {
2995 cs_log_printf(log_id,
2996 _("\nNeighbor rank counts for matrix assembler: %s\n"
2997 "-----------------------------------------\n"),
2998 name);
2999
3000 const char *count_name[]
3001 = {N_("Neighbor ranks for vector update (halo)"),
3002 N_("Neighbor ranks for matrix assembly"),
3003 N_("Maximum neighbor ranks during halo construction"),
3004 N_("Maximum neighbor ranks during assembly determination")};
3005
3006 int counts[4];
3007
3008 cs_matrix_assembler_get_rank_counts(ma, counts);
3009
3010 for (int i = 0; i < 4; i++) {
3011
3012 char ul[120];
3013 size_t j = 0;
3014 size_t u_count = cs_log_strlen(_(count_name[i]));
3015 for (j = 0; j < u_count && j < 119; j++)
3016 ul[j] = '-';
3017 ul[j] = '\0';
3018
3019 cs_log_printf(log_id, "\n %s:\n %s\n\n", _(count_name[i]), ul);
3020
3021 _display_rank_histogram(log_id, counts[i]);
3022
3023 }
3024 }
3025
3026 /*----------------------------------------------------------------------------*/
3027 /*!
3028 * \brief Create and initialize a matrix assembler values structure.
3029 *
3030 * The associated values will initially be set to zero.
3031 *
3032 * Block sizes are defined by an optional array of 4 values:
3033 * 0: useful block size, 1: vector block extents,
3034 * 2: matrix line extents, 3: matrix line*column extents
3035 *
3036 * This is a low-level function, which should be called by a simpler
3037 * function (\ref cs_matrix_assembler_values_init) which provides
3038 * the necessary function pointers.
3039 *
3040 * \warning The matrix pointer must point to valid data when the assembly
3041 * functions are called, so the life cycle of the data pointed to
3042 * should be at least as long as that of the assembler values
3043 * structure. In a similar manner, the life cycle of the associated
3044 * matrix assembler must also be at least as long as that
3045 * of the assembler values structure.
3046 *
3047 * \param[in] ma associated matrix assembly structure
3048 * \param[in] sep_diag true if diagonal terms are stored separately
3049 * \param[in] db_size optional diagonal block sizes
3050 * \param[in] eb_size optional extra-diagonal block sizes
3051 * \param[in, out] matrix untyped pointer to matrix description structure
3052 * \param[in] init pointer to matrix coefficients
3053 * initialization function
3054 * \param[in] add pointer to matrix coefficients addition from
3055 * local ids function
3056 * \param[in] add_g pointer to matrix coefficients addition from
3057 * global ids function
3058 * \param[in] begin pointer to matrix coefficients assembly
3059 * start function (optional)
3060 * \param[in] end pointer to matrix coefficients assembly
3061 * end function (optional)
3062 *
3063 * \return pointer to initialized matrix assembler values structure
3064 */
3065 /*----------------------------------------------------------------------------*/
3066
3067 cs_matrix_assembler_values_t *
3068 cs_matrix_assembler_values_create(const cs_matrix_assembler_t  *ma,
3069 bool sep_diag,
3070 const cs_lnum_t *db_size,
3071 const cs_lnum_t *eb_size,
3072 void *matrix,
3073 cs_matrix_assembler_values_init_t *init,
3074 cs_matrix_assembler_values_add_t *add,
3075 cs_matrix_assembler_values_add_g_t *add_g,
3076 cs_matrix_assembler_values_begin_t *begin,
3077 cs_matrix_assembler_values_end_t *end)
3078 {
3079 cs_matrix_assembler_values_t *mav;
3080
3081 BFT_MALLOC(mav, 1, cs_matrix_assembler_values_t);
3082
3083 mav->ma = ma;
3084
3085 mav->separate_diag = sep_diag;
3086 mav->final_assembly = false;
3087
3088 for (int i = 0; i < 4; i++) {
3089 mav->db_size[i] = 1;
3090 mav->eb_size[i] = 1;
3091 }
3092
3093 if (db_size != NULL)
3094 memcpy(mav->db_size, db_size, 4*sizeof(cs_lnum_t));
3095 if (eb_size != NULL)
3096 memcpy(mav->eb_size, eb_size, 4*sizeof(cs_lnum_t));
3097
3098 mav->diag_idx = NULL;
3099
3100 mav->matrix = matrix;
3101
3102 mav->init = init;
3103 mav->add_values = add;
3104 mav->add_values_g = add_g;
3105 mav->assembly_begin = begin;
3106 mav->assembly_end = end;
3107
3108 #if defined(HAVE_MPI)
3109
3110 cs_lnum_t alloc_size = ma->coeff_send_size * mav->eb_size[3];
3111
3112 BFT_MALLOC(mav->coeff_send, alloc_size, cs_real_t);
3113
3114 for (cs_lnum_t i = 0; i < alloc_size; i++)
3115 mav->coeff_send[i] = 0;
3116
3117 #endif /* HAVE_MPI */
3118
3119 /* Build diagonal index if it will be needed */
3120
3121 if (mav->separate_diag != ma->separate_diag)
3122 _matrix_assembler_values_diag_idx(mav);
3123
3124 if (mav->init != NULL)
3125 mav->init(mav->matrix, mav->db_size, mav->eb_size);
3126
3127 return mav;
3128 }
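/* Note (illustrative): following the block size convention described above,
   3x3 blocks with no padding would be described by {3, 3, 3, 9}
   (useful block size, vector block extents, matrix line extents,
   matrix line*column extents); scalar matrices may pass NULL for both. */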
3129
3130 /*----------------------------------------------------------------------------*/
3131 /*!
3132 * \brief Finalize matrix assembler values structure.
3133 *
3134 * When this function returns, the assembler values structure has been
3135 * destroyed, and the associated matrix is fully assembled (i.e. ready to
3136 * use).
3137 *
3138 * \param[in, out] mav pointer to matrix assembler values structure pointer
3139 */
3140 /*----------------------------------------------------------------------------*/
3141
3142 void
3143 cs_matrix_assembler_values_finalize(cs_matrix_assembler_values_t  **mav)
3144 {
3145 if (mav != NULL) {
3146
3147 cs_matrix_assembler_values_t *_mav = *mav;
3148
3149 if (_mav->final_assembly == false)
3150 cs_matrix_assembler_values_done(_mav);
3151
3152 if (_mav->assembly_end != NULL)
3153 _mav->assembly_end(_mav->matrix);
3154
3155 BFT_FREE(*mav);
3156
3157 }
3158 }
3159
3160 /*----------------------------------------------------------------------------*/
3161 /*!
3162 * \brief Add values to a matrix assembler values structure using local
3163 * row and column ids.
3164 *
3165 * If the matching matrix coefficients use a block structure, the
3166 * values passed to this function should use the same block size.
3167 *
3168 * Note also that if the matrix has different diagonal and extradiagonal
3169 * block sizes, separate calls to this function should be used for both
3170 * types of coefficients; in addition, in this case, diagonal coefficients
3171 * should only be provided by the owning rank (this also impacts how
3172 * the associated matrix assembler structure is defined).
3173 *
3174 * This function may be called by different threads, as long as those threads
3175 * do not add contributions to the same rows (assuming caution is taken
3176 * in the case of external libraries so that their builder functions
3177 * have the same property).
3178 *
3179 * \param[in, out] mav pointer to matrix assembler values structure
3180 * \param[in] n number of entries
3181 * \param[in] row_id local row ids associated with entries
3182 * \param[in] col_id local column ids associated with entries
3183 * \param[in] val values associated with entries
3184 */
3185 /*----------------------------------------------------------------------------*/
3186
3187 void
3188 cs_matrix_assembler_values_add(cs_matrix_assembler_values_t  *mav,
3189 cs_lnum_t n,
3190 const cs_lnum_t row_id[],
3191 const cs_lnum_t col_id[],
3192 const cs_real_t val[])
3193 {
3194 const cs_matrix_assembler_t *ma = mav->ma;
3195
3196 cs_lnum_t stride = 0;
3197
3198 if (n < 1)
3199 return;
3200
3201 /* Base stride on first type of value encountered */
3202
3203 if (row_id[0] == col_id[0])
3204 stride = mav->db_size[3];
3205 else
3206 stride = mav->eb_size[3];
3207
3208 /* Case where we compute a column index first */
3209
3210 if (mav->add_values != NULL) {
3211
3212 cs_lnum_t s_col_idx[COEFF_GROUP_SIZE];
3213
3214 for (cs_lnum_t i = 0; i < n; i+= COEFF_GROUP_SIZE) {
3215
3216 cs_lnum_t b_size = COEFF_GROUP_SIZE;
3217 if (i + COEFF_GROUP_SIZE > n)
3218 b_size = n - i;
3219
3220 for (cs_lnum_t j = 0; j < b_size; j++) {
3221
3222 cs_lnum_t k = i+j;
3223
3224 cs_lnum_t l_r_id = row_id[k];
3225 cs_lnum_t l_c_id = col_id[k];
3226
3227 cs_lnum_t n_cols = ma->r_idx[l_r_id+1] - ma->r_idx[l_r_id];
3228
3229 s_col_idx[j] = _l_id_binary_search(n_cols,
3230 col_id[k],
3231 ma->c_id + ma->r_idx[l_r_id]);
3232
3233 assert(s_col_idx[j] > -1 || (ma->separate_diag && l_c_id == l_r_id));
3234
3235 }
3236
3237 if (ma->separate_diag == mav->separate_diag)
3238 mav->add_values(mav->matrix,
3239 b_size,
3240 stride,
3241 row_id + i,
3242 s_col_idx,
3243 val + (i*stride));
3244
3245 else
3246 _matrix_assembler_values_add_cnv_idx(mav,
3247 b_size,
3248 stride,
3249 row_id + i,
3250 s_col_idx,
3251 val + (i*stride));
3252
3253 }
3254
3255 }
3256
3257 else { /* use a global id-based function */
3258
3259 assert(mav->add_values_g != NULL);
3260
3261 _matrix_assembler_values_add_ll_g(mav,
3262 n,
3263 stride,
3264 row_id,
3265 col_id,
3266 val);
3267 }
3268 }
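/* Example (illustrative): adding a 2x2 scalar contribution for local rows
   and columns i and j (i, j, and the a_* values being placeholders):

     cs_lnum_t row_id[4] = {i, i, j, j};
     cs_lnum_t col_id[4] = {i, j, i, j};
     cs_real_t val[4] = {a_ii, a_ij, a_ji, a_jj};

     cs_matrix_assembler_values_add(mav, 4, row_id, col_id, val);
*/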
3269
3270 /*----------------------------------------------------------------------------*/
3271 /*!
3272 * \brief Add values to a matrix assembler values structure using global
3273 * row and column ids.
3274 *
3275 * If the matching matrix coefficients use a block structure, the
3276 * values passed to this function should use the same block size.
3277 *
3278 * Note also that if the matrix has different diagonal and extradiagonal
3279 * block sizes, separate calls to this function should be used for both
3280 * types of coefficients; in addition, in this case, diagonal coefficients
3281 * should only be provided by the owning rank (this also impacts how
3282 * the associated matrix assembler structure is defined).
3283 *
3284 * This function may be called by different threads, as long as those threads
3285 * do not add contributions to the same rows (assuming caution is taken
3286 * in the case of external libraries so that their builder functions
3287 * have the same property).
3288 *
3289 * \param[in, out] mav pointer to matrix assembler values structure
3290 * \param[in] n number of entries
3291 * \param[in] row_g_id global row ids associated with entries
3292 * \param[in] col_g_id global column ids associated with entries
3293 * \param[in] val values associated with entries
3294 */
3295 /*----------------------------------------------------------------------------*/
3296
3297 void
3298 cs_matrix_assembler_values_add_g(cs_matrix_assembler_values_t  *mav,
3299 cs_lnum_t n,
3300 const cs_gnum_t row_g_id[],
3301 const cs_gnum_t col_g_id[],
3302 const cs_real_t val[])
3303 {
3304 const cs_matrix_assembler_t *ma = mav->ma;
3305
3306 cs_lnum_t stride = 0;
3307
3308 if (n < 1)
3309 return;
3310
3311 /* Base stride on first type of value encountered */
3312
3313 if (row_g_id[0] == col_g_id[0])
3314 stride = mav->db_size[3];
3315 else
3316 stride = mav->eb_size[3];
3317
3318 cs_gnum_t s_g_row_id[COEFF_GROUP_SIZE];
3319 cs_gnum_t s_g_col_id[COEFF_GROUP_SIZE];
3320
3321 for (cs_lnum_t i = 0; i < n; i+= COEFF_GROUP_SIZE) {
3322
3323 cs_lnum_t b_size = COEFF_GROUP_SIZE;
3324 if (i + COEFF_GROUP_SIZE > n)
3325 b_size = n - i;
3326
3327 for (cs_lnum_t j = 0; j < b_size; j++) {
3328
3329 cs_lnum_t k = i+j;
3330
3331 cs_gnum_t g_r_id = row_g_id[k];
3332
3333 #if defined(HAVE_MPI)
3334
3335 /* Case where coefficient is handled by other rank */
3336
3337 if (g_r_id < ma->l_range[0] || g_r_id >= ma->l_range[1]) {
3338
3339 s_g_row_id[j] = ma->l_range[1];
3340 s_g_col_id[j] = 0;
3341
3342 cs_lnum_t e_r_id = _g_id_binary_find(ma->coeff_send_n_rows,
3343 g_r_id,
3344 ma->coeff_send_row_g_id);
3345
3346 cs_lnum_t r_start = ma->coeff_send_index[e_r_id];
3347 cs_lnum_t n_e_rows = ma->coeff_send_index[e_r_id+1] - r_start;
3348
3349 cs_lnum_t e_id = r_start
3350 +_g_id_binary_find(n_e_rows,
3351 col_g_id[k],
3352 ma->coeff_send_col_g_id + r_start);
3353
3354 /* Now add values to send coefficients */
3355
3356 for (cs_lnum_t l = 0; l < stride; l++)
3357 # pragma omp atomic
3358 mav->coeff_send[e_id*stride + l] += val[k*stride + l];
3359
3360 }
3361
3362 /* Standard case */
3363
3364 else {
3365
3366 s_g_row_id[j] = g_r_id;
3367 s_g_col_id[j] = col_g_id[k];
3368
3369 }
3370
3371 #else
3372
3373 s_g_row_id[j] = g_r_id;
3374 s_g_col_id[j] = col_g_id[k];
3375
3376 #endif /* HAVE_MPI */
3377
3378 }
3379
3380 if (mav->add_values_g != NULL) /* global id-based assembler function */
3381 mav->add_values_g(mav->matrix,
3382 b_size,
3383 stride,
3384 s_g_row_id,
3385 s_g_col_id,
3386 val + (i*stride));
3387
3388 else if (ma->d_r_idx != NULL) { /* local-id-based function, need to adapt */
3389
3390 cs_lnum_t s_row_id[COEFF_GROUP_SIZE];
3391 cs_lnum_t s_col_idx[COEFF_GROUP_SIZE];
3392
3393 for (cs_lnum_t j = 0; j < b_size; j++) {
3394
3395 if (s_g_row_id[j] == ma->l_range[1]) { /* filter other-rank values */
3396 s_row_id[j] = -1;
3397 s_col_idx[j] = -1;
3398 }
3399
3400 else { /* Handle values assigned to local rank */
3401
3402 cs_lnum_t l_r_id = s_g_row_id[j] - ma->l_range[0];
3403 cs_gnum_t g_c_id = s_g_col_id[j];
3404
3405 s_row_id[j] = l_r_id;
3406
3407 cs_lnum_t n_l_cols = ( ma->r_idx[l_r_id+1] - ma->r_idx[l_r_id]
3408 - ma->d_r_idx[l_r_id+1] + ma->d_r_idx[l_r_id]);
3409
3410 /* Local part */
3411
3412 if (g_c_id >= ma->l_range[0] && g_c_id < ma->l_range[1]) {
3413
3414 cs_lnum_t l_c_id = g_c_id - ma->l_range[0];
3415
3416 s_col_idx[j] = _l_id_binary_search(n_l_cols,
3417 l_c_id,
3418 ma->c_id + ma->r_idx[l_r_id]);
3419
3420 assert(s_col_idx[j] > -1 || (ma->separate_diag && l_c_id == l_r_id));
3421
3422 }
3423
3424 /* Distant part */
3425
3426 else {
3427
3428 cs_lnum_t n_cols = ma->d_r_idx[l_r_id+1] - ma->d_r_idx[l_r_id];
3429
3430 cs_lnum_t d_c_idx = _g_id_binary_find(n_cols,
3431 g_c_id,
3432 ma->d_g_c_id + ma->d_r_idx[l_r_id]);
3433
3434 /* column ids start at end of local row, so add n_l_cols */
3435 s_col_idx[j] = d_c_idx + n_l_cols;
3436
3437 }
3438
3439 }
3440
3441 }
3442
3443 if (ma->separate_diag == mav->separate_diag) {
3444 mav->add_values(mav->matrix,
3445 b_size,
3446 stride,
3447 s_row_id,
3448 s_col_idx,
3449 val + (i*stride));
3450 }
3451
3452 else
3453 _matrix_assembler_values_add_cnv_idx(mav,
3454 b_size,
3455 stride,
3456 s_row_id,
3457 s_col_idx,
3458 val + (i*stride));
3459
3460 }
3461
3462 else { /* local-only case or matrix part */
3463
3464 assert(ma->d_r_idx == NULL);
3465
3466 cs_lnum_t s_row_id[COEFF_GROUP_SIZE];
3467 cs_lnum_t s_col_idx[COEFF_GROUP_SIZE];
3468
3469 for (cs_lnum_t j = 0; j < b_size; j++) {
3470
3471 assert(s_g_row_id[j] < ma->l_range[1]);
3472 assert(s_g_col_id[j] < ma->l_range[1]);
3473
3474 cs_lnum_t l_r_id = s_g_row_id[j] - ma->l_range[0];
3475 cs_lnum_t l_c_id = s_g_col_id[j] - ma->l_range[0];
3476
3477 assert( s_g_col_id[j] >= ma->l_range[0]
3478 && s_g_col_id[j] < ma->l_range[1]);
3479
3480 s_row_id[j] = l_r_id;
3481
3482 cs_lnum_t n_cols = ma->r_idx[l_r_id+1] - ma->r_idx[l_r_id];
3483
3484 s_col_idx[j] = _l_id_binary_search(n_cols,
3485 l_c_id,
3486 ma->c_id + ma->r_idx[l_r_id]);
3487
3488 assert(s_col_idx[j] > -1 || (ma->separate_diag && l_c_id == l_r_id));
3489
3490 }
3491
3492 if (ma->separate_diag == mav->separate_diag)
3493 mav->add_values(mav->matrix,
3494 b_size,
3495 stride,
3496 s_row_id,
3497 s_col_idx,
3498 val + (i*stride));
3499 else
3500 _matrix_assembler_values_add_cnv_idx(mav,
3501 b_size,
3502 stride,
3503 s_row_id,
3504 s_col_idx,
3505 val + (i*stride));
3506
3507 }
3508 }
3509 }
3510
3511 /*----------------------------------------------------------------------------*/
3512 /*!
3513 * \brief Start assembly of matrix values structure.
3514 *
3515 * The call to this function is always optional, and indicates that assembly
3516 * may begin. Calling it before \ref cs_matrix_assembler_values_finalize
3517 * may be useful when the underlying libraries can overlap communication
3518 * (or other operations such as matrix trasformation) and computation.
3519 *
3520 * \remark When values have been added to rows belonging to another rank,
3521 * communication will occur here. Splitting this function could
3522 * add another opportunity for overlap of computation and
3523 * communication, but is not done as of 2016, as it would add
3524 * complexity, with probably limited returns, as the effective
3525 * asynchronous progress of MPI libraries is usually limited.
3526 *
3527 * \param[in, out] mav pointer to matrix assembler data structure
3528 */
3529 /*----------------------------------------------------------------------------*/
3530
3531 void
3532 cs_matrix_assembler_values_done(cs_matrix_assembler_values_t  *mav)
3533 {
3534 /* Exchange row data with other ranks if required */
3535
3536 #if defined(HAVE_MPI)
3537
3538 const cs_matrix_assembler_t *ma = mav->ma;
3539
3540 if (ma->n_coeff_ranks > 0) {
3541
3542 cs_real_t *recv_coeffs = NULL;
3543
3544 cs_lnum_t stride = mav->eb_size[3];
3545
3546 MPI_Request *request = NULL;
3547 MPI_Status *status = NULL;
3548
3549 BFT_MALLOC(recv_coeffs, ma->coeff_recv_size*stride, cs_real_t);
3550
3551 BFT_MALLOC(request, ma->n_coeff_ranks*2, MPI_Request);
3552 BFT_MALLOC(status, ma->n_coeff_ranks*2, MPI_Status);
3553
3554 int request_count = 0;
3555 int local_rank = cs_glob_rank_id;
3556
3557 for (int i = 0; i < ma->n_coeff_ranks; i++) {
3558 cs_lnum_t l_size = ( ma->coeff_rank_recv_index[i+1]
3559 - ma->coeff_rank_recv_index[i]) * stride;
3560 if (l_size > 0) {
3561 cs_lnum_t recv_shift = ma->coeff_rank_recv_index[i]*stride;
3562 MPI_Irecv(recv_coeffs + recv_shift,
3563 l_size,
3564 CS_MPI_REAL,
3565 ma->coeff_rank[i],
3566 local_rank,
3567 ma->comm,
3568 &(request[request_count++]));
3569 }
3570 }
3571
3572 for (int i = 0; i < ma->n_coeff_ranks; i++) {
3573 cs_lnum_t l_size = ( ma->coeff_rank_send_index[i+1]
3574 - ma->coeff_rank_send_index[i]) * stride;
3575 if (l_size > 0) {
3576 cs_lnum_t send_shift = ma->coeff_rank_send_index[i]*stride;
3577 MPI_Isend(mav->coeff_send + send_shift,
3578 l_size,
3579 CS_MPI_REAL,
3580 ma->coeff_rank[i],
3581 ma->coeff_rank[i],
3582 ma->comm,
3583 &(request[request_count++]));
3584 }
3585 }
3586
3587 MPI_Waitall(request_count, request, status);
3588
3589 BFT_FREE(request);
3590 BFT_FREE(status);
3591
3592 /* Now add coefficients to local rows */
3593
3594 if (ma->coeff_recv_size > 0) {
3595
3596 /* Using local id-based function */
3597
3598 if (mav->add_values != NULL) {
3599
3600 assert(ma->coeff_recv_row_id != NULL);
3602
3603 /* Case where columns are already pre-indexed */
3604
3605 if (ma->coeff_recv_col_idx != NULL) {
3606 if (ma->separate_diag == mav->separate_diag)
3607 mav->add_values(mav->matrix,
3608 ma->coeff_recv_size,
3609 stride,
3610 ma->coeff_recv_row_id,
3611 ma->coeff_recv_col_idx,
3612 recv_coeffs);
3613 else
3614 _matrix_assembler_values_add_cnv_idx(mav,
3615 ma->coeff_recv_size,
3616 stride,
3617 ma->coeff_recv_row_id,
3618 ma->coeff_recv_col_idx,
3619 recv_coeffs);
3620 }
3621 else {
3622 assert(ma->coeff_recv_col_g_id != NULL);
3623 _matrix_assembler_values_add_lg(mav,
3624 ma->coeff_recv_size,
3625 stride,
3626 ma->coeff_recv_row_id,
3627 ma->coeff_recv_col_g_id,
3628 recv_coeffs);
3629 }
3630
3631 }
3632
3633 /* Using global id-based function */
3634
3635 else {
3636 assert(mav->add_values_g != NULL);
3637
3638 if (ma->coeff_recv_col_g_id != NULL)
3639 _matrix_assembler_values_add_lg_g(mav,
3640 ma->coeff_recv_size,
3641 stride,
3642 ma->coeff_recv_row_id,
3643 ma->coeff_recv_col_g_id,
3644 recv_coeffs);
3645
3646 else {
3647 assert(ma->coeff_recv_col_idx);
3648 _matrix_assembler_values_add_llx_g(mav,
3649 ma->coeff_recv_size,
3650 stride,
3651 ma->coeff_recv_row_id,
3652 ma->coeff_recv_col_idx,
3653 recv_coeffs);
3654 }
3655 }
3656
3657 }
3658
3659 BFT_FREE(recv_coeffs);
3660
3661 }
3662
3663 BFT_FREE(mav->coeff_send);
3664
3665 #endif /* HAVE_MPI */
3666
3667 BFT_FREE(mav->diag_idx);
3668
3669 mav->final_assembly = true;
3670
3671 if (mav->assembly_begin != NULL)
3672 mav->assembly_begin(mav->matrix);
3673 }
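/* Usage sketch (illustrative): calling this function early allows overlapping
   further local work with the underlying library's assembly:

     cs_matrix_assembler_values_done(mav);
     ... other local computations ...
     cs_matrix_assembler_values_finalize(&mav);
*/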
3674
3675 /*----------------------------------------------------------------------------*/
3676
3677 END_C_DECLS
3678