1 /*============================================================================
2 * Sparse Linear Equation Solvers
3 *============================================================================*/
4
5 /*
6 This file is part of Code_Saturne, a general-purpose CFD tool.
7
8 Copyright (C) 1998-2021 EDF S.A.
9
10 This program is free software; you can redistribute it and/or modify it under
11 the terms of the GNU General Public License as published by the Free Software
12 Foundation; either version 2 of the License, or (at your option) any later
13 version.
14
15 This program is distributed in the hope that it will be useful, but WITHOUT
16 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
18 details.
19
20 You should have received a copy of the GNU General Public License along with
21 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
22 Street, Fifth Floor, Boston, MA 02110-1301, USA.
23 */
24
25 /*----------------------------------------------------------------------------*/
26
27 #include "cs_defs.h"
28
29 /*----------------------------------------------------------------------------
30 * Standard C library headers
31 *----------------------------------------------------------------------------*/
32
33 #include <stdarg.h>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <string.h>
37 #include <assert.h>
38 #include <math.h>
39 #include <float.h>
40
41 #if defined(HAVE_MPI)
42 #include <mpi.h>
43 #endif
44
45 /*----------------------------------------------------------------------------
46 * Local headers
47 *----------------------------------------------------------------------------*/
48
49 #include "cs_parall.h"
50
51 /*----------------------------------------------------------------------------
52 * Header for the current file
53 *----------------------------------------------------------------------------*/
54
55 #include "cs_sles_it.h"
56 #include "cs_sles_it_priv.h"
57
58 /*----------------------------------------------------------------------------*/
59
60 BEGIN_C_DECLS
61
62 /*=============================================================================
63 * Additional doxygen documentation
64 *============================================================================*/
65
66 /*!
67 \file cs_sles_it.c
68 Iterative linear solvers
69
70 \page sles_it Iterative linear solvers.
71
72 For Krylov space solvers, default preconditioning is based
73 on a Neumann polynomial of degree \a poly_degree, with a negative value
74 meaning no preconditioning, and 0 diagonal preconditioning.
75
76 For positive values of \a poly_degree, the preconditioning is explained here:
77 \a D being the diagonal part of matrix \a A and \a X its extra-diagonal
78 part, it can be written \f$A=D(Id+D^{-1}X)\f$. Therefore
79 \f$A^{-1}=(Id+D^{-1}X)^{-1}D^{-1}\f$. A series development of
80 \f$Id+D^{-1}X\f$ can then be used which yields, symbolically,
81 \f[
82 Id+\sum\limits_{I=1}^{poly\_degree}\left(-D^{-1}X\right)^{I}
83 \f]
84
85 The efficiency of the polynomial preconditioning will vary depending
86 on the system type. In most cases, diagonal or degree 1 provide
87 best results. Each polynomial preconditioning degree above 0 adds one
  matrix-vector product per initial matrix-vector product of the algorithm.
89 Switching from diagonal to polynomial degree 1 often divides the number of
90 required iterations by approximately 2, but each iteration then costs
91 close to 2 times that of diagonal preconditioning (other vector operations
92 are not doubled), so the net gain is often about 10%. Higher degree
93 polynomials usually lead to diminishing returns.
94 */
95
96 /*! \cond DOXYGEN_SHOULD_SKIP_THIS */
97
98 /*=============================================================================
99 * Local Macro Definitions
100 *============================================================================*/
101
102 #define RINFIN 1.E+30
103
/* SIMD unit size to ensure SIMD alignment (2 to 8 required on most
 * current architectures, so 16 should be enough on most architectures) */
106
107 #define CS_SIMD_SIZE(s) (((s-1)/16+1)*16)
108
109 /*=============================================================================
110 * Local Structure Definitions
111 *============================================================================*/
112
113 /*============================================================================
114 * Global variables
115 *============================================================================*/
116
/* Debugging flag for threaded execution (0 = off) */

static int _thread_debug = 0;

/* Mean system rows threshold under which we use single-reduce version of PCG */

static cs_lnum_t _pcg_sr_threshold = 512;

/* Sparse linear equation solver type names
   NOTE(review): presumably indexed by the cs_sles_it_type_t enum; confirm
   the ordering below matches that enum's declaration in cs_sles_it.h. */

const char *cs_sles_it_type_name[]
  = {N_("Conjugate Gradient"),
     N_("Flexible Conjugate Gradient"),
     N_("Inexact Preconditioned Conjugate Gradient"),
     N_("Jacobi"),
     N_("BiCGstab"),
     N_("BiCGstab2"),
     N_("GCR"),
     N_("GMRES"),
     N_("Gauss-Seidel"),
     N_("Symmetric Gauss-Seidel"),
     N_("3-layer conjugate residual"),
     N_("User-defined iterative solver"),
     N_("None"), /* Smoothers beyond this */
     N_("Truncated forward Gauss-Seidel"),
     N_("Truncated backwards Gauss-Seidel"),
};
142
143 /*=============================================================================
144 * Local Structure Definitions
145 *============================================================================*/
146
147 /*============================================================================
148 * Private function definitions
149 *============================================================================*/
150
151 /*----------------------------------------------------------------------------
152 * Convergence test.
153 *
154 * parameters:
155 * c <-- pointer to solver context info
156 * n_iter <-- Number of iterations done
157 * residue <-- Non normalized residue
158 * convergence <-> Convergence information structure
159 *
160 * returns:
161 * convergence status.
162 *----------------------------------------------------------------------------*/
163
164 static cs_sles_convergence_state_t
_convergence_test(cs_sles_it_t * c,unsigned n_iter,double residue,cs_sles_it_convergence_t * convergence)165 _convergence_test(cs_sles_it_t *c,
166 unsigned n_iter,
167 double residue,
168 cs_sles_it_convergence_t *convergence)
169 {
170 const int verbosity = convergence->verbosity;
171 const cs_sles_it_setup_t *s = c->setup_data;
172
173 const char final_fmt[]
174 = N_(" n_iter: %5d, res_abs: %11.4e, res_nor: %11.4e, norm: %11.4e,"
175 " res_init: %11.4e\n");
176
177 /* Update conversion info structure */
178
179 convergence->n_iterations = n_iter;
180 convergence->residue = residue;
181
182 /* Plot convergence if requested */
183
184 if (c->plot != NULL) {
185 double vals = residue;
186 double wall_time = cs_timer_wtime();
187 c->plot_time_stamp += 1;
188 cs_time_plot_vals_write(c->plot, /* time plot structure */
189 c->plot_time_stamp, /* current iteration number */
190 wall_time, /* current time */
191 1, /* number of values */
192 &vals); /* values */
193
194 }
195
196 /* If converged */
197
198 if (residue < convergence->precision * convergence->r_norm) {
199 if (verbosity > 1)
200 bft_printf(_(final_fmt), n_iter, residue, residue/convergence->r_norm,
201 convergence->r_norm, s->initial_residue);
202 return CS_SLES_CONVERGED;
203 }
204
205 /* If not converged */
206 else if (n_iter >= convergence->n_iterations_max) {
207 if (verbosity > -1) {
208 if (verbosity == 1) /* Already output if verbosity > 1 */
209 bft_printf("%s [%s]:\n", cs_sles_it_type_name[c->type],
210 convergence->name);
211 else {
212 if (convergence->r_norm > 0.)
213 bft_printf(_(final_fmt),
214 n_iter, residue, residue/convergence->r_norm,
215 convergence->r_norm, s->initial_residue);
216 else
217 bft_printf(_(" n_iter : %5d, res_abs : %11.4e\n"),
218 n_iter, residue);
219 }
220 if (convergence->precision > 0.)
221 bft_printf(_(" @@ Warning: non convergence\n"));
222 }
223 return CS_SLES_MAX_ITERATION;
224 }
225
226 /* If diverged */
227 else {
228 int diverges = 0;
229 if (residue > s->initial_residue * 10000.0 && residue > 100.)
230 diverges = 1;
231 #if (__STDC_VERSION__ >= 199901L)
232 else if (isnan(residue) || isinf(residue)) {
233 diverges = 1;
234 }
235 #endif
236 if (diverges == 1) {
237 bft_printf(_("\n\n"
238 "%s [%s]: divergence after %u iterations:\n"
239 " initial residual: %11.4e; current residual: %11.4e\n"),
240 cs_sles_it_type_name[c->type], convergence->name,
241 convergence->n_iterations,
242 s->initial_residue, convergence->residue);
243 return CS_SLES_DIVERGED;
244 }
245 }
246
247 return CS_SLES_ITERATING;
248 }
249
250 /*----------------------------------------------------------------------------
251 * Solution of A.vx = Rhs using preconditioned conjugate gradient.
252 *
 * Parallel-optimized version, groups dot products, at the cost of
 * computation of the preconditioning for n+1 iterations instead of n.
255 *
256 * On entry, vx is considered initialized.
257 *
258 * parameters:
259 * c <-- pointer to solver context info
260 * a <-- matrix
261 * diag_block_size <-- diagonal block size
262 * convergence <-- convergence information structure
263 * rhs <-- right hand side
264 * vx <-> system solution
265 * aux_size <-- number of elements in aux_vectors (in bytes)
266 * aux_vectors --- optional working area (allocation otherwise)
267 *
268 * returns:
269 * convergence state
270 *----------------------------------------------------------------------------*/
271
272 static cs_sles_convergence_state_t
_conjugate_gradient(cs_sles_it_t * c,const cs_matrix_t * a,cs_lnum_t diag_block_size,cs_sles_it_convergence_t * convergence,const cs_real_t * rhs,cs_real_t * restrict vx,size_t aux_size,void * aux_vectors)273 _conjugate_gradient(cs_sles_it_t *c,
274 const cs_matrix_t *a,
275 cs_lnum_t diag_block_size,
276 cs_sles_it_convergence_t *convergence,
277 const cs_real_t *rhs,
278 cs_real_t *restrict vx,
279 size_t aux_size,
280 void *aux_vectors)
281 {
282 cs_sles_convergence_state_t cvg;
283 double ro_0, ro_1, alpha, rk_gkm1, rk_gk, beta, residue;
284 cs_real_t *_aux_vectors;
285 cs_real_t *restrict rk, *restrict dk, *restrict gk;
286 cs_real_t *restrict zk;
287
288 unsigned n_iter = 0;
289
290 /* Allocate or map work arrays */
291 /*-----------------------------*/
292
293 assert(c->setup_data != NULL);
294
295 const cs_lnum_t n_rows = c->setup_data->n_rows;
296
297 {
298 const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size;
299 const size_t n_wa = 4;
300 const size_t wa_size = CS_SIMD_SIZE(n_cols);
301
302 if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < (wa_size * n_wa))
303 BFT_MALLOC(_aux_vectors, wa_size * n_wa, cs_real_t);
304 else
305 _aux_vectors = aux_vectors;
306
307 rk = _aux_vectors;
308 dk = _aux_vectors + wa_size;
309 gk = _aux_vectors + wa_size*2;
310 zk = _aux_vectors + wa_size*3;
311 }
312
313 /* Initialize iterative calculation */
314 /*----------------------------------*/
315
316 /* Residue and descent direction */
317
318 cs_matrix_vector_multiply(a, vx, rk); /* rk = A.x0 */
319
320 # pragma omp parallel for if(n_rows > CS_THR_MIN)
321 for (cs_lnum_t ii = 0; ii < n_rows; ii++)
322 rk[ii] -= rhs[ii];
323
324 /* Preconditioning */
325
326 c->setup_data->pc_apply(c->setup_data->pc_context,
327 rk,
328 gk);
329
330 /* Descent direction */
331 /*-------------------*/
332
333 #if defined(HAVE_OPENMP)
334
335 # pragma omp parallel for if(n_rows > CS_THR_MIN)
336 for (cs_lnum_t ii = 0; ii < n_rows; ii++)
337 dk[ii] = gk[ii];
338
339 #else
340
341 memcpy(dk, gk, n_rows * sizeof(cs_real_t));
342
343 #endif
344
345 _dot_products_xx_xy(c, rk, gk, &residue, &rk_gkm1);
346 residue = sqrt(residue);
347
348 /* If no solving required, finish here */
349
350 c->setup_data->initial_residue = residue;
351 cvg = _convergence_test(c, n_iter, residue, convergence);
352
353 if (cvg == CS_SLES_ITERATING) {
354
355 n_iter = 1;
356
357 cs_matrix_vector_multiply(a, dk, zk);
358
359 /* Descent parameter */
360
361 _dot_products_xy_yz(c, rk, dk, zk, &ro_0, &ro_1);
362
363 cs_real_t d_ro_1 = (CS_ABS(ro_1) > DBL_MIN) ? 1. / ro_1 : 0.;
364 alpha = - ro_0 * d_ro_1;
365
366 # pragma omp parallel if(n_rows > CS_THR_MIN)
367 {
368 # pragma omp for nowait
369 for (cs_lnum_t ii = 0; ii < n_rows; ii++)
370 vx[ii] += (alpha * dk[ii]);
371
372 # pragma omp for nowait
373 for (cs_lnum_t ii = 0; ii < n_rows; ii++)
374 rk[ii] += (alpha * zk[ii]);
375 }
376
377 /* Convergence test */
378
379 residue = sqrt(_dot_product_xx(c, rk));
380 cvg = _convergence_test(c, n_iter, residue, convergence);
381
382 /* Current Iteration */
383 /*-------------------*/
384
385 }
386
387 while (cvg == CS_SLES_ITERATING) {
388
389 /* Preconditioning */
390
391 c->setup_data->pc_apply(c->setup_data->pc_context, rk, gk);
392
393 /* compute residue and prepare descent parameter */
394
395 _dot_products_xx_xy(c, rk, gk, &residue, &rk_gk);
396
397 residue = sqrt(residue);
398
399 /* Convergence test for end of previous iteration */
400
401 if (n_iter > 1)
402 cvg = _convergence_test(c, n_iter, residue, convergence);
403
404 if (cvg != CS_SLES_ITERATING)
405 break;
406
407 n_iter += 1;
408
409 /* Complete descent parameter computation and matrix.vector product */
410
411 beta = rk_gk / rk_gkm1;
412 rk_gkm1 = rk_gk;
413
414 # pragma omp parallel for firstprivate(alpha) if(n_rows > CS_THR_MIN)
415 for (cs_lnum_t ii = 0; ii < n_rows; ii++)
416 dk[ii] = gk[ii] + (beta * dk[ii]);
417
418 cs_matrix_vector_multiply(a, dk, zk);
419
420 _dot_products_xy_yz(c, rk, dk, zk, &ro_0, &ro_1);
421
422 cs_real_t d_ro_1 = (CS_ABS(ro_1) > DBL_MIN) ? 1. / ro_1 : 0.;
423 alpha = - ro_0 * d_ro_1;
424
425 # pragma omp parallel if(n_rows > CS_THR_MIN)
426 {
427 # pragma omp for nowait
428 for (cs_lnum_t ii = 0; ii < n_rows; ii++)
429 vx[ii] += (alpha * dk[ii]);
430
431 # pragma omp for nowait
432 for (cs_lnum_t ii = 0; ii < n_rows; ii++)
433 rk[ii] += (alpha * zk[ii]);
434 }
435
436 }
437
438 if (_aux_vectors != aux_vectors)
439 BFT_FREE(_aux_vectors);
440
441 return cvg;
442 }
443
444 /*----------------------------------------------------------------------------
445 * Solution of A.vx = Rhs using flexible preconditioned conjugate gradient.
446 *
447 * Compared to standard PCG, FCG supports variable preconditioners.
448 *
449 * This variant, described in \cite Notay:2015, allows computing the
450 * required inner products with a single global communication.
451 *
452 * On entry, vx is considered initialized.
453 *
454 * parameters:
455 * c <-- pointer to solver context info
456 * a <-- matrix
457 * diag_block_size <-- diagonal block size
458 * convergence <-- convergence information structure
459 * rhs <-- right hand side
460 * vx <-> system solution
461 * aux_size <-- number of elements in aux_vectors (in bytes)
462 * aux_vectors --- optional working area (allocation otherwise)
463 *
464 * returns:
465 * convergence state
466 *----------------------------------------------------------------------------*/
467
static cs_sles_convergence_state_t
_flexible_conjugate_gradient(cs_sles_it_t              *c,
                             const cs_matrix_t         *a,
                             cs_lnum_t                  diag_block_size,
                             cs_sles_it_convergence_t  *convergence,
                             const cs_real_t           *rhs,
                             cs_real_t                 *restrict vx,
                             size_t                     aux_size,
                             void                      *aux_vectors)
{
  cs_sles_convergence_state_t cvg = CS_SLES_ITERATING;
  cs_real_t *_aux_vectors;
  cs_real_t *restrict rk, *restrict vk, *restrict wk;
  cs_real_t *restrict dk, *restrict qk;

  unsigned n_iter = 0;

  /* Allocate or map work arrays */
  /*-----------------------------*/

  assert(c->setup_data != NULL);

  const cs_lnum_t n_rows = c->setup_data->n_rows;

  {
    const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size;
    const size_t n_wa = 5;
    const size_t wa_size = CS_SIMD_SIZE(n_cols); /* padded for SIMD alignment */

    if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < (wa_size * n_wa))
      BFT_MALLOC(_aux_vectors, wa_size * n_wa, cs_real_t);
    else
      _aux_vectors = aux_vectors;

    rk = _aux_vectors;              /* residual */
    vk = _aux_vectors + wa_size;    /* preconditioned residual */
    wk = _aux_vectors + wa_size*2;  /* A.vk */
    dk = _aux_vectors + wa_size*3;  /* descent direction */
    qk = _aux_vectors + wa_size*4;  /* A.dk, maintained by recurrence */
  }

  /* Initialize iterative calculation */
  /*----------------------------------*/

  /* Residue and descent direction */

  cs_matrix_vector_multiply(a, vx, rk);  /* rk = A.x0 */

  /* rk and qk updates touch disjoint arrays, so nowait is safe */

# pragma omp parallel if(n_rows > CS_THR_MIN)
  {
#   pragma omp for nowait
    for (cs_lnum_t ii = 0; ii < n_rows; ii++)
      rk[ii] = rhs[ii] - rk[ii];

#   pragma omp for nowait
    for (cs_lnum_t ii = 0; ii < n_rows; ii++)
      qk[ii] = 0;
  }

  double rho_km1 = 0;

  while (cvg == CS_SLES_ITERATING) {

    /* Preconditioning: vk = M^-1.rk (may vary between iterations,
       which is what makes this variant "flexible") */

    c->setup_data->pc_apply(c->setup_data->pc_context, rk, vk);

    cs_matrix_vector_multiply(a, vk, wk);

    /* Compute residue and prepare descent parameter; based on the helper's
       name, the fused reductions are alpha_k = vk.rk, beta_k = vk.wk,
       gamma_k = vk.qk and residue = rk.rk, grouped so that only a single
       global communication per iteration is needed (cf. Notay 2015) */

    double alpha_k, beta_k, gamma_k, residue;

    _dot_products_vr_vw_vq_rr(c, vk, rk, wk, qk,
                              &alpha_k, &beta_k, &gamma_k, &residue);

    residue = sqrt(residue);

    /* Convergence test for end of previous iteration */

    if (n_iter > 0)
      cvg = _convergence_test(c, n_iter, residue, convergence);
    else
      c->setup_data->initial_residue = residue;

    if (cvg != CS_SLES_ITERATING)
      break;

    /* Complete descent parameter computation and matrix.vector product */

    if (n_iter > 0) {

      cs_real_t gk_rk1 = gamma_k / rho_km1;
      cs_real_t rho_k = beta_k - gamma_k*gamma_k / rho_km1;
      cs_real_t ak_rk = alpha_k / rho_k;

      /* First loop writes dk/vx, second writes qk/rk: independent,
         so nowait is safe */

#     pragma omp parallel if(n_rows > CS_THR_MIN)
      {
#       pragma omp for nowait
        for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
          dk[ii] = vk[ii] - gk_rk1 * dk[ii];
          vx[ii] = vx[ii] + ak_rk * dk[ii];
        }

#       pragma omp for nowait
        for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
          qk[ii] = wk[ii] - gk_rk1 * qk[ii];
          rk[ii] = rk[ii] - ak_rk * qk[ii];
        }
      }

      rho_km1 = rho_k;
    }
    else { /* n_iter == 0: no previous direction to combine with */

      cs_real_t rho_k = beta_k;
      cs_real_t ak_rk = alpha_k / rho_k;

#     pragma omp parallel if(n_rows > CS_THR_MIN)
      {
#       pragma omp for nowait
        for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
          dk[ii] = vk[ii];
          vx[ii] = vx[ii] + ak_rk * vk[ii];
        }

#       pragma omp for nowait
        for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
          qk[ii] = wk[ii];
          rk[ii] = rk[ii] - ak_rk * wk[ii];
        }
      }

      rho_km1 = rho_k;
    }

    n_iter += 1;
  }

  if (_aux_vectors != aux_vectors)
    BFT_FREE(_aux_vectors);

  return cvg;
}
612
613 /*----------------------------------------------------------------------------
614 * Solution of A.vx = Rhs using flexible preconditioned conjugate gradient.
615 *
616 * Compared to standard PCG, IPCG supports variable preconditioners, at
617 * the expense of storing the residual at iterations k (rk) and k-1 (rkm1)
618 * to compute the Beta coefficient. When the preconditioner is constant
619 * across the iterations, IPCG is equivalent to PCG.
620 *
621 * On entry, vx is considered initialized.
622 *
623 * parameters:
624 * c <-- pointer to solver context info
625 * a <-- matrix
626 * diag_block_size <-- diagonal block size
627 * convergence <-- convergence information structure
628 * rhs <-- right hand side
629 * vx <-> system solution
630 * aux_size <-- number of elements in aux_vectors (in bytes)
631 * aux_vectors --- optional working area (allocation otherwise)
632 *
633 * returns:
634 * convergence state
635 *----------------------------------------------------------------------------*/
636
static cs_sles_convergence_state_t
_conjugate_gradient_ip(cs_sles_it_t              *c,
                       const cs_matrix_t         *a,
                       cs_lnum_t                  diag_block_size,
                       cs_sles_it_convergence_t  *convergence,
                       const cs_real_t           *rhs,
                       cs_real_t                 *restrict vx,
                       size_t                     aux_size,
                       void                      *aux_vectors)
{
  cs_sles_convergence_state_t cvg;
  double ro_0, ro_1, alpha, rk_gk_m1, rkm1_gk, rk_gk, beta, residue;
  cs_real_t *_aux_vectors;
  cs_real_t *restrict rk, *restrict rkm1, *restrict dk, *restrict gk;
  cs_real_t *restrict zk;

  unsigned n_iter = 0;

  /* Allocate or map work arrays */
  /*-----------------------------*/

  assert(c->setup_data != NULL);

  const cs_lnum_t n_rows = c->setup_data->n_rows;

  {
    const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size;
    const size_t n_wa = 5; /* one more array than standard PCG, to keep rkm1 */
    const size_t wa_size = CS_SIMD_SIZE(n_cols);

    if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < (wa_size * n_wa))
      BFT_MALLOC(_aux_vectors, wa_size * n_wa, cs_real_t);
    else
      _aux_vectors = aux_vectors;

    rk = _aux_vectors;                /* current residual */
    rkm1 = _aux_vectors + wa_size;    /* previous residual (for beta) */
    dk = _aux_vectors + wa_size*2;    /* descent direction */
    gk = _aux_vectors + wa_size*3;    /* preconditioned residual */
    zk = _aux_vectors + wa_size*4;    /* A.dk */
  }

  /* Initialize iterative calculation */
  /*----------------------------------*/

  /* Residue and descent direction */

  cs_matrix_vector_multiply(a, vx, rk);  /* rk = A.x0 */

# pragma omp parallel for if(n_rows > CS_THR_MIN)
  for (cs_lnum_t ii = 0; ii < n_rows; ii++)
    rk[ii] -= rhs[ii];

  /* Preconditioning: gk = M^-1.rk */

  c->setup_data->pc_apply(c->setup_data->pc_context, rk, gk);

  /* Descent direction */
  /*-------------------*/

#if defined(HAVE_OPENMP)

# pragma omp parallel for if(n_rows > CS_THR_MIN)
  for (cs_lnum_t ii = 0; ii < n_rows; ii++)
    dk[ii] = gk[ii];

#else

  memcpy(dk, gk, n_rows * sizeof(cs_real_t));

#endif

  /* Fused reductions: residue <- rk.rk, rk_gk_m1 <- rk.gk */

  _dot_products_xx_xy(c, rk, gk, &residue, &rk_gk_m1);
  residue = sqrt(residue);

  /* If no solving required, finish here */

  c->setup_data->initial_residue = residue;
  cvg = _convergence_test(c, n_iter, residue, convergence);

  if (cvg == CS_SLES_ITERATING) {

    n_iter = 1;

    cs_matrix_vector_multiply(a, dk, zk);

    /* Descent parameter: alpha = -(rk.dk)/(dk.zk) */

    _dot_products_xy_yz(c, rk, dk, zk, &ro_0, &ro_1);

    /* Guard against division by a (near-)zero curvature term */

    cs_real_t d_ro_1 = (CS_ABS(ro_1) > DBL_MIN) ? 1. / ro_1 : 0.;
    alpha = - ro_0 * d_ro_1;

    /* First loop writes vx, second writes rkm1/rk: independent,
       so nowait is safe */

#   pragma omp parallel if(n_rows > CS_THR_MIN)
    {
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        vx[ii] += (alpha * dk[ii]);

#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
        rkm1[ii] = rk[ii];  /* save previous residual before updating */
        rk[ii] += (alpha * zk[ii]);
      }
    }

    /* Convergence test */

    residue = sqrt(_dot_product_xx(c, rk));
    cvg = _convergence_test(c, n_iter, residue, convergence);

    /* Current Iteration */
    /*-------------------*/

  }

  while (cvg == CS_SLES_ITERATING) {

    /* Preconditioning */

    c->setup_data->pc_apply(c->setup_data->pc_context, rk, gk);

    /* Compute residue and prepare descent parameter:
       residue <- rk.rk, rk_gk <- rk.gk, rkm1_gk <- gk.rkm1 */

    _dot_products_xx_xy_yz(c, rk, gk, rkm1, &residue, &rk_gk, &rkm1_gk);

    residue = sqrt(residue);

    /* Convergence test for end of previous iteration */

    if (n_iter > 1)
      cvg = _convergence_test(c, n_iter, residue, convergence);

    if (cvg != CS_SLES_ITERATING)
      break;

    n_iter += 1;

    /* Complete descent parameter computation and matrix.vector product.
       The (rk.gk - rkm1.gk) numerator is the Polak-Ribiere-type correction
       which makes this variant tolerant of variable preconditioners */

    beta = (rk_gk - rkm1_gk) / rk_gk_m1;
    rk_gk_m1 = rk_gk;

#   pragma omp parallel for firstprivate(alpha) if(n_rows > CS_THR_MIN)
    for (cs_lnum_t ii = 0; ii < n_rows; ii++)
      dk[ii] = gk[ii] + (beta * dk[ii]);

    cs_matrix_vector_multiply(a, dk, zk);

    _dot_products_xy_yz(c, rk, dk, zk, &ro_0, &ro_1);

    cs_real_t d_ro_1 = (CS_ABS(ro_1) > DBL_MIN) ? 1. / ro_1 : 0.;
    alpha = - ro_0 * d_ro_1;

#   pragma omp parallel if(n_rows > CS_THR_MIN)
    {
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        vx[ii] += (alpha * dk[ii]);

#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
        rkm1[ii] = rk[ii];  /* save previous residual before updating */
        rk[ii] += (alpha * zk[ii]);
      }
    }

  }

  if (_aux_vectors != aux_vectors)
    BFT_FREE(_aux_vectors);

  return cvg;
}
811
812 /*----------------------------------------------------------------------------
813 * Solution of A.vx = Rhs using preconditioned conjugate gradient
814 * with single reduction.
815 *
816 * For more information, see Lapack Working note 56, at
 * http://www.netlib.org/lapack/lawnspdf/lawn56.pdf
818 *
819 * On entry, vx is considered initialized.
820 *
821 * parameters:
822 * c <-- pointer to solver context info
823 * a <-- matrix
824 * diag_block_size <-- block size of element ii, ii
825 * convergence <-- convergence information structure
826 * rhs <-- right hand side
827 * vx <-> system solution
828 * aux_size <-- number of elements in aux_vectors (in bytes)
829 * aux_vectors --- optional working area (allocation otherwise)
830 *
831 * returns:
832 * convergence state
833 *----------------------------------------------------------------------------*/
834
static cs_sles_convergence_state_t
_conjugate_gradient_sr(cs_sles_it_t              *c,
                       const cs_matrix_t         *a,
                       cs_lnum_t                  diag_block_size,
                       cs_sles_it_convergence_t  *convergence,
                       const cs_real_t           *rhs,
                       cs_real_t                 *restrict vx,
                       size_t                     aux_size,
                       void                      *aux_vectors)
{
  cs_sles_convergence_state_t cvg;
  double ro_0, ro_1, alpha, rk_gkm1, rk_gk, gk_sk, beta, residue;
  cs_real_t *_aux_vectors;
  cs_real_t *restrict rk, *restrict dk, *restrict gk, *restrict sk;
  cs_real_t *restrict zk;

  unsigned n_iter = 0;

  /* Allocate or map work arrays */
  /*-----------------------------*/

  assert(c->setup_data != NULL);

  const cs_lnum_t n_rows = c->setup_data->n_rows;

  {
    const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size;
    const size_t n_wa = 5;
    const size_t wa_size = CS_SIMD_SIZE(n_cols);

    if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < (wa_size * n_wa))
      BFT_MALLOC(_aux_vectors, wa_size * n_wa, cs_real_t);
    else
      _aux_vectors = aux_vectors;

    rk = _aux_vectors;              /* residual */
    dk = _aux_vectors + wa_size;    /* descent direction */
    gk = _aux_vectors + wa_size*2;  /* preconditioned residual */
    zk = _aux_vectors + wa_size*3;  /* A.dk, maintained by recurrence */
    sk = _aux_vectors + wa_size*4;  /* A.gk */
  }

  /* Initialize iterative calculation */
  /*----------------------------------*/

  /* Residue and descent direction */

  cs_matrix_vector_multiply(a, vx, rk);  /* rk = A.x0 */

# pragma omp parallel for if(n_rows > CS_THR_MIN)
  for (cs_lnum_t ii = 0; ii < n_rows; ii++)
    rk[ii] -= rhs[ii];

  /* Preconditioning: gk = M^-1.rk */

  c->setup_data->pc_apply(c->setup_data->pc_context, rk, gk);

  /* Descent direction */
  /*-------------------*/

#if defined(HAVE_OPENMP)

# pragma omp parallel for if(n_rows > CS_THR_MIN)
  for (cs_lnum_t ii = 0; ii < n_rows; ii++)
    dk[ii] = gk[ii];

#else

  memcpy(dk, gk, n_rows * sizeof(cs_real_t));

#endif

  cs_matrix_vector_multiply(a, dk, zk);  /* zk = A.dk */

  /* Descent parameter: residue <- rk.rk, ro_0 <- rk.dk, ro_1 <- dk.zk */

  _dot_products_xx_xy_yz(c, rk, dk, zk, &residue, &ro_0, &ro_1);
  residue = sqrt(residue);

  c->setup_data->initial_residue = residue;

  /* If no solving required, finish here */

  cvg = _convergence_test(c, n_iter, residue, convergence);

  if (cvg == CS_SLES_ITERATING) {

    n_iter = 1;

    /* Guard against division by a (near-)zero curvature term */

    cs_real_t d_ro_1 = (CS_ABS(ro_1) > DBL_MIN) ? 1. / ro_1 : 0.;
    alpha = - ro_0 * d_ro_1;

    rk_gkm1 = ro_0;  /* here ro_0 = rk.dk = rk.gk, since dk = gk */

#   pragma omp parallel if(n_rows > CS_THR_MIN)
    {
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        vx[ii] += (alpha * dk[ii]);

#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        rk[ii] += (alpha * zk[ii]);
    }

    /* Convergence test */

    residue = sqrt(_dot_product_xx(c, rk));

    cvg = _convergence_test(c, n_iter, residue, convergence);

  }

  /* Current Iteration */
  /*-------------------*/

  while (cvg == CS_SLES_ITERATING) {

    /* Preconditioning */

    c->setup_data->pc_apply(c->setup_data->pc_context, rk, gk);

    cs_matrix_vector_multiply(a, gk, sk);  /* sk = A.gk */

    /* Compute residue and prepare descent parameter; this is the single
       grouped reduction of the iteration (hence "single reduce"):
       residue <- rk.rk, rk_gk <- rk.gk, gk_sk <- gk.sk */

    _dot_products_xx_xy_yz(c, rk, gk, sk, &residue, &rk_gk, &gk_sk);

    residue = sqrt(residue);

    /* Convergence test for end of previous iteration */

    if (n_iter > 1)
      cvg = _convergence_test(c, n_iter, residue, convergence);

    if (cvg != CS_SLES_ITERATING)
      break;

    n_iter += 1;

    /* Complete descent parameter computation and matrix.vector product */

    beta = rk_gk / rk_gkm1;
    rk_gkm1 = rk_gk;

    /* Recurrence replacing the second dot-product pass of standard PCG:
       since dk <- gk + beta.dk and zk <- sk + beta.zk below,
       dk.zk = gk.sk + beta^2 * (old dk.zk) (cross terms cancel for a
       symmetric matrix, cf. LAPACK Working Note 56) */

    ro_1 = gk_sk - beta*beta*ro_1;
    ro_0 = rk_gk;

    cs_real_t d_ro_1 = (CS_ABS(ro_1) > DBL_MIN) ? 1. / ro_1 : 0.;
    alpha = - ro_0 * d_ro_1;

    /* First loop writes dk/vx, second writes zk/rk: independent,
       so nowait is safe; zk = A.dk is maintained by recurrence rather
       than an extra matrix.vector product */

#   pragma omp parallel if(n_rows > CS_THR_MIN)
    {
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
        dk[ii] = gk[ii] + (beta * dk[ii]);
        vx[ii] += alpha * dk[ii];
      }
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
        zk[ii] = sk[ii] + (beta * zk[ii]);
        rk[ii] += alpha * zk[ii];
      }
    }

  }

  if (_aux_vectors != aux_vectors)
    BFT_FREE(_aux_vectors);

  return cvg;
}
1007
1008 /*----------------------------------------------------------------------------
1009 * Solution of A.vx = Rhs using non-preconditioned conjugate gradient.
1010 *
1011 * On entry, vx is considered initialized.
1012 *
1013 * parameters:
1014 * c <-- pointer to solver context info
1015 * a <-- matrix
1016 * diag_block_size <-- block size of element ii, ii
1017 * convergence <-- convergence information structure
1018 * rhs <-- right hand side
1019 * vx <-> system solution
1020 * aux_size <-- number of elements in aux_vectors (in bytes)
1021 * aux_vectors --- optional working area (allocation otherwise)
1022 *
1023 * returns:
1024 * convergence state
1025 *----------------------------------------------------------------------------*/
1026
static cs_sles_convergence_state_t
_conjugate_gradient_npc(cs_sles_it_t              *c,
                        const cs_matrix_t         *a,
                        cs_lnum_t                  diag_block_size,
                        cs_sles_it_convergence_t  *convergence,
                        const cs_real_t           *rhs,
                        cs_real_t                 *restrict vx,
                        size_t                     aux_size,
                        void                      *aux_vectors)
{
  cs_sles_convergence_state_t cvg;
  double  ro_0, ro_1, alpha, rk_rkm1, rk_rk, beta, residue;
  cs_real_t *_aux_vectors;
  cs_real_t  *restrict rk, *restrict dk, *restrict zk;

  unsigned n_iter = 0;

  /* Allocate or map work arrays */
  /*-----------------------------*/

  assert(c->setup_data != NULL);

  const cs_lnum_t n_rows = c->setup_data->n_rows;

  {
    const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size;
    const size_t n_wa = 3;  /* rk, dk, zk */
    const size_t wa_size = CS_SIMD_SIZE(n_cols);

    /* Use the caller-supplied work area when large enough;
       otherwise allocate one locally (freed before returning). */

    if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < (wa_size * n_wa))
      BFT_MALLOC(_aux_vectors, wa_size * n_wa, cs_real_t);
    else
      _aux_vectors = aux_vectors;

    rk = _aux_vectors;              /* residue */
    dk = _aux_vectors + wa_size;    /* descent direction */
    zk = _aux_vectors + wa_size*2;  /* A.dk */
  }

  /* Initialize iterative calculation */
  /*----------------------------------*/

  /* Residue and descent direction */

  cs_matrix_vector_multiply(a, vx, rk);  /* rk = A.x0 */

# pragma omp parallel for if(n_rows > CS_THR_MIN)
  for (cs_lnum_t ii = 0; ii < n_rows; ii++)
    rk[ii] -= rhs[ii];

  /* Descent direction */
  /*-------------------*/

#if defined(HAVE_OPENMP)

  /* Threaded copy so first-touch page placement matches the
     threading of the subsequent vector operations. */

# pragma omp parallel for if(n_rows > CS_THR_MIN)
  for (cs_lnum_t ii = 0; ii < n_rows; ii++)
    dk[ii] = rk[ii];

#else

  memcpy(dk, rk, n_rows * sizeof(cs_real_t));

#endif

  rk_rkm1 = _dot_product_xx(c, rk);  /* rk.rk for first beta */
  residue = sqrt(rk_rkm1);

  /* If no solving required, finish here */

  c->setup_data->initial_residue = residue;
  cvg = _convergence_test(c, n_iter, residue, convergence);

  if (cvg == CS_SLES_ITERATING) {

    n_iter = 1;

    cs_matrix_vector_multiply(a, dk, zk);

    /* Descent parameter: ro_0 = rk.dk, ro_1 = dk.zk = dk.(A.dk) */

    _dot_products_xy_yz(c, rk, dk, zk, &ro_0, &ro_1);

    /* Guard against division by (near-)zero: alpha = 0 stalls the
       update instead of producing infinities. */

    cs_real_t d_ro_1 = (CS_ABS(ro_1) > DBL_MIN) ? 1. / ro_1 : 0.;
    alpha =  - ro_0 * d_ro_1;

    /* vx and rk updates are independent, hence "nowait" */

#   pragma omp parallel if(n_rows > CS_THR_MIN)
    {
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        vx[ii] += (alpha * dk[ii]);

#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        rk[ii] += (alpha * zk[ii]);
    }

    /* Convergence test */

    residue = sqrt(_dot_product(c, rk, rk));

    cvg = _convergence_test(c, n_iter, residue, convergence);

  }

  /* Current Iteration */
  /*-------------------*/

  while (cvg == CS_SLES_ITERATING) {

    /* compute residue and prepare descent parameter */

    _dot_products_xx_xy(c, rk, rk, &residue, &rk_rk);

    residue = sqrt(residue);

    /* Convergence test for end of previous iteration */

    if (n_iter > 1)
      cvg = _convergence_test(c, n_iter, residue, convergence);

    if (cvg != CS_SLES_ITERATING)
      break;

    n_iter += 1;

    /* Complete descent parameter computation and matrix.vector product */

    beta = rk_rk / rk_rkm1;
    rk_rkm1 = rk_rk;

    /* NOTE(review): firstprivate(alpha) appears unnecessary here
       (the loop only reads beta) -- confirm before removing. */

#   pragma omp parallel for firstprivate(alpha) if(n_rows > CS_THR_MIN)
    for (cs_lnum_t ii = 0; ii < n_rows; ii++)
      dk[ii] = rk[ii] + (beta * dk[ii]);

    cs_matrix_vector_multiply(a, dk, zk);

    /* ro_0 = rk.dk, ro_1 = dk.zk */

    _dot_products_xy_yz(c, rk, dk, zk, &ro_0, &ro_1);

    cs_real_t d_ro_1 = (CS_ABS(ro_1) > DBL_MIN) ? 1. / ro_1 : 0.;
    alpha =  - ro_0 * d_ro_1;

#   pragma omp parallel if(n_rows > CS_THR_MIN)
    {
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        vx[ii] += (alpha * dk[ii]);

#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        rk[ii] += (alpha * zk[ii]);
    }

  }

  if (_aux_vectors != aux_vectors)
    BFT_FREE(_aux_vectors);

  return cvg;
}
1187
1188 /*----------------------------------------------------------------------------
1189 * Solution of A.vx = Rhs using non-preconditioned conjugate gradient
1190 * with single reduction.
1191 *
 * For more information, see Lapack Working note 56, at
 * http://www.netlib.org/lapack/lawnspdf/lawn56.pdf
1194 *
1195 * On entry, vx is considered initialized.
1196 *
1197 * parameters:
1198 * c <-- pointer to solver context info
1199 * a <-- matrix
1200 * diag_block_size <-- block size of element ii, ii
1201 * convergence <-- convergence information structure
1202 * rhs <-- right hand side
1203 * vx <-> system solution
1204 * aux_size <-- number of elements in aux_vectors (in bytes)
1205 * aux_vectors --- optional working area (allocation otherwise)
1206 *
1207 * returns:
1208 * convergence state
1209 *----------------------------------------------------------------------------*/
1210
static cs_sles_convergence_state_t
_conjugate_gradient_npc_sr(cs_sles_it_t              *c,
                           const cs_matrix_t         *a,
                           cs_lnum_t                  diag_block_size,
                           cs_sles_it_convergence_t  *convergence,
                           const cs_real_t           *rhs,
                           cs_real_t                 *restrict vx,
                           size_t                     aux_size,
                           void                      *aux_vectors)
{
  cs_sles_convergence_state_t cvg;
  double  ro_0, ro_1, alpha, rk_rkm1, rk_rk, rk_sk, beta, residue;
  cs_real_t *_aux_vectors;
  cs_real_t  *restrict rk, *restrict dk, *restrict sk;
  cs_real_t  *restrict zk;

  unsigned n_iter = 0;

  /* Allocate or map work arrays */
  /*-----------------------------*/

  assert(c->setup_data != NULL);

  const cs_lnum_t n_rows = c->setup_data->n_rows;

  {
    const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size;
    const size_t n_wa = 4;  /* rk, dk, zk, sk */
    const size_t wa_size = CS_SIMD_SIZE(n_cols);

    /* Use the caller-supplied work area when large enough;
       otherwise allocate one locally (freed before returning). */

    if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < (wa_size * n_wa))
      BFT_MALLOC(_aux_vectors, wa_size * n_wa, cs_real_t);
    else
      _aux_vectors = aux_vectors;

    rk = _aux_vectors;              /* residue */
    dk = _aux_vectors + wa_size;    /* descent direction */
    zk = _aux_vectors + wa_size*2;  /* A.dk, maintained by recurrence */
    sk = _aux_vectors + wa_size*3;  /* A.rk */
  }

  /* Initialize iterative calculation */
  /*----------------------------------*/

  /* Residue and descent direction */

  cs_matrix_vector_multiply(a, vx, rk);  /* rk = A.x0 */

# pragma omp parallel for if(n_rows > CS_THR_MIN)
  for (cs_lnum_t ii = 0; ii < n_rows; ii++)
    rk[ii] = rk[ii] - rhs[ii];

  /* Descent direction */
  /*-------------------*/

#if defined(HAVE_OPENMP)

# pragma omp parallel for if(n_rows > CS_THR_MIN)
  for (cs_lnum_t ii = 0; ii < n_rows; ii++)
    dk[ii] = rk[ii];

#else

  memcpy(dk, rk, n_rows * sizeof(cs_real_t));

#endif

  cs_matrix_vector_multiply(a, dk, zk);  /* zk = A.dk */

  /* Descent parameter: all three dot products grouped into a
     single (possibly global) reduction; residue = rk.rk,
     ro_0 = rk.dk, ro_1 = dk.zk. */

  _dot_products_xx_xy_yz(c, rk, dk, zk, &residue, &ro_0, &ro_1);
  residue = sqrt(residue);

  /* If no solving required, finish here */

  c->setup_data->initial_residue = residue;

  cvg = _convergence_test(c, n_iter, residue, convergence);

  if (cvg == CS_SLES_ITERATING) {

    n_iter = 1;

    /* Guard against division by (near-)zero. */

    cs_real_t d_ro_1 = (CS_ABS(ro_1) > DBL_MIN) ? 1. / ro_1 : 0.;
    alpha =  - ro_0 * d_ro_1;

    rk_rkm1 = ro_0;  /* dk == rk here, so ro_0 = rk.rk */

#   pragma omp parallel if(n_rows > CS_THR_MIN)
    {
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        vx[ii] += (alpha * dk[ii]);

#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        rk[ii] += (alpha * zk[ii]);
    }

    /* Convergence test */

    residue = sqrt(_dot_product_xx(c, rk));

    cvg = _convergence_test(c, n_iter, residue, convergence);

  }

  /* Current Iteration */
  /*-------------------*/

  while (cvg == CS_SLES_ITERATING) {

    cs_matrix_vector_multiply(a, rk, sk);  /* sk = A.rk */

    /* compute residue and prepare descent parameter:
       residue = rk.rk, rk_sk = rk.(A.rk) -- single reduction */

    _dot_products_xx_xy(c, rk, sk, &residue, &rk_sk);

    rk_rk = residue;

    residue = sqrt(residue);

    /* Convergence test for end of previous iteration */

    if (n_iter > 1)
      cvg = _convergence_test(c, n_iter, residue, convergence);

    if (cvg != CS_SLES_ITERATING)
      break;

    n_iter += 1;

    /* Complete descent parameter computation and matrix.vector product */

    beta = rk_rk / rk_rkm1;
    rk_rkm1 = rk_rk;

    /* Single-reduction recurrence (cf. LAPACK Working Note 56):
       dk.(A.dk) is updated from rk.(A.rk) and the previous value
       instead of requiring an extra dot product. */

    ro_1 = rk_sk - beta*beta*ro_1;
    ro_0 = rk_rk;

    cs_real_t d_ro_1 = (CS_ABS(ro_1) > DBL_MIN) ? 1. / ro_1 : 0.;
    alpha =  - ro_0 * d_ro_1;

    /* Update dk/vx and zk/rk; zk is advanced by recurrence so that
       zk stays equal to A.dk without a second matrix product. */

#   pragma omp parallel if(n_rows > CS_THR_MIN)
    {
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
        dk[ii] = rk[ii] + (beta * dk[ii]);
        vx[ii] += alpha * dk[ii];
      }
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
        zk[ii] = sk[ii] + (beta * zk[ii]);
        rk[ii] += alpha * zk[ii];
      }
    }

  }

  if (_aux_vectors != aux_vectors)
    BFT_FREE(_aux_vectors);

  return cvg;
}
1376
1377 /*----------------------------------------------------------------------------
1378 * Solution of A.vx = Rhs using preconditioned 3-layer conjugate residual.
1379 *
1380 * On entry, vx is considered initialized.
1381 *
1382 * parameters:
1383 * c <-- pointer to solver context info
1384 * a <-- matrix
1385 * diag_block_size <-- block size of element ii, ii
1386 * convergence <-- convergence information structure
1387 * rhs <-- right hand side
1388 * vx <-> system solution
1389 * aux_size <-- number of elements in aux_vectors (in bytes)
1390 * aux_vectors --- optional working area (allocation otherwise)
1391 *
1392 * returns:
1393 * convergence state
1394 *----------------------------------------------------------------------------*/
1395
static cs_sles_convergence_state_t
_conjugate_residual_3(cs_sles_it_t              *c,
                      const cs_matrix_t         *a,
                      cs_lnum_t                  diag_block_size,
                      cs_sles_it_convergence_t  *convergence,
                      const cs_real_t           *rhs,
                      cs_real_t                 *restrict vx,
                      size_t                     aux_size,
                      void                      *aux_vectors)
{
  cs_sles_convergence_state_t cvg;
  cs_lnum_t  ii;
  double  residue;
  /* Note: ak..ek (including dk) are scalar dot products here,
     unlike the vector dk used by the CG variants. */
  double  ak, bk, ck, dk, ek, denom, alpha, tau;
  cs_real_t *_aux_vectors;
  cs_real_t  *restrict vxm1;               /* previous iterate */
  cs_real_t  *restrict rk, *restrict rkm1; /* current / previous residue */
  cs_real_t  *restrict wk, *restrict zk;   /* preconditioned residue, A.wk */
  cs_real_t  *restrict tmp;

  unsigned n_iter = 0;

  /* Allocate or map work arrays */
  /*-----------------------------*/

  assert(c->setup_data != NULL);

  const cs_lnum_t n_rows = c->setup_data->n_rows;

  {
    const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size;
    const size_t n_wa = 6;  /* vxm1, rk, rkm1, tmp, wk, zk */
    const size_t wa_size = CS_SIMD_SIZE(n_cols);

    if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < (wa_size * n_wa))
      BFT_MALLOC(_aux_vectors, wa_size * n_wa, cs_real_t);
    else
      _aux_vectors = aux_vectors;

    vxm1 = _aux_vectors;
    rk = _aux_vectors + wa_size;
    rkm1 = _aux_vectors + wa_size*2;
    tmp = _aux_vectors + wa_size*3;
    wk = _aux_vectors + wa_size*4;
    zk = _aux_vectors + wa_size*5;

    /* No "previous" state exists yet at the first iteration. */

#   pragma omp parallel for if(n_rows > CS_THR_MIN)
    for (ii = 0; ii < n_rows; ii++) {
      vxm1[ii] = vx[ii];
      rkm1[ii] = 0.0;
    }
  }

  /* Initialize iterative calculation */
  /*----------------------------------*/

  /* Residue */

  cs_matrix_vector_multiply(a, vx, rk);  /* rk = A.x0 */

# pragma omp parallel for if(n_rows > CS_THR_MIN)
  for (ii = 0; ii < n_rows; ii++)
    rk[ii] -= rhs[ii];

  residue = _dot_product(c, rk, rk);
  residue = sqrt(residue);

  /* If no solving required, finish here */

  c->setup_data->initial_residue = residue;
  cvg = _convergence_test(c, n_iter, residue, convergence);

  /* Current Iteration */
  /*-------------------*/

  while (cvg == CS_SLES_ITERATING) {

    /* Preconditioning: wk = M^-1.rk */

    c->setup_data->pc_apply(c->setup_data->pc_context, rk, wk);

    cs_matrix_vector_multiply(a, wk, zk);  /* zk = A.wk */

    /* ak = rk.zk, bk = zk.rkm1 */

    _dot_products_xy_yz(c, rk, zk, rkm1, &ak, &bk);

#   pragma omp parallel for if(n_rows > CS_THR_MIN)
    for (ii = 0; ii < n_rows; ii++)
      tmp[ii] = rk[ii] - rkm1[ii];

    /* ck = rk.tmp, dk = tmp.rkm1 */

    _dot_products_xy_yz(c, rk, tmp, rkm1, &ck, &dk);

    ek = _dot_product_xx(c, zk);  /* ek = zk.zk */

    /* Combination coefficients; fall back to alpha = 1
       (plain 2-term update) when the system is near-singular. */

    denom = (ck-dk)*ek - ((ak-bk)*(ak-bk));

    if (fabs(denom) < 1.e-30)
      alpha = 1.0;
    else
      alpha = ((ak-bk)*bk - dk*ek)/denom;

    if (fabs(alpha) < 1.e-30 || fabs(alpha - 1.) < 1.e-30) {
      alpha = 1.0;
      tau = ak/ek;
    }
    else
      tau = ak/ek + ((1 - alpha)/alpha) * bk/ek;

    cs_real_t c0 = (1 - alpha);
    cs_real_t c1 = -alpha*tau;

    /* 3-term recurrences: combine current state, previous state and
       the new (preconditioned) direction; save state for next pass. */

#   pragma omp parallel firstprivate(alpha, tau, c0, c1) if(n_rows > CS_THR_MIN)
    {
#     pragma omp for nowait
      for (ii = 0; ii < n_rows; ii++) {
        cs_real_t trk = rk[ii];
        rk[ii] = alpha*rk[ii] + c0*rkm1[ii] + c1*zk[ii];
        rkm1[ii] = trk;
      }
#     pragma omp for nowait
      for (ii = 0; ii < n_rows; ii++) {
        cs_real_t tvx = vx[ii];
        vx[ii] = alpha*vx[ii] + c0*vxm1[ii] + c1*wk[ii];
        vxm1[ii] = tvx;
      }
    }

    /* compute residue */

    residue = sqrt(_dot_product(c, rk, rk));

    /* Convergence test for end of previous iteration */

    if (n_iter > 1)
      cvg = _convergence_test(c, n_iter, residue, convergence);

    if (cvg != CS_SLES_ITERATING)
      break;

    n_iter += 1;

  }

  if (_aux_vectors != aux_vectors)
    BFT_FREE(_aux_vectors);

  return cvg;
}
1543
1544 /*----------------------------------------------------------------------------
1545 * Solution of A.vx = Rhs using Jacobi.
1546 *
1547 * On entry, vx is considered initialized.
1548 *
1549 * parameters:
1550 * c <-- pointer to solver context info
1551 * a <-- linear equation matrix
1552 * diag_block_size <-- diagonal block size
1553 * convergence <-- convergence information structure
1554 * rhs <-- right hand side
1555 * vx <-> system solution
1556 * aux_size <-- number of elements in aux_vectors (in bytes)
1557 * aux_vectors --- optional working area (allocation otherwise)
1558 *
1559 * returns:
1560 * convergence state
1561 *----------------------------------------------------------------------------*/
1562
1563 static cs_sles_convergence_state_t
_jacobi(cs_sles_it_t * c,const cs_matrix_t * a,cs_lnum_t diag_block_size,cs_sles_it_convergence_t * convergence,const cs_real_t * rhs,cs_real_t * restrict vx,size_t aux_size,void * aux_vectors)1564 _jacobi(cs_sles_it_t *c,
1565 const cs_matrix_t *a,
1566 cs_lnum_t diag_block_size,
1567 cs_sles_it_convergence_t *convergence,
1568 const cs_real_t *rhs,
1569 cs_real_t *restrict vx,
1570 size_t aux_size,
1571 void *aux_vectors)
1572 {
1573 cs_sles_convergence_state_t cvg;
1574 cs_lnum_t ii;
1575 double res2, residue;
1576 cs_real_t *_aux_vectors;
1577 cs_real_t *restrict rk;
1578
1579 unsigned n_iter = 0;
1580
1581 /* Allocate or map work arrays */
1582 /*-----------------------------*/
1583
1584 assert(c->setup_data != NULL);
1585
1586 const cs_real_t *restrict ad_inv = c->setup_data->ad_inv;
1587
1588 const cs_lnum_t n_rows = c->setup_data->n_rows;
1589
1590 {
1591 const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size;
1592 const size_t n_wa = 1;
1593 const size_t wa_size = CS_SIMD_SIZE(n_cols);
1594
1595 if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < (wa_size * n_wa))
1596 BFT_MALLOC(_aux_vectors, wa_size * n_wa, cs_real_t);
1597 else
1598 _aux_vectors = aux_vectors;
1599
1600 rk = _aux_vectors;
1601 }
1602
1603 const cs_real_t *restrict ad = cs_matrix_get_diagonal(a);
1604
1605 cvg = CS_SLES_ITERATING;
1606
1607 /* Current iteration */
1608 /*-------------------*/
1609
1610 while (cvg == CS_SLES_ITERATING) {
1611
1612 n_iter += 1;
1613
1614 #if defined(HAVE_OPENMP)
1615
1616 # pragma omp parallel for if(n_rows > CS_THR_MIN)
1617 for (ii = 0; ii < n_rows; ii++)
1618 rk[ii] = vx[ii];
1619
1620 #else
1621
1622 memcpy(rk, vx, n_rows * sizeof(cs_real_t)); /* rk <- vx */
1623
1624 #endif
1625
1626 /* Compute Vx <- Vx - (A-diag).Rk and residue. */
1627
1628 cs_matrix_exdiag_vector_multiply(a, rk, vx);
1629
1630 res2 = 0.0;
1631
1632 # pragma omp parallel for reduction(+:res2) if(n_rows > CS_THR_MIN)
1633 for (ii = 0; ii < n_rows; ii++) {
1634 vx[ii] = (rhs[ii]-vx[ii])*ad_inv[ii];
1635 double r = ad[ii] * (vx[ii]-rk[ii]);
1636 res2 += (r*r);
1637 }
1638
1639 #if defined(HAVE_MPI)
1640
1641 if (c->comm != MPI_COMM_NULL) {
1642 double _sum;
1643 MPI_Allreduce(&res2, &_sum, 1, MPI_DOUBLE, MPI_SUM, c->comm);
1644 res2 = _sum;
1645 }
1646
1647 #endif /* defined(HAVE_MPI) */
1648
1649 residue = sqrt(res2); /* Actually, residue of previous iteration */
1650
1651 /* Convergence test */
1652
1653 if (n_iter == 1)
1654 c->setup_data->initial_residue = residue;
1655
1656 cvg = _convergence_test(c, n_iter, residue, convergence);
1657
1658 }
1659
1660 if (_aux_vectors != aux_vectors)
1661 BFT_FREE(_aux_vectors);
1662
1663 return cvg;
1664 }
1665
1666 /*----------------------------------------------------------------------------
1667 * Solution of A.vx = Rhs using block Jacobi.
1668 *
1669 * On entry, vx is considered initialized.
1670 *
1671 * parameters:
1672 * c <-- pointer to solver context info
1673 * a <-- linear equation matrix
1674 * diag_block_size <-- diagonal block size (unused here)
1675 * convergence <-- convergence information structure
1676 * rhs <-- right hand side
1677 * vx --> system solution
1678 * aux_size <-- number of elements in aux_vectors (in bytes)
1679 * aux_vectors --- optional working area (allocation otherwise)
1680 *
1681 * returns:
1682 * convergence state
1683 *----------------------------------------------------------------------------*/
1684
static cs_sles_convergence_state_t
_block_3_jacobi(cs_sles_it_t              *c,
                const cs_matrix_t         *a,
                cs_lnum_t                  diag_block_size,
                cs_sles_it_convergence_t  *convergence,
                const cs_real_t           *rhs,
                cs_real_t                 *restrict vx,
                size_t                     aux_size,
                void                      *aux_vectors)
{
  /* NOTE(review): diag_block_size is in fact read below (n_cols
     computation), so this CS_UNUSED marker looks redundant -- confirm. */
  CS_UNUSED(diag_block_size);

  assert(diag_block_size == 3);  /* variant specialized for 3x3 blocks */

  cs_sles_convergence_state_t cvg;
  double  res2, residue;
  cs_real_t *_aux_vectors;
  cs_real_t  *restrict rk, *restrict vxx;

  unsigned n_iter = 0;

  /* Allocate or map work arrays */
  /*-----------------------------*/

  assert(c->setup_data != NULL);

  /* Factorized 3x3 diagonal blocks (9 values per block) prepared
     during setup, used by _fw_and_bw_lu33 below. */

  const cs_real_t  *restrict ad_inv = c->setup_data->ad_inv;

  const cs_lnum_t n_rows = c->setup_data->n_rows;
  const cs_lnum_t n_blocks = c->setup_data->n_rows / 3;

  {
    const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size;
    const size_t n_wa = 2;  /* rk, vxx */
    const size_t wa_size = CS_SIMD_SIZE(n_cols);

    if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < (wa_size * n_wa))
      BFT_MALLOC(_aux_vectors, wa_size * n_wa, cs_real_t);
    else
      _aux_vectors = aux_vectors;

    rk = _aux_vectors;             /* previous iterate */
    vxx = _aux_vectors + wa_size;  /* extra-diagonal contribution */
  }

  const cs_real_t  *restrict ad = cs_matrix_get_diagonal(a);

  cvg = CS_SLES_ITERATING;

  /* Current iteration */
  /*-------------------*/

  while (cvg == CS_SLES_ITERATING) {

    n_iter += 1;
    memcpy(rk, vx, n_rows * sizeof(cs_real_t)); /* rk <- vx */

    /* Compute vxx <- vx - (a-diag).rk and residue. */

    cs_matrix_exdiag_vector_multiply(a, rk, vxx);

    res2 = 0.0;

    /* Compute vx <- diag^-1 . (vxx - rhs) and residue. */
#   pragma omp parallel for reduction(+:res2) if(n_blocks > CS_THR_MIN)
    for (cs_lnum_t ii = 0; ii < n_blocks; ii++) {
      /* Solve each local 3x3 system with the block's stored
         factorization (forward/backward substitution). */
      _fw_and_bw_lu33(ad_inv + 9*ii,
                      vx + 3*ii,
                      vxx + 3*ii,
                      rhs + 3*ii);
      /* Residue contribution: diag.(vx - rk) per block row
         (ad holds 9 values per block, row-major). */
      for (cs_lnum_t jj = 0; jj < 3; jj++) {
        register double r = 0.0;
        for (cs_lnum_t kk = 0; kk < 3; kk++)
          r +=    ad[ii*9 + jj*3 + kk]
               * (vx[ii*3 + kk] - rk[ii*3 + kk]);
        res2 += (r*r);
      }
    }

#if defined(HAVE_MPI)

    /* Sum local contributions across ranks. */

    if (c->comm != MPI_COMM_NULL) {
      double _sum;
      MPI_Allreduce(&res2, &_sum, 1, MPI_DOUBLE, MPI_SUM, c->comm);
      res2 = _sum;
    }

#endif /* defined(HAVE_MPI) */

    residue = sqrt(res2); /* Actually, residue of previous iteration */

    if (n_iter == 1)
      c->setup_data->initial_residue = residue;

    /* Convergence test */

    cvg = _convergence_test(c, n_iter, residue, convergence);

  }

  if (_aux_vectors != aux_vectors)
    BFT_FREE(_aux_vectors);

  return cvg;
}
1789
1790 /*----------------------------------------------------------------------------
1791 * Solution of A.vx = Rhs using block Jacobi.
1792 *
1793 * On entry, vx is considered initialized.
1794 *
1795 * parameters:
1796 * c <-- pointer to solver context info
1797 * a <-- linear equation matrix
1798 * diag_block_size <-- block size of diagonal elements
1799 * convergence <-- convergence information structure
1800 * rhs <-- right hand side
1801 * vx --> system solution
1802 * aux_size <-- number of elements in aux_vectors (in bytes)
1803 * aux_vectors --- optional working area (allocation otherwise)
1804 *
1805 * returns:
1806 * convergence state
1807 *----------------------------------------------------------------------------*/
1808
static cs_sles_convergence_state_t
_block_jacobi(cs_sles_it_t              *c,
              const cs_matrix_t         *a,
              cs_lnum_t                  diag_block_size,
              cs_sles_it_convergence_t  *convergence,
              const cs_real_t           *rhs,
              cs_real_t                 *restrict vx,
              size_t                     aux_size,
              void                      *aux_vectors)
{
  cs_sles_convergence_state_t cvg;
  double  res2, residue;
  cs_real_t *_aux_vectors;
  cs_real_t  *restrict rk, *restrict vxx;

  unsigned n_iter = 0;

  /* Call setup if not already done, allocate or map work arrays */
  /*-------------------------------------------------------------*/

  assert(c->setup_data != NULL);

  /* Diagonal block size descriptor; from the indexing below:
     db_size[0]: block size, db_size[1]: per-block vector stride,
     db_size[2]: per-row stride inside a block, db_size[3]: per-block
     matrix stride. NOTE(review): inferred from usage -- confirm
     against cs_matrix_get_diag_block_size documentation. */

  const cs_lnum_t *db_size = cs_matrix_get_diag_block_size(a);

  /* Factorized diagonal blocks prepared during setup,
     used by _fw_and_bw_lu below. */

  const cs_real_t  *restrict ad_inv = c->setup_data->ad_inv;

  const cs_lnum_t n_rows = c->setup_data->n_rows;
  const cs_lnum_t n_blocks = c->setup_data->n_rows / diag_block_size;

  {
    const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size;
    const size_t n_wa = 2;  /* rk, vxx */
    const size_t wa_size = CS_SIMD_SIZE(n_cols);

    if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < (wa_size * n_wa))
      BFT_MALLOC(_aux_vectors, wa_size * n_wa, cs_real_t);
    else
      _aux_vectors = aux_vectors;

    rk = _aux_vectors;             /* previous iterate */
    vxx = _aux_vectors + wa_size;  /* extra-diagonal contribution */
  }

  const cs_real_t  *restrict ad = cs_matrix_get_diagonal(a);

  cvg = CS_SLES_ITERATING;

  /* Current iteration */
  /*-------------------*/

  while (cvg == CS_SLES_ITERATING) {

    n_iter += 1;
    memcpy(rk, vx, n_rows * sizeof(cs_real_t)); /* rk <- vx */

    /* Compute Vx <- Vx - (A-diag).Rk and residue. */

    cs_matrix_exdiag_vector_multiply(a, rk, vxx);

    res2 = 0.0;

#   pragma omp parallel for reduction(+:res2) if(n_blocks > CS_THR_MIN)
    for (cs_lnum_t ii = 0; ii < n_blocks; ii++) {
      /* Solve each local block system using the block's stored
         factorization (forward/backward substitution). */
      _fw_and_bw_lu(ad_inv + db_size[3]*ii,
                    db_size[0],
                    vx + db_size[1]*ii,
                    vxx + db_size[1]*ii,
                    rhs + db_size[1]*ii);
      /* Residue contribution: diag.(vx - rk) per block row. */
      for (cs_lnum_t jj = 0; jj < db_size[0]; jj++) {
        register double r = 0.0;
        for (cs_lnum_t kk = 0; kk < db_size[0]; kk++)
          r +=    ad[ii*db_size[3] + jj*db_size[2] + kk]
               * (vx[ii*db_size[1] + kk] - rk[ii*db_size[1] + kk]);
        res2 += (r*r);
      }
    }

#if defined(HAVE_MPI)

    /* Sum local contributions across ranks. */

    if (c->comm != MPI_COMM_NULL) {
      double _sum;
      MPI_Allreduce(&res2, &_sum, 1, MPI_DOUBLE, MPI_SUM, c->comm);
      res2 = _sum;
    }

#endif /* defined(HAVE_MPI) */

    residue = sqrt(res2); /* Actually, residue of previous iteration */

    if (n_iter == 1)
      c->setup_data->initial_residue = residue;

    /* Convergence test */

    cvg = _convergence_test(c, n_iter, residue, convergence);

  }

  if (_aux_vectors != aux_vectors)
    BFT_FREE(_aux_vectors);

  return cvg;
}
1911
1912
1913 /*----------------------------------------------------------------------------
1914 * Test for (and eventually report) breakdown.
1915 *
1916 * parameters:
1917 * c <-- pointer to solver context info
1918 * convergence <-- convergence information structure
 * coeff_name <-- coefficient name
 * coeff <-- coefficient to test
1921 * epsilon <-- value to test against
1922 * n_iter <-- current number of iterations
1923 * cvg <-> convergence status
1924 *
1925 * returns:
1926 * true in case of breakdown, false otherwise
1927 *----------------------------------------------------------------------------*/
1928
1929 static inline bool
_breakdown(cs_sles_it_t * c,cs_sles_it_convergence_t * convergence,const char * coeff_name,double coeff,double epsilon,double residue,int n_iter,cs_sles_convergence_state_t * cvg)1930 _breakdown(cs_sles_it_t *c,
1931 cs_sles_it_convergence_t *convergence,
1932 const char *coeff_name,
1933 double coeff,
1934 double epsilon,
1935 double residue,
1936 int n_iter,
1937 cs_sles_convergence_state_t *cvg)
1938 {
1939 bool retval = false;
1940
1941 if (CS_ABS(coeff) < epsilon) {
1942
1943 bft_printf
1944 (_("\n\n"
1945 "%s [%s]:\n"
1946 " @@ Warning: non convergence\n"
1947 "\n"
1948 " norm of coefficient \"%s\" is lower than %12.4e\n"
1949 "\n"
1950 " The resolution does not progress anymore."),
1951 cs_sles_it_type_name[c->type], convergence->name, coeff_name, epsilon);
1952 bft_printf(_(" n_iter : %5u, res_abs : %11.4e, res_nor : %11.4e\n"),
1953 n_iter, residue, residue/convergence->r_norm);
1954
1955 *cvg = CS_SLES_BREAKDOWN;
1956 retval = true;
1957 }
1958
1959 return retval;
1960 }
1961
1962 /*----------------------------------------------------------------------------
1963 * Solution of A.vx = Rhs using preconditioned Bi-CGSTAB.
1964 *
1965 * Parallel-optimized version, groups dot products, at the cost of
1966 * computation of the preconditionning for n+1 iterations instead of n.
1967 *
1968 * On entry, vx is considered initialized.
1969 *
1970 * parameters:
1971 * c <-- pointer to solver context info
1972 * name <-- pointer to system name
1973 * a <-- matrix
1974 * diag_block_size <-- block size of diagonal elements
1975 * convergence <-- convergence information structure
1976 * rhs <-- right hand side
1977 * vx <-> system solution
1978 * aux_size <-- number of elements in aux_vectors (in bytes)
1979 * aux_vectors --- optional working area (allocation otherwise)
1980 *
1981 * returns:
1982 * convergence state
1983 *----------------------------------------------------------------------------*/
1984
static cs_sles_convergence_state_t
_bi_cgstab(cs_sles_it_t              *c,
           const cs_matrix_t         *a,
           cs_lnum_t                  diag_block_size,
           cs_sles_it_convergence_t  *convergence,
           const cs_real_t           *rhs,
           cs_real_t                 *restrict vx,
           size_t                     aux_size,
           void                      *aux_vectors)
{
  cs_sles_convergence_state_t cvg;
  double  _epzero = 1.e-30; /* smaller than epzero */
  double  ro_0, ro_1, alpha, beta, betam1, gamma, omega, ukres0;
  double  residue;
  cs_real_t *_aux_vectors;
  cs_real_t  *restrict res0, *restrict rk, *restrict pk, *restrict zk;
  cs_real_t  *restrict uk, *restrict vk;

  unsigned n_iter = 0;

  /* Allocate or map work arrays */
  /*-----------------------------*/

  assert(c->setup_data != NULL);

  const cs_lnum_t n_rows = c->setup_data->n_rows;

  {
    const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size;
    const size_t n_wa = 6;  /* res0, rk, pk, zk, uk, vk */
    const size_t wa_size = CS_SIMD_SIZE(n_cols);

    /* Use the caller-supplied work area when large enough;
       otherwise allocate one locally (freed before returning). */

    if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < (wa_size * n_wa))
      BFT_MALLOC(_aux_vectors, wa_size * n_wa, cs_real_t);
    else
      _aux_vectors = aux_vectors;

    res0 = _aux_vectors;            /* initial residue (shadow vector) */
    rk = _aux_vectors + wa_size;    /* current residue */
    pk = _aux_vectors + wa_size*2;  /* search direction */
    zk = _aux_vectors + wa_size*3;  /* preconditioned vector */
    uk = _aux_vectors + wa_size*4;  /* A.zk (direction branch) */
    vk = _aux_vectors + wa_size*5;  /* A.zk (stabilization branch) */
  }

# pragma omp parallel for if(n_rows > CS_THR_MIN)
  for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
    pk[ii] = 0.0;
    uk[ii] = 0.0;
  }

  /* Initialize iterative calculation */
  /*----------------------------------*/

  cs_matrix_vector_multiply(a, vx, res0);

  /* res0 = rhs - A.x0; rk starts as a copy of it */

# pragma omp parallel for if(n_rows > CS_THR_MIN)
  for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
    res0[ii] = -res0[ii] + rhs[ii];
    rk[ii] = res0[ii];
  }

  alpha = 1.0;
  betam1 = 1.0;
  gamma = 1.0;

  cvg = CS_SLES_ITERATING;

  /* Current Iteration */
  /*-------------------*/

  while (cvg == CS_SLES_ITERATING) {

    /* Compute beta and omega;
       group dot products for new iteration's beta
       and previous iteration's residue to reduce total latency */

    if (n_iter == 0) {
      beta = _dot_product_xx(c, rk); /* rk == res0 here */
      residue = sqrt(beta);
      c->setup_data->initial_residue = residue;
    }
    else {
      /* residue = rk.rk, beta = rk.res0 in one reduction */
      _dot_products_xx_xy(c, rk, res0, &residue, &beta);
      residue = sqrt(residue);
    }

    /* Convergence test */
    cvg = _convergence_test(c, n_iter, residue, convergence);
    if (cvg != CS_SLES_ITERATING)
      break;

    n_iter += 1;

    /* Abort (with diagnostic) if a scalar in a denominator vanishes */

    if (_breakdown(c, convergence, "beta", beta, _epzero,
                   residue, n_iter, &cvg))
      break;

    if (_breakdown(c, convergence, "alpha", alpha, _epzero,
                   residue, n_iter, &cvg))
      break;

    omega = beta*gamma / (alpha*betam1);
    betam1 = beta;

    /* Compute pk */

#   pragma omp parallel for if(n_rows > CS_THR_MIN)
    for (cs_lnum_t ii = 0; ii < n_rows; ii++)
      pk[ii] = rk[ii] + omega*(pk[ii] - alpha*uk[ii]);

    /* Compute zk = c.pk */

    c->setup_data->pc_apply(c->setup_data->pc_context, pk, zk);

    /* Compute uk = A.zk */

    cs_matrix_vector_multiply(a, zk, uk);

    /* Compute uk.res0 and gamma */

    ukres0 = _dot_product(c, uk, res0);

    gamma = beta / ukres0;

    /* First update of vx and rk (independent, hence "nowait") */

#   pragma omp parallel if(n_rows > CS_THR_MIN)
    {
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        vx[ii] += (gamma * zk[ii]);

#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        rk[ii] -= (gamma * uk[ii]);
    }

    /* Compute zk = C.rk (zk is overwritten; vk is a work array) */

    c->setup_data->pc_apply(c->setup_data->pc_context, rk, zk);

    /* Compute vk = A.zk and alpha */

    cs_matrix_vector_multiply(a, zk, vk);

    /* ro_1 = vk.vk, ro_0 = vk.rk in one reduction */

    _dot_products_xx_xy(c, vk, rk, &ro_1, &ro_0);

    if (_breakdown(c, convergence, "rho1", ro_1, _epzero,
                   residue, n_iter, &cvg))
      break;

    /* Guard against division by (near-)zero. */

    cs_real_t d_ro_1 = (CS_ABS(ro_1) > DBL_MIN) ? 1. / ro_1 : 0.;
    alpha = ro_0 * d_ro_1;

    /* Final update of vx and rk */

#   pragma omp parallel if(n_rows > CS_THR_MIN)
    {
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        vx[ii] += (alpha * zk[ii]);

#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        rk[ii] -= (alpha * vk[ii]);
    }

    /* Convergence test at beginning of next iteration so
       as to group dot products for better parallel performance */
  }

  if (_aux_vectors != aux_vectors)
    BFT_FREE(_aux_vectors);

  return cvg;
}
2162
2163 /*----------------------------------------------------------------------------
2164 * Solution of (ad+ax).vx = Rhs using (not yet preconditioned) Bi-CGSTAB2.
2165 *
 * This Krylov method is based on an implementation of J.P. Caltagirone
2167 * in his file bibpac6.f90 for Aquillon. He refers to it as Bi-CGSTAB2,
2168 * but a proper reference for the method has yet to be found.
2169 * (Gutknecht's BiCGstab2?)
2170 *
2171 * On entry, vx is considered initialized.
2172 *
2173 * parameters:
2174 * c <-- pointer to solver context info
2175 * a <-- matrix
2176 * diag_block_size <-- block size of element ii, ii
2177 * convergence <-- convergence information structure
2178 * rhs <-- right hand side
2179 * vx <-> system solution
2180 * aux_size <-- number of elements in aux_vectors (in bytes)
2181 * aux_vectors --- optional working area (allocation otherwise)
2182 *
2183 * returns:
2184 * convergence state
2185 *----------------------------------------------------------------------------*/
2186
/* Bi-CGSTAB2-type Krylov solver (J.P. Caltagirone variant); see the
   descriptive comment block above for background and parameters. */

static cs_sles_convergence_state_t
_bicgstab2(cs_sles_it_t              *c,
           const cs_matrix_t         *a,
           cs_lnum_t                  diag_block_size,
           cs_sles_it_convergence_t  *convergence,
           const cs_real_t           *rhs,
           cs_real_t                 *restrict vx,
           size_t                     aux_size,
           void                      *aux_vectors)
{
  cs_sles_convergence_state_t cvg;
  double _epzero = 1.e-30;/* breakdown threshold; smaller than epzero */
  double ro_0, ro_1, alpha, beta, gamma;
  double omega_1, omega_2, mu, nu, tau;
  double residue;
  cs_real_t *_aux_vectors;
  cs_real_t *restrict res0, *restrict qk, *restrict rk, *restrict sk;
  cs_real_t *restrict tk, *restrict uk, *restrict vk, *restrict wk;
  cs_real_t *restrict zk;

  unsigned n_iter = 0;

  /* Allocate or map work arrays */
  /*-----------------------------*/

  assert(c->setup_data != NULL);

  const cs_lnum_t n_rows = c->setup_data->n_rows;

  {
    const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size;
    size_t n_wa = 9;  /* 9 work arrays, each of padded size wa_size */
    const size_t wa_size = CS_SIMD_SIZE(n_cols);

    /* Use the caller-provided work area when it is large enough;
       allocate otherwise (freed at end of function). */
    if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < (wa_size * n_wa))
      BFT_MALLOC(_aux_vectors, wa_size * n_wa, cs_real_t);
    else
      _aux_vectors = aux_vectors;

    res0 = _aux_vectors;              /* initial residual r0 */
    zk = _aux_vectors + wa_size;      /* preconditioned vector */
    qk = _aux_vectors + wa_size*2;    /* fixed shadow residual (copy of r0) */
    rk = _aux_vectors + wa_size*3;    /* current residual */
    sk = _aux_vectors + wa_size*4;
    tk = _aux_vectors + wa_size*5;
    uk = _aux_vectors + wa_size*6;
    vk = _aux_vectors + wa_size*7;
    wk = _aux_vectors + wa_size*8;
  }

  /* Initialize work arrays */

# pragma omp parallel for if(n_rows > CS_THR_MIN)
  for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
    uk[ii] = 0.0;
  }

  /* Initialize iterative calculation */
  /*----------------------------------*/

  /* res0 = b - A.x0; rk and qk both start as copies of res0. */

  cs_matrix_vector_multiply(a, vx, res0);

# pragma omp parallel for if(n_rows > CS_THR_MIN)
  for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
    res0[ii] = -res0[ii] + rhs[ii];
    rk[ii] = res0[ii];
    qk[ii] = rk[ii];
  }

  ro_0 = 1.0;
  alpha = 0.0;
  omega_2 = 1.0;

  cvg = CS_SLES_ITERATING;

  /* Current Iteration */
  /*-------------------*/

  while (cvg == CS_SLES_ITERATING) {

    /* Compute beta and omega;
       group dot products for new iteration's beta
       and previous iteration's residue to reduce total latency */

    double mprec = 1.0e-60; /* tiny offset guarding divisions below */

    if (n_iter == 0) {
      residue = sqrt(_dot_product_xx(c, rk)); /* rk == res0 here */
      c->setup_data->initial_residue = residue;
    }
    else
      residue = sqrt(_dot_product_xx(c, rk));

    /* Convergence test */
    cvg = _convergence_test(c, n_iter, residue, convergence);
    if (cvg != CS_SLES_ITERATING)
      break;

    n_iter += 1;

    /* rho_0 <- -omega_2 * rho_0; rho_1 = <q, r> (BiCG coefficients) */

    ro_0 = -omega_2*ro_0;
    ro_1 = _dot_product(c, qk, rk);

    if (_breakdown(c, convergence, "rho0", ro_0, 1.e-60,
                   residue, n_iter, &cvg))
      break;

    if (_breakdown(c, convergence, "rho1", ro_1, _epzero,
                   residue, n_iter, &cvg))
      break;

    beta = alpha*ro_1/ro_0;
    ro_0 = ro_1;

    /* u <- r - beta*u */

# pragma omp parallel for if(n_rows > CS_THR_MIN)
    for (cs_lnum_t ii = 0; ii < n_rows; ii++)
      uk[ii] = rk[ii] - beta* uk[ii];

    /* Compute vk = A*uk */

    cs_matrix_vector_multiply(a, uk, vk);

    /* Preconditioning */

    /* NOTE(review): zk receives the preconditioned vector here and in the
       three pc_apply calls below, but zk is never read afterwards, so the
       preconditioner appears to have no effect in this variant — confirm
       whether this is intentional. */

    c->setup_data->pc_apply(c->setup_data->pc_context, vk, zk);

    /* Compute gamma and alpha */

    gamma = _dot_product(c, qk, vk);

    if (_breakdown(c, convergence, "gamma", gamma, 1.e-60,
                   residue, n_iter, &cvg))
      break;

    alpha = ro_0/gamma;

    /* Update rk (first half-step: r <- r - alpha*v, x <- x + alpha*u) */

# pragma omp parallel for if(n_rows > CS_THR_MIN)
    for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
      rk[ii] -= alpha*vk[ii];
      vx[ii] += alpha*uk[ii];
    }

    /* s = A*r */

    cs_matrix_vector_multiply(a, rk, sk);

    c->setup_data->pc_apply(c->setup_data->pc_context, sk, zk);

    ro_1 = _dot_product(c, qk, sk);
    beta = alpha*ro_1/ro_0;
    ro_0 = ro_1;

    /* v <- s - beta*v ; u <- r - beta*u (independent loops) */

# pragma omp parallel if(n_rows > CS_THR_MIN)
    {
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        vk[ii] = sk[ii] - beta*vk[ii];
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        uk[ii] = rk[ii] - beta*uk[ii];
    }

    /* wk = A*vk */

    cs_matrix_vector_multiply(a, vk, wk);

    c->setup_data->pc_apply(c->setup_data->pc_context, wk, zk);

    gamma = _dot_product(c, qk, wk);
    alpha = (ro_0+mprec)/(gamma+mprec);

    /* r <- r - alpha*v ; s <- s - alpha*w */

# pragma omp parallel for if(n_rows > CS_THR_MIN)
    for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
      rk[ii] -= alpha*vk[ii];
      sk[ii] -= alpha*wk[ii];
    }

    /* tk = A*sk */

    cs_matrix_vector_multiply(a, sk, tk);

    c->setup_data->pc_apply(c->setup_data->pc_context, tk, zk);

    /* 5 grouped dot products:
       mu = <s,s>, tau = <t,t>, nu = <s,t>,
       omega_1 = <r,s>, omega_2 = <r,t> */

    _dot_products_xx_yy_xy_xz_yz(c, sk, tk, rk,
                                 &mu, &tau, &nu, &omega_1, &omega_2);

    /* Solve the 2x2 least-squares system for omega_1, omega_2
       (mprec guards against zero denominators). */

    tau = tau - (nu*nu/mu);
    omega_2 = (omega_2 - ((nu+mprec)*(omega_1+mprec)/(mu+mprec)))/(tau+mprec);

    omega_1 = (omega_1 - nu*omega_2)/mu;

    /* sol <- sol + omega_1*r + omega_2*s + alpha*u */
# pragma omp parallel for if(n_rows > CS_THR_MIN)
    for (cs_lnum_t ii = 0; ii < n_rows; ii++)
      vx[ii] += omega_1*rk[ii] + omega_2*sk[ii] + alpha*uk[ii];

# pragma omp parallel if(n_rows > CS_THR_MIN)
    {
      /* r <- r - omega_1*s - omega_2*t */
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        rk[ii] += - omega_1*sk[ii] - omega_2*tk[ii];
      /* u <- u - omega_1*v - omega_2*w */
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        uk[ii] += - omega_1*vk[ii] - omega_2*wk[ii];
    }

  }

  /* Free work area only if it was allocated here. */

  if (_aux_vectors != aux_vectors)
    BFT_FREE(_aux_vectors);

  return cvg;
}
2404
2405 /*----------------------------------------------------------------------------
2406 * Transform using Givens rotations a system Ax=b where A is an upper
2407 * triangular matrix of size n*(n-1) with a lower diagonal to an equivalent
2408 * system A'x=b' where A' is an upper triangular matrix.
2409 *
2410 * parameters:
2411 * a <-> dense matrix (size: size a_size*a_size):
2412 * a(i,j) = a[i + j*a_size]
2413 * input: A; output: A'
2414 * a_size <-- matrix dim
2415 * b <-> pre-allocated vector of a_size elements
2416 * input: b; output: b'
2417 * givens_coeff <-> pre-allocated vector of a_size elements
2418 * input: previous Givens coefficients
2419 * output: updated Givens coefficients
2420 * update_rank <-- rank of first non null coefficient on lower diagonal
2421 * end_update <-- rank of last non null coefficient on lower diagonal
2422 *----------------------------------------------------------------------------*/
2423
/* Apply previously stored Givens rotations to the newly added columns of
   the Hessenberg matrix, then compute and apply new rotations eliminating
   the sub-diagonal entries of columns update_rank..end_update-1.
   Storage is column-major: a(i,j) = a[i + j*a_size]; cosines are kept in
   givens_coeff[0..a_size-1] and sines in givens_coeff[a_size..2*a_size-1]. */

static void
_givens_rot_update(cs_real_t    *restrict a,
                   cs_lnum_t              a_size,
                   cs_real_t    *restrict b,
                   cs_real_t    *restrict givens_coeff,
                   cs_lnum_t              update_rank,
                   cs_lnum_t              end_update)
{
  /* Apply rotations from previous calls (rows 0..update_rank-1)
     to the new columns only; older columns are already rotated. */

  for (cs_lnum_t i = 0; i < update_rank; ++i) {
    for (cs_lnum_t j = update_rank; j < end_update; ++j) {

      cs_real_t _aux = givens_coeff[i]*a[j*a_size + i]
                       + givens_coeff[i + a_size] * a[j*a_size + i+1];

      a[j*a_size + i+1] = givens_coeff[i] * a[i+1 + j*a_size]
                          - givens_coeff[i + a_size] * a[j*a_size + i];

      a[j*a_size + i] = _aux;
    }
  }

  /* For each new column i, build the rotation that zeroes a(i+1,i),
     apply it to b, then to the remaining columns. */

  for (cs_lnum_t i = update_rank; i < end_update; ++i) {

    /* Norm of the (a(i,i), a(i+1,i)) pair defines cos/sin. */

    cs_real_t norm = sqrt( a[i*a_size + i] * a[i*a_size + i]
                          + a[i*a_size + i+1] * a[i*a_size + i+1]);

    givens_coeff[a_size + i] = a[i*a_size + i+1]/norm; /* sin */
    givens_coeff[i] = a[i*a_size + i]/norm;            /* cos */

    /* Rotate b: b[i+1] must be computed before b[i] is overwritten. */

    b[i+1] = -b[i]*givens_coeff[a_size + i];
    b[i] = b[i]*givens_coeff[i];

    /* i == j : diagonal entry becomes the norm, sub-diagonal is zeroed. */
    {
      cs_real_t _aux = givens_coeff[i] * a[i*a_size + i]
                       + givens_coeff[a_size+i] * a[i*a_size + i+1];
      a[i*a_size+i+1] = 0;
      a[i*a_size + i] = _aux;
    }

    /* Apply the new rotation to the columns to the right of i. */

    for (cs_lnum_t j = i+1; j < end_update; j++) {

      cs_real_t _aux = givens_coeff[i] * a[j*a_size + i]
                       + givens_coeff[a_size+i] * a[j*a_size + i+1];

      a[i+1 + j*a_size] = givens_coeff[i]*a[i+1 + j*a_size]
                          - givens_coeff[a_size + i]*a[i + j*a_size];

      a[j*a_size + i] = _aux;
    }
  }
}
2476
2477 /*----------------------------------------------------------------------------
2478 * Compute solution of Ax = b where A is an upper triangular matrix.
2479 *
2480 * As the system solved by GMRES will grow with iteration number, we
 * preallocate a maximum (allocated) size, and solve only the useful part:
2482 *
2483 * | | | |x1| |b1|
2484 * | A | pad | |x2| |b2|
2485 * |_______| | |x3| |b3|
2486 * | | * |p | = |p |
2487 * | pad | |a | |a |
2488 * | | |d | |d |
2489 *
2490 * parameters:
2491 * a <-- dense upper triangular matrix A (size: glob_size*glob_size)
2492 * a(i,j) = a[i + j*glob_size]
2493 * a_size <-- system size
2494 * alloc_size <-- a_size + halo size
2495 * b <-- pre-allocated vector of a_size elements
2496 * x --> system solution, pre-allocated vector of a_size elements
2497 *
2498 * returns:
2499 * 0 in case of success, 1 in case of zero-pivot.
2500 *----------------------------------------------------------------------------*/
2501
2502 static int
_solve_diag_sup_halo(cs_real_t * restrict a,cs_lnum_t a_size,cs_lnum_t alloc_size,cs_real_t * restrict b,cs_real_t * restrict x)2503 _solve_diag_sup_halo(cs_real_t *restrict a,
2504 cs_lnum_t a_size,
2505 cs_lnum_t alloc_size,
2506 cs_real_t *restrict b,
2507 cs_real_t *restrict x)
2508 {
2509 for (cs_lnum_t i = a_size - 1; i > -1; i--) {
2510
2511 x[i] = b[i];
2512
2513 for (cs_lnum_t j = i + 1; j < a_size; j++)
2514 x[i] = x[i] - a[j*alloc_size + i]*x[j];
2515
2516 x[i] /= a[i*alloc_size + i];
2517 }
2518
2519 return 0; /* We should add a check for zero-pivot */
2520 }
2521
2522 /*----------------------------------------------------------------------------
2523 * Solution of A.vx = Rhs using optimised preconditioned GCR.
2524 *
2525 * On entry, vx is considered initialized.
2526 *
2527 * parameters:
2528 * c <-- pointer to solver context info
2529 * a <-- matrix
2530 * diag_block_size <-- diagonal block size (unused here)
2531 * convergence <-- convergence information structure
2532 * rhs <-- right hand side
2533 * vx <-> system solution
2534 * aux_size <-- number of elements in aux_vectors (in bytes)
2535 * aux_vectors --- optional working area (allocation otherwise)
2536 *
2537 * returns:
2538 * convergence state
2539 *----------------------------------------------------------------------------*/
2540
/* Restarted preconditioned GCR solver; see the block comment above for
   the parameter description. */

static cs_sles_convergence_state_t
_gcr(cs_sles_it_t              *c,
     const cs_matrix_t         *a,
     cs_lnum_t                  diag_block_size,
     cs_sles_it_convergence_t  *convergence,
     const cs_real_t           *rhs,
     cs_real_t                 *restrict vx,
     size_t                     aux_size,
     void                      *aux_vectors)
{
  cs_sles_convergence_state_t cvg;
  double residue;
  cs_real_t *_aux_vectors, *alpha;
  cs_real_t *restrict rk, *restrict zk, *restrict ck;
  cs_real_t *restrict gkj, *restrict gkj_inv;

  /* In case of the standard GCR, n_k_per_restart --> Inf,
   * or stops until convergence*/
  unsigned n_iter = 0;
  const unsigned n_k_per_restart = c->restart_interval;

  size_t wa_size;

  unsigned n_restart = 0;

  /* Allocate or map work arrays */
  /*-----------------------------*/

  assert(c->setup_data != NULL);
  const cs_lnum_t n_rows = c->setup_data->n_rows;

  {
    const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size;
    /* 1 residual + n_k_per_restart zk vectors + n_k_per_restart ck vectors */
    const size_t n_wa = 1 + n_k_per_restart * 2;
    wa_size = CS_SIMD_SIZE(n_cols);

    if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < (wa_size * n_wa))
      BFT_MALLOC(_aux_vectors, wa_size * n_wa, cs_real_t);
    else
      _aux_vectors = aux_vectors;

    rk = _aux_vectors;                               /* store residuals  */
    zk = _aux_vectors + wa_size;                     /* store inv(M)*r   */
    ck = _aux_vectors + wa_size * (1 + n_k_per_restart);  /* store A*zk  */
  }

  BFT_MALLOC(alpha, n_k_per_restart, cs_real_t);

  /* gkj stores the upper triangle matrix Gamma of crossed inner-products
   * in packed form (row kk, col jj at index (kk+1)*kk/2 + jj).
   * Not necessary to allocate the full matrix size.
   * gkj_inv stores the inverse of gkj. */

  BFT_MALLOC(gkj, (n_k_per_restart + 1) * n_k_per_restart / 2, cs_real_t);
  BFT_MALLOC(gkj_inv, (n_k_per_restart + 1) * n_k_per_restart / 2, cs_real_t);

  cvg = CS_SLES_ITERATING;

  /* Current Restart */
  while (cvg == CS_SLES_ITERATING) {

    n_iter = 0;

    /* Initialize iterative calculation */
    /*----------------------------------*/

    /* Note the sign convention: rk = A.x - b (negated residual),
       hence the "-=" updates of vx in the solution step below. */

    cs_matrix_vector_multiply(a, vx, rk);  /* rk = A.x0 */

#   pragma omp parallel for if(n_rows > CS_THR_MIN)
    for (cs_lnum_t ii = 0; ii < n_rows; ii++)
      rk[ii] -= rhs[ii];

    residue = sqrt(_dot_product_xx(c, rk));

    if (n_restart == 0)
      c->setup_data->initial_residue = residue;

    /* Current Iteration on k */
    /* ---------------------- */

    while (cvg == CS_SLES_ITERATING && n_iter < n_k_per_restart) {

      /* Preconditioning */

      cs_real_t *zk_n = zk + n_iter * wa_size;
      cs_real_t *ck_n = ck + n_iter * wa_size;

      c->setup_data->pc_apply(c->setup_data->pc_context, rk, zk_n);

      cs_matrix_vector_multiply(a, zk_n, ck_n);

      /* Orthogonalize ck_n against previous ck directions
         (modified Gram-Schmidt), saving the coefficients in gkj. */

      for(cs_lnum_t jj = 0; jj < (int)n_iter; jj++) {
        cs_real_t *ck_j = ck + jj * wa_size;

        gkj[(n_iter + 1) * n_iter / 2 + jj] = _dot_product(c, ck_j, ck_n);
#       pragma omp parallel for if(n_rows > CS_THR_MIN)
        for (cs_lnum_t ii = 0; ii < n_rows; ii++)
          ck_n[ii] += - gkj[(n_iter + 1) * n_iter / 2 + jj] * ck_j[ii];
      }

      /* Diagonal entry of Gamma: norm of the orthogonalized direction. */

      const int iter_shift = (n_iter+1) * n_iter / 2 + n_iter;
      gkj[iter_shift] = sqrt(_dot_product(c, ck_n, ck_n));

      if (fabs(gkj[iter_shift]) > 0) {

        /* Normalize ck_n, then project the residual on it. */

#       pragma omp parallel for if(n_rows > CS_THR_MIN)
        for (cs_lnum_t ii = 0; ii < n_rows; ii++)
          ck_n[ii] /= gkj[iter_shift];

        alpha[n_iter] = _dot_product(c, ck_n, rk);

      }
      else
        alpha[n_iter] = 0.;  /* degenerate direction; skip its contribution */

#     pragma omp parallel for if(n_rows > CS_THR_MIN)
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        rk[ii] += - alpha[n_iter] * ck_n[ii];

      /* Compute residue */

      residue = sqrt(_dot_product_xx(c, rk));

      n_iter += 1;

      /* Convergence test of current iteration */

      cvg = _convergence_test(c, (n_restart * n_k_per_restart) + n_iter,
                              residue, convergence);

      if (cvg != CS_SLES_ITERATING)
        break;

    } /* Needs iterating or k < n_restart */

    /* Inversion of Gamma (upper triangular, packed storage) */

    if (n_iter == 1 && !(fabs(alpha[0]) > 0))
      gkj_inv[0] = 1.0;

    else {

      cs_lnum_t n_g_inv = (n_k_per_restart + 1) * n_k_per_restart / 2;
      for (cs_lnum_t jj = 0; jj < n_g_inv; jj++)
        gkj_inv[jj] = 0.0;

      /* Column-by-column triangular inversion. */

      for (cs_lnum_t kk = 0; kk < (int)n_iter; kk++) {
        for(cs_lnum_t ii = 0; ii < kk; ii++) {
          for (cs_lnum_t jj = 0; jj < kk; jj++)
            gkj_inv[(kk + 1) * kk / 2 + ii]
              += ((ii <= jj) ? gkj_inv[(jj + 1) * jj / 2 + ii] : 0.0)
                  * gkj[(kk + 1) * kk / 2 + jj];
        }

        for (cs_lnum_t jj = 0; jj < kk; jj++)
          gkj_inv[(kk + 1) * kk / 2 + jj] /= - gkj[(kk + 1) * kk / 2 + kk];

        gkj_inv[(kk + 1) * kk / 2 + kk] = 1.0 / gkj[(kk + 1) * kk / 2 + kk];
      }

    } /* n_iter > 1 */

    /* Compute the solution:
       vx -= sum_k alpha[k] * sum_{j<=k} gkj_inv(k,j) * zk_j
       (each thread handles its own contiguous row range). */

#   pragma omp parallel if (n_rows > CS_THR_MIN)
    {
      cs_lnum_t s_id, e_id;
      cs_parall_thread_range(n_rows, sizeof(cs_real_t), &s_id, &e_id);

      for(cs_lnum_t kk = 0; kk < (int)n_iter; kk++) {
        for(cs_lnum_t jj = 0; jj <= kk; jj++) {
          const cs_real_t *zk_j = zk + jj*wa_size;
          for (cs_lnum_t ii = s_id; ii < e_id; ii++)
            vx[ii] -= alpha[kk] * zk_j[ii]
                               * gkj_inv[(kk + 1) * kk / 2 + jj];
        }
      }
    }

    n_restart += 1;

  } /* Needs iterating */

  if (_aux_vectors != aux_vectors)
    BFT_FREE(_aux_vectors);

  BFT_FREE(alpha);
  BFT_FREE(gkj);
  BFT_FREE(gkj_inv);

  return cvg;
}
2732
2733 /*----------------------------------------------------------------------------
2734 * Solution of A.vx = Rhs using preconditioned GMRES.
2735 *
2736 * On entry, vx is considered initialized.
2737 *
2738 * parameters:
2739 * c <-- pointer to solver context info
2740 * a <-- matrix
2741 * diag_block_size <-- diagonal block size (unused here)
2742 * convergence <-- convergence information structure
2743 * rhs <-- right hand side
2744 * vx <-> system solution
2745 * aux_size <-- number of elements in aux_vectors (in bytes)
2746 * aux_vectors --- optional working area (allocation otherwise)
2747 *
2748 * returns:
2749 * convergence state
2750 *----------------------------------------------------------------------------*/
2751
/* Restarted preconditioned GMRES solver; see the block comment above for
   the parameter description. */

static cs_sles_convergence_state_t
_gmres(cs_sles_it_t              *c,
       const cs_matrix_t         *a,
       cs_lnum_t                  diag_block_size,
       cs_sles_it_convergence_t  *convergence,
       const cs_real_t           *rhs,
       cs_real_t                 *restrict vx,
       size_t                     aux_size,
       void                      *aux_vectors)
{
  cs_sles_convergence_state_t cvg = CS_SLES_ITERATING;
  int l_iter, l_old_iter;
  double beta, dot_prod, residue;
  cs_real_t *_aux_vectors;
  cs_real_t *restrict _krylov_vectors, *restrict _h_matrix;
  cs_real_t *restrict _givens_coeff, *restrict _beta;
  cs_real_t *restrict dk, *restrict gk;
  cs_real_t *restrict bk, *restrict fk, *restrict krk;

  cs_lnum_t krylov_size_max = c->restart_interval;
  unsigned n_iter = 0;

  /* Allocate or map work arrays */
  /*-----------------------------*/

  assert(c->setup_data != NULL);

  const cs_lnum_t n_rows = c->setup_data->n_rows;

  /* Allocate work arrays */

  /* Heuristic Krylov space size, capped by the restart interval. */

  int krylov_size = sqrt(n_rows*diag_block_size)*1.5 + 1;
  if (krylov_size > krylov_size_max)
    krylov_size = krylov_size_max;

#if defined(HAVE_MPI)
  /* Use the minimum size over all ranks so the Hessenberg system
     has the same dimension everywhere. */
  if (c->comm != MPI_COMM_NULL) {
    int _krylov_size = krylov_size;
    MPI_Allreduce(&_krylov_size,
                  &krylov_size,
                  1,
                  MPI_INT,
                  MPI_MIN,
                  c->comm);
  }
#endif

  /* Residue is only re-evaluated every check_freq inner iterations
     (or at the last one, or on near-breakdown) to limit cost. */

  int check_freq = (int)(krylov_size/10) + 1;
  double epsi = 1.e-15;
  int scaltest = 0;

  {
    const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size;

    size_t _aux_r_size;
    size_t n_wa = 4;
    size_t wa_size = n_cols < krylov_size? krylov_size : n_cols;

    wa_size = CS_SIMD_SIZE(wa_size);
    _aux_r_size =   wa_size*n_wa
                  + (krylov_size-1)*(n_rows + krylov_size) + 3*krylov_size;

    if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < _aux_r_size)
      BFT_MALLOC(_aux_vectors, _aux_r_size, cs_real_t);
    else
      _aux_vectors = aux_vectors;

    dk = _aux_vectors;                 /* residual, then Krylov candidate */
    gk = _aux_vectors + wa_size;       /* preconditioned vector / y       */
    bk = _aux_vectors + 2*wa_size;     /* A.x for residue check           */
    fk = _aux_vectors + 3*wa_size;     /* candidate solution              */
    _krylov_vectors = _aux_vectors + n_wa*wa_size;   /* orthonormal basis */
    _h_matrix = _aux_vectors + n_wa*wa_size + (krylov_size - 1)*n_rows;
    _givens_coeff =   _aux_vectors + n_wa*wa_size
                    + (krylov_size - 1)*(n_rows + krylov_size);
    _beta =   _aux_vectors + n_wa*wa_size
            + (krylov_size - 1)*(n_rows + krylov_size) + 2*krylov_size;
  }

  for (cs_lnum_t ii = 0; ii < krylov_size*(krylov_size - 1); ii++)
    _h_matrix[ii] = 0.;

  cvg = CS_SLES_ITERATING;

  /* Outer loop: one pass per restart cycle. */

  while (cvg == CS_SLES_ITERATING) {

    /* compute rk <- a*vx (vx = x0) */

    cs_matrix_vector_multiply(a, vx, dk);

    /* compute rk <- rhs - rk (r0 = b-A*x0) */

#   pragma omp parallel for if(n_rows > CS_THR_MIN)
    for (cs_lnum_t ii = 0; ii < n_rows; ii++)
      dk[ii] = rhs[ii] - dk[ii];

    if (n_iter == 0) {
      residue = sqrt(_dot_product_xx(c, dk));
      c->setup_data->initial_residue = residue;
      cvg = _convergence_test(c, n_iter, residue, convergence);
      if (cvg != CS_SLES_ITERATING)
        break;
    }

    /* beta = ||r0|| */
    beta = sqrt(_dot_product(c, dk, dk));
    dot_prod = beta;

    /* Right-hand side of the Hessenberg least-squares problem. */

    _beta[0] = beta;
    for (cs_lnum_t ii = 1; ii < krylov_size; ii++)
      _beta[ii] = 0.;

    /* Lap (Arnoldi iterations for this restart cycle) */

    l_iter = 0;
    l_old_iter = 0;

    for (cs_lnum_t ii = 0; ii < krylov_size - 1; ii++) {

      /* krk = k-ieth col of _krylov_vector = vi */
      krk = _krylov_vectors + ii*n_rows;

      /* Normalize the previous direction to get basis vector vi. */

#     pragma omp parallel for if(n_rows > CS_THR_MIN)
      for (cs_lnum_t jj = 0; jj < n_rows; jj++)
        krk[jj] = dk[jj]/dot_prod;

      c->setup_data->pc_apply(c->setup_data->pc_context, krk, gk);

      /* compute w = dk <- A*vj */

      cs_matrix_vector_multiply(a, gk, dk);

      /* Modified Gram-Schmidt against all previous basis vectors. */

      for (cs_lnum_t jj = 0; jj < ii + 1; jj++) {

        /* compute h(k,i) = <w,vi> = <dk,vi> */
        _h_matrix[ii*krylov_size + jj]
          = _dot_product(c, dk, (_krylov_vectors + jj*n_rows));

        /* compute w = dk <- w - h(i,k)*vi */

        cs_axpy(n_rows,
                -_h_matrix[ii*krylov_size+jj],
                (_krylov_vectors + jj*n_rows),
                dk);

      }

      /* compute h(i+1,i) = sqrt<w,w> */
      dot_prod = sqrt(_dot_product(c, dk, dk));
      _h_matrix[ii*krylov_size + ii + 1] = dot_prod;

      /* Near-breakdown (happy breakdown): force solution update/exit. */

      if (dot_prod < epsi) scaltest = 1;

      if (   (l_iter + 1)%check_freq == 0
          || l_iter == krylov_size - 2
          || scaltest == 1) {

        /* H matrix to diagonal sup matrix */

        _givens_rot_update(_h_matrix,
                           krylov_size,
                           _beta,
                           _givens_coeff,
                           l_old_iter,
                           l_iter + 1);

        l_old_iter = l_iter + 1;

        /* solve diag sup system */
        _solve_diag_sup_halo(_h_matrix, l_iter + 1, krylov_size, _beta, gk);

        /* Candidate update: fk = V.y in Krylov basis... */

#       pragma omp parallel for if(n_rows > CS_THR_MIN)
        for (cs_lnum_t jj = 0; jj < n_rows; jj++) {
          fk[jj] = 0.0;
          for (cs_lnum_t kk = 0; kk <= l_iter; kk++)
            fk[jj] += _krylov_vectors[kk*n_rows + jj] * gk[kk];
        }

        /* ... preconditioned, then added to the current solution. */

        c->setup_data->pc_apply(c->setup_data->pc_context, fk, gk);

#       pragma omp parallel for if(n_rows > CS_THR_MIN)
        for (cs_lnum_t jj = 0; jj < n_rows; jj++)
          fk[jj] = vx[jj] + gk[jj];

        cs_matrix_vector_multiply(a, fk, bk);

        /* compute residue = ||Ax - b||_2 */

#       pragma omp parallel for if(n_rows > CS_THR_MIN)
        for (cs_lnum_t jj = 0; jj < n_rows; jj++)
          bk[jj] -= rhs[jj];

        residue = sqrt(_dot_product_xx(c, bk));

        cvg = _convergence_test(c, n_iter, residue, convergence);

      }

      n_iter++;
      l_iter++;

      /* Commit fk into vx when converged, at iteration cap, at the end of
         the Krylov space, or on near-breakdown; then restart or exit. */

      if (cvg == CS_SLES_CONVERGED || cvg == CS_SLES_MAX_ITERATION ||
          l_iter == krylov_size - 1 || scaltest == 1) {
#       pragma omp parallel for if (n_rows > CS_THR_MIN)
        for (cs_lnum_t jj = 0; jj < n_rows; jj++)
          vx[jj] = fk[jj];
        break;
      }
    }
  }

  if (_aux_vectors != aux_vectors)
    BFT_FREE(_aux_vectors);

  return cvg;
}
2968
2969 /*----------------------------------------------------------------------------
2970 * Solution of A.vx = Rhs using Process-local Gauss-Seidel.
2971 *
2972 * On entry, vx is considered initialized.
2973 *
2974 * parameters:
2975 * c <-- pointer to solver context info
2976 * a <-- linear equation matrix
2977 * diag_block_size <-- diagonal block size
2978 * convergence <-- convergence information structure
2979 * rhs <-- right hand side
2980 * vx <-> system solution
2981 * aux_size <-- number of elements in aux_vectors (in bytes)
2982 * aux_vectors --- optional working area (unused here)
2983 *
2984 * returns:
2985 * convergence state
2986 *----------------------------------------------------------------------------*/
2987
/* Process-local Gauss-Seidel sweep on an MSR matrix, traversing rows in
   the order given by c->add_data->order; see block comment above. */

static cs_sles_convergence_state_t
_p_ordered_gauss_seidel_msr(cs_sles_it_t              *c,
                            const cs_matrix_t         *a,
                            cs_lnum_t                  diag_block_size,
                            cs_sles_it_convergence_t  *convergence,
                            const cs_real_t           *rhs,
                            cs_real_t                 *restrict vx)
{
  cs_sles_convergence_state_t cvg;
  double res2, residue;

  unsigned n_iter = 0;

  const cs_lnum_t n_rows = cs_matrix_get_n_rows(a);

  const cs_halo_t *halo = cs_matrix_get_halo(a);

  /* Pre-inverted diagonal (scalar inverse or factored diagonal blocks). */

  const cs_real_t  *restrict ad_inv = c->setup_data->ad_inv;

  const cs_real_t  *restrict ad = cs_matrix_get_diagonal(a);

  const cs_lnum_t  *a_row_index, *a_col_id;
  const cs_real_t  *a_d_val, *a_x_val;

  /* NOTE(review): db_size[0] is the diagonal block size and db_size[1] the
     per-row vector stride; db_size[3] is used as the per-block stride into
     ad_inv — confirm against the cs_matrix diagonal block size convention. */

  const cs_lnum_t *db_size = cs_matrix_get_diag_block_size(a);
  cs_matrix_get_msr_arrays(a, &a_row_index, &a_col_id, &a_d_val, &a_x_val);

  const cs_lnum_t  *order = c->add_data->order;

  cvg = CS_SLES_ITERATING;

  /* Current iteration */
  /*-------------------*/

  while (cvg == CS_SLES_ITERATING) {

    n_iter += 1;

    /* Synchronize ghost cells first */

    if (halo != NULL)
      cs_matrix_pre_vector_multiply_sync(a, vx);

    /* Compute Vx <- Vx - (A-diag).Rk and residue.

       NOTE(review): under OpenMP, a row may read neighbor values that
       another thread has or has not yet updated, so the sweep behaves as
       a hybrid Jacobi/Gauss-Seidel depending on scheduling (disabled when
       _thread_debug is set) — presumably intentional; confirm. */

    res2 = 0.0;

    if (diag_block_size == 1) {

      /* Scalar diagonal case. */

#     pragma omp parallel for reduction(+:res2)      \
                          if(n_rows > CS_THR_MIN && !_thread_debug)
      for (cs_lnum_t ll = 0; ll < n_rows; ll++) {

        cs_lnum_t ii = order[ll];  /* traverse rows in prescribed order */

        const cs_lnum_t *restrict col_id = a_col_id + a_row_index[ii];
        const cs_real_t *restrict m_row = a_x_val + a_row_index[ii];
        const cs_lnum_t n_cols = a_row_index[ii+1] - a_row_index[ii];

        cs_real_t vxm1 = vx[ii];   /* previous value, for residue */
        cs_real_t vx0 = rhs[ii];

        /* Subtract extra-diagonal contributions. */

        for (cs_lnum_t jj = 0; jj < n_cols; jj++)
          vx0 -= (m_row[jj]*vx[col_id[jj]]);

        vx0 *= ad_inv[ii];

        /* Residue contribution: diagonal-scaled change of this unknown. */

        register double r = ad[ii] * (vx0-vxm1);

        vx[ii] = vx0;

        res2 += (r*r);
      }

    }
    else {

      /* Block diagonal case: forward/backward LU solve per block. */

#     pragma omp parallel for reduction(+:res2) \
                          if(n_rows > CS_THR_MIN && !_thread_debug)
      for (cs_lnum_t ll = 0; ll < n_rows; ll++) {

        cs_lnum_t ii = order[ll];  /* traverse rows in prescribed order */

        const cs_lnum_t *restrict col_id = a_col_id + a_row_index[ii];
        const cs_real_t *restrict m_row = a_x_val + a_row_index[ii];
        const cs_lnum_t n_cols = a_row_index[ii+1] - a_row_index[ii];

        cs_real_t vx0[DB_SIZE_MAX], vxm1[DB_SIZE_MAX], _vx[DB_SIZE_MAX];

        for (cs_lnum_t kk = 0; kk < db_size[0]; kk++) {
          vxm1[kk] = vx[ii*db_size[1] + kk];
          vx0[kk] = rhs[ii*db_size[1] + kk];
        }

        for (cs_lnum_t jj = 0; jj < n_cols; jj++) {
          for (cs_lnum_t kk = 0; kk < db_size[0]; kk++)
            vx0[kk] -= (m_row[jj]*vx[col_id[jj]*db_size[1] + kk]);
        }

        /* Solve the local diagonal block system. */

        _fw_and_bw_lu_gs(ad_inv + db_size[3]*ii,
                         db_size[0],
                         _vx,
                         vx0);

        double rr = 0;
        for (cs_lnum_t kk = 0; kk < db_size[0]; kk++) {
          register double r = ad[ii*db_size[1] + kk] * (_vx[kk]-vxm1[kk]);
          rr += (r*r);
          vx[ii*db_size[1] + kk] = _vx[kk];
        }
        res2 += rr;

      }

    }

#if defined(HAVE_MPI)

    /* Sum squared residue contributions over all ranks. */

    if (c->comm != MPI_COMM_NULL) {
      double _sum;
      MPI_Allreduce(&res2, &_sum, 1, MPI_DOUBLE, MPI_SUM, c->comm);
      res2 = _sum;
    }

#endif /* defined(HAVE_MPI) */

    residue = sqrt(res2); /* Actually, residue of previous iteration */

    /* Convergence test */

    if (n_iter == 1)
      c->setup_data->initial_residue = residue;

    cvg = _convergence_test(c, n_iter, residue, convergence);

  }

  return cvg;
}
3127
3128 /*----------------------------------------------------------------------------
3129 * Solution of A.vx = Rhs using Process-local Gauss-Seidel.
3130 *
3131 * On entry, vx is considered initialized.
3132 *
3133 * parameters:
3134 * c <-- pointer to solver context info
3135 * a <-- linear equation matrix
3136 * diag_block_size <-- diagonal block size
3137 * convergence <-- convergence information structure
3138 * rhs <-- right hand side
3139 * vx <-> system solution
3140 * aux_size <-- number of elements in aux_vectors (in bytes)
3141 * aux_vectors --- optional working area (unused here)
3142 *
3143 * returns:
3144 * convergence state
3145 *----------------------------------------------------------------------------*/
3146
3147 static cs_sles_convergence_state_t
_p_gauss_seidel_msr(cs_sles_it_t * c,const cs_matrix_t * a,cs_lnum_t diag_block_size,cs_sles_it_convergence_t * convergence,const cs_real_t * rhs,cs_real_t * restrict vx)3148 _p_gauss_seidel_msr(cs_sles_it_t *c,
3149 const cs_matrix_t *a,
3150 cs_lnum_t diag_block_size,
3151 cs_sles_it_convergence_t *convergence,
3152 const cs_real_t *rhs,
3153 cs_real_t *restrict vx)
3154 {
3155 cs_sles_convergence_state_t cvg;
3156 double res2, residue;
3157
3158 unsigned n_iter = 0;
3159
3160 const cs_lnum_t n_rows = cs_matrix_get_n_rows(a);
3161
3162 const cs_halo_t *halo = cs_matrix_get_halo(a);
3163
3164 const cs_real_t *restrict ad_inv = c->setup_data->ad_inv;
3165
3166 const cs_real_t *restrict ad = cs_matrix_get_diagonal(a);
3167
3168 const cs_lnum_t *a_row_index, *a_col_id;
3169 const cs_real_t *a_d_val, *a_x_val;
3170
3171 const cs_lnum_t *db_size = cs_matrix_get_diag_block_size(a);
3172 cs_matrix_get_msr_arrays(a, &a_row_index, &a_col_id, &a_d_val, &a_x_val);
3173
3174 cvg = CS_SLES_ITERATING;
3175
3176 /* Current iteration */
3177 /*-------------------*/
3178
3179 while (cvg == CS_SLES_ITERATING) {
3180
3181 n_iter += 1;
3182
3183 /* Synchronize ghost cells first */
3184
3185 if (halo != NULL)
3186 cs_matrix_pre_vector_multiply_sync(a, vx);
3187
3188 /* Compute Vx <- Vx - (A-diag).Rk and residue. */
3189
3190 res2 = 0.0;
3191
3192 if (diag_block_size == 1) {
3193
3194 # pragma omp parallel for reduction(+:res2) \
3195 if(n_rows > CS_THR_MIN && !_thread_debug)
3196 for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
3197
3198 const cs_lnum_t *restrict col_id = a_col_id + a_row_index[ii];
3199 const cs_real_t *restrict m_row = a_x_val + a_row_index[ii];
3200 const cs_lnum_t n_cols = a_row_index[ii+1] - a_row_index[ii];
3201
3202 cs_real_t vxm1 = vx[ii];
3203 cs_real_t vx0 = rhs[ii];
3204
3205 for (cs_lnum_t jj = 0; jj < n_cols; jj++)
3206 vx0 -= (m_row[jj]*vx[col_id[jj]]);
3207
3208 vx0 *= ad_inv[ii];
3209
3210 register double r = ad[ii] * (vx0-vxm1);
3211 res2 += (r*r);
3212
3213 vx[ii] = vx0;
3214 }
3215
3216 }
3217 else {
3218
3219 # pragma omp parallel for reduction(+:res2) \
3220 if(n_rows > CS_THR_MIN && !_thread_debug)
3221 for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
3222
3223 const cs_lnum_t *restrict col_id = a_col_id + a_row_index[ii];
3224 const cs_real_t *restrict m_row = a_x_val + a_row_index[ii];
3225 const cs_lnum_t n_cols = a_row_index[ii+1] - a_row_index[ii];
3226
3227 cs_real_t vx0[DB_SIZE_MAX], vxm1[DB_SIZE_MAX], _vx[DB_SIZE_MAX];
3228
3229 for (cs_lnum_t kk = 0; kk < db_size[0]; kk++) {
3230 vxm1[kk] = vx[ii*db_size[1] + kk];
3231 vx0[kk] = rhs[ii*db_size[1] + kk];
3232 }
3233
3234 for (cs_lnum_t jj = 0; jj < n_cols; jj++) {
3235 for (cs_lnum_t kk = 0; kk < db_size[0]; kk++)
3236 vx0[kk] -= (m_row[jj]*vx[col_id[jj]*db_size[1] + kk]);
3237 }
3238
3239 _fw_and_bw_lu_gs(ad_inv + db_size[3]*ii,
3240 db_size[0],
3241 _vx,
3242 vx0);
3243
3244 double rr = 0;
3245 for (cs_lnum_t kk = 0; kk < db_size[0]; kk++) {
3246 register double r = ad[ii*db_size[1] + kk] * (_vx[kk]-vxm1[kk]);
3247 rr += (r*r);
3248 vx[ii*db_size[1] + kk] = _vx[kk];
3249 }
3250 res2 += rr;
3251
3252 }
3253
3254 }
3255
3256 if (convergence->precision > 0. || c->plot != NULL) {
3257
3258 #if defined(HAVE_MPI)
3259
3260 if (c->comm != MPI_COMM_NULL) {
3261 double _sum;
3262 MPI_Allreduce(&res2, &_sum, 1, MPI_DOUBLE, MPI_SUM, c->comm);
3263 res2 = _sum;
3264 }
3265
3266 #endif /* defined(HAVE_MPI) */
3267
3268 residue = sqrt(res2); /* Actually, residue of previous iteration */
3269
3270 /* Convergence test */
3271
3272 if (n_iter == 1)
3273 c->setup_data->initial_residue = residue;
3274
3275 cvg = _convergence_test(c, n_iter, residue, convergence);
3276
3277 }
3278 else if (n_iter >= convergence->n_iterations_max) {
3279 convergence->n_iterations = n_iter;
3280 cvg = CS_SLES_MAX_ITERATION;
3281 }
3282
3283 }
3284
3285 return cvg;
3286 }
3287
3288 /*----------------------------------------------------------------------------
3289 * Solution of A.vx = Rhs using Process-local symmetric Gauss-Seidel.
3290 *
3291 * On entry, vx is considered initialized.
3292 *
3293 * parameters:
3294 * c <-- pointer to solver context info
3295 * a <-- linear equation matrix
3296 * diag_block_size <-- diagonal block size
3297 * convergence <-- convergence information structure
3298 * rhs <-- right hand side
3299 * vx <-> system solution
3300 * aux_size <-- number of elements in aux_vectors (in bytes)
3301 * aux_vectors --- optional working area (unused here)
3302 *
3303 * returns:
3304 * convergence state
3305 *----------------------------------------------------------------------------*/
3306
3307 static cs_sles_convergence_state_t
_p_sym_gauss_seidel_msr(cs_sles_it_t * c,const cs_matrix_t * a,cs_lnum_t diag_block_size,cs_sles_it_convergence_t * convergence,const cs_real_t * rhs,cs_real_t * restrict vx,size_t aux_size,void * aux_vectors)3308 _p_sym_gauss_seidel_msr(cs_sles_it_t *c,
3309 const cs_matrix_t *a,
3310 cs_lnum_t diag_block_size,
3311 cs_sles_it_convergence_t *convergence,
3312 const cs_real_t *rhs,
3313 cs_real_t *restrict vx,
3314 size_t aux_size,
3315 void *aux_vectors)
3316 {
3317 CS_UNUSED(aux_size);
3318 CS_UNUSED(aux_vectors);
3319
3320 cs_sles_convergence_state_t cvg;
3321 double res2, residue;
3322
3323 /* Check matrix storage type */
3324
3325 if (cs_matrix_get_type(a) != CS_MATRIX_MSR)
3326 bft_error
3327 (__FILE__, __LINE__, 0,
3328 _("Symmetric Gauss-Seidel Jacobi hybrid solver only supported with a\n"
3329 "matrix using %s storage."),
3330 "MSR");
3331
3332 unsigned n_iter = 0;
3333
3334 const cs_lnum_t n_rows = cs_matrix_get_n_rows(a);
3335
3336 const cs_halo_t *halo = cs_matrix_get_halo(a);
3337
3338 const cs_real_t *restrict ad_inv = c->setup_data->ad_inv;
3339
3340 const cs_real_t *restrict ad = cs_matrix_get_diagonal(a);
3341
3342 const cs_lnum_t *a_row_index, *a_col_id;
3343 const cs_real_t *a_d_val, *a_x_val;
3344
3345 const cs_lnum_t *db_size = cs_matrix_get_diag_block_size(a);
3346 cs_matrix_get_msr_arrays(a, &a_row_index, &a_col_id, &a_d_val, &a_x_val);
3347
3348 cvg = CS_SLES_ITERATING;
3349
3350 /* Current iteration */
3351 /*-------------------*/
3352
3353 while (cvg == CS_SLES_ITERATING) {
3354
3355 n_iter += 1;
3356
3357 /* Synchronize ghost cells first */
3358
3359 if (halo != NULL)
3360 cs_matrix_pre_vector_multiply_sync(a, vx);
3361
3362 /* Compute Vx <- Vx - (A-diag).Rk and residue: forward step */
3363
3364 if (diag_block_size == 1) {
3365
3366 # pragma omp parallel for if(n_rows > CS_THR_MIN && !_thread_debug)
3367 for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
3368
3369 const cs_lnum_t *restrict col_id = a_col_id + a_row_index[ii];
3370 const cs_real_t *restrict m_row = a_x_val + a_row_index[ii];
3371 const cs_lnum_t n_cols = a_row_index[ii+1] - a_row_index[ii];
3372
3373 cs_real_t vx0 = rhs[ii];
3374
3375 for (cs_lnum_t jj = 0; jj < n_cols; jj++)
3376 vx0 -= (m_row[jj]*vx[col_id[jj]]);
3377
3378 vx[ii] = vx0 * ad_inv[ii];
3379
3380 }
3381
3382 }
3383 else {
3384
3385 # pragma omp parallel for if(n_rows > CS_THR_MIN && !_thread_debug)
3386 for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
3387
3388 const cs_lnum_t *restrict col_id = a_col_id + a_row_index[ii];
3389 const cs_real_t *restrict m_row = a_x_val + a_row_index[ii];
3390 const cs_lnum_t n_cols = a_row_index[ii+1] - a_row_index[ii];
3391
3392 cs_real_t vx0[DB_SIZE_MAX], _vx[DB_SIZE_MAX];
3393
3394 for (cs_lnum_t kk = 0; kk < diag_block_size; kk++)
3395 vx0[kk] = rhs[ii*db_size[1] + kk];
3396
3397 for (cs_lnum_t jj = 0; jj < n_cols; jj++) {
3398 for (cs_lnum_t kk = 0; kk < diag_block_size; kk++)
3399 vx0[kk] -= (m_row[jj]*vx[col_id[jj]*db_size[1] + kk]);
3400 }
3401
3402 _fw_and_bw_lu_gs(ad_inv + db_size[3]*ii,
3403 db_size[0],
3404 _vx,
3405 vx0);
3406
3407 for (cs_lnum_t kk = 0; kk < diag_block_size; kk++)
3408 vx[ii*db_size[1] + kk] = _vx[kk];
3409
3410 }
3411
3412 }
3413
3414 /* Synchronize ghost cells again */
3415
3416 if (halo != NULL)
3417 cs_matrix_pre_vector_multiply_sync(a, vx);
3418
3419 /* Compute Vx <- Vx - (A-diag).Rk and residue: backward step */
3420
3421 res2 = 0.0;
3422
3423 if (diag_block_size == 1) {
3424
3425 # pragma omp parallel for reduction(+:res2) \
3426 if(n_rows > CS_THR_MIN && !_thread_debug)
3427 for (cs_lnum_t ii = n_rows - 1; ii > - 1; ii--) {
3428
3429 const cs_lnum_t *restrict col_id = a_col_id + a_row_index[ii];
3430 const cs_real_t *restrict m_row = a_x_val + a_row_index[ii];
3431 const cs_lnum_t n_cols = a_row_index[ii+1] - a_row_index[ii];
3432
3433 cs_real_t vxm1 = vx[ii];
3434 cs_real_t vx0 = rhs[ii];
3435
3436 for (cs_lnum_t jj = 0; jj < n_cols; jj++)
3437 vx0 -= (m_row[jj]*vx[col_id[jj]]);
3438
3439 vx0 *= ad_inv[ii];
3440
3441 register double r = ad[ii] * (vx0-vxm1);
3442 res2 += (r*r);
3443
3444 vx[ii] = vx0;
3445 }
3446
3447 }
3448 else {
3449
3450 # pragma omp parallel for reduction(+:res2) \
3451 if(n_rows > CS_THR_MIN && !_thread_debug)
3452 for (cs_lnum_t ii = n_rows - 1; ii > - 1; ii--) {
3453
3454 const cs_lnum_t *restrict col_id = a_col_id + a_row_index[ii];
3455 const cs_real_t *restrict m_row = a_x_val + a_row_index[ii];
3456 const cs_lnum_t n_cols = a_row_index[ii+1] - a_row_index[ii];
3457
3458 cs_real_t vx0[DB_SIZE_MAX], vxm1[DB_SIZE_MAX], _vx[DB_SIZE_MAX];
3459
3460 for (cs_lnum_t kk = 0; kk < db_size[0]; kk++) {
3461 vxm1[kk] = vx[ii*db_size[1] + kk];
3462 vx0[kk] = rhs[ii*db_size[1] + kk];
3463 }
3464
3465 for (cs_lnum_t jj = 0; jj < n_cols; jj++) {
3466 for (cs_lnum_t kk = 0; kk < db_size[0]; kk++)
3467 vx0[kk] -= (m_row[jj]*vx[col_id[jj]*db_size[1] + kk]);
3468 }
3469
3470 _fw_and_bw_lu_gs(ad_inv + db_size[3]*ii,
3471 db_size[0],
3472 _vx,
3473 vx0);
3474
3475 double rr = 0;
3476 for (cs_lnum_t kk = 0; kk < db_size[0]; kk++) {
3477 register double r = ad[ii*db_size[1] + kk] * (_vx[kk]-vxm1[kk]);
3478 rr += (r*r);
3479 vx[ii*db_size[1] + kk] = _vx[kk];
3480 }
3481 res2 += rr;
3482
3483 }
3484
3485 }
3486
3487 if (convergence->precision > 0. || c->plot != NULL) {
3488
3489 #if defined(HAVE_MPI)
3490
3491 if (c->comm != MPI_COMM_NULL) {
3492 double _sum;
3493 MPI_Allreduce(&res2, &_sum, 1, MPI_DOUBLE, MPI_SUM, c->comm);
3494 res2 = _sum;
3495 }
3496
3497 #endif /* defined(HAVE_MPI) */
3498
3499 residue = sqrt(res2); /* Actually, residue of previous iteration */
3500
3501 /* Convergence test */
3502
3503 if (n_iter == 1)
3504 c->setup_data->initial_residue = residue;
3505
3506 cvg = _convergence_test(c, n_iter, residue, convergence);
3507
3508 }
3509 else if (n_iter >= convergence->n_iterations_max) {
3510 convergence->n_iterations = n_iter;
3511 cvg = CS_SLES_MAX_ITERATION;
3512 }
3513
3514 }
3515
3516 return cvg;
3517 }
3518
3519 /*----------------------------------------------------------------------------
 * Solution of A.vx = Rhs using Process-local Gauss-Seidel.
3521 *
3522 * On entry, vx is considered initialized.
3523 *
3524 * parameters:
3525 * c <-- pointer to solver context info
3526 * a <-- linear equation matrix
3527 * diag_block_size <-- diagonal block size
3528 * convergence <-- convergence information structure
3529 * rhs <-- right hand side
3530 * vx <-> system solution
3531 * aux_size <-- number of elements in aux_vectors (in bytes)
3532 * aux_vectors --- optional working area (unused here)
3533 *
3534 * returns:
3535 * convergence state
3536 *----------------------------------------------------------------------------*/
3537
3538 static cs_sles_convergence_state_t
_p_gauss_seidel(cs_sles_it_t * c,const cs_matrix_t * a,cs_lnum_t diag_block_size,cs_sles_it_convergence_t * convergence,const cs_real_t * rhs,cs_real_t * restrict vx,size_t aux_size,void * aux_vectors)3539 _p_gauss_seidel(cs_sles_it_t *c,
3540 const cs_matrix_t *a,
3541 cs_lnum_t diag_block_size,
3542 cs_sles_it_convergence_t *convergence,
3543 const cs_real_t *rhs,
3544 cs_real_t *restrict vx,
3545 size_t aux_size,
3546 void *aux_vectors)
3547 {
3548 CS_UNUSED(aux_size);
3549 CS_UNUSED(aux_vectors);
3550
3551 cs_sles_convergence_state_t cvg;
3552
3553 /* Check matrix storage type */
3554
3555 if (cs_matrix_get_type(a) != CS_MATRIX_MSR)
3556 bft_error
3557 (__FILE__, __LINE__, 0,
3558 _("Gauss-Seidel Jacobi hybrid solver only supported with a\n"
3559 "matrix using %s storage."),
3560 "MSR");
3561
3562 /* Allocate or map work arrays */
3563 /*-----------------------------*/
3564
3565 assert(c->setup_data != NULL);
3566
3567 /* Check for ordered variant */
3568
3569 const cs_lnum_t *order = NULL;
3570
3571 if (c->add_data != NULL)
3572 order = c->add_data->order;
3573
3574 if (order != NULL)
3575 cvg = _p_ordered_gauss_seidel_msr(c,
3576 a,
3577 diag_block_size,
3578 convergence,
3579 rhs,
3580 vx);
3581
3582 else
3583 cvg = _p_gauss_seidel_msr(c,
3584 a,
3585 diag_block_size,
3586 convergence,
3587 rhs,
3588 vx);
3589
3590 return cvg;
3591 }
3592
3593 /*----------------------------------------------------------------------------
3594 * Switch to fallback solver if defined.
3595 *
3596 * vx is reset to zero in this case.
3597 *
3598 * parameters:
3599 * c <-- pointer to solver context info
3600 * a <-- matrix
3601 * solver_type <-- fallback solver type
3602 * prev_state <-- previous convergence state
3603 * convergence <-> convergence information structure
3604 * rhs <-- right hand side
3605 * vx <-> system solution
3606 * aux_size <-- number of elements in aux_vectors (in bytes)
3607 * aux_vectors --- optional working area
3608 * (internal allocation if NULL)
3609 *
3610 * returns:
3611 * convergence state
3612 *----------------------------------------------------------------------------*/
3613
3614 static cs_sles_convergence_state_t
_fallback(cs_sles_it_t * c,cs_sles_it_type_t solver_type,const cs_matrix_t * a,cs_sles_convergence_state_t prev_state,const cs_sles_it_convergence_t * convergence,int * n_iter,double * residue,const cs_real_t * rhs,cs_real_t * restrict vx,size_t aux_size,void * aux_vectors)3615 _fallback(cs_sles_it_t *c,
3616 cs_sles_it_type_t solver_type,
3617 const cs_matrix_t *a,
3618 cs_sles_convergence_state_t prev_state,
3619 const cs_sles_it_convergence_t *convergence,
3620 int *n_iter,
3621 double *residue,
3622 const cs_real_t *rhs,
3623 cs_real_t *restrict vx,
3624 size_t aux_size,
3625 void *aux_vectors)
3626 {
3627 cs_sles_convergence_state_t cvg = CS_SLES_ITERATING;
3628
3629 /* Check if fallback was already defined for this case */
3630
3631 if (c->fallback == NULL) {
3632
3633 c->fallback = cs_sles_it_create(solver_type,
3634 -1, /* poly_degree */
3635 c->n_max_iter,
3636 c->update_stats);
3637
3638 cs_sles_it_set_shareable(c->fallback, c);
3639
3640 c->fallback->plot = c->plot;
3641
3642 }
3643
3644 c->fallback->plot_time_stamp = c->plot_time_stamp;
3645
3646 const cs_lnum_t n_rows = c->setup_data->n_rows;
3647
3648 if (prev_state < CS_SLES_BREAKDOWN) {
3649 # pragma omp parallel for if(n_rows > CS_THR_MIN)
3650 for (cs_lnum_t i = 0; i < n_rows; i++)
3651 vx[i] = 0;
3652 }
3653
3654 cvg = cs_sles_it_solve(c->fallback,
3655 convergence->name,
3656 a,
3657 convergence->verbosity,
3658 convergence->precision,
3659 convergence->r_norm,
3660 n_iter,
3661 residue,
3662 rhs,
3663 vx,
3664 aux_size,
3665 aux_vectors);
3666
3667 cs_sles_it_free(c->fallback);
3668
3669 *n_iter += convergence->n_iterations;
3670
3671 c->plot_time_stamp = c->fallback->plot_time_stamp;
3672
3673 return cvg;
3674 }
3675
3676 /*! (DOXYGEN_SHOULD_SKIP_THIS) \endcond */
3677
3678 /*=============================================================================
3679 * User function prototypes
3680 *============================================================================*/
3681
3682 /*----------------------------------------------------------------------------
3683 * Solution of A.vx = Rhs using a user-defined iterative solver
3684 *
3685 * On entry, vx is considered initialized.
3686 *
3687 * parameters:
3688 * c <-- pointer to solver context info
3689 * a <-- matrix
3690 * diag_block_size <-- diagonal block size (unused here)
3691 * convergence <-- convergence information structure
3692 * rhs <-- right hand side
3693 * vx <-> system solution
3694 * aux_size <-- number of elements in aux_vectors (in bytes)
3695 * aux_vectors --- optional working area (allocation otherwise)
3696 *
3697 * returns:
3698 * convergence state
3699 *----------------------------------------------------------------------------*/
3700
3701 cs_sles_convergence_state_t
cs_user_sles_it_solver(cs_sles_it_t * c,const cs_matrix_t * a,cs_lnum_t diag_block_size,cs_sles_it_convergence_t * convergence,const cs_real_t * rhs,cs_real_t * restrict vx,size_t aux_size,void * aux_vectors)3702 cs_user_sles_it_solver(cs_sles_it_t *c,
3703 const cs_matrix_t *a,
3704 cs_lnum_t diag_block_size,
3705 cs_sles_it_convergence_t *convergence,
3706 const cs_real_t *rhs,
3707 cs_real_t *restrict vx,
3708 size_t aux_size,
3709 void *aux_vectors)
3710 {
3711 cs_sles_convergence_state_t cvg = CS_SLES_CONVERGED;
3712
3713 CS_UNUSED(c);
3714 CS_UNUSED(a);
3715 CS_UNUSED(diag_block_size);
3716 CS_UNUSED(convergence);
3717 CS_UNUSED(rhs);
3718 CS_UNUSED(vx);
3719 CS_UNUSED(aux_size);
3720 CS_UNUSED(aux_vectors);
3721
3722 return cvg;
3723 }
3724
3725 /*============================================================================
3726 * Public function definitions
3727 *============================================================================*/
3728
3729 /*----------------------------------------------------------------------------*/
3730 /*!
3731 * \brief Define and associate an iterative sparse linear system solver
3732 * for a given field or equation name.
3733 *
3734 * If this system did not previously exist, it is added to the list of
3735 * "known" systems. Otherwise, its definition is replaced by the one
3736 * defined here.
3737 *
3738 * This is a utility function: if finer control is needed, see
3739 * \ref cs_sles_define and \ref cs_sles_it_create.
3740 *
3741 * Note that this function returns a pointer directly to the iterative solver
3742 * management structure. This may be used to set further options,
3743 * for example using \ref cs_sles_it_set_plot_options. If needed,
3744 * \ref cs_sles_find may be used to obtain a pointer to the matching
3745 * \ref cs_sles_t container.
3746 *
3747 * \param[in] f_id associated field id, or < 0
3748 * \param[in] name associated name if f_id < 0, or NULL
3749 * \param[in] solver_type type of solver (PCG, Jacobi, ...)
3750 * \param[in] poly_degree preconditioning polynomial degree
3751 * (0: diagonal; -1: non-preconditioned)
3752 * \param[in] n_max_iter maximum number of iterations
3753 *
3754 * \return pointer to newly created iterative solver info object.
3755 */
3756 /*----------------------------------------------------------------------------*/
3757
3758 cs_sles_it_t *
cs_sles_it_define(int f_id,const char * name,cs_sles_it_type_t solver_type,int poly_degree,int n_max_iter)3759 cs_sles_it_define(int f_id,
3760 const char *name,
3761 cs_sles_it_type_t solver_type,
3762 int poly_degree,
3763 int n_max_iter)
3764 {
3765 /* Test for environment variables here */
3766
3767 const char *s = getenv("CS_THREAD_DEBUG");
3768 if (s != NULL) {
3769 if (atoi(s) > 0)
3770 _thread_debug = true;
3771 }
3772
3773 /* Now define solver */
3774
3775 cs_sles_it_t *
3776 c = cs_sles_it_create(solver_type,
3777 poly_degree,
3778 n_max_iter,
3779 true); /* update stats */
3780
3781 cs_sles_t *sc = cs_sles_define(f_id,
3782 name,
3783 c,
3784 "cs_sles_it_t",
3785 cs_sles_it_setup,
3786 cs_sles_it_solve,
3787 cs_sles_it_free,
3788 cs_sles_it_log,
3789 cs_sles_it_copy,
3790 cs_sles_it_destroy);
3791
3792 cs_sles_set_error_handler(sc,
3793 cs_sles_it_error_post_and_abort);
3794
3795 return c;
3796 }
3797
3798 /*----------------------------------------------------------------------------*/
3799 /*!
3800 * \brief Create iterative sparse linear system solver info and context.
3801 *
3802 * \param[in] solver_type type of solver (PCG, Jacobi, ...)
3803 * \param[in] poly_degree preconditioning polynomial degree
3804 * (0: diagonal; -1: non-preconditioned;
3805 * see \ref sles_it for details)
3806 * \param[in] n_max_iter maximum number of iterations
3807 * \param[in] update_stats automatic solver statistics indicator
3808 *
3809 * \return pointer to newly created solver info object.
3810 */
3811 /*----------------------------------------------------------------------------*/
3812
3813 cs_sles_it_t *
cs_sles_it_create(cs_sles_it_type_t solver_type,int poly_degree,int n_max_iter,bool update_stats)3814 cs_sles_it_create(cs_sles_it_type_t solver_type,
3815 int poly_degree,
3816 int n_max_iter,
3817 bool update_stats)
3818 {
3819 cs_sles_it_t *c;
3820
3821 BFT_MALLOC(c, 1, cs_sles_it_t);
3822
3823 c->type = solver_type;
3824 c->solve = NULL;
3825
3826 switch(c->type) {
3827 case CS_SLES_JACOBI:
3828 case CS_SLES_P_GAUSS_SEIDEL:
3829 case CS_SLES_P_SYM_GAUSS_SEIDEL:
3830 c->_pc = NULL;
3831 break;
3832 default:
3833 if (poly_degree < 0) {
3834 /* specific implementation for non-preconditioned PCG */
3835 if (c->type == CS_SLES_PCG)
3836 c->_pc = NULL;
3837 else
3838 c->_pc = cs_sles_pc_none_create();
3839 }
3840 else if (poly_degree == 0)
3841 c->_pc = cs_sles_pc_jacobi_create();
3842 else if (poly_degree == 1)
3843 c->_pc =cs_sles_pc_poly_1_create();
3844 else
3845 c->_pc =cs_sles_pc_poly_2_create();
3846 }
3847 c->pc = c->_pc;
3848
3849 c->update_stats = update_stats;
3850 c->ignore_convergence = false;
3851
3852 c->n_max_iter = n_max_iter;
3853 c->restart_interval = 20; /* Default value commonly found in the literature */
3854
3855 c->n_setups = 0;
3856 c->n_solves = 0;
3857
3858 c->n_iterations_min = 0;
3859 c->n_iterations_max = 0;
3860 c->n_iterations_last = 0;
3861 c->n_iterations_tot = 0;
3862
3863 CS_TIMER_COUNTER_INIT(c->t_setup);
3864 CS_TIMER_COUNTER_INIT(c->t_solve);
3865
3866 c->plot_time_stamp = 0;
3867 c->plot = NULL;
3868 c->_plot = NULL;
3869
3870 #if defined(HAVE_MPI)
3871 c->comm = cs_glob_mpi_comm;
3872 c->caller_comm = cs_glob_mpi_comm;
3873 c->caller_n_ranks = cs_glob_n_ranks;
3874 if (c->caller_n_ranks < 2) {
3875 c->comm = MPI_COMM_NULL;
3876 c->caller_comm = cs_glob_mpi_comm;
3877 }
3878 #endif
3879
3880 c->setup_data = NULL;
3881 c->add_data = NULL;
3882 c->shared = NULL;
3883
3884 /* Fallback mechanism; note that for fallbacks,
3885 the preconditioner is shared, so Krylov methods
3886 may not be mixed with Jacobi or Gauss-Seidel methods */
3887
3888 switch(c->type) {
3889 case CS_SLES_BICGSTAB:
3890 case CS_SLES_BICGSTAB2:
3891 case CS_SLES_PCR3:
3892 c->fallback_cvg = CS_SLES_BREAKDOWN;
3893 break;
3894 default:
3895 c->fallback_cvg = CS_SLES_DIVERGED;
3896 }
3897
3898 c->fallback = NULL;
3899
3900 return c;
3901 }
3902
3903 /*----------------------------------------------------------------------------*/
3904 /*!
3905 * \brief Destroy iterative sparse linear system solver info and context.
3906 *
3907 * \param[in, out] context pointer to iterative solver info and context
3908 * (actual type: cs_sles_it_t **)
3909 */
3910 /*----------------------------------------------------------------------------*/
3911
3912 void
cs_sles_it_destroy(void ** context)3913 cs_sles_it_destroy(void **context)
3914 {
3915 cs_sles_it_t *c = (cs_sles_it_t *)(*context);
3916 if (c != NULL) {
3917 if (c->fallback != NULL) {
3918 void *f = c->fallback;
3919 cs_sles_it_destroy(&f);
3920 c->fallback = f;
3921 }
3922 cs_sles_pc_destroy(&(c->_pc));
3923 cs_sles_it_free(c);
3924 if (c->_plot != NULL) {
3925 cs_time_plot_finalize(&(c->_plot));
3926 c->plot = NULL;
3927 }
3928 if (c->add_data != NULL) {
3929 BFT_FREE(c->add_data->order);
3930 BFT_FREE(c->add_data);
3931 }
3932 BFT_FREE(c);
3933 *context = c;
3934 }
3935 }
3936
3937 /*----------------------------------------------------------------------------*/
3938 /*!
3939 * \brief Create iterative sparse linear system solver info and context
3940 * based on existing info and context.
3941 *
3942 * \param[in] context pointer to reference info and context
3943 * (actual type: cs_sles_it_t *)
3944 *
3945 * \return pointer to newly created solver info object.
3946 * (actual type: cs_sles_it_t *)
3947 */
3948 /*----------------------------------------------------------------------------*/
3949
3950 void *
cs_sles_it_copy(const void * context)3951 cs_sles_it_copy(const void *context)
3952 {
3953 cs_sles_it_t *d = NULL;
3954
3955 if (context != NULL) {
3956 const cs_sles_it_t *c = context;
3957 d = cs_sles_it_create(c->type,
3958 -1,
3959 c->n_max_iter,
3960 c->update_stats);
3961 if (c->pc != NULL && c->_pc != NULL) {
3962 d->_pc = cs_sles_pc_clone(c->_pc);
3963 d->pc = d->_pc;
3964 }
3965 else {
3966 d->_pc = c->_pc;
3967 d->pc = c->pc;
3968 }
3969
3970 /* If useful, copy the restart interval */
3971 if (c->type == CS_SLES_GMRES || c->type == CS_SLES_GCR)
3972 d->restart_interval = c->restart_interval;
3973
3974 #if defined(HAVE_MPI)
3975 d->comm = c->comm;
3976 #endif
3977 }
3978
3979 return d;
3980 }
3981
3982 /*----------------------------------------------------------------------------*/
3983 /*!
3984 * \brief Log sparse linear equation solver info.
3985 *
3986 * \param[in] context pointer to iterative solver info and context
3987 * (actual type: cs_sles_it_t *)
3988 * \param[in] log_type log type
3989 */
3990 /*----------------------------------------------------------------------------*/
3991
3992 void
cs_sles_it_log(const void * context,cs_log_t log_type)3993 cs_sles_it_log(const void *context,
3994 cs_log_t log_type)
3995 {
3996 const cs_sles_it_t *c = context;
3997
3998 if (log_type == CS_LOG_SETUP) {
3999
4000 cs_log_printf(log_type,
4001 _(" Solver type: %s\n"),
4002 _(cs_sles_it_type_name[c->type]));
4003 if (c->pc != NULL)
4004 cs_log_printf(log_type,
4005 _(" Preconditioning: %s\n"),
4006 _(cs_sles_pc_get_type_name(c->pc)));
4007 if (c->type == CS_SLES_GMRES || c->type == CS_SLES_GCR)
4008 cs_log_printf(log_type,
4009 " Restart interval: %d\n",
4010 c->restart_interval);
4011 cs_log_printf(log_type,
4012 _(" Maximum number of iterations: %d\n"),
4013 c->n_max_iter);
4014 }
4015
4016 else if (log_type == CS_LOG_PERFORMANCE) {
4017
4018 int n_calls = c->n_solves;
4019 int n_it_min = c->n_iterations_min;
4020 int n_it_max = c->n_iterations_max;
4021 int n_it_mean = 0;
4022
4023 if (n_it_min < 0)
4024 n_it_min = 0;
4025
4026 if (n_calls > 0)
4027 n_it_mean = (int)( c->n_iterations_tot
4028 / ((unsigned long long)n_calls));
4029
4030 cs_log_printf(log_type,
4031 _("\n"
4032 " Solver type: %s\n"),
4033 _(cs_sles_it_type_name[c->type]));
4034
4035 if (c->pc != NULL)
4036 cs_log_printf(log_type,
4037 _(" Preconditioning: %s\n"),
4038 _(cs_sles_pc_get_type_name(c->pc)));
4039 cs_log_printf(log_type,
4040 _(" Number of setups: %12d\n"
4041 " Number of calls: %12d\n"
4042 " Minimum number of iterations: %12d\n"
4043 " Maximum number of iterations: %12d\n"
4044 " Mean number of iterations: %12d\n"
4045 " Total setup time: %12.3f\n"
4046 " Total solution time: %12.3f\n"),
4047 c->n_setups, n_calls, n_it_min, n_it_max, n_it_mean,
4048 c->t_setup.nsec*1e-9,
4049 c->t_solve.nsec*1e-9);
4050
4051 if (c->fallback != NULL) {
4052
4053 n_calls = c->fallback->n_solves;
4054 n_it_min = c->fallback->n_iterations_min;
4055 n_it_max = c->fallback->n_iterations_max;
4056 n_it_mean = 0;
4057
4058 if (n_it_min < 0)
4059 n_it_min = 0;
4060
4061 if (n_calls > 0)
4062 n_it_mean = (int)( c->fallback->n_iterations_tot
4063 / ((unsigned long long)n_calls));
4064
4065 cs_log_printf(log_type,
4066 _("\n"
4067 " Backup solver type: %s\n"),
4068 _(cs_sles_it_type_name[c->fallback->type]));
4069
4070 cs_log_printf(log_type,
4071 _(" Number of calls: %12d\n"
4072 " Minimum number of iterations: %12d\n"
4073 " Maximum number of iterations: %12d\n"
4074 " Mean number of iterations: %12d\n"
4075 " Total solution time: %12.3f\n"),
4076 n_calls, n_it_min, n_it_max, n_it_mean,
4077 c->fallback->t_solve.nsec*1e-9);
4078
4079 }
4080
4081 }
4082
4083 if (c->pc != NULL)
4084 cs_sles_pc_log(c->pc, log_type);
4085 }
4086
4087 /*----------------------------------------------------------------------------*/
4088 /*!
4089 * \brief Setup iterative sparse linear equation solver.
4090 *
4091 * \param[in, out] context pointer to iterative solver info and context
4092 * (actual type: cs_sles_it_t *)
4093 * \param[in] name pointer to system name
4094 * \param[in] a associated matrix
4095 * \param[in] verbosity associated verbosity
4096 */
4097 /*----------------------------------------------------------------------------*/
4098
4099 void
cs_sles_it_setup(void * context,const char * name,const cs_matrix_t * a,int verbosity)4100 cs_sles_it_setup(void *context,
4101 const char *name,
4102 const cs_matrix_t *a,
4103 int verbosity)
4104 {
4105 cs_sles_it_t *c = context;
4106
4107 cs_timer_t t0;
4108 if (c->update_stats == true)
4109 t0 = cs_timer_time();
4110
4111 const int diag_block_size = (cs_matrix_get_diag_block_size(a))[0];
4112
4113 if (verbosity > 1) {
4114 bft_printf(_("\n Setup of solver for linear system \"%s\"\n"),
4115 name);
4116 cs_matrix_log_info(a, verbosity);
4117 }
4118
4119 if ( c->type == CS_SLES_JACOBI
4120 || ( c->type >= CS_SLES_P_GAUSS_SEIDEL
4121 && c->type <= CS_SLES_P_SYM_GAUSS_SEIDEL)) {
4122 /* Force to Jacobi in case matrix type is not adapted */
4123 if (cs_matrix_get_type(a) != CS_MATRIX_MSR) {
4124 c->type = CS_SLES_JACOBI;
4125 }
4126 cs_sles_it_setup_priv(c, name, a, verbosity, diag_block_size, true);
4127 }
4128 else
4129 cs_sles_it_setup_priv(c, name, a, verbosity, diag_block_size, false);
4130
4131 switch (c->type) {
4132
4133 case CS_SLES_PCR3:
4134 c->solve = _conjugate_residual_3;
4135 break;
4136
4137 case CS_SLES_PCG:
4138 /* Check for single-reduction */
4139 {
4140 bool single_reduce = false;
4141 #if defined(HAVE_MPI)
4142 cs_gnum_t n_m_rows = c->setup_data->n_rows;
4143 if (c->comm != MPI_COMM_NULL) {
4144 int size;
4145 cs_gnum_t _n_m_rows;
4146 MPI_Allreduce(&n_m_rows, &_n_m_rows, 1, CS_MPI_GNUM, MPI_SUM, c->comm);
4147 MPI_Comm_size(c->comm, &size);
4148 n_m_rows = _n_m_rows / (cs_gnum_t)size;
4149 }
4150 if (c->comm != c->caller_comm)
4151 MPI_Bcast(&n_m_rows, 1, CS_MPI_GNUM, 0, cs_glob_mpi_comm);
4152 if (n_m_rows < (cs_gnum_t)_pcg_sr_threshold)
4153 single_reduce = true;
4154 #endif
4155 if (!single_reduce) {
4156 if (c->pc != NULL)
4157 c->solve = _conjugate_gradient;
4158 else
4159 c->solve = _conjugate_gradient_npc;
4160 break;
4161 }
4162 else {
4163 if (c->pc != NULL)
4164 c->solve = _conjugate_gradient_sr;
4165 else
4166 c->solve = _conjugate_gradient_npc_sr;
4167 }
4168 }
4169 break;
4170
4171 case CS_SLES_FCG:
4172 c->solve = _flexible_conjugate_gradient;
4173 break;
4174
4175 case CS_SLES_IPCG:
4176 c->solve = _conjugate_gradient_ip;
4177 break;
4178
4179 case CS_SLES_JACOBI:
4180 if (diag_block_size == 1)
4181 c->solve = _jacobi;
4182 else if (diag_block_size == 3)
4183 c->solve = _block_3_jacobi;
4184 else
4185 c->solve = _block_jacobi;
4186 break;
4187
4188 case CS_SLES_BICGSTAB:
4189 c->solve = _bi_cgstab;
4190 break;
4191
4192 case CS_SLES_BICGSTAB2:
4193 c->solve = _bicgstab2;
4194 break;
4195
4196 case CS_SLES_GCR:
4197 assert(c->restart_interval > 1);
4198 c->solve = _gcr;
4199 break;
4200
4201 case CS_SLES_GMRES:
4202 assert(c->restart_interval > 1);
4203 c->solve = _gmres;
4204 break;
4205
4206 case CS_SLES_P_GAUSS_SEIDEL:
4207 c->solve = _p_gauss_seidel;
4208 break;
4209 case CS_SLES_P_SYM_GAUSS_SEIDEL:
4210 c->solve = _p_sym_gauss_seidel_msr;
4211 break;
4212
4213 case CS_SLES_USER_DEFINED:
4214 c->solve = cs_user_sles_it_solver;
4215 break;
4216
4217 default:
4218 bft_error
4219 (__FILE__, __LINE__, 0,
4220 _("Setup of linear equation on \"%s\"\n"
4221 "with solver type %d, which is not defined)."),
4222 name, (int)c->type);
4223 break;
4224 }
4225
4226 /* Now finish */
4227
4228 if (c->update_stats == true) {
4229 cs_timer_t t1 = cs_timer_time();
4230 c->n_setups += 1;
4231 cs_timer_counter_add_diff(&(c->t_setup), &t0, &t1);
4232 }
4233 }
4234
4235 /*----------------------------------------------------------------------------*/
4236 /*!
4237 * \brief Call iterative sparse linear equation solver.
4238 *
4239 * \param[in, out] context pointer to iterative solver info and context
4240 * (actual type: cs_sles_it_t *)
4241 * \param[in] name pointer to system name
4242 * \param[in] a matrix
4243 * \param[in] verbosity associated verbosity
4244 * \param[in] precision solver precision
4245 * \param[in] r_norm residue normalization
4246 * \param[out] n_iter number of "equivalent" iterations
4247 * \param[out] residue residue
4248 * \param[in] rhs right hand side
4249 * \param[in, out] vx system solution
4250 * \param[in] aux_size number of elements in aux_vectors (in bytes)
4251 * \param aux_vectors optional working area
4252 * (internal allocation if NULL)
4253 *
4254 * \return convergence state
4255 */
4256 /*----------------------------------------------------------------------------*/
4257
cs_sles_convergence_state_t
cs_sles_it_solve(void                *context,
                 const char          *name,
                 const cs_matrix_t   *a,
                 int                  verbosity,
                 double               precision,
                 double               r_norm,
                 int                 *n_iter,
                 double              *residue,
                 const cs_real_t     *rhs,
                 cs_real_t           *vx,
                 size_t               aux_size,
                 void                *aux_vectors)
{
  cs_sles_it_t  *c = context;

  cs_sles_convergence_state_t cvg = CS_SLES_ITERATING;

  /* t0 is zero-initialized: it is only read when update_stats is true,
     but this keeps it always defined and silences
     "maybe-uninitialized" warnings. */
  cs_timer_t t0 = {0, 0}, t1;

  unsigned _n_iter = 0;
  cs_sles_it_convergence_t  convergence;

  if (c->update_stats == true)
    t0 = cs_timer_time();

  const cs_lnum_t *diag_block_size = cs_matrix_get_diag_block_size(a);
  const cs_lnum_t _diag_block_size = diag_block_size[0];

  /* Only square diagonal blocks are handled here. */
  assert(diag_block_size[0] == diag_block_size[1]);

  /* Initialize number of iterations and residue,
     check for immediate return,
     and solve sparse linear system */

  *n_iter = 0;

  /* Setup if not already done */

  if (c->setup_data == NULL) {

    if (c->update_stats) { /* Stop solve timer to switch to setup timer */
      t1 = cs_timer_time();
      cs_timer_counter_add_diff(&(c->t_solve), &t0, &t1);
    }

    cs_sles_it_setup(c, name, a, verbosity);

    if (c->update_stats) /* Restart solve timer */
      t0 = cs_timer_time();

  }

  /* Propagate tolerance to the preconditioner, so smoother-type
     preconditioners may use it in their own stopping criteria. */
  if (c->pc != NULL)
    cs_sles_pc_set_tolerance(c->pc, precision, r_norm);

  /* Solve sparse linear system */

  cs_sles_it_convergence_init(&convergence,
                              name,
                              verbosity,
                              c->n_max_iter,
                              precision,
                              r_norm,
                              residue);

  /* -1 marks "not yet computed"; the solver implementation sets it. */
  c->setup_data->initial_residue = -1;

  if (verbosity > 1)
    cs_log_printf(CS_LOG_DEFAULT,
                  _(" RHS norm: %11.4e\n\n"), r_norm);

  /* Only call solver for "active" ranks: a rank with a NULL reduction
     communicator and no local rows has nothing to solve. */

  bool local_solve = true;
#if defined(HAVE_MPI)
  if (c->comm == MPI_COMM_NULL) {
    cs_lnum_t n_rows = cs_matrix_get_n_rows(a);
    if (n_rows == 0) {
      local_solve = false;
      cvg = CS_SLES_CONVERGED;
    }
  }
#endif

  if (local_solve)
    cvg = c->solve(c,
                   a, _diag_block_size, &convergence,
                   rhs, vx,
                   aux_size, aux_vectors);

  /* Broadcast convergence info from "active" ranks to others */

#if defined(HAVE_MPI)
  if (c->comm != c->caller_comm && c->ignore_convergence == false) {
    /* cvg is signed, so shift (with some margin) before copy to unsigned. */
    unsigned buf[2] = {(unsigned)cvg+10, convergence.n_iterations};
    MPI_Bcast(buf, 2, MPI_UNSIGNED, 0, c->caller_comm);
    MPI_Bcast(&convergence.residue, 1, MPI_DOUBLE, 0, c->caller_comm);
    cvg = (cs_sles_convergence_state_t)(buf[0] - 10);
    convergence.n_iterations = buf[1];
  }
#endif

  /* Update return values */

  _n_iter = convergence.n_iterations;

  *n_iter = convergence.n_iterations;
  *residue = convergence.residue;

  /* If convergence fell below the fallback threshold, a GMRES solve
     will be attempted below. */
  cs_sles_it_type_t fallback_type = CS_SLES_N_IT_TYPES;
  if (cvg < c->fallback_cvg)
    fallback_type = CS_SLES_GMRES;

  if (c->update_stats == true) {

    t1 = cs_timer_time();

    c->n_solves += 1;

    /* Min/max iteration counts are only updated when no fallback is
       triggered, so they reflect the primary solver's behavior. */
    if (fallback_type == CS_SLES_N_IT_TYPES) {
      if (c->n_iterations_tot == 0)
        c->n_iterations_min = _n_iter;
      else if (c->n_iterations_min > _n_iter)
        c->n_iterations_min = _n_iter;
      if (c->n_iterations_max < _n_iter)
        c->n_iterations_max = _n_iter;
    }

    c->n_iterations_last = _n_iter;
    c->n_iterations_tot += _n_iter;

    cs_timer_counter_add_diff(&(c->t_solve), &t0, &t1);

  }

  /* Retry with the fallback solver if needed; it updates n_iter and
     residue itself. */
  if (fallback_type != CS_SLES_N_IT_TYPES)
    cvg = _fallback(c,
                    fallback_type,
                    a,
                    cvg,
                    &convergence,
                    n_iter,
                    residue,
                    rhs,
                    vx,
                    aux_size,
                    aux_vectors);

  return cvg;
}
4410
4411 /*----------------------------------------------------------------------------*/
4412 /*!
4413 * \brief Free iterative sparse linear equation solver setup context.
4414 *
4415 * This function frees resolution-related data, such as
4416 * buffers and preconditioning but does not free the whole context,
4417 * as info used for logging (especially performance data) is maintained.
4418 *
4419 * \param[in, out] context pointer to iterative solver info and context
4420 * (actual type: cs_sles_it_t *)
4421 */
4422 /*----------------------------------------------------------------------------*/
4423
4424 void
cs_sles_it_free(void * context)4425 cs_sles_it_free(void *context)
4426 {
4427 cs_sles_it_t *c = context;
4428
4429 cs_timer_t t0;
4430 if (c->update_stats == true)
4431 t0 = cs_timer_time();
4432
4433 if (c->fallback != NULL)
4434 cs_sles_it_free(c->fallback);
4435
4436 if (c->_pc != NULL)
4437 cs_sles_pc_free(c->_pc);
4438
4439 if (c->setup_data != NULL) {
4440 BFT_FREE(c->setup_data->_ad_inv);
4441 BFT_FREE(c->setup_data);
4442 }
4443
4444 if (c->update_stats == true) {
4445 cs_timer_t t1 = cs_timer_time();
4446 cs_timer_counter_add_diff(&(c->t_setup), &t0, &t1);
4447 }
4448 }
4449
4450 /*----------------------------------------------------------------------------*/
4451 /*!
4452 * \brief Return iterative solver type.
4453 *
4454 * \param[in] context pointer to iterative solver info and context
4455 *
4456 * \return selected solver type
4457 */
4458 /*----------------------------------------------------------------------------*/
4459
4460 cs_sles_it_type_t
cs_sles_it_get_type(const cs_sles_it_t * context)4461 cs_sles_it_get_type(const cs_sles_it_t *context)
4462 {
4463 return context->type;
4464 }
4465
4466 /*----------------------------------------------------------------------------*/
4467 /*!
4468 * \brief Return the initial residue for the previous solve operation
4469 * with a solver.
4470 *
4471 * This is useful for convergence tests when this solver is used as
4472 * a preconditioning smoother.
4473 *
4474 * This operation is only valid between calls to \ref cs_sles_it_setup
4475 * (or \ref cs_sles_it_solve) and \ref cs_sles_it_free.
4476 * It returns -1 otherwise.
4477 *
4478 * \param[in] context pointer to iterative solver info and context
4479 *
4480 * \return initial residue from last call to \ref cs_sles_solve with this
4481 * solver
4482 */
4483 /*----------------------------------------------------------------------------*/
4484
4485 double
cs_sles_it_get_last_initial_residue(const cs_sles_it_t * context)4486 cs_sles_it_get_last_initial_residue(const cs_sles_it_t *context)
4487 {
4488 double retval = 1;
4489 if (context->setup_data != NULL)
4490 retval = context->setup_data->initial_residue;
4491
4492 return retval;
4493 }
4494
4495 /*----------------------------------------------------------------------------*/
4496 /*!
4497 * \brief Return a preconditioner context for an iterative sparse linear
4498 * equation solver.
4499 *
4500 * This allows modifying parameters of a non default (Jacobi or polynomial)
4501 * preconditioner.
4502 *
4503 * \param[in] context pointer to iterative solver info and context
4504 *
 * \return pointer to preconditioner context
4506 */
4507 /*----------------------------------------------------------------------------*/
4508
4509 cs_sles_pc_t *
cs_sles_it_get_pc(cs_sles_it_t * context)4510 cs_sles_it_get_pc(cs_sles_it_t *context)
4511 {
4512 cs_sles_pc_t *pc = NULL;
4513
4514 if (context != NULL) {
4515 cs_sles_it_t *c = context;
4516 pc = c->pc;
4517 }
4518
4519 return pc;
4520 }
4521
4522 /*----------------------------------------------------------------------------*/
4523 /*!
4524 * \brief Assign a preconditioner to an iterative sparse linear equation
 *        solver, transferring its ownership to the solver context.
4526 *
4527 * This allows assigning a non default (Jacobi or polynomial) preconditioner.
4528 *
4529 * The input pointer is set to NULL to make it clear the caller does not
4530 * own the preconditioner anymore, though the context can be accessed using
4531 * \ref cs_sles_it_get_pc.
4532 *
4533 * \param[in, out] context pointer to iterative solver info and context
 * \param[in, out]  pc       pointer to preconditioner context
4535 */
4536 /*----------------------------------------------------------------------------*/
4537
4538 void
cs_sles_it_transfer_pc(cs_sles_it_t * context,cs_sles_pc_t ** pc)4539 cs_sles_it_transfer_pc(cs_sles_it_t *context,
4540 cs_sles_pc_t **pc)
4541 {
4542 if (context != NULL) {
4543 cs_sles_it_t *c = context;
4544 c->pc = NULL;
4545 cs_sles_pc_destroy(&(c->_pc));
4546 if (pc != NULL) {
4547 c->_pc = *pc;
4548 c->pc = *pc;
4549 }
4550 }
4551 else if (pc != NULL)
4552 cs_sles_pc_destroy(pc);
4553 }
4554
4555 /*----------------------------------------------------------------------------*/
4556 /*!
4557 * \brief Copy options from one iterative sparse linear system solver info
4558 * and context to another.
4559 *
4560 * Optional plotting contexts are shared between the source and destination
4561 * contexts.
4562 *
4563 * Preconditioner settings are to be handled separately.
4564 *
4565 * \param[in] src pointer to source info and context
4566 * \param[in, out] dest pointer to destination info and context
4567 */
4568 /*----------------------------------------------------------------------------*/
4569
4570 void
cs_sles_it_transfer_parameters(const cs_sles_it_t * src,cs_sles_it_t * dest)4571 cs_sles_it_transfer_parameters(const cs_sles_it_t *src,
4572 cs_sles_it_t *dest)
4573 {
4574 if (dest != NULL && src != NULL) {
4575
4576 dest->update_stats = src->update_stats;
4577 dest->n_max_iter = src->n_max_iter;
4578 dest->restart_interval = src->restart_interval;
4579
4580 dest->plot_time_stamp = src->plot_time_stamp;
4581 dest->plot = src->plot;
4582 if (dest->_plot != NULL)
4583 cs_time_plot_finalize(&(dest->_plot));
4584
4585 #if defined(HAVE_MPI)
4586 dest->comm = src->comm;
4587 #endif
4588
4589 }
4590 }
4591
4592 /*----------------------------------------------------------------------------*/
4593 /*!
4594 * \brief Associate a similar info and context object with which some setup
4595 * data may be shared.
4596 *
4597 * This is especially useful for sharing preconditioning data between
4598 * similar solver contexts (for example ascending and descending multigrid
4599 * smoothers based on the same matrix).
4600 *
4601 * For preconditioning data to be effectively shared, \ref cs_sles_it_setup
4602 * (or \ref cs_sles_it_solve) must be called on \p shareable before being
4603 * called on \p context (without \ref cs_sles_it_free being called in between,
4604 * of course).
4605 *
4606 * It is the caller's responsibility to ensure the context is not used
4607 * for a \ref cs_sles_it_setup or \ref cs_sles_it_solve operation after the
4608 * shareable object has been destroyed (normally by \ref cs_sles_it_destroy).
4609 *
4610 * \param[in, out] context pointer to iterative solver info and context
4611 * \param[in] shareable pointer to iterative solver info and context
4612 * whose context may be shared
4613 */
4614 /*----------------------------------------------------------------------------*/
4615
4616 void
cs_sles_it_set_shareable(cs_sles_it_t * context,const cs_sles_it_t * shareable)4617 cs_sles_it_set_shareable(cs_sles_it_t *context,
4618 const cs_sles_it_t *shareable)
4619 {
4620 cs_sles_it_t *c = context;
4621
4622 c->shared = shareable;
4623
4624 c->pc = shareable->pc;
4625
4626 if (c->pc != c->_pc && c->_pc != NULL)
4627 cs_sles_pc_destroy(&(c->_pc));
4628 }
4629
4630 #if defined(HAVE_MPI)
4631
4632 /*----------------------------------------------------------------------------*/
4633 /*!
4634 * \brief Set MPI communicator for global reductions.
4635 *
 * The system is solved only on ranks with a non-NULL communicator or
 * if the caller communicator has fewer than 2 ranks. Convergence info
 * is then broadcast across the caller communicator.
4639 *
4640 * \param[in, out] context pointer to iterative solver info and context
4641 * \param[in] comm MPI communicator for solving
4642 * \param[in] caller_comm MPI communicator of caller
4643 */
4644 /*----------------------------------------------------------------------------*/
4645
void
cs_sles_it_set_mpi_reduce_comm(cs_sles_it_t  *context,
                               MPI_Comm       comm,
                               MPI_Comm       caller_comm)
{
  cs_sles_it_t  *c = context;

  /* Cache the initial halo barrier setting on first call so it can be
     restored whenever the solver communicator matches the global one. */
  static int flag = -1;

  if (flag < 0)
    flag = cs_halo_get_use_barrier();

  c->comm = comm;
  c->caller_comm = caller_comm;

  if (c->caller_comm != MPI_COMM_NULL)
    MPI_Comm_size(c->caller_comm, &(c->caller_n_ranks));
  else
    c->caller_n_ranks = 1;

  /* When solving on a reduced communicator, disable halo barriers;
     otherwise restore the cached setting.  For a serial run, no
     reduction communicator is needed at all. */
  if (comm != cs_glob_mpi_comm)
    cs_halo_set_use_barrier(0);
  else {
    cs_halo_set_use_barrier(flag);
    if (cs_glob_n_ranks < 2)
      c->comm = MPI_COMM_NULL;
  }
}
4674
4675 #endif /* defined(HAVE_MPI) */
4676
4677 /*----------------------------------------------------------------------------*/
4678 /*!
4679 * \brief Assign ordering to iterative solver.
4680 *
4681 * The solver context takes ownership of the order array (i.e. it will
4682 * handle its later deallocation).
4683 *
4684 * This is useful only for Process-local Gauss-Seidel.
4685 *
4686 * \param[in, out] context pointer to iterative solver info and context
4687 * \param[in, out] order pointer to ordering array
4688 */
4689 /*----------------------------------------------------------------------------*/
4690
4691 void
cs_sles_it_assign_order(cs_sles_it_t * context,cs_lnum_t ** order)4692 cs_sles_it_assign_order(cs_sles_it_t *context,
4693 cs_lnum_t **order)
4694 {
4695 if (context->type != CS_SLES_P_GAUSS_SEIDEL)
4696 BFT_FREE(*order);
4697
4698 else {
4699
4700 if (context->add_data == NULL) {
4701 BFT_MALLOC(context->add_data, 1, cs_sles_it_add_t);
4702 context->add_data->order = NULL;
4703 }
4704
4705 BFT_FREE(context->add_data->order);
4706
4707 context->add_data->order = *order;
4708
4709 *order = NULL;
4710
4711 }
4712 }
4713
4714 /*----------------------------------------------------------------------------*/
4715 /*!
4716 * \brief Define convergence level under which the fallback to another
4717 * solver may be used if applicable.
4718 *
 * Currently, this mechanism is used by default only for BiCGstab and
4720 * 3-layer conjugate residual solvers with scalar matrices, which may
4721 * fall back to a preconditioned GMRES solver. For those solvers, the
4722 * default threshold is \ref CS_SLES_BREAKDOWN, meaning that divergence
4723 * (but not breakdown) will lead to the use of the fallback mechanism.
4724 *
4725 * \param[in, out] context pointer to iterative solver info and context
4726 * \param[in] threshold convergence level under which fallback is used
4727 */
4728 /*----------------------------------------------------------------------------*/
4729
4730 void
cs_sles_it_set_fallback_threshold(cs_sles_it_t * context,cs_sles_convergence_state_t threshold)4731 cs_sles_it_set_fallback_threshold(cs_sles_it_t *context,
4732 cs_sles_convergence_state_t threshold)
4733 {
4734 context->fallback_cvg = threshold;
4735 }
4736
4737 /*----------------------------------------------------------------------------*/
4738 /*!
4739 * \brief Define the number of iterations to be done before restarting the
4740 * solver. Useful only for GCR or GMRES algorithms.
4741 *
4742 * \param[in, out] context pointer to iterative solver info and context
 * \param[in]       interval  number of iterations before the solver is
 *                            restarted
4744 */
4745 /*----------------------------------------------------------------------------*/
4746
4747 void
cs_sles_it_set_restart_interval(cs_sles_it_t * context,int interval)4748 cs_sles_it_set_restart_interval(cs_sles_it_t *context,
4749 int interval)
4750 {
4751 if (context == NULL)
4752 return;
4753
4754 context->restart_interval = interval;
4755 }
4756
4757 /*----------------------------------------------------------------------------*/
4758 /*!
4759 * \brief Query mean number of rows under which Conjugate Gradient algorithm
4760 * uses the single-reduction variant.
4761 *
4762 * The single-reduction variant requires only one parallel sum per
4763 * iteration (instead of 2), at the cost of additional vector operations,
4764 * so it tends to be more expensive when the number of matrix rows per
4765 * MPI rank is high, then becomes cheaper when the MPI latency cost becomes
4766 * more significant.
4767 *
4768 * This option is ignored for non-parallel runs, so 0 is returned.
4769 *
4770 * \returns mean number of rows per active rank under which the
4771 * single-reduction variant will be used
4772 */
4773 /*----------------------------------------------------------------------------*/
4774
4775 cs_lnum_t
cs_sles_it_get_pcg_single_reduction(void)4776 cs_sles_it_get_pcg_single_reduction(void)
4777 {
4778 #if defined(HAVE_MPI)
4779 return _pcg_sr_threshold;
4780 #else
4781 return 0;
4782 #endif
4783 }
4784
4785 /*----------------------------------------------------------------------------*/
4786 /*!
4787 * \brief Set mean number of rows under which Conjugate Gradient algorithm
4788 * should use the single-reduction variant.
4789 *
4790 * The single-reduction variant requires only one parallel sum per
4791 * iteration (instead of 2), at the cost of additional vector operations,
4792 * so it tends to be more expensive when the number of matrix rows per
4793 * MPI rank is high, then becomes cheaper when the MPI latency cost becomes
4794 * more significant.
4795 *
4796 * This option is ignored for non-parallel runs.
4797 *
4798 * \param[in] threshold mean number of rows per active rank under which the
4799 * single-reduction variant will be used
4800 */
4801 /*----------------------------------------------------------------------------*/
4802
4803 void
cs_sles_it_set_pcg_single_reduction(cs_lnum_t threshold)4804 cs_sles_it_set_pcg_single_reduction(cs_lnum_t threshold)
4805 {
4806 #if defined(HAVE_MPI)
4807 _pcg_sr_threshold = threshold;
4808 #endif
4809 }
4810
4811 /*----------------------------------------------------------------------------*/
4812 /*!
4813 * \brief Log the current global settings relative to parallelism.
4814 */
4815 /*----------------------------------------------------------------------------*/
4816
void
cs_sles_it_log_parallel_options(void)
{
#if defined(HAVE_MPI)
  /* These settings are only meaningful (and logged) in parallel runs */
  if (cs_glob_n_ranks < 2)
    return;

  cs_log_printf(CS_LOG_SETUP,
                _("\n"
                  "Iterative linear solvers parallel parameters:\n"
                  "  PCG single-reduction threshold: %ld\n"),
                (long)_pcg_sr_threshold);
#endif
}
4829
4830 /*----------------------------------------------------------------------------*/
4831 /*!
4832 * \brief Error handler for iterative sparse linear equation solver.
4833 *
4834 * In case of divergence or breakdown, this error handler outputs
4835 * postprocessing data to assist debugging, then aborts the run.
4836 * It does nothing in case the maximum iteration count is reached.
4837
4838 * \param[in, out] sles pointer to solver object
4839 * \param[in] state convergence state
4840 * \param[in] a matrix
4841 * \param[in] rhs right hand side
4842 * \param[in, out] vx system solution
4843 *
4844 * \return false (do not attempt new solve)
4845 */
4846 /*----------------------------------------------------------------------------*/
4847
bool
cs_sles_it_error_post_and_abort(cs_sles_t                    *sles,
                                cs_sles_convergence_state_t   state,
                                const cs_matrix_t            *a,
                                const cs_real_t              *rhs,
                                cs_real_t                    *vx)
{
  /* Do nothing (and do not retry) unless the state is below
     CS_SLES_BREAKDOWN, i.e. an actual solver failure. */
  if (state >= CS_SLES_BREAKDOWN)
    return false;

  const cs_sles_it_t  *c = cs_sles_get_context(sles);
  const char *name = cs_sles_get_name(sles);

  /* Output postprocessing data on a dedicated error writer to
     assist debugging before aborting. */
  int mesh_id = cs_post_init_error_writer_cells();

  cs_sles_post_error_output_def(name,
                                mesh_id,
                                a,
                                rhs,
                                vx);

  cs_post_finalize();

  const char *error_type[] = {N_("divergence"), N_("breakdown")};
  /* NOTE(review): given the early return above, state < CS_SLES_BREAKDOWN
     here, so err_id is always 0 (divergence); the breakdown entry appears
     defensive — confirm this is intended. */
  int err_id = (state == CS_SLES_BREAKDOWN) ? 1 : 0;

  /* bft_error aborts the run; the return below is for the
     error-handler signature contract only. */
  bft_error(__FILE__, __LINE__, 0,
            _("%s: error (%s) solving for %s"),
            _(cs_sles_it_type_name[c->type]),
            _(error_type[err_id]),
            name);

  return false;
}
4882
4883 /*----------------------------------------------------------------------------*/
4884 /*!
4885 * \brief Set plotting options for an iterative sparse linear equation solver.
4886 *
4887 * \param[in, out] context pointer to iterative solver info and context
4888 * \param[in] base_name base plot name to activate, NULL otherwise
4889 * \param[in] use_iteration if true, use iteration as time stamp
4890 * otherwise, use wall clock time
4891 */
4892 /*----------------------------------------------------------------------------*/
4893
4894 void
cs_sles_it_set_plot_options(cs_sles_it_t * context,const char * base_name,bool use_iteration)4895 cs_sles_it_set_plot_options(cs_sles_it_t *context,
4896 const char *base_name,
4897 bool use_iteration)
4898 {
4899 if (context != NULL) {
4900 if (cs_glob_rank_id < 1 && base_name != NULL) {
4901
4902 /* Destroy previous plot if options reset */
4903 if (context->_plot != NULL)
4904 cs_time_plot_finalize(&(context->_plot));
4905
4906 /* Create new plot */
4907 cs_file_mkdir_default("monitoring");
4908 const char *probe_names[] = {base_name};
4909 context->_plot = cs_time_plot_init_probe(base_name,
4910 "monitoring/residue_",
4911 CS_TIME_PLOT_CSV,
4912 use_iteration,
4913 -1.0, /* force flush */
4914 0, /* no buffer */
4915 1, /* n_probes */
4916 NULL, /* probe_list */
4917 NULL, /* probe_coords */
4918 probe_names);
4919 context->plot = context->_plot;
4920 context->plot_time_stamp = 0;
4921 }
4922 }
4923 }
4924
4925 /*----------------------------------------------------------------------------*/
4926
4927 END_C_DECLS
4928