/*============================================================================
 * Sparse Linear Equation Solvers
 *============================================================================*/

/*
  This file is part of Code_Saturne, a general-purpose CFD tool.

  Copyright (C) 1998-2021 EDF S.A.

  This program is free software; you can redistribute it and/or modify it under
  the terms of the GNU General Public License as published by the Free Software
  Foundation; either version 2 of the License, or (at your option) any later
  version.

  This program is distributed in the hope that it will be useful, but WITHOUT
  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
  details.

  You should have received a copy of the GNU General Public License along with
  this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
  Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/*----------------------------------------------------------------------------*/

#include "cs_defs.h"

/*----------------------------------------------------------------------------
 * Standard C library headers
 *----------------------------------------------------------------------------*/

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <math.h>
#include <float.h>

#if defined(HAVE_MPI)
#include <mpi.h>
#endif

/*----------------------------------------------------------------------------
 * Local headers
 *----------------------------------------------------------------------------*/

#include "cs_parall.h"

/*----------------------------------------------------------------------------
 *  Header for the current file
 *----------------------------------------------------------------------------*/

#include "cs_sles_it.h"
#include "cs_sles_it_priv.h"

/*----------------------------------------------------------------------------*/

BEGIN_C_DECLS

/*=============================================================================
 * Additional doxygen documentation
 *============================================================================*/

/*!
  \file cs_sles_it.c
        Iterative linear solvers

 \page sles_it Iterative linear solvers

 For Krylov space solvers, default preconditioning is based
 on a Neumann polynomial of degree \a poly_degree, with a negative value
 meaning no preconditioning, and 0 meaning diagonal preconditioning.

 For positive values of \a poly_degree, the preconditioning is defined
 as follows: \a D being the diagonal part of matrix \a A and \a X its
 extra-diagonal part, it can be written \f$A=D(Id+D^{-1}X)\f$. Therefore
 \f$A^{-1}=(Id+D^{-1}X)^{-1}D^{-1}\f$. A truncated Neumann series expansion
 of \f$(Id+D^{-1}X)^{-1}\f$ can then be used, which yields, symbolically,
 \f[
 Id+\sum\limits_{I=1}^{poly\_degree}\left(-D^{-1}X\right)^{I}
 \f]

 The efficiency of the polynomial preconditioning will vary depending
 on the system type. In most cases, diagonal or degree 1 preconditioning
 provides the best results. Each polynomial preconditioning degree above 0
 adds one matrix-vector product per initial matrix-vector product of the
 algorithm. Switching from diagonal to polynomial degree 1 often divides
 the number of required iterations by approximately 2, but each iteration
 then costs close to 2 times that of diagonal preconditioning (other vector
 operations are not doubled), so the net gain is often about 10%. Higher
 degree polynomials usually lead to diminishing returns.
*/
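
/* To make the expansion above concrete, the following is a minimal,
   illustrative sketch (not a function of this file nor of the Code_Saturne
   API) of a degree-1 application g = (Id - D^{-1}X).D^{-1}.r, assuming
   "ad_inv" holds the inverse of the matrix diagonal and "t" is a work
   array of the same size as "r". */

#if 0 /* illustrative sketch only, never compiled */

static void
_example_poly_1_pc_apply(const cs_matrix_t  *a,
                         cs_lnum_t           n_rows,
                         const cs_real_t    *ad_inv,
                         const cs_real_t    *r,
                         cs_real_t          *restrict g,
                         cs_real_t          *restrict t)
{
  /* t = D^-1.r */
  for (cs_lnum_t ii = 0; ii < n_rows; ii++)
    t[ii] = r[ii] * ad_inv[ii];

  /* g = X.t (extra-diagonal contribution only) */
  cs_matrix_exdiag_vector_multiply(a, t, g);

  /* g = t - D^-1.X.t */
  for (cs_lnum_t ii = 0; ii < n_rows; ii++)
    g[ii] = t[ii] - ad_inv[ii]*g[ii];
}

#endif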

/*! \cond DOXYGEN_SHOULD_SKIP_THIS */

/*=============================================================================
 * Local Macro Definitions
 *============================================================================*/

#define RINFIN  1.E+30

/* SIMD unit size to ensure SIMD alignment (2 to 8 required on most
 * current architectures, so a padding multiple of 16 should be enough) */

#define CS_SIMD_SIZE(s) (((s-1)/16+1)*16)
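
/* For example, CS_SIMD_SIZE(1000) = ((1000-1)/16 + 1)*16 = 1008: sizes are
   rounded up to the next multiple of 16 values, so each work array slice of
   wa_size values allocated below starts on a SIMD-alignable boundary. */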

/*=============================================================================
 * Local Structure Definitions
 *============================================================================*/

/*============================================================================
 *  Global variables
 *============================================================================*/

static int _thread_debug = 0;

/* Mean system rows threshold under which we use the single-reduction
   version of PCG */

static cs_lnum_t _pcg_sr_threshold = 512;

/* Sparse linear equation solver type names */

const char *cs_sles_it_type_name[]
  = {N_("Conjugate Gradient"),
     N_("Flexible Conjugate Gradient"),
     N_("Inexact Preconditioned Conjugate Gradient"),
     N_("Jacobi"),
     N_("BiCGstab"),
     N_("BiCGstab2"),
     N_("GCR"),
     N_("GMRES"),
     N_("Gauss-Seidel"),
     N_("Symmetric Gauss-Seidel"),
     N_("3-layer conjugate residual"),
     N_("User-defined iterative solver"),
     N_("None"), /* Smoothers beyond this */
     N_("Truncated forward Gauss-Seidel"),
     N_("Truncated backward Gauss-Seidel"),
};

/*=============================================================================
 * Local Structure Definitions
 *============================================================================*/

/*============================================================================
 * Private function definitions
 *============================================================================*/

/*----------------------------------------------------------------------------
 * Convergence test.
 *
 * parameters:
 *   c           <-- pointer to solver context info
 *   n_iter      <-- number of iterations done
 *   residue     <-- non-normalized residue
 *   convergence <-> convergence information structure
 *
 * returns:
 *   convergence status.
 *----------------------------------------------------------------------------*/
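
/* As a summary of the logic below: the test returns CS_SLES_CONVERGED when
   residue < convergence->precision * convergence->r_norm, CS_SLES_MAX_ITERATION
   when the iteration budget is exhausted first, CS_SLES_DIVERGED when the
   residue exceeds 10000 times its initial value (and 100 in absolute terms)
   or is NaN/infinite, and CS_SLES_ITERATING otherwise. */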

static cs_sles_convergence_state_t
_convergence_test(cs_sles_it_t              *c,
                  unsigned                   n_iter,
                  double                     residue,
                  cs_sles_it_convergence_t  *convergence)
{
  const int verbosity = convergence->verbosity;
  const cs_sles_it_setup_t  *s = c->setup_data;

  const char final_fmt[]
    = N_("  n_iter: %5d, res_abs: %11.4e, res_nor: %11.4e, norm: %11.4e,"
         " res_init: %11.4e\n");

  /* Update convergence info structure */

  convergence->n_iterations = n_iter;
  convergence->residue = residue;

  /* Plot convergence if requested */

  if (c->plot != NULL) {
    double vals = residue;
    double wall_time = cs_timer_wtime();
    c->plot_time_stamp += 1;
    cs_time_plot_vals_write(c->plot,             /* time plot structure */
                            c->plot_time_stamp,  /* current iteration number */
                            wall_time,           /* current time */
                            1,                   /* number of values */
                            &vals);              /* values */

  }

  /* If converged */

  if (residue < convergence->precision * convergence->r_norm) {
    if (verbosity > 1)
      bft_printf(_(final_fmt), n_iter, residue, residue/convergence->r_norm,
                 convergence->r_norm, s->initial_residue);
    return CS_SLES_CONVERGED;
  }

  /* If maximum number of iterations is reached without convergence */

  else if (n_iter >= convergence->n_iterations_max) {
    if (verbosity > -1) {
      if (verbosity == 1) /* Already output if verbosity > 1 */
        bft_printf("%s [%s]:\n", cs_sles_it_type_name[c->type],
                   convergence->name);
      else {
        if (convergence->r_norm > 0.)
          bft_printf(_(final_fmt),
                     n_iter, residue, residue/convergence->r_norm,
                     convergence->r_norm, s->initial_residue);
        else
          bft_printf(_("  n_iter : %5d, res_abs : %11.4e\n"),
                     n_iter, residue);
      }
      if (convergence->precision > 0.)
        bft_printf(_(" @@ Warning: non-convergence\n"));
    }
    return CS_SLES_MAX_ITERATION;
  }

  /* Otherwise, test for divergence */

  else {
    int diverges = 0;
    if (residue > s->initial_residue * 10000.0 && residue > 100.)
      diverges = 1;
#if (__STDC_VERSION__ >= 199901L)
    else if (isnan(residue) || isinf(residue)) {
      diverges = 1;
    }
#endif
    if (diverges == 1) {
      bft_printf(_("\n\n"
                   "%s [%s]: divergence after %u iterations:\n"
                   "  initial residual: %11.4e; current residual: %11.4e\n"),
                 cs_sles_it_type_name[c->type], convergence->name,
                 convergence->n_iterations,
                 s->initial_residue, convergence->residue);
      return CS_SLES_DIVERGED;
    }
  }

  return CS_SLES_ITERATING;
}

/*----------------------------------------------------------------------------
 * Solution of A.vx = Rhs using preconditioned conjugate gradient.
 *
 * Parallel-optimized version, groups dot products, at the cost of
 * computation of the preconditioning for n+1 iterations instead of n.
 *
 * On entry, vx is considered initialized.
 *
 * parameters:
 *   c               <-- pointer to solver context info
 *   a               <-- matrix
 *   diag_block_size <-- diagonal block size
 *   convergence     <-- convergence information structure
 *   rhs             <-- right hand side
 *   vx              <-> system solution
 *   aux_size        <-- size of aux_vectors (in bytes)
 *   aux_vectors     --- optional working area (allocation otherwise)
 *
 * returns:
 *   convergence state
 *----------------------------------------------------------------------------*/
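
/* As a reading aid, the recurrences used below may be summarized as follows
   (in exact arithmetic), with rk = A.xk - rhs the residual, gk = M^-1.rk its
   preconditioned form, dk the descent direction and zk = A.dk:

     alpha_k  = - (rk.dk) / (dk.zk)
     x_(k+1)  = xk + alpha_k.dk
     r_(k+1)  = rk + alpha_k.zk
     beta_k   = (r_(k+1).g_(k+1)) / (rk.gk)
     d_(k+1)  = g_(k+1) + beta_k.dk

   Pairing the dot products ((rk.rk, rk.gk) on one hand, (rk.dk, dk.zk) on
   the other) allows computing each pair in a single parallel reduction. */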

static cs_sles_convergence_state_t
_conjugate_gradient(cs_sles_it_t              *c,
                    const cs_matrix_t         *a,
                    cs_lnum_t                  diag_block_size,
                    cs_sles_it_convergence_t  *convergence,
                    const cs_real_t           *rhs,
                    cs_real_t                 *restrict vx,
                    size_t                     aux_size,
                    void                      *aux_vectors)
{
  cs_sles_convergence_state_t cvg;
  double  ro_0, ro_1, alpha, rk_gkm1, rk_gk, beta, residue;
  cs_real_t  *_aux_vectors;
  cs_real_t  *restrict rk, *restrict dk, *restrict gk;
  cs_real_t *restrict zk;

  unsigned n_iter = 0;

  /* Allocate or map work arrays */
  /*-----------------------------*/

  assert(c->setup_data != NULL);

  const cs_lnum_t n_rows = c->setup_data->n_rows;

  {
    const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size;
    const size_t n_wa = 4;
    const size_t wa_size = CS_SIMD_SIZE(n_cols);

    if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < (wa_size * n_wa))
      BFT_MALLOC(_aux_vectors, wa_size * n_wa, cs_real_t);
    else
      _aux_vectors = aux_vectors;

    rk = _aux_vectors;
    dk = _aux_vectors + wa_size;
    gk = _aux_vectors + wa_size*2;
    zk = _aux_vectors + wa_size*3;
  }

  /* Initialize iterative calculation */
  /*----------------------------------*/

  /* Residue and descent direction */

  cs_matrix_vector_multiply(a, vx, rk);  /* rk = A.x0 */

# pragma omp parallel for if(n_rows > CS_THR_MIN)
  for (cs_lnum_t ii = 0; ii < n_rows; ii++)
    rk[ii] -= rhs[ii];

  /* Preconditioning */

  c->setup_data->pc_apply(c->setup_data->pc_context,
                          rk,
                          gk);

  /* Descent direction */
  /*-------------------*/

#if defined(HAVE_OPENMP)

# pragma omp parallel for if(n_rows > CS_THR_MIN)
  for (cs_lnum_t ii = 0; ii < n_rows; ii++)
    dk[ii] = gk[ii];

#else

  memcpy(dk, gk, n_rows * sizeof(cs_real_t));

#endif

  _dot_products_xx_xy(c, rk, gk, &residue, &rk_gkm1);
  residue = sqrt(residue);

  /* If no solving required, finish here */

  c->setup_data->initial_residue = residue;
  cvg = _convergence_test(c, n_iter, residue, convergence);

  if (cvg == CS_SLES_ITERATING) {

    n_iter = 1;

    cs_matrix_vector_multiply(a, dk, zk);

    /* Descent parameter */

    _dot_products_xy_yz(c, rk, dk, zk, &ro_0, &ro_1);

    cs_real_t d_ro_1 = (CS_ABS(ro_1) > DBL_MIN) ? 1. / ro_1 : 0.;
    alpha =  - ro_0 * d_ro_1;

#   pragma omp parallel if(n_rows > CS_THR_MIN)
    {
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        vx[ii] += (alpha * dk[ii]);

#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        rk[ii] += (alpha * zk[ii]);
    }

    /* Convergence test */

    residue = sqrt(_dot_product_xx(c, rk));
    cvg = _convergence_test(c, n_iter, residue, convergence);

    /* Current Iteration */
    /*-------------------*/

  }

  while (cvg == CS_SLES_ITERATING) {

    /* Preconditioning */

    c->setup_data->pc_apply(c->setup_data->pc_context, rk, gk);

    /* Compute residue and prepare descent parameter */

    _dot_products_xx_xy(c, rk, gk, &residue, &rk_gk);

    residue = sqrt(residue);

    /* Convergence test for end of previous iteration */

    if (n_iter > 1)
      cvg = _convergence_test(c, n_iter, residue, convergence);

    if (cvg != CS_SLES_ITERATING)
      break;

    n_iter += 1;

    /* Complete descent parameter computation and matrix.vector product */

    beta = rk_gk / rk_gkm1;
    rk_gkm1 = rk_gk;

#   pragma omp parallel for firstprivate(beta) if(n_rows > CS_THR_MIN)
    for (cs_lnum_t ii = 0; ii < n_rows; ii++)
      dk[ii] = gk[ii] + (beta * dk[ii]);

    cs_matrix_vector_multiply(a, dk, zk);

    _dot_products_xy_yz(c, rk, dk, zk, &ro_0, &ro_1);

    cs_real_t d_ro_1 = (CS_ABS(ro_1) > DBL_MIN) ? 1. / ro_1 : 0.;
    alpha =  - ro_0 * d_ro_1;

#   pragma omp parallel if(n_rows > CS_THR_MIN)
    {
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        vx[ii] += (alpha * dk[ii]);

#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        rk[ii] += (alpha * zk[ii]);
    }

  }

  if (_aux_vectors != aux_vectors)
    BFT_FREE(_aux_vectors);

  return cvg;
}

/*----------------------------------------------------------------------------
 * Solution of A.vx = Rhs using flexible preconditioned conjugate gradient.
 *
 * Compared to standard PCG, FCG supports variable preconditioners.
 *
 * This variant, described in \cite Notay:2015, allows computing the
 * required inner products with a single global communication.
 *
 * On entry, vx is considered initialized.
 *
 * parameters:
 *   c               <-- pointer to solver context info
 *   a               <-- matrix
 *   diag_block_size <-- diagonal block size
 *   convergence     <-- convergence information structure
 *   rhs             <-- right hand side
 *   vx              <-> system solution
 *   aux_size        <-- size of aux_vectors (in bytes)
 *   aux_vectors     --- optional working area (allocation otherwise)
 *
 * returns:
 *   convergence state
 *----------------------------------------------------------------------------*/
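
/* As a reading aid, one iteration of the Notay (2015) variant implemented
   below may be summarized as follows, with rk the residual, vk = M^-1.rk,
   wk = A.vk, and (dk, qk = A.dk) the updated direction pair:

     (alpha_k, beta_k, gamma_k, rk.rk)
        = (vk.rk, vk.wk, vk.qk, rk.rk)            <- one global reduction
     rho_k    = beta_k - gamma_k^2 / rho_(k-1)    (rho_0 = beta_0)
     dk       = vk - (gamma_k / rho_(k-1)).d_(k-1)    (d_0 = v_0)
     qk       = wk - (gamma_k / rho_(k-1)).q_(k-1)    (q_0 = w_0)
     x_(k+1)  = xk + (alpha_k / rho_k).dk
     r_(k+1)  = rk - (alpha_k / rho_k).qk
*/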

static cs_sles_convergence_state_t
_flexible_conjugate_gradient(cs_sles_it_t              *c,
                             const cs_matrix_t         *a,
                             cs_lnum_t                  diag_block_size,
                             cs_sles_it_convergence_t  *convergence,
                             const cs_real_t           *rhs,
                             cs_real_t                 *restrict vx,
                             size_t                     aux_size,
                             void                      *aux_vectors)
{
  cs_sles_convergence_state_t cvg = CS_SLES_ITERATING;
  cs_real_t  *_aux_vectors;
  cs_real_t  *restrict rk, *restrict vk, *restrict wk;
  cs_real_t  *restrict dk, *restrict qk;

  unsigned n_iter = 0;

  /* Allocate or map work arrays */
  /*-----------------------------*/

  assert(c->setup_data != NULL);

  const cs_lnum_t n_rows = c->setup_data->n_rows;

  {
    const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size;
    const size_t n_wa = 5;
    const size_t wa_size = CS_SIMD_SIZE(n_cols);

    if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < (wa_size * n_wa))
      BFT_MALLOC(_aux_vectors, wa_size * n_wa, cs_real_t);
    else
      _aux_vectors = aux_vectors;

    rk = _aux_vectors;
    vk = _aux_vectors + wa_size;
    wk = _aux_vectors + wa_size*2;
    dk = _aux_vectors + wa_size*3;
    qk = _aux_vectors + wa_size*4;
  }

  /* Initialize iterative calculation */
  /*----------------------------------*/

  /* Residue and descent direction */

  cs_matrix_vector_multiply(a, vx, rk);  /* rk = A.x0 */

# pragma omp parallel if(n_rows > CS_THR_MIN)
  {
#   pragma omp for nowait
    for (cs_lnum_t ii = 0; ii < n_rows; ii++)
      rk[ii] = rhs[ii] - rk[ii];

#   pragma omp for nowait
    for (cs_lnum_t ii = 0; ii < n_rows; ii++)
      qk[ii] = 0;
  }

  double rho_km1 = 0;

  while (cvg == CS_SLES_ITERATING) {

    /* Preconditioning */

    c->setup_data->pc_apply(c->setup_data->pc_context, rk, vk);

    cs_matrix_vector_multiply(a, vk, wk);

    /* Compute residue and prepare descent parameter */

    double alpha_k, beta_k, gamma_k, residue;

    _dot_products_vr_vw_vq_rr(c, vk, rk, wk, qk,
                              &alpha_k, &beta_k, &gamma_k, &residue);

    residue = sqrt(residue);

    /* Convergence test for end of previous iteration */

    if (n_iter > 0)
      cvg = _convergence_test(c, n_iter, residue, convergence);
    else
      c->setup_data->initial_residue = residue;

    if (cvg != CS_SLES_ITERATING)
      break;

    /* Complete descent parameter computation and matrix.vector product */

    if (n_iter > 0) {

      cs_real_t gk_rk1 = gamma_k / rho_km1;
      cs_real_t rho_k = beta_k - gamma_k*gamma_k / rho_km1;
      cs_real_t ak_rk = alpha_k / rho_k;

#     pragma omp parallel if(n_rows > CS_THR_MIN)
      {
#       pragma omp for nowait
        for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
          dk[ii] = vk[ii] - gk_rk1 * dk[ii];
          vx[ii] = vx[ii] + ak_rk * dk[ii];
        }

#       pragma omp for nowait
        for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
          qk[ii] = wk[ii] - gk_rk1 * qk[ii];
          rk[ii] = rk[ii] - ak_rk * qk[ii];
        }
      }

      rho_km1 = rho_k;
    }
    else { /* n_iter == 0 */

      cs_real_t rho_k = beta_k;
      cs_real_t ak_rk = alpha_k / rho_k;

#     pragma omp parallel if(n_rows > CS_THR_MIN)
      {
#       pragma omp for nowait
        for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
          dk[ii] = vk[ii];
          vx[ii] = vx[ii] + ak_rk * vk[ii];
        }

#       pragma omp for nowait
        for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
          qk[ii] = wk[ii];
          rk[ii] = rk[ii] - ak_rk * wk[ii];
        }
      }

      rho_km1 = rho_k;
    }

    n_iter += 1;
  }

  if (_aux_vectors != aux_vectors)
    BFT_FREE(_aux_vectors);

  return cvg;
}

/*----------------------------------------------------------------------------
 * Solution of A.vx = Rhs using inexact preconditioned conjugate gradient.
 *
 * Compared to standard PCG, IPCG supports variable preconditioners, at
 * the expense of storing the residual at iterations k (rk) and k-1 (rkm1)
 * to compute the beta coefficient. When the preconditioner is constant
 * across the iterations, IPCG is equivalent to PCG.
 *
 * On entry, vx is considered initialized.
 *
 * parameters:
 *   c               <-- pointer to solver context info
 *   a               <-- matrix
 *   diag_block_size <-- diagonal block size
 *   convergence     <-- convergence information structure
 *   rhs             <-- right hand side
 *   vx              <-> system solution
 *   aux_size        <-- size of aux_vectors (in bytes)
 *   aux_vectors     --- optional working area (allocation otherwise)
 *
 * returns:
 *   convergence state
 *----------------------------------------------------------------------------*/
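
/* As a reading aid: relative to the standard PCG recurrence above, only the
   beta coefficient changes, taking a Polak-Ribiere-type form which requires
   keeping the previous residual r_(k-1):

     beta_k = (rk.gk - r_(k-1).gk) / (r_(k-1).g_(k-1))

   With a constant preconditioner, r_(k-1).gk = 0 in exact arithmetic, and
   the standard PCG beta is recovered. */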

static cs_sles_convergence_state_t
_conjugate_gradient_ip(cs_sles_it_t              *c,
                       const cs_matrix_t         *a,
                       cs_lnum_t                  diag_block_size,
                       cs_sles_it_convergence_t  *convergence,
                       const cs_real_t           *rhs,
                       cs_real_t                 *restrict vx,
                       size_t                     aux_size,
                       void                      *aux_vectors)
{
  cs_sles_convergence_state_t cvg;
  double  ro_0, ro_1, alpha, rk_gk_m1, rkm1_gk, rk_gk, beta, residue;
  cs_real_t  *_aux_vectors;
  cs_real_t  *restrict rk, *restrict rkm1, *restrict dk, *restrict gk;
  cs_real_t  *restrict zk;

  unsigned n_iter = 0;

  /* Allocate or map work arrays */
  /*-----------------------------*/

  assert(c->setup_data != NULL);

  const cs_lnum_t n_rows = c->setup_data->n_rows;

  {
    const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size;
    const size_t n_wa = 5;
    const size_t wa_size = CS_SIMD_SIZE(n_cols);

    if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < (wa_size * n_wa))
      BFT_MALLOC(_aux_vectors, wa_size * n_wa, cs_real_t);
    else
      _aux_vectors = aux_vectors;

    rk    = _aux_vectors;
    rkm1  = _aux_vectors + wa_size;
    dk    = _aux_vectors + wa_size*2;
    gk    = _aux_vectors + wa_size*3;
    zk    = _aux_vectors + wa_size*4;
  }

  /* Initialize iterative calculation */
  /*----------------------------------*/

  /* Residue and descent direction */

  cs_matrix_vector_multiply(a, vx, rk);  /* rk = A.x0 */

# pragma omp parallel for if(n_rows > CS_THR_MIN)
  for (cs_lnum_t ii = 0; ii < n_rows; ii++)
    rk[ii] -= rhs[ii];

  /* Preconditioning */

  c->setup_data->pc_apply(c->setup_data->pc_context, rk, gk);

  /* Descent direction */
  /*-------------------*/

#if defined(HAVE_OPENMP)

# pragma omp parallel for if(n_rows > CS_THR_MIN)
  for (cs_lnum_t ii = 0; ii < n_rows; ii++)
    dk[ii] = gk[ii];

#else

  memcpy(dk, gk, n_rows * sizeof(cs_real_t));

#endif

  _dot_products_xx_xy(c, rk, gk, &residue, &rk_gk_m1);
  residue = sqrt(residue);

  /* If no solving required, finish here */

  c->setup_data->initial_residue = residue;
  cvg = _convergence_test(c, n_iter, residue, convergence);

  if (cvg == CS_SLES_ITERATING) {

    n_iter = 1;

    cs_matrix_vector_multiply(a, dk, zk);

    /* Descent parameter */

    _dot_products_xy_yz(c, rk, dk, zk, &ro_0, &ro_1);

    cs_real_t d_ro_1 = (CS_ABS(ro_1) > DBL_MIN) ? 1. / ro_1 : 0.;
    alpha =  - ro_0 * d_ro_1;

#   pragma omp parallel if(n_rows > CS_THR_MIN)
    {
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        vx[ii] += (alpha * dk[ii]);

#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
        rkm1[ii] = rk[ii];
        rk[ii] += (alpha * zk[ii]);
      }
    }

    /* Convergence test */

    residue = sqrt(_dot_product_xx(c, rk));
    cvg = _convergence_test(c, n_iter, residue, convergence);

    /* Current Iteration */
    /*-------------------*/

  }

  while (cvg == CS_SLES_ITERATING) {

    /* Preconditioning */

    c->setup_data->pc_apply(c->setup_data->pc_context, rk, gk);

    /* Compute residue and prepare descent parameter */

    _dot_products_xx_xy_yz(c, rk, gk, rkm1, &residue, &rk_gk, &rkm1_gk);

    residue = sqrt(residue);

    /* Convergence test for end of previous iteration */

    if (n_iter > 1)
      cvg = _convergence_test(c, n_iter, residue, convergence);

    if (cvg != CS_SLES_ITERATING)
      break;

    n_iter += 1;

    /* Complete descent parameter computation and matrix.vector product */

    beta = (rk_gk - rkm1_gk) / rk_gk_m1;
    rk_gk_m1 = rk_gk;

#   pragma omp parallel for firstprivate(beta) if(n_rows > CS_THR_MIN)
    for (cs_lnum_t ii = 0; ii < n_rows; ii++)
      dk[ii] = gk[ii] + (beta * dk[ii]);

    cs_matrix_vector_multiply(a, dk, zk);

    _dot_products_xy_yz(c, rk, dk, zk, &ro_0, &ro_1);

    cs_real_t d_ro_1 = (CS_ABS(ro_1) > DBL_MIN) ? 1. / ro_1 : 0.;
    alpha =  - ro_0 * d_ro_1;

#   pragma omp parallel if(n_rows > CS_THR_MIN)
    {
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        vx[ii] += (alpha * dk[ii]);

#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
        rkm1[ii] = rk[ii];
        rk[ii] += (alpha * zk[ii]);
      }
    }

  }

  if (_aux_vectors != aux_vectors)
    BFT_FREE(_aux_vectors);

  return cvg;
}

/*----------------------------------------------------------------------------
 * Solution of A.vx = Rhs using preconditioned conjugate gradient
 * with single reduction.
 *
 * For more information, see LAPACK Working Note 56, at
 * http://www.netlib.org/lapack/lawnspdf/lawn56.pdf
 *
 * On entry, vx is considered initialized.
 *
 * parameters:
 *   c               <-- pointer to solver context info
 *   a               <-- matrix
 *   diag_block_size <-- block size of diagonal elements
 *   convergence     <-- convergence information structure
 *   rhs             <-- right hand side
 *   vx              <-> system solution
 *   aux_size        <-- size of aux_vectors (in bytes)
 *   aux_vectors     --- optional working area (allocation otherwise)
 *
 * returns:
 *   convergence state
 *----------------------------------------------------------------------------*/
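
/* As a reading aid: with sk = A.gk, the three dot products (rk.rk, rk.gk,
   gk.sk) are grouped in a single reduction, and the denominator dk.A.dk of
   the descent parameter is then updated recursively rather than recomputed
   (cf. LAWN 56), since d_(k+1) = g_(k+1) + beta_k.dk implies, in exact
   arithmetic:

     d_(k+1).A.d_(k+1) = g_(k+1).A.g_(k+1) - beta_k^2 . (dk.A.dk)

   The companion recursion zk = sk + beta_k.zk keeps zk = A.dk up to date
   without an additional matrix.vector product. */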

static cs_sles_convergence_state_t
_conjugate_gradient_sr(cs_sles_it_t              *c,
                       const cs_matrix_t         *a,
                       cs_lnum_t                  diag_block_size,
                       cs_sles_it_convergence_t  *convergence,
                       const cs_real_t           *rhs,
                       cs_real_t                 *restrict vx,
                       size_t                     aux_size,
                       void                      *aux_vectors)
{
  cs_sles_convergence_state_t cvg;
  double  ro_0, ro_1, alpha, rk_gkm1, rk_gk, gk_sk, beta, residue;
  cs_real_t *_aux_vectors;
  cs_real_t  *restrict rk, *restrict dk, *restrict gk, *restrict sk;
  cs_real_t *restrict zk;

  unsigned n_iter = 0;

  /* Allocate or map work arrays */
  /*-----------------------------*/

  assert(c->setup_data != NULL);

  const cs_lnum_t n_rows = c->setup_data->n_rows;

  {
    const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size;
    const size_t n_wa = 5;
    const size_t wa_size = CS_SIMD_SIZE(n_cols);

    if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < (wa_size * n_wa))
      BFT_MALLOC(_aux_vectors, wa_size * n_wa, cs_real_t);
    else
      _aux_vectors = aux_vectors;

    rk = _aux_vectors;
    dk = _aux_vectors + wa_size;
    gk = _aux_vectors + wa_size*2;
    zk = _aux_vectors + wa_size*3;
    sk = _aux_vectors + wa_size*4;
  }

  /* Initialize iterative calculation */
  /*----------------------------------*/

  /* Residue and descent direction */

  cs_matrix_vector_multiply(a, vx, rk);  /* rk = A.x0 */

# pragma omp parallel for if(n_rows > CS_THR_MIN)
  for (cs_lnum_t ii = 0; ii < n_rows; ii++)
    rk[ii] -= rhs[ii];

  /* Preconditioning */

  c->setup_data->pc_apply(c->setup_data->pc_context, rk, gk);

  /* Descent direction */
  /*-------------------*/

#if defined(HAVE_OPENMP)

# pragma omp parallel for if(n_rows > CS_THR_MIN)
  for (cs_lnum_t ii = 0; ii < n_rows; ii++)
    dk[ii] = gk[ii];

#else

  memcpy(dk, gk, n_rows * sizeof(cs_real_t));

#endif

  cs_matrix_vector_multiply(a, dk, zk); /* zk = A.dk */

  /* Descent parameter */

  _dot_products_xx_xy_yz(c, rk, dk, zk, &residue, &ro_0, &ro_1);
  residue = sqrt(residue);

  c->setup_data->initial_residue = residue;

  /* If no solving required, finish here */

  cvg = _convergence_test(c, n_iter, residue, convergence);

  if (cvg == CS_SLES_ITERATING) {

    n_iter = 1;

    cs_real_t d_ro_1 = (CS_ABS(ro_1) > DBL_MIN) ? 1. / ro_1 : 0.;
    alpha =  - ro_0 * d_ro_1;

    rk_gkm1 = ro_0;

#   pragma omp parallel if(n_rows > CS_THR_MIN)
    {
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        vx[ii] += (alpha * dk[ii]);

#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        rk[ii] += (alpha * zk[ii]);
    }

    /* Convergence test */

    residue = sqrt(_dot_product_xx(c, rk));

    cvg = _convergence_test(c, n_iter, residue, convergence);

  }

  /* Current Iteration */
  /*-------------------*/

  while (cvg == CS_SLES_ITERATING) {

    /* Preconditioning */

    c->setup_data->pc_apply(c->setup_data->pc_context, rk, gk);

    cs_matrix_vector_multiply(a, gk, sk);  /* sk = A.gk */

    /* Compute residue and prepare descent parameter */

    _dot_products_xx_xy_yz(c, rk, gk, sk, &residue, &rk_gk, &gk_sk);

    residue = sqrt(residue);

    /* Convergence test for end of previous iteration */

    if (n_iter > 1)
      cvg = _convergence_test(c, n_iter, residue, convergence);

    if (cvg != CS_SLES_ITERATING)
      break;

    n_iter += 1;

    /* Complete descent parameter computation and matrix.vector product */

    beta = rk_gk / rk_gkm1;
    rk_gkm1 = rk_gk;

    ro_1 = gk_sk - beta*beta*ro_1;
    ro_0 = rk_gk;

    cs_real_t d_ro_1 = (CS_ABS(ro_1) > DBL_MIN) ? 1. / ro_1 : 0.;
    alpha =  - ro_0 * d_ro_1;

#   pragma omp parallel if(n_rows > CS_THR_MIN)
    {
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
        dk[ii] = gk[ii] + (beta * dk[ii]);
        vx[ii] += alpha * dk[ii];
      }
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
        zk[ii] = sk[ii] + (beta * zk[ii]);
        rk[ii] += alpha * zk[ii];
      }
    }

  }

  if (_aux_vectors != aux_vectors)
    BFT_FREE(_aux_vectors);

  return cvg;
}

/*----------------------------------------------------------------------------
 * Solution of A.vx = Rhs using non-preconditioned conjugate gradient.
 *
 * On entry, vx is considered initialized.
 *
 * parameters:
 *   c               <-- pointer to solver context info
 *   a               <-- matrix
 *   diag_block_size <-- block size of diagonal elements
 *   convergence     <-- convergence information structure
 *   rhs             <-- right hand side
 *   vx              <-> system solution
 *   aux_size        <-- size of aux_vectors (in bytes)
 *   aux_vectors     --- optional working area (allocation otherwise)
 *
 * returns:
 *   convergence state
 *----------------------------------------------------------------------------*/

static cs_sles_convergence_state_t
_conjugate_gradient_npc(cs_sles_it_t              *c,
                        const cs_matrix_t         *a,
                        cs_lnum_t                  diag_block_size,
                        cs_sles_it_convergence_t  *convergence,
                        const cs_real_t           *rhs,
                        cs_real_t                 *restrict vx,
                        size_t                     aux_size,
                        void                      *aux_vectors)
{
  cs_sles_convergence_state_t cvg;
  double  ro_0, ro_1, alpha, rk_rkm1, rk_rk, beta, residue;
  cs_real_t *_aux_vectors;
  cs_real_t  *restrict rk, *restrict dk, *restrict zk;

  unsigned n_iter = 0;

  /* Allocate or map work arrays */
  /*-----------------------------*/

  assert(c->setup_data != NULL);

  const cs_lnum_t n_rows = c->setup_data->n_rows;

  {
    const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size;
    const size_t n_wa = 3;
    const size_t wa_size = CS_SIMD_SIZE(n_cols);

    if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < (wa_size * n_wa))
      BFT_MALLOC(_aux_vectors, wa_size * n_wa, cs_real_t);
    else
      _aux_vectors = aux_vectors;

    rk = _aux_vectors;
    dk = _aux_vectors + wa_size;
    zk = _aux_vectors + wa_size*2;
  }

  /* Initialize iterative calculation */
  /*----------------------------------*/

  /* Residue and descent direction */

  cs_matrix_vector_multiply(a, vx, rk);  /* rk = A.x0 */

# pragma omp parallel for if(n_rows > CS_THR_MIN)
  for (cs_lnum_t ii = 0; ii < n_rows; ii++)
    rk[ii] -= rhs[ii];

  /* Descent direction */
  /*-------------------*/

#if defined(HAVE_OPENMP)

# pragma omp parallel for if(n_rows > CS_THR_MIN)
  for (cs_lnum_t ii = 0; ii < n_rows; ii++)
    dk[ii] = rk[ii];

#else

  memcpy(dk, rk, n_rows * sizeof(cs_real_t));

#endif

  rk_rkm1 = _dot_product_xx(c, rk);
  residue = sqrt(rk_rkm1);

  /* If no solving required, finish here */

  c->setup_data->initial_residue = residue;
  cvg = _convergence_test(c, n_iter, residue, convergence);

  if (cvg == CS_SLES_ITERATING) {

    n_iter = 1;

    cs_matrix_vector_multiply(a, dk, zk);

    /* Descent parameter */

    _dot_products_xy_yz(c, rk, dk, zk, &ro_0, &ro_1);

    cs_real_t d_ro_1 = (CS_ABS(ro_1) > DBL_MIN) ? 1. / ro_1 : 0.;
    alpha =  - ro_0 * d_ro_1;

#   pragma omp parallel if(n_rows > CS_THR_MIN)
    {
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        vx[ii] += (alpha * dk[ii]);

#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        rk[ii] += (alpha * zk[ii]);
    }

    /* Convergence test */

    residue = sqrt(_dot_product(c, rk, rk));

    cvg = _convergence_test(c, n_iter, residue, convergence);

  }

  /* Current Iteration */
  /*-------------------*/

  while (cvg == CS_SLES_ITERATING) {

    /* Compute residue and prepare descent parameter */

    _dot_products_xx_xy(c, rk, rk, &residue, &rk_rk);

    residue = sqrt(residue);

    /* Convergence test for end of previous iteration */

    if (n_iter > 1)
      cvg = _convergence_test(c, n_iter, residue, convergence);

    if (cvg != CS_SLES_ITERATING)
      break;

    n_iter += 1;

    /* Complete descent parameter computation and matrix.vector product */

    beta = rk_rk / rk_rkm1;
    rk_rkm1 = rk_rk;

#   pragma omp parallel for firstprivate(beta) if(n_rows > CS_THR_MIN)
    for (cs_lnum_t ii = 0; ii < n_rows; ii++)
      dk[ii] = rk[ii] + (beta * dk[ii]);

    cs_matrix_vector_multiply(a, dk, zk);

    _dot_products_xy_yz(c, rk, dk, zk, &ro_0, &ro_1);

    cs_real_t d_ro_1 = (CS_ABS(ro_1) > DBL_MIN) ? 1. / ro_1 : 0.;
    alpha =  - ro_0 * d_ro_1;

#   pragma omp parallel if(n_rows > CS_THR_MIN)
    {
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        vx[ii] += (alpha * dk[ii]);

#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        rk[ii] += (alpha * zk[ii]);
    }

  }

  if (_aux_vectors != aux_vectors)
    BFT_FREE(_aux_vectors);

  return cvg;
}

/*----------------------------------------------------------------------------
 * Solution of A.vx = Rhs using non-preconditioned conjugate gradient
 * with single reduction.
 *
 * For more information, see LAPACK Working Note 56, at
 * http://www.netlib.org/lapack/lawnspdf/lawn56.pdf
 *
 * On entry, vx is considered initialized.
 *
 * parameters:
 *   c               <-- pointer to solver context info
 *   a               <-- matrix
 *   diag_block_size <-- block size of diagonal elements
 *   convergence     <-- convergence information structure
 *   rhs             <-- right hand side
 *   vx              <-> system solution
 *   aux_size        <-- size of aux_vectors (in bytes)
 *   aux_vectors     --- optional working area (allocation otherwise)
 *
 * returns:
 *   convergence state
 *----------------------------------------------------------------------------*/
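
/* As a reading aid: this is the single-reduction recursion above with
   M = Id, so gk = rk and sk = A.rk; (rk.rk, rk.sk) are grouped in one
   reduction and dk.A.dk is updated as rk.sk - beta_k^2 . (dk.A.dk). */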

static cs_sles_convergence_state_t
_conjugate_gradient_npc_sr(cs_sles_it_t              *c,
                           const cs_matrix_t         *a,
                           cs_lnum_t                  diag_block_size,
                           cs_sles_it_convergence_t  *convergence,
                           const cs_real_t           *rhs,
                           cs_real_t                 *restrict vx,
                           size_t                     aux_size,
                           void                      *aux_vectors)
{
  cs_sles_convergence_state_t cvg;
  double  ro_0, ro_1, alpha, rk_rkm1, rk_rk, rk_sk, beta, residue;
  cs_real_t *_aux_vectors;
  cs_real_t  *restrict rk, *restrict dk, *restrict sk;
  cs_real_t *restrict zk;

  unsigned n_iter = 0;

  /* Allocate or map work arrays */
  /*-----------------------------*/

  assert(c->setup_data != NULL);

  const cs_lnum_t n_rows = c->setup_data->n_rows;

  {
    const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size;
    const size_t n_wa = 4;
    const size_t wa_size = CS_SIMD_SIZE(n_cols);

    if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < (wa_size * n_wa))
      BFT_MALLOC(_aux_vectors, wa_size * n_wa, cs_real_t);
    else
      _aux_vectors = aux_vectors;

    rk = _aux_vectors;
    dk = _aux_vectors + wa_size;
    zk = _aux_vectors + wa_size*2;
    sk = _aux_vectors + wa_size*3;
  }

  /* Initialize iterative calculation */
  /*----------------------------------*/

  /* Residue and descent direction */

  cs_matrix_vector_multiply(a, vx, rk);  /* rk = A.x0 */

# pragma omp parallel for if(n_rows > CS_THR_MIN)
  for (cs_lnum_t ii = 0; ii < n_rows; ii++)
    rk[ii] = rk[ii] - rhs[ii];

  /* Descent direction */
  /*-------------------*/

#if defined(HAVE_OPENMP)

# pragma omp parallel for if(n_rows > CS_THR_MIN)
  for (cs_lnum_t ii = 0; ii < n_rows; ii++)
    dk[ii] = rk[ii];

#else

  memcpy(dk, rk, n_rows * sizeof(cs_real_t));

#endif

  cs_matrix_vector_multiply(a, dk, zk); /* zk = A.dk */

  /* Descent parameter */

  _dot_products_xx_xy_yz(c, rk, dk, zk, &residue, &ro_0, &ro_1);
  residue = sqrt(residue);

  /* If no solving required, finish here */

  c->setup_data->initial_residue = residue;

  cvg = _convergence_test(c, n_iter, residue, convergence);

  if (cvg == CS_SLES_ITERATING) {

    n_iter = 1;

    cs_real_t d_ro_1 = (CS_ABS(ro_1) > DBL_MIN) ? 1. / ro_1 : 0.;
    alpha =  - ro_0 * d_ro_1;

    rk_rkm1 = ro_0;

#   pragma omp parallel if(n_rows > CS_THR_MIN)
    {
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        vx[ii] += (alpha * dk[ii]);

#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        rk[ii] += (alpha * zk[ii]);
    }

    /* Convergence test */

    residue = sqrt(_dot_product_xx(c, rk));

    cvg = _convergence_test(c, n_iter, residue, convergence);

  }

  /* Current Iteration */
  /*-------------------*/

  while (cvg == CS_SLES_ITERATING) {

    cs_matrix_vector_multiply(a, rk, sk);  /* sk = A.rk */

    /* Compute residue and prepare descent parameter */

    _dot_products_xx_xy(c, rk, sk, &residue, &rk_sk);

    rk_rk = residue;

    residue = sqrt(residue);

    /* Convergence test for end of previous iteration */

    if (n_iter > 1)
      cvg = _convergence_test(c, n_iter, residue, convergence);

    if (cvg != CS_SLES_ITERATING)
      break;

    n_iter += 1;

    /* Complete descent parameter computation and matrix.vector product */

    beta = rk_rk / rk_rkm1;
    rk_rkm1 = rk_rk;

    ro_1 = rk_sk - beta*beta*ro_1;
    ro_0 = rk_rk;

    cs_real_t d_ro_1 = (CS_ABS(ro_1) > DBL_MIN) ? 1. / ro_1 : 0.;
    alpha =  - ro_0 * d_ro_1;

#   pragma omp parallel if(n_rows > CS_THR_MIN)
    {
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
        dk[ii] = rk[ii] + (beta * dk[ii]);
        vx[ii] += alpha * dk[ii];
      }
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
        zk[ii] = sk[ii] + (beta * zk[ii]);
        rk[ii] += alpha * zk[ii];
      }
    }

  }

  if (_aux_vectors != aux_vectors)
    BFT_FREE(_aux_vectors);

  return cvg;
}

/*----------------------------------------------------------------------------
 * Solution of A.vx = Rhs using preconditioned 3-layer conjugate residual.
 *
 * On entry, vx is considered initialized.
 *
 * parameters:
 *   c               <-- pointer to solver context info
 *   a               <-- matrix
 *   diag_block_size <-- block size of diagonal elements
 *   convergence     <-- convergence information structure
 *   rhs             <-- right hand side
 *   vx              <-> system solution
 *   aux_size        <-- size of aux_vectors (in bytes)
 *   aux_vectors     --- optional working area (allocation otherwise)
 *
 * returns:
 *   convergence state
 *----------------------------------------------------------------------------*/

static cs_sles_convergence_state_t
_conjugate_residual_3(cs_sles_it_t              *c,
                      const cs_matrix_t         *a,
                      cs_lnum_t                  diag_block_size,
                      cs_sles_it_convergence_t  *convergence,
                      const cs_real_t           *rhs,
                      cs_real_t                 *restrict vx,
                      size_t                     aux_size,
                      void                      *aux_vectors)
{
  cs_sles_convergence_state_t cvg;
  cs_lnum_t  ii;
  double  residue;
  double  ak, bk, ck, dk, ek, denom, alpha, tau;
  cs_real_t *_aux_vectors;
  cs_real_t  *restrict vxm1;
  cs_real_t  *restrict rk, *restrict rkm1;
  cs_real_t  *restrict wk, *restrict zk;
  cs_real_t  *restrict tmp;

  unsigned n_iter = 0;

  /* Allocate or map work arrays */
  /*-----------------------------*/

  assert(c->setup_data != NULL);

  const cs_lnum_t n_rows = c->setup_data->n_rows;

  {
    const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size;
    const size_t n_wa = 6;
    const size_t wa_size = CS_SIMD_SIZE(n_cols);

    if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < (wa_size * n_wa))
      BFT_MALLOC(_aux_vectors, wa_size * n_wa, cs_real_t);
    else
      _aux_vectors = aux_vectors;

    vxm1 = _aux_vectors;
    rk = _aux_vectors + wa_size;
    rkm1 = _aux_vectors + wa_size*2;
    tmp = _aux_vectors + wa_size*3;
    wk = _aux_vectors + wa_size*4;
    zk = _aux_vectors + wa_size*5;

#   pragma omp parallel for if(n_rows > CS_THR_MIN)
    for (ii = 0; ii < n_rows; ii++) {
      vxm1[ii] = vx[ii];
      rkm1[ii] = 0.0;
    }
  }

  /* Initialize iterative calculation */
  /*----------------------------------*/

  /* Residue */

  cs_matrix_vector_multiply(a, vx, rk);  /* rk = A.x0 */

# pragma omp parallel for if(n_rows > CS_THR_MIN)
  for (ii = 0; ii < n_rows; ii++)
    rk[ii] -= rhs[ii];

  residue = _dot_product(c, rk, rk);
  residue = sqrt(residue);

  /* If no solving required, finish here */

  c->setup_data->initial_residue = residue;
  cvg = _convergence_test(c, n_iter, residue, convergence);

  /* Current Iteration */
  /*-------------------*/

  while (cvg == CS_SLES_ITERATING) {

    /* Preconditioning */

    c->setup_data->pc_apply(c->setup_data->pc_context, rk, wk);

    cs_matrix_vector_multiply(a, wk, zk);

    _dot_products_xy_yz(c, rk, zk, rkm1, &ak, &bk);

#   pragma omp parallel for if(n_rows > CS_THR_MIN)
    for (ii = 0; ii < n_rows; ii++)
      tmp[ii] = rk[ii] - rkm1[ii];

    _dot_products_xy_yz(c, rk, tmp, rkm1, &ck, &dk);

    ek = _dot_product_xx(c, zk);

    denom = (ck-dk)*ek - ((ak-bk)*(ak-bk));

    if (fabs(denom) < 1.e-30)
      alpha = 1.0;
    else
      alpha = ((ak-bk)*bk - dk*ek)/denom;

    if (fabs(alpha) < 1.e-30 || fabs(alpha - 1.) < 1.e-30) {
      alpha = 1.0;
      tau = ak/ek;
    }
    else
      tau = ak/ek + ((1 - alpha)/alpha) * bk/ek;

    cs_real_t c0 = (1 - alpha);
    cs_real_t c1 = -alpha*tau;

#   pragma omp parallel firstprivate(alpha, tau, c0, c1) if(n_rows > CS_THR_MIN)
    {
#     pragma omp for nowait
      for (ii = 0; ii < n_rows; ii++) {
        cs_real_t trk = rk[ii];
        rk[ii] = alpha*rk[ii] + c0*rkm1[ii] + c1*zk[ii];
        rkm1[ii] = trk;
      }
#     pragma omp for nowait
      for (ii = 0; ii < n_rows; ii++) {
        cs_real_t tvx = vx[ii];
        vx[ii] = alpha*vx[ii] + c0*vxm1[ii] + c1*wk[ii];
        vxm1[ii] = tvx;
      }
    }

    /* Compute residue */

    residue = sqrt(_dot_product(c, rk, rk));

    /* Convergence test for end of previous iteration */

    if (n_iter > 1)
      cvg = _convergence_test(c, n_iter, residue, convergence);

    if (cvg != CS_SLES_ITERATING)
      break;

    n_iter += 1;

  }

  if (_aux_vectors != aux_vectors)
    BFT_FREE(_aux_vectors);

  return cvg;
}

/*----------------------------------------------------------------------------
 * Solution of A.vx = Rhs using Jacobi.
 *
 * On entry, vx is considered initialized.
 *
 * parameters:
 *   c               <-- pointer to solver context info
 *   a               <-- linear equation matrix
 *   diag_block_size <-- diagonal block size
 *   convergence     <-- convergence information structure
 *   rhs             <-- right hand side
 *   vx              <-> system solution
 *   aux_size        <-- size of aux_vectors (in bytes)
 *   aux_vectors     --- optional working area (allocation otherwise)
 *
 * returns:
 *   convergence state
 *----------------------------------------------------------------------------*/
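
/* As a reading aid, one Jacobi sweep as implemented below, with D the
   diagonal of A and X its extra-diagonal part:

     x_(k+1) = D^-1 . (rhs - X.xk)

   The convergence test uses ||D.(x_(k+1) - xk)||, which equals
   ||rhs - A.xk||, i.e. the residual of the previous iterate. */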

static cs_sles_convergence_state_t
_jacobi(cs_sles_it_t              *c,
        const cs_matrix_t         *a,
        cs_lnum_t                  diag_block_size,
        cs_sles_it_convergence_t  *convergence,
        const cs_real_t           *rhs,
        cs_real_t                 *restrict vx,
        size_t                     aux_size,
        void                      *aux_vectors)
{
  cs_sles_convergence_state_t cvg;
  cs_lnum_t  ii;
  double  res2, residue;
  cs_real_t *_aux_vectors;
  cs_real_t *restrict rk;

  unsigned n_iter = 0;

  /* Allocate or map work arrays */
  /*-----------------------------*/

  assert(c->setup_data != NULL);

  const cs_real_t  *restrict ad_inv = c->setup_data->ad_inv;

  const cs_lnum_t n_rows = c->setup_data->n_rows;

  {
    const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size;
    const size_t n_wa = 1;
    const size_t wa_size = CS_SIMD_SIZE(n_cols);

    if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < (wa_size * n_wa))
      BFT_MALLOC(_aux_vectors, wa_size * n_wa, cs_real_t);
    else
      _aux_vectors = aux_vectors;

    rk = _aux_vectors;
  }

  const cs_real_t  *restrict ad = cs_matrix_get_diagonal(a);

  cvg = CS_SLES_ITERATING;

  /* Current iteration */
  /*-------------------*/

  while (cvg == CS_SLES_ITERATING) {

    n_iter += 1;

#if defined(HAVE_OPENMP)

#   pragma omp parallel for if(n_rows > CS_THR_MIN)
    for (ii = 0; ii < n_rows; ii++)
      rk[ii] = vx[ii];

#else

    memcpy(rk, vx, n_rows * sizeof(cs_real_t));   /* rk <- vx */

#endif

    /* Compute vx <- D^-1 . (rhs - (A-diag).rk) and residue. */

    cs_matrix_exdiag_vector_multiply(a, rk, vx);

    res2 = 0.0;

#   pragma omp parallel for reduction(+:res2) if(n_rows > CS_THR_MIN)
    for (ii = 0; ii < n_rows; ii++) {
      vx[ii] = (rhs[ii]-vx[ii])*ad_inv[ii];
      double r = ad[ii] * (vx[ii]-rk[ii]);
      res2 += (r*r);
    }

#if defined(HAVE_MPI)

    if (c->comm != MPI_COMM_NULL) {
      double _sum;
      MPI_Allreduce(&res2, &_sum, 1, MPI_DOUBLE, MPI_SUM, c->comm);
      res2 = _sum;
    }

#endif /* defined(HAVE_MPI) */

    residue = sqrt(res2); /* Actually, residue of previous iteration */

    /* Convergence test */

    if (n_iter == 1)
      c->setup_data->initial_residue = residue;

    cvg = _convergence_test(c, n_iter, residue, convergence);

  }

  if (_aux_vectors != aux_vectors)
    BFT_FREE(_aux_vectors);

  return cvg;
}

/*----------------------------------------------------------------------------
 * Solution of A.vx = Rhs using block Jacobi.
 *
 * On entry, vx is considered initialized.
 *
 * parameters:
 *   c               <-- pointer to solver context info
 *   a               <-- linear equation matrix
 *   diag_block_size <-- diagonal block size (unused here)
 *   convergence     <-- convergence information structure
 *   rhs             <-- right hand side
 *   vx              <-> system solution
 *   aux_size        <-- size of aux_vectors (in bytes)
 *   aux_vectors     --- optional working area (allocation otherwise)
 *
 * returns:
 *   convergence state
 *----------------------------------------------------------------------------*/
1684 
1685 static cs_sles_convergence_state_t
_block_3_jacobi(cs_sles_it_t * c,const cs_matrix_t * a,cs_lnum_t diag_block_size,cs_sles_it_convergence_t * convergence,const cs_real_t * rhs,cs_real_t * restrict vx,size_t aux_size,void * aux_vectors)1686 _block_3_jacobi(cs_sles_it_t              *c,
1687                 const cs_matrix_t         *a,
1688                 cs_lnum_t                  diag_block_size,
1689                 cs_sles_it_convergence_t  *convergence,
1690                 const cs_real_t           *rhs,
1691                 cs_real_t                 *restrict vx,
1692                 size_t                     aux_size,
1693                 void                      *aux_vectors)
1694 {
1695   CS_UNUSED(diag_block_size);
1696 
1697   assert(diag_block_size == 3);
1698 
1699   cs_sles_convergence_state_t cvg;
1700   double  res2, residue;
1701   cs_real_t *_aux_vectors;
1702   cs_real_t  *restrict rk, *restrict vxx;
1703 
1704   unsigned n_iter = 0;
1705 
1706   /* Allocate or map work arrays */
1707   /*-----------------------------*/
1708 
1709   assert(c->setup_data != NULL);
1710 
1711   const cs_real_t  *restrict ad_inv = c->setup_data->ad_inv;
1712 
1713   const cs_lnum_t n_rows = c->setup_data->n_rows;
1714   const cs_lnum_t n_blocks = c->setup_data->n_rows / 3;
1715 
1716   {
1717     const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size;
1718     const size_t n_wa = 2;
1719     const size_t wa_size = CS_SIMD_SIZE(n_cols);
1720 
1721     if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < (wa_size * n_wa))
1722       BFT_MALLOC(_aux_vectors, wa_size * n_wa, cs_real_t);
1723     else
1724       _aux_vectors = aux_vectors;
1725 
1726     rk  = _aux_vectors;
1727     vxx = _aux_vectors + wa_size;
1728   }
1729 
1730   const cs_real_t  *restrict ad = cs_matrix_get_diagonal(a);
1731 
1732   cvg = CS_SLES_ITERATING;
1733 
1734   /* Current iteration */
1735   /*-------------------*/
1736 
1737   while (cvg == CS_SLES_ITERATING) {
1738 
1739     n_iter += 1;
1740     memcpy(rk, vx, n_rows * sizeof(cs_real_t));  /* rk <- vx */
1741 
1742     /* Compute vxx <- vx - (a-diag).rk and residue. */
1743 
1744     cs_matrix_exdiag_vector_multiply(a, rk, vxx);
1745 
1746     res2 = 0.0;
1747 
1748     /* Compute vx <- diag^-1 . (vxx - rhs) and residue. */
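    /* ad_inv holds the factored 3x3 diagonal blocks (9 coefficients per
       block), so _fw_and_bw_lu33 solves D_ii . vx_i = rhs_i - vxx_i for
       each block by forward and backward substitution, while the residue
       contribution is computed from the unfactored diagonal ad. */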
#   pragma omp parallel for reduction(+:res2) if(n_blocks > CS_THR_MIN)
    for (cs_lnum_t ii = 0; ii < n_blocks; ii++) {
      _fw_and_bw_lu33(ad_inv + 9*ii,
                      vx + 3*ii,
                      vxx + 3*ii,
                      rhs + 3*ii);
      for (cs_lnum_t jj = 0; jj < 3; jj++) {
        register double r = 0.0;
        for (cs_lnum_t kk = 0; kk < 3; kk++)
          r +=    ad[ii*9 + jj*3 + kk]
               * (vx[ii*3 + kk] - rk[ii*3 + kk]);
        res2 += (r*r);
      }
    }

#if defined(HAVE_MPI)

    if (c->comm != MPI_COMM_NULL) {
      double _sum;
      MPI_Allreduce(&res2, &_sum, 1, MPI_DOUBLE, MPI_SUM, c->comm);
      res2 = _sum;
    }

#endif /* defined(HAVE_MPI) */

    residue = sqrt(res2); /* Actually, residue of previous iteration */

    if (n_iter == 1)
      c->setup_data->initial_residue = residue;

    /* Convergence test */

    cvg = _convergence_test(c, n_iter, residue, convergence);

  }

  if (_aux_vectors != aux_vectors)
    BFT_FREE(_aux_vectors);
  return cvg;
}

/*----------------------------------------------------------------------------
 * Solution of A.vx = Rhs using block Jacobi.
 *
 * On entry, vx is considered initialized.
 *
 * parameters:
 *   c               <-- pointer to solver context info
 *   a               <-- linear equation matrix
 *   diag_block_size <-- block size of diagonal elements
 *   convergence     <-- convergence information structure
 *   rhs             <-- right hand side
 *   vx              --> system solution
 *   aux_size        <-- size of aux_vectors (in bytes)
 *   aux_vectors     --- optional working area (allocation otherwise)
 *
 * returns:
 *   convergence state
 *----------------------------------------------------------------------------*/

static cs_sles_convergence_state_t
_block_jacobi(cs_sles_it_t              *c,
              const cs_matrix_t         *a,
              cs_lnum_t                  diag_block_size,
              cs_sles_it_convergence_t  *convergence,
              const cs_real_t           *rhs,
              cs_real_t                 *restrict vx,
              size_t                     aux_size,
              void                      *aux_vectors)
{
  cs_sles_convergence_state_t cvg;
  double  res2, residue;
  cs_real_t *_aux_vectors;
  cs_real_t  *restrict rk, *restrict vxx;

  unsigned n_iter = 0;

  /* Call setup if not already done, allocate or map work arrays */
  /*-------------------------------------------------------------*/
  assert(c->setup_data != NULL);

  const cs_lnum_t *db_size = cs_matrix_get_diag_block_size(a);
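  /* As used below: db_size[0] is the useful block size, db_size[1] the
     vector stride per block row, db_size[2] the stride of one line of a
     diagonal block, and db_size[3] the number of matrix coefficients per
     diagonal block (db_size[0]*db_size[2] with the dense block storage). */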

  const cs_real_t  *restrict ad_inv = c->setup_data->ad_inv;

  const cs_lnum_t n_rows = c->setup_data->n_rows;
  const cs_lnum_t n_blocks = c->setup_data->n_rows / diag_block_size;

  {
    const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size;
    const size_t n_wa = 2;
    const size_t wa_size = CS_SIMD_SIZE(n_cols);

    if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < (wa_size * n_wa))
      BFT_MALLOC(_aux_vectors, wa_size * n_wa, cs_real_t);
    else
      _aux_vectors = aux_vectors;

    rk  = _aux_vectors;
    vxx = _aux_vectors + wa_size;
  }

  const cs_real_t  *restrict ad = cs_matrix_get_diagonal(a);

  cvg = CS_SLES_ITERATING;

  /* Current iteration */
  /*-------------------*/

  while (cvg == CS_SLES_ITERATING) {

    n_iter += 1;
    memcpy(rk, vx, n_rows * sizeof(cs_real_t));  /* rk <- vx */

    /* Compute vxx <- (a-diag).rk, then vx <- diag^-1 . (rhs - vxx),
       and residue. */

    cs_matrix_exdiag_vector_multiply(a, rk, vxx);

    res2 = 0.0;

#   pragma omp parallel for reduction(+:res2) if(n_blocks > CS_THR_MIN)
    for (cs_lnum_t ii = 0; ii < n_blocks; ii++) {
      _fw_and_bw_lu(ad_inv + db_size[3]*ii,
                    db_size[0],
                    vx + db_size[1]*ii,
                    vxx + db_size[1]*ii,
                    rhs + db_size[1]*ii);
      for (cs_lnum_t jj = 0; jj < db_size[0]; jj++) {
        register double r = 0.0;
        for (cs_lnum_t kk = 0; kk < db_size[0]; kk++)
          r +=    ad[ii*db_size[3] + jj*db_size[2] + kk]
               * (vx[ii*db_size[1] + kk] - rk[ii*db_size[1] + kk]);
        res2 += (r*r);
      }
    }

#if defined(HAVE_MPI)

    if (c->comm != MPI_COMM_NULL) {
      double _sum;
      MPI_Allreduce(&res2, &_sum, 1, MPI_DOUBLE, MPI_SUM, c->comm);
      res2 = _sum;
    }

#endif /* defined(HAVE_MPI) */

    residue = sqrt(res2); /* Actually, residue of previous iteration */

    if (n_iter == 1)
      c->setup_data->initial_residue = residue;

    /* Convergence test */

    cvg = _convergence_test(c, n_iter, residue, convergence);

  }

  if (_aux_vectors != aux_vectors)
    BFT_FREE(_aux_vectors);

  return cvg;
}

/*----------------------------------------------------------------------------
 * Test for (and report if appropriate) breakdown.
 *
 * parameters:
 *   c           <-- pointer to solver context info
 *   convergence <-- convergence information structure
 *   coeff_name  <-- coefficient name
 *   coeff       <-- coefficient to test
 *   epsilon     <-- value to test against
 *   residue     <-- current residue
 *   n_iter      <-- current number of iterations
 *   cvg         <-> convergence status
 *
 * returns:
 *   true in case of breakdown, false otherwise
 *----------------------------------------------------------------------------*/

static inline bool
_breakdown(cs_sles_it_t                 *c,
           cs_sles_it_convergence_t     *convergence,
           const char                   *coeff_name,
           double                        coeff,
           double                        epsilon,
           double                        residue,
           int                           n_iter,
           cs_sles_convergence_state_t  *cvg)
{
  bool retval = false;

  if (CS_ABS(coeff) < epsilon) {

    bft_printf
      (_("\n\n"
         "%s [%s]:\n"
         " @@ Warning: non convergence\n"
         "\n"
         "    norm of coefficient \"%s\" is lower than %12.4e\n"
         "\n"
         "    The resolution does not progress anymore."),
       cs_sles_it_type_name[c->type], convergence->name, coeff_name, epsilon);
    bft_printf(_("  n_iter : %5u, res_abs : %11.4e, res_nor : %11.4e\n"),
               n_iter, residue, residue/convergence->r_norm);

    *cvg = CS_SLES_BREAKDOWN;
    retval = true;
  }

  return retval;
}

/*----------------------------------------------------------------------------
 * Solution of A.vx = Rhs using preconditioned Bi-CGSTAB.
 *
 * Parallel-optimized version, groups dot products, at the cost of
 * computation of the preconditioning for n+1 iterations instead of n.
 *
 * On entry, vx is considered initialized.
 *
 * parameters:
 *   c               <-- pointer to solver context info
 *   a               <-- matrix
 *   diag_block_size <-- block size of diagonal elements
 *   convergence     <-- convergence information structure
 *   rhs             <-- right hand side
 *   vx              <-> system solution
 *   aux_size        <-- size of aux_vectors (in bytes)
 *   aux_vectors     --- optional working area (allocation otherwise)
 *
 * returns:
 *   convergence state
 *----------------------------------------------------------------------------*/

static cs_sles_convergence_state_t
_bi_cgstab(cs_sles_it_t              *c,
           const cs_matrix_t         *a,
           cs_lnum_t                  diag_block_size,
           cs_sles_it_convergence_t  *convergence,
           const cs_real_t           *rhs,
           cs_real_t                 *restrict vx,
           size_t                     aux_size,
           void                      *aux_vectors)
{
  cs_sles_convergence_state_t cvg;
  double  _epzero = 1.e-30; /* smaller than epzero */
  double  ro_0, ro_1, alpha, beta, betam1, gamma, omega, ukres0;
  double  residue;
  cs_real_t  *_aux_vectors;
  cs_real_t  *restrict res0, *restrict rk, *restrict pk, *restrict zk;
  cs_real_t  *restrict uk, *restrict vk;

  unsigned n_iter = 0;

  /* Allocate or map work arrays */
  /*-----------------------------*/

  assert(c->setup_data != NULL);

  const cs_lnum_t n_rows = c->setup_data->n_rows;

  {
    const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size;
    const size_t n_wa = 6;
    const size_t wa_size = CS_SIMD_SIZE(n_cols);

    if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < (wa_size * n_wa))
      BFT_MALLOC(_aux_vectors, wa_size * n_wa, cs_real_t);
    else
      _aux_vectors = aux_vectors;

    res0 = _aux_vectors;
    rk = _aux_vectors + wa_size;
    pk = _aux_vectors + wa_size*2;
    zk = _aux_vectors + wa_size*3;
    uk = _aux_vectors + wa_size*4;
    vk = _aux_vectors + wa_size*5;
  }

# pragma omp parallel for if(n_rows > CS_THR_MIN)
  for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
    pk[ii] = 0.0;
    uk[ii] = 0.0;
  }

  /* Initialize iterative calculation */
  /*----------------------------------*/

  cs_matrix_vector_multiply(a, vx, res0);

# pragma omp parallel for if(n_rows > CS_THR_MIN)
  for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
    res0[ii] = -res0[ii] + rhs[ii];
    rk[ii] = res0[ii];
  }

  alpha = 1.0;
  betam1 = 1.0;
  gamma = 1.0;

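  /* Note on naming: relative to the usual Bi-CGSTAB notation (van der
     Vorst), the local variables map as beta <-> rho = (rk, res0),
     omega <-> beta (direction update coefficient), gamma <-> alpha,
     and alpha <-> omega (the stabilizing coefficient). */
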
  cvg = CS_SLES_ITERATING;

  /* Current Iteration */
  /*-------------------*/

  while (cvg == CS_SLES_ITERATING) {

    /* Compute beta and omega;
       group dot products for new iteration's beta
       and previous iteration's residue to reduce total latency */

    if (n_iter == 0) {
      beta = _dot_product_xx(c, rk); /* rk == res0 here */
      residue = sqrt(beta);
      c->setup_data->initial_residue = residue;
    }
    else {
      _dot_products_xx_xy(c, rk, res0, &residue, &beta);
      residue = sqrt(residue);
    }

    /* Convergence test */
    cvg = _convergence_test(c, n_iter, residue, convergence);
    if (cvg != CS_SLES_ITERATING)
      break;

    n_iter += 1;

    if (_breakdown(c, convergence, "beta", beta, _epzero,
                   residue, n_iter, &cvg))
      break;

    if (_breakdown(c, convergence, "alpha", alpha, _epzero,
                   residue, n_iter, &cvg))
      break;

    omega = beta*gamma / (alpha*betam1);
    betam1 = beta;

    /* Compute pk */

#   pragma omp parallel for if(n_rows > CS_THR_MIN)
    for (cs_lnum_t ii = 0; ii < n_rows; ii++)
      pk[ii] = rk[ii] + omega*(pk[ii] - alpha*uk[ii]);

    /* Compute zk = c.pk */

    c->setup_data->pc_apply(c->setup_data->pc_context, pk, zk);

    /* Compute uk = A.zk */

    cs_matrix_vector_multiply(a, zk, uk);

    /* Compute uk.res0 and gamma */

    ukres0 = _dot_product(c, uk, res0);

    gamma = beta / ukres0;

    /* First update of vx and rk */

#   pragma omp parallel if(n_rows > CS_THR_MIN)
    {
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        vx[ii] += (gamma * zk[ii]);

#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        rk[ii] -= (gamma * uk[ii]);
    }

    /* Compute zk = C.rk (zk is overwritten, vk is a working array) */

    c->setup_data->pc_apply(c->setup_data->pc_context, rk, zk);

    /* Compute vk = A.zk and alpha */

    cs_matrix_vector_multiply(a, zk, vk);

    _dot_products_xx_xy(c, vk, rk, &ro_1, &ro_0);

    if (_breakdown(c, convergence, "rho1", ro_1, _epzero,
                   residue, n_iter, &cvg))
      break;

    cs_real_t d_ro_1 = (CS_ABS(ro_1) > DBL_MIN) ? 1. / ro_1 : 0.;
    alpha = ro_0 * d_ro_1;

    /* Final update of vx and rk */

#   pragma omp parallel if(n_rows > CS_THR_MIN)
    {
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        vx[ii] += (alpha * zk[ii]);

#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        rk[ii] -= (alpha * vk[ii]);
    }

    /* Convergence test at beginning of next iteration so
       as to group dot products for better parallel performance */
  }

  if (_aux_vectors != aux_vectors)
    BFT_FREE(_aux_vectors);

  return cvg;
}

/*----------------------------------------------------------------------------
 * Solution of (ad+ax).vx = Rhs using (not yet preconditioned) Bi-CGSTAB2.
 *
 * This Krylov method is based on an implementation of J.P. Caltagirone
 * in his file bibpac6.f90 for Aquillon. He refers to it as Bi-CGSTAB2,
 * but a proper reference for the method has yet to be found.
 * (Gutknecht's BiCGstab2?)
 *
 * On entry, vx is considered initialized.
 *
 * parameters:
 *   c               <-- pointer to solver context info
 *   a               <-- matrix
 *   diag_block_size <-- block size of diagonal elements
 *   convergence     <-- convergence information structure
 *   rhs             <-- right hand side
 *   vx              <-> system solution
 *   aux_size        <-- size of aux_vectors (in bytes)
 *   aux_vectors     --- optional working area (allocation otherwise)
 *
 * returns:
 *   convergence state
 *----------------------------------------------------------------------------*/

static cs_sles_convergence_state_t
_bicgstab2(cs_sles_it_t              *c,
           const cs_matrix_t         *a,
           cs_lnum_t                  diag_block_size,
           cs_sles_it_convergence_t  *convergence,
           const cs_real_t           *rhs,
           cs_real_t                 *restrict vx,
           size_t                     aux_size,
           void                      *aux_vectors)
{
  cs_sles_convergence_state_t cvg;
  double  _epzero = 1.e-30; /* smaller than epzero */
  double  ro_0, ro_1, alpha, beta, gamma;
  double  omega_1, omega_2, mu, nu, tau;
  double  residue;
  cs_real_t  *_aux_vectors;
  cs_real_t  *restrict res0, *restrict qk, *restrict rk, *restrict sk;
  cs_real_t  *restrict tk, *restrict uk, *restrict vk, *restrict wk;
  cs_real_t  *restrict zk;

  unsigned n_iter = 0;

  /* Allocate or map work arrays */
  /*-----------------------------*/

  assert(c->setup_data != NULL);

  const cs_lnum_t n_rows = c->setup_data->n_rows;

  {
    const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size;
    size_t  n_wa = 9;
    const size_t wa_size = CS_SIMD_SIZE(n_cols);

    if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < (wa_size * n_wa))
      BFT_MALLOC(_aux_vectors, wa_size * n_wa, cs_real_t);
    else
      _aux_vectors = aux_vectors;

    res0 = _aux_vectors;
    zk = _aux_vectors + wa_size;
    qk = _aux_vectors + wa_size*2;
    rk = _aux_vectors + wa_size*3;
    sk = _aux_vectors + wa_size*4;
    tk = _aux_vectors + wa_size*5;
    uk = _aux_vectors + wa_size*6;
    vk = _aux_vectors + wa_size*7;
    wk = _aux_vectors + wa_size*8;
  }

  /* Initialize work arrays */

# pragma omp parallel for if(n_rows > CS_THR_MIN)
  for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
    uk[ii] = 0.0;
  }

  /* Initialize iterative calculation */
  /*----------------------------------*/

  cs_matrix_vector_multiply(a, vx, res0);

# pragma omp parallel for if(n_rows > CS_THR_MIN)
  for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
    res0[ii] = -res0[ii] + rhs[ii];
    rk[ii] = res0[ii];
    qk[ii] = rk[ii];
  }

  ro_0    = 1.0;
  alpha   = 0.0;
  omega_2 = 1.0;

  cvg = CS_SLES_ITERATING;

  /* Current Iteration */
  /*-------------------*/

  while (cvg == CS_SLES_ITERATING) {

    /* Compute beta and omega;
       group dot products for new iteration's beta
       and previous iteration's residue to reduce total latency */

    double mprec = 1.0e-60;
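
    /* mprec is added to both numerator and denominator in some of the
       ratios below, so that an exactly zero denominator yields a finite
       (and harmless) value instead of a division by zero; actual
       breakdown is detected separately via _breakdown. */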

    if (n_iter == 0) {
      residue = sqrt(_dot_product_xx(c, rk)); /* rk == res0 here */
      c->setup_data->initial_residue = residue;
    }
    else
      residue = sqrt(_dot_product_xx(c, rk));

    /* Convergence test */
    cvg = _convergence_test(c, n_iter, residue, convergence);
    if (cvg != CS_SLES_ITERATING)
      break;

    n_iter += 1;

    ro_0 = -omega_2*ro_0;
    ro_1 = _dot_product(c, qk, rk);

    if (_breakdown(c, convergence, "rho0", ro_0, 1.e-60,
                   residue, n_iter, &cvg))
      break;

    if (_breakdown(c, convergence, "rho1", ro_1, _epzero,
                   residue, n_iter, &cvg))
      break;

    beta = alpha*ro_1/ro_0;
    ro_0 = ro_1;

#   pragma omp parallel for if(n_rows > CS_THR_MIN)
    for (cs_lnum_t ii = 0; ii < n_rows; ii++)
      uk[ii] = rk[ii] - beta*uk[ii];

    /* Compute vk = A*uk */

    cs_matrix_vector_multiply(a, uk, vk);

    /* Preconditioning */

    c->setup_data->pc_apply(c->setup_data->pc_context, vk, zk);

    /* Compute gamma and alpha */

    gamma = _dot_product(c, qk, vk);

    if (_breakdown(c, convergence, "gamma", gamma, 1.e-60,
                   residue, n_iter, &cvg))
      break;

    alpha = ro_0/gamma;

    /* Update rk */

#   pragma omp parallel for if(n_rows > CS_THR_MIN)
    for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
      rk[ii] -= alpha*vk[ii];
      vx[ii] += alpha*uk[ii];
    }

    /* Compute sk = A*rk */

    cs_matrix_vector_multiply(a, rk, sk);

    c->setup_data->pc_apply(c->setup_data->pc_context, sk, zk);

    ro_1 = _dot_product(c, qk, sk);
    beta = alpha*ro_1/ro_0;
    ro_0 = ro_1;

#   pragma omp parallel if(n_rows > CS_THR_MIN)
    {
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        vk[ii] = sk[ii] - beta*vk[ii];
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        uk[ii] = rk[ii] - beta*uk[ii];
    }

    /* Compute wk = A*vk */

    cs_matrix_vector_multiply(a, vk, wk);

    c->setup_data->pc_apply(c->setup_data->pc_context, wk, zk);

    gamma = _dot_product(c, qk, wk);
    alpha = (ro_0+mprec)/(gamma+mprec);

#   pragma omp parallel for if(n_rows > CS_THR_MIN)
    for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
      rk[ii] -= alpha*vk[ii];
      sk[ii] -= alpha*wk[ii];
    }

    /* Compute tk = A*sk */

    cs_matrix_vector_multiply(a, sk, tk);

    c->setup_data->pc_apply(c->setup_data->pc_context, tk, zk);

    _dot_products_xx_yy_xy_xz_yz(c, sk, tk, rk,
                                 &mu, &tau, &nu, &omega_1, &omega_2);

    tau = tau - (nu*nu/mu);
    omega_2 = (omega_2 - ((nu+mprec)*(omega_1+mprec)/(mu+mprec)))/(tau+mprec);

    omega_1 = (omega_1 - nu*omega_2)/mu;

    /* sol <- sol + omega_1*r + omega_2*s + alpha*u */
#   pragma omp parallel for if(n_rows > CS_THR_MIN)
    for (cs_lnum_t ii = 0; ii < n_rows; ii++)
      vx[ii] += omega_1*rk[ii] + omega_2*sk[ii] + alpha*uk[ii];

#   pragma omp parallel if(n_rows > CS_THR_MIN)
    {
      /* r <- r - omega_1*s - omega_2*t */
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        rk[ii] += - omega_1*sk[ii] - omega_2*tk[ii];
      /* u <- u - omega_1*v - omega_2*w */
#     pragma omp for nowait
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        uk[ii] += - omega_1*vk[ii] - omega_2*wk[ii];
    }

  }

  if (_aux_vectors != aux_vectors)
    BFT_FREE(_aux_vectors);

  return cvg;
}

/*----------------------------------------------------------------------------
 * Transform, using Givens rotations, a system Ax=b where A is a matrix of
 * size n*(n-1) in upper Hessenberg form (upper triangular with an
 * additional lower subdiagonal) into an equivalent system A'x=b' where A'
 * is upper triangular.
 *
 * parameters:
 *   a              <-> dense matrix (size: a_size*a_size):
 *                      a(i,j) = a[i + j*a_size]
 *                      input: A; output: A'
 *   a_size         <-- matrix dim
 *   b              <-> pre-allocated vector of a_size elements
 *                      input: b; output: b'
 *   givens_coeff   <-> pre-allocated vector of a_size elements
 *                      input: previous Givens coefficients
 *                      output: updated Givens coefficients
 *   update_rank    <-- rank of first non null coefficient on lower diagonal
 *   end_update     <-- rank of last non null coefficient on lower diagonal
 *----------------------------------------------------------------------------*/

static void
_givens_rot_update(cs_real_t    *restrict a,
                   cs_lnum_t              a_size,
                   cs_real_t    *restrict b,
                   cs_real_t    *restrict givens_coeff,
                   cs_lnum_t              update_rank,
                   cs_lnum_t              end_update)
{
  for (cs_lnum_t i = 0; i < update_rank; ++i) {
    for (cs_lnum_t j = update_rank; j < end_update; ++j) {

      cs_real_t _aux =   givens_coeff[i]*a[j*a_size + i]
                       + givens_coeff[i + a_size] * a[j*a_size + i+1];

      a[j*a_size + i+1] =   givens_coeff[i] * a[i+1 + j*a_size]
                          - givens_coeff[i + a_size] * a[j*a_size + i];

      a[j*a_size + i] = _aux;
    }
  }

  for (cs_lnum_t i = update_rank; i < end_update; ++i) {

    cs_real_t norm = sqrt(  a[i*a_size + i]   * a[i*a_size + i]
                          + a[i*a_size + i+1] * a[i*a_size + i+1]);

    givens_coeff[a_size + i] = a[i*a_size + i+1]/norm;
    givens_coeff[i] = a[i*a_size + i]/norm;

    b[i+1] = -b[i]*givens_coeff[a_size + i];
    b[i] = b[i]*givens_coeff[i];

    /* i == j */
    {
      cs_real_t _aux =   givens_coeff[i] * a[i*a_size + i]
                       + givens_coeff[a_size+i] * a[i*a_size + i+1];
      a[i*a_size+i+1] = 0;
      a[i*a_size + i] = _aux;
    }

    for (cs_lnum_t j = i+1; j < end_update; j++) {

      cs_real_t _aux =   givens_coeff[i] * a[j*a_size + i]
                       + givens_coeff[a_size+i] * a[j*a_size + i+1];

      a[i+1 + j*a_size] =   givens_coeff[i]*a[i+1 + j*a_size]
                          - givens_coeff[a_size + i]*a[i + j*a_size];

      a[j*a_size + i] = _aux;
    }
  }
}

/*----------------------------------------------------------------------------
 * Compute solution of Ax = b where A is an upper triangular matrix.
 *
 * As the system solved by GMRES grows with the iteration number, the
 * matrix is preallocated at its maximum size, and only the useful part is
 * solved:
 *
 *   |       |       |   |x1|   |b1|
 *   |   A   |  pad  |   |x2|   |b2|
 *   |_______|       |   |x3|   |b3|
 *   |               | * |p | = |p |
 *   |     pad       |   |a |   |a |
 *   |               |   |d |   |d |
 *
 * parameters:
 *   a          <-- dense upper triangular matrix A
 *                  (size: alloc_size*alloc_size)
 *                  a(i,j) = a[i + j*alloc_size]
 *   a_size     <-- system size
 *   alloc_size <-- a_size + halo size
 *   b          <-- pre-allocated vector of a_size elements
 *   x          --> system solution, pre-allocated vector of a_size elements
 *
 * returns:
 *   0 in case of success, 1 in case of zero-pivot.
 *----------------------------------------------------------------------------*/

static int
_solve_diag_sup_halo(cs_real_t  *restrict a,
                     cs_lnum_t            a_size,
                     cs_lnum_t            alloc_size,
                     cs_real_t  *restrict b,
                     cs_real_t  *restrict x)
{
  for (cs_lnum_t i = a_size - 1; i > -1; i--) {

    x[i] = b[i];

    for (cs_lnum_t j = i + 1; j < a_size; j++)
      x[i] = x[i] - a[j*alloc_size + i]*x[j];

    x[i] /= a[i*alloc_size + i];
  }

  return 0; /* We should add a check for zero-pivot */
}

/*----------------------------------------------------------------------------
 * Solution of A.vx = Rhs using optimised preconditioned GCR.
 *
 * On entry, vx is considered initialized.
 *
 * parameters:
 *   c               <-- pointer to solver context info
 *   a               <-- matrix
 *   diag_block_size <-- diagonal block size (unused here)
 *   convergence     <-- convergence information structure
 *   rhs             <-- right hand side
 *   vx              <-> system solution
 *   aux_size        <-- size of aux_vectors (in bytes)
 *   aux_vectors     --- optional working area (allocation otherwise)
 *
 * returns:
 *   convergence state
 *----------------------------------------------------------------------------*/

static cs_sles_convergence_state_t
_gcr(cs_sles_it_t              *c,
     const cs_matrix_t         *a,
     cs_lnum_t                  diag_block_size,
     cs_sles_it_convergence_t  *convergence,
     const cs_real_t           *rhs,
     cs_real_t                 *restrict vx,
     size_t                     aux_size,
     void                      *aux_vectors)
{
  cs_sles_convergence_state_t cvg;
  double  residue;
  cs_real_t *_aux_vectors, *alpha;
  cs_real_t *restrict rk, *restrict zk, *restrict ck;
  cs_real_t *restrict gkj, *restrict gkj_inv;

  /* For the standard GCR, n_k_per_restart --> Inf,
   * i.e. the algorithm iterates until convergence with no restart. */
  unsigned n_iter = 0;
  const unsigned n_k_per_restart = c->restart_interval;

  size_t wa_size;

  unsigned n_restart = 0;

  /* Allocate or map work arrays */
  /*-----------------------------*/

  assert(c->setup_data != NULL);
  const cs_lnum_t n_rows = c->setup_data->n_rows;

  {
    const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size;
    const size_t n_wa = 1 + n_k_per_restart * 2;
    wa_size = CS_SIMD_SIZE(n_cols);

    if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < (wa_size * n_wa))
      BFT_MALLOC(_aux_vectors, wa_size * n_wa, cs_real_t);
    else
      _aux_vectors = aux_vectors;

    rk = _aux_vectors;                               /* store residuals  */
    zk = _aux_vectors + wa_size;                     /* store inv(M)*r   */
    ck = _aux_vectors + wa_size * (1 + n_k_per_restart);   /* store A*zk */
  }

  BFT_MALLOC(alpha, n_k_per_restart, cs_real_t);

  /* gkj stores the upper triangular matrix Gamma of cross inner-products
   * in packed form, so the full matrix size need not be allocated;
   * gkj_inv stores the inverse of gkj */

  BFT_MALLOC(gkj, (n_k_per_restart + 1) * n_k_per_restart / 2, cs_real_t);
  BFT_MALLOC(gkj_inv, (n_k_per_restart + 1) * n_k_per_restart / 2, cs_real_t);
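  /* Packed triangular layout: entry (k, j) with j <= k is stored at index
     (k+1)*k/2 + j, so a restart of n directions needs (n+1)*n/2 values. */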

  cvg = CS_SLES_ITERATING;

  /* Current Restart */
  while (cvg == CS_SLES_ITERATING) {

    n_iter = 0;

    /* Initialize iterative calculation */
    /*----------------------------------*/

    cs_matrix_vector_multiply(a, vx, rk);  /* rk = A.x0 */

#   pragma omp parallel for if(n_rows > CS_THR_MIN)
    for (cs_lnum_t ii = 0; ii < n_rows; ii++)
      rk[ii] -= rhs[ii];

    residue = sqrt(_dot_product_xx(c, rk));

    if (n_restart == 0)
      c->setup_data->initial_residue = residue;

    /* Current Iteration on k */
    /* ---------------------- */

    while (cvg == CS_SLES_ITERATING && n_iter < n_k_per_restart) {

      /* Preconditioning */

      cs_real_t *zk_n = zk + n_iter * wa_size;
      cs_real_t *ck_n = ck + n_iter * wa_size;

      c->setup_data->pc_apply(c->setup_data->pc_context, rk, zk_n);

      cs_matrix_vector_multiply(a, zk_n, ck_n);

      for (cs_lnum_t jj = 0; jj < (int)n_iter; jj++) {
        cs_real_t *ck_j = ck + jj * wa_size;

        gkj[(n_iter + 1) * n_iter / 2 + jj] = _dot_product(c, ck_j, ck_n);
#       pragma omp parallel for if(n_rows > CS_THR_MIN)
        for (cs_lnum_t ii = 0; ii < n_rows; ii++)
          ck_n[ii] += - gkj[(n_iter + 1) * n_iter / 2 + jj] * ck_j[ii];
      }

      const int  iter_shift = (n_iter+1) * n_iter / 2 + n_iter;
      gkj[iter_shift] = sqrt(_dot_product(c, ck_n, ck_n));

      if (fabs(gkj[iter_shift]) > 0) {

#       pragma omp parallel for if(n_rows > CS_THR_MIN)
        for (cs_lnum_t ii = 0; ii < n_rows; ii++)
          ck_n[ii] /= gkj[iter_shift];

        alpha[n_iter] = _dot_product(c, ck_n, rk);

      }
      else
        alpha[n_iter] = 0.;

#     pragma omp parallel for if(n_rows > CS_THR_MIN)
      for (cs_lnum_t ii = 0; ii < n_rows; ii++)
        rk[ii] += - alpha[n_iter] * ck_n[ii];

      /* Compute residue */

      residue = sqrt(_dot_product_xx(c, rk));

      n_iter += 1;

      /* Convergence test of current iteration */

      cvg = _convergence_test(c, (n_restart * n_k_per_restart) + n_iter,
                              residue, convergence);

      if (cvg != CS_SLES_ITERATING)
        break;

    } /* Needs iterating and n_iter < n_k_per_restart */

    /* Inversion of Gamma */
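    /* The triangular matrix Gamma is inverted by substitution, one column
       at a time, the diagonal terms of the inverse being the inverted
       diagonal entries of gkj. */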

    if (n_iter == 1 && !(fabs(alpha[0]) > 0))
      gkj_inv[0] = 1.0;

    else {

      cs_lnum_t n_g_inv = (n_k_per_restart + 1) * n_k_per_restart / 2;
      for (cs_lnum_t jj = 0; jj < n_g_inv; jj++)
        gkj_inv[jj] = 0.0;

      for (cs_lnum_t kk = 0; kk < (int)n_iter; kk++) {
        for (cs_lnum_t ii = 0; ii < kk; ii++) {
          for (cs_lnum_t jj = 0; jj < kk; jj++)
            gkj_inv[(kk + 1) * kk / 2 + ii]
              +=   ((ii <= jj) ? gkj_inv[(jj + 1) * jj / 2 + ii] : 0.0)
                 * gkj[(kk + 1) * kk / 2 + jj];
        }

        for (cs_lnum_t jj = 0; jj < kk; jj++)
          gkj_inv[(kk + 1) * kk / 2 + jj] /= - gkj[(kk + 1) * kk / 2 + kk];

        gkj_inv[(kk + 1) * kk / 2 + kk] = 1.0 / gkj[(kk + 1) * kk / 2 + kk];
      }

    } /* n_iter > 1 */

    /* Compute the solution */

#   pragma omp parallel if (n_rows > CS_THR_MIN)
    {
      cs_lnum_t s_id, e_id;
      cs_parall_thread_range(n_rows, sizeof(cs_real_t), &s_id, &e_id);

      for (cs_lnum_t kk = 0; kk < (int)n_iter; kk++) {
        for (cs_lnum_t jj = 0; jj <= kk; jj++) {
          const cs_real_t *zk_j = zk + jj*wa_size;
          for (cs_lnum_t ii = s_id; ii < e_id; ii++)
            vx[ii] -=    alpha[kk] * zk_j[ii]
                      *  gkj_inv[(kk + 1) * kk / 2 + jj];
        }
      }
    }

    n_restart += 1;

  } /* Needs iterating */

  if (_aux_vectors != aux_vectors)
    BFT_FREE(_aux_vectors);

  BFT_FREE(alpha);
  BFT_FREE(gkj);
  BFT_FREE(gkj_inv);

  return cvg;
}

/*----------------------------------------------------------------------------
 * Solution of A.vx = Rhs using preconditioned GMRES.
 *
 * On entry, vx is considered initialized.
 *
 * parameters:
 *   c               <-- pointer to solver context info
 *   a               <-- matrix
 *   diag_block_size <-- diagonal block size (unused here)
 *   convergence     <-- convergence information structure
 *   rhs             <-- right hand side
 *   vx              <-> system solution
 *   aux_size        <-- size of aux_vectors (in bytes)
 *   aux_vectors     --- optional working area (allocation otherwise)
 *
 * returns:
 *   convergence state
 *----------------------------------------------------------------------------*/

static cs_sles_convergence_state_t
_gmres(cs_sles_it_t              *c,
       const cs_matrix_t         *a,
       cs_lnum_t                  diag_block_size,
       cs_sles_it_convergence_t  *convergence,
       const cs_real_t           *rhs,
       cs_real_t                 *restrict vx,
       size_t                     aux_size,
       void                      *aux_vectors)
{
  cs_sles_convergence_state_t cvg = CS_SLES_ITERATING;
  int l_iter, l_old_iter;
  double    beta, dot_prod, residue;
  cs_real_t  *_aux_vectors;
  cs_real_t *restrict _krylov_vectors, *restrict _h_matrix;
  cs_real_t *restrict _givens_coeff, *restrict _beta;
  cs_real_t *restrict dk, *restrict gk;
  cs_real_t *restrict bk, *restrict fk, *restrict krk;

  cs_lnum_t krylov_size_max = c->restart_interval;
  unsigned n_iter = 0;

  /* Allocate or map work arrays */
  /*-----------------------------*/

  assert(c->setup_data != NULL);

  const cs_lnum_t n_rows = c->setup_data->n_rows;

  /* Allocate work arrays */

  int krylov_size = sqrt(n_rows*diag_block_size)*1.5 + 1;
  if (krylov_size > krylov_size_max)
    krylov_size = krylov_size_max;

#if defined(HAVE_MPI)
  if (c->comm != MPI_COMM_NULL) {
    int _krylov_size = krylov_size;
    MPI_Allreduce(&_krylov_size,
                  &krylov_size,
                  1,
                  MPI_INT,
                  MPI_MIN,
                  c->comm);
  }
#endif
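
  /* The restart size is a heuristic, about 1.5 * sqrt(n_rows), capped by
     the restart interval; with MPI, the minimum over all ranks is used so
     that every rank builds a Krylov subspace of the same dimension. */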

  int     check_freq = (int)(krylov_size/10) + 1;
  double  epsi = 1.e-15;
  int scaltest = 0;

  {
    const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size;

    size_t _aux_r_size;
    size_t  n_wa = 4;
    size_t  wa_size = n_cols < krylov_size ? krylov_size : n_cols;

    wa_size = CS_SIMD_SIZE(wa_size);
    _aux_r_size =   wa_size*n_wa
                  + (krylov_size-1)*(n_rows + krylov_size) + 3*krylov_size;

    if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < _aux_r_size)
      BFT_MALLOC(_aux_vectors, _aux_r_size, cs_real_t);
    else
      _aux_vectors = aux_vectors;

    dk = _aux_vectors;
    gk = _aux_vectors + wa_size;
    bk = _aux_vectors + 2*wa_size;
    fk = _aux_vectors + 3*wa_size;
    _krylov_vectors = _aux_vectors + n_wa*wa_size;
    _h_matrix = _aux_vectors + n_wa*wa_size + (krylov_size - 1)*n_rows;
    _givens_coeff =   _aux_vectors + n_wa*wa_size
                    + (krylov_size - 1)*(n_rows + krylov_size);
    _beta =   _aux_vectors + n_wa*wa_size
            + (krylov_size - 1)*(n_rows + krylov_size) + 2*krylov_size;
  }

  for (cs_lnum_t ii = 0; ii < krylov_size*(krylov_size - 1); ii++)
    _h_matrix[ii] = 0.;

  cvg = CS_SLES_ITERATING;

  while (cvg == CS_SLES_ITERATING) {

    /* compute  dk <- a*vx (vx = x0) */

    cs_matrix_vector_multiply(a, vx, dk);

    /* compute  dk <- rhs - dk (r0 = b - A*x0) */

#   pragma omp parallel for if(n_rows > CS_THR_MIN)
    for (cs_lnum_t ii = 0; ii < n_rows; ii++)
      dk[ii] = rhs[ii] - dk[ii];

    if (n_iter == 0) {
      residue = sqrt(_dot_product_xx(c, dk));
      c->setup_data->initial_residue = residue;
      cvg = _convergence_test(c, n_iter, residue, convergence);
      if (cvg != CS_SLES_ITERATING)
        break;
    }

    /* beta = ||r0|| */
    beta = sqrt(_dot_product(c, dk, dk));
    dot_prod = beta;

    _beta[0] = beta;
    for (cs_lnum_t ii = 1; ii < krylov_size; ii++)
      _beta[ii] = 0.;

    /* Restart cycle */

    l_iter = 0;
    l_old_iter = 0;
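
    /* Build the Krylov basis with an Arnoldi process: each new direction
       A.(M^-1.vi) is orthogonalized against the previous basis vectors by
       modified Gram-Schmidt, filling one column of the Hessenberg matrix
       _h_matrix per iteration. */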

    for (cs_lnum_t ii = 0; ii < krylov_size - 1; ii++) {

      /* krk = ii-th column of _krylov_vectors = vi */
      krk = _krylov_vectors + ii*n_rows;

#     pragma omp parallel for if(n_rows > CS_THR_MIN)
      for (cs_lnum_t jj = 0; jj < n_rows; jj++)
        krk[jj] = dk[jj]/dot_prod;

      c->setup_data->pc_apply(c->setup_data->pc_context, krk, gk);

      /* compute w = dk <- A*(M^-1.vi) */

      cs_matrix_vector_multiply(a, gk, dk);

      for (cs_lnum_t jj = 0; jj < ii + 1; jj++) {

        /* compute h(jj,ii) = <w,vj> = <dk,vj> */
        _h_matrix[ii*krylov_size + jj]
          = _dot_product(c, dk, (_krylov_vectors + jj*n_rows));

        /* compute w = dk <- w - h(jj,ii)*vj */

        cs_axpy(n_rows,
                -_h_matrix[ii*krylov_size+jj],
                (_krylov_vectors + jj*n_rows),
                dk);

      }

      /* compute h(ii+1,ii) = sqrt(<w,w>) */
      dot_prod = sqrt(_dot_product(c, dk, dk));
      _h_matrix[ii*krylov_size + ii + 1] = dot_prod;

      if (dot_prod < epsi) scaltest = 1;

      if (   (l_iter + 1)%check_freq == 0
          || l_iter == krylov_size - 2
          || scaltest == 1) {

        /* H matrix to diagonal sup matrix */

        _givens_rot_update(_h_matrix,
                           krylov_size,
                           _beta,
                           _givens_coeff,
                           l_old_iter,
                           l_iter + 1);

        l_old_iter = l_iter + 1;

        /* solve diag sup system */
        _solve_diag_sup_halo(_h_matrix, l_iter + 1, krylov_size, _beta, gk);

#       pragma omp parallel for if(n_rows > CS_THR_MIN)
        for (cs_lnum_t jj = 0; jj < n_rows; jj++) {
          fk[jj] = 0.0;
          for (cs_lnum_t kk = 0; kk <= l_iter; kk++)
            fk[jj] += _krylov_vectors[kk*n_rows + jj] * gk[kk];
        }

        c->setup_data->pc_apply(c->setup_data->pc_context, fk, gk);

#       pragma omp parallel for if(n_rows > CS_THR_MIN)
        for (cs_lnum_t jj = 0; jj < n_rows; jj++)
          fk[jj] = vx[jj] + gk[jj];

        cs_matrix_vector_multiply(a, fk, bk);

        /* compute residue = ||Ax - b|| */

#       pragma omp parallel for if(n_rows > CS_THR_MIN)
        for (cs_lnum_t jj = 0; jj < n_rows; jj++)
          bk[jj] -= rhs[jj];

        residue = sqrt(_dot_product_xx(c, bk));

        cvg = _convergence_test(c, n_iter, residue, convergence);

      }

      n_iter++;
      l_iter++;

      if (   cvg == CS_SLES_CONVERGED || cvg == CS_SLES_MAX_ITERATION
          || l_iter == krylov_size - 1 || scaltest == 1) {
#       pragma omp parallel for if (n_rows > CS_THR_MIN)
        for (cs_lnum_t jj = 0; jj < n_rows; jj++)
          vx[jj] = fk[jj];
        break;
      }
    }
  }

  if (_aux_vectors != aux_vectors)
    BFT_FREE(_aux_vectors);

  return cvg;
}

/*----------------------------------------------------------------------------
 * Solution of A.vx = Rhs using process-local ordered Gauss-Seidel.
 *
 * On entry, vx is considered initialized.
 *
 * parameters:
 *   c               <-- pointer to solver context info
 *   a               <-- linear equation matrix
 *   diag_block_size <-- diagonal block size
 *   convergence     <-- convergence information structure
 *   rhs             <-- right hand side
 *   vx              <-> system solution
 *
 * returns:
 *   convergence state
 *----------------------------------------------------------------------------*/

static cs_sles_convergence_state_t
_p_ordered_gauss_seidel_msr(cs_sles_it_t              *c,
                            const cs_matrix_t         *a,
                            cs_lnum_t                  diag_block_size,
                            cs_sles_it_convergence_t  *convergence,
                            const cs_real_t           *rhs,
                            cs_real_t                 *restrict vx)
{
  cs_sles_convergence_state_t cvg;
  double  res2, residue;

  unsigned n_iter = 0;

  const cs_lnum_t n_rows = cs_matrix_get_n_rows(a);

  const cs_halo_t *halo = cs_matrix_get_halo(a);

  const cs_real_t  *restrict ad_inv = c->setup_data->ad_inv;

  const cs_real_t  *restrict ad = cs_matrix_get_diagonal(a);

  const cs_lnum_t  *a_row_index, *a_col_id;
  const cs_real_t  *a_d_val, *a_x_val;

  const cs_lnum_t *db_size = cs_matrix_get_diag_block_size(a);
  cs_matrix_get_msr_arrays(a, &a_row_index, &a_col_id, &a_d_val, &a_x_val);

  const cs_lnum_t  *order = c->add_data->order;
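
  /* The ordering computed during setup defines the traversal of rows, so
     that values updated earlier in a sweep are reused immediately by
     later rows, propagating updates along the chosen direction. */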

  cvg = CS_SLES_ITERATING;

  /* Current iteration */
  /*-------------------*/

  while (cvg == CS_SLES_ITERATING) {

    n_iter += 1;

    /* Synchronize ghost cells first */

    if (halo != NULL)
      cs_matrix_pre_vector_multiply_sync(a, vx);

    /* Gauss-Seidel sweep: update vx in place and compute the residue. */

    res2 = 0.0;

    if (diag_block_size == 1) {

#     pragma omp parallel for reduction(+:res2)      \
                          if(n_rows > CS_THR_MIN && !_thread_debug)
      for (cs_lnum_t ll = 0; ll < n_rows; ll++) {

        cs_lnum_t ii = order[ll];

        const cs_lnum_t *restrict col_id = a_col_id + a_row_index[ii];
        const cs_real_t *restrict m_row = a_x_val + a_row_index[ii];
        const cs_lnum_t n_cols = a_row_index[ii+1] - a_row_index[ii];

        cs_real_t vxm1 = vx[ii];
        cs_real_t vx0 = rhs[ii];

        for (cs_lnum_t jj = 0; jj < n_cols; jj++)
          vx0 -= (m_row[jj]*vx[col_id[jj]]);

        vx0 *= ad_inv[ii];

        register double r = ad[ii] * (vx0-vxm1);

        vx[ii] = vx0;

        res2 += (r*r);
      }

    }
    else {

#     pragma omp parallel for reduction(+:res2) \
                          if(n_rows > CS_THR_MIN && !_thread_debug)
      for (cs_lnum_t ll = 0; ll < n_rows; ll++) {

        cs_lnum_t ii = order[ll];

        const cs_lnum_t *restrict col_id = a_col_id + a_row_index[ii];
        const cs_real_t *restrict m_row = a_x_val + a_row_index[ii];
        const cs_lnum_t n_cols = a_row_index[ii+1] - a_row_index[ii];

        cs_real_t vx0[DB_SIZE_MAX], vxm1[DB_SIZE_MAX], _vx[DB_SIZE_MAX];

        for (cs_lnum_t kk = 0; kk < db_size[0]; kk++) {
          vxm1[kk] = vx[ii*db_size[1] + kk];
          vx0[kk] = rhs[ii*db_size[1] + kk];
        }

        for (cs_lnum_t jj = 0; jj < n_cols; jj++) {
          for (cs_lnum_t kk = 0; kk < db_size[0]; kk++)
            vx0[kk] -= (m_row[jj]*vx[col_id[jj]*db_size[1] + kk]);
        }

        _fw_and_bw_lu_gs(ad_inv + db_size[3]*ii,
                         db_size[0],
                         _vx,
                         vx0);

        double rr = 0;
        for (cs_lnum_t kk = 0; kk < db_size[0]; kk++) {
          register double r = ad[ii*db_size[1] + kk] * (_vx[kk]-vxm1[kk]);
          rr += (r*r);
          vx[ii*db_size[1] + kk] = _vx[kk];
        }
        res2 += rr;

      }

    }

#if defined(HAVE_MPI)

    if (c->comm != MPI_COMM_NULL) {
      double _sum;
      MPI_Allreduce(&res2, &_sum, 1, MPI_DOUBLE, MPI_SUM, c->comm);
      res2 = _sum;
    }

#endif /* defined(HAVE_MPI) */

    residue = sqrt(res2); /* Actually, residue of previous iteration */

    /* Convergence test */

    if (n_iter == 1)
      c->setup_data->initial_residue = residue;

    cvg = _convergence_test(c, n_iter, residue, convergence);

  }

  return cvg;
}

/*----------------------------------------------------------------------------
 * Solution of A.vx = Rhs using process-local Gauss-Seidel.
 *
 * On entry, vx is considered initialized.
 *
 * parameters:
 *   c               <-- pointer to solver context info
 *   a               <-- linear equation matrix
 *   diag_block_size <-- diagonal block size
 *   convergence     <-- convergence information structure
 *   rhs             <-- right hand side
 *   vx              <-> system solution
 *
 * returns:
 *   convergence state
 *----------------------------------------------------------------------------*/

static cs_sles_convergence_state_t
_p_gauss_seidel_msr(cs_sles_it_t              *c,
                    const cs_matrix_t         *a,
                    cs_lnum_t                  diag_block_size,
                    cs_sles_it_convergence_t  *convergence,
                    const cs_real_t           *rhs,
                    cs_real_t                 *restrict vx)
{
  cs_sles_convergence_state_t cvg;
  double  res2, residue;

  unsigned n_iter = 0;

  const cs_lnum_t n_rows = cs_matrix_get_n_rows(a);

  const cs_halo_t *halo = cs_matrix_get_halo(a);

  const cs_real_t  *restrict ad_inv = c->setup_data->ad_inv;

  const cs_real_t  *restrict ad = cs_matrix_get_diagonal(a);

  const cs_lnum_t  *a_row_index, *a_col_id;
  const cs_real_t  *a_d_val, *a_x_val;

  const cs_lnum_t *db_size = cs_matrix_get_diag_block_size(a);
  cs_matrix_get_msr_arrays(a, &a_row_index, &a_col_id, &a_d_val, &a_x_val);
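
  /* MSR storage keeps the diagonal (a_d_val) separate from the
     extra-diagonal terms, which are stored in CSR fashion (a_row_index,
     a_col_id, a_x_val), so each row's extra-diagonal contribution can be
     accumulated directly in the sweep below. */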
3173 
3174   cvg = CS_SLES_ITERATING;
3175 
3176   /* Current iteration */
3177   /*-------------------*/
3178 
3179   while (cvg == CS_SLES_ITERATING) {
3180 
3181     n_iter += 1;
3182 
3183     /* Synchronize ghost cells first */
3184 
3185     if (halo != NULL)
3186       cs_matrix_pre_vector_multiply_sync(a, vx);
3187 
3188     /* Compute Vx <- Vx - (A-diag).Rk and residue. */
3189 
3190     res2 = 0.0;
3191 
3192     if (diag_block_size == 1) {
3193 
3194 #     pragma omp parallel for reduction(+:res2)      \
3195                           if(n_rows > CS_THR_MIN && !_thread_debug)
3196       for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
3197 
3198         const cs_lnum_t *restrict col_id = a_col_id + a_row_index[ii];
3199         const cs_real_t *restrict m_row = a_x_val + a_row_index[ii];
3200         const cs_lnum_t n_cols = a_row_index[ii+1] - a_row_index[ii];
3201 
3202         cs_real_t vxm1 = vx[ii];
3203         cs_real_t vx0 = rhs[ii];
3204 
3205         for (cs_lnum_t jj = 0; jj < n_cols; jj++)
3206           vx0 -= (m_row[jj]*vx[col_id[jj]]);
3207 
3208         vx0 *= ad_inv[ii];
3209 
3210         register double r = ad[ii] * (vx0-vxm1);
3211         res2 += (r*r);
3212 
3213         vx[ii] = vx0;
3214       }
3215 
3216     }
3217     else {
3218 
3219 #     pragma omp parallel for reduction(+:res2) \
3220                           if(n_rows > CS_THR_MIN && !_thread_debug)
3221       for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
3222 
3223         const cs_lnum_t *restrict col_id = a_col_id + a_row_index[ii];
3224         const cs_real_t *restrict m_row = a_x_val + a_row_index[ii];
3225         const cs_lnum_t n_cols = a_row_index[ii+1] - a_row_index[ii];
3226 
3227         cs_real_t vx0[DB_SIZE_MAX], vxm1[DB_SIZE_MAX], _vx[DB_SIZE_MAX];
3228 
3229         for (cs_lnum_t kk = 0; kk < db_size[0]; kk++) {
3230           vxm1[kk] = vx[ii*db_size[1] + kk];
3231           vx0[kk] = rhs[ii*db_size[1] + kk];
3232         }
3233 
3234         for (cs_lnum_t jj = 0; jj < n_cols; jj++) {
3235           for (cs_lnum_t kk = 0; kk < db_size[0]; kk++)
3236             vx0[kk] -= (m_row[jj]*vx[col_id[jj]*db_size[1] + kk]);
3237         }
3238 
3239         _fw_and_bw_lu_gs(ad_inv + db_size[3]*ii,
3240                          db_size[0],
3241                          _vx,
3242                          vx0);
3243 
3244         double rr = 0;
3245         for (cs_lnum_t kk = 0; kk < db_size[0]; kk++) {
3246           register double r = ad[ii*db_size[1] + kk] * (_vx[kk]-vxm1[kk]);
3247           rr += (r*r);
3248           vx[ii*db_size[1] + kk] = _vx[kk];
3249         }
3250         res2 += rr;
3251 
3252       }
3253 
3254     }
3255 
3256     if (convergence->precision > 0. || c->plot != NULL) {
3257 
3258 #if defined(HAVE_MPI)
3259 
3260       if (c->comm != MPI_COMM_NULL) {
3261         double _sum;
3262         MPI_Allreduce(&res2, &_sum, 1, MPI_DOUBLE, MPI_SUM, c->comm);
3263         res2 = _sum;
3264       }
3265 
3266 #endif /* defined(HAVE_MPI) */
3267 
3268       residue = sqrt(res2); /* Actually, residue of previous iteration */
3269 
3270       /* Convergence test */
3271 
3272       if (n_iter == 1)
3273         c->setup_data->initial_residue = residue;
3274 
3275       cvg = _convergence_test(c, n_iter, residue, convergence);
3276 
3277     }
3278     else if (n_iter >= convergence->n_iterations_max) {
3279       convergence->n_iterations = n_iter;
3280       cvg = CS_SLES_MAX_ITERATION;
3281     }
3282 
3283   }
3284 
3285   return cvg;
3286 }
3287 
3288 /*----------------------------------------------------------------------------
3289  * Solution of A.vx = Rhs using Process-local symmetric Gauss-Seidel.
3290  *
3291  * On entry, vx is considered initialized.
3292  *
3293  * parameters:
3294  *   c               <-- pointer to solver context info
3295  *   a               <-- linear equation matrix
3296  *   diag_block_size <-- diagonal block size
3297  *   convergence     <-- convergence information structure
3298  *   rhs             <-- right hand side
3299  *   vx              <-> system solution
3300  *   aux_size        <-- size of aux_vectors (in bytes)
3301  *   aux_vectors     --- optional working area (unused here)
3302  *
3303  * returns:
3304  *   convergence state
3305  *----------------------------------------------------------------------------*/
3306 
3307 static cs_sles_convergence_state_t
3308 _p_sym_gauss_seidel_msr(cs_sles_it_t              *c,
3309                         const cs_matrix_t         *a,
3310                         cs_lnum_t                  diag_block_size,
3311                         cs_sles_it_convergence_t  *convergence,
3312                         const cs_real_t           *rhs,
3313                         cs_real_t                 *restrict vx,
3314                         size_t                     aux_size,
3315                         void                      *aux_vectors)
3316 {
3317   CS_UNUSED(aux_size);
3318   CS_UNUSED(aux_vectors);
3319 
3320   cs_sles_convergence_state_t cvg;
3321   double  res2, residue;
3322 
3323   /* Check matrix storage type */
3324 
3325   if (cs_matrix_get_type(a) != CS_MATRIX_MSR)
3326     bft_error
3327       (__FILE__, __LINE__, 0,
3328        _("Symmetric Gauss-Seidel Jacobi hybrid solver only supported with a\n"
3329          "matrix using %s storage."),
3330        "MSR");
3331 
3332   unsigned n_iter = 0;
3333 
3334   const cs_lnum_t n_rows = cs_matrix_get_n_rows(a);
3335 
3336   const cs_halo_t *halo = cs_matrix_get_halo(a);
3337 
3338   const cs_real_t  *restrict ad_inv = c->setup_data->ad_inv;
3339 
3340   const cs_real_t  *restrict ad = cs_matrix_get_diagonal(a);
3341 
3342   const cs_lnum_t  *a_row_index, *a_col_id;
3343   const cs_real_t  *a_d_val, *a_x_val;
3344 
3345   const cs_lnum_t *db_size = cs_matrix_get_diag_block_size(a);
3346   cs_matrix_get_msr_arrays(a, &a_row_index, &a_col_id, &a_d_val, &a_x_val);
3347 
3348   cvg = CS_SLES_ITERATING;
3349 
3350   /* Current iteration */
3351   /*-------------------*/
3352 
3353   while (cvg == CS_SLES_ITERATING) {
3354 
3355     n_iter += 1;
3356 
3357     /* Synchronize ghost cells first */
3358 
3359     if (halo != NULL)
3360       cs_matrix_pre_vector_multiply_sync(a, vx);
3361 
3362     /* Compute vx <- diag^-1 . (rhs - (A-diag).vx): forward sweep */
3363 
3364     if (diag_block_size == 1) {
3365 
3366 #     pragma omp parallel for if(n_rows > CS_THR_MIN && !_thread_debug)
3367       for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
3368 
3369         const cs_lnum_t *restrict col_id = a_col_id + a_row_index[ii];
3370         const cs_real_t *restrict m_row = a_x_val + a_row_index[ii];
3371         const cs_lnum_t n_cols = a_row_index[ii+1] - a_row_index[ii];
3372 
3373         cs_real_t vx0 = rhs[ii];
3374 
3375         for (cs_lnum_t jj = 0; jj < n_cols; jj++)
3376           vx0 -= (m_row[jj]*vx[col_id[jj]]);
3377 
3378         vx[ii] = vx0 * ad_inv[ii];
3379 
3380       }
3381 
3382     }
3383     else {
3384 
3385 #     pragma omp parallel for if(n_rows > CS_THR_MIN && !_thread_debug)
3386       for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
3387 
3388         const cs_lnum_t *restrict col_id = a_col_id + a_row_index[ii];
3389         const cs_real_t *restrict m_row = a_x_val + a_row_index[ii];
3390         const cs_lnum_t n_cols = a_row_index[ii+1] - a_row_index[ii];
3391 
3392         cs_real_t vx0[DB_SIZE_MAX], _vx[DB_SIZE_MAX];
3393 
3394         for (cs_lnum_t kk = 0; kk < diag_block_size; kk++)
3395           vx0[kk] = rhs[ii*db_size[1] + kk];
3396 
3397         for (cs_lnum_t jj = 0; jj < n_cols; jj++) {
3398           for (cs_lnum_t kk = 0; kk < diag_block_size; kk++)
3399             vx0[kk] -= (m_row[jj]*vx[col_id[jj]*db_size[1] + kk]);
3400         }
3401 
3402         _fw_and_bw_lu_gs(ad_inv + db_size[3]*ii,
3403                          db_size[0],
3404                          _vx,
3405                          vx0);
3406 
3407         for (cs_lnum_t kk = 0; kk < diag_block_size; kk++)
3408           vx[ii*db_size[1] + kk] = _vx[kk];
3409 
3410       }
3411 
3412     }
3413 
3414     /* Synchronize ghost cells again */
3415 
3416     if (halo != NULL)
3417       cs_matrix_pre_vector_multiply_sync(a, vx);
3418 
3419     /* Compute vx <- diag^-1 . (rhs - (A-diag).vx) and residue: backward sweep */
3420 
3421     res2 = 0.0;
3422 
3423     if (diag_block_size == 1) {
3424 
3425 #     pragma omp parallel for reduction(+:res2)      \
3426                           if(n_rows > CS_THR_MIN && !_thread_debug)
3427       for (cs_lnum_t ii = n_rows - 1; ii > - 1; ii--) {
3428 
3429         const cs_lnum_t *restrict col_id = a_col_id + a_row_index[ii];
3430         const cs_real_t *restrict m_row = a_x_val + a_row_index[ii];
3431         const cs_lnum_t n_cols = a_row_index[ii+1] - a_row_index[ii];
3432 
3433         cs_real_t vxm1 = vx[ii];
3434         cs_real_t vx0 = rhs[ii];
3435 
3436         for (cs_lnum_t jj = 0; jj < n_cols; jj++)
3437           vx0 -= (m_row[jj]*vx[col_id[jj]]);
3438 
3439         vx0 *= ad_inv[ii];
3440 
3441         register double r = ad[ii] * (vx0-vxm1);
3442         res2 += (r*r);
3443 
3444         vx[ii] = vx0;
3445       }
3446 
3447     }
3448     else {
3449 
3450 #     pragma omp parallel for reduction(+:res2) \
3451                           if(n_rows > CS_THR_MIN && !_thread_debug)
3452       for (cs_lnum_t ii = n_rows - 1; ii > - 1; ii--) {
3453 
3454         const cs_lnum_t *restrict col_id = a_col_id + a_row_index[ii];
3455         const cs_real_t *restrict m_row = a_x_val + a_row_index[ii];
3456         const cs_lnum_t n_cols = a_row_index[ii+1] - a_row_index[ii];
3457 
3458         cs_real_t vx0[DB_SIZE_MAX], vxm1[DB_SIZE_MAX], _vx[DB_SIZE_MAX];
3459 
3460         for (cs_lnum_t kk = 0; kk < db_size[0]; kk++) {
3461           vxm1[kk] = vx[ii*db_size[1] + kk];
3462           vx0[kk] = rhs[ii*db_size[1] + kk];
3463         }
3464 
3465         for (cs_lnum_t jj = 0; jj < n_cols; jj++) {
3466           for (cs_lnum_t kk = 0; kk < db_size[0]; kk++)
3467             vx0[kk] -= (m_row[jj]*vx[col_id[jj]*db_size[1] + kk]);
3468         }
3469 
3470         _fw_and_bw_lu_gs(ad_inv + db_size[3]*ii,
3471                          db_size[0],
3472                          _vx,
3473                          vx0);
3474 
3475         double rr = 0;
3476         for (cs_lnum_t kk = 0; kk < db_size[0]; kk++) {
3477           register double r = ad[ii*db_size[1] + kk] * (_vx[kk]-vxm1[kk]);
3478           rr += (r*r);
3479           vx[ii*db_size[1] + kk] = _vx[kk];
3480         }
3481         res2 += rr;
3482 
3483       }
3484 
3485     }
3486 
3487     if (convergence->precision > 0. || c->plot != NULL) {
3488 
3489 #if defined(HAVE_MPI)
3490 
3491       if (c->comm != MPI_COMM_NULL) {
3492         double _sum;
3493         MPI_Allreduce(&res2, &_sum, 1, MPI_DOUBLE, MPI_SUM, c->comm);
3494         res2 = _sum;
3495       }
3496 
3497 #endif /* defined(HAVE_MPI) */
3498 
3499       residue = sqrt(res2); /* Actually, residue of previous iteration */
3500 
3501       /* Convergence test */
3502 
3503       if (n_iter == 1)
3504         c->setup_data->initial_residue = residue;
3505 
3506       cvg = _convergence_test(c, n_iter, residue, convergence);
3507 
3508     }
3509     else if (n_iter >= convergence->n_iterations_max) {
3510       convergence->n_iterations = n_iter;
3511       cvg = CS_SLES_MAX_ITERATION;
3512     }
3513 
3514   }
3515 
3516   return cvg;
3517 }
3518 
3519 /*----------------------------------------------------------------------------
3520  * Solution of A.vx = Rhs using Process-local Gauss-Seidel.
3521  *
3522  * On entry, vx is considered initialized.
3523  *
3524  * parameters:
3525  *   c               <-- pointer to solver context info
3526  *   a               <-- linear equation matrix
3527  *   diag_block_size <-- diagonal block size
3528  *   convergence     <-- convergence information structure
3529  *   rhs             <-- right hand side
3530  *   vx              <-> system solution
3531  *   aux_size        <-- size of aux_vectors (in bytes)
3532  *   aux_vectors     --- optional working area (unused here)
3533  *
3534  * returns:
3535  *   convergence state
3536  *----------------------------------------------------------------------------*/
3537 
3538 static cs_sles_convergence_state_t
3539 _p_gauss_seidel(cs_sles_it_t              *c,
3540                 const cs_matrix_t         *a,
3541                 cs_lnum_t                  diag_block_size,
3542                 cs_sles_it_convergence_t  *convergence,
3543                 const cs_real_t           *rhs,
3544                 cs_real_t                 *restrict vx,
3545                 size_t                     aux_size,
3546                 void                      *aux_vectors)
3547 {
3548   CS_UNUSED(aux_size);
3549   CS_UNUSED(aux_vectors);
3550 
3551   cs_sles_convergence_state_t cvg;
3552 
3553   /* Check matrix storage type */
3554 
3555   if (cs_matrix_get_type(a) != CS_MATRIX_MSR)
3556     bft_error
3557       (__FILE__, __LINE__, 0,
3558        _("Gauss-Seidel Jacobi hybrid solver only supported with a\n"
3559          "matrix using %s storage."),
3560        "MSR");
3561 
3562   /* Allocate or map work arrays */
3563   /*-----------------------------*/
3564 
3565   assert(c->setup_data != NULL);
3566 
3567   /* Check for ordered variant */
3568 
3569   const cs_lnum_t  *order = NULL;
3570 
3571   if (c->add_data != NULL)
3572     order = c->add_data->order;
3573 
3574   if (order != NULL)
3575     cvg = _p_ordered_gauss_seidel_msr(c,
3576                                       a,
3577                                       diag_block_size,
3578                                       convergence,
3579                                       rhs,
3580                                       vx);
3581 
3582   else
3583     cvg = _p_gauss_seidel_msr(c,
3584                               a,
3585                               diag_block_size,
3586                               convergence,
3587                               rhs,
3588                               vx);
3589 
3590   return cvg;
3591 }
3592 
3593 /*----------------------------------------------------------------------------
3594  * Switch to fallback solver if defined.
3595  *
3596  * vx is reset to zero in this case.
3597  *
3598  * parameters:
3599  *   c               <-- pointer to solver context info
3600  *   a               <-- matrix
3601  *   solver_type     <-- fallback solver type
3602  *   prev_state      <-- previous convergence state
3603  *   convergence     <-- convergence information structure
 *   n_iter          <-> cumulative number of iterations
 *   residue         <-> residue (updated by the fallback solve)
3604  *   rhs             <-- right hand side
3605  *   vx              <-> system solution
3606  *   aux_size        <-- size of aux_vectors (in bytes)
3607  *   aux_vectors     --- optional working area
3608  *                       (internal allocation if NULL)
3609  *
3610  * returns:
3611  *   convergence state
3612  *----------------------------------------------------------------------------*/
3613 
3614 static cs_sles_convergence_state_t
3615 _fallback(cs_sles_it_t                    *c,
3616           cs_sles_it_type_t                solver_type,
3617           const cs_matrix_t               *a,
3618           cs_sles_convergence_state_t      prev_state,
3619           const cs_sles_it_convergence_t  *convergence,
3620           int                             *n_iter,
3621           double                          *residue,
3622           const cs_real_t                 *rhs,
3623           cs_real_t                       *restrict vx,
3624           size_t                           aux_size,
3625           void                            *aux_vectors)
3626 {
3627   cs_sles_convergence_state_t cvg = CS_SLES_ITERATING;
3628 
3629   /* Check if fallback was already defined for this case */
3630 
3631   if (c->fallback == NULL) {
3632 
3633     c->fallback = cs_sles_it_create(solver_type,
3634                                     -1, /* poly_degree */
3635                                     c->n_max_iter,
3636                                     c->update_stats);
3637 
3638     cs_sles_it_set_shareable(c->fallback, c);
3639 
3640     c->fallback->plot = c->plot;
3641 
3642   }
3643 
3644   c->fallback->plot_time_stamp = c->plot_time_stamp;
3645 
3646   const cs_lnum_t n_rows = c->setup_data->n_rows;
3647 
3648   if (prev_state < CS_SLES_BREAKDOWN) {
3649 #   pragma omp parallel for if(n_rows > CS_THR_MIN)
3650     for (cs_lnum_t i = 0; i < n_rows; i++)
3651       vx[i] = 0;
3652   }
3653 
3654   cvg = cs_sles_it_solve(c->fallback,
3655                          convergence->name,
3656                          a,
3657                          convergence->verbosity,
3658                          convergence->precision,
3659                          convergence->r_norm,
3660                          n_iter,
3661                          residue,
3662                          rhs,
3663                          vx,
3664                          aux_size,
3665                          aux_vectors);
3666 
3667   cs_sles_it_free(c->fallback);
3668 
3669   *n_iter += convergence->n_iterations;
3670 
3671   c->plot_time_stamp = c->fallback->plot_time_stamp;
3672 
3673   return cvg;
3674 }
3675 
3676 /*! (DOXYGEN_SHOULD_SKIP_THIS) \endcond */
3677 
3678 /*=============================================================================
3679  * User function prototypes
3680  *============================================================================*/
3681 
3682 /*----------------------------------------------------------------------------
3683  * Solution of A.vx = Rhs using a user-defined iterative solver
3684  *
3685  * On entry, vx is considered initialized.
3686  *
3687  * parameters:
3688  *   c               <-- pointer to solver context info
3689  *   a               <-- matrix
3690  *   diag_block_size <-- diagonal block size (unused here)
3691  *   convergence     <-- convergence information structure
3692  *   rhs             <-- right hand side
3693  *   vx              <-> system solution
3694  *   aux_size        <-- size of aux_vectors (in bytes)
3695  *   aux_vectors     --- optional working area (allocation otherwise)
3696  *
3697  * returns:
3698  *   convergence state
3699  *----------------------------------------------------------------------------*/
3700 
3701 cs_sles_convergence_state_t
3702 cs_user_sles_it_solver(cs_sles_it_t              *c,
3703                        const cs_matrix_t         *a,
3704                        cs_lnum_t                  diag_block_size,
3705                        cs_sles_it_convergence_t  *convergence,
3706                        const cs_real_t           *rhs,
3707                        cs_real_t                 *restrict vx,
3708                        size_t                     aux_size,
3709                        void                      *aux_vectors)
3710 {
3711   cs_sles_convergence_state_t cvg = CS_SLES_CONVERGED;
3712 
3713   CS_UNUSED(c);
3714   CS_UNUSED(a);
3715   CS_UNUSED(diag_block_size);
3716   CS_UNUSED(convergence);
3717   CS_UNUSED(rhs);
3718   CS_UNUSED(vx);
3719   CS_UNUSED(aux_size);
3720   CS_UNUSED(aux_vectors);
3721 
3722   return cvg;
3723 }
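
/*
  A minimal sketch of what a redefinition might look like (illustrative,
  not part of the library): a fixed number of hybrid Jacobi/Gauss-Seidel
  sweeps mirroring the MSR-based solvers above. It assumes MSR storage
  and a scalar (non-block) diagonal, and performs no residue-based
  convergence test; a real redefinition would also compare a residue
  against convergence->precision.

  cs_sles_convergence_state_t
  cs_user_sles_it_solver(cs_sles_it_t              *c,
                         const cs_matrix_t         *a,
                         cs_lnum_t                  diag_block_size,
                         cs_sles_it_convergence_t  *convergence,
                         const cs_real_t           *rhs,
                         cs_real_t                 *restrict vx,
                         size_t                     aux_size,
                         void                      *aux_vectors)
  {
    CS_UNUSED(c);
    CS_UNUSED(diag_block_size);  // scalar diagonal assumed
    CS_UNUSED(aux_size);
    CS_UNUSED(aux_vectors);

    const cs_lnum_t n_rows = cs_matrix_get_n_rows(a);
    const cs_halo_t *halo = cs_matrix_get_halo(a);
    const cs_real_t *ad = cs_matrix_get_diagonal(a);

    const cs_lnum_t *a_row_index, *a_col_id;
    const cs_real_t *a_d_val, *a_x_val;
    cs_matrix_get_msr_arrays(a, &a_row_index, &a_col_id, &a_d_val, &a_x_val);

    unsigned n_iter = 0;

    while (n_iter < (unsigned)convergence->n_iterations_max) {

      n_iter += 1;

      if (halo != NULL)  // synchronize ghost values before each sweep
        cs_matrix_pre_vector_multiply_sync(a, vx);

      for (cs_lnum_t ii = 0; ii < n_rows; ii++) {
        const cs_lnum_t *col_id = a_col_id + a_row_index[ii];
        const cs_real_t *m_row = a_x_val + a_row_index[ii];
        const cs_lnum_t n_cols = a_row_index[ii+1] - a_row_index[ii];

        cs_real_t vx0 = rhs[ii];
        for (cs_lnum_t jj = 0; jj < n_cols; jj++)
          vx0 -= m_row[jj]*vx[col_id[jj]];

        vx[ii] = vx0 / ad[ii];  // in-place update: hybrid sweep
      }
    }

    convergence->n_iterations = n_iter;

    return CS_SLES_MAX_ITERATION;
  }
*/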
3724 
3725 /*============================================================================
3726  * Public function definitions
3727  *============================================================================*/
3728 
3729 /*----------------------------------------------------------------------------*/
3730 /*!
3731  * \brief Define and associate an iterative sparse linear system solver
3732  *        for a given field or equation name.
3733  *
3734  * If this system did not previously exist, it is added to the list of
3735  * "known" systems. Otherwise, its definition is replaced by the one
3736  * defined here.
3737  *
3738  * This is a utility function: if finer control is needed, see
3739  * \ref cs_sles_define and \ref cs_sles_it_create.
3740  *
3741  * Note that this function returns a pointer directly to the iterative solver
3742  * management structure. This may be used to set further options,
3743  * for example using \ref cs_sles_it_set_plot_options. If needed,
3744  * \ref cs_sles_find may be used to obtain a pointer to the matching
3745  * \ref cs_sles_t container.
3746  *
3747  * \param[in]  f_id          associated field id, or < 0
3748  * \param[in]  name          associated name if f_id < 0, or NULL
3749  * \param[in]  solver_type   type of solver (PCG, Jacobi, ...)
3750  * \param[in]  poly_degree   preconditioning polynomial degree
3751  *                           (0: diagonal; -1: non-preconditioned)
3752  * \param[in]  n_max_iter    maximum number of iterations
3753  *
3754  * \return  pointer to newly created iterative solver info object.
3755  */
3756 /*----------------------------------------------------------------------------*/
3757 
3758 cs_sles_it_t *
3759 cs_sles_it_define(int                 f_id,
3760                   const char         *name,
3761                   cs_sles_it_type_t   solver_type,
3762                   int                 poly_degree,
3763                   int                 n_max_iter)
3764 {
3765   /* Test for environment variables here */
3766 
3767   const char *s = getenv("CS_THREAD_DEBUG");
3768   if (s != NULL) {
3769     if (atoi(s) > 0)
3770       _thread_debug = true;
3771   }
3772 
3773   /* Now define solver */
3774 
3775   cs_sles_it_t *
3776     c = cs_sles_it_create(solver_type,
3777                           poly_degree,
3778                           n_max_iter,
3779                           true); /* update stats */
3780 
3781   cs_sles_t *sc = cs_sles_define(f_id,
3782                                  name,
3783                                  c,
3784                                  "cs_sles_it_t",
3785                                  cs_sles_it_setup,
3786                                  cs_sles_it_solve,
3787                                  cs_sles_it_free,
3788                                  cs_sles_it_log,
3789                                  cs_sles_it_copy,
3790                                  cs_sles_it_destroy);
3791 
3792   cs_sles_set_error_handler(sc,
3793                             cs_sles_it_error_post_and_abort);
3794 
3795   return c;
3796 }
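
/*
  Example (illustrative sketch): define a diagonal-preconditioned PCG
  solver for the pressure field, capped at 1000 iterations, and activate
  residue plotting. The field name lookup and plot file prefix below are
  placeholders for this illustration.

  int f_id = cs_field_id_by_name("pressure");  // assumed existing field

  cs_sles_it_t *c = cs_sles_it_define(f_id,
                                      NULL,         // name unused, f_id >= 0
                                      CS_SLES_PCG,
                                      0,            // diagonal preconditioning
                                      1000);

  cs_sles_it_set_plot_options(c, "pressure", true);
*/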
3797 
3798 /*----------------------------------------------------------------------------*/
3799 /*!
3800  * \brief Create iterative sparse linear system solver info and context.
3801  *
3802  * \param[in]  solver_type   type of solver (PCG, Jacobi, ...)
3803  * \param[in]  poly_degree   preconditioning polynomial degree
3804  *                           (0: diagonal; -1: non-preconditioned;
3805  *                           see \ref sles_it for details)
3806  * \param[in]  n_max_iter    maximum number of iterations
3807  * \param[in]  update_stats  automatic solver statistics indicator
3808  *
3809  * \return  pointer to newly created solver info object.
3810  */
3811 /*----------------------------------------------------------------------------*/
3812 
3813 cs_sles_it_t *
3814 cs_sles_it_create(cs_sles_it_type_t   solver_type,
3815                   int                 poly_degree,
3816                   int                 n_max_iter,
3817                   bool                update_stats)
3818 {
3819   cs_sles_it_t *c;
3820 
3821   BFT_MALLOC(c, 1, cs_sles_it_t);
3822 
3823   c->type = solver_type;
3824   c->solve = NULL;
3825 
3826   switch(c->type) {
3827   case CS_SLES_JACOBI:
3828   case CS_SLES_P_GAUSS_SEIDEL:
3829   case CS_SLES_P_SYM_GAUSS_SEIDEL:
3830     c->_pc = NULL;
3831     break;
3832   default:
3833     if (poly_degree < 0) {
3834        /* specific implementation for non-preconditioned PCG */
3835       if (c->type == CS_SLES_PCG)
3836         c->_pc = NULL;
3837       else
3838         c->_pc = cs_sles_pc_none_create();
3839     }
3840     else if (poly_degree == 0)
3841       c->_pc = cs_sles_pc_jacobi_create();
3842     else if (poly_degree == 1)
3843       c->_pc = cs_sles_pc_poly_1_create();
3844     else
3845       c->_pc = cs_sles_pc_poly_2_create();
3846   }
3847   c->pc = c->_pc;
3848 
3849   c->update_stats = update_stats;
3850   c->ignore_convergence = false;
3851 
3852   c->n_max_iter = n_max_iter;
3853   c->restart_interval = 20; /* Default value commonly found in the literature */
3854 
3855   c->n_setups = 0;
3856   c->n_solves = 0;
3857 
3858   c->n_iterations_min = 0;
3859   c->n_iterations_max = 0;
3860   c->n_iterations_last = 0;
3861   c->n_iterations_tot = 0;
3862 
3863   CS_TIMER_COUNTER_INIT(c->t_setup);
3864   CS_TIMER_COUNTER_INIT(c->t_solve);
3865 
3866   c->plot_time_stamp = 0;
3867   c->plot = NULL;
3868   c->_plot = NULL;
3869 
3870 #if defined(HAVE_MPI)
3871   c->comm = cs_glob_mpi_comm;
3872   c->caller_comm = cs_glob_mpi_comm;
3873   c->caller_n_ranks = cs_glob_n_ranks;
3874   if (c->caller_n_ranks < 2) {
3875     c->comm = MPI_COMM_NULL;
3876     c->caller_comm = cs_glob_mpi_comm;
3877   }
3878 #endif
3879 
3880   c->setup_data = NULL;
3881   c->add_data = NULL;
3882   c->shared = NULL;
3883 
3884   /* Fallback mechanism; note that for fallbacks,
3885      the preconditioner is shared, so Krylov methods
3886      may not be mixed with Jacobi or Gauss-Seidel methods */
3887 
3888   switch(c->type) {
3889   case CS_SLES_BICGSTAB:
3890   case CS_SLES_BICGSTAB2:
3891   case CS_SLES_PCR3:
3892     c->fallback_cvg = CS_SLES_BREAKDOWN;
3893     break;
3894   default:
3895     c->fallback_cvg = CS_SLES_DIVERGED;
3896   }
3897 
3898   c->fallback = NULL;
3899 
3900   return c;
3901 }
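
/*
  Example (illustrative sketch): create a standalone solver context, for
  instance to use as a multigrid smoother or through cs_sles_it_solve,
  without registering it in the global solver list.

  cs_sles_it_t *c = cs_sles_it_create(CS_SLES_GCR,
                                      0,       // diagonal preconditioning
                                      50,      // at most 50 iterations
                                      false);  // no statistics
  cs_sles_it_set_restart_interval(c, 30);
*/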
3902 
3903 /*----------------------------------------------------------------------------*/
3904 /*!
3905  * \brief Destroy iterative sparse linear system solver info and context.
3906  *
3907  * \param[in, out]  context  pointer to iterative solver info and context
3908  *                           (actual type: cs_sles_it_t  **)
3909  */
3910 /*----------------------------------------------------------------------------*/
3911 
3912 void
3913 cs_sles_it_destroy(void **context)
3914 {
3915   cs_sles_it_t *c = (cs_sles_it_t *)(*context);
3916   if (c != NULL) {
3917     if (c->fallback != NULL) {
3918       void *f = c->fallback;
3919       cs_sles_it_destroy(&f);
3920       c->fallback = f;
3921     }
3922     cs_sles_pc_destroy(&(c->_pc));
3923     cs_sles_it_free(c);
3924     if (c->_plot != NULL) {
3925       cs_time_plot_finalize(&(c->_plot));
3926       c->plot = NULL;
3927     }
3928     if (c->add_data != NULL) {
3929       BFT_FREE(c->add_data->order);
3930       BFT_FREE(c->add_data);
3931     }
3932     BFT_FREE(c);
3933     *context = c;
3934   }
3935 }
3936 
3937 /*----------------------------------------------------------------------------*/
3938 /*!
3939  * \brief Create iterative sparse linear system solver info and context
3940  *        based on existing info and context.
3941  *
3942  * \param[in]  context  pointer to reference info and context
3943  *                     (actual type: cs_sles_it_t  *)
3944  *
3945  * \return  pointer to newly created solver info object.
3946  *          (actual type: cs_sles_it_t  *)
3947  */
3948 /*----------------------------------------------------------------------------*/
3949 
3950 void *
3951 cs_sles_it_copy(const void  *context)
3952 {
3953   cs_sles_it_t *d = NULL;
3954 
3955   if (context != NULL) {
3956     const cs_sles_it_t *c = context;
3957     d = cs_sles_it_create(c->type,
3958                           -1,
3959                           c->n_max_iter,
3960                           c->update_stats);
3961     if (c->pc != NULL && c->_pc != NULL) {
3962       d->_pc = cs_sles_pc_clone(c->_pc);
3963       d->pc = d->_pc;
3964     }
3965     else {
3966       d->_pc = c->_pc;
3967       d->pc = c->pc;
3968     }
3969 
3970     /* If useful, copy the restart interval */
3971     if (c->type == CS_SLES_GMRES || c->type == CS_SLES_GCR)
3972       d->restart_interval = c->restart_interval;
3973 
3974 #if defined(HAVE_MPI)
3975     d->comm = c->comm;
3976 #endif
3977   }
3978 
3979   return d;
3980 }
3981 
3982 /*----------------------------------------------------------------------------*/
3983 /*!
3984  * \brief Log sparse linear equation solver info.
3985  *
3986  * \param[in]  context   pointer to iterative solver info and context
3987  *                       (actual type: cs_sles_it_t  *)
3988  * \param[in]  log_type  log type
3989  */
3990 /*----------------------------------------------------------------------------*/
3991 
3992 void
3993 cs_sles_it_log(const void  *context,
3994                cs_log_t     log_type)
3995 {
3996   const cs_sles_it_t  *c = context;
3997 
3998   if (log_type == CS_LOG_SETUP) {
3999 
4000     cs_log_printf(log_type,
4001                   _("  Solver type:                       %s\n"),
4002                   _(cs_sles_it_type_name[c->type]));
4003     if (c->pc != NULL)
4004       cs_log_printf(log_type,
4005                     _("  Preconditioning:                   %s\n"),
4006                     _(cs_sles_pc_get_type_name(c->pc)));
4007     if (c->type == CS_SLES_GMRES || c->type == CS_SLES_GCR)
4008       cs_log_printf(log_type,
4009                     "  Restart interval:                  %d\n",
4010                     c->restart_interval);
4011     cs_log_printf(log_type,
4012                   _("  Maximum number of iterations:      %d\n"),
4013                   c->n_max_iter);
4014   }
4015 
4016   else if (log_type == CS_LOG_PERFORMANCE) {
4017 
4018     int n_calls = c->n_solves;
4019     int n_it_min = c->n_iterations_min;
4020     int n_it_max = c->n_iterations_max;
4021     int n_it_mean = 0;
4022 
4023     if (n_it_min < 0)
4024       n_it_min = 0;
4025 
4026     if (n_calls > 0)
4027       n_it_mean = (int)(  c->n_iterations_tot
4028                          / ((unsigned long long)n_calls));
4029 
4030     cs_log_printf(log_type,
4031                   _("\n"
4032                     "  Solver type:                   %s\n"),
4033                   _(cs_sles_it_type_name[c->type]));
4034 
4035     if (c->pc != NULL)
4036       cs_log_printf(log_type,
4037                     _("  Preconditioning:               %s\n"),
4038                     _(cs_sles_pc_get_type_name(c->pc)));
4039     cs_log_printf(log_type,
4040                   _("  Number of setups:              %12d\n"
4041                     "  Number of calls:               %12d\n"
4042                     "  Minimum number of iterations:  %12d\n"
4043                     "  Maximum number of iterations:  %12d\n"
4044                     "  Mean number of iterations:     %12d\n"
4045                     "  Total setup time:              %12.3f\n"
4046                     "  Total solution time:           %12.3f\n"),
4047                   c->n_setups, n_calls, n_it_min, n_it_max, n_it_mean,
4048                   c->t_setup.nsec*1e-9,
4049                   c->t_solve.nsec*1e-9);
4050 
4051     if (c->fallback != NULL) {
4052 
4053       n_calls = c->fallback->n_solves;
4054       n_it_min = c->fallback->n_iterations_min;
4055       n_it_max = c->fallback->n_iterations_max;
4056       n_it_mean = 0;
4057 
4058       if (n_it_min < 0)
4059         n_it_min = 0;
4060 
4061       if (n_calls > 0)
4062         n_it_mean = (int)(  c->fallback->n_iterations_tot
4063                            / ((unsigned long long)n_calls));
4064 
4065       cs_log_printf(log_type,
4066                     _("\n"
4067                       "  Backup solver type:            %s\n"),
4068                     _(cs_sles_it_type_name[c->fallback->type]));
4069 
4070       cs_log_printf(log_type,
4071                     _("  Number of calls:               %12d\n"
4072                       "  Minimum number of iterations:  %12d\n"
4073                       "  Maximum number of iterations:  %12d\n"
4074                       "  Mean number of iterations:     %12d\n"
4075                       "  Total solution time:           %12.3f\n"),
4076                     n_calls, n_it_min, n_it_max, n_it_mean,
4077                     c->fallback->t_solve.nsec*1e-9);
4078 
4079     }
4080 
4081   }
4082 
4083   if (c->pc != NULL)
4084     cs_sles_pc_log(c->pc, log_type);
4085 }
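
/*
  Example (illustrative sketch): log the setup parameters, then the
  performance counters, of an existing solver context c.

  cs_sles_it_log(c, CS_LOG_SETUP);
  cs_sles_it_log(c, CS_LOG_PERFORMANCE);
*/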
4086 
4087 /*----------------------------------------------------------------------------*/
4088 /*!
4089  * \brief Setup iterative sparse linear equation solver.
4090  *
4091  * \param[in, out]  context    pointer to iterative solver info and context
4092  *                             (actual type: cs_sles_it_t  *)
4093  * \param[in]       name       pointer to system name
4094  * \param[in]       a          associated matrix
4095  * \param[in]       verbosity  associated verbosity
4096  */
4097 /*----------------------------------------------------------------------------*/
4098 
4099 void
4100 cs_sles_it_setup(void               *context,
4101                  const char         *name,
4102                  const cs_matrix_t  *a,
4103                  int                 verbosity)
4104 {
4105   cs_sles_it_t  *c = context;
4106 
4107   cs_timer_t t0;
4108   if (c->update_stats == true)
4109     t0 = cs_timer_time();
4110 
4111   const int diag_block_size = (cs_matrix_get_diag_block_size(a))[0];
4112 
4113   if (verbosity > 1) {
4114     bft_printf(_("\n Setup of solver for linear system \"%s\"\n"),
4115                name);
4116     cs_matrix_log_info(a, verbosity);
4117   }
4118 
4119   if (   c->type == CS_SLES_JACOBI
4120       || (   c->type >= CS_SLES_P_GAUSS_SEIDEL
4121           && c->type <= CS_SLES_P_SYM_GAUSS_SEIDEL)) {
4122     /* Force to Jacobi in case matrix type is not adapted */
4123     if (cs_matrix_get_type(a) != CS_MATRIX_MSR) {
4124       c->type = CS_SLES_JACOBI;
4125     }
4126     cs_sles_it_setup_priv(c, name, a, verbosity, diag_block_size, true);
4127   }
4128   else
4129     cs_sles_it_setup_priv(c, name, a, verbosity, diag_block_size, false);
4130 
4131   switch (c->type) {
4132 
4133   case CS_SLES_PCR3:
4134     c->solve = _conjugate_residual_3;
4135     break;
4136 
4137   case CS_SLES_PCG:
4138     /* Check for single-reduction */
4139     {
4140       bool single_reduce = false;
4141 #if defined(HAVE_MPI)
4142       cs_gnum_t n_m_rows = c->setup_data->n_rows;
4143       if (c->comm != MPI_COMM_NULL) {
4144         int size;
4145         cs_gnum_t _n_m_rows;
4146         MPI_Allreduce(&n_m_rows, &_n_m_rows, 1, CS_MPI_GNUM, MPI_SUM, c->comm);
4147         MPI_Comm_size(c->comm, &size);
4148         n_m_rows = _n_m_rows / (cs_gnum_t)size;
4149       }
4150       if (c->comm != c->caller_comm)
4151         MPI_Bcast(&n_m_rows, 1, CS_MPI_GNUM, 0, cs_glob_mpi_comm);
4152       if (n_m_rows < (cs_gnum_t)_pcg_sr_threshold)
4153         single_reduce = true;
4154 #endif
4155       if (!single_reduce) {
4156         if (c->pc != NULL)
4157           c->solve = _conjugate_gradient;
4158         else
4159           c->solve = _conjugate_gradient_npc;
4160         break;
4161       }
4162       else {
4163         if (c->pc != NULL)
4164           c->solve = _conjugate_gradient_sr;
4165         else
4166           c->solve = _conjugate_gradient_npc_sr;
4167       }
4168     }
4169     break;
4170 
4171   case CS_SLES_FCG:
4172     c->solve = _flexible_conjugate_gradient;
4173     break;
4174 
4175   case CS_SLES_IPCG:
4176     c->solve = _conjugate_gradient_ip;
4177     break;
4178 
4179   case CS_SLES_JACOBI:
4180     if (diag_block_size == 1)
4181       c->solve = _jacobi;
4182     else if (diag_block_size == 3)
4183       c->solve = _block_3_jacobi;
4184     else
4185       c->solve = _block_jacobi;
4186     break;
4187 
4188   case CS_SLES_BICGSTAB:
4189     c->solve = _bi_cgstab;
4190     break;
4191 
4192   case CS_SLES_BICGSTAB2:
4193     c->solve = _bicgstab2;
4194     break;
4195 
4196   case CS_SLES_GCR:
4197     assert(c->restart_interval > 1);
4198     c->solve = _gcr;
4199     break;
4200 
4201   case CS_SLES_GMRES:
4202     assert(c->restart_interval > 1);
4203     c->solve = _gmres;
4204     break;
4205 
4206   case CS_SLES_P_GAUSS_SEIDEL:
4207     c->solve = _p_gauss_seidel;
4208     break;
4209   case CS_SLES_P_SYM_GAUSS_SEIDEL:
4210     c->solve = _p_sym_gauss_seidel_msr;
4211     break;
4212 
4213   case CS_SLES_USER_DEFINED:
4214     c->solve = cs_user_sles_it_solver;
4215     break;
4216 
4217   default:
4218     bft_error
4219       (__FILE__, __LINE__, 0,
4220        _("Setup of linear equation on \"%s\"\n"
4221          "with solver type %d, which is not defined)."),
4222        name, (int)c->type);
4223     break;
4224   }
4225 
4226   /* Now finish */
4227 
4228   if (c->update_stats == true) {
4229     cs_timer_t t1 = cs_timer_time();
4230     c->n_setups += 1;
4231     cs_timer_counter_add_diff(&(c->t_setup), &t0, &t1);
4232   }
4233 }
4234 
4235 /*----------------------------------------------------------------------------*/
4236 /*!
4237  * \brief Call iterative sparse linear equation solver.
4238  *
4239  * \param[in, out]  context        pointer to iterative solver info and context
4240  *                                 (actual type: cs_sles_it_t  *)
4241  * \param[in]       name           pointer to system name
4242  * \param[in]       a              matrix
4243  * \param[in]       verbosity      associated verbosity
4244  * \param[in]       precision      solver precision
4245  * \param[in]       r_norm         residue normalization
4246  * \param[out]      n_iter         number of "equivalent" iterations
4247  * \param[out]      residue        residue
4248  * \param[in]       rhs            right hand side
4249  * \param[in, out]  vx             system solution
4250  * \param[in]       aux_size       size of aux_vectors (in bytes)
4251  * \param           aux_vectors    optional working area
4252  *                                 (internal allocation if NULL)
4253  *
4254  * \return  convergence state
4255  */
4256 /*----------------------------------------------------------------------------*/
4257 
4258 cs_sles_convergence_state_t
4259 cs_sles_it_solve(void                *context,
4260                  const char          *name,
4261                  const cs_matrix_t   *a,
4262                  int                  verbosity,
4263                  double               precision,
4264                  double               r_norm,
4265                  int                 *n_iter,
4266                  double              *residue,
4267                  const cs_real_t     *rhs,
4268                  cs_real_t           *vx,
4269                  size_t               aux_size,
4270                  void                *aux_vectors)
4271 {
4272   cs_sles_it_t  *c = context;
4273 
4274   cs_sles_convergence_state_t cvg = CS_SLES_ITERATING;
4275 
4276   cs_timer_t t0 = {0, 0}, t1;
4277 
4278   unsigned _n_iter = 0;
4279   cs_sles_it_convergence_t  convergence;
4280 
4281   if (c->update_stats == true)
4282     t0 = cs_timer_time();
4283 
4284   const cs_lnum_t *diag_block_size = cs_matrix_get_diag_block_size(a);
4285   const cs_lnum_t _diag_block_size = diag_block_size[0];
4286 
4287   assert(diag_block_size[0] == diag_block_size[1]);
4288 
4289   /* Initialize number of iterations and residue,
4290      check for immediate return,
4291      and solve sparse linear system */
4292 
4293   *n_iter = 0;
4294 
4295   /* Setup if not already done */
4296 
4297   if (c->setup_data == NULL) {
4298 
4299     if (c->update_stats) { /* Stop solve timer to switch to setup timer */
4300       t1 = cs_timer_time();
4301       cs_timer_counter_add_diff(&(c->t_solve), &t0, &t1);
4302     }
4303 
4304     cs_sles_it_setup(c, name, a, verbosity);
4305 
4306     if (c->update_stats) /* Restart solve timer */
4307       t0 = cs_timer_time();
4308 
4309   }
4310 
4311   if (c->pc != NULL)
4312     cs_sles_pc_set_tolerance(c->pc, precision, r_norm);
4313 
4314   /* Solve sparse linear system */
4315 
4316   cs_sles_it_convergence_init(&convergence,
4317                               name,
4318                               verbosity,
4319                               c->n_max_iter,
4320                               precision,
4321                               r_norm,
4322                               residue);
4323 
4324   c->setup_data->initial_residue = -1;
4325 
4326   if (verbosity > 1)
4327     cs_log_printf(CS_LOG_DEFAULT,
4328                   _(" RHS norm:          %11.4e\n\n"), r_norm);
4329 
4330   /* Only call solver for "active" ranks */
4331 
4332   bool local_solve = true;
4333 #if defined(HAVE_MPI)
4334   if (c->comm == MPI_COMM_NULL) {
4335     cs_lnum_t n_rows = cs_matrix_get_n_rows(a);
4336     if (n_rows == 0) {
4337       local_solve = false;
4338       cvg = CS_SLES_CONVERGED;
4339     }
4340   }
4341 #endif
4342 
4343   if (local_solve)
4344     cvg = c->solve(c,
4345                    a, _diag_block_size, &convergence,
4346                    rhs, vx,
4347                    aux_size, aux_vectors);
4348 
4349   /* Broadcast convergence info from "active" ranks to others */
4350 
4351 #if defined(HAVE_MPI)
4352   if (c->comm != c->caller_comm && c->ignore_convergence == false) {
4353     /* cvg is signed, so shift (with some margin) before copy to unsigned. */
4354     unsigned buf[2] = {(unsigned)cvg+10, convergence.n_iterations};
4355     MPI_Bcast(buf, 2, MPI_UNSIGNED, 0, c->caller_comm);
4356     MPI_Bcast(&convergence.residue, 1, MPI_DOUBLE, 0, c->caller_comm);
4357     cvg = (cs_sles_convergence_state_t)(buf[0] - 10);
4358     convergence.n_iterations = buf[1];
4359   }
4360 #endif
4361 
4362   /* Update return values */
4363 
4364   _n_iter = convergence.n_iterations;
4365 
4366   *n_iter = convergence.n_iterations;
4367   *residue = convergence.residue;
4368 
4369   cs_sles_it_type_t fallback_type = CS_SLES_N_IT_TYPES;
4370   if (cvg < c->fallback_cvg)
4371     fallback_type = CS_SLES_GMRES;
4372 
4373   if (c->update_stats == true) {
4374 
4375     t1 = cs_timer_time();
4376 
4377     c->n_solves += 1;
4378 
4379     if (fallback_type == CS_SLES_N_IT_TYPES) {
4380       if (c->n_iterations_tot == 0)
4381         c->n_iterations_min = _n_iter;
4382       else if (c->n_iterations_min > _n_iter)
4383         c->n_iterations_min = _n_iter;
4384       if (c->n_iterations_max < _n_iter)
4385         c->n_iterations_max = _n_iter;
4386     }
4387 
4388     c->n_iterations_last = _n_iter;
4389     c->n_iterations_tot += _n_iter;
4390 
4391     cs_timer_counter_add_diff(&(c->t_solve), &t0, &t1);
4392 
4393   }
4394 
4395   if (fallback_type != CS_SLES_N_IT_TYPES)
4396     cvg = _fallback(c,
4397                     fallback_type,
4398                     a,
4399                     cvg,
4400                     &convergence,
4401                     n_iter,
4402                     residue,
4403                     rhs,
4404                     vx,
4405                     aux_size,
4406                     aux_vectors);
4407 
4408   return cvg;
4409 }
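
/*
  Example (illustrative sketch): direct solve outside the cs_sles_t
  framework. The matrix a and the arrays rhs and vx are assumed to exist
  and to be consistently sized; r_norm = 1 makes the convergence test
  absolute, and setup is run automatically on the first solve.

  int n_iter = 0;
  double residue = -1.;

  cs_sles_it_t *c = cs_sles_it_create(CS_SLES_JACOBI, -1, 100, false);

  cs_sles_convergence_state_t cvg
    = cs_sles_it_solve(c, "user_system", a,
                       0,         // verbosity
                       1e-8,      // precision
                       1.0,       // r_norm
                       &n_iter, &residue,
                       rhs, vx,
                       0, NULL);  // work arrays allocated internally

  if (cvg == CS_SLES_CONVERGED)
    bft_printf("converged in %d iterations\n", n_iter);

  void *c_p = c;                  // destroy expects a void **
  cs_sles_it_destroy(&c_p);
*/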
4410 
4411 /*----------------------------------------------------------------------------*/
4412 /*!
4413  * \brief Free iterative sparse linear equation solver setup context.
4414  *
4415  * This function frees resolution-related data, such as
4416  * buffers and preconditioning but does not free the whole context,
4417  * as info used for logging (especially performance data) is maintained.
4418  *
4419  * \param[in, out]  context  pointer to iterative solver info and context
4420  *                           (actual type: cs_sles_it_t  *)
4421  */
4422 /*----------------------------------------------------------------------------*/
4423 
4424 void
4425 cs_sles_it_free(void  *context)
4426 {
4427   cs_sles_it_t  *c = context;
4428 
4429   cs_timer_t t0;
4430   if (c->update_stats == true)
4431     t0 = cs_timer_time();
4432 
4433   if (c->fallback != NULL)
4434     cs_sles_it_free(c->fallback);
4435 
4436   if (c->_pc != NULL)
4437     cs_sles_pc_free(c->_pc);
4438 
4439   if (c->setup_data != NULL) {
4440     BFT_FREE(c->setup_data->_ad_inv);
4441     BFT_FREE(c->setup_data);
4442   }
4443 
4444   if (c->update_stats == true) {
4445     cs_timer_t t1 = cs_timer_time();
4446     cs_timer_counter_add_diff(&(c->t_setup), &t0, &t1);
4447   }
4448 }
4449 
4450 /*----------------------------------------------------------------------------*/
4451 /*!
4452  * \brief Return iterative solver type.
4453  *
4454  * \param[in]  context  pointer to iterative solver info and context
4455  *
4456  * \return  selected solver type
4457  */
4458 /*----------------------------------------------------------------------------*/
4459 
4460 cs_sles_it_type_t
4461 cs_sles_it_get_type(const cs_sles_it_t  *context)
4462 {
4463   return context->type;
4464 }
4465 
4466 /*----------------------------------------------------------------------------*/
4467 /*!
4468  * \brief Return the initial residue for the previous solve operation
4469  *        with a solver.
4470  *
4471  * This is useful for convergence tests when this solver is used as
4472  * a preconditioning smoother.
4473  *
4474  * This operation is only valid between calls to \ref cs_sles_it_setup
4475  * (or \ref cs_sles_it_solve) and \ref cs_sles_it_free.
4476  * It returns -1 otherwise.
4477  *
4478  * \param[in]  context  pointer to iterative solver info and context
4479  *
4480  * \return initial residue from last call to \ref cs_sles_solve with this
4481  *         solver
4482  */
4483 /*----------------------------------------------------------------------------*/
4484 
4485 double
4486 cs_sles_it_get_last_initial_residue(const cs_sles_it_t  *context)
4487 {
4488   double retval = 1;
4489   if (context->setup_data != NULL)
4490     retval = context->setup_data->initial_residue;
4491 
4492   return retval;
4493 }
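
/*
  Example (illustrative sketch): for a context c used as a smoother,
  query the initial residue of its last solve, e.g. to feed an outer
  convergence test; valid only between setup/solve and free.

  double r0 = cs_sles_it_get_last_initial_residue(c);
  bft_printf("initial residue of last solve: %g\n", r0);
*/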
4494 
4495 /*----------------------------------------------------------------------------*/
4496 /*!
4497  * \brief Return a preconditioner context for an iterative sparse linear
4498  *        equation solver.
4499  *
4500  * This allows modifying parameters of a non default (Jacobi or polynomial)
4501  * preconditioner.
4502  *
4503  * \param[in]  context   pointer to iterative solver info and context
4504  *
4505  * \return  pointer to preconditioner context
4506  */
4507 /*----------------------------------------------------------------------------*/
4508 
4509 cs_sles_pc_t  *
4510 cs_sles_it_get_pc(cs_sles_it_t  *context)
4511 {
4512   cs_sles_pc_t  *pc = NULL;
4513 
4514   if (context != NULL) {
4515     cs_sles_it_t  *c = context;
4516     pc = c->pc;
4517   }
4518 
4519   return pc;
4520 }
4521 
4522 /*----------------------------------------------------------------------------*/
4523 /*!
4524  * \brief Assign a preconditioner to an iterative sparse linear equation
4525  *        solver, transferring its ownership to the solver context.
4526  *
4527  * This allows assigning a non default (Jacobi or polynomial) preconditioner.
4528  *
4529  * The input pointer is set to NULL to make it clear the caller does not
4530  * own the preconditioner anymore, though the context can be accessed using
4531  * \ref cs_sles_it_get_pc.
4532  *
4533  * \param[in, out]  context   pointer to iterative solver info and context
4534  * \param[in, out]  pc        pointer to preconditioner context
4535  */
4536 /*----------------------------------------------------------------------------*/
4537 
4538 void
4539 cs_sles_it_transfer_pc(cs_sles_it_t     *context,
4540                        cs_sles_pc_t    **pc)
4541 {
4542   if (context != NULL) {
4543     cs_sles_it_t  *c = context;
4544     c->pc = NULL;
4545     cs_sles_pc_destroy(&(c->_pc));
4546     if (pc != NULL) {
4547       c->_pc = *pc;
4548       c->pc = *pc;
4549     }
4550   }
4551   else if (pc != NULL)
4552     cs_sles_pc_destroy(pc);
4553 }
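
/*
  Example (illustrative sketch): replace the current preconditioner of
  solver context c with a degree-2 polynomial preconditioner; ownership
  is transferred, so the local pointer is NULL on return.

  cs_sles_pc_t *pc = cs_sles_pc_poly_2_create();
  cs_sles_it_transfer_pc(c, &pc);
  assert(pc == NULL);
*/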
4554 
4555 /*----------------------------------------------------------------------------*/
4556 /*!
4557  * \brief Copy options from one iterative sparse linear system solver info
4558  *        and context to another.
4559  *
4560  * Optional plotting contexts are shared between the source and destination
4561  * contexts.
4562  *
4563  * Preconditioner settings are to be handled separately.
4564  *
4565  * \param[in]       src   pointer to source info and context
4566  * \param[in, out]  dest  pointer to destination info and context
4567  */
4568 /*----------------------------------------------------------------------------*/
4569 
4570 void
4571 cs_sles_it_transfer_parameters(const cs_sles_it_t  *src,
4572                                cs_sles_it_t        *dest)
4573 {
4574   if (dest != NULL && src != NULL) {
4575 
4576     dest->update_stats = src->update_stats;
4577     dest->n_max_iter = src->n_max_iter;
4578     dest->restart_interval = src->restart_interval;
4579 
4580     dest->plot_time_stamp = src->plot_time_stamp;
4581     dest->plot = src->plot;
4582     if (dest->_plot != NULL)
4583       cs_time_plot_finalize(&(dest->_plot));
4584 
4585 #if defined(HAVE_MPI)
4586     dest->comm = src->comm;
4587 #endif
4588 
4589   }
4590 }
4591 
4592 /*----------------------------------------------------------------------------*/
4593 /*!
4594  * \brief Associate a similar info and context object with which some setup
4595  *        data may be shared.
4596  *
4597  * This is especially useful for sharing preconditioning data between
4598  * similar solver contexts (for example ascending and descending multigrid
4599  * smoothers based on the same matrix).
4600  *
4601  * For preconditioning data to be effectively shared, \ref cs_sles_it_setup
4602  * (or \ref cs_sles_it_solve) must be called on \p shareable before being
4603  * called on \p context (without \ref cs_sles_it_free being called in between,
4604  * of course).
4605  *
4606  * It is the caller's responsibility to ensure the context is not used
4607  * for a \ref cs_sles_it_setup or \ref cs_sles_it_solve operation after the
4608  * shareable object has been destroyed (normally by \ref cs_sles_it_destroy).
4609  *
4610  * \param[in, out]  context    pointer to iterative solver info and context
4611  * \param[in]       shareable  pointer to iterative solver info and context
4612  *                             whose context may be shared
4613  */
4614 /*----------------------------------------------------------------------------*/
4615 
4616 void
4617 cs_sles_it_set_shareable(cs_sles_it_t        *context,
4618                          const cs_sles_it_t  *shareable)
4619 {
4620   cs_sles_it_t  *c = context;
4621 
4622   c->shared = shareable;
4623 
4624   c->pc = shareable->pc;
4625 
4626   if (c->pc != c->_pc && c->_pc != NULL)
4627     cs_sles_pc_destroy(&(c->_pc));
4628 }
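
/*
  Example (illustrative sketch): share preconditioning data between two
  smoother contexts based on the same matrix a, as for multigrid descent
  and ascent smoothers; s_desc and s_asc are hypothetical contexts, and
  s_desc must be set up first for sharing to be effective.

  cs_sles_it_set_shareable(s_asc, s_desc);

  cs_sles_it_setup(s_desc, "mg_level_0", a, 0);
  cs_sles_it_setup(s_asc, "mg_level_0", a, 0);
*/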
4629 
4630 #if defined(HAVE_MPI)
4631 
4632 /*----------------------------------------------------------------------------*/
4633 /*!
4634  * \brief Set MPI communicator for global reductions.
4635  *
4636  * The system is solved only on ranks with a non-NULL communicator or
4637  * if the caller communicator has fewer than 2 ranks. Convergence info
4638  * is then broadcast across the caller communicator.
4639  *
4640  * \param[in, out]  context      pointer to iterative solver info and context
4641  * \param[in]       comm         MPI communicator for solving
4642  * \param[in]       caller_comm  MPI communicator of caller
4643  */
4644 /*----------------------------------------------------------------------------*/
4645 
4646 void
4647 cs_sles_it_set_mpi_reduce_comm(cs_sles_it_t  *context,
4648                                MPI_Comm       comm,
4649                                MPI_Comm       caller_comm)
4650 {
4651   cs_sles_it_t  *c = context;
4652 
4653   static int flag = -1;
4654 
4655   if (flag < 0)
4656     flag = cs_halo_get_use_barrier();
4657 
4658   c->comm = comm;
4659   c->caller_comm = caller_comm;
4660 
4661   if (c->caller_comm != MPI_COMM_NULL)
4662     MPI_Comm_size(c->caller_comm, &(c->caller_n_ranks));
4663   else
4664     c->caller_n_ranks = 1;
4665 
4666   if (comm != cs_glob_mpi_comm)
4667     cs_halo_set_use_barrier(0);
4668   else {
4669     cs_halo_set_use_barrier(flag);
4670     if (cs_glob_n_ranks < 2)
4671       c->comm = MPI_COMM_NULL;
4672   }
4673 }
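
/*
  Example (illustrative sketch): restrict reductions of solver context c
  to a sub-communicator, for example one built by a coarsening algorithm,
  while broadcasting convergence info over the full caller communicator;
  sub_comm is a hypothetical, previously built MPI communicator.

  cs_sles_it_set_mpi_reduce_comm(c, sub_comm, cs_glob_mpi_comm);
*/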
4674 
4675 #endif /* defined(HAVE_MPI) */
4676 
4677 /*----------------------------------------------------------------------------*/
4678 /*!
4679  * \brief Assign ordering to iterative solver.
4680  *
4681  * The solver context takes ownership of the order array (i.e. it will
4682  * handle its later deallocation).
4683  *
4684  * This is useful only for Process-local Gauss-Seidel.
4685  *
4686  * \param[in, out]  context  pointer to iterative solver info and context
4687  * \param[in, out]  order    pointer to ordering array
4688  */
4689 /*----------------------------------------------------------------------------*/
4690 
4691 void
4692 cs_sles_it_assign_order(cs_sles_it_t   *context,
4693                         cs_lnum_t     **order)
4694 {
4695   if (context->type != CS_SLES_P_GAUSS_SEIDEL)
4696     BFT_FREE(*order);
4697 
4698   else {
4699 
4700     if (context->add_data == NULL) {
4701       BFT_MALLOC(context->add_data, 1, cs_sles_it_add_t);
4702       context->add_data->order = NULL;
4703     }
4704 
4705     BFT_FREE(context->add_data->order);
4706 
4707     context->add_data->order = *order;
4708 
4709     *order = NULL;
4710 
4711   }
4712 }
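
/*
  Example (illustrative sketch): assign a precomputed traversal order to
  a process-local Gauss-Seidel context c; the context takes ownership of
  the array, so the local pointer is NULL on return. An identity
  permutation over n_rows local rows stands in for a real ordering.

  cs_lnum_t *order = NULL;
  BFT_MALLOC(order, n_rows, cs_lnum_t);
  for (cs_lnum_t i = 0; i < n_rows; i++)
    order[i] = i;

  cs_sles_it_assign_order(c, &order);  // order == NULL afterwards
*/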
4713 
4714 /*----------------------------------------------------------------------------*/
4715 /*!
4716  * \brief Define convergence level under which the fallback to another
4717  *        solver may be used if applicable.
4718  *
4719  * Currently, this mechanism is used by default only for BiCGstab and
4720  * 3-layer conjugate residual solvers with scalar matrices, which may
4721  * fall back to a preconditioned GMRES solver. For those solvers, the
4722  * default threshold is \ref CS_SLES_BREAKDOWN, meaning that divergence
4723  * (but not breakdown) will lead to the use of the fallback mechanism.
4724  *
4725  * \param[in, out]  context    pointer to iterative solver info and context
4726  * \param[in]       threshold  convergence level under which fallback is used
4727  */
4728 /*----------------------------------------------------------------------------*/
4729 
4730 void
4731 cs_sles_it_set_fallback_threshold(cs_sles_it_t                 *context,
4732                                   cs_sles_convergence_state_t   threshold)
4733 {
4734   context->fallback_cvg = threshold;
4735 }
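
/*
  Example (illustrative sketch): for a BiCGstab context c, also trigger
  the GMRES fallback on breakdown, not only on divergence, by raising
  the threshold one level.

  cs_sles_it_set_fallback_threshold(c, CS_SLES_MAX_ITERATION);
*/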
4736 
4737 /*----------------------------------------------------------------------------*/
4738 /*!
4739  * \brief Define the number of iterations to be done before restarting the
4740  *        solver. Useful only for GCR or GMRES algorithms.
4741  *
4742  * \param[in, out]  context    pointer to iterative solver info and context
4743  * \param[in]       interval   number of iterations before each restart
4744  */
4745 /*----------------------------------------------------------------------------*/
4746 
4747 void
4748 cs_sles_it_set_restart_interval(cs_sles_it_t                 *context,
4749                                 int                           interval)
4750 {
4751   if (context == NULL)
4752     return;
4753 
4754   context->restart_interval = interval;
4755 }
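
/*
  Example (illustrative sketch): have a GMRES or GCR context c restart
  every 40 iterations instead of the default 20, trading memory and
  per-iteration cost for potentially faster convergence.

  cs_sles_it_set_restart_interval(c, 40);
*/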
4756 
4757 /*----------------------------------------------------------------------------*/
4758 /*!
4759  * \brief Query mean number of rows under which Conjugate Gradient algorithm
4760  *        uses the single-reduction variant.
4761  *
4762  * The single-reduction variant requires only one parallel sum per
4763  * iteration (instead of 2), at the cost of additional vector operations,
4764  * so it tends to be more expensive when the number of matrix rows per
4765  * MPI rank is high, then becomes cheaper when the MPI latency cost becomes
4766  * more significant.
4767  *
4768  * This option is ignored for non-parallel runs, so 0 is returned.
4769  *
4770  * \returns  mean number of rows per active rank under which the
4771  *           single-reduction variant will be used
4772  */
4773 /*----------------------------------------------------------------------------*/
4774 
4775 cs_lnum_t
4776 cs_sles_it_get_pcg_single_reduction(void)
4777 {
4778 #if defined(HAVE_MPI)
4779   return _pcg_sr_threshold;
4780 #else
4781   return 0;
4782 #endif
4783 }
4784 
4785 /*----------------------------------------------------------------------------*/
4786 /*!
4787  * \brief Set mean number of rows under which Conjugate Gradient algorithm
4788  *        should use the single-reduction variant.
4789  *
4790  * The single-reduction variant requires only one parallel sum per
4791  * iteration (instead of 2), at the cost of additional vector operations,
4792  * so it tends to be more expensive when the number of matrix rows per
4793  * MPI rank is high, then becomes cheaper when the MPI latency cost becomes
4794  * more significant.
4795  *
4796  * This option is ignored for non-parallel runs.
4797  *
4798  * \param[in]  threshold  mean number of rows per active rank under which the
4799  *             single-reduction variant will be used
4800  */
4801 /*----------------------------------------------------------------------------*/
4802 
4803 void
4804 cs_sles_it_set_pcg_single_reduction(cs_lnum_t  threshold)
4805 {
4806 #if defined(HAVE_MPI)
4807   _pcg_sr_threshold = threshold;
4808 #endif
4809 }
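
/*
  Example (illustrative sketch): switch to the single-reduction PCG
  variant as soon as the mean local system size drops below 2048 rows
  per rank, e.g. for latency-bound runs on many nodes.

  cs_sles_it_set_pcg_single_reduction(2048);
*/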

/*----------------------------------------------------------------------------*/
/*!
 * \brief Log the current global settings relative to parallelism.
 */
/*----------------------------------------------------------------------------*/

void
cs_sles_it_log_parallel_options(void)
{
#if defined(HAVE_MPI)
  if (cs_glob_n_ranks > 1)
    cs_log_printf(CS_LOG_SETUP,
                  _("\n"
                    "Iterative linear solvers parallel parameters:\n"
                    "  PCG single-reduction threshold:     %ld\n"),
                  (long)_pcg_sr_threshold);
#endif
}

/*----------------------------------------------------------------------------*/
/*!
 * \brief Error handler for iterative sparse linear equation solver.
 *
 * In case of divergence or breakdown, this error handler outputs
 * postprocessing data to assist debugging, then aborts the run.
 * It does nothing in case the maximum iteration count is reached.
 *
 * \param[in, out]  sles           pointer to solver object
 * \param[in]       state          convergence state
 * \param[in]       a              matrix
 * \param[in]       rhs            right hand side
 * \param[in, out]  vx             system solution
 *
 * \return  false (do not attempt new solve)
 */
/*----------------------------------------------------------------------------*/

bool
cs_sles_it_error_post_and_abort(cs_sles_t                    *sles,
                                cs_sles_convergence_state_t   state,
                                const cs_matrix_t            *a,
                                const cs_real_t              *rhs,
                                cs_real_t                    *vx)
{
  /* Reaching the maximum iteration count is not considered an error here;
     only divergence and breakdown (state < CS_SLES_MAX_ITERATION) lead to
     postprocessing output and abort. */

  if (state >= CS_SLES_MAX_ITERATION)
    return false;

  const cs_sles_it_t  *c = cs_sles_get_context(sles);
  const char *name = cs_sles_get_name(sles);

  int mesh_id = cs_post_init_error_writer_cells();

  cs_sles_post_error_output_def(name,
                                mesh_id,
                                a,
                                rhs,
                                vx);

  cs_post_finalize();

  const char *error_type[] = {N_("divergence"), N_("breakdown")};
  int err_id = (state == CS_SLES_BREAKDOWN) ? 1 : 0;

  bft_error(__FILE__, __LINE__, 0,
            _("%s: error (%s) solving for %s"),
            _(cs_sles_it_type_name[c->type]),
            _(error_type[err_id]),
            name);

  return false;
}
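
/*
  Usage sketch (illustrative only): this handler is not installed by
  default; it may be associated with a solver through the generic cs_sles
  API, assuming a cs_sles_t object obtained for example with
  cs_sles_find_or_add ("f_id" again stands for a hypothetical field id):

    cs_sles_t *sc = cs_sles_find_or_add(f_id, NULL);
    cs_sles_set_error_handler(sc, cs_sles_it_error_post_and_abort);
*/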

/*----------------------------------------------------------------------------*/
/*!
 * \brief Set plotting options for an iterative sparse linear equation solver.
 *
 * \param[in, out]  context        pointer to iterative solver info and context
 * \param[in]       base_name      base plot name to activate, NULL otherwise
 * \param[in]       use_iteration  if true, use iteration as time stamp;
 *                                 otherwise, use wall clock time
 */
/*----------------------------------------------------------------------------*/

void
cs_sles_it_set_plot_options(cs_sles_it_t  *context,
                            const char    *base_name,
                            bool           use_iteration)
{
  if (context != NULL) {
    if (cs_glob_rank_id < 1 && base_name != NULL) {

      /* Destroy previous plot if options reset */
      if (context->_plot != NULL)
        cs_time_plot_finalize(&(context->_plot));

      /* Create new plot */
      cs_file_mkdir_default("monitoring");
      const char *probe_names[] = {base_name};
      context->_plot = cs_time_plot_init_probe(base_name,
                                               "monitoring/residue_",
                                               CS_TIME_PLOT_CSV,
                                               use_iteration,
                                               -1.0,  /* force flush */
                                               0,     /* no buffer */
                                               1,     /* n_probes */
                                               NULL,  /* probe_list */
                                               NULL,  /* probe_coords */
                                               probe_names);
      context->plot = context->_plot;
      context->plot_time_stamp = 0;
    }
  }
}
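
/*
  Usage sketch (illustrative only): activate residual plotting for an
  iterative solver context "c" (defined as in the sketches above); CSV
  files are then written under the "monitoring" subdirectory, using the
  iteration number as time stamp:

    cs_sles_it_set_plot_options(c, "pressure", true);
*/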

/*----------------------------------------------------------------------------*/

END_C_DECLS