1 /******************************************************************************
2  * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other
3  * HYPRE Project Developers. See the top-level COPYRIGHT file for details.
4  *
5  * SPDX-License-Identifier: (Apache-2.0 OR MIT)
6  ******************************************************************************/
7 
8 #include "_hypre_parcsr_ls.h"
9 #include "float.h"
10 #include "ams.h"
11 #include "_hypre_utilities.hpp"
12 
13 /*--------------------------------------------------------------------------
14  * hypre_ParCSRRelax
15  *
16  * Relaxation on the ParCSR matrix A with right-hand side f and
17  * initial guess u. Possible values for relax_type are:
18  *
19  * 1 = l1-scaled (or weighted) Jacobi
20  * 2 = l1-scaled block Gauss-Seidel/SSOR
21  * 3 = Kaczmarz
22  * 4 = truncated version of 2 (Remark 6.2 in smoothers paper)
23  * x = BoomerAMG relaxation with relax_type = |x|
24  * (16 = Cheby)
25  *
26  * The default value of relax_type is 2.
27  *--------------------------------------------------------------------------*/
28 
HYPRE_Int
hypre_ParCSRRelax( hypre_ParCSRMatrix *A,              /* matrix to relax with */
                   hypre_ParVector    *f,              /* right-hand side */
                   HYPRE_Int           relax_type,     /* relaxation type (see header comment) */
                   HYPRE_Int           relax_times,    /* number of sweeps */
                   HYPRE_Real         *l1_norms,       /* l1 norms of the rows of A */
                   HYPRE_Real          relax_weight,   /* damping coefficient (usually <= 1) */
                   HYPRE_Real          omega,          /* SOR parameter (usually in (0,2)) */
                   HYPRE_Real          max_eig_est,    /* for cheby smoothers */
                   HYPRE_Real          min_eig_est,
                   HYPRE_Int           cheby_order,
                   HYPRE_Real          cheby_fraction,
                   hypre_ParVector    *u,              /* initial/updated approximation */
                   hypre_ParVector    *v,              /* temporary vector */
                   hypre_ParVector    *z               /* temporary vector */ )
{
   HYPRE_Int sweep;

   for (sweep = 0; sweep < relax_times; sweep++)
   {
      if (relax_type == 1) /* l1-scaled Jacobi */
      {
         /* BoomerAMG relax type 7 with the supplied l1_norms acts as
          * l1-scaled (weighted) Jacobi; omega argument is fixed at 1.0 */
         hypre_BoomerAMGRelax(A, f, NULL, 7, 0, relax_weight, 1.0, l1_norms, u, v, z);
      }
      else if (relax_type == 2 || relax_type == 4) /* offd-l1-scaled block GS */
      {
         /* !!! Note: relax_weight and omega flipped !!! */
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
         HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) );
         if (exec == HYPRE_EXEC_DEVICE)
         {
            hypre_BoomerAMGRelaxHybridGaussSeidelDevice(A, f, NULL, 0, omega, relax_weight, l1_norms, u, v, z,
                                                        1,  1 /* symm */);
         }
         else
#endif
         {
            hypre_BoomerAMGRelaxHybridGaussSeidel_core(A, f, NULL, 0, omega, relax_weight, l1_norms, u, v, z,
                                                       1, 1 /* symm */, 0 /* skip diag */, 1, 0);
         }
      }
      else if (relax_type == 3) /* Kaczmarz */
      {
         /* BoomerAMG relax type 20 is the Kaczmarz sweep */
         hypre_BoomerAMGRelax(A, f, NULL, 20, 0, relax_weight, omega, l1_norms, u, v, z);
      }
      else /* call BoomerAMG relaxation */
      {
         if (relax_type == 16)
         {
            /* Chebyshev smoothing using the provided eigenvalue estimates */
            hypre_ParCSRRelax_Cheby(A, f, max_eig_est, min_eig_est, cheby_fraction, cheby_order, 1,
                                    0, u, v, z);
         }
         else
         {
            /* negative relax_type maps to BoomerAMG relaxation |relax_type| */
            hypre_BoomerAMGRelax(A, f, NULL, hypre_abs(relax_type), 0, relax_weight,
                                 omega, l1_norms, u, v, z);
         }
      }
   }

   return hypre_error_flag;
}
91 
92 /*--------------------------------------------------------------------------
93  * hypre_ParVectorInRangeOf
94  *
95  * Return a vector that belongs to the range of a given matrix.
96  *--------------------------------------------------------------------------*/
97 
hypre_ParVectorInRangeOf(hypre_ParCSRMatrix * A)98 hypre_ParVector *hypre_ParVectorInRangeOf(hypre_ParCSRMatrix *A)
99 {
100    hypre_ParVector *x;
101 
102    x = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A),
103                              hypre_ParCSRMatrixGlobalNumRows(A),
104                              hypre_ParCSRMatrixRowStarts(A));
105    hypre_ParVectorInitialize(x);
106    hypre_ParVectorOwnsData(x) = 1;
107 
108    return x;
109 }
110 
111 /*--------------------------------------------------------------------------
112  * hypre_ParVectorInDomainOf
113  *
114  * Return a vector that belongs to the domain of a given matrix.
115  *--------------------------------------------------------------------------*/
116 
hypre_ParVectorInDomainOf(hypre_ParCSRMatrix * A)117 hypre_ParVector *hypre_ParVectorInDomainOf(hypre_ParCSRMatrix *A)
118 {
119    hypre_ParVector *x;
120 
121    x = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A),
122                              hypre_ParCSRMatrixGlobalNumCols(A),
123                              hypre_ParCSRMatrixColStarts(A));
124    hypre_ParVectorInitialize(x);
125    hypre_ParVectorOwnsData(x) = 1;
126 
127    return x;
128 }
129 
130 /*--------------------------------------------------------------------------
131  * hypre_ParVectorBlockSplit
132  *
133  * Extract the dim sub-vectors x_0,...,x_{dim-1} composing a parallel
134  * block vector x. It is assumed that &x[i] = [x_0[i],...,x_{dim-1}[i]].
135  *--------------------------------------------------------------------------*/
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
/* Device kernel shared by hypre_ParVectorBlockSplit (dir == 0) and
 * hypre_ParVectorBlockGather (dir == 1). The interleaved block vector x
 * stores &x[i] = [x_0[i],...,x_{dim-1}[i]]; x0/x1/x2 point to the (up to
 * three) component vectors. One thread handles one scalar entry. */
template<HYPRE_Int dir>
__global__ void
hypreCUDAKernel_ParVectorBlockSplitGather(HYPRE_Int   size,   /* local length of each component */
                                          HYPRE_Int   dim,    /* number of components (<= 3) */
                                          HYPRE_Real *x0,
                                          HYPRE_Real *x1,
                                          HYPRE_Real *x2,
                                          HYPRE_Real *x)      /* interleaved vector, length size*dim */
{
   const HYPRE_Int i = hypre_cuda_get_grid_thread_id<1,1>();

   /* one thread per entry of the interleaved vector */
   if (i >= size * dim)
   {
      return;
   }

   HYPRE_Real *xx[3];

   xx[0] = x0;
   xx[1] = x1;
   xx[2] = x2;

   /* entry i of x belongs to component d at local index k */
   const HYPRE_Int d = i % dim;
   const HYPRE_Int k = i / dim;

   if (dir == 0)
   {
      /* split: scatter interleaved x into the component vectors */
      xx[d][k] = x[i];
   }
   else if (dir == 1)
   {
      /* gather: pack the component vectors into interleaved x */
      x[i] = xx[d][k];
   }
}
#endif
172 
/* Extract the dim (<= 3) interleaved components of block vector x into
 * x_[0..dim-1]; see the header comment above. x_[d] must already be
 * allocated with the component size. */
HYPRE_Int hypre_ParVectorBlockSplit(hypre_ParVector *x,
                                    hypre_ParVector *x_[3],
                                    HYPRE_Int dim)
{
   HYPRE_Int i, d, size_;
   HYPRE_Real *x_data, *x_data_[3];

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
   HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParVectorMemoryLocation(x) );
#endif

   /* local length of one component (all components share it) */
   size_ = hypre_VectorSize(hypre_ParVectorLocalVector(x_[0]));

   x_data = hypre_VectorData(hypre_ParVectorLocalVector(x));
   for (d = 0; d < dim; d++)
      x_data_[d] = hypre_VectorData(hypre_ParVectorLocalVector(x_[d]));

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
   if (exec == HYPRE_EXEC_DEVICE)
   {
      /* dir == 0: split interleaved x into the component arrays */
      dim3 bDim = hypre_GetDefaultCUDABlockDimension();
      dim3 gDim = hypre_GetDefaultCUDAGridDimension(size_ * dim, "thread", bDim);
      HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<0>, gDim, bDim,
                         size_, dim, x_data_[0], x_data_[1], x_data_[2], x_data);
   }
   else
#endif
   {
      /* host path: &x[i] = [x_0[i],...,x_{dim-1}[i]] */
      for (i = 0; i < size_; i++)
         for (d = 0; d < dim; d++)
            x_data_[d][i] = x_data[dim*i+d];
   }

   return hypre_error_flag;
}
208 
209 /*--------------------------------------------------------------------------
210  * hypre_ParVectorBlockGather
211  *
212  * Compose a parallel block vector x from dim given sub-vectors
213  * x_0,...,x_{dim-1}, such that &x[i] = [x_0[i],...,x_{dim-1}[i]].
214  *--------------------------------------------------------------------------*/
215 
/* Inverse of hypre_ParVectorBlockSplit: interleave the dim (<= 3)
 * component vectors x_[0..dim-1] back into the block vector x. */
HYPRE_Int hypre_ParVectorBlockGather(hypre_ParVector *x,
                                     hypre_ParVector *x_[3],
                                     HYPRE_Int dim)
{
   HYPRE_Int i, d, size_;
   HYPRE_Real *x_data, *x_data_[3];

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
   HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParVectorMemoryLocation(x) );
#endif

   /* local length of one component (all components share it) */
   size_ = hypre_VectorSize(hypre_ParVectorLocalVector(x_[0]));

   x_data = hypre_VectorData(hypre_ParVectorLocalVector(x));
   for (d = 0; d < dim; d++)
      x_data_[d] = hypre_VectorData(hypre_ParVectorLocalVector(x_[d]));

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
   if (exec == HYPRE_EXEC_DEVICE)
   {
      /* dir == 1: gather component arrays into interleaved x */
      dim3 bDim = hypre_GetDefaultCUDABlockDimension();
      dim3 gDim = hypre_GetDefaultCUDAGridDimension(size_ * dim, "thread", bDim);
      HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<1>, gDim, bDim,
                         size_, dim, x_data_[0], x_data_[1], x_data_[2], x_data);
   }
   else
#endif
   {
      /* host path: &x[i] = [x_0[i],...,x_{dim-1}[i]] */
      for (i = 0; i < size_; i++)
         for (d = 0; d < dim; d++)
            x_data[dim*i+d] = x_data_[d][i];
   }

   return hypre_error_flag;
}
251 
252 /*--------------------------------------------------------------------------
253  * hypre_BoomerAMGBlockSolve
254  *
255  * Apply the block-diagonal solver diag(B) to the system diag(A) x = b.
256  * Here B is a given BoomerAMG solver for A, while x and b are "block"
257  * parallel vectors.
258  *--------------------------------------------------------------------------*/
259 
hypre_BoomerAMGBlockSolve(void * B,hypre_ParCSRMatrix * A,hypre_ParVector * b,hypre_ParVector * x)260 HYPRE_Int hypre_BoomerAMGBlockSolve(void *B,
261                                     hypre_ParCSRMatrix *A,
262                                     hypre_ParVector *b,
263                                     hypre_ParVector *x)
264 {
265    HYPRE_Int d, dim = 1;
266 
267    hypre_ParVector *b_[3];
268    hypre_ParVector *x_[3];
269 
270    dim = hypre_ParVectorGlobalSize(x) / hypre_ParCSRMatrixGlobalNumRows(A);
271 
272    if (dim == 1)
273    {
274       hypre_BoomerAMGSolve(B, A, b, x);
275       return hypre_error_flag;
276    }
277 
278    for (d = 0; d < dim; d++)
279    {
280       b_[d] = hypre_ParVectorInRangeOf(A);
281       x_[d] = hypre_ParVectorInRangeOf(A);
282    }
283 
284    hypre_ParVectorBlockSplit(b, b_, dim);
285    hypre_ParVectorBlockSplit(x, x_, dim);
286 
287    for (d = 0; d < dim; d++)
288       hypre_BoomerAMGSolve(B, A, b_[d], x_[d]);
289 
290    hypre_ParVectorBlockGather(x, x_, dim);
291 
292    for (d = 0; d < dim; d++)
293    {
294       hypre_ParVectorDestroy(b_[d]);
295       hypre_ParVectorDestroy(x_[d]);
296    }
297 
298    return hypre_error_flag;
299 }
300 
301 /*--------------------------------------------------------------------------
302  * hypre_ParCSRMatrixFixZeroRows
303  *
304  * For every zero row in the matrix: set the diagonal element to 1.
305  *--------------------------------------------------------------------------*/
306 
hypre_ParCSRMatrixFixZeroRowsHost(hypre_ParCSRMatrix * A)307 HYPRE_Int hypre_ParCSRMatrixFixZeroRowsHost(hypre_ParCSRMatrix *A)
308 {
309    HYPRE_Int i, j;
310    HYPRE_Real l1_norm;
311    HYPRE_Int num_rows = hypre_ParCSRMatrixNumRows(A);
312 
313    hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
314    HYPRE_Int *A_diag_I = hypre_CSRMatrixI(A_diag);
315    HYPRE_Int *A_diag_J = hypre_CSRMatrixJ(A_diag);
316    HYPRE_Real *A_diag_data = hypre_CSRMatrixData(A_diag);
317 
318    hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
319    HYPRE_Int *A_offd_I = hypre_CSRMatrixI(A_offd);
320    HYPRE_Real *A_offd_data = hypre_CSRMatrixData(A_offd);
321    HYPRE_Int num_cols_offd = hypre_CSRMatrixNumCols(A_offd);
322 
323    /* a row will be considered zero if its l1 norm is less than eps */
324    HYPRE_Real eps = 0.0; /* DBL_EPSILON * 1e+4; */
325 
326    for (i = 0; i < num_rows; i++)
327    {
328       l1_norm = 0.0;
329       for (j = A_diag_I[i]; j < A_diag_I[i+1]; j++)
330          l1_norm += fabs(A_diag_data[j]);
331       if (num_cols_offd)
332          for (j = A_offd_I[i]; j < A_offd_I[i+1]; j++)
333             l1_norm += fabs(A_offd_data[j]);
334 
335       if (l1_norm <= eps)
336       {
337          for (j = A_diag_I[i]; j < A_diag_I[i+1]; j++)
338             if (A_diag_J[j] == i)
339                A_diag_data[j] = 1.0;
340             else
341                A_diag_data[j] = 0.0;
342          if (num_cols_offd)
343             for (j = A_offd_I[i]; j < A_offd_I[i+1]; j++)
344                A_offd_data[j] = 0.0;
345       }
346    }
347 
348    return hypre_error_flag;
349 }
350 
351 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
/* Device kernel: one warp per row. Compute the row's l1 norm; if it is
 * (numerically) zero, overwrite the row with the identity row. */
__global__ void
hypreCUDAKernel_ParCSRMatrixFixZeroRows( HYPRE_Int      nrows,
                                         HYPRE_Int     *A_diag_i,
                                         HYPRE_Int     *A_diag_j,
                                         HYPRE_Complex *A_diag_data,
                                         HYPRE_Int     *A_offd_i,
                                         HYPRE_Complex *A_offd_data,
                                         HYPRE_Int      num_cols_offd)
{
   HYPRE_Int row_i = hypre_cuda_get_grid_warp_id<1,1>();

   if (row_i >= nrows)
   {
      return;
   }

   HYPRE_Int lane = hypre_cuda_get_lane_id<1>();
   /* a row is considered zero if its l1 norm is at most eps */
   HYPRE_Real eps = 0.0; /* DBL_EPSILON * 1e+4; */
   HYPRE_Real l1_norm = 0.0;
   HYPRE_Int p1, q1, p2 = 0, q2 = 0;

   /* lanes 0 and 1 load the row's begin/end offsets; the values are
    * broadcast to the whole warp by the shuffles below, so the other
    * lanes never read their own (uninitialized) p1 */
   if (lane < 2)
   {
      p1 = read_only_load(A_diag_i + row_i + lane);
      if (num_cols_offd)
      {
         p2 = read_only_load(A_offd_i + row_i + lane);
      }
   }

   q1 = __shfl_sync(HYPRE_WARP_FULL_MASK, p1, 1);
   p1 = __shfl_sync(HYPRE_WARP_FULL_MASK, p1, 0);
   if (num_cols_offd)
   {
      q2 = __shfl_sync(HYPRE_WARP_FULL_MASK, p2, 1);
      p2 = __shfl_sync(HYPRE_WARP_FULL_MASK, p2, 0);
   }

   /* each lane accumulates a partial l1 norm over the diag part ... */
   for (HYPRE_Int j = p1 + lane; j < q1; j += HYPRE_WARP_SIZE)
   {
      l1_norm += fabs(A_diag_data[j]);
   }

   /* ... and the offd part (p2 == q2 == 0 when there is no offd part) */
   for (HYPRE_Int j = p2 + lane; j < q2; j += HYPRE_WARP_SIZE)
   {
      l1_norm += fabs(A_offd_data[j]);
   }

   /* combine partial sums across the warp */
   l1_norm = warp_allreduce_sum(l1_norm);

   if (l1_norm <= eps)
   {
      /* zero row: set diagonal entry to 1, everything else to 0 */
      for (HYPRE_Int j = p1 + lane; j < q1; j += HYPRE_WARP_SIZE)
      {
         if (row_i == read_only_load(&A_diag_j[j]))
         {
            A_diag_data[j] = 1.0;
         }
         else
         {
            A_diag_data[j] = 0.0;
         }
      }

      for (HYPRE_Int j = p2 + lane; j < q2; j += HYPRE_WARP_SIZE)
      {
         A_offd_data[j] = 0.0;
      }
   }
}
422 
/* Device driver for hypre_ParCSRMatrixFixZeroRows: launches the
 * warp-per-row kernel above over all local rows of A. */
HYPRE_Int hypre_ParCSRMatrixFixZeroRowsDevice(hypre_ParCSRMatrix *A)
{
   HYPRE_Int        nrows         = hypre_ParCSRMatrixNumRows(A);
   hypre_CSRMatrix *A_diag        = hypre_ParCSRMatrixDiag(A);
   HYPRE_Real      *A_diag_data   = hypre_CSRMatrixData(A_diag);
   HYPRE_Int       *A_diag_i      = hypre_CSRMatrixI(A_diag);
   HYPRE_Int       *A_diag_j      = hypre_CSRMatrixJ(A_diag);
   hypre_CSRMatrix *A_offd        = hypre_ParCSRMatrixOffd(A);
   HYPRE_Real      *A_offd_data   = hypre_CSRMatrixData(A_offd);
   HYPRE_Int       *A_offd_i      = hypre_CSRMatrixI(A_offd);
   HYPRE_Int        num_cols_offd = hypre_CSRMatrixNumCols(A_offd);
   dim3             bDim, gDim;

   /* grid sized so that each row gets one warp */
   bDim = hypre_GetDefaultCUDABlockDimension();
   gDim = hypre_GetDefaultCUDAGridDimension(nrows, "warp", bDim);

   HYPRE_CUDA_LAUNCH(hypreCUDAKernel_ParCSRMatrixFixZeroRows, gDim, bDim,
                     nrows, A_diag_i, A_diag_j, A_diag_data, A_offd_i, A_offd_data, num_cols_offd);

   //hypre_SyncCudaComputeStream(hypre_handle());

   return hypre_error_flag;
}
446 #endif
447 
/* Dispatch to the device or host implementation depending on where
 * the matrix data lives. */
HYPRE_Int hypre_ParCSRMatrixFixZeroRows(hypre_ParCSRMatrix *A)
{
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
   HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) );
   if (exec == HYPRE_EXEC_DEVICE)
   {
      return hypre_ParCSRMatrixFixZeroRowsDevice(A);
   }
   else
#endif
   {
      return hypre_ParCSRMatrixFixZeroRowsHost(A);
   }
}
462 
463 /*--------------------------------------------------------------------------
464  * hypre_ParCSRComputeL1Norms
465  *
466  * Compute the l1 norms of the rows of a given matrix, depending on
467  * the option parameter:
468  *
469  * option 1 = Compute the l1 norm of the rows
470  * option 2 = Compute the l1 norm of the (processor) off-diagonal
471  *            part of the rows plus the diagonal of A
472  * option 3 = Compute the l2 norm^2 of the rows
473  * option 4 = Truncated version of option 2 based on Remark 6.2 in "Multigrid
474  *            Smoothers for Ultra-Parallel Computing"
475  *
476  * The above computations are done in a CF manner, whenever the provided
477  * cf_marker is not NULL.
478  *--------------------------------------------------------------------------*/
479 
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
/* Thrust functor implementing the option-4 truncation (Remark 6.2):
 * given the accumulated norm x and the diagonal value y, keep y when
 * x <= 4/3 * y, otherwise keep x. */
struct l1_norm_op1 : public thrust::binary_function<HYPRE_Complex, HYPRE_Complex, HYPRE_Complex>
{
   __host__ __device__
   HYPRE_Complex operator()(HYPRE_Complex &x, HYPRE_Complex &y) const
   {
      return x <= 4.0/3.0 * y ? y : x;
   }
};
#endif
490 
hypre_ParCSRComputeL1Norms(hypre_ParCSRMatrix * A,HYPRE_Int option,HYPRE_Int * cf_marker,HYPRE_Real ** l1_norm_ptr)491 HYPRE_Int hypre_ParCSRComputeL1Norms(hypre_ParCSRMatrix  *A,
492                                      HYPRE_Int            option,
493                                      HYPRE_Int           *cf_marker,
494                                      HYPRE_Real         **l1_norm_ptr)
495 {
496    HYPRE_Int i;
497    HYPRE_Int num_rows = hypre_ParCSRMatrixNumRows(A);
498    hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
499    hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
500    HYPRE_Int num_cols_offd = hypre_CSRMatrixNumCols(A_offd);
501 
502    HYPRE_MemoryLocation memory_location_l1 = hypre_ParCSRMatrixMemoryLocation(A);
503 
504    HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( memory_location_l1 );
505 
506    if (exec == HYPRE_EXEC_HOST)
507    {
508       HYPRE_Int num_threads = hypre_NumThreads();
509       if (num_threads > 1)
510       {
511          return hypre_ParCSRComputeL1NormsThreads(A, option, num_threads, cf_marker, l1_norm_ptr);
512       }
513    }
514 
515    HYPRE_Real *l1_norm = hypre_TAlloc(HYPRE_Real, num_rows, memory_location_l1);
516 
517    HYPRE_MemoryLocation memory_location_tmp = exec == HYPRE_EXEC_HOST ? HYPRE_MEMORY_HOST : HYPRE_MEMORY_DEVICE;
518    HYPRE_Real *diag_tmp = NULL;
519 
520    HYPRE_Int *cf_marker_offd = NULL;
521 
522    /* collect the cf marker data from other procs */
523    if (cf_marker != NULL)
524    {
525       HYPRE_Int num_sends;
526       HYPRE_Int *int_buf_data = NULL;
527 
528       hypre_ParCSRCommPkg  *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
529       hypre_ParCSRCommHandle *comm_handle;
530 
531       if (num_cols_offd)
532       {
533          cf_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_offd, memory_location_tmp);
534       }
535       num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
536       if (hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends))
537       {
538          int_buf_data = hypre_CTAlloc(HYPRE_Int, hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
539                                       memory_location_tmp);
540       }
541 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
542       if (exec == HYPRE_EXEC_DEVICE)
543       {
544          hypre_ParCSRCommPkgCopySendMapElmtsToDevice(comm_pkg);
545          HYPRE_THRUST_CALL( gather,
546                             hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg),
547                             hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg) + hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
548                             cf_marker,
549                             int_buf_data );
550       }
551       else
552 #endif
553       {
554          HYPRE_Int index = 0;
555          HYPRE_Int start;
556          HYPRE_Int j;
557          for (i = 0; i < num_sends; i++)
558          {
559             start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
560             for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
561             {
562                int_buf_data[index++] = cf_marker[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
563             }
564          }
565       }
566 
567       comm_handle = hypre_ParCSRCommHandleCreate_v2(11, comm_pkg, memory_location_tmp, int_buf_data,
568                                                      memory_location_tmp, cf_marker_offd);
569       hypre_ParCSRCommHandleDestroy(comm_handle);
570       hypre_TFree(int_buf_data, memory_location_tmp);
571    }
572 
573    if (option == 1)
574    {
575       /* Set the l1 norm of the diag part */
576       hypre_CSRMatrixComputeRowSum(A_diag, cf_marker, cf_marker, l1_norm, 1, 1.0, "set");
577 
578       /* Add the l1 norm of the offd part */
579       if (num_cols_offd)
580       {
581          hypre_CSRMatrixComputeRowSum(A_offd, cf_marker, cf_marker_offd, l1_norm, 1, 1.0, "add");
582       }
583    }
584    else if (option == 2)
585    {
586       /* Set the abs(diag) element */
587       hypre_CSRMatrixExtractDiagonal(A_diag, l1_norm, 1);
588       /* Add the l1 norm of the offd part */
589       if (num_cols_offd)
590       {
591          hypre_CSRMatrixComputeRowSum(A_offd, cf_marker, cf_marker, l1_norm, 1, 1.0, "add");
592       }
593    }
594    else if (option == 3)
595    {
596       /* Set the CF l2 norm of the diag part */
597       hypre_CSRMatrixComputeRowSum(A_diag, NULL, NULL, l1_norm, 2, 1.0, "set");
598       /* Add the CF l2 norm of the offd part */
599       if (num_cols_offd)
600       {
601          hypre_CSRMatrixComputeRowSum(A_offd, NULL, NULL, l1_norm, 2, 1.0, "add");
602       }
603    }
604    else if (option == 4)
605    {
606       /* Set the abs(diag) element */
607       hypre_CSRMatrixExtractDiagonal(A_diag, l1_norm, 1);
608 
609       diag_tmp = hypre_TAlloc(HYPRE_Real, num_rows, memory_location_tmp);
610       hypre_TMemcpy(diag_tmp, l1_norm, HYPRE_Real, num_rows, memory_location_tmp, memory_location_l1);
611 
612       /* Add the scaled l1 norm of the offd part */
613       if (num_cols_offd)
614       {
615          hypre_CSRMatrixComputeRowSum(A_offd, cf_marker, cf_marker_offd, l1_norm, 1, 0.5, "add");
616       }
617 
618       /* Truncate according to Remark 6.2 */
619 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
620       if (exec == HYPRE_EXEC_DEVICE)
621       {
622          HYPRE_THRUST_CALL( transform, l1_norm, l1_norm + num_rows, diag_tmp, l1_norm, l1_norm_op1() );
623       }
624       else
625 #endif
626       {
627          for (i = 0; i < num_rows; i++)
628          {
629             if (l1_norm[i] <= 4.0/3.0 * diag_tmp[i])
630             {
631                l1_norm[i] = diag_tmp[i];
632             }
633          }
634       }
635    }
636    else if (option == 5) /*stores diagonal of A for Jacobi using matvec, rlx 7 */
637    {
638       /* Set the diag element */
639       hypre_CSRMatrixExtractDiagonal(A_diag, l1_norm, 0);
640 
641 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
642       if ( exec == HYPRE_EXEC_DEVICE)
643       {
644          thrust::identity<HYPRE_Complex> identity;
645          HYPRE_THRUST_CALL( replace_if, l1_norm, l1_norm + num_rows, thrust::not1(identity), 1.0 );
646       }
647       else
648 #endif
649       {
650          for (i = 0; i < num_rows; i++)
651          {
652             if (l1_norm[i] == 0.0)
653             {
654                l1_norm[i] = 1.0;
655             }
656          }
657       }
658 
659       *l1_norm_ptr = l1_norm;
660 
661       return hypre_error_flag;
662    }
663 
664    /* Handle negative definite matrices */
665    if (!diag_tmp)
666    {
667       diag_tmp = hypre_TAlloc(HYPRE_Real, num_rows, memory_location_tmp);
668    }
669 
670    /* Set the diag element */
671    hypre_CSRMatrixExtractDiagonal(A_diag, diag_tmp, 0);
672 
673 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
674    if (exec == HYPRE_EXEC_DEVICE)
675    {
676       HYPRE_THRUST_CALL( transform_if, l1_norm, l1_norm + num_rows, diag_tmp, l1_norm, thrust::negate<HYPRE_Real>(),
677                          is_negative<HYPRE_Real>() );
678       //bool any_zero = HYPRE_THRUST_CALL( any_of, l1_norm, l1_norm + num_rows, thrust::not1(thrust::identity<HYPRE_Complex>()) );
679       bool any_zero = 0.0 == HYPRE_THRUST_CALL( reduce, l1_norm, l1_norm + num_rows, 1.0, thrust::minimum<HYPRE_Real>() );
680       if ( any_zero )
681       {
682          hypre_error_in_arg(1);
683       }
684    }
685    else
686 #endif
687    {
688       for (i = 0; i < num_rows; i++)
689       {
690          if (diag_tmp[i] < 0.0)
691          {
692             l1_norm[i] = -l1_norm[i];
693          }
694       }
695 
696       for (i = 0; i < num_rows; i++)
697       {
698          /* if (fabs(l1_norm[i]) < DBL_EPSILON) */
699          if (fabs(l1_norm[i]) == 0.0)
700          {
701             hypre_error_in_arg(1);
702             break;
703          }
704       }
705    }
706 
707    hypre_TFree(cf_marker_offd, memory_location_tmp);
708    hypre_TFree(diag_tmp, memory_location_tmp);
709 
710    *l1_norm_ptr = l1_norm;
711 
712    return hypre_error_flag;
713 }
714 
715 /*--------------------------------------------------------------------------
716  * hypre_ParCSRMatrixSetDiagRows
717  *
718  * For every row containing only a diagonal element: set it to d.
719  *--------------------------------------------------------------------------*/
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
/* Device kernel: one thread per row. If row i of the diag part holds
 * exactly one entry, that entry is the diagonal (i,i), and the offd
 * part of the row is empty, set the diagonal entry to d. */
__global__ void
hypreCUDAKernel_ParCSRMatrixSetDiagRows(HYPRE_Int      nrows,
                                        HYPRE_Int     *A_diag_I,
                                        HYPRE_Int     *A_diag_J,
                                        HYPRE_Complex *A_diag_data,
                                        HYPRE_Int     *A_offd_I,
                                        HYPRE_Int      num_cols_offd,
                                        HYPRE_Real     d)
{
   const HYPRE_Int i = hypre_cuda_get_grid_thread_id<1,1>();
   if (i >= nrows)
   {
      return;
   }

   HYPRE_Int j = read_only_load(&A_diag_I[i]);

   /* single diag entry, on the diagonal, and an empty offd row */
   if ( (read_only_load(&A_diag_I[i+1]) == j+1) && (read_only_load(&A_diag_J[j]) == i) &&
        (!num_cols_offd || (read_only_load(&A_offd_I[i+1]) == read_only_load(&A_offd_I[i]))) )
   {
      A_diag_data[j] = d;
   }
}
#endif
745 
/* For every row that contains only its diagonal element (and has an
 * empty offd part), set that element to d. Dispatches to a device
 * kernel or a host loop based on A's memory location. */
HYPRE_Int hypre_ParCSRMatrixSetDiagRows(hypre_ParCSRMatrix *A, HYPRE_Real d)
{
   HYPRE_Int i, j;
   HYPRE_Int num_rows = hypre_ParCSRMatrixNumRows(A);

   hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
   HYPRE_Int *A_diag_I = hypre_CSRMatrixI(A_diag);
   HYPRE_Int *A_diag_J = hypre_CSRMatrixJ(A_diag);
   HYPRE_Real *A_diag_data = hypre_CSRMatrixData(A_diag);

   hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
   HYPRE_Int *A_offd_I = hypre_CSRMatrixI(A_offd);
   HYPRE_Int num_cols_offd = hypre_CSRMatrixNumCols(A_offd);

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
   HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) );
   if (exec == HYPRE_EXEC_DEVICE)
   {
      dim3 bDim = hypre_GetDefaultCUDABlockDimension();
      dim3 gDim = hypre_GetDefaultCUDAGridDimension(num_rows, "thread", bDim);
      HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParCSRMatrixSetDiagRows, gDim, bDim,
                         num_rows, A_diag_I, A_diag_J, A_diag_data, A_offd_I, num_cols_offd, d);
   }
   else
#endif
   {
      for (i = 0; i < num_rows; i++)
      {
         j = A_diag_I[i];
         /* single diag entry, on the diagonal, and an empty offd row */
         if ((A_diag_I[i+1] == j+1) && (A_diag_J[j] == i) &&
               (!num_cols_offd || (A_offd_I[i+1] == A_offd_I[i])))
         {
            A_diag_data[j] = d;
         }
      }
   }

   return hypre_error_flag;
}
785 
786 /*--------------------------------------------------------------------------
787  * hypre_AMSCreate
788  *
789  * Allocate the AMS solver structure.
790  *--------------------------------------------------------------------------*/
791 
hypre_AMSCreate()792 void * hypre_AMSCreate()
793 {
794    hypre_AMSData *ams_data;
795 
796    ams_data = hypre_CTAlloc(hypre_AMSData,  1, HYPRE_MEMORY_HOST);
797 
798    /* Default parameters */
799 
800    ams_data -> dim = 3;                /* 3D problem */
801    ams_data -> maxit = 20;             /* perform at most 20 iterations */
802    ams_data -> tol = 1e-6;             /* convergence tolerance */
803    ams_data -> print_level = 1;        /* print residual norm at each step */
804    ams_data -> cycle_type = 1;         /* a 3-level multiplicative solver */
805    ams_data -> A_relax_type = 2;       /* offd-l1-scaled GS */
806    ams_data -> A_relax_times = 1;      /* one relaxation sweep */
807    ams_data -> A_relax_weight = 1.0;   /* damping parameter */
808    ams_data -> A_omega = 1.0;          /* SSOR coefficient */
   ams_data -> A_cheby_order = 2;      /* Cheby: order (1-4 are valid) */
810    ams_data -> A_cheby_fraction = .3;  /* Cheby: fraction of spectrum to smooth */
811 
812    ams_data -> B_G_coarsen_type = 10;  /* HMIS coarsening */
813    ams_data -> B_G_agg_levels = 1;     /* Levels of aggressive coarsening */
814    ams_data -> B_G_relax_type = 3;     /* hybrid G-S/Jacobi */
815    ams_data -> B_G_theta = 0.25;       /* strength threshold */
816    ams_data -> B_G_interp_type = 0;    /* interpolation type */
817    ams_data -> B_G_Pmax = 0;           /* max nonzero elements in interp. rows */
818    ams_data -> B_Pi_coarsen_type = 10; /* HMIS coarsening */
819    ams_data -> B_Pi_agg_levels = 1;    /* Levels of aggressive coarsening */
820    ams_data -> B_Pi_relax_type = 3;    /* hybrid G-S/Jacobi */
821    ams_data -> B_Pi_theta = 0.25;      /* strength threshold */
822    ams_data -> B_Pi_interp_type = 0;   /* interpolation type */
823    ams_data -> B_Pi_Pmax = 0;          /* max nonzero elements in interp. rows */
824    ams_data -> beta_is_zero = 0;       /* the problem has a mass term */
825 
826    /* By default, do l1-GS smoothing on the coarsest grid */
827    ams_data -> B_G_coarse_relax_type  = 8;
828    ams_data -> B_Pi_coarse_relax_type = 8;
829 
830    /* The rest of the fields are initialized using the Set functions */
831 
832    ams_data -> A    = NULL;
833    ams_data -> G    = NULL;
834    ams_data -> A_G  = NULL;
835    ams_data -> B_G  = 0;
836    ams_data -> Pi   = NULL;
837    ams_data -> A_Pi = NULL;
838    ams_data -> B_Pi = 0;
839    ams_data -> x    = NULL;
840    ams_data -> y    = NULL;
841    ams_data -> z    = NULL;
842    ams_data -> Gx   = NULL;
843    ams_data -> Gy   = NULL;
844    ams_data -> Gz   = NULL;
845 
846    ams_data -> r0  = NULL;
847    ams_data -> g0  = NULL;
848    ams_data -> r1  = NULL;
849    ams_data -> g1  = NULL;
850    ams_data -> r2  = NULL;
851    ams_data -> g2  = NULL;
852    ams_data -> zz  = NULL;
853 
854    ams_data -> Pix    = NULL;
855    ams_data -> Piy    = NULL;
856    ams_data -> Piz    = NULL;
857    ams_data -> A_Pix  = NULL;
858    ams_data -> A_Piy  = NULL;
859    ams_data -> A_Piz  = NULL;
860    ams_data -> B_Pix  = 0;
861    ams_data -> B_Piy  = 0;
862    ams_data -> B_Piz  = 0;
863 
864    ams_data -> interior_nodes       = NULL;
865    ams_data -> G0                   = NULL;
866    ams_data -> A_G0                 = NULL;
867    ams_data -> B_G0                 = 0;
868    ams_data -> projection_frequency = 5;
869 
870    ams_data -> A_l1_norms = NULL;
871    ams_data -> A_max_eig_est = 0;
872    ams_data -> A_min_eig_est = 0;
873 
874    ams_data -> owns_Pi   = 1;
875    ams_data -> owns_A_G  = 0;
876    ams_data -> owns_A_Pi = 0;
877 
878    return (void *) ams_data;
879 }
880 
881 /*--------------------------------------------------------------------------
882  * hypre_AMSDestroy
883  *
884  * Deallocate the AMS solver structure. Note that the input data (given
885  * through the Set functions) is not destroyed.
886  *--------------------------------------------------------------------------*/
887 
hypre_AMSDestroy(void * solver)888 HYPRE_Int hypre_AMSDestroy(void *solver)
889 {
890    hypre_AMSData *ams_data = (hypre_AMSData *) solver;
891 
892    if (!ams_data)
893    {
894       hypre_error_in_arg(1);
895       return hypre_error_flag;
896    }
897 
898    if (ams_data -> owns_A_G)
899       if (ams_data -> A_G)
900          hypre_ParCSRMatrixDestroy(ams_data -> A_G);
901    if (!ams_data -> beta_is_zero)
902       if (ams_data -> B_G)
903          HYPRE_BoomerAMGDestroy(ams_data -> B_G);
904 
905    if (ams_data -> owns_Pi && ams_data -> Pi)
906       hypre_ParCSRMatrixDestroy(ams_data -> Pi);
907    if (ams_data -> owns_A_Pi)
908       if (ams_data -> A_Pi)
909          hypre_ParCSRMatrixDestroy(ams_data -> A_Pi);
910    if (ams_data -> B_Pi)
911       HYPRE_BoomerAMGDestroy(ams_data -> B_Pi);
912 
913    if (ams_data -> owns_Pi && ams_data -> Pix)
914       hypre_ParCSRMatrixDestroy(ams_data -> Pix);
915    if (ams_data -> A_Pix)
916       hypre_ParCSRMatrixDestroy(ams_data -> A_Pix);
917    if (ams_data -> B_Pix)
918       HYPRE_BoomerAMGDestroy(ams_data -> B_Pix);
919    if (ams_data -> owns_Pi && ams_data -> Piy)
920       hypre_ParCSRMatrixDestroy(ams_data -> Piy);
921    if (ams_data -> A_Piy)
922       hypre_ParCSRMatrixDestroy(ams_data -> A_Piy);
923    if (ams_data -> B_Piy)
924       HYPRE_BoomerAMGDestroy(ams_data -> B_Piy);
925    if (ams_data -> owns_Pi && ams_data -> Piz)
926       hypre_ParCSRMatrixDestroy(ams_data -> Piz);
927    if (ams_data -> A_Piz)
928       hypre_ParCSRMatrixDestroy(ams_data -> A_Piz);
929    if (ams_data -> B_Piz)
930       HYPRE_BoomerAMGDestroy(ams_data -> B_Piz);
931 
932    if (ams_data -> r0)
933       hypre_ParVectorDestroy(ams_data -> r0);
934    if (ams_data -> g0)
935       hypre_ParVectorDestroy(ams_data -> g0);
936    if (ams_data -> r1)
937       hypre_ParVectorDestroy(ams_data -> r1);
938    if (ams_data -> g1)
939       hypre_ParVectorDestroy(ams_data -> g1);
940    if (ams_data -> r2)
941       hypre_ParVectorDestroy(ams_data -> r2);
942    if (ams_data -> g2)
943       hypre_ParVectorDestroy(ams_data -> g2);
944    if (ams_data -> zz)
945       hypre_ParVectorDestroy(ams_data -> zz);
946 
947    if (ams_data -> G0)
948       hypre_ParCSRMatrixDestroy(ams_data -> A);
949    if (ams_data -> G0)
950       hypre_ParCSRMatrixDestroy(ams_data -> G0);
951    if (ams_data -> A_G0)
952       hypre_ParCSRMatrixDestroy(ams_data -> A_G0);
953    if (ams_data -> B_G0)
954       HYPRE_BoomerAMGDestroy(ams_data -> B_G0);
955 
956    hypre_SeqVectorDestroy(ams_data -> A_l1_norms);
957 
958    /* G, x, y ,z, Gx, Gy and Gz are not destroyed */
959 
960    if (ams_data)
961    {
962       hypre_TFree(ams_data, HYPRE_MEMORY_HOST);
963    }
964 
965    return hypre_error_flag;
966 }
967 
968 /*--------------------------------------------------------------------------
969  * hypre_AMSSetDimension
970  *
971  * Set problem dimension (2 or 3). By default we assume dim = 3.
972  *--------------------------------------------------------------------------*/
973 
/* Set the problem dimension (1, 2 or 3; the default is 3).
   Fix: previously an invalid dim was recorded as an argument error but was
   STILL stored in the solver; now the solver state is left untouched. */
HYPRE_Int hypre_AMSSetDimension(void *solver,
                                HYPRE_Int dim)
{
   hypre_AMSData *ams_data = (hypre_AMSData *) solver;

   if (dim != 1 && dim != 2 && dim != 3)
   {
      hypre_error_in_arg(2);
      return hypre_error_flag;
   }

   ams_data -> dim = dim;
   return hypre_error_flag;
}
985 
986 /*--------------------------------------------------------------------------
987  * hypre_AMSSetDiscreteGradient
988  *
989  * Set the discrete gradient matrix G.
990  * This function should be called before hypre_AMSSetup()!
991  *--------------------------------------------------------------------------*/
992 
hypre_AMSSetDiscreteGradient(void * solver,hypre_ParCSRMatrix * G)993 HYPRE_Int hypre_AMSSetDiscreteGradient(void *solver,
994                                        hypre_ParCSRMatrix *G)
995 {
996    hypre_AMSData *ams_data = (hypre_AMSData *) solver;
997    ams_data -> G = G;
998    return hypre_error_flag;
999 }
1000 
1001 /*--------------------------------------------------------------------------
1002  * hypre_AMSSetCoordinateVectors
1003  *
1004  * Set the x, y and z coordinates of the vertices in the mesh.
1005  *
1006  * Either SetCoordinateVectors or SetEdgeConstantVectors should be
1007  * called before hypre_AMSSetup()!
1008  *--------------------------------------------------------------------------*/
1009 
hypre_AMSSetCoordinateVectors(void * solver,hypre_ParVector * x,hypre_ParVector * y,hypre_ParVector * z)1010 HYPRE_Int hypre_AMSSetCoordinateVectors(void *solver,
1011                                         hypre_ParVector *x,
1012                                         hypre_ParVector *y,
1013                                         hypre_ParVector *z)
1014 {
1015    hypre_AMSData *ams_data = (hypre_AMSData *) solver;
1016    ams_data -> x = x;
1017    ams_data -> y = y;
1018    ams_data -> z = z;
1019    return hypre_error_flag;
1020 }
1021 
1022 /*--------------------------------------------------------------------------
1023  * hypre_AMSSetEdgeConstantVectors
1024  *
1025  * Set the vectors Gx, Gy and Gz which give the representations of
1026  * the constant vector fields (1,0,0), (0,1,0) and (0,0,1) in the
1027  * edge element basis.
1028  *
1029  * Either SetCoordinateVectors or SetEdgeConstantVectors should be
1030  * called before hypre_AMSSetup()!
1031  *--------------------------------------------------------------------------*/
1032 
hypre_AMSSetEdgeConstantVectors(void * solver,hypre_ParVector * Gx,hypre_ParVector * Gy,hypre_ParVector * Gz)1033 HYPRE_Int hypre_AMSSetEdgeConstantVectors(void *solver,
1034                                           hypre_ParVector *Gx,
1035                                           hypre_ParVector *Gy,
1036                                           hypre_ParVector *Gz)
1037 {
1038    hypre_AMSData *ams_data = (hypre_AMSData *) solver;
1039    ams_data -> Gx = Gx;
1040    ams_data -> Gy = Gy;
1041    ams_data -> Gz = Gz;
1042    return hypre_error_flag;
1043 }
1044 
1045 /*--------------------------------------------------------------------------
1046  * hypre_AMSSetInterpolations
1047  *
1048  * Set the (components of) the Nedelec interpolation matrix Pi=[Pix,Piy,Piz].
1049  *
1050  * This function is generally intended to be used only for high-order Nedelec
1051  * discretizations (in the lowest order case, Pi is constructed internally in
 * AMS from the discrete gradient matrix and the coordinates of the vertices),
1053  * though it can also be used in the lowest-order case or for other types of
1054  * discretizations (e.g. ones based on the second family of Nedelec elements).
1055  *
1056  * By definition, Pi is the matrix representation of the linear operator that
1057  * interpolates (high-order) vector nodal finite elements into the (high-order)
1058  * Nedelec space. The component matrices are defined as Pix phi = Pi (phi,0,0)
1059  * and similarly for Piy and Piz. Note that all these operators depend on the
1060  * choice of the basis and degrees of freedom in the high-order spaces.
1061  *
1062  * The column numbering of Pi should be node-based, i.e. the x/y/z components of
1063  * the first node (vertex or high-order dof) should be listed first, followed by
1064  * the x/y/z components of the second node and so on (see the documentation of
1065  * HYPRE_BoomerAMGSetDofFunc).
1066  *
1067  * If used, this function should be called before hypre_AMSSetup() and there is
1068  * no need to provide the vertex coordinates. Furthermore, only one of the sets
1069  * {Pi} and {Pix,Piy,Piz} needs to be specified (though it is OK to provide
1070  * both).  If Pix is NULL, then scalar Pi-based AMS cycles, i.e. those with
1071  * cycle_type > 10, will be unavailable.  Similarly, AMS cycles based on
1072  * monolithic Pi (cycle_type < 10) require that Pi is not NULL.
1073  *--------------------------------------------------------------------------*/
1074 
hypre_AMSSetInterpolations(void * solver,hypre_ParCSRMatrix * Pi,hypre_ParCSRMatrix * Pix,hypre_ParCSRMatrix * Piy,hypre_ParCSRMatrix * Piz)1075 HYPRE_Int hypre_AMSSetInterpolations(void *solver,
1076                                      hypre_ParCSRMatrix *Pi,
1077                                      hypre_ParCSRMatrix *Pix,
1078                                      hypre_ParCSRMatrix *Piy,
1079                                      hypre_ParCSRMatrix *Piz)
1080 {
1081    hypre_AMSData *ams_data = (hypre_AMSData *) solver;
1082    ams_data -> Pi = Pi;
1083    ams_data -> Pix = Pix;
1084    ams_data -> Piy = Piy;
1085    ams_data -> Piz = Piz;
1086    ams_data -> owns_Pi = 0;
1087    return hypre_error_flag;
1088 }
1089 
1090 /*--------------------------------------------------------------------------
1091  * hypre_AMSSetAlphaPoissonMatrix
1092  *
1093  * Set the matrix corresponding to the Poisson problem with coefficient
1094  * alpha (the curl-curl term coefficient in the Maxwell problem).
1095  *
1096  * If this function is called, the coarse space solver on the range
1097  * of Pi^T is a block-diagonal version of A_Pi. If this function is not
1098  * called, the coarse space solver on the range of Pi^T is constructed
1099  * as Pi^T A Pi in hypre_AMSSetup().
1100  *--------------------------------------------------------------------------*/
1101 
hypre_AMSSetAlphaPoissonMatrix(void * solver,hypre_ParCSRMatrix * A_Pi)1102 HYPRE_Int hypre_AMSSetAlphaPoissonMatrix(void *solver,
1103                                          hypre_ParCSRMatrix *A_Pi)
1104 {
1105    hypre_AMSData *ams_data = (hypre_AMSData *) solver;
1106    ams_data -> A_Pi = A_Pi;
1107 
1108    /* Penalize the eliminated degrees of freedom */
1109    hypre_ParCSRMatrixSetDiagRows(A_Pi, HYPRE_REAL_MAX);
1110 
1111    /* Make sure that the first entry in each row is the diagonal one. */
1112    /* hypre_CSRMatrixReorder(hypre_ParCSRMatrixDiag(A_Pi)); */
1113 
1114    return hypre_error_flag;
1115 }
1116 
1117 /*--------------------------------------------------------------------------
1118  * hypre_AMSSetBetaPoissonMatrix
1119  *
1120  * Set the matrix corresponding to the Poisson problem with coefficient
1121  * beta (the mass term coefficient in the Maxwell problem).
1122  *
1123  * This function call is optional - if not given, the Poisson matrix will
1124  * be computed in hypre_AMSSetup(). If the given matrix is NULL, we assume
1125  * that beta is 0 and use two-level (instead of three-level) methods.
1126  *--------------------------------------------------------------------------*/
1127 
hypre_AMSSetBetaPoissonMatrix(void * solver,hypre_ParCSRMatrix * A_G)1128 HYPRE_Int hypre_AMSSetBetaPoissonMatrix(void *solver,
1129                                         hypre_ParCSRMatrix *A_G)
1130 {
1131    hypre_AMSData *ams_data = (hypre_AMSData *) solver;
1132    ams_data -> A_G = A_G;
1133    if (!A_G)
1134       ams_data -> beta_is_zero = 1;
1135    else
1136    {
1137       /* Penalize the eliminated degrees of freedom */
1138       hypre_ParCSRMatrixSetDiagRows(A_G, HYPRE_REAL_MAX);
1139 
1140       /* Make sure that the first entry in each row is the diagonal one. */
1141       /* hypre_CSRMatrixReorder(hypre_ParCSRMatrixDiag(A_G)); */
1142    }
1143    return hypre_error_flag;
1144 }
1145 
1146 /*--------------------------------------------------------------------------
1147  * hypre_AMSSetInteriorNodes
1148  *
1149  * Set the list of nodes which are interior to the zero-conductivity region.
1150  * A node is interior if interior_nodes[i] == 1.0.
1151  *
1152  * Should be called before hypre_AMSSetup()!
1153  *--------------------------------------------------------------------------*/
1154 
hypre_AMSSetInteriorNodes(void * solver,hypre_ParVector * interior_nodes)1155 HYPRE_Int hypre_AMSSetInteriorNodes(void *solver,
1156                                     hypre_ParVector *interior_nodes)
1157 {
1158    hypre_AMSData *ams_data = (hypre_AMSData *) solver;
1159    ams_data -> interior_nodes = interior_nodes;
1160    return hypre_error_flag;
1161 }
1162 
1163 /*--------------------------------------------------------------------------
1164  * hypre_AMSSetProjectionFrequency
1165  *
1166  * How often to project the r.h.s. onto the compatible sub-space Ker(G0^T),
1167  * when iterating with the solver.
1168  *
1169  * The default value is every 5th iteration.
1170  *--------------------------------------------------------------------------*/
1171 
/* Set how often the r.h.s. is projected onto the compatible subspace
   Ker(G0^T) during iteration.  The default is every 5th iteration. */
HYPRE_Int hypre_AMSSetProjectionFrequency(void *solver,
                                          HYPRE_Int projection_frequency)
{
   hypre_AMSData *ams = (hypre_AMSData *) solver;

   ams->projection_frequency = projection_frequency;

   return hypre_error_flag;
}
1179 
1180 /*--------------------------------------------------------------------------
1181  * hypre_AMSSetMaxIter
1182  *
1183  * Set the maximum number of iterations in the three-level method.
1184  * The default value is 20. To use the AMS solver as a preconditioner,
1185  * set maxit to 1, tol to 0.0 and print_level to 0.
1186  *--------------------------------------------------------------------------*/
1187 
/* Set the maximum number of iterations (default 20).  To use AMS as a
   preconditioner, set maxit to 1, tol to 0.0 and print_level to 0. */
HYPRE_Int hypre_AMSSetMaxIter(void *solver,
                              HYPRE_Int maxit)
{
   hypre_AMSData *ams = (hypre_AMSData *) solver;

   ams->maxit = maxit;

   return hypre_error_flag;
}
1195 
1196 /*--------------------------------------------------------------------------
1197  * hypre_AMSSetTol
1198  *
1199  * Set the convergence tolerance (if the method is used as a solver).
1200  * The default value is 1e-6.
1201  *--------------------------------------------------------------------------*/
1202 
hypre_AMSSetTol(void * solver,HYPRE_Real tol)1203 HYPRE_Int hypre_AMSSetTol(void *solver,
1204                           HYPRE_Real tol)
1205 {
1206    hypre_AMSData *ams_data = (hypre_AMSData *) solver;
1207    ams_data -> tol = tol;
1208    return hypre_error_flag;
1209 }
1210 
1211 /*--------------------------------------------------------------------------
1212  * hypre_AMSSetCycleType
1213  *
1214  * Choose which three-level solver to use. Possible values are:
1215  *
1216  *   1 = 3-level multipl. solver (01210)      <-- small solution time
1217  *   2 = 3-level additive solver (0+1+2)
1218  *   3 = 3-level multipl. solver (02120)
1219  *   4 = 3-level additive solver (010+2)
1220  *   5 = 3-level multipl. solver (0102010)    <-- small solution time
1221  *   6 = 3-level additive solver (1+020)
1222  *   7 = 3-level multipl. solver (0201020)    <-- small number of iterations
1223  *   8 = 3-level additive solver (0(1+2)0)    <-- small solution time
1224  *   9 = 3-level multipl. solver (01210) with discrete divergence
1225  *  11 = 5-level multipl. solver (013454310)  <-- small solution time, memory
1226  *  12 = 5-level additive solver (0+1+3+4+5)
1227  *  13 = 5-level multipl. solver (034515430)  <-- small solution time, memory
1228  *  14 = 5-level additive solver (01(3+4+5)10)
1229  *  20 = 2-level multipl. solver (0[12]0)
1230  *
1231  *   0 = a Hiptmair-like smoother (010)
1232  *
1233  * The default value is 1.
1234  *--------------------------------------------------------------------------*/
1235 
/* Select the multilevel cycle (default 1); see the table in the header
   comment above for the meaning of each value. */
HYPRE_Int hypre_AMSSetCycleType(void *solver,
                                HYPRE_Int cycle_type)
{
   hypre_AMSData *ams = (hypre_AMSData *) solver;

   ams->cycle_type = cycle_type;

   return hypre_error_flag;
}
1243 
1244 /*--------------------------------------------------------------------------
1245  * hypre_AMSSetPrintLevel
1246  *
1247  * Control how much information is printed during the solution iterations.
 * The default value is 1 (print residual norm at each step).
1249  *--------------------------------------------------------------------------*/
1250 
/* Control the amount of per-iteration output (default 1: print the
   residual norm at each step; 0: silent). */
HYPRE_Int hypre_AMSSetPrintLevel(void *solver,
                                 HYPRE_Int print_level)
{
   hypre_AMSData *ams = (hypre_AMSData *) solver;

   ams->print_level = print_level;

   return hypre_error_flag;
}
1258 
1259 /*--------------------------------------------------------------------------
1260  * hypre_AMSSetSmoothingOptions
1261  *
1262  * Set relaxation parameters for A. Default values: 2, 1, 1.0, 1.0.
1263  *--------------------------------------------------------------------------*/
1264 
/* Store the relaxation (smoother) parameters for the edge matrix A.
   Default values: 2, 1, 1.0, 1.0. */
HYPRE_Int hypre_AMSSetSmoothingOptions(void *solver,
                                       HYPRE_Int A_relax_type,
                                       HYPRE_Int A_relax_times,
                                       HYPRE_Real A_relax_weight,
                                       HYPRE_Real A_omega)
{
   hypre_AMSData *ams = (hypre_AMSData *) solver;

   ams->A_relax_type   = A_relax_type;
   ams->A_relax_times  = A_relax_times;
   ams->A_relax_weight = A_relax_weight;
   ams->A_omega        = A_omega;

   return hypre_error_flag;
}
1278 /*--------------------------------------------------------------------------
1279  * hypre_AMSSetChebySmoothingOptions
1280  *  AB: note: this could be added to the above,
1281  *      but I didn't want to change parameter list)
1282  * Set parameters for chebyshev smoother for A. Default values: 2,.3.
1283  *--------------------------------------------------------------------------*/
1284 
hypre_AMSSetChebySmoothingOptions(void * solver,HYPRE_Int A_cheby_order,HYPRE_Int A_cheby_fraction)1285 HYPRE_Int hypre_AMSSetChebySmoothingOptions(void *solver,
1286                                             HYPRE_Int A_cheby_order,
1287                                             HYPRE_Int A_cheby_fraction)
1288 {
1289    hypre_AMSData *ams_data = (hypre_AMSData *) solver;
1290    ams_data -> A_cheby_order =  A_cheby_order;
1291    ams_data -> A_cheby_fraction =  A_cheby_fraction;
1292 
1293    return hypre_error_flag;
1294 }
1295 /*--------------------------------------------------------------------------
1296  * hypre_AMSSetAlphaAMGOptions
1297  *
1298  * Set AMG parameters for B_Pi. Default values: 10, 1, 3, 0.25, 0, 0.
1299  *--------------------------------------------------------------------------*/
1300 
/* Store the BoomerAMG parameters for the B_Pi solver.
   Default values: 10, 1, 3, 0.25, 0, 0. */
HYPRE_Int hypre_AMSSetAlphaAMGOptions(void *solver,
                                      HYPRE_Int B_Pi_coarsen_type,
                                      HYPRE_Int B_Pi_agg_levels,
                                      HYPRE_Int B_Pi_relax_type,
                                      HYPRE_Real B_Pi_theta,
                                      HYPRE_Int B_Pi_interp_type,
                                      HYPRE_Int B_Pi_Pmax)
{
   hypre_AMSData *ams = (hypre_AMSData *) solver;

   ams->B_Pi_coarsen_type = B_Pi_coarsen_type;
   ams->B_Pi_agg_levels   = B_Pi_agg_levels;
   ams->B_Pi_relax_type   = B_Pi_relax_type;
   ams->B_Pi_theta        = B_Pi_theta;
   ams->B_Pi_interp_type  = B_Pi_interp_type;
   ams->B_Pi_Pmax         = B_Pi_Pmax;

   return hypre_error_flag;
}
1318 
1319 /*--------------------------------------------------------------------------
1320  * hypre_AMSSetAlphaAMGCoarseRelaxType
1321  *
1322  * Set the AMG coarsest level relaxation for B_Pi. Default value: 8.
1323  *--------------------------------------------------------------------------*/
1324 
/* Set the coarsest-level AMG relaxation type for B_Pi (default 8). */
HYPRE_Int hypre_AMSSetAlphaAMGCoarseRelaxType(void *solver,
                                              HYPRE_Int B_Pi_coarse_relax_type)
{
   hypre_AMSData *ams = (hypre_AMSData *) solver;

   ams->B_Pi_coarse_relax_type = B_Pi_coarse_relax_type;

   return hypre_error_flag;
}
1332 
1333 /*--------------------------------------------------------------------------
1334  * hypre_AMSSetBetaAMGOptions
1335  *
1336  * Set AMG parameters for B_G. Default values: 10, 1, 3, 0.25, 0, 0.
1337  *--------------------------------------------------------------------------*/
1338 
/* Store the BoomerAMG parameters for the B_G solver.
   Default values: 10, 1, 3, 0.25, 0, 0. */
HYPRE_Int hypre_AMSSetBetaAMGOptions(void *solver,
                                     HYPRE_Int B_G_coarsen_type,
                                     HYPRE_Int B_G_agg_levels,
                                     HYPRE_Int B_G_relax_type,
                                     HYPRE_Real B_G_theta,
                                     HYPRE_Int B_G_interp_type,
                                     HYPRE_Int B_G_Pmax)
{
   hypre_AMSData *ams = (hypre_AMSData *) solver;

   ams->B_G_coarsen_type = B_G_coarsen_type;
   ams->B_G_agg_levels   = B_G_agg_levels;
   ams->B_G_relax_type   = B_G_relax_type;
   ams->B_G_theta        = B_G_theta;
   ams->B_G_interp_type  = B_G_interp_type;
   ams->B_G_Pmax         = B_G_Pmax;

   return hypre_error_flag;
}
1356 
1357 /*--------------------------------------------------------------------------
1358  * hypre_AMSSetBetaAMGCoarseRelaxType
1359  *
1360  * Set the AMG coarsest level relaxation for B_G. Default value: 8.
1361  *--------------------------------------------------------------------------*/
1362 
/* Set the coarsest-level AMG relaxation type for B_G (default 8). */
HYPRE_Int hypre_AMSSetBetaAMGCoarseRelaxType(void *solver,
                                             HYPRE_Int B_G_coarse_relax_type)
{
   hypre_AMSData *ams = (hypre_AMSData *) solver;

   ams->B_G_coarse_relax_type = B_G_coarse_relax_type;

   return hypre_error_flag;
}
1370 
1371 /*--------------------------------------------------------------------------
1372  * hypre_AMSComputePi
1373  *
1374  * Construct the Pi interpolation matrix, which maps the space of vector
1375  * linear finite elements to the space of edge finite elements.
1376  *
1377  * The construction is based on the fact that Pi = [Pi_x, Pi_y, Pi_z],
1378  * where each block has the same sparsity structure as G, and the entries
1379  * can be computed from the vectors Gx, Gy, Gz.
1380  *--------------------------------------------------------------------------*/
1381 
1382 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
/* One thread per nonzero of G: expand column index j_in[i] into the dim
   consecutive Pi column indices dim*j_in[i] + d, written at j_out[dim*i+d]. */
__global__ void
hypreCUDAKernel_AMSComputePi_copy1(HYPRE_Int  nnz,
                                   HYPRE_Int  dim,
                                   HYPRE_Int *j_in,
                                   HYPRE_Int *j_out)
{
   const HYPRE_Int tid = hypre_cuda_get_grid_thread_id<1,1>();

   if (tid >= nnz)
   {
      return;
   }

   const HYPRE_Int base = dim * tid;

   for (HYPRE_Int d = 0; d < dim; d++)
   {
      j_out[base + d] = dim * read_only_load(j_in + tid) + d;
   }
}
1401 
/* One warp per matrix row: fill the dim expanded Pi entries for every
   nonzero of row i.  For nonzero j the output is
   data_out[dim*j + d] = |data_in[j]| * 0.5 * G{x,y,z}_data[i]
   (or just G{x,y,z}_data[i] when data_in is NULL). */
__global__ void
hypreCUDAKernel_AMSComputePi_copy2(HYPRE_Int   nrows,
                                   HYPRE_Int   dim,
                                   HYPRE_Int  *i_in,
                                   HYPRE_Real *data_in,
                                   HYPRE_Real *Gx_data,
                                   HYPRE_Real *Gy_data,
                                   HYPRE_Real *Gz_data,
                                   HYPRE_Real *data_out)
{
   /* warp index == row index */
   const HYPRE_Int i = hypre_cuda_get_grid_warp_id<1,1>();

   if (i >= nrows)
   {
      return;
   }

   const HYPRE_Int lane_id = hypre_cuda_get_lane_id<1>();
   HYPRE_Int j, istart, iend;
   HYPRE_Real t, G[3], *Gdata[3];

   Gdata[0] = Gx_data;
   Gdata[1] = Gy_data;
   Gdata[2] = Gz_data;

   /* lanes 0 and 1 load the row pointers i_in[i] and i_in[i+1];
      j stays uninitialized in the other lanes, but only the values
      broadcast from lanes 0/1 are read below */
   if (lane_id < 2)
   {
      j = read_only_load(i_in + i + lane_id);
   }

   /* broadcast the row's nonzero range [istart, iend) to the whole warp */
   istart = __shfl_sync(HYPRE_WARP_FULL_MASK, j, 0);
   iend   = __shfl_sync(HYPRE_WARP_FULL_MASK, j, 1);

   /* lane d (d < dim) loads the d-th G-component for this row; t is
      uninitialized in the remaining lanes, but only lanes < dim are
      read in the broadcast loop below */
   if (lane_id < dim)
   {
      t = read_only_load(Gdata[lane_id] + i);
   }

   /* broadcast G[0..dim) to all lanes */
   for (HYPRE_Int d = 0; d < dim; d++)
   {
      G[d] = __shfl_sync(HYPRE_WARP_FULL_MASK, t, d);
   }

   /* warp-strided sweep over the row's nonzeros: each nonzero j expands
      into dim consecutive output entries starting at j*dim */
   for (j = istart + lane_id; j < iend; j += HYPRE_WARP_SIZE)
   {
      const HYPRE_Real v = data_in ? fabs(read_only_load(&data_in[j])) * 0.5 : 1.0;
      const HYPRE_Int k = j * dim;

      for (HYPRE_Int d = 0; d < dim; d++)
      {
         data_out[k+d] = v * G[d];
      }
   }
}
1456 
1457 #endif
1458 
hypre_AMSComputePi(hypre_ParCSRMatrix * A,hypre_ParCSRMatrix * G,hypre_ParVector * Gx,hypre_ParVector * Gy,hypre_ParVector * Gz,HYPRE_Int dim,hypre_ParCSRMatrix ** Pi_ptr)1459 HYPRE_Int hypre_AMSComputePi(hypre_ParCSRMatrix *A,
1460                              hypre_ParCSRMatrix *G,
1461                              hypre_ParVector *Gx,
1462                              hypre_ParVector *Gy,
1463                              hypre_ParVector *Gz,
1464                              HYPRE_Int dim,
1465                              hypre_ParCSRMatrix **Pi_ptr)
1466 {
1467    hypre_ParCSRMatrix *Pi;
1468 
1469    /* Compute Pi = [Pi_x, Pi_y, Pi_z] */
1470    {
1471       HYPRE_Int i, j, d;
1472 
1473       HYPRE_Real *Gx_data, *Gy_data, *Gz_data;
1474 
1475       MPI_Comm comm = hypre_ParCSRMatrixComm(G);
1476       HYPRE_BigInt global_num_rows = hypre_ParCSRMatrixGlobalNumRows(G);
1477       HYPRE_BigInt global_num_cols = dim*hypre_ParCSRMatrixGlobalNumCols(G);
1478       HYPRE_BigInt *row_starts = hypre_ParCSRMatrixRowStarts(G);
1479       HYPRE_BigInt *col_starts;
1480       HYPRE_Int col_starts_size;
1481       HYPRE_Int num_cols_offd = dim*hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(G));
1482       HYPRE_Int num_nonzeros_diag = dim*hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixDiag(G));
1483       HYPRE_Int num_nonzeros_offd = dim*hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixOffd(G));
1484       HYPRE_BigInt *col_starts_G = hypre_ParCSRMatrixColStarts(G);
1485 
1486       col_starts_size = 2;
1487       col_starts = hypre_TAlloc(HYPRE_BigInt, col_starts_size, HYPRE_MEMORY_HOST);
1488       for (i = 0; i < col_starts_size; i++)
1489          col_starts[i] = (HYPRE_BigInt)dim * col_starts_G[i];
1490 
1491       Pi = hypre_ParCSRMatrixCreate(comm,
1492                                     global_num_rows,
1493                                     global_num_cols,
1494                                     row_starts,
1495                                     col_starts,
1496                                     num_cols_offd,
1497                                     num_nonzeros_diag,
1498                                     num_nonzeros_offd);
1499 
1500       hypre_ParCSRMatrixOwnsData(Pi) = 1;
1501       hypre_ParCSRMatrixInitialize(Pi);
1502       hypre_TFree(col_starts, HYPRE_MEMORY_HOST);
1503 
1504       Gx_data = hypre_VectorData(hypre_ParVectorLocalVector(Gx));
1505       if (dim >= 2)
1506          Gy_data = hypre_VectorData(hypre_ParVectorLocalVector(Gy));
1507       if (dim == 3)
1508          Gz_data = hypre_VectorData(hypre_ParVectorLocalVector(Gz));
1509 
1510 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
1511       HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy2( hypre_ParCSRMatrixMemoryLocation(G),
1512                                                          hypre_ParCSRMatrixMemoryLocation(Pi) );
1513 #endif
1514 
1515       /* Fill-in the diagonal part */
1516       {
1517          hypre_CSRMatrix *G_diag = hypre_ParCSRMatrixDiag(G);
1518          HYPRE_Int *G_diag_I = hypre_CSRMatrixI(G_diag);
1519          HYPRE_Int *G_diag_J = hypre_CSRMatrixJ(G_diag);
1520          HYPRE_Real *G_diag_data = hypre_CSRMatrixData(G_diag);
1521 
1522          HYPRE_Int G_diag_nrows = hypre_CSRMatrixNumRows(G_diag);
1523          HYPRE_Int G_diag_nnz = hypre_CSRMatrixNumNonzeros(G_diag);
1524 
1525          hypre_CSRMatrix *Pi_diag = hypre_ParCSRMatrixDiag(Pi);
1526          HYPRE_Int *Pi_diag_I = hypre_CSRMatrixI(Pi_diag);
1527          HYPRE_Int *Pi_diag_J = hypre_CSRMatrixJ(Pi_diag);
1528          HYPRE_Real *Pi_diag_data = hypre_CSRMatrixData(Pi_diag);
1529 
1530 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
1531          if (exec == HYPRE_EXEC_DEVICE)
1532          {
1533             HYPRE_THRUST_CALL( transform,
1534                                G_diag_I,
1535                                G_diag_I + G_diag_nrows + 1,
1536                                Pi_diag_I,
1537                                dim * _1 );
1538 
1539             dim3 bDim = hypre_GetDefaultCUDABlockDimension();
1540             dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_diag_nnz, "thread", bDim);
1541 
1542             HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
1543                                G_diag_nnz, dim, G_diag_J, Pi_diag_J );
1544 
1545             gDim = hypre_GetDefaultCUDAGridDimension(G_diag_nrows, "warp", bDim);
1546 
1547             HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
1548                                G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data,
1549                                Pi_diag_data );
1550          }
1551          else
1552 #endif
1553          {
1554             for (i = 0; i < G_diag_nrows+1; i++)
1555                Pi_diag_I[i] = dim * G_diag_I[i];
1556 
1557             for (i = 0; i < G_diag_nnz; i++)
1558                for (d = 0; d < dim; d++)
1559                   Pi_diag_J[dim*i+d] = dim*G_diag_J[i]+d;
1560 
1561             for (i = 0; i < G_diag_nrows; i++)
1562                for (j = G_diag_I[i]; j < G_diag_I[i+1]; j++)
1563                {
1564                   *Pi_diag_data++ = fabs(G_diag_data[j]) * 0.5 * Gx_data[i];
1565                   if (dim >= 2)
1566                      *Pi_diag_data++ = fabs(G_diag_data[j]) * 0.5 * Gy_data[i];
1567                   if (dim == 3)
1568                      *Pi_diag_data++ = fabs(G_diag_data[j]) * 0.5 * Gz_data[i];
1569                }
1570          }
1571       }
1572 
1573       /* Fill-in the off-diagonal part */
1574       {
1575          hypre_CSRMatrix *G_offd = hypre_ParCSRMatrixOffd(G);
1576          HYPRE_Int *G_offd_I = hypre_CSRMatrixI(G_offd);
1577          HYPRE_Int *G_offd_J = hypre_CSRMatrixJ(G_offd);
1578          HYPRE_Real *G_offd_data = hypre_CSRMatrixData(G_offd);
1579 
1580          HYPRE_Int G_offd_nrows = hypre_CSRMatrixNumRows(G_offd);
1581          HYPRE_Int G_offd_ncols = hypre_CSRMatrixNumCols(G_offd);
1582          HYPRE_Int G_offd_nnz = hypre_CSRMatrixNumNonzeros(G_offd);
1583 
1584          hypre_CSRMatrix *Pi_offd = hypre_ParCSRMatrixOffd(Pi);
1585          HYPRE_Int *Pi_offd_I = hypre_CSRMatrixI(Pi_offd);
1586          HYPRE_Int *Pi_offd_J = hypre_CSRMatrixJ(Pi_offd);
1587          HYPRE_Real *Pi_offd_data = hypre_CSRMatrixData(Pi_offd);
1588 
1589          HYPRE_BigInt *G_cmap = hypre_ParCSRMatrixColMapOffd(G);
1590          HYPRE_BigInt *Pi_cmap = hypre_ParCSRMatrixColMapOffd(Pi);
1591 
1592 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
1593          if (exec == HYPRE_EXEC_DEVICE)
1594          {
1595             if (G_offd_ncols)
1596             {
1597                HYPRE_THRUST_CALL( transform,
1598                                   G_offd_I,
1599                                   G_offd_I + G_offd_nrows + 1,
1600                                   Pi_offd_I,
1601                                   dim * _1 );
1602             }
1603 
1604             dim3 bDim = hypre_GetDefaultCUDABlockDimension();
1605             dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_offd_nnz, "thread", bDim);
1606 
1607             HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
1608                                G_offd_nnz, dim, G_offd_J, Pi_offd_J );
1609 
1610             gDim = hypre_GetDefaultCUDAGridDimension(G_offd_nrows, "warp", bDim);
1611 
1612             HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
1613                                G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data,
1614                                Pi_offd_data );
1615          }
1616          else
1617 #endif
1618          {
1619             if (G_offd_ncols)
1620                for (i = 0; i < G_offd_nrows+1; i++)
1621                   Pi_offd_I[i] = dim * G_offd_I[i];
1622 
1623             for (i = 0; i < G_offd_nnz; i++)
1624                for (d = 0; d < dim; d++)
1625                   Pi_offd_J[dim*i+d] = dim*G_offd_J[i]+d;
1626 
1627             for (i = 0; i < G_offd_nrows; i++)
1628                for (j = G_offd_I[i]; j < G_offd_I[i+1]; j++)
1629                {
1630                   *Pi_offd_data++ = fabs(G_offd_data[j]) * 0.5 * Gx_data[i];
1631                   if (dim >= 2)
1632                      *Pi_offd_data++ = fabs(G_offd_data[j]) * 0.5 * Gy_data[i];
1633                   if (dim == 3)
1634                      *Pi_offd_data++ = fabs(G_offd_data[j]) * 0.5 * Gz_data[i];
1635                }
1636          }
1637 
1638          for (i = 0; i < G_offd_ncols; i++)
1639             for (d = 0; d < dim; d++)
1640                Pi_cmap[dim*i+d] = (HYPRE_BigInt)dim * G_cmap[i] + (HYPRE_BigInt)d;
1641       }
1642    }
1643 
1644    *Pi_ptr = Pi;
1645 
1646    return hypre_error_flag;
1647 }
1648 
1649 /*--------------------------------------------------------------------------
1650  * hypre_AMSComputePixyz
1651  *
1652  * Construct the components Pix, Piy, Piz of the interpolation matrix Pi,
1653  * which maps the space of vector linear finite elements to the space of
1654  * edge finite elements.
1655  *
1656  * The construction is based on the fact that each component has the same
1657  * sparsity structure as G, and the entries can be computed from the vectors
1658  * Gx, Gy, Gz.
1659  *--------------------------------------------------------------------------*/
1660 
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
/* Device kernel used by hypre_AMSComputePixyz: one warp processes one row i
 * of G.  For every nonzero j in row i it writes
 *    Odata[d][j] = |data_in[j]| * 0.5 * Gdata[d][i]   for d = 0..dim-1
 * into the (up to three) component output arrays; when data_in is NULL the
 * scale factor is taken to be 1.0 instead.
 * For dim < 3 the unused Gy/Gz and data_y/data_z pointers may be NULL — they
 * are stored into the local pointer tables but never dereferenced, since all
 * accesses are bounded by 'dim'. */
__global__ void
hypreCUDAKernel_AMSComputePixyz_copy(HYPRE_Int   nrows,      /* local number of rows of G */
                                     HYPRE_Int   dim,        /* number of components (1, 2 or 3) */
                                     HYPRE_Int  *i_in,       /* CSR row-pointer array of G */
                                     HYPRE_Real *data_in,    /* CSR values of G (may be NULL) */
                                     HYPRE_Real *Gx_data,
                                     HYPRE_Real *Gy_data,
                                     HYPRE_Real *Gz_data,
                                     HYPRE_Real *data_x_out,
                                     HYPRE_Real *data_y_out,
                                     HYPRE_Real *data_z_out )
{
   /* One warp per row: global warp index selects the row. */
   const HYPRE_Int i = hypre_cuda_get_grid_warp_id<1,1>();

   if (i >= nrows)
   {
      return;
   }

   const HYPRE_Int lane_id = hypre_cuda_get_lane_id<1>();
   HYPRE_Int j, istart, iend;
   HYPRE_Real t, G[3], *Gdata[3], *Odata[3];

   Gdata[0] = Gx_data;
   Gdata[1] = Gy_data;
   Gdata[2] = Gz_data;

   Odata[0] = data_x_out;
   Odata[1] = data_y_out;
   Odata[2] = data_z_out;

   /* Lanes 0 and 1 read the row's [start, end) from the row pointer;
    * j is left uninitialized on other lanes but only lanes 0/1 are the
    * shuffle sources below. */
   if (lane_id < 2)
   {
      j = read_only_load(i_in + i + lane_id);
   }

   /* Broadcast row start/end to the whole warp. */
   istart = __shfl_sync(HYPRE_WARP_FULL_MASK, j, 0);
   iend   = __shfl_sync(HYPRE_WARP_FULL_MASK, j, 1);

   /* Lane d (d < dim) reads the d-th nodal vector value at row i ... */
   if (lane_id < dim)
   {
      t = read_only_load(Gdata[lane_id] + i);
   }

   /* ... and it is broadcast so every lane holds G[0..dim-1]. */
   for (HYPRE_Int d = 0; d < dim; d++)
   {
      G[d] = __shfl_sync(HYPRE_WARP_FULL_MASK, t, d);
   }

   /* Warp-stride loop over the nonzeros of row i. */
   for (j = istart + lane_id; j < iend; j += HYPRE_WARP_SIZE)
   {
      const HYPRE_Real v = data_in ? fabs(read_only_load(&data_in[j])) * 0.5 : 1.0;

      for (HYPRE_Int d = 0; d < dim; d++)
      {
         Odata[d][j] = v * G[d];
      }
   }
}
#endif
1722 
/* Construct the components Pix, Piy, Piz of the interpolation matrix Pi.
 * Each component has the same sparsity pattern as G; entry (i,j) of the
 * d-th component is |G(i,j)| * 0.5 * G{x,y,z}_data[i].
 *
 * Input:  G   - discrete gradient matrix (provides pattern and comm info)
 *         Gx, Gy, Gz - nodal coordinate vectors (Gy/Gz used for dim >= 2/3)
 *         dim - problem dimension (1, 2 or 3)
 * Output: *Pix_ptr (always), *Piy_ptr (dim >= 2), *Piz_ptr (dim == 3).
 * The parameter A is not referenced here; it is kept for interface
 * compatibility with the related hypre_AMSCompute* routines.
 * Returns hypre_error_flag. */
HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
                                hypre_ParCSRMatrix *G,
                                hypre_ParVector *Gx,
                                hypre_ParVector *Gy,
                                hypre_ParVector *Gz,
                                HYPRE_Int dim,
                                hypre_ParCSRMatrix **Pix_ptr,
                                hypre_ParCSRMatrix **Piy_ptr,
                                hypre_ParCSRMatrix **Piz_ptr)
{
   /* Initialize to NULL so the components that are not created for
    * dim < 3 (Piy, Piz) are never read uninitialized and compilers do
    * not warn about maybe-uninitialized use. */
   hypre_ParCSRMatrix *Pix = NULL, *Piy = NULL, *Piz = NULL;

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
   HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(G) );
#endif

   /* Compute Pix, Piy, Piz  */
   {
      HYPRE_Int i, j;

      /* Gy_data/Gz_data are only set (and used) for dim >= 2 / dim == 3 */
      HYPRE_Real *Gx_data, *Gy_data = NULL, *Gz_data = NULL;

      MPI_Comm comm = hypre_ParCSRMatrixComm(G);
      HYPRE_BigInt global_num_rows = hypre_ParCSRMatrixGlobalNumRows(G);
      HYPRE_BigInt global_num_cols = hypre_ParCSRMatrixGlobalNumCols(G);
      HYPRE_BigInt *row_starts = hypre_ParCSRMatrixRowStarts(G);
      HYPRE_BigInt *col_starts = hypre_ParCSRMatrixColStarts(G);
      HYPRE_Int num_cols_offd = hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(G));
      HYPRE_Int num_nonzeros_diag = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixDiag(G));
      HYPRE_Int num_nonzeros_offd = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixOffd(G));

      /* Each component is distributed exactly like G. */
      Pix = hypre_ParCSRMatrixCreate(comm,
                                     global_num_rows,
                                     global_num_cols,
                                     row_starts,
                                     col_starts,
                                     num_cols_offd,
                                     num_nonzeros_diag,
                                     num_nonzeros_offd);
      hypre_ParCSRMatrixOwnsData(Pix) = 1;
      hypre_ParCSRMatrixInitialize(Pix);

      if (dim >= 2)
      {
         Piy = hypre_ParCSRMatrixCreate(comm,
                                        global_num_rows,
                                        global_num_cols,
                                        row_starts,
                                        col_starts,
                                        num_cols_offd,
                                        num_nonzeros_diag,
                                        num_nonzeros_offd);
         hypre_ParCSRMatrixOwnsData(Piy) = 1;
         hypre_ParCSRMatrixInitialize(Piy);
      }

      if (dim == 3)
      {
         Piz = hypre_ParCSRMatrixCreate(comm,
                                        global_num_rows,
                                        global_num_cols,
                                        row_starts,
                                        col_starts,
                                        num_cols_offd,
                                        num_nonzeros_diag,
                                        num_nonzeros_offd);
         hypre_ParCSRMatrixOwnsData(Piz) = 1;
         hypre_ParCSRMatrixInitialize(Piz);
      }

      Gx_data = hypre_VectorData(hypre_ParVectorLocalVector(Gx));
      if (dim >= 2)
         Gy_data = hypre_VectorData(hypre_ParVectorLocalVector(Gy));
      if (dim == 3)
         Gz_data = hypre_VectorData(hypre_ParVectorLocalVector(Gz));

      /* Fill-in the diagonal part: copy G's pattern into each component and
       * scale the values by 0.5*|G| times the corresponding nodal vector. */
      if (dim == 3)
      {
         hypre_CSRMatrix *G_diag = hypre_ParCSRMatrixDiag(G);
         HYPRE_Int *G_diag_I = hypre_CSRMatrixI(G_diag);
         HYPRE_Int *G_diag_J = hypre_CSRMatrixJ(G_diag);
         HYPRE_Real *G_diag_data = hypre_CSRMatrixData(G_diag);

         HYPRE_Int G_diag_nrows = hypre_CSRMatrixNumRows(G_diag);
         HYPRE_Int G_diag_nnz = hypre_CSRMatrixNumNonzeros(G_diag);

         hypre_CSRMatrix *Pix_diag = hypre_ParCSRMatrixDiag(Pix);
         HYPRE_Int *Pix_diag_I = hypre_CSRMatrixI(Pix_diag);
         HYPRE_Int *Pix_diag_J = hypre_CSRMatrixJ(Pix_diag);
         HYPRE_Real *Pix_diag_data = hypre_CSRMatrixData(Pix_diag);

         hypre_CSRMatrix *Piy_diag = hypre_ParCSRMatrixDiag(Piy);
         HYPRE_Int *Piy_diag_I = hypre_CSRMatrixI(Piy_diag);
         HYPRE_Int *Piy_diag_J = hypre_CSRMatrixJ(Piy_diag);
         HYPRE_Real *Piy_diag_data = hypre_CSRMatrixData(Piy_diag);

         hypre_CSRMatrix *Piz_diag = hypre_ParCSRMatrixDiag(Piz);
         HYPRE_Int *Piz_diag_I = hypre_CSRMatrixI(Piz_diag);
         HYPRE_Int *Piz_diag_J = hypre_CSRMatrixJ(Piz_diag);
         HYPRE_Real *Piz_diag_data = hypre_CSRMatrixData(Piz_diag);

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
         if (exec == HYPRE_EXEC_DEVICE)
         {
            /* Replicate the row pointer and column indices of G into all
             * three components with single fused copies. */
            HYPRE_THRUST_CALL( copy_n,
                               thrust::make_zip_iterator(thrust::make_tuple(G_diag_I, G_diag_I, G_diag_I)),
                               G_diag_nrows + 1,
                               thrust::make_zip_iterator(thrust::make_tuple(Pix_diag_I, Piy_diag_I, Piz_diag_I)) );

            HYPRE_THRUST_CALL( copy_n,
                               thrust::make_zip_iterator(thrust::make_tuple(G_diag_J, G_diag_J, G_diag_J)),
                               G_diag_nnz,
                               thrust::make_zip_iterator(thrust::make_tuple(Pix_diag_J, Piy_diag_J, Piz_diag_J)) );

            dim3 bDim = hypre_GetDefaultCUDABlockDimension();
            dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_diag_nrows, "warp", bDim);

            HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
                               G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data,
                               Pix_diag_data, Piy_diag_data, Piz_diag_data );
         }
         else
#endif
         {
            for (i = 0; i < G_diag_nrows+1; i++)
            {
               Pix_diag_I[i] = G_diag_I[i];
               Piy_diag_I[i] = G_diag_I[i];
               Piz_diag_I[i] = G_diag_I[i];
            }

            for (i = 0; i < G_diag_nnz; i++)
            {
               Pix_diag_J[i] = G_diag_J[i];
               Piy_diag_J[i] = G_diag_J[i];
               Piz_diag_J[i] = G_diag_J[i];
            }

            /* The three data pointers advance in lock-step with the CSR
             * traversal; statement order here is significant. */
            for (i = 0; i < G_diag_nrows; i++)
               for (j = G_diag_I[i]; j < G_diag_I[i+1]; j++)
               {
                  *Pix_diag_data++ = fabs(G_diag_data[j]) * 0.5 * Gx_data[i];
                  *Piy_diag_data++ = fabs(G_diag_data[j]) * 0.5 * Gy_data[i];
                  *Piz_diag_data++ = fabs(G_diag_data[j]) * 0.5 * Gz_data[i];
               }
         }
      }
      else if (dim == 2)
      {
         hypre_CSRMatrix *G_diag = hypre_ParCSRMatrixDiag(G);
         HYPRE_Int *G_diag_I = hypre_CSRMatrixI(G_diag);
         HYPRE_Int *G_diag_J = hypre_CSRMatrixJ(G_diag);
         HYPRE_Real *G_diag_data = hypre_CSRMatrixData(G_diag);

         HYPRE_Int G_diag_nrows = hypre_CSRMatrixNumRows(G_diag);
         HYPRE_Int G_diag_nnz = hypre_CSRMatrixNumNonzeros(G_diag);

         hypre_CSRMatrix *Pix_diag = hypre_ParCSRMatrixDiag(Pix);
         HYPRE_Int *Pix_diag_I = hypre_CSRMatrixI(Pix_diag);
         HYPRE_Int *Pix_diag_J = hypre_CSRMatrixJ(Pix_diag);
         HYPRE_Real *Pix_diag_data = hypre_CSRMatrixData(Pix_diag);

         hypre_CSRMatrix *Piy_diag = hypre_ParCSRMatrixDiag(Piy);
         HYPRE_Int *Piy_diag_I = hypre_CSRMatrixI(Piy_diag);
         HYPRE_Int *Piy_diag_J = hypre_CSRMatrixJ(Piy_diag);
         HYPRE_Real *Piy_diag_data = hypre_CSRMatrixData(Piy_diag);

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
         if (exec == HYPRE_EXEC_DEVICE)
         {
            HYPRE_THRUST_CALL( copy_n,
                               thrust::make_zip_iterator(thrust::make_tuple(G_diag_I, G_diag_I)),
                               G_diag_nrows + 1,
                               thrust::make_zip_iterator(thrust::make_tuple(Pix_diag_I, Piy_diag_I)) );

            HYPRE_THRUST_CALL( copy_n,
                               thrust::make_zip_iterator(thrust::make_tuple(G_diag_J, G_diag_J)),
                               G_diag_nnz,
                               thrust::make_zip_iterator(thrust::make_tuple(Pix_diag_J, Piy_diag_J)) );

            dim3 bDim = hypre_GetDefaultCUDABlockDimension();
            dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_diag_nrows, "warp", bDim);

            /* dim == 2: the z slots are NULL and never touched by the kernel. */
            HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
                               G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, NULL,
                               Pix_diag_data, Piy_diag_data, NULL );
         }
         else
#endif
         {
            for (i = 0; i < G_diag_nrows+1; i++)
            {
               Pix_diag_I[i] = G_diag_I[i];
               Piy_diag_I[i] = G_diag_I[i];
            }

            for (i = 0; i < G_diag_nnz; i++)
            {
               Pix_diag_J[i] = G_diag_J[i];
               Piy_diag_J[i] = G_diag_J[i];
            }

            for (i = 0; i < G_diag_nrows; i++)
               for (j = G_diag_I[i]; j < G_diag_I[i+1]; j++)
               {
                  *Pix_diag_data++ = fabs(G_diag_data[j]) * 0.5 * Gx_data[i];
                  *Piy_diag_data++ = fabs(G_diag_data[j]) * 0.5 * Gy_data[i];
               }
         }
      }
      else
      {
         /* dim == 1: only the x component is built. */
         hypre_CSRMatrix *G_diag = hypre_ParCSRMatrixDiag(G);
         HYPRE_Int *G_diag_I = hypre_CSRMatrixI(G_diag);
         HYPRE_Int *G_diag_J = hypre_CSRMatrixJ(G_diag);
         HYPRE_Real *G_diag_data = hypre_CSRMatrixData(G_diag);

         HYPRE_Int G_diag_nrows = hypre_CSRMatrixNumRows(G_diag);
         HYPRE_Int G_diag_nnz = hypre_CSRMatrixNumNonzeros(G_diag);

         hypre_CSRMatrix *Pix_diag = hypre_ParCSRMatrixDiag(Pix);
         HYPRE_Int *Pix_diag_I = hypre_CSRMatrixI(Pix_diag);
         HYPRE_Int *Pix_diag_J = hypre_CSRMatrixJ(Pix_diag);
         HYPRE_Real *Pix_diag_data = hypre_CSRMatrixData(Pix_diag);

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
         if (exec == HYPRE_EXEC_DEVICE)
         {
            HYPRE_THRUST_CALL( copy_n,
                               G_diag_I,
                               G_diag_nrows + 1,
                               Pix_diag_I );

            HYPRE_THRUST_CALL( copy_n,
                               G_diag_J,
                               G_diag_nnz,
                               Pix_diag_J );

            dim3 bDim = hypre_GetDefaultCUDABlockDimension();
            dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_diag_nrows, "warp", bDim);

            HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
                               G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, NULL, NULL,
                               Pix_diag_data, NULL, NULL );
         }
         else
#endif
         {
            for (i = 0; i < G_diag_nrows+1; i++)
            {
               Pix_diag_I[i] = G_diag_I[i];
            }

            for (i = 0; i < G_diag_nnz; i++)
            {
               Pix_diag_J[i] = G_diag_J[i];
            }

            for (i = 0; i < G_diag_nrows; i++)
               for (j = G_diag_I[i]; j < G_diag_I[i+1]; j++)
               {
                  *Pix_diag_data++ = fabs(G_diag_data[j]) * 0.5 * Gx_data[i];
               }
         }
      }


      /* Fill-in the off-diagonal part (same scheme as the diagonal part,
       * plus copying G's off-processor column map into each component). */
      if (dim == 3)
      {
         hypre_CSRMatrix *G_offd = hypre_ParCSRMatrixOffd(G);
         HYPRE_Int *G_offd_I = hypre_CSRMatrixI(G_offd);
         HYPRE_Int *G_offd_J = hypre_CSRMatrixJ(G_offd);
         HYPRE_Real *G_offd_data = hypre_CSRMatrixData(G_offd);

         HYPRE_Int G_offd_nrows = hypre_CSRMatrixNumRows(G_offd);
         HYPRE_Int G_offd_ncols = hypre_CSRMatrixNumCols(G_offd);
         HYPRE_Int G_offd_nnz = hypre_CSRMatrixNumNonzeros(G_offd);

         hypre_CSRMatrix *Pix_offd = hypre_ParCSRMatrixOffd(Pix);
         HYPRE_Int *Pix_offd_I = hypre_CSRMatrixI(Pix_offd);
         HYPRE_Int *Pix_offd_J = hypre_CSRMatrixJ(Pix_offd);
         HYPRE_Real *Pix_offd_data = hypre_CSRMatrixData(Pix_offd);

         hypre_CSRMatrix *Piy_offd = hypre_ParCSRMatrixOffd(Piy);
         HYPRE_Int *Piy_offd_I = hypre_CSRMatrixI(Piy_offd);
         HYPRE_Int *Piy_offd_J = hypre_CSRMatrixJ(Piy_offd);
         HYPRE_Real *Piy_offd_data = hypre_CSRMatrixData(Piy_offd);

         hypre_CSRMatrix *Piz_offd = hypre_ParCSRMatrixOffd(Piz);
         HYPRE_Int *Piz_offd_I = hypre_CSRMatrixI(Piz_offd);
         HYPRE_Int *Piz_offd_J = hypre_CSRMatrixJ(Piz_offd);
         HYPRE_Real *Piz_offd_data = hypre_CSRMatrixData(Piz_offd);

         HYPRE_BigInt *G_cmap = hypre_ParCSRMatrixColMapOffd(G);
         HYPRE_BigInt *Pix_cmap = hypre_ParCSRMatrixColMapOffd(Pix);
         HYPRE_BigInt *Piy_cmap = hypre_ParCSRMatrixColMapOffd(Piy);
         HYPRE_BigInt *Piz_cmap = hypre_ParCSRMatrixColMapOffd(Piz);

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
         if (exec == HYPRE_EXEC_DEVICE)
         {
            /* Row pointers are copied only when there are off-diagonal
             * columns, mirroring the host branch below. */
            if (G_offd_ncols)
            {
               HYPRE_THRUST_CALL( copy_n,
                                  thrust::make_zip_iterator(thrust::make_tuple(G_offd_I, G_offd_I, G_offd_I)),
                                  G_offd_nrows + 1,
                                  thrust::make_zip_iterator(thrust::make_tuple(Pix_offd_I, Piy_offd_I, Piz_offd_I)) );
            }

            HYPRE_THRUST_CALL( copy_n,
                               thrust::make_zip_iterator(thrust::make_tuple(G_offd_J, G_offd_J, G_offd_J)),
                               G_offd_nnz,
                               thrust::make_zip_iterator(thrust::make_tuple(Pix_offd_J, Piy_offd_J, Piz_offd_J)) );

            dim3 bDim = hypre_GetDefaultCUDABlockDimension();
            dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_offd_nrows, "warp", bDim);

            HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
                               G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data,
                               Pix_offd_data, Piy_offd_data, Piz_offd_data );
         }
         else
#endif
         {
            if (G_offd_ncols)
               for (i = 0; i < G_offd_nrows+1; i++)
               {
                  Pix_offd_I[i] = G_offd_I[i];
                  Piy_offd_I[i] = G_offd_I[i];
                  Piz_offd_I[i] = G_offd_I[i];
               }

            for (i = 0; i < G_offd_nnz; i++)
            {
               Pix_offd_J[i] = G_offd_J[i];
               Piy_offd_J[i] = G_offd_J[i];
               Piz_offd_J[i] = G_offd_J[i];
            }

            for (i = 0; i < G_offd_nrows; i++)
               for (j = G_offd_I[i]; j < G_offd_I[i+1]; j++)
               {
                  *Pix_offd_data++ = fabs(G_offd_data[j]) * 0.5 * Gx_data[i];
                  *Piy_offd_data++ = fabs(G_offd_data[j]) * 0.5 * Gy_data[i];
                  *Piz_offd_data++ = fabs(G_offd_data[j]) * 0.5 * Gz_data[i];
               }
         }

         for (i = 0; i < G_offd_ncols; i++)
         {
            Pix_cmap[i] = G_cmap[i];
            Piy_cmap[i] = G_cmap[i];
            Piz_cmap[i] = G_cmap[i];
         }
      }
      else if (dim == 2)
      {
         hypre_CSRMatrix *G_offd = hypre_ParCSRMatrixOffd(G);
         HYPRE_Int *G_offd_I = hypre_CSRMatrixI(G_offd);
         HYPRE_Int *G_offd_J = hypre_CSRMatrixJ(G_offd);
         HYPRE_Real *G_offd_data = hypre_CSRMatrixData(G_offd);

         HYPRE_Int G_offd_nrows = hypre_CSRMatrixNumRows(G_offd);
         HYPRE_Int G_offd_ncols = hypre_CSRMatrixNumCols(G_offd);
         HYPRE_Int G_offd_nnz = hypre_CSRMatrixNumNonzeros(G_offd);

         hypre_CSRMatrix *Pix_offd = hypre_ParCSRMatrixOffd(Pix);
         HYPRE_Int *Pix_offd_I = hypre_CSRMatrixI(Pix_offd);
         HYPRE_Int *Pix_offd_J = hypre_CSRMatrixJ(Pix_offd);
         HYPRE_Real *Pix_offd_data = hypre_CSRMatrixData(Pix_offd);

         hypre_CSRMatrix *Piy_offd = hypre_ParCSRMatrixOffd(Piy);
         HYPRE_Int *Piy_offd_I = hypre_CSRMatrixI(Piy_offd);
         HYPRE_Int *Piy_offd_J = hypre_CSRMatrixJ(Piy_offd);
         HYPRE_Real *Piy_offd_data = hypre_CSRMatrixData(Piy_offd);

         HYPRE_BigInt *G_cmap = hypre_ParCSRMatrixColMapOffd(G);
         HYPRE_BigInt *Pix_cmap = hypre_ParCSRMatrixColMapOffd(Pix);
         HYPRE_BigInt *Piy_cmap = hypre_ParCSRMatrixColMapOffd(Piy);

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
         if (exec == HYPRE_EXEC_DEVICE)
         {
            if (G_offd_ncols)
            {
               HYPRE_THRUST_CALL( copy_n,
                                  thrust::make_zip_iterator(thrust::make_tuple(G_offd_I, G_offd_I)),
                                  G_offd_nrows + 1,
                                  thrust::make_zip_iterator(thrust::make_tuple(Pix_offd_I, Piy_offd_I)) );
            }

            HYPRE_THRUST_CALL( copy_n,
                               thrust::make_zip_iterator(thrust::make_tuple(G_offd_J, G_offd_J)),
                               G_offd_nnz,
                               thrust::make_zip_iterator(thrust::make_tuple(Pix_offd_J, Piy_offd_J)) );

            dim3 bDim = hypre_GetDefaultCUDABlockDimension();
            dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_offd_nrows, "warp", bDim);

            HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
                               G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, NULL,
                               Pix_offd_data, Piy_offd_data, NULL );
         }
         else
#endif
         {
            if (G_offd_ncols)
               for (i = 0; i < G_offd_nrows+1; i++)
               {
                  Pix_offd_I[i] = G_offd_I[i];
                  Piy_offd_I[i] = G_offd_I[i];
               }

            for (i = 0; i < G_offd_nnz; i++)
            {
               Pix_offd_J[i] = G_offd_J[i];
               Piy_offd_J[i] = G_offd_J[i];
            }

            for (i = 0; i < G_offd_nrows; i++)
               for (j = G_offd_I[i]; j < G_offd_I[i+1]; j++)
               {
                  *Pix_offd_data++ = fabs(G_offd_data[j]) * 0.5 * Gx_data[i];
                  *Piy_offd_data++ = fabs(G_offd_data[j]) * 0.5 * Gy_data[i];
               }
         }

         for (i = 0; i < G_offd_ncols; i++)
         {
            Pix_cmap[i] = G_cmap[i];
            Piy_cmap[i] = G_cmap[i];
         }
      }
      else
      {
         hypre_CSRMatrix *G_offd = hypre_ParCSRMatrixOffd(G);
         HYPRE_Int *G_offd_I = hypre_CSRMatrixI(G_offd);
         HYPRE_Int *G_offd_J = hypre_CSRMatrixJ(G_offd);
         HYPRE_Real *G_offd_data = hypre_CSRMatrixData(G_offd);

         HYPRE_Int G_offd_nrows = hypre_CSRMatrixNumRows(G_offd);
         HYPRE_Int G_offd_ncols = hypre_CSRMatrixNumCols(G_offd);
         HYPRE_Int G_offd_nnz = hypre_CSRMatrixNumNonzeros(G_offd);

         hypre_CSRMatrix *Pix_offd = hypre_ParCSRMatrixOffd(Pix);
         HYPRE_Int *Pix_offd_I = hypre_CSRMatrixI(Pix_offd);
         HYPRE_Int *Pix_offd_J = hypre_CSRMatrixJ(Pix_offd);
         HYPRE_Real *Pix_offd_data = hypre_CSRMatrixData(Pix_offd);

         HYPRE_BigInt *G_cmap = hypre_ParCSRMatrixColMapOffd(G);
         HYPRE_BigInt *Pix_cmap = hypre_ParCSRMatrixColMapOffd(Pix);

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
         if (exec == HYPRE_EXEC_DEVICE)
         {
            if (G_offd_ncols)
            {
               HYPRE_THRUST_CALL( copy_n,
                                  G_offd_I,
                                  G_offd_nrows + 1,
                                  Pix_offd_I );
            }

            HYPRE_THRUST_CALL( copy_n,
                               G_offd_J,
                               G_offd_nnz,
                               Pix_offd_J );

            dim3 bDim = hypre_GetDefaultCUDABlockDimension();
            dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_offd_nrows, "warp", bDim);

            HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
                               G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, NULL, NULL,
                               Pix_offd_data, NULL, NULL );
         }
         else
#endif
         {
            if (G_offd_ncols)
               for (i = 0; i < G_offd_nrows+1; i++)
               {
                  Pix_offd_I[i] = G_offd_I[i];
               }

            for (i = 0; i < G_offd_nnz; i++)
            {
               Pix_offd_J[i] = G_offd_J[i];
            }

            for (i = 0; i < G_offd_nrows; i++)
               for (j = G_offd_I[i]; j < G_offd_I[i+1]; j++)
               {
                  *Pix_offd_data++ = fabs(G_offd_data[j]) * 0.5 * Gx_data[i];
               }
         }

         for (i = 0; i < G_offd_ncols; i++)
         {
            Pix_cmap[i] = G_cmap[i];
         }
      }
   }

   /* Hand the components back; Piy/Piz are returned only when created. */
   *Pix_ptr = Pix;
   if (dim >= 2)
      *Piy_ptr = Piy;
   if (dim == 3)
      *Piz_ptr = Piz;

   return hypre_error_flag;
}
2236 
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
/* Device kernel used by hypre_AMSComputeGPi: one warp per row of G.
 * Here 'dim' counts the G column plus the Pi components, so each nonzero j
 * of G expands into a group of dim consecutive entries in data_out:
 *    data_out[j*dim]       = G(i,j)                       (the G part)
 *    data_out[j*dim + d+1] = |G(i,j)| * 0.5 * Gdata[d][i] (the Pi part),
 * for d = 0..dim-2.  Unused Gy/Gz pointers may be NULL; accesses are
 * bounded by dim-1, so they are never dereferenced in that case. */
__global__ void
hypreCUDAKernel_AMSComputeGPi_copy2(HYPRE_Int   nrows,      /* local number of rows of G */
                                    HYPRE_Int   dim,        /* 1 + number of Pi components */
                                    HYPRE_Int  *i_in,       /* CSR row-pointer array of G */
                                    HYPRE_Real *data_in,    /* CSR values of G */
                                    HYPRE_Real *Gx_data,
                                    HYPRE_Real *Gy_data,
                                    HYPRE_Real *Gz_data,
                                    HYPRE_Real *data_out)   /* interleaved [G,Pi] values */
{
   /* One warp per row: global warp index selects the row. */
   const HYPRE_Int i = hypre_cuda_get_grid_warp_id<1,1>();

   if (i >= nrows)
   {
      return;
   }

   const HYPRE_Int lane_id = hypre_cuda_get_lane_id<1>();
   HYPRE_Int j, istart, iend;
   HYPRE_Real t, G[3], *Gdata[3];

   Gdata[0] = Gx_data;
   Gdata[1] = Gy_data;
   Gdata[2] = Gz_data;

   /* Lanes 0 and 1 read the row's [start, end) from the row pointer;
    * only those lanes are the shuffle sources below. */
   if (lane_id < 2)
   {
      j = read_only_load(i_in + i + lane_id);
   }

   /* Broadcast row start/end to the whole warp. */
   istart = __shfl_sync(HYPRE_WARP_FULL_MASK, j, 0);
   iend   = __shfl_sync(HYPRE_WARP_FULL_MASK, j, 1);

   /* Lane d (d < dim-1) reads the d-th nodal vector value at row i ... */
   if (lane_id < dim - 1)
   {
      t = read_only_load(Gdata[lane_id] + i);
   }

   /* ... and it is broadcast so every lane holds G[0..dim-2]. */
   for (HYPRE_Int d = 0; d < dim - 1; d++)
   {
      G[d] = __shfl_sync(HYPRE_WARP_FULL_MASK, t, d);
   }

   /* Warp-stride loop over the nonzeros of row i, writing one dim-wide
    * group of output entries per nonzero. */
   for (j = istart + lane_id; j < iend; j += HYPRE_WARP_SIZE)
   {
      const HYPRE_Real u = read_only_load(&data_in[j]);
      const HYPRE_Real v = fabs(u) * 0.5;
      const HYPRE_Int k = j * dim;

      data_out[k] = u;
      for (HYPRE_Int d = 0; d < dim - 1; d++)
      {
         data_out[k+d+1] = v * G[d];
      }
   }
}
#endif
2295 
2296 /*--------------------------------------------------------------------------
2297  * hypre_AMSComputeGPi
2298  *
2299  * Construct the matrix [G,Pi] which can be considered an interpolation
2300  * matrix from S_h^4 (4 copies of the scalar linear finite element space)
2301  * to the edge finite elements space.
2302  *--------------------------------------------------------------------------*/
2303 
HYPRE_Int hypre_AMSComputeGPi(hypre_ParCSRMatrix *A,
                              hypre_ParCSRMatrix *G,
                              hypre_ParVector *Gx,
                              hypre_ParVector *Gy,
                              hypre_ParVector *Gz,
                              HYPRE_Int dim,
                              hypre_ParCSRMatrix **GPi_ptr)
{
   hypre_ParCSRMatrix *GPi;

   /* Take into account G: GPi carries dim+1 interleaved components per node */
   dim++;

   /* Compute GPi = [G, Pi_x, Pi_y, Pi_z] (components interleaved per node:
      the G entry comes first, followed by the dim-1 Pi components) */
   {
      HYPRE_Int i, j, d;

      /* NULL-initialize: when dim < 4 the unused pointers are still passed
         to the device kernel launches below, so they must not hold
         indeterminate values (the kernel never dereferences them) */
      HYPRE_Real *Gx_data = NULL, *Gy_data = NULL, *Gz_data = NULL;

      MPI_Comm comm = hypre_ParCSRMatrixComm(G);
      HYPRE_BigInt global_num_rows = hypre_ParCSRMatrixGlobalNumRows(G);
      HYPRE_BigInt global_num_cols = dim*hypre_ParCSRMatrixGlobalNumCols(G);
      HYPRE_BigInt *row_starts = hypre_ParCSRMatrixRowStarts(G);
      HYPRE_BigInt *col_starts;
      HYPRE_Int col_starts_size;
      HYPRE_Int num_cols_offd = dim*hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(G));
      HYPRE_Int num_nonzeros_diag = dim*hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixDiag(G));
      HYPRE_Int num_nonzeros_offd = dim*hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixOffd(G));
      HYPRE_BigInt *col_starts_G = hypre_ParCSRMatrixColStarts(G);
      col_starts_size = 2;
      col_starts = hypre_TAlloc(HYPRE_BigInt, col_starts_size, HYPRE_MEMORY_HOST);
      /* each node of G expands to dim consecutive columns of GPi */
      for (i = 0; i < col_starts_size; i++)
         col_starts[i] = (HYPRE_BigInt) dim * col_starts_G[i];

      GPi = hypre_ParCSRMatrixCreate(comm,
                                     global_num_rows,
                                     global_num_cols,
                                     row_starts,
                                     col_starts,
                                     num_cols_offd,
                                     num_nonzeros_diag,
                                     num_nonzeros_offd);

      hypre_ParCSRMatrixOwnsData(GPi) = 1;
      hypre_ParCSRMatrixInitialize(GPi);

      Gx_data = hypre_VectorData(hypre_ParVectorLocalVector(Gx));
      if (dim >= 3)
         Gy_data = hypre_VectorData(hypre_ParVectorLocalVector(Gy));
      if (dim == 4)
         Gz_data = hypre_VectorData(hypre_ParVectorLocalVector(Gz));

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
      HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy2( hypre_ParCSRMatrixMemoryLocation(G),
                                                         hypre_ParCSRMatrixMemoryLocation(GPi) );
#endif

      /* Fill-in the diagonal part */
      {
         hypre_CSRMatrix *G_diag = hypre_ParCSRMatrixDiag(G);
         HYPRE_Int *G_diag_I = hypre_CSRMatrixI(G_diag);
         HYPRE_Int *G_diag_J = hypre_CSRMatrixJ(G_diag);
         HYPRE_Real *G_diag_data = hypre_CSRMatrixData(G_diag);

         HYPRE_Int G_diag_nrows = hypre_CSRMatrixNumRows(G_diag);
         HYPRE_Int G_diag_nnz = hypre_CSRMatrixNumNonzeros(G_diag);

         hypre_CSRMatrix *GPi_diag = hypre_ParCSRMatrixDiag(GPi);
         HYPRE_Int *GPi_diag_I = hypre_CSRMatrixI(GPi_diag);
         HYPRE_Int *GPi_diag_J = hypre_CSRMatrixJ(GPi_diag);
         HYPRE_Real *GPi_diag_data = hypre_CSRMatrixData(GPi_diag);

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
         if (exec == HYPRE_EXEC_DEVICE)
         {
            /* row pointers: every G nonzero expands to dim GPi nonzeros */
            HYPRE_THRUST_CALL( transform,
                               G_diag_I,
                               G_diag_I + G_diag_nrows + 1,
                               GPi_diag_I,
                               dim * _1 );

            dim3 bDim = hypre_GetDefaultCUDABlockDimension();
            dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_diag_nnz, "thread", bDim);

            /* column indices: dim*G_diag_J[i]+d for d = 0..dim-1 */
            HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
                               G_diag_nnz, dim, G_diag_J, GPi_diag_J );

            gDim = hypre_GetDefaultCUDAGridDimension(G_diag_nrows, "warp", bDim);

            /* values: the G entry followed by |G|*0.5*G{x,y,z} */
            HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim,
                               G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data,
                               GPi_diag_data );
         }
         else
#endif
         {
            for (i = 0; i < G_diag_nrows+1; i++)
               GPi_diag_I[i] = dim * G_diag_I[i];

            for (i = 0; i < G_diag_nnz; i++)
               for (d = 0; d < dim; d++)
                  GPi_diag_J[dim*i+d] = dim*G_diag_J[i]+d;

            for (i = 0; i < G_diag_nrows; i++)
               for (j = G_diag_I[i]; j < G_diag_I[i+1]; j++)
               {
                  *GPi_diag_data++ = G_diag_data[j];
                  *GPi_diag_data++ = fabs(G_diag_data[j]) * 0.5 * Gx_data[i];
                  if (dim >= 3)
                     *GPi_diag_data++ = fabs(G_diag_data[j]) * 0.5 * Gy_data[i];
                  if (dim == 4)
                     *GPi_diag_data++ = fabs(G_diag_data[j]) * 0.5 * Gz_data[i];
               }
         }
      }

      /* Fill-in the off-diagonal part */
      {
         hypre_CSRMatrix *G_offd = hypre_ParCSRMatrixOffd(G);
         HYPRE_Int *G_offd_I = hypre_CSRMatrixI(G_offd);
         HYPRE_Int *G_offd_J = hypre_CSRMatrixJ(G_offd);
         HYPRE_Real *G_offd_data = hypre_CSRMatrixData(G_offd);

         HYPRE_Int G_offd_nrows = hypre_CSRMatrixNumRows(G_offd);
         HYPRE_Int G_offd_ncols = hypre_CSRMatrixNumCols(G_offd);
         HYPRE_Int G_offd_nnz = hypre_CSRMatrixNumNonzeros(G_offd);

         hypre_CSRMatrix *GPi_offd = hypre_ParCSRMatrixOffd(GPi);
         HYPRE_Int *GPi_offd_I = hypre_CSRMatrixI(GPi_offd);
         HYPRE_Int *GPi_offd_J = hypre_CSRMatrixJ(GPi_offd);
         HYPRE_Real *GPi_offd_data = hypre_CSRMatrixData(GPi_offd);

         HYPRE_BigInt *G_cmap = hypre_ParCSRMatrixColMapOffd(G);
         HYPRE_BigInt *GPi_cmap = hypre_ParCSRMatrixColMapOffd(GPi);

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
         if (exec == HYPRE_EXEC_DEVICE)
         {
            /* the row pointer array is copied only when there are offd columns,
               matching the host branch below */
            if (G_offd_ncols)
            {
               HYPRE_THRUST_CALL( transform,
                                  G_offd_I,
                                  G_offd_I + G_offd_nrows + 1,
                                  GPi_offd_I,
                                  dim * _1 );
            }

            dim3 bDim = hypre_GetDefaultCUDABlockDimension();
            dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_offd_nnz, "thread", bDim);

            HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
                               G_offd_nnz, dim, G_offd_J, GPi_offd_J );

            gDim = hypre_GetDefaultCUDAGridDimension(G_offd_nrows, "warp", bDim);

            HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim,
                               G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data,
                               GPi_offd_data );
         }
         else
#endif
         {
            if (G_offd_ncols)
               for (i = 0; i < G_offd_nrows+1; i++)
                  GPi_offd_I[i] = dim * G_offd_I[i];

            for (i = 0; i < G_offd_nnz; i++)
               for (d = 0; d < dim; d++)
                  GPi_offd_J[dim*i+d] = dim*G_offd_J[i]+d;

            for (i = 0; i < G_offd_nrows; i++)
               for (j = G_offd_I[i]; j < G_offd_I[i+1]; j++)
               {
                  *GPi_offd_data++ = G_offd_data[j];
                  *GPi_offd_data++ = fabs(G_offd_data[j]) * 0.5 * Gx_data[i];
                  if (dim >= 3)
                     *GPi_offd_data++ = fabs(G_offd_data[j]) * 0.5 * Gy_data[i];
                  if (dim == 4)
                     *GPi_offd_data++ = fabs(G_offd_data[j]) * 0.5 * Gz_data[i];
               }
         }

         /* expand the off-diagonal column map the same way as the columns */
         for (i = 0; i < G_offd_ncols; i++)
            for (d = 0; d < dim; d++)
               GPi_cmap[dim*i+d] = dim*G_cmap[i]+d;
      }

   }

   *GPi_ptr = GPi;

   return hypre_error_flag;
}
2497 
2498 /*--------------------------------------------------------------------------
2499  * hypre_AMSSetup
2500  *
2501  * Construct the AMS solver components.
2502  *
2503  * The following functions need to be called before hypre_AMSSetup():
2504  * - hypre_AMSSetDimension() (if solving a 2D problem)
2505  * - hypre_AMSSetDiscreteGradient()
 * - hypre_AMSSetCoordinateVectors() or hypre_AMSSetEdgeConstantVectors()
2507  *--------------------------------------------------------------------------*/
2508 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
/* Zero out rows of G0t that correspond to nodes that are NOT interior
 * (interior_nodes_data[row] != 1.0), in both the diag and offd parts.
 * One warp processes one row. */
__global__ void
hypreCUDAKernel_FixInterNodes( HYPRE_Int      nrows,
                               HYPRE_Int     *G0t_diag_i,
                               HYPRE_Complex *G0t_diag_data,
                               HYPRE_Int     *G0t_offd_i,
                               HYPRE_Complex *G0t_offd_data,
                               HYPRE_Real    *interior_nodes_data)
{
   /* one warp per row */
   HYPRE_Int row_i = hypre_cuda_get_grid_warp_id<1,1>();

   if (row_i >= nrows)
   {
      return;
   }

   HYPRE_Int lane = hypre_cuda_get_lane_id<1>();
   HYPRE_Int not1 = 0;

   /* lane 0 tests whether this node is non-interior */
   if (lane == 0)
   {
      not1 = read_only_load(&interior_nodes_data[row_i]) != 1.0;
   }

   /* broadcast the flag from lane 0 to the whole warp */
   not1 = __shfl_sync(HYPRE_WARP_FULL_MASK, not1, 0);

   /* interior node: row is left untouched */
   if (!not1)
   {
      return;
   }

   /* p2/q2 default to 0 so the offd loop below is empty when there is
      no offd data */
   HYPRE_Int p1, q1, p2 = 0, q2 = 0;
   bool nonempty_offd = G0t_offd_data != NULL;

   /* lanes 0 and 1 load the row pointers; p1/p2 are meaningful only there */
   if (lane < 2)
   {
      p1 = read_only_load(G0t_diag_i + row_i + lane);
      if (nonempty_offd)
      {
         p2 = read_only_load(G0t_offd_i + row_i + lane);
      }
   }

   /* broadcast row bounds: [p1, q1) for diag, [p2, q2) for offd */
   q1 = __shfl_sync(HYPRE_WARP_FULL_MASK, p1, 1);
   p1 = __shfl_sync(HYPRE_WARP_FULL_MASK, p1, 0);
   if (nonempty_offd)
   {
      q2 = __shfl_sync(HYPRE_WARP_FULL_MASK, p2, 1);
      p2 = __shfl_sync(HYPRE_WARP_FULL_MASK, p2, 0);
   }

   /* warp-strided zeroing of the row's diag and offd entries */
   for (HYPRE_Int j = p1 + lane; j < q1; j += HYPRE_WARP_SIZE)
   {
      G0t_diag_data[j] = 0.0;
   }
   for (HYPRE_Int j = p2 + lane; j < q2; j += HYPRE_WARP_SIZE)
   {
      G0t_offd_data[j] = 0.0;
   }
}
2568 
/* Scale each row of Gt (diag and offd parts) by the factor
 *    h2 = (1/ne) * sum_{j in diag row} (Gx[k]^2 + Gy[k]^2 + Gz[k]^2),
 * where k = Gt_diag_j[j] and ne is the number of diag nonzeros in the row
 * (h2 acts as a squared local mesh-size estimate).  One warp per row;
 * rows with no diag nonzeros are skipped. */
__global__ void
hypreCUDAKernel_AMSSetupScaleGGt( HYPRE_Int   Gt_num_rows,
                                  HYPRE_Int  *Gt_diag_i,
                                  HYPRE_Int  *Gt_diag_j,
                                  HYPRE_Real *Gt_diag_data,
                                  HYPRE_Int  *Gt_offd_i,
                                  HYPRE_Real *Gt_offd_data,
                                  HYPRE_Real *Gx_data,
                                  HYPRE_Real *Gy_data,
                                  HYPRE_Real *Gz_data )
{
   /* one warp per row */
   HYPRE_Int row_i = hypre_cuda_get_grid_warp_id<1,1>();

   if (row_i >= Gt_num_rows)
   {
      return;
   }

   HYPRE_Int lane = hypre_cuda_get_lane_id<1>();
   HYPRE_Real h2 = 0.0;
   /* p2/q2 default to 0 so the offd scaling loop is empty when there is
      no offd data */
   HYPRE_Int ne, p1, q1, p2 = 0, q2 = 0;

   /* lanes 0 and 1 load the diag row pointers; broadcast [p1, q1) */
   if (lane < 2)
   {
      p1 = read_only_load(Gt_diag_i + row_i + lane);
   }
   q1 = __shfl_sync(HYPRE_WARP_FULL_MASK, p1, 1);
   p1 = __shfl_sync(HYPRE_WARP_FULL_MASK, p1, 0);
   ne = q1 - p1;

   /* empty diag row: nothing to scale (avoids division by zero below) */
   if (ne == 0)
   {
      return;
   }

   if (Gt_offd_data != NULL)
   {
      /* lanes 0 and 1 load the offd row pointers; broadcast [p2, q2) */
      if (lane < 2)
      {
         p2 = read_only_load(Gt_offd_i + row_i + lane);
      }
      q2 = __shfl_sync(HYPRE_WARP_FULL_MASK, p2, 1);
      p2 = __shfl_sync(HYPRE_WARP_FULL_MASK, p2, 0);
   }

   /* accumulate |grad|^2 over the diag nonzeros of this row (warp-strided) */
   for (HYPRE_Int j = p1 + lane; j < q1; j += HYPRE_WARP_SIZE)
   {
      const HYPRE_Int k = read_only_load(&Gt_diag_j[j]);
      const HYPRE_Real Gx = read_only_load(&Gx_data[k]);
      const HYPRE_Real Gy = read_only_load(&Gy_data[k]);
      const HYPRE_Real Gz = read_only_load(&Gz_data[k]);

      h2 += Gx*Gx + Gy*Gy + Gz*Gz;
   }

   /* reduce across the warp and average over the row's nonzeros */
   h2 = warp_allreduce_sum(h2) / ne;

   /* scale the row in both the diag and offd parts */
   for (HYPRE_Int j = p1 + lane; j < q1; j += HYPRE_WARP_SIZE)
   {
      Gt_diag_data[j] *= h2;
   }

   for (HYPRE_Int j = p2 + lane; j < q2; j += HYPRE_WARP_SIZE)
   {
      Gt_offd_data[j] *= h2;
   }
}
2636 #endif
2637 
hypre_AMSSetup(void * solver,hypre_ParCSRMatrix * A,hypre_ParVector * b,hypre_ParVector * x)2638 HYPRE_Int hypre_AMSSetup(void *solver,
2639                          hypre_ParCSRMatrix *A,
2640                          hypre_ParVector *b,
2641                          hypre_ParVector *x)
2642 {
2643 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
2644    HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) );
2645 #endif
2646 
2647    hypre_AMSData *ams_data = (hypre_AMSData *) solver;
2648 
2649    HYPRE_Int input_info = 0;
2650 
2651    ams_data -> A = A;
2652 
2653    /* Modifications for problems with zero-conductivity regions */
2654    if (ams_data -> interior_nodes)
2655    {
2656       hypre_ParCSRMatrix *G0t, *Aorig = A;
2657 
2658       /* Make sure that multiple Setup()+Solve() give identical results */
2659       ams_data -> solve_counter = 0;
2660 
2661       /* Construct the discrete gradient matrix for the zero-conductivity region
2662          by eliminating the zero-conductivity nodes from G^t. The range of G0
2663          represents the kernel of A, i.e. the gradients of nodal basis functions
2664          supported in zero-conductivity regions. */
2665       hypre_ParCSRMatrixTranspose(ams_data -> G, &G0t, 1);
2666 
2667       {
2668          HYPRE_Int i, j;
2669          HYPRE_Int nv = hypre_ParCSRMatrixNumCols(ams_data -> G);
2670          hypre_CSRMatrix *G0td = hypre_ParCSRMatrixDiag(G0t);
2671          HYPRE_Int *G0tdI = hypre_CSRMatrixI(G0td);
2672          HYPRE_Real *G0tdA = hypre_CSRMatrixData(G0td);
2673          hypre_CSRMatrix *G0to = hypre_ParCSRMatrixOffd(G0t);
2674          HYPRE_Int *G0toI = hypre_CSRMatrixI(G0to);
2675          HYPRE_Real *G0toA = hypre_CSRMatrixData(G0to);
2676          HYPRE_Real *interior_nodes_data=hypre_VectorData(
2677             hypre_ParVectorLocalVector((hypre_ParVector*) ams_data -> interior_nodes));
2678 
2679 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
2680          if (exec == HYPRE_EXEC_DEVICE)
2681          {
2682             dim3 bDim = hypre_GetDefaultCUDABlockDimension();
2683             dim3 gDim = hypre_GetDefaultCUDAGridDimension(nv, "warp", bDim);
2684             HYPRE_CUDA_LAUNCH( hypreCUDAKernel_FixInterNodes, gDim, bDim,
2685                                nv, G0tdI, G0tdA, G0toI, G0toA, interior_nodes_data );
2686          }
2687          else
2688 #endif
2689          {
2690             for (i = 0; i < nv; i++)
2691             {
2692                if (interior_nodes_data[i] != 1)
2693                {
2694                   for (j = G0tdI[i]; j < G0tdI[i+1]; j++)
2695                      G0tdA[j] = 0.0;
2696                   if (G0toI)
2697                      for (j = G0toI[i]; j < G0toI[i+1]; j++)
2698                         G0toA[j] = 0.0;
2699                }
2700             }
2701          }
2702       }
2703       hypre_ParCSRMatrixTranspose(G0t, & ams_data -> G0, 1);
2704 
2705       /* Construct the subspace matrix A_G0 = G0^T G0 */
2706 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
2707       if (exec == HYPRE_EXEC_DEVICE)
2708       {
2709          ams_data -> A_G0 = hypre_ParCSRMatMat(G0t, ams_data -> G0);
2710       }
2711       else
2712 #endif
2713       {
2714          ams_data -> A_G0 = hypre_ParMatmul(G0t, ams_data -> G0);
2715       }
2716       hypre_ParCSRMatrixFixZeroRows(ams_data -> A_G0);
2717 
2718       /* Create AMG solver for A_G0 */
2719       HYPRE_BoomerAMGCreate(&ams_data -> B_G0);
2720       HYPRE_BoomerAMGSetCoarsenType(ams_data -> B_G0, ams_data -> B_G_coarsen_type);
2721       HYPRE_BoomerAMGSetAggNumLevels(ams_data -> B_G0, ams_data -> B_G_agg_levels);
2722       HYPRE_BoomerAMGSetRelaxType(ams_data -> B_G0, ams_data -> B_G_relax_type);
2723       HYPRE_BoomerAMGSetNumSweeps(ams_data -> B_G0, 1);
2724       HYPRE_BoomerAMGSetMaxLevels(ams_data -> B_G0, 25);
2725       HYPRE_BoomerAMGSetTol(ams_data -> B_G0, 0.0);
2726       HYPRE_BoomerAMGSetMaxIter(ams_data -> B_G0, 3); /* use just a few V-cycles */
2727       HYPRE_BoomerAMGSetStrongThreshold(ams_data -> B_G0, ams_data -> B_G_theta);
2728       HYPRE_BoomerAMGSetInterpType(ams_data -> B_G0, ams_data -> B_G_interp_type);
2729       HYPRE_BoomerAMGSetPMaxElmts(ams_data -> B_G0, ams_data -> B_G_Pmax);
2730       HYPRE_BoomerAMGSetMinCoarseSize(ams_data -> B_G0, 2); /* don't coarsen to 0 */
2731       /* Generally, don't use exact solve on the coarsest level (matrix may be singular) */
2732       HYPRE_BoomerAMGSetCycleRelaxType(ams_data -> B_G0, ams_data -> B_G_coarse_relax_type, 3);
2733       HYPRE_BoomerAMGSetup(ams_data -> B_G0,
2734                            (HYPRE_ParCSRMatrix)ams_data -> A_G0,
2735                            0, 0);
2736 
2737       /* Construct the preconditioner for ams_data->A = A + G0 G0^T.
2738          NOTE: this can be optimized significantly by taking into account that
2739          the sparsity pattern of A is subset of the sparsity pattern of G0 G0^T */
2740       {
2741 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
2742          hypre_ParCSRMatrix *A;
2743          if (exec == HYPRE_EXEC_DEVICE)
2744          {
2745             A = hypre_ParCSRMatMat(ams_data -> G0, G0t);
2746          }
2747          else
2748 #endif
2749          {
2750             A = hypre_ParMatmul(ams_data -> G0, G0t);
2751          }
2752          hypre_ParCSRMatrix *B = Aorig;
2753          hypre_ParCSRMatrix **C_ptr = &ams_data -> A;
2754          hypre_ParCSRMatrix *C;
2755          HYPRE_Real factor, lfactor;
2756          /* scale (penalize) G0 G0^T before adding it to the matrix */
2757          {
2758             HYPRE_Int i;
2759             HYPRE_Int B_num_rows = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(B));
2760             HYPRE_Real *B_diag_data = hypre_CSRMatrixData(hypre_ParCSRMatrixDiag(B));
2761             HYPRE_Real *B_offd_data = hypre_CSRMatrixData(hypre_ParCSRMatrixOffd(B));
2762             HYPRE_Int *B_diag_i = hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(B));
2763             HYPRE_Int *B_offd_i = hypre_CSRMatrixI(hypre_ParCSRMatrixOffd(B));
2764             lfactor = -1;
2765 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
2766             if (exec == HYPRE_EXEC_DEVICE)
2767             {
2768                HYPRE_Int nnz_diag = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixDiag(B));
2769                HYPRE_Int nnz_offd = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixOffd(B));
2770 #if defined(HYPRE_DEBUG)
2771                HYPRE_Int nnz;
2772                hypre_TMemcpy(&nnz, &B_diag_i[B_num_rows], HYPRE_Int, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);
2773                hypre_assert(nnz == nnz_diag);
2774                hypre_TMemcpy(&nnz, &B_offd_i[B_num_rows], HYPRE_Int, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);
2775                hypre_assert(nnz == nnz_offd);
2776 #endif
2777                if (nnz_diag)
2778                {
2779                   lfactor = HYPRE_THRUST_CALL( reduce,
2780                                                thrust::make_transform_iterator(B_diag_data,            absolute_value<HYPRE_Real>()),
2781                                                thrust::make_transform_iterator(B_diag_data + nnz_diag, absolute_value<HYPRE_Real>()),
2782                                                -1.0,
2783                                                thrust::maximum<HYPRE_Real>() );
2784                }
2785 
2786                if (nnz_offd)
2787                {
2788                   lfactor = HYPRE_THRUST_CALL( reduce,
2789                                                thrust::make_transform_iterator(B_offd_data,            absolute_value<HYPRE_Real>()),
2790                                                thrust::make_transform_iterator(B_offd_data + nnz_offd, absolute_value<HYPRE_Real>()),
2791                                                lfactor,
2792                                                thrust::maximum<HYPRE_Real>() );
2793 
2794                }
2795             }
2796             else
2797 #endif
2798             {
2799                for (i = 0; i < B_diag_i[B_num_rows]; i++)
2800                   if (fabs(B_diag_data[i]) > lfactor)
2801                      lfactor = fabs(B_diag_data[i]);
2802                for (i = 0; i < B_offd_i[B_num_rows]; i++)
2803                   if (fabs(B_offd_data[i]) > lfactor)
2804                      lfactor = fabs(B_offd_data[i]);
2805             }
2806 
2807             lfactor *= 1e-10; /* scaling factor: max|A_ij|*1e-10 */
2808             hypre_MPI_Allreduce(&lfactor, &factor, 1, HYPRE_MPI_REAL, hypre_MPI_MAX, hypre_ParCSRMatrixComm(A));
2809          }
2810 
2811          hypre_ParCSRMatrixAdd(factor, A, 1.0, B, &C);
2812 
2813          /*hypre_CSRMatrix *A_local, *B_local, *C_local, *C_tmp;
2814 
2815          MPI_Comm comm = hypre_ParCSRMatrixComm(A);
2816          HYPRE_BigInt global_num_rows = hypre_ParCSRMatrixGlobalNumRows(A);
2817          HYPRE_BigInt global_num_cols = hypre_ParCSRMatrixGlobalNumCols(A);
2818          HYPRE_BigInt *row_starts = hypre_ParCSRMatrixRowStarts(A);
2819          HYPRE_BigInt *col_starts = hypre_ParCSRMatrixColStarts(A);
2820          HYPRE_Int A_num_cols_offd = hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(A));
2821          HYPRE_Int A_num_nonzeros_diag = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixDiag(A));
2822          HYPRE_Int A_num_nonzeros_offd = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixOffd(A));
2823          HYPRE_Int B_num_cols_offd = hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(B));
2824          HYPRE_Int B_num_nonzeros_diag = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixDiag(B));
2825          HYPRE_Int B_num_nonzeros_offd = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixOffd(B));
2826 
2827          A_local = hypre_MergeDiagAndOffd(A);
2828          B_local = hypre_MergeDiagAndOffd(B);*/
2829          /* scale (penalize) G0 G0^T before adding it to the matrix */
2830          /*{
2831             HYPRE_Int i, nnz = hypre_CSRMatrixNumNonzeros(A_local);
2832             HYPRE_Real *data = hypre_CSRMatrixData(A_local);
2833             HYPRE_Real *dataB = hypre_CSRMatrixData(B_local);
2834             HYPRE_Int nnzB = hypre_CSRMatrixNumNonzeros(B_local);
2835             HYPRE_Real factor, lfactor;
2836             lfactor = -1;
2837             for (i = 0; i < nnzB; i++)
2838                if (fabs(dataB[i]) > lfactor)
2839                   lfactor = fabs(dataB[i]);
2840             lfactor *= 1e-10;
2841             hypre_MPI_Allreduce(&lfactor, &factor, 1, HYPRE_MPI_REAL, hypre_MPI_MAX,
2842                                 hypre_ParCSRMatrixComm(A));
2843             for (i = 0; i < nnz; i++)
2844                data[i] *= factor;
2845          }
2846          C_tmp = hypre_CSRMatrixBigAdd(A_local, B_local);
2847          C_local = hypre_CSRMatrixBigDeleteZeros(C_tmp,0.0);
2848          if (C_local)
2849             hypre_CSRMatrixDestroy(C_tmp);
2850          else
2851             C_local = C_tmp;
2852 
2853          C = hypre_ParCSRMatrixCreate (comm,
2854                                        global_num_rows,
2855                                        global_num_cols,
2856                                        row_starts,
2857                                        col_starts,
2858                                        A_num_cols_offd + B_num_cols_offd,
2859                                        A_num_nonzeros_diag + B_num_nonzeros_diag,
2860                                        A_num_nonzeros_offd + B_num_nonzeros_offd);
2861          GenerateDiagAndOffd(C_local, C,
2862                              hypre_ParCSRMatrixFirstColDiag(A),
2863                              hypre_ParCSRMatrixLastColDiag(A));
2864 
2865          hypre_CSRMatrixDestroy(A_local);
2866          hypre_CSRMatrixDestroy(B_local);
2867          hypre_CSRMatrixDestroy(C_local);
2868          */
2869 
2870          hypre_ParCSRMatrixDestroy(A);
2871 
2872          *C_ptr = C;
2873       }
2874 
2875       hypre_ParCSRMatrixDestroy(G0t);
2876    }
2877 
2878    /* Make sure that the first entry in each row is the diagonal one. */
2879    /* hypre_CSRMatrixReorder(hypre_ParCSRMatrixDiag(ams_data -> A)); */
2880 
2881    /* Compute the l1 norm of the rows of A */
2882    if (ams_data -> A_relax_type >= 1 && ams_data -> A_relax_type <= 4)
2883    {
2884       HYPRE_Real *l1_norm_data = NULL;
2885 
2886       hypre_ParCSRComputeL1Norms(ams_data -> A, ams_data -> A_relax_type, NULL, &l1_norm_data);
2887 
2888       ams_data -> A_l1_norms = hypre_SeqVectorCreate(hypre_ParCSRMatrixNumRows(ams_data -> A));
2889       hypre_VectorData(ams_data -> A_l1_norms) = l1_norm_data;
2890       hypre_SeqVectorInitialize_v2(ams_data -> A_l1_norms, hypre_ParCSRMatrixMemoryLocation(ams_data -> A));
2891    }
2892 
2893    /* Chebyshev? */
2894    if (ams_data -> A_relax_type == 16)
2895    {
2896       hypre_ParCSRMaxEigEstimateCG(ams_data->A, 1, 10,
2897                                    &ams_data->A_max_eig_est,
2898                                    &ams_data->A_min_eig_est);
2899    }
2900 
2901    /* If not given, compute Gx, Gy and Gz */
2902    {
2903       if (ams_data -> x != NULL &&
2904           (ams_data -> dim == 1 || ams_data -> y != NULL) &&
2905           (ams_data -> dim <= 2 || ams_data -> z != NULL))
2906          input_info = 1;
2907 
2908       if (ams_data -> Gx != NULL &&
2909           (ams_data -> dim == 1 || ams_data -> Gy != NULL) &&
2910           (ams_data -> dim <= 2 || ams_data -> Gz != NULL))
2911          input_info = 2;
2912 
2913       if (input_info == 1)
2914       {
2915          ams_data -> Gx = hypre_ParVectorInRangeOf(ams_data -> G);
2916          hypre_ParCSRMatrixMatvec (1.0, ams_data -> G, ams_data -> x, 0.0, ams_data -> Gx);
2917          if (ams_data -> dim >= 2)
2918          {
2919             ams_data -> Gy = hypre_ParVectorInRangeOf(ams_data -> G);
2920             hypre_ParCSRMatrixMatvec (1.0, ams_data -> G, ams_data -> y, 0.0, ams_data -> Gy);
2921          }
2922          if (ams_data -> dim == 3)
2923          {
2924             ams_data -> Gz = hypre_ParVectorInRangeOf(ams_data -> G);
2925             hypre_ParCSRMatrixMatvec (1.0, ams_data -> G, ams_data -> z, 0.0, ams_data -> Gz);
2926          }
2927       }
2928    }
2929 
2930    if (ams_data -> Pi == NULL && ams_data -> Pix == NULL)
2931    {
2932       if (ams_data -> cycle_type == 20)
2933          /* Construct the combined interpolation matrix [G,Pi] */
2934          hypre_AMSComputeGPi(ams_data -> A,
2935                              ams_data -> G,
2936                              ams_data -> Gx,
2937                              ams_data -> Gy,
2938                              ams_data -> Gz,
2939                              ams_data -> dim,
2940                              &ams_data -> Pi);
2941       else if (ams_data -> cycle_type > 10)
2942          /* Construct Pi{x,y,z} instead of Pi = [Pix,Piy,Piz] */
2943          hypre_AMSComputePixyz(ams_data -> A,
2944                                ams_data -> G,
2945                                ams_data -> Gx,
2946                                ams_data -> Gy,
2947                                ams_data -> Gz,
2948                                ams_data -> dim,
2949                                &ams_data -> Pix,
2950                                &ams_data -> Piy,
2951                                &ams_data -> Piz);
2952       else
2953          /* Construct the Pi interpolation matrix */
2954          hypre_AMSComputePi(ams_data -> A,
2955                             ams_data -> G,
2956                             ams_data -> Gx,
2957                             ams_data -> Gy,
2958                             ams_data -> Gz,
2959                             ams_data -> dim,
2960                             &ams_data -> Pi);
2961    }
2962 
2963    /* Keep Gx, Gy and Gz only if use the method with discrete divergence
2964       stabilization (where we use them to compute the local mesh size). */
2965    if (input_info == 1 && ams_data -> cycle_type != 9)
2966    {
2967       hypre_ParVectorDestroy(ams_data -> Gx);
2968       if (ams_data -> dim >= 2)
2969          hypre_ParVectorDestroy(ams_data -> Gy);
2970       if (ams_data -> dim == 3)
2971          hypre_ParVectorDestroy(ams_data -> Gz);
2972    }
2973 
2974    /* Create the AMG solver on the range of G^T */
2975    if (!ams_data -> beta_is_zero && ams_data -> cycle_type != 20)
2976    {
2977       HYPRE_BoomerAMGCreate(&ams_data -> B_G);
2978       HYPRE_BoomerAMGSetCoarsenType(ams_data -> B_G, ams_data -> B_G_coarsen_type);
2979       HYPRE_BoomerAMGSetAggNumLevels(ams_data -> B_G, ams_data -> B_G_agg_levels);
2980       HYPRE_BoomerAMGSetRelaxType(ams_data -> B_G, ams_data -> B_G_relax_type);
2981       HYPRE_BoomerAMGSetNumSweeps(ams_data -> B_G, 1);
2982       HYPRE_BoomerAMGSetMaxLevels(ams_data -> B_G, 25);
2983       HYPRE_BoomerAMGSetTol(ams_data -> B_G, 0.0);
2984       HYPRE_BoomerAMGSetMaxIter(ams_data -> B_G, 1);
2985       HYPRE_BoomerAMGSetStrongThreshold(ams_data -> B_G, ams_data -> B_G_theta);
2986       HYPRE_BoomerAMGSetInterpType(ams_data -> B_G, ams_data -> B_G_interp_type);
2987       HYPRE_BoomerAMGSetPMaxElmts(ams_data -> B_G, ams_data -> B_G_Pmax);
2988       HYPRE_BoomerAMGSetMinCoarseSize(ams_data -> B_G, 2); /* don't coarsen to 0 */
2989 
2990       /* Generally, don't use exact solve on the coarsest level (matrix may be singular) */
2991       HYPRE_BoomerAMGSetCycleRelaxType(ams_data -> B_G, ams_data -> B_G_coarse_relax_type, 3);
2992 
2993       if (ams_data -> cycle_type == 0)
2994          HYPRE_BoomerAMGSetMaxLevels(ams_data -> B_G, 2);
2995 
2996       /* If not given, construct the coarse space matrix by RAP */
2997       if (!ams_data -> A_G)
2998       {
2999          if (!hypre_ParCSRMatrixCommPkg(ams_data -> G))
3000          {
3001             hypre_MatvecCommPkgCreate(ams_data -> G);
3002          }
3003 
3004          if (!hypre_ParCSRMatrixCommPkg(ams_data -> A))
3005          {
3006             hypre_MatvecCommPkgCreate(ams_data -> A);
3007          }
3008 
3009 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
3010          if (exec == HYPRE_EXEC_DEVICE)
3011          {
3012             ams_data -> A_G = hypre_ParCSRMatrixRAPKT(ams_data -> G,
3013                                                       ams_data -> A,
3014                                                       ams_data -> G, 1);
3015          }
3016          else
3017 #endif
3018          {
3019             hypre_BoomerAMGBuildCoarseOperator(ams_data -> G,
3020                                                ams_data -> A,
3021                                                ams_data -> G,
3022                                                &ams_data -> A_G);
3023          }
3024 
3025          /* Make sure that A_G has no zero rows (this can happen
3026             if beta is zero in part of the domain). */
3027          hypre_ParCSRMatrixFixZeroRows(ams_data -> A_G);
3028          ams_data -> owns_A_G = 1;
3029       }
3030 
3031       HYPRE_BoomerAMGSetup(ams_data -> B_G,
3032                            (HYPRE_ParCSRMatrix)ams_data -> A_G,
3033                            NULL, NULL);
3034    }
3035 
3036    if (ams_data -> cycle_type > 10 && ams_data -> cycle_type != 20)
3037    /* Create the AMG solvers on the range of Pi{x,y,z}^T */
3038    {
3039       HYPRE_BoomerAMGCreate(&ams_data -> B_Pix);
3040       HYPRE_BoomerAMGSetCoarsenType(ams_data -> B_Pix, ams_data -> B_Pi_coarsen_type);
3041       HYPRE_BoomerAMGSetAggNumLevels(ams_data -> B_Pix, ams_data -> B_Pi_agg_levels);
3042       HYPRE_BoomerAMGSetRelaxType(ams_data -> B_Pix, ams_data -> B_Pi_relax_type);
3043       HYPRE_BoomerAMGSetNumSweeps(ams_data -> B_Pix, 1);
3044       HYPRE_BoomerAMGSetMaxLevels(ams_data -> B_Pix, 25);
3045       HYPRE_BoomerAMGSetTol(ams_data -> B_Pix, 0.0);
3046       HYPRE_BoomerAMGSetMaxIter(ams_data -> B_Pix, 1);
3047       HYPRE_BoomerAMGSetStrongThreshold(ams_data -> B_Pix, ams_data -> B_Pi_theta);
3048       HYPRE_BoomerAMGSetInterpType(ams_data -> B_Pix, ams_data -> B_Pi_interp_type);
3049       HYPRE_BoomerAMGSetPMaxElmts(ams_data -> B_Pix, ams_data -> B_Pi_Pmax);
3050       HYPRE_BoomerAMGSetMinCoarseSize(ams_data -> B_Pix, 2);
3051 
3052       HYPRE_BoomerAMGCreate(&ams_data -> B_Piy);
3053       HYPRE_BoomerAMGSetCoarsenType(ams_data -> B_Piy, ams_data -> B_Pi_coarsen_type);
3054       HYPRE_BoomerAMGSetAggNumLevels(ams_data -> B_Piy, ams_data -> B_Pi_agg_levels);
3055       HYPRE_BoomerAMGSetRelaxType(ams_data -> B_Piy, ams_data -> B_Pi_relax_type);
3056       HYPRE_BoomerAMGSetNumSweeps(ams_data -> B_Piy, 1);
3057       HYPRE_BoomerAMGSetMaxLevels(ams_data -> B_Piy, 25);
3058       HYPRE_BoomerAMGSetTol(ams_data -> B_Piy, 0.0);
3059       HYPRE_BoomerAMGSetMaxIter(ams_data -> B_Piy, 1);
3060       HYPRE_BoomerAMGSetStrongThreshold(ams_data -> B_Piy, ams_data -> B_Pi_theta);
3061       HYPRE_BoomerAMGSetInterpType(ams_data -> B_Piy, ams_data -> B_Pi_interp_type);
3062       HYPRE_BoomerAMGSetPMaxElmts(ams_data -> B_Piy, ams_data -> B_Pi_Pmax);
3063       HYPRE_BoomerAMGSetMinCoarseSize(ams_data -> B_Piy, 2);
3064 
3065       HYPRE_BoomerAMGCreate(&ams_data -> B_Piz);
3066       HYPRE_BoomerAMGSetCoarsenType(ams_data -> B_Piz, ams_data -> B_Pi_coarsen_type);
3067       HYPRE_BoomerAMGSetAggNumLevels(ams_data -> B_Piz, ams_data -> B_Pi_agg_levels);
3068       HYPRE_BoomerAMGSetRelaxType(ams_data -> B_Piz, ams_data -> B_Pi_relax_type);
3069       HYPRE_BoomerAMGSetNumSweeps(ams_data -> B_Piz, 1);
3070       HYPRE_BoomerAMGSetMaxLevels(ams_data -> B_Piz, 25);
3071       HYPRE_BoomerAMGSetTol(ams_data -> B_Piz, 0.0);
3072       HYPRE_BoomerAMGSetMaxIter(ams_data -> B_Piz, 1);
3073       HYPRE_BoomerAMGSetStrongThreshold(ams_data -> B_Piz, ams_data -> B_Pi_theta);
3074       HYPRE_BoomerAMGSetInterpType(ams_data -> B_Piz, ams_data -> B_Pi_interp_type);
3075       HYPRE_BoomerAMGSetPMaxElmts(ams_data -> B_Piz, ams_data -> B_Pi_Pmax);
3076       HYPRE_BoomerAMGSetMinCoarseSize(ams_data -> B_Piz, 2);
3077 
3078       /* Generally, don't use exact solve on the coarsest level (matrices may be singular) */
3079       HYPRE_BoomerAMGSetCycleRelaxType(ams_data -> B_Pix, ams_data -> B_Pi_coarse_relax_type, 3);
3080       HYPRE_BoomerAMGSetCycleRelaxType(ams_data -> B_Piy, ams_data -> B_Pi_coarse_relax_type, 3);
3081       HYPRE_BoomerAMGSetCycleRelaxType(ams_data -> B_Piz, ams_data -> B_Pi_coarse_relax_type, 3);
3082 
3083       if (ams_data -> cycle_type == 0)
3084       {
3085          HYPRE_BoomerAMGSetMaxLevels(ams_data -> B_Pix, 2);
3086          HYPRE_BoomerAMGSetMaxLevels(ams_data -> B_Piy, 2);
3087          HYPRE_BoomerAMGSetMaxLevels(ams_data -> B_Piz, 2);
3088       }
3089 
3090       /* Construct the coarse space matrices by RAP */
3091       if (!hypre_ParCSRMatrixCommPkg(ams_data -> Pix))
3092       {
3093          hypre_MatvecCommPkgCreate(ams_data -> Pix);
3094       }
3095 
3096 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
3097       if (exec == HYPRE_EXEC_DEVICE)
3098       {
3099          ams_data -> A_Pix = hypre_ParCSRMatrixRAPKT(ams_data -> Pix, ams_data -> A, ams_data -> Pix, 1);
3100       }
3101       else
3102 #endif
3103       {
3104          hypre_BoomerAMGBuildCoarseOperator(ams_data -> Pix,
3105                                             ams_data -> A,
3106                                             ams_data -> Pix,
3107                                             &ams_data -> A_Pix);
3108       }
3109 
3110       /* Make sure that A_Pix has no zero rows (this can happen
3111          for some kinds of boundary conditions with contact). */
3112       hypre_ParCSRMatrixFixZeroRows(ams_data -> A_Pix);
3113 
3114       HYPRE_BoomerAMGSetup(ams_data -> B_Pix,
3115                            (HYPRE_ParCSRMatrix)ams_data -> A_Pix,
3116                            NULL, NULL);
3117 
3118       if (ams_data -> Piy)
3119       {
3120          if (!hypre_ParCSRMatrixCommPkg(ams_data -> Piy))
3121          {
3122             hypre_MatvecCommPkgCreate(ams_data -> Piy);
3123          }
3124 
3125 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
3126          if (exec == HYPRE_EXEC_DEVICE)
3127          {
3128             ams_data -> A_Piy = hypre_ParCSRMatrixRAPKT(ams_data -> Piy,
3129                                                         ams_data -> A,
3130                                                         ams_data -> Piy, 1);
3131          }
3132          else
3133 #endif
3134          {
3135             hypre_BoomerAMGBuildCoarseOperator(ams_data -> Piy,
3136                                                ams_data -> A,
3137                                                ams_data -> Piy,
3138                                                &ams_data -> A_Piy);
3139          }
3140 
3141          /* Make sure that A_Piy has no zero rows (this can happen
3142             for some kinds of boundary conditions with contact). */
3143          hypre_ParCSRMatrixFixZeroRows(ams_data -> A_Piy);
3144 
3145          HYPRE_BoomerAMGSetup(ams_data -> B_Piy,
3146                               (HYPRE_ParCSRMatrix)ams_data -> A_Piy,
3147                               NULL, NULL);
3148       }
3149 
3150       if (ams_data -> Piz)
3151       {
3152          if (!hypre_ParCSRMatrixCommPkg(ams_data -> Piz))
3153          {
3154             hypre_MatvecCommPkgCreate(ams_data -> Piz);
3155          }
3156 
3157 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
3158          if (exec == HYPRE_EXEC_DEVICE)
3159          {
3160             ams_data -> A_Piz = hypre_ParCSRMatrixRAPKT(ams_data -> Piz,
3161                                                         ams_data -> A,
3162                                                         ams_data -> Piz, 1);
3163          }
3164          else
3165 #endif
3166          {
3167             hypre_BoomerAMGBuildCoarseOperator(ams_data -> Piz,
3168                                                ams_data -> A,
3169                                                ams_data -> Piz,
3170                                                &ams_data -> A_Piz);
3171          }
3172 
3173          /* Make sure that A_Piz has no zero rows (this can happen
3174             for some kinds of boundary conditions with contact). */
3175          hypre_ParCSRMatrixFixZeroRows(ams_data -> A_Piz);
3176 
3177          HYPRE_BoomerAMGSetup(ams_data -> B_Piz,
3178                               (HYPRE_ParCSRMatrix)ams_data -> A_Piz,
3179                               NULL, NULL);
3180       }
3181    }
3182    else
3183    /* Create the AMG solver on the range of Pi^T */
3184    {
3185       HYPRE_BoomerAMGCreate(&ams_data -> B_Pi);
3186       HYPRE_BoomerAMGSetCoarsenType(ams_data -> B_Pi, ams_data -> B_Pi_coarsen_type);
3187       HYPRE_BoomerAMGSetAggNumLevels(ams_data -> B_Pi, ams_data -> B_Pi_agg_levels);
3188       HYPRE_BoomerAMGSetRelaxType(ams_data -> B_Pi, ams_data -> B_Pi_relax_type);
3189       HYPRE_BoomerAMGSetNumSweeps(ams_data -> B_Pi, 1);
3190       HYPRE_BoomerAMGSetMaxLevels(ams_data -> B_Pi, 25);
3191       HYPRE_BoomerAMGSetTol(ams_data -> B_Pi, 0.0);
3192       HYPRE_BoomerAMGSetMaxIter(ams_data -> B_Pi, 1);
3193       HYPRE_BoomerAMGSetStrongThreshold(ams_data -> B_Pi, ams_data -> B_Pi_theta);
3194       HYPRE_BoomerAMGSetInterpType(ams_data -> B_Pi, ams_data -> B_Pi_interp_type);
3195       HYPRE_BoomerAMGSetPMaxElmts(ams_data -> B_Pi, ams_data -> B_Pi_Pmax);
3196       HYPRE_BoomerAMGSetMinCoarseSize(ams_data -> B_Pi, 2); /* don't coarsen to 0 */
3197 
3198       /* Generally, don't use exact solve on the coarsest level (matrix may be singular) */
3199       HYPRE_BoomerAMGSetCycleRelaxType(ams_data -> B_Pi, ams_data -> B_Pi_coarse_relax_type, 3);
3200 
3201       if (ams_data -> cycle_type == 0)
3202          HYPRE_BoomerAMGSetMaxLevels(ams_data -> B_Pi, 2);
3203 
3204       /* If not given, construct the coarse space matrix by RAP and
3205          notify BoomerAMG that this is a dim x dim block system. */
3206       if (!ams_data -> A_Pi)
3207       {
3208          if (!hypre_ParCSRMatrixCommPkg(ams_data -> Pi))
3209          {
3210             hypre_MatvecCommPkgCreate(ams_data -> Pi);
3211          }
3212 
3213          if (!hypre_ParCSRMatrixCommPkg(ams_data -> A))
3214          {
3215             hypre_MatvecCommPkgCreate(ams_data -> A);
3216          }
3217 
3218          if (ams_data -> cycle_type == 9)
3219          {
3220             /* Add a discrete divergence term to A before computing  Pi^t A Pi */
3221             {
3222                hypre_ParCSRMatrix *Gt, *GGt, *ApGGt;
3223                hypre_ParCSRMatrixTranspose(ams_data -> G, &Gt, 1);
3224 
3225                /* scale GGt by h^2 */
3226                {
3227                   HYPRE_Real h2;
3228                   HYPRE_Int i, j, k, ne;
3229 
3230                   hypre_CSRMatrix *Gt_diag = hypre_ParCSRMatrixDiag(Gt);
3231                   HYPRE_Int Gt_num_rows = hypre_CSRMatrixNumRows(Gt_diag);
3232                   HYPRE_Int *Gt_diag_I = hypre_CSRMatrixI(Gt_diag);
3233                   HYPRE_Int *Gt_diag_J = hypre_CSRMatrixJ(Gt_diag);
3234                   HYPRE_Real *Gt_diag_data = hypre_CSRMatrixData(Gt_diag);
3235 
3236                   hypre_CSRMatrix *Gt_offd = hypre_ParCSRMatrixOffd(Gt);
3237                   HYPRE_Int *Gt_offd_I = hypre_CSRMatrixI(Gt_offd);
3238                   HYPRE_Real *Gt_offd_data = hypre_CSRMatrixData(Gt_offd);
3239 
3240                   HYPRE_Real *Gx_data = hypre_VectorData(hypre_ParVectorLocalVector(ams_data -> Gx));
3241                   HYPRE_Real *Gy_data = hypre_VectorData(hypre_ParVectorLocalVector(ams_data -> Gy));
3242                   HYPRE_Real *Gz_data = hypre_VectorData(hypre_ParVectorLocalVector(ams_data -> Gz));
3243 
3244 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
3245                   if (exec == HYPRE_EXEC_DEVICE)
3246                   {
3247                      dim3 bDim = hypre_GetDefaultCUDABlockDimension();
3248                      dim3 gDim = hypre_GetDefaultCUDAGridDimension(Gt_num_rows, "warp", bDim);
3249                      HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSSetupScaleGGt, gDim, bDim,
3250                            Gt_num_rows, Gt_diag_I, Gt_diag_J, Gt_diag_data, Gt_offd_I, Gt_offd_data,
3251                            Gx_data, Gy_data, Gz_data );
3252                   }
3253                   else
3254 #endif
3255                   {
3256                      for (i = 0; i < Gt_num_rows; i++)
3257                      {
3258                         /* determine the characteristic mesh size for vertex i */
3259                         h2 = 0.0;
3260                         ne = 0;
3261                         for (j = Gt_diag_I[i]; j < Gt_diag_I[i+1]; j++)
3262                         {
3263                            k = Gt_diag_J[j];
3264                            h2 += Gx_data[k]*Gx_data[k]+Gy_data[k]*Gy_data[k]+Gz_data[k]*Gz_data[k];
3265                            ne++;
3266                         }
3267 
3268                         if (ne != 0)
3269                         {
3270                            h2 /= ne;
3271                            for (j = Gt_diag_I[i]; j < Gt_diag_I[i+1]; j++)
3272                               Gt_diag_data[j] *= h2;
3273                            for (j = Gt_offd_I[i]; j < Gt_offd_I[i+1]; j++)
3274                               Gt_offd_data[j] *= h2;
3275                         }
3276                      }
3277                   }
3278                }
3279 
3280                /* we only needed Gx, Gy and Gz to compute the local mesh size */
3281                if (input_info == 1)
3282                {
3283                   hypre_ParVectorDestroy(ams_data -> Gx);
3284                   if (ams_data -> dim >= 2)
3285                      hypre_ParVectorDestroy(ams_data -> Gy);
3286                   if (ams_data -> dim == 3)
3287                      hypre_ParVectorDestroy(ams_data -> Gz);
3288                }
3289 
3290 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
3291                if (exec == HYPRE_EXEC_DEVICE)
3292                {
3293                   GGt = hypre_ParCSRMatMat(ams_data -> G, Gt);
3294                }
3295 #endif
3296                else
3297                {
3298                   GGt = hypre_ParMatmul(ams_data -> G, Gt);
3299                }
3300                hypre_ParCSRMatrixDestroy(Gt);
3301 
3302                /* hypre_ParCSRMatrixAdd(GGt, A, &ams_data -> A); */
3303                hypre_ParCSRMatrixAdd(1.0, GGt, 1.0, ams_data -> A, &ApGGt);
3304                /*{
3305                   hypre_ParCSRMatrix *A = GGt;
3306                   hypre_ParCSRMatrix *B = ams_data -> A;
3307                   hypre_ParCSRMatrix **C_ptr = &ApGGt;
3308 
3309                   hypre_ParCSRMatrix *C;
3310                   hypre_CSRMatrix *A_local, *B_local, *C_local;
3311 
3312                   MPI_Comm comm = hypre_ParCSRMatrixComm(A);
3313                   HYPRE_BigInt global_num_rows = hypre_ParCSRMatrixGlobalNumRows(A);
3314                   HYPRE_BigInt global_num_cols = hypre_ParCSRMatrixGlobalNumCols(A);
3315                   HYPRE_BigInt *row_starts = hypre_ParCSRMatrixRowStarts(A);
3316                   HYPRE_BigInt *col_starts = hypre_ParCSRMatrixColStarts(A);
3317                   HYPRE_Int A_num_cols_offd = hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(A));
3318                   HYPRE_Int A_num_nonzeros_diag = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixDiag(A));
3319                   HYPRE_Int A_num_nonzeros_offd = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixOffd(A));
3320                   HYPRE_Int B_num_cols_offd = hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(B));
3321                   HYPRE_Int B_num_nonzeros_diag = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixDiag(B));
3322                   HYPRE_Int B_num_nonzeros_offd = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixOffd(B));
3323 
3324                   A_local = hypre_MergeDiagAndOffd(A);
3325                   B_local = hypre_MergeDiagAndOffd(B);
3326                   C_local = hypre_CSRMatrixBigAdd(A_local, B_local);
3327                   hypre_CSRMatrixBigJtoJ(C_local);
3328 
3329                   C = hypre_ParCSRMatrixCreate (comm,
3330                                                 global_num_rows,
3331                                                 global_num_cols,
3332                                                 row_starts,
3333                                                 col_starts,
3334                                                 A_num_cols_offd + B_num_cols_offd,
3335                                                 A_num_nonzeros_diag + B_num_nonzeros_diag,
3336                                                 A_num_nonzeros_offd + B_num_nonzeros_offd);
3337                   GenerateDiagAndOffd(C_local, C,
3338                                       hypre_ParCSRMatrixFirstColDiag(A),
3339                                       hypre_ParCSRMatrixLastColDiag(A));
3340 
3341                   hypre_CSRMatrixDestroy(A_local);
3342                   hypre_CSRMatrixDestroy(B_local);
3343                   hypre_CSRMatrixDestroy(C_local);
3344 
3345                   *C_ptr = C;
3346                }*/
3347 
3348                hypre_ParCSRMatrixDestroy(GGt);
3349 
3350 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
3351                if (exec == HYPRE_EXEC_DEVICE)
3352                {
3353                   ams_data -> A_Pi = hypre_ParCSRMatrixRAPKT(ams_data -> Pi, ApGGt, ams_data -> Pi, 1);
3354                }
3355                else
3356 #endif
3357                {
3358                   hypre_BoomerAMGBuildCoarseOperator(ams_data -> Pi,
3359                                                      ApGGt,
3360                                                      ams_data -> Pi,
3361                                                      &ams_data -> A_Pi);
3362                }
3363             }
3364          }
3365          else
3366          {
3367 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
3368             if (exec == HYPRE_EXEC_DEVICE)
3369             {
3370                ams_data -> A_Pi = hypre_ParCSRMatrixRAPKT(ams_data -> Pi, ams_data -> A, ams_data -> Pi, 1);
3371             }
3372             else
3373 #endif
3374             {
3375                hypre_BoomerAMGBuildCoarseOperator(ams_data -> Pi,
3376                                                   ams_data -> A,
3377                                                   ams_data -> Pi,
3378                                                   &ams_data -> A_Pi);
3379             }
3380          }
3381 
3382          ams_data -> owns_A_Pi = 1;
3383 
3384          if (ams_data -> cycle_type != 20)
3385             HYPRE_BoomerAMGSetNumFunctions(ams_data -> B_Pi, ams_data -> dim);
3386          else
3387             HYPRE_BoomerAMGSetNumFunctions(ams_data -> B_Pi, ams_data -> dim + 1);
3388          /* HYPRE_BoomerAMGSetNodal(ams_data -> B_Pi, 1); */
3389       }
3390 
3391       /* Make sure that A_Pi has no zero rows (this can happen for
3392          some kinds of boundary conditions with contact). */
3393       hypre_ParCSRMatrixFixZeroRows(ams_data -> A_Pi);
3394 
3395       HYPRE_BoomerAMGSetup(ams_data -> B_Pi,
3396                            (HYPRE_ParCSRMatrix)ams_data -> A_Pi,
3397                            0, 0);
3398    }
3399 
3400    /* Allocate temporary vectors */
3401    ams_data -> r0 = hypre_ParVectorInRangeOf(ams_data -> A);
3402    ams_data -> g0 = hypre_ParVectorInRangeOf(ams_data -> A);
3403    if (ams_data -> A_G)
3404    {
3405       ams_data -> r1 = hypre_ParVectorInRangeOf(ams_data -> A_G);
3406       ams_data -> g1 = hypre_ParVectorInRangeOf(ams_data -> A_G);
3407    }
3408    if (ams_data -> r1 == NULL && ams_data -> A_Pix)
3409    {
3410       ams_data -> r1 = hypre_ParVectorInRangeOf(ams_data -> A_Pix);
3411       ams_data -> g1 = hypre_ParVectorInRangeOf(ams_data -> A_Pix);
3412    }
3413    if (ams_data -> Pi)
3414    {
3415       ams_data -> r2 = hypre_ParVectorInDomainOf(ams_data -> Pi);
3416       ams_data -> g2 = hypre_ParVectorInDomainOf(ams_data -> Pi);
3417    }
3418 
3419    return hypre_error_flag;
3420 }
3421 
3422 /*--------------------------------------------------------------------------
3423  * hypre_AMSSolve
3424  *
3425  * Solve the system A x = b.
3426  *--------------------------------------------------------------------------*/
3427 
HYPRE_Int hypre_AMSSolve(void *solver,
                         hypre_ParCSRMatrix *A,
                         hypre_ParVector *b,
                         hypre_ParVector *x)
{
   hypre_AMSData *ams_data = (hypre_AMSData *) solver;

   HYPRE_Int i, my_id = -1;
   /* relative_resid starts at 0 so that a single-iteration run (maxit == 1,
      where no residual is computed) immediately satisfies the tol test below */
   HYPRE_Real r0_norm, r_norm, b_norm, relative_resid = 0, old_resid;

   /* cycle is a string of subspace indices describing one multiplicative/
      additive correction sweep; digits index the Ai/Bi/Pi arrays below
      ('0' = fine-grid smoothing), '(' , ')' and '+' switch to additive mode
      (see hypre_ParCSRSubspacePrec). Longest string used is 12 chars, so 30
      is ample. */
   char cycle[30];
   /* Per-subspace data, indexed by the digits (minus one) in `cycle`:
      coarse matrices Ai[], their AMG solvers Bi[] (with matching solve
      functions HBi[]), interpolations Pi[], and temporary vectors ri[]/gi[]. */
   hypre_ParCSRMatrix *Ai[5], *Pi[5];
   HYPRE_Solver Bi[5];
   HYPRE_PtrToSolverFcn HBi[5];
   hypre_ParVector *ri[5], *gi[5];
   HYPRE_Int needZ = 0;

   /* extra work vector for certain relaxation types; created lazily and
      cached in ams_data->zz so repeated solves reuse it */
   hypre_ParVector *z = ams_data -> zz;

   /* subspace 1: range of G (scalar potential), 2: range of Pi (vector nodal),
      3/4/5: ranges of Pix/Piy/Piz (componentwise nodal) */
   Ai[0] = ams_data -> A_G;    Pi[0] = ams_data -> G;
   Ai[1] = ams_data -> A_Pi;   Pi[1] = ams_data -> Pi;
   Ai[2] = ams_data -> A_Pix;  Pi[2] = ams_data -> Pix;
   Ai[3] = ams_data -> A_Piy;  Pi[3] = ams_data -> Piy;
   Ai[4] = ams_data -> A_Piz;  Pi[4] = ams_data -> Piz;

   /* A_Pi is a dim x dim block system, hence the block AMG solve there */
   Bi[0] = ams_data -> B_G;    HBi[0] = (HYPRE_PtrToSolverFcn) hypre_BoomerAMGSolve;
   Bi[1] = ams_data -> B_Pi;   HBi[1] = (HYPRE_PtrToSolverFcn) hypre_BoomerAMGBlockSolve;
   Bi[2] = ams_data -> B_Pix;  HBi[2] = (HYPRE_PtrToSolverFcn) hypre_BoomerAMGSolve;
   Bi[3] = ams_data -> B_Piy;  HBi[3] = (HYPRE_PtrToSolverFcn) hypre_BoomerAMGSolve;
   Bi[4] = ams_data -> B_Piz;  HBi[4] = (HYPRE_PtrToSolverFcn) hypre_BoomerAMGSolve;

   /* r1/g1 are sized for the nodal scalar spaces (G, Pix/Piy/Piz share a
      range), r2/g2 for the dim-sized Pi space */
   ri[0] = ams_data -> r1;     gi[0] = ams_data -> g1;
   ri[1] = ams_data -> r2;     gi[1] = ams_data -> g2;
   ri[2] = ams_data -> r1;     gi[2] = ams_data -> g1;
   ri[3] = ams_data -> r1;     gi[3] = ams_data -> g1;
   ri[4] = ams_data -> r1;     gi[4] = ams_data -> g1;

   /* may need to create an additional temporary vector for relaxation */
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
   HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) );

   if (exec == HYPRE_EXEC_DEVICE)
   {
      /* on device: l1-GS/SSOR (2), truncated l1-GS (4) and Chebyshev (16)
         relaxation all need scratch space */
      needZ = ams_data -> A_relax_type == 2 || ams_data -> A_relax_type == 4 || ams_data -> A_relax_type == 16;
   }
   else
#endif
   {
      /* on host: threaded hybrid relaxation or Chebyshev needs scratch space */
      needZ = hypre_NumThreads() > 1 || ams_data -> A_relax_type == 16;
   }

   if (needZ && !z)
   {
      z = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A),
                                hypre_ParCSRMatrixGlobalNumRows(A),
                                hypre_ParCSRMatrixRowStarts(A));
      hypre_ParVectorInitialize(z);
      ams_data -> zz = z;   /* cache for subsequent solves */
   }

   if (ams_data -> print_level > 0)
      hypre_MPI_Comm_rank(hypre_ParCSRMatrixComm(A), &my_id);

   /* Compatible subspace projection for problems with zero-conductivity regions.
      Note that this modifies the input (r.h.s.) vector b! */
   if ( (ams_data -> B_G0) &&
        (++ams_data->solve_counter % ( ams_data -> projection_frequency ) == 0) )
   {
      /* hypre_printf("Projecting onto the compatible subspace...\n"); */
      hypre_AMSProjectOutGradients(ams_data, b);
   }

   /* Select the cycle string for the requested cycle_type; the beta == 0
      (singular mass term) case skips the G-space correction ('1'). */
   if (ams_data -> beta_is_zero)
   {
      switch (ams_data -> cycle_type)
      {
         case 0:
            hypre_sprintf(cycle,"%s","0");
            break;
         case 1:
         case 3:
         case 5:
         case 7:
         default:
            hypre_sprintf(cycle,"%s","020");
            break;
         case 2:
         case 4:
         case 6:
         case 8:
            hypre_sprintf(cycle,"%s","(0+2)");
            break;
         case 11:
         case 13:
            hypre_sprintf(cycle,"%s","0345430");
            break;
         case 12:
            hypre_sprintf(cycle,"%s","(0+3+4+5)");
            break;
         case 14:
            hypre_sprintf(cycle,"%s","0(+3+4+5)0");
            break;
      }
   }
   else
   {
      switch (ams_data -> cycle_type)
      {
         case 0:
            hypre_sprintf(cycle,"%s","010");
            break;
         case 1:
         default:
            hypre_sprintf(cycle,"%s","01210");
            break;
         case 2:
            hypre_sprintf(cycle,"%s","(0+1+2)");
            break;
         case 3:
            hypre_sprintf(cycle,"%s","02120");
            break;
         case 4:
            hypre_sprintf(cycle,"%s","(010+2)");
            break;
         case 5:
            hypre_sprintf(cycle,"%s","0102010");
            break;
         case 6:
            hypre_sprintf(cycle,"%s","(020+1)");
            break;
         case 7:
            hypre_sprintf(cycle,"%s","0201020");
            break;
         case 8:
            hypre_sprintf(cycle,"%s","0(+1+2)0");
            break;
         case 9:
            /* cycle_type 9 uses the same sweep as type 1; it differs only in
               the divergence-stabilized A_Pi built during setup */
            hypre_sprintf(cycle,"%s","01210");
            break;
         case 11:
            hypre_sprintf(cycle,"%s","013454310");
            break;
         case 12:
            hypre_sprintf(cycle,"%s","(0+1+3+4+5)");
            break;
         case 13:
            hypre_sprintf(cycle,"%s","034515430");
            break;
         case 14:
            hypre_sprintf(cycle,"%s","01(+3+4+5)10");
            break;
         case 20:
            /* combined [G,Pi] interpolation: single (dim+1)-block correction */
            hypre_sprintf(cycle,"%s","020");
            break;
      }
   }

   /* Main iteration: apply the subspace preconditioner maxit times, with
      residual monitoring only when used as a standalone solver (maxit > 1). */
   for (i = 0; i < ams_data -> maxit; i++)
   {
      /* Compute initial residual norms */
      if (ams_data -> maxit > 1 && i == 0)
      {
         hypre_ParVectorCopy(b, ams_data -> r0);
         hypre_ParCSRMatrixMatvec(-1.0, ams_data -> A, x, 1.0, ams_data -> r0);
         r_norm = sqrt(hypre_ParVectorInnerProd(ams_data -> r0,ams_data -> r0));
         r0_norm = r_norm;
         b_norm = sqrt(hypre_ParVectorInnerProd(b, b));
         if (b_norm)
            relative_resid = r_norm / b_norm;
         else
            relative_resid = r_norm;   /* zero r.h.s.: report absolute norm */
         if (my_id == 0 && ams_data -> print_level > 0)
         {
            hypre_printf("                                            relative\n");
            hypre_printf("               residual        factor       residual\n");
            hypre_printf("               --------        ------       --------\n");
            hypre_printf("    Initial    %e                 %e\n",
                         r_norm, relative_resid);
         }
      }

      /* Apply the preconditioner */
      hypre_ParCSRSubspacePrec(ams_data -> A,
                               ams_data -> A_relax_type,
                               ams_data -> A_relax_times,
                               ams_data -> A_l1_norms ? hypre_VectorData(ams_data -> A_l1_norms) : NULL,
                               ams_data -> A_relax_weight,
                               ams_data -> A_omega,
                               ams_data -> A_max_eig_est,
                               ams_data -> A_min_eig_est,
                               ams_data -> A_cheby_order,
                               ams_data -> A_cheby_fraction,
                               Ai, Bi, HBi, Pi, ri, gi,
                               b, x,
                               ams_data -> r0,
                               ams_data -> g0,
                               cycle,
                               z);

      /* Compute new residual norms */
      if (ams_data -> maxit > 1)
      {
         old_resid = r_norm;
         hypre_ParVectorCopy(b, ams_data -> r0);
         hypre_ParCSRMatrixMatvec(-1.0, ams_data -> A, x, 1.0, ams_data -> r0);
         r_norm = sqrt(hypre_ParVectorInnerProd(ams_data -> r0,ams_data -> r0));
         if (b_norm)
            relative_resid = r_norm / b_norm;
         else
            relative_resid = r_norm;
         if (my_id == 0 && ams_data -> print_level > 0)
            hypre_printf("    Cycle %2d   %e    %f     %e \n",
                         i+1, r_norm, r_norm / old_resid, relative_resid);
      }

      if (relative_resid < ams_data -> tol)
      {
         i++;   /* count the converging iteration before exiting the loop */
         break;
      }
   }

   /* Average convergence factor: (||r_i|| / ||r_0||)^(1/i) */
   if (my_id == 0 && ams_data -> print_level > 0 && ams_data -> maxit > 1)
      hypre_printf("\n\n Average Convergence Factor = %f\n\n",
                   pow((r_norm/r0_norm),(1.0/(HYPRE_Real) i)));

   ams_data -> num_iterations = i;
   ams_data -> rel_resid_norm = relative_resid;

   /* flag non-convergence only when a tolerance was actually requested */
   if (ams_data -> num_iterations == ams_data -> maxit && ams_data -> tol > 0.0)
      hypre_error(HYPRE_ERROR_CONV);

   return hypre_error_flag;
}
3662 
3663 /*--------------------------------------------------------------------------
3664  * hypre_ParCSRSubspacePrec
3665  *
3666  * General subspace preconditioner for A0 y = x, based on ParCSR storage.
3667  *
3668  * P[i] and A[i] are the interpolation and coarse grid matrices for
3669  * the (i+1)'th subspace. B[i] is an AMG solver for A[i]. r[i] and g[i]
3670  * are temporary vectors. A0_* are the fine grid smoothing parameters.
3671  *
3672  * The default mode is multiplicative, '+' changes the next correction
3673  * to additive, based on residual computed at '('.
3674  *--------------------------------------------------------------------------*/
3675 
hypre_ParCSRSubspacePrec(hypre_ParCSRMatrix * A0,HYPRE_Int A0_relax_type,HYPRE_Int A0_relax_times,HYPRE_Real * A0_l1_norms,HYPRE_Real A0_relax_weight,HYPRE_Real A0_omega,HYPRE_Real A0_max_eig_est,HYPRE_Real A0_min_eig_est,HYPRE_Int A0_cheby_order,HYPRE_Real A0_cheby_fraction,hypre_ParCSRMatrix ** A,HYPRE_Solver * B,HYPRE_PtrToSolverFcn * HB,hypre_ParCSRMatrix ** P,hypre_ParVector ** r,hypre_ParVector ** g,hypre_ParVector * x,hypre_ParVector * y,hypre_ParVector * r0,hypre_ParVector * g0,char * cycle,hypre_ParVector * z)3676 HYPRE_Int hypre_ParCSRSubspacePrec(/* fine space matrix */
3677                                    hypre_ParCSRMatrix *A0,
3678                                    /* relaxation parameters */
3679                                    HYPRE_Int A0_relax_type,
3680                                    HYPRE_Int A0_relax_times,
3681                                    HYPRE_Real *A0_l1_norms,
3682                                    HYPRE_Real A0_relax_weight,
3683                                    HYPRE_Real A0_omega,
3684                                    HYPRE_Real A0_max_eig_est,
3685                                    HYPRE_Real A0_min_eig_est,
3686                                    HYPRE_Int A0_cheby_order,
3687                                    HYPRE_Real A0_cheby_fraction,
3688                                    /* subspace matrices */
3689                                    hypre_ParCSRMatrix **A,
3690                                    /* subspace preconditioners */
3691                                    HYPRE_Solver *B,
3692                                    /* hypre solver functions for B */
3693                                    HYPRE_PtrToSolverFcn *HB,
3694                                    /* subspace interpolations */
3695                                    hypre_ParCSRMatrix **P,
3696                                    /* temporary subspace vectors */
3697                                    hypre_ParVector **r,
3698                                    hypre_ParVector **g,
3699                                    /* right-hand side */
3700                                    hypre_ParVector *x,
3701                                    /* current approximation */
3702                                    hypre_ParVector *y,
3703                                    /* current residual */
3704                                    hypre_ParVector *r0,
3705                                    /* temporary vector */
3706                                    hypre_ParVector *g0,
3707                                    char *cycle,
3708                                    /* temporary vector */
3709                                    hypre_ParVector *z)
3710 {
3711    char *op;
3712    HYPRE_Int use_saved_residual = 0;
3713 
3714    for (op = cycle; *op != '\0'; op++)
3715    {
3716       /* do nothing */
3717       if (*op == ')')
3718          continue;
3719 
3720       /* compute the residual: r = x - Ay */
3721       else if (*op == '(')
3722       {
3723          hypre_ParVectorCopy(x,r0);
3724          hypre_ParCSRMatrixMatvec(-1.0, A0, y, 1.0, r0);
3725       }
3726 
3727       /* switch to additive correction */
3728       else if (*op == '+')
3729       {
3730          use_saved_residual = 1;
3731          continue;
3732       }
3733 
3734       /* smooth: y += S (x - Ay) */
3735       else if (*op == '0')
3736       {
3737          hypre_ParCSRRelax(A0, x,
3738                            A0_relax_type,
3739                            A0_relax_times,
3740                            A0_l1_norms,
3741                            A0_relax_weight,
3742                            A0_omega,
3743                            A0_max_eig_est,
3744                            A0_min_eig_est,
3745                            A0_cheby_order,
3746                            A0_cheby_fraction,
3747                            y, g0, z);
3748       }
3749 
3750       /* subspace correction: y += P B^{-1} P^t r */
3751       else
3752       {
3753          HYPRE_Int i = *op - '1';
3754          if (i < 0)
3755             hypre_error_in_arg(16);
3756 
3757          /* skip empty subspaces */
3758          if (!A[i]) continue;
3759 
3760          /* compute the residual? */
3761          if (use_saved_residual)
3762          {
3763             use_saved_residual = 0;
3764             hypre_ParCSRMatrixMatvecT(1.0, P[i], r0, 0.0, r[i]);
3765          }
3766          else
3767          {
3768             hypre_ParVectorCopy(x,g0);
3769             hypre_ParCSRMatrixMatvec(-1.0, A0, y, 1.0, g0);
3770             hypre_ParCSRMatrixMatvecT(1.0, P[i], g0, 0.0, r[i]);
3771          }
3772 
3773          hypre_ParVectorSetConstantValues(g[i], 0.0);
3774          (*HB[i]) (B[i], (HYPRE_Matrix)A[i],
3775                    (HYPRE_Vector)r[i], (HYPRE_Vector)g[i]);
3776          hypre_ParCSRMatrixMatvec(1.0, P[i], g[i], 0.0, g0);
3777          hypre_ParVectorAxpy(1.0, g0, y);
3778       }
3779    }
3780 
3781    return hypre_error_flag;
3782 }
3783 
3784 /*--------------------------------------------------------------------------
3785  * hypre_AMSGetNumIterations
3786  *
3787  * Get the number of AMS iterations.
3788  *--------------------------------------------------------------------------*/
3789 
HYPRE_Int hypre_AMSGetNumIterations(void *solver,
                                    HYPRE_Int *num_iterations)
{
   /* Report the iteration count recorded by the last AMS solve. */
   hypre_AMSData *data = (hypre_AMSData *) solver;

   *num_iterations = data -> num_iterations;

   return hypre_error_flag;
}
3797 
3798 /*--------------------------------------------------------------------------
3799  * hypre_AMSGetFinalRelativeResidualNorm
3800  *
3801  * Get the final relative residual norm in AMS.
3802  *--------------------------------------------------------------------------*/
3803 
hypre_AMSGetFinalRelativeResidualNorm(void * solver,HYPRE_Real * rel_resid_norm)3804 HYPRE_Int hypre_AMSGetFinalRelativeResidualNorm(void *solver,
3805                                                 HYPRE_Real *rel_resid_norm)
3806 {
3807    hypre_AMSData *ams_data = (hypre_AMSData *) solver;
3808    *rel_resid_norm = ams_data -> rel_resid_norm;
3809    return hypre_error_flag;
3810 }
3811 
3812 /*--------------------------------------------------------------------------
3813  * hypre_AMSProjectOutGradients
3814  *
3815  * For problems with zero-conductivity regions, project the vector onto the
3816  * compatible subspace: x = (I - G0 (G0^t G0)^{-1} G0^T) x, where G0 is the
3817  * discrete gradient restricted to the interior nodes of the regions with
3818  * zero conductivity. This ensures that x is orthogonal to the gradients in
3819  * the range of G0.
3820  *
3821  * This function is typically called after the solution iteration is complete,
3822  * in order to facilitate the visualization of the computed field. Without it
3823  * the values in the zero-conductivity regions contain kernel components.
3824  *--------------------------------------------------------------------------*/
3825 
hypre_AMSProjectOutGradients(void * solver,hypre_ParVector * x)3826 HYPRE_Int hypre_AMSProjectOutGradients(void *solver,
3827                                        hypre_ParVector *x)
3828 {
3829    hypre_AMSData *ams_data = (hypre_AMSData *) solver;
3830 
3831    if (ams_data -> B_G0)
3832    {
3833       hypre_ParCSRMatrixMatvecT(1.0, ams_data -> G0, x, 0.0, ams_data -> r1);
3834       hypre_ParVectorSetConstantValues(ams_data -> g1, 0.0);
3835       hypre_BoomerAMGSolve(ams_data -> B_G0, ams_data -> A_G0, ams_data -> r1, ams_data -> g1);
3836       hypre_ParCSRMatrixMatvec(1.0, ams_data -> G0, ams_data -> g1, 0.0, ams_data -> g0);
3837       hypre_ParVectorAxpy(-1.0, ams_data -> g0, x);
3838    }
3839 
3840    return hypre_error_flag;
3841 }
3842 
3843 /*--------------------------------------------------------------------------
3844  * hypre_AMSConstructDiscreteGradient
3845  *
3846  * Construct and return the lowest-order discrete gradient matrix G, based on:
 * - a matrix on the edges (e.g. the stiffness matrix A)
3848  * - a vector on the vertices (e.g. the x coordinates)
3849  * - the array edge_vertex, which lists the global indexes of the
3850  *   vertices of the local edges.
3851  *
3852  * We assume that edge_vertex lists the edge vertices consecutively,
 * and that the orientation of all edges is consistent. More specifically:
3854  * If edge_orientation = 1, the edges are already oriented.
3855  * If edge_orientation = 2, the orientation of edge i depends only on the
3856  *                          sign of edge_vertex[2*i+1] - edge_vertex[2*i].
3857  *--------------------------------------------------------------------------*/
3858 
HYPRE_Int hypre_AMSConstructDiscreteGradient(hypre_ParCSRMatrix *A,
                                             hypre_ParVector *x_coord,
                                             HYPRE_BigInt *edge_vertex,
                                             HYPRE_Int edge_orientation,
                                             hypre_ParCSRMatrix **G_ptr)
{
   hypre_ParCSRMatrix *G;

   HYPRE_Int nedges;

   /* One row of G per local edge (= local row of A) */
   nedges = hypre_ParCSRMatrixNumRows(A);

   /* Construct the local part of G based on edge_vertex and the edge
      and vertex partitionings from A and x_coord */
   {
      /* Every row has exactly two entries: -1 and +1 at the two edge
         endpoints, so the row pointer is simply I[i] = 2*i. */
      HYPRE_Int i, *I = hypre_CTAlloc(HYPRE_Int,  nedges+1, HYPRE_MEMORY_HOST);
      HYPRE_Real *data = hypre_CTAlloc(HYPRE_Real,  2*nedges, HYPRE_MEMORY_HOST);
      hypre_CSRMatrix *local = hypre_CSRMatrixCreate (nedges,
                                                      hypre_ParVectorGlobalSize(x_coord),
                                                      2*nedges);

      for (i = 0; i <= nedges; i++)
         I[i] = 2*i;

      if (edge_orientation == 1)
      {
         /* Assume that the edges are already oriented */
         for (i = 0; i < 2*nedges; i+=2)
         {
            data[i]   = -1.0;
            data[i+1] =  1.0;
         }
      }
      else if (edge_orientation == 2)
      {
         /* Assume that the edge orientation is based on the vertex indexes */
         for (i = 0; i < 2*nedges; i+=2)
         {
            if (edge_vertex[i] < edge_vertex[i+1])
            {
               data[i]   = -1.0;
               data[i+1] =  1.0;
            }
            else
            {
               data[i]   =  1.0;
               data[i+1] = -1.0;
            }
         }
      }
      else
      {
         /* Only orientations 1 and 2 are supported (see header comment) */
         hypre_error_in_arg(4);
      }

      /* NOTE(review): the caller-owned edge_vertex array is installed as
         the (big) column array of `local`, while OwnsData is set to 1 and
         `local` is destroyed below. Whether edge_vertex survives depends
         on hypre_CSRMatrixBigJtoJ / GenerateDiagAndOffd taking over or
         replacing that array internally — confirm before reusing
         edge_vertex after this call. */
      hypre_CSRMatrixI(local) = I;
      hypre_CSRMatrixBigJ(local) = edge_vertex;
      hypre_CSRMatrixData(local) = data;

      hypre_CSRMatrixRownnz(local) = NULL;
      hypre_CSRMatrixOwnsData(local) = 1;
      hypre_CSRMatrixNumRownnz(local) = nedges;

      /* Generate the discrete gradient matrix: rows partitioned like A
         (edges), columns partitioned like x_coord (vertices) */
      G = hypre_ParCSRMatrixCreate(hypre_ParCSRMatrixComm(A),
                                   hypre_ParCSRMatrixGlobalNumRows(A),
                                   hypre_ParVectorGlobalSize(x_coord),
                                   hypre_ParCSRMatrixRowStarts(A),
                                   hypre_ParVectorPartitioning(x_coord),
                                   0, 0, 0);
      /* Convert the big (global) column indexes to local ones, then split
         `local` into the diag/offd parts of G */
      hypre_CSRMatrixBigJtoJ(local);
      GenerateDiagAndOffd(local, G,
                          hypre_ParVectorFirstIndex(x_coord),
                          hypre_ParVectorLastIndex(x_coord));


      /* Account for empty rows in G. These may appear when A includes only
         the interior (non-Dirichlet b.c.) edges. */
      {
         hypre_CSRMatrix *G_diag = hypre_ParCSRMatrixDiag(G);
         G_diag->num_cols = hypre_VectorSize(hypre_ParVectorLocalVector(x_coord));
      }

      /* Free the local matrix */
      hypre_CSRMatrixDestroy(local);
   }

   /* Ownership of G transfers to the caller */
   *G_ptr = G;

   return hypre_error_flag;
}
3950 
3951 /*--------------------------------------------------------------------------
3952  * hypre_AMSFEISetup
3953  *
3954  * Construct an AMS solver object based on the following data:
3955  *
3956  *    A              - the edge element stiffness matrix
3957  *    num_vert       - number of vertices (nodes) in the processor
3958  *    num_local_vert - number of vertices owned by the processor
3959  *    vert_number    - global indexes of the vertices in the processor
3960  *    vert_coord     - coordinates of the vertices in the processor
3961  *    num_edges      - number of edges owned by the processor
3962  *    edge_vertex    - the vertices of the edges owned by the processor.
3963  *                     Vertices are in local numbering (the same as in
3964  *                     vert_number), and edge orientation is always from
3965  *                     the first to the second vertex.
3966  *
3967  * Here we distinguish between vertices that belong to elements in the
3968  * current processor, and the subset of these vertices that is owned by
3969  * the processor.
3970  *
3971  * This function is written specifically for input from the FEI and should
3972  * be called before hypre_AMSSetup().
3973  *--------------------------------------------------------------------------*/
3974 
HYPRE_Int hypre_AMSFEISetup(void *solver,
                            hypre_ParCSRMatrix *A,
                            hypre_ParVector *b,
                            hypre_ParVector *x,
                            HYPRE_Int num_vert,
                            HYPRE_Int num_local_vert,
                            HYPRE_BigInt *vert_number,
                            HYPRE_Real *vert_coord,
                            HYPRE_Int num_edges,
                            HYPRE_BigInt *edge_vertex)
{
   hypre_AMSData *ams_data = (hypre_AMSData *) solver;

   HYPRE_Int i, j;

   hypre_ParCSRMatrix *G;
   hypre_ParVector *x_coord, *y_coord, *z_coord;
   HYPRE_Real *x_data, *y_data, *z_data;

   MPI_Comm comm = hypre_ParCSRMatrixComm(A);
   HYPRE_BigInt *vert_part, num_global_vert;
   HYPRE_BigInt vert_start, vert_end;
   HYPRE_BigInt big_local_vert = (HYPRE_BigInt) num_local_vert;

   /* Find the processor partitioning of the vertices: a prefix sum over
      the locally-owned vertex counts gives [vert_part[0], vert_part[1]) */
   vert_part = hypre_TAlloc(HYPRE_BigInt, 2, HYPRE_MEMORY_HOST);
   hypre_MPI_Scan(&big_local_vert, &vert_part[1], 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
   vert_part[0] = vert_part[1] - big_local_vert;
   hypre_MPI_Allreduce(&big_local_vert, &num_global_vert, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);

   /* Construct hypre parallel vectors for the vertex coordinates.
      NOTE(review): the single vert_part array is handed to three vectors
      and to G below — this relies on hypre copying (not adopting) the
      partitioning; confirm against hypre_ParVectorCreate's ownership
      semantics for this hypre version. */
   x_coord = hypre_ParVectorCreate(comm, num_global_vert, vert_part);
   hypre_ParVectorInitialize(x_coord);
   hypre_ParVectorOwnsData(x_coord) = 1;
   x_data = hypre_VectorData(hypre_ParVectorLocalVector(x_coord));

   y_coord = hypre_ParVectorCreate(comm, num_global_vert, vert_part);
   hypre_ParVectorInitialize(y_coord);
   hypre_ParVectorOwnsData(y_coord) = 1;
   y_data = hypre_VectorData(hypre_ParVectorLocalVector(y_coord));

   z_coord = hypre_ParVectorCreate(comm, num_global_vert, vert_part);
   hypre_ParVectorInitialize(z_coord);
   hypre_ParVectorOwnsData(z_coord) = 1;
   z_data = hypre_VectorData(hypre_ParVectorLocalVector(z_coord));

   vert_start = hypre_ParVectorFirstIndex(x_coord);
   vert_end   = hypre_ParVectorLastIndex(x_coord);

   /* Save coordinates of locally owned vertices (vert_coord stores
      interleaved x,y,z triples per vertex) */
   for (i = 0; i < num_vert; i++)
   {
      if (vert_number[i] >= vert_start && vert_number[i] <= vert_end)
      {
         j = (HYPRE_Int)(vert_number[i] - vert_start);
         x_data[j] = vert_coord[3*i];
         y_data[j] = vert_coord[3*i+1];
         z_data[j] = vert_coord[3*i+2];
      }
   }

   /* Change vertex numbers from local to global.
      NOTE(review): this mutates the caller's edge_vertex array in place. */
   for (i = 0; i < 2*num_edges; i++)
      edge_vertex[i] = vert_number[edge_vertex[i]];

   /* Construct the local part of G based on edge_vertex */
   {
      /* HYPRE_Int num_edges = hypre_ParCSRMatrixNumRows(A); */
      /* Two entries (+1/-1) per edge row, so I[i] = 2*i */
      HYPRE_Int *I = hypre_CTAlloc(HYPRE_Int,  num_edges+1, HYPRE_MEMORY_HOST);
      HYPRE_Real *data = hypre_CTAlloc(HYPRE_Real,  2*num_edges, HYPRE_MEMORY_HOST);
      hypre_CSRMatrix *local = hypre_CSRMatrixCreate (num_edges,
                                                      num_global_vert,
                                                      2*num_edges);

      for (i = 0; i <= num_edges; i++)
         I[i] = 2*i;

      /* Assume that the edge orientation is based on the vertex indexes */
      for (i = 0; i < 2*num_edges; i+=2)
      {
         data[i]   =  1.0;
         data[i+1] = -1.0;
      }

      /* NOTE(review): edge_vertex becomes the column array of `local`,
         which owns its data and is destroyed below — confirm BigJtoJ /
         GenerateDiagAndOffd leave edge_vertex in a valid state for the
         caller (see the commented-out J=NULL line below). */
      hypre_CSRMatrixI(local) = I;
      hypre_CSRMatrixBigJ(local) = edge_vertex;
      hypre_CSRMatrixData(local) = data;

      hypre_CSRMatrixRownnz(local) = NULL;
      hypre_CSRMatrixOwnsData(local) = 1;
      hypre_CSRMatrixNumRownnz(local) = num_edges;

      /* Rows partitioned like A (edges), columns by vert_part (vertices) */
      G = hypre_ParCSRMatrixCreate(comm,
                                   hypre_ParCSRMatrixGlobalNumRows(A),
                                   num_global_vert,
                                   hypre_ParCSRMatrixRowStarts(A),
                                   vert_part,
                                   0, 0, 0);
      hypre_CSRMatrixBigJtoJ(local);
      GenerateDiagAndOffd(local, G, vert_start, vert_end);

      //hypre_CSRMatrixJ(local) = NULL;
      hypre_CSRMatrixDestroy(local);
   }

   /* Store the constructed gradient and coordinate vectors in the solver;
      hypre_AMSFEIDestroy() is responsible for freeing them. */
   ams_data -> G = G;

   ams_data -> x = x_coord;
   ams_data -> y = y_coord;
   ams_data -> z = z_coord;

   return hypre_error_flag;
}
4088 
4089 /*--------------------------------------------------------------------------
4090  * hypre_AMSFEIDestroy
4091  *
4092  * Free the additional memory allocated in hypre_AMSFEISetup().
4093  *
4094  * This function is written specifically for input from the FEI and should
4095  * be called before hypre_AMSDestroy().
4096  *--------------------------------------------------------------------------*/
4097 
hypre_AMSFEIDestroy(void * solver)4098 HYPRE_Int hypre_AMSFEIDestroy(void *solver)
4099 {
4100    hypre_AMSData *ams_data = (hypre_AMSData *) solver;
4101 
4102    if (ams_data -> G)
4103       hypre_ParCSRMatrixDestroy(ams_data -> G);
4104 
4105    if (ams_data -> x)
4106       hypre_ParVectorDestroy(ams_data -> x);
4107    if (ams_data -> y)
4108       hypre_ParVectorDestroy(ams_data -> y);
4109    if (ams_data -> z)
4110       hypre_ParVectorDestroy(ams_data -> z);
4111 
4112    return hypre_error_flag;
4113 }
4114 
4115 /*--------------------------------------------------------------------------
4116  * hypre_ParCSRComputeL1Norms Threads
4117  *
4118  * Compute the l1 norms of the rows of a given matrix, depending on
4119  * the option parameter:
4120  *
4121  * option 1 = Compute the l1 norm of the rows
4122  * option 2 = Compute the l1 norm of the (processor) off-diagonal
4123  *            part of the rows plus the diagonal of A
4124  * option 3 = Compute the l2 norm^2 of the rows
4125  * option 4 = Truncated version of option 2 based on Remark 6.2 in "Multigrid
4126  *            Smoothers for Ultra-Parallel Computing"
4127  *
4128  * The above computations are done in a CF manner, whenever the provided
4129  * cf_marker is not NULL.
4130  *--------------------------------------------------------------------------*/
4131 
HYPRE_Int hypre_ParCSRComputeL1NormsThreads(hypre_ParCSRMatrix *A,
                                            HYPRE_Int           option,
                                            HYPRE_Int           num_threads,
                                            HYPRE_Int          *cf_marker,
                                            HYPRE_Real        **l1_norm_ptr)
{
   HYPRE_Int i, j, k;
   HYPRE_Int num_rows = hypre_ParCSRMatrixNumRows(A);

   hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
   HYPRE_Int *A_diag_I = hypre_CSRMatrixI(A_diag);
   HYPRE_Int *A_diag_J = hypre_CSRMatrixJ(A_diag);
   HYPRE_Real *A_diag_data = hypre_CSRMatrixData(A_diag);

   hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
   HYPRE_Int *A_offd_I = hypre_CSRMatrixI(A_offd);
   HYPRE_Int *A_offd_J = hypre_CSRMatrixJ(A_offd);
   HYPRE_Real *A_offd_data = hypre_CSRMatrixData(A_offd);
   HYPRE_Int num_cols_offd = hypre_CSRMatrixNumCols(A_offd);

   HYPRE_Real diag;
   /* NOTE(review): allocated at A's memory location but filled by host
      loops below — assumes A lives in host-accessible memory here. */
   HYPRE_Real *l1_norm = hypre_TAlloc(HYPRE_Real, num_rows, hypre_ParCSRMatrixMemoryLocation(A));
   HYPRE_Int ii, ns, ne, rest, size;

   HYPRE_Int *cf_marker_offd = NULL;
   HYPRE_Int cf_diag;

   /* collect the cf marker data from other procs
      (assumes A's communication package has been created) */
   if (cf_marker != NULL)
   {
      HYPRE_Int index;
      HYPRE_Int num_sends;
      HYPRE_Int start;
      HYPRE_Int *int_buf_data = NULL;

      hypre_ParCSRCommPkg  *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
      hypre_ParCSRCommHandle *comm_handle;

      if (num_cols_offd)
         cf_marker_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_offd, HYPRE_MEMORY_HOST);
      num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
      if (hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends))
         int_buf_data = hypre_CTAlloc(HYPRE_Int,
                                      hypre_ParCSRCommPkgSendMapStart(comm_pkg,  num_sends), HYPRE_MEMORY_HOST);
      index = 0;
      for (i = 0; i < num_sends; i++)
      {
         start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
         for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
         {
            int_buf_data[index++] = cf_marker[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
         }
      }
      comm_handle = hypre_ParCSRCommHandleCreate(11, comm_pkg, int_buf_data,
                                                 cf_marker_offd);
      hypre_ParCSRCommHandleDestroy(comm_handle);
      hypre_TFree(int_buf_data, HYPRE_MEMORY_HOST);
   }

   /* Each thread k handles the contiguous row block [ns, ne); the first
      `rest` threads get one extra row to cover num_rows exactly. */
#ifdef HYPRE_USING_OPENMP
#pragma omp parallel for private(i,ii,j,k,ns,ne,rest,size,diag,cf_diag) HYPRE_SMP_SCHEDULE
#endif
   for (k = 0; k < num_threads; k++)
   {
      size = num_rows/num_threads;
      rest = num_rows - size*num_threads;
      if (k < rest)
      {
         ns = k*size+k;
         ne = (k+1)*size+k+1;
      }
      else
      {
         ns = k*size+rest;
         ne = (k+1)*size+rest;
      }

      if (option == 1)
      {
         for (i = ns; i < ne; i++)
         {
            l1_norm[i] = 0.0;
            if (cf_marker == NULL)
            {
               /* Add the l1 norm of the diag part of the ith row */
               for (j = A_diag_I[i]; j < A_diag_I[i+1]; j++)
                  l1_norm[i] += fabs(A_diag_data[j]);
               /* Add the l1 norm of the offd part of the ith row */
               if (num_cols_offd)
               {
                  for (j = A_offd_I[i]; j < A_offd_I[i+1]; j++)
                     l1_norm[i] += fabs(A_offd_data[j]);
               }
            }
            else
            {
               cf_diag = cf_marker[i];
               /* Add the CF l1 norm of the diag part of the ith row */
               for (j = A_diag_I[i]; j < A_diag_I[i+1]; j++)
                  if (cf_diag == cf_marker[A_diag_J[j]])
                     l1_norm[i] += fabs(A_diag_data[j]);
               /* Add the CF l1 norm of the offd part of the ith row */
               if (num_cols_offd)
               {
                  for (j = A_offd_I[i]; j < A_offd_I[i+1]; j++)
                     if (cf_diag == cf_marker_offd[A_offd_J[j]])
                        l1_norm[i] += fabs(A_offd_data[j]);
               }
            }
         }
      }
      else if (option == 2)
      {
         for (i = ns; i < ne; i++)
         {
            l1_norm[i] = 0.0;
            if (cf_marker == NULL)
            {
               /* Add the diagonal and the local off-thread part of the ith row */
               for (j = A_diag_I[i]; j < A_diag_I[i+1]; j++)
               {
                  ii = A_diag_J[j];
                  if (ii == i || ii < ns || ii >= ne)
                     l1_norm[i] += fabs(A_diag_data[j]);
               }
               /* Add the l1 norm of the offd part of the ith row */
               if (num_cols_offd)
               {
                  for (j = A_offd_I[i]; j < A_offd_I[i+1]; j++)
                     l1_norm[i] += fabs(A_offd_data[j]);
               }
            }
            else
            {
               cf_diag = cf_marker[i];
               /* Add the diagonal and the local off-thread part of the ith row */
               for (j = A_diag_I[i]; j < A_diag_I[i+1]; j++)
               {
                  ii = A_diag_J[j];
                  if ((ii == i || ii < ns || ii >= ne) &&
                      (cf_diag == cf_marker[A_diag_J[j]]))
                     l1_norm[i] += fabs(A_diag_data[j]);
               }
               /* Add the CF l1 norm of the offd part of the ith row */
               if (num_cols_offd)
               {
                  for (j = A_offd_I[i]; j < A_offd_I[i+1]; j++)
                     if (cf_diag == cf_marker_offd[A_offd_J[j]])
                        l1_norm[i] += fabs(A_offd_data[j]);
               }
            }
         }
      }
      else if (option == 3)
      {
         /* l2 norm squared of the ith row */
         for (i = ns; i < ne; i++)
         {
            l1_norm[i] = 0.0;
            for (j = A_diag_I[i]; j < A_diag_I[i+1]; j++)
               l1_norm[i] += A_diag_data[j] * A_diag_data[j];
            if (num_cols_offd)
               for (j = A_offd_I[i]; j < A_offd_I[i+1]; j++)
                  l1_norm[i] += A_offd_data[j] * A_offd_data[j];
         }
      }
      else if (option == 4)
      {
         for (i = ns; i < ne; i++)
         {
            l1_norm[i] = 0.0;
            /* Initialize per row: previously `diag` was read below without
               being set when a row had no (CF-matching) diagonal entry,
               i.e. an uninitialized read. */
            diag = 0.0;
            if (cf_marker == NULL)
            {
               /* Add the diagonal and the local off-thread part of the ith row */
               for (j = A_diag_I[i]; j < A_diag_I[i+1]; j++)
               {
                  ii = A_diag_J[j];
                  if (ii == i || ii < ns || ii >= ne)
                  {
                     if (ii == i)
                     {
                        diag = fabs(A_diag_data[j]);
                        l1_norm[i] += fabs(A_diag_data[j]);
                     }
                     else
                        l1_norm[i] += 0.5*fabs(A_diag_data[j]);
                  }
               }
               /* Add the l1 norm of the offd part of the ith row */
               if (num_cols_offd)
               {
                  for (j = A_offd_I[i]; j < A_offd_I[i+1]; j++)
                     l1_norm[i] += 0.5*fabs(A_offd_data[j]);
               }
            }
            else
            {
               cf_diag = cf_marker[i];
               /* Add the diagonal and the local off-thread part of the ith row */
               for (j = A_diag_I[i]; j < A_diag_I[i+1]; j++)
               {
                  ii = A_diag_J[j];
                  if ((ii == i || ii < ns || ii >= ne) &&
                      (cf_diag == cf_marker[A_diag_J[j]]))
                  {
                     if (ii == i)
                     {
                        diag = fabs(A_diag_data[j]);
                        l1_norm[i] += fabs(A_diag_data[j]);
                     }
                     else
                        l1_norm[i] += 0.5*fabs(A_diag_data[j]);
                  }
               }
               /* Add the CF l1 norm of the offd part of the ith row */
               if (num_cols_offd)
               {
                  for (j = A_offd_I[i]; j < A_offd_I[i+1]; j++)
                     if (cf_diag == cf_marker_offd[A_offd_J[j]])
                        l1_norm[i] += 0.5*fabs(A_offd_data[j]);
               }
            }

            /* Truncate according to Remark 6.2 (diag == 0.0 means no
               diagonal entry was found, in which case no truncation) */
            if (l1_norm[i] <= 4.0/3.0*diag)
               l1_norm[i] = diag;
         }
      }

      else if (option == 5) /*stores diagonal of A for Jacobi using matvec, rlx 7 */
      {
         /* Set the diag element (assumes the hypre convention that the
            diagonal entry is stored first in each diag row) */
         for (i = ns; i < ne; i++)
         {
            l1_norm[i] =  A_diag_data[A_diag_I[i]];
            if (l1_norm[i] == 0) l1_norm[i] = 1.0;
         }
      }

      if (option < 5)
      {
         /* Handle negative definite matrices */
         for (i = ns; i < ne; i++)
            if (A_diag_data[A_diag_I[i]] < 0)
               l1_norm[i] = -l1_norm[i];

         /* A zero l1 norm would make the smoother divide by zero */
         for (i = ns; i < ne; i++)
         /* if (fabs(l1_norm[i]) < DBL_EPSILON) */
            if (fabs(l1_norm[i]) == 0.0)
            {
               hypre_error_in_arg(1);
               break;
            }
      }

   }

   hypre_TFree(cf_marker_offd, HYPRE_MEMORY_HOST);

   /* Ownership of l1_norm transfers to the caller */
   *l1_norm_ptr = l1_norm;

   return hypre_error_flag;
}
4394