1 /******************************************************************************
2 * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other
3 * HYPRE Project Developers. See the top-level COPYRIGHT file for details.
4 *
5 * SPDX-License-Identifier: (Apache-2.0 OR MIT)
6 ******************************************************************************/
7
8 #include "_hypre_parcsr_ls.h"
9 #include "float.h"
10 #include "ams.h"
11 #include "_hypre_utilities.hpp"
12
13 /*--------------------------------------------------------------------------
14 * hypre_ParCSRRelax
15 *
16 * Relaxation on the ParCSR matrix A with right-hand side f and
17 * initial guess u. Possible values for relax_type are:
18 *
19 * 1 = l1-scaled (or weighted) Jacobi
20 * 2 = l1-scaled block Gauss-Seidel/SSOR
21 * 3 = Kaczmarz
22 * 4 = truncated version of 2 (Remark 6.2 in smoothers paper)
23 * x = BoomerAMG relaxation with relax_type = |x|
24 * (16 = Cheby)
25 *
26 * The default value of relax_type is 2.
27 *--------------------------------------------------------------------------*/
28
HYPRE_Int
hypre_ParCSRRelax( hypre_ParCSRMatrix *A, /* matrix to relax with */
                   hypre_ParVector *f, /* right-hand side */
                   HYPRE_Int relax_type, /* relaxation type */
                   HYPRE_Int relax_times, /* number of sweeps */
                   HYPRE_Real *l1_norms, /* l1 norms of the rows of A */
                   HYPRE_Real relax_weight, /* damping coefficient (usually <= 1) */
                   HYPRE_Real omega, /* SOR parameter (usually in (0,2)) */
                   HYPRE_Real max_eig_est, /* for cheby smoothers */
                   HYPRE_Real min_eig_est,
                   HYPRE_Int cheby_order,
                   HYPRE_Real cheby_fraction,
                   hypre_ParVector *u, /* initial/updated approximation */
                   hypre_ParVector *v, /* temporary vector */
                   hypre_ParVector *z /* temporary vector */ )
{
   HYPRE_Int sweep;

   for (sweep = 0; sweep < relax_times; sweep++)
   {
      if (relax_type == 1) /* l1-scaled Jacobi */
      {
         /* BoomerAMG relax type 7 is Jacobi scaled by the supplied l1_norms */
         hypre_BoomerAMGRelax(A, f, NULL, 7, 0, relax_weight, 1.0, l1_norms, u, v, z);
      }
      else if (relax_type == 2 || relax_type == 4) /* offd-l1-scaled block GS */
      {
         /* !!! Note: relax_weight and omega flipped !!! */
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
         HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) );
         if (exec == HYPRE_EXEC_DEVICE)
         {
            hypre_BoomerAMGRelaxHybridGaussSeidelDevice(A, f, NULL, 0, omega, relax_weight, l1_norms, u, v, z,
                                                        1, 1 /* symm */);
         }
         else
#endif
         {
            /* symmetric hybrid GS on the host (forward + backward sweep) */
            hypre_BoomerAMGRelaxHybridGaussSeidel_core(A, f, NULL, 0, omega, relax_weight, l1_norms, u, v, z,
                                                       1, 1 /* symm */, 0 /* skip diag */, 1, 0);
         }
      }
      else if (relax_type == 3) /* Kaczmarz */
      {
         hypre_BoomerAMGRelax(A, f, NULL, 20, 0, relax_weight, omega, l1_norms, u, v, z);
      }
      else /* call BoomerAMG relaxation */
      {
         if (relax_type == 16)
         {
            /* Chebyshev smoothing based on the supplied eigenvalue estimates */
            hypre_ParCSRRelax_Cheby(A, f, max_eig_est, min_eig_est, cheby_fraction, cheby_order, 1,
                                    0, u, v, z);
         }
         else
         {
            /* negative values select the same BoomerAMG relaxation as |relax_type| */
            hypre_BoomerAMGRelax(A, f, NULL, hypre_abs(relax_type), 0, relax_weight,
                                 omega, l1_norms, u, v, z);
         }
      }
   }

   return hypre_error_flag;
}
91
92 /*--------------------------------------------------------------------------
93 * hypre_ParVectorInRangeOf
94 *
95 * Return a vector that belongs to the range of a given matrix.
96 *--------------------------------------------------------------------------*/
97
hypre_ParVectorInRangeOf(hypre_ParCSRMatrix * A)98 hypre_ParVector *hypre_ParVectorInRangeOf(hypre_ParCSRMatrix *A)
99 {
100 hypre_ParVector *x;
101
102 x = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A),
103 hypre_ParCSRMatrixGlobalNumRows(A),
104 hypre_ParCSRMatrixRowStarts(A));
105 hypre_ParVectorInitialize(x);
106 hypre_ParVectorOwnsData(x) = 1;
107
108 return x;
109 }
110
111 /*--------------------------------------------------------------------------
112 * hypre_ParVectorInDomainOf
113 *
114 * Return a vector that belongs to the domain of a given matrix.
115 *--------------------------------------------------------------------------*/
116
hypre_ParVectorInDomainOf(hypre_ParCSRMatrix * A)117 hypre_ParVector *hypre_ParVectorInDomainOf(hypre_ParCSRMatrix *A)
118 {
119 hypre_ParVector *x;
120
121 x = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A),
122 hypre_ParCSRMatrixGlobalNumCols(A),
123 hypre_ParCSRMatrixColStarts(A));
124 hypre_ParVectorInitialize(x);
125 hypre_ParVectorOwnsData(x) = 1;
126
127 return x;
128 }
129
130 /*--------------------------------------------------------------------------
131 * hypre_ParVectorBlockSplit
132 *
133 * Extract the dim sub-vectors x_0,...,x_{dim-1} composing a parallel
134 * block vector x. It is assumed that &x[i] = [x_0[i],...,x_{dim-1}[i]].
135 *--------------------------------------------------------------------------*/
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
/* Device kernel shared by BlockSplit (dir == 0) and BlockGather (dir == 1).
 * Each thread handles one (sub-vector, entry) pair: for global index i,
 * d = i % dim selects the sub-vector and k = i / dim the entry, so the
 * interleaved block vector satisfies x[dim*k + d] == x_d[k].
 * Only the first dim of x0..x2 are dereferenced (dim <= 3). */
template<HYPRE_Int dir>
__global__ void
hypreCUDAKernel_ParVectorBlockSplitGather(HYPRE_Int size,
                                          HYPRE_Int dim,
                                          HYPRE_Real *x0,
                                          HYPRE_Real *x1,
                                          HYPRE_Real *x2,
                                          HYPRE_Real *x)
{
   const HYPRE_Int i = hypre_cuda_get_grid_thread_id<1,1>();

   /* one thread per entry of the interleaved vector */
   if (i >= size * dim)
   {
      return;
   }

   HYPRE_Real *xx[3];

   xx[0] = x0;
   xx[1] = x1;
   xx[2] = x2;

   const HYPRE_Int d = i % dim;
   const HYPRE_Int k = i / dim;

   if (dir == 0)
   {
      /* split: scatter the interleaved vector into its sub-vectors */
      xx[d][k] = x[i];
   }
   else if (dir == 1)
   {
      /* gather: rebuild the interleaved vector from its sub-vectors */
      x[i] = xx[d][k];
   }
}
#endif
172
hypre_ParVectorBlockSplit(hypre_ParVector * x,hypre_ParVector * x_[3],HYPRE_Int dim)173 HYPRE_Int hypre_ParVectorBlockSplit(hypre_ParVector *x,
174 hypre_ParVector *x_[3],
175 HYPRE_Int dim)
176 {
177 HYPRE_Int i, d, size_;
178 HYPRE_Real *x_data, *x_data_[3];
179
180 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
181 HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParVectorMemoryLocation(x) );
182 #endif
183
184 size_ = hypre_VectorSize(hypre_ParVectorLocalVector(x_[0]));
185
186 x_data = hypre_VectorData(hypre_ParVectorLocalVector(x));
187 for (d = 0; d < dim; d++)
188 x_data_[d] = hypre_VectorData(hypre_ParVectorLocalVector(x_[d]));
189
190 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
191 if (exec == HYPRE_EXEC_DEVICE)
192 {
193 dim3 bDim = hypre_GetDefaultCUDABlockDimension();
194 dim3 gDim = hypre_GetDefaultCUDAGridDimension(size_ * dim, "thread", bDim);
195 HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<0>, gDim, bDim,
196 size_, dim, x_data_[0], x_data_[1], x_data_[2], x_data);
197 }
198 else
199 #endif
200 {
201 for (i = 0; i < size_; i++)
202 for (d = 0; d < dim; d++)
203 x_data_[d][i] = x_data[dim*i+d];
204 }
205
206 return hypre_error_flag;
207 }
208
209 /*--------------------------------------------------------------------------
210 * hypre_ParVectorBlockGather
211 *
212 * Compose a parallel block vector x from dim given sub-vectors
213 * x_0,...,x_{dim-1}, such that &x[i] = [x_0[i],...,x_{dim-1}[i]].
214 *--------------------------------------------------------------------------*/
215
hypre_ParVectorBlockGather(hypre_ParVector * x,hypre_ParVector * x_[3],HYPRE_Int dim)216 HYPRE_Int hypre_ParVectorBlockGather(hypre_ParVector *x,
217 hypre_ParVector *x_[3],
218 HYPRE_Int dim)
219 {
220 HYPRE_Int i, d, size_;
221 HYPRE_Real *x_data, *x_data_[3];
222
223 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
224 HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParVectorMemoryLocation(x) );
225 #endif
226
227 size_ = hypre_VectorSize(hypre_ParVectorLocalVector(x_[0]));
228
229 x_data = hypre_VectorData(hypre_ParVectorLocalVector(x));
230 for (d = 0; d < dim; d++)
231 x_data_[d] = hypre_VectorData(hypre_ParVectorLocalVector(x_[d]));
232
233 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
234 if (exec == HYPRE_EXEC_DEVICE)
235 {
236 dim3 bDim = hypre_GetDefaultCUDABlockDimension();
237 dim3 gDim = hypre_GetDefaultCUDAGridDimension(size_ * dim, "thread", bDim);
238 HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<1>, gDim, bDim,
239 size_, dim, x_data_[0], x_data_[1], x_data_[2], x_data);
240 }
241 else
242 #endif
243 {
244 for (i = 0; i < size_; i++)
245 for (d = 0; d < dim; d++)
246 x_data[dim*i+d] = x_data_[d][i];
247 }
248
249 return hypre_error_flag;
250 }
251
252 /*--------------------------------------------------------------------------
253 * hypre_BoomerAMGBlockSolve
254 *
255 * Apply the block-diagonal solver diag(B) to the system diag(A) x = b.
256 * Here B is a given BoomerAMG solver for A, while x and b are "block"
257 * parallel vectors.
258 *--------------------------------------------------------------------------*/
259
hypre_BoomerAMGBlockSolve(void * B,hypre_ParCSRMatrix * A,hypre_ParVector * b,hypre_ParVector * x)260 HYPRE_Int hypre_BoomerAMGBlockSolve(void *B,
261 hypre_ParCSRMatrix *A,
262 hypre_ParVector *b,
263 hypre_ParVector *x)
264 {
265 HYPRE_Int d, dim = 1;
266
267 hypre_ParVector *b_[3];
268 hypre_ParVector *x_[3];
269
270 dim = hypre_ParVectorGlobalSize(x) / hypre_ParCSRMatrixGlobalNumRows(A);
271
272 if (dim == 1)
273 {
274 hypre_BoomerAMGSolve(B, A, b, x);
275 return hypre_error_flag;
276 }
277
278 for (d = 0; d < dim; d++)
279 {
280 b_[d] = hypre_ParVectorInRangeOf(A);
281 x_[d] = hypre_ParVectorInRangeOf(A);
282 }
283
284 hypre_ParVectorBlockSplit(b, b_, dim);
285 hypre_ParVectorBlockSplit(x, x_, dim);
286
287 for (d = 0; d < dim; d++)
288 hypre_BoomerAMGSolve(B, A, b_[d], x_[d]);
289
290 hypre_ParVectorBlockGather(x, x_, dim);
291
292 for (d = 0; d < dim; d++)
293 {
294 hypre_ParVectorDestroy(b_[d]);
295 hypre_ParVectorDestroy(x_[d]);
296 }
297
298 return hypre_error_flag;
299 }
300
301 /*--------------------------------------------------------------------------
302 * hypre_ParCSRMatrixFixZeroRows
303 *
304 * For every zero row in the matrix: set the diagonal element to 1.
305 *--------------------------------------------------------------------------*/
306
hypre_ParCSRMatrixFixZeroRowsHost(hypre_ParCSRMatrix * A)307 HYPRE_Int hypre_ParCSRMatrixFixZeroRowsHost(hypre_ParCSRMatrix *A)
308 {
309 HYPRE_Int i, j;
310 HYPRE_Real l1_norm;
311 HYPRE_Int num_rows = hypre_ParCSRMatrixNumRows(A);
312
313 hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
314 HYPRE_Int *A_diag_I = hypre_CSRMatrixI(A_diag);
315 HYPRE_Int *A_diag_J = hypre_CSRMatrixJ(A_diag);
316 HYPRE_Real *A_diag_data = hypre_CSRMatrixData(A_diag);
317
318 hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
319 HYPRE_Int *A_offd_I = hypre_CSRMatrixI(A_offd);
320 HYPRE_Real *A_offd_data = hypre_CSRMatrixData(A_offd);
321 HYPRE_Int num_cols_offd = hypre_CSRMatrixNumCols(A_offd);
322
323 /* a row will be considered zero if its l1 norm is less than eps */
324 HYPRE_Real eps = 0.0; /* DBL_EPSILON * 1e+4; */
325
326 for (i = 0; i < num_rows; i++)
327 {
328 l1_norm = 0.0;
329 for (j = A_diag_I[i]; j < A_diag_I[i+1]; j++)
330 l1_norm += fabs(A_diag_data[j]);
331 if (num_cols_offd)
332 for (j = A_offd_I[i]; j < A_offd_I[i+1]; j++)
333 l1_norm += fabs(A_offd_data[j]);
334
335 if (l1_norm <= eps)
336 {
337 for (j = A_diag_I[i]; j < A_diag_I[i+1]; j++)
338 if (A_diag_J[j] == i)
339 A_diag_data[j] = 1.0;
340 else
341 A_diag_data[j] = 0.0;
342 if (num_cols_offd)
343 for (j = A_offd_I[i]; j < A_offd_I[i+1]; j++)
344 A_offd_data[j] = 0.0;
345 }
346 }
347
348 return hypre_error_flag;
349 }
350
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
/* Device kernel: one warp per row. Computes the row's l1 norm with a
 * warp-wide reduction and, if the row is (numerically) zero, rewrites it
 * as the corresponding row of the identity. */
__global__ void
hypreCUDAKernel_ParCSRMatrixFixZeroRows( HYPRE_Int nrows,
                                         HYPRE_Int *A_diag_i,
                                         HYPRE_Int *A_diag_j,
                                         HYPRE_Complex *A_diag_data,
                                         HYPRE_Int *A_offd_i,
                                         HYPRE_Complex *A_offd_data,
                                         HYPRE_Int num_cols_offd)
{
   HYPRE_Int row_i = hypre_cuda_get_grid_warp_id<1,1>();

   if (row_i >= nrows)
   {
      return;
   }

   HYPRE_Int lane = hypre_cuda_get_lane_id<1>();
   /* a row is considered zero when its l1 norm is <= eps */
   HYPRE_Real eps = 0.0; /* DBL_EPSILON * 1e+4; */
   HYPRE_Real l1_norm = 0.0;
   HYPRE_Int p1, q1, p2 = 0, q2 = 0;

   /* lanes 0 and 1 load the row begin/end pointers ... */
   if (lane < 2)
   {
      p1 = read_only_load(A_diag_i + row_i + lane);
      if (num_cols_offd)
      {
         p2 = read_only_load(A_offd_i + row_i + lane);
      }
   }

   /* ... and broadcast them to the whole warp */
   q1 = __shfl_sync(HYPRE_WARP_FULL_MASK, p1, 1);
   p1 = __shfl_sync(HYPRE_WARP_FULL_MASK, p1, 0);
   if (num_cols_offd)
   {
      q2 = __shfl_sync(HYPRE_WARP_FULL_MASK, p2, 1);
      p2 = __shfl_sync(HYPRE_WARP_FULL_MASK, p2, 0);
   }

   /* each lane accumulates a partial l1 norm over strided entries */
   for (HYPRE_Int j = p1 + lane; j < q1; j += HYPRE_WARP_SIZE)
   {
      l1_norm += fabs(A_diag_data[j]);
   }

   for (HYPRE_Int j = p2 + lane; j < q2; j += HYPRE_WARP_SIZE)
   {
      l1_norm += fabs(A_offd_data[j]);
   }

   /* combine the partial sums across the warp */
   l1_norm = warp_allreduce_sum(l1_norm);

   if (l1_norm <= eps)
   {
      /* zero row: set the diagonal entry to 1 and everything else to 0 */
      for (HYPRE_Int j = p1 + lane; j < q1; j += HYPRE_WARP_SIZE)
      {
         if (row_i == read_only_load(&A_diag_j[j]))
         {
            A_diag_data[j] = 1.0;
         }
         else
         {
            A_diag_data[j] = 0.0;
         }
      }

      for (HYPRE_Int j = p2 + lane; j < q2; j += HYPRE_WARP_SIZE)
      {
         A_offd_data[j] = 0.0;
      }
   }
}
422
hypre_ParCSRMatrixFixZeroRowsDevice(hypre_ParCSRMatrix * A)423 HYPRE_Int hypre_ParCSRMatrixFixZeroRowsDevice(hypre_ParCSRMatrix *A)
424 {
425 HYPRE_Int nrows = hypre_ParCSRMatrixNumRows(A);
426 hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
427 HYPRE_Real *A_diag_data = hypre_CSRMatrixData(A_diag);
428 HYPRE_Int *A_diag_i = hypre_CSRMatrixI(A_diag);
429 HYPRE_Int *A_diag_j = hypre_CSRMatrixJ(A_diag);
430 hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
431 HYPRE_Real *A_offd_data = hypre_CSRMatrixData(A_offd);
432 HYPRE_Int *A_offd_i = hypre_CSRMatrixI(A_offd);
433 HYPRE_Int num_cols_offd = hypre_CSRMatrixNumCols(A_offd);
434 dim3 bDim, gDim;
435
436 bDim = hypre_GetDefaultCUDABlockDimension();
437 gDim = hypre_GetDefaultCUDAGridDimension(nrows, "warp", bDim);
438
439 HYPRE_CUDA_LAUNCH(hypreCUDAKernel_ParCSRMatrixFixZeroRows, gDim, bDim,
440 nrows, A_diag_i, A_diag_j, A_diag_data, A_offd_i, A_offd_data, num_cols_offd);
441
442 //hypre_SyncCudaComputeStream(hypre_handle());
443
444 return hypre_error_flag;
445 }
446 #endif
447
hypre_ParCSRMatrixFixZeroRows(hypre_ParCSRMatrix * A)448 HYPRE_Int hypre_ParCSRMatrixFixZeroRows(hypre_ParCSRMatrix *A)
449 {
450 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
451 HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) );
452 if (exec == HYPRE_EXEC_DEVICE)
453 {
454 return hypre_ParCSRMatrixFixZeroRowsDevice(A);
455 }
456 else
457 #endif
458 {
459 return hypre_ParCSRMatrixFixZeroRowsHost(A);
460 }
461 }
462
463 /*--------------------------------------------------------------------------
464 * hypre_ParCSRComputeL1Norms
465 *
466 * Compute the l1 norms of the rows of a given matrix, depending on
467 * the option parameter:
468 *
469 * option 1 = Compute the l1 norm of the rows
470 * option 2 = Compute the l1 norm of the (processor) off-diagonal
471 * part of the rows plus the diagonal of A
472 * option 3 = Compute the l2 norm^2 of the rows
473 * option 4 = Truncated version of option 2 based on Remark 6.2 in "Multigrid
474 * Smoothers for Ultra-Parallel Computing"
475 *
476 * The above computations are done in a CF manner, whenever the provided
477 * cf_marker is not NULL.
478 *--------------------------------------------------------------------------*/
479
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
/* Truncation rule used by option 4 (Remark 6.2 in the smoothers paper):
 * given the accumulated norm x and the |diagonal| y, fall back to y
 * whenever x <= (4/3) * y. */
struct l1_norm_op1 : public thrust::binary_function<HYPRE_Complex, HYPRE_Complex, HYPRE_Complex>
{
   __host__ __device__
   HYPRE_Complex operator()(HYPRE_Complex &x, HYPRE_Complex &y) const
   {
      return x <= 4.0/3.0 * y ? y : x;
   }
};
#endif
490
hypre_ParCSRComputeL1Norms(hypre_ParCSRMatrix * A,HYPRE_Int option,HYPRE_Int * cf_marker,HYPRE_Real ** l1_norm_ptr)491 HYPRE_Int hypre_ParCSRComputeL1Norms(hypre_ParCSRMatrix *A,
492 HYPRE_Int option,
493 HYPRE_Int *cf_marker,
494 HYPRE_Real **l1_norm_ptr)
495 {
496 HYPRE_Int i;
497 HYPRE_Int num_rows = hypre_ParCSRMatrixNumRows(A);
498 hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
499 hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
500 HYPRE_Int num_cols_offd = hypre_CSRMatrixNumCols(A_offd);
501
502 HYPRE_MemoryLocation memory_location_l1 = hypre_ParCSRMatrixMemoryLocation(A);
503
504 HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( memory_location_l1 );
505
506 if (exec == HYPRE_EXEC_HOST)
507 {
508 HYPRE_Int num_threads = hypre_NumThreads();
509 if (num_threads > 1)
510 {
511 return hypre_ParCSRComputeL1NormsThreads(A, option, num_threads, cf_marker, l1_norm_ptr);
512 }
513 }
514
515 HYPRE_Real *l1_norm = hypre_TAlloc(HYPRE_Real, num_rows, memory_location_l1);
516
517 HYPRE_MemoryLocation memory_location_tmp = exec == HYPRE_EXEC_HOST ? HYPRE_MEMORY_HOST : HYPRE_MEMORY_DEVICE;
518 HYPRE_Real *diag_tmp = NULL;
519
520 HYPRE_Int *cf_marker_offd = NULL;
521
522 /* collect the cf marker data from other procs */
523 if (cf_marker != NULL)
524 {
525 HYPRE_Int num_sends;
526 HYPRE_Int *int_buf_data = NULL;
527
528 hypre_ParCSRCommPkg *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
529 hypre_ParCSRCommHandle *comm_handle;
530
531 if (num_cols_offd)
532 {
533 cf_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_offd, memory_location_tmp);
534 }
535 num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
536 if (hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends))
537 {
538 int_buf_data = hypre_CTAlloc(HYPRE_Int, hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
539 memory_location_tmp);
540 }
541 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
542 if (exec == HYPRE_EXEC_DEVICE)
543 {
544 hypre_ParCSRCommPkgCopySendMapElmtsToDevice(comm_pkg);
545 HYPRE_THRUST_CALL( gather,
546 hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg),
547 hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg) + hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
548 cf_marker,
549 int_buf_data );
550 }
551 else
552 #endif
553 {
554 HYPRE_Int index = 0;
555 HYPRE_Int start;
556 HYPRE_Int j;
557 for (i = 0; i < num_sends; i++)
558 {
559 start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
560 for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
561 {
562 int_buf_data[index++] = cf_marker[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
563 }
564 }
565 }
566
567 comm_handle = hypre_ParCSRCommHandleCreate_v2(11, comm_pkg, memory_location_tmp, int_buf_data,
568 memory_location_tmp, cf_marker_offd);
569 hypre_ParCSRCommHandleDestroy(comm_handle);
570 hypre_TFree(int_buf_data, memory_location_tmp);
571 }
572
573 if (option == 1)
574 {
575 /* Set the l1 norm of the diag part */
576 hypre_CSRMatrixComputeRowSum(A_diag, cf_marker, cf_marker, l1_norm, 1, 1.0, "set");
577
578 /* Add the l1 norm of the offd part */
579 if (num_cols_offd)
580 {
581 hypre_CSRMatrixComputeRowSum(A_offd, cf_marker, cf_marker_offd, l1_norm, 1, 1.0, "add");
582 }
583 }
584 else if (option == 2)
585 {
586 /* Set the abs(diag) element */
587 hypre_CSRMatrixExtractDiagonal(A_diag, l1_norm, 1);
588 /* Add the l1 norm of the offd part */
589 if (num_cols_offd)
590 {
591 hypre_CSRMatrixComputeRowSum(A_offd, cf_marker, cf_marker, l1_norm, 1, 1.0, "add");
592 }
593 }
594 else if (option == 3)
595 {
596 /* Set the CF l2 norm of the diag part */
597 hypre_CSRMatrixComputeRowSum(A_diag, NULL, NULL, l1_norm, 2, 1.0, "set");
598 /* Add the CF l2 norm of the offd part */
599 if (num_cols_offd)
600 {
601 hypre_CSRMatrixComputeRowSum(A_offd, NULL, NULL, l1_norm, 2, 1.0, "add");
602 }
603 }
604 else if (option == 4)
605 {
606 /* Set the abs(diag) element */
607 hypre_CSRMatrixExtractDiagonal(A_diag, l1_norm, 1);
608
609 diag_tmp = hypre_TAlloc(HYPRE_Real, num_rows, memory_location_tmp);
610 hypre_TMemcpy(diag_tmp, l1_norm, HYPRE_Real, num_rows, memory_location_tmp, memory_location_l1);
611
612 /* Add the scaled l1 norm of the offd part */
613 if (num_cols_offd)
614 {
615 hypre_CSRMatrixComputeRowSum(A_offd, cf_marker, cf_marker_offd, l1_norm, 1, 0.5, "add");
616 }
617
618 /* Truncate according to Remark 6.2 */
619 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
620 if (exec == HYPRE_EXEC_DEVICE)
621 {
622 HYPRE_THRUST_CALL( transform, l1_norm, l1_norm + num_rows, diag_tmp, l1_norm, l1_norm_op1() );
623 }
624 else
625 #endif
626 {
627 for (i = 0; i < num_rows; i++)
628 {
629 if (l1_norm[i] <= 4.0/3.0 * diag_tmp[i])
630 {
631 l1_norm[i] = diag_tmp[i];
632 }
633 }
634 }
635 }
636 else if (option == 5) /*stores diagonal of A for Jacobi using matvec, rlx 7 */
637 {
638 /* Set the diag element */
639 hypre_CSRMatrixExtractDiagonal(A_diag, l1_norm, 0);
640
641 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
642 if ( exec == HYPRE_EXEC_DEVICE)
643 {
644 thrust::identity<HYPRE_Complex> identity;
645 HYPRE_THRUST_CALL( replace_if, l1_norm, l1_norm + num_rows, thrust::not1(identity), 1.0 );
646 }
647 else
648 #endif
649 {
650 for (i = 0; i < num_rows; i++)
651 {
652 if (l1_norm[i] == 0.0)
653 {
654 l1_norm[i] = 1.0;
655 }
656 }
657 }
658
659 *l1_norm_ptr = l1_norm;
660
661 return hypre_error_flag;
662 }
663
664 /* Handle negative definite matrices */
665 if (!diag_tmp)
666 {
667 diag_tmp = hypre_TAlloc(HYPRE_Real, num_rows, memory_location_tmp);
668 }
669
670 /* Set the diag element */
671 hypre_CSRMatrixExtractDiagonal(A_diag, diag_tmp, 0);
672
673 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
674 if (exec == HYPRE_EXEC_DEVICE)
675 {
676 HYPRE_THRUST_CALL( transform_if, l1_norm, l1_norm + num_rows, diag_tmp, l1_norm, thrust::negate<HYPRE_Real>(),
677 is_negative<HYPRE_Real>() );
678 //bool any_zero = HYPRE_THRUST_CALL( any_of, l1_norm, l1_norm + num_rows, thrust::not1(thrust::identity<HYPRE_Complex>()) );
679 bool any_zero = 0.0 == HYPRE_THRUST_CALL( reduce, l1_norm, l1_norm + num_rows, 1.0, thrust::minimum<HYPRE_Real>() );
680 if ( any_zero )
681 {
682 hypre_error_in_arg(1);
683 }
684 }
685 else
686 #endif
687 {
688 for (i = 0; i < num_rows; i++)
689 {
690 if (diag_tmp[i] < 0.0)
691 {
692 l1_norm[i] = -l1_norm[i];
693 }
694 }
695
696 for (i = 0; i < num_rows; i++)
697 {
698 /* if (fabs(l1_norm[i]) < DBL_EPSILON) */
699 if (fabs(l1_norm[i]) == 0.0)
700 {
701 hypre_error_in_arg(1);
702 break;
703 }
704 }
705 }
706
707 hypre_TFree(cf_marker_offd, memory_location_tmp);
708 hypre_TFree(diag_tmp, memory_location_tmp);
709
710 *l1_norm_ptr = l1_norm;
711
712 return hypre_error_flag;
713 }
714
715 /*--------------------------------------------------------------------------
716 * hypre_ParCSRMatrixSetDiagRows
717 *
718 * For every row containing only a diagonal element: set it to d.
719 *--------------------------------------------------------------------------*/
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
/* Device kernel: one thread per row. If row i contains exactly one diag
 * entry, that entry is the diagonal (column == i), and the offd part of
 * the row is empty, overwrite the diagonal value with d. */
__global__ void
hypreCUDAKernel_ParCSRMatrixSetDiagRows(HYPRE_Int nrows,
                                        HYPRE_Int *A_diag_I,
                                        HYPRE_Int *A_diag_J,
                                        HYPRE_Complex *A_diag_data,
                                        HYPRE_Int *A_offd_I,
                                        HYPRE_Int num_cols_offd,
                                        HYPRE_Real d)
{
   const HYPRE_Int i = hypre_cuda_get_grid_thread_id<1,1>();
   if (i >= nrows)
   {
      return;
   }

   HYPRE_Int j = read_only_load(&A_diag_I[i]);

   /* single diag entry, on the diagonal, and no offd entries */
   if ( (read_only_load(&A_diag_I[i+1]) == j+1) && (read_only_load(&A_diag_J[j]) == i) &&
        (!num_cols_offd || (read_only_load(&A_offd_I[i+1]) == read_only_load(&A_offd_I[i]))) )
   {
      A_diag_data[j] = d;
   }
}
#endif
745
hypre_ParCSRMatrixSetDiagRows(hypre_ParCSRMatrix * A,HYPRE_Real d)746 HYPRE_Int hypre_ParCSRMatrixSetDiagRows(hypre_ParCSRMatrix *A, HYPRE_Real d)
747 {
748 HYPRE_Int i, j;
749 HYPRE_Int num_rows = hypre_ParCSRMatrixNumRows(A);
750
751 hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
752 HYPRE_Int *A_diag_I = hypre_CSRMatrixI(A_diag);
753 HYPRE_Int *A_diag_J = hypre_CSRMatrixJ(A_diag);
754 HYPRE_Real *A_diag_data = hypre_CSRMatrixData(A_diag);
755
756 hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
757 HYPRE_Int *A_offd_I = hypre_CSRMatrixI(A_offd);
758 HYPRE_Int num_cols_offd = hypre_CSRMatrixNumCols(A_offd);
759
760 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
761 HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) );
762 if (exec == HYPRE_EXEC_DEVICE)
763 {
764 dim3 bDim = hypre_GetDefaultCUDABlockDimension();
765 dim3 gDim = hypre_GetDefaultCUDAGridDimension(num_rows, "thread", bDim);
766 HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParCSRMatrixSetDiagRows, gDim, bDim,
767 num_rows, A_diag_I, A_diag_J, A_diag_data, A_offd_I, num_cols_offd, d);
768 }
769 else
770 #endif
771 {
772 for (i = 0; i < num_rows; i++)
773 {
774 j = A_diag_I[i];
775 if ((A_diag_I[i+1] == j+1) && (A_diag_J[j] == i) &&
776 (!num_cols_offd || (A_offd_I[i+1] == A_offd_I[i])))
777 {
778 A_diag_data[j] = d;
779 }
780 }
781 }
782
783 return hypre_error_flag;
784 }
785
786 /*--------------------------------------------------------------------------
787 * hypre_AMSCreate
788 *
789 * Allocate the AMS solver structure.
790 *--------------------------------------------------------------------------*/
791
hypre_AMSCreate()792 void * hypre_AMSCreate()
793 {
794 hypre_AMSData *ams_data;
795
796 ams_data = hypre_CTAlloc(hypre_AMSData, 1, HYPRE_MEMORY_HOST);
797
798 /* Default parameters */
799
800 ams_data -> dim = 3; /* 3D problem */
801 ams_data -> maxit = 20; /* perform at most 20 iterations */
802 ams_data -> tol = 1e-6; /* convergence tolerance */
803 ams_data -> print_level = 1; /* print residual norm at each step */
804 ams_data -> cycle_type = 1; /* a 3-level multiplicative solver */
805 ams_data -> A_relax_type = 2; /* offd-l1-scaled GS */
806 ams_data -> A_relax_times = 1; /* one relaxation sweep */
807 ams_data -> A_relax_weight = 1.0; /* damping parameter */
808 ams_data -> A_omega = 1.0; /* SSOR coefficient */
809 ams_data -> A_cheby_order = 2; /* Cheby: order (1 -4 are vaild) */
810 ams_data -> A_cheby_fraction = .3; /* Cheby: fraction of spectrum to smooth */
811
812 ams_data -> B_G_coarsen_type = 10; /* HMIS coarsening */
813 ams_data -> B_G_agg_levels = 1; /* Levels of aggressive coarsening */
814 ams_data -> B_G_relax_type = 3; /* hybrid G-S/Jacobi */
815 ams_data -> B_G_theta = 0.25; /* strength threshold */
816 ams_data -> B_G_interp_type = 0; /* interpolation type */
817 ams_data -> B_G_Pmax = 0; /* max nonzero elements in interp. rows */
818 ams_data -> B_Pi_coarsen_type = 10; /* HMIS coarsening */
819 ams_data -> B_Pi_agg_levels = 1; /* Levels of aggressive coarsening */
820 ams_data -> B_Pi_relax_type = 3; /* hybrid G-S/Jacobi */
821 ams_data -> B_Pi_theta = 0.25; /* strength threshold */
822 ams_data -> B_Pi_interp_type = 0; /* interpolation type */
823 ams_data -> B_Pi_Pmax = 0; /* max nonzero elements in interp. rows */
824 ams_data -> beta_is_zero = 0; /* the problem has a mass term */
825
826 /* By default, do l1-GS smoothing on the coarsest grid */
827 ams_data -> B_G_coarse_relax_type = 8;
828 ams_data -> B_Pi_coarse_relax_type = 8;
829
830 /* The rest of the fields are initialized using the Set functions */
831
832 ams_data -> A = NULL;
833 ams_data -> G = NULL;
834 ams_data -> A_G = NULL;
835 ams_data -> B_G = 0;
836 ams_data -> Pi = NULL;
837 ams_data -> A_Pi = NULL;
838 ams_data -> B_Pi = 0;
839 ams_data -> x = NULL;
840 ams_data -> y = NULL;
841 ams_data -> z = NULL;
842 ams_data -> Gx = NULL;
843 ams_data -> Gy = NULL;
844 ams_data -> Gz = NULL;
845
846 ams_data -> r0 = NULL;
847 ams_data -> g0 = NULL;
848 ams_data -> r1 = NULL;
849 ams_data -> g1 = NULL;
850 ams_data -> r2 = NULL;
851 ams_data -> g2 = NULL;
852 ams_data -> zz = NULL;
853
854 ams_data -> Pix = NULL;
855 ams_data -> Piy = NULL;
856 ams_data -> Piz = NULL;
857 ams_data -> A_Pix = NULL;
858 ams_data -> A_Piy = NULL;
859 ams_data -> A_Piz = NULL;
860 ams_data -> B_Pix = 0;
861 ams_data -> B_Piy = 0;
862 ams_data -> B_Piz = 0;
863
864 ams_data -> interior_nodes = NULL;
865 ams_data -> G0 = NULL;
866 ams_data -> A_G0 = NULL;
867 ams_data -> B_G0 = 0;
868 ams_data -> projection_frequency = 5;
869
870 ams_data -> A_l1_norms = NULL;
871 ams_data -> A_max_eig_est = 0;
872 ams_data -> A_min_eig_est = 0;
873
874 ams_data -> owns_Pi = 1;
875 ams_data -> owns_A_G = 0;
876 ams_data -> owns_A_Pi = 0;
877
878 return (void *) ams_data;
879 }
880
881 /*--------------------------------------------------------------------------
882 * hypre_AMSDestroy
883 *
884 * Deallocate the AMS solver structure. Note that the input data (given
885 * through the Set functions) is not destroyed.
886 *--------------------------------------------------------------------------*/
887
hypre_AMSDestroy(void * solver)888 HYPRE_Int hypre_AMSDestroy(void *solver)
889 {
890 hypre_AMSData *ams_data = (hypre_AMSData *) solver;
891
892 if (!ams_data)
893 {
894 hypre_error_in_arg(1);
895 return hypre_error_flag;
896 }
897
898 if (ams_data -> owns_A_G)
899 if (ams_data -> A_G)
900 hypre_ParCSRMatrixDestroy(ams_data -> A_G);
901 if (!ams_data -> beta_is_zero)
902 if (ams_data -> B_G)
903 HYPRE_BoomerAMGDestroy(ams_data -> B_G);
904
905 if (ams_data -> owns_Pi && ams_data -> Pi)
906 hypre_ParCSRMatrixDestroy(ams_data -> Pi);
907 if (ams_data -> owns_A_Pi)
908 if (ams_data -> A_Pi)
909 hypre_ParCSRMatrixDestroy(ams_data -> A_Pi);
910 if (ams_data -> B_Pi)
911 HYPRE_BoomerAMGDestroy(ams_data -> B_Pi);
912
913 if (ams_data -> owns_Pi && ams_data -> Pix)
914 hypre_ParCSRMatrixDestroy(ams_data -> Pix);
915 if (ams_data -> A_Pix)
916 hypre_ParCSRMatrixDestroy(ams_data -> A_Pix);
917 if (ams_data -> B_Pix)
918 HYPRE_BoomerAMGDestroy(ams_data -> B_Pix);
919 if (ams_data -> owns_Pi && ams_data -> Piy)
920 hypre_ParCSRMatrixDestroy(ams_data -> Piy);
921 if (ams_data -> A_Piy)
922 hypre_ParCSRMatrixDestroy(ams_data -> A_Piy);
923 if (ams_data -> B_Piy)
924 HYPRE_BoomerAMGDestroy(ams_data -> B_Piy);
925 if (ams_data -> owns_Pi && ams_data -> Piz)
926 hypre_ParCSRMatrixDestroy(ams_data -> Piz);
927 if (ams_data -> A_Piz)
928 hypre_ParCSRMatrixDestroy(ams_data -> A_Piz);
929 if (ams_data -> B_Piz)
930 HYPRE_BoomerAMGDestroy(ams_data -> B_Piz);
931
932 if (ams_data -> r0)
933 hypre_ParVectorDestroy(ams_data -> r0);
934 if (ams_data -> g0)
935 hypre_ParVectorDestroy(ams_data -> g0);
936 if (ams_data -> r1)
937 hypre_ParVectorDestroy(ams_data -> r1);
938 if (ams_data -> g1)
939 hypre_ParVectorDestroy(ams_data -> g1);
940 if (ams_data -> r2)
941 hypre_ParVectorDestroy(ams_data -> r2);
942 if (ams_data -> g2)
943 hypre_ParVectorDestroy(ams_data -> g2);
944 if (ams_data -> zz)
945 hypre_ParVectorDestroy(ams_data -> zz);
946
947 if (ams_data -> G0)
948 hypre_ParCSRMatrixDestroy(ams_data -> A);
949 if (ams_data -> G0)
950 hypre_ParCSRMatrixDestroy(ams_data -> G0);
951 if (ams_data -> A_G0)
952 hypre_ParCSRMatrixDestroy(ams_data -> A_G0);
953 if (ams_data -> B_G0)
954 HYPRE_BoomerAMGDestroy(ams_data -> B_G0);
955
956 hypre_SeqVectorDestroy(ams_data -> A_l1_norms);
957
958 /* G, x, y ,z, Gx, Gy and Gz are not destroyed */
959
960 if (ams_data)
961 {
962 hypre_TFree(ams_data, HYPRE_MEMORY_HOST);
963 }
964
965 return hypre_error_flag;
966 }
967
968 /*--------------------------------------------------------------------------
969 * hypre_AMSSetDimension
970 *
971 * Set problem dimension (2 or 3). By default we assume dim = 3.
972 *--------------------------------------------------------------------------*/
973
hypre_AMSSetDimension(void * solver,HYPRE_Int dim)974 HYPRE_Int hypre_AMSSetDimension(void *solver,
975 HYPRE_Int dim)
976 {
977 hypre_AMSData *ams_data = (hypre_AMSData *) solver;
978
979 if (dim != 1 && dim != 2 && dim != 3)
980 hypre_error_in_arg(2);
981
982 ams_data -> dim = dim;
983 return hypre_error_flag;
984 }
985
986 /*--------------------------------------------------------------------------
987 * hypre_AMSSetDiscreteGradient
988 *
989 * Set the discrete gradient matrix G.
990 * This function should be called before hypre_AMSSetup()!
991 *--------------------------------------------------------------------------*/
992
hypre_AMSSetDiscreteGradient(void * solver,hypre_ParCSRMatrix * G)993 HYPRE_Int hypre_AMSSetDiscreteGradient(void *solver,
994 hypre_ParCSRMatrix *G)
995 {
996 hypre_AMSData *ams_data = (hypre_AMSData *) solver;
997 ams_data -> G = G;
998 return hypre_error_flag;
999 }
1000
1001 /*--------------------------------------------------------------------------
1002 * hypre_AMSSetCoordinateVectors
1003 *
1004 * Set the x, y and z coordinates of the vertices in the mesh.
1005 *
1006 * Either SetCoordinateVectors or SetEdgeConstantVectors should be
1007 * called before hypre_AMSSetup()!
1008 *--------------------------------------------------------------------------*/
1009
hypre_AMSSetCoordinateVectors(void * solver,hypre_ParVector * x,hypre_ParVector * y,hypre_ParVector * z)1010 HYPRE_Int hypre_AMSSetCoordinateVectors(void *solver,
1011 hypre_ParVector *x,
1012 hypre_ParVector *y,
1013 hypre_ParVector *z)
1014 {
1015 hypre_AMSData *ams_data = (hypre_AMSData *) solver;
1016 ams_data -> x = x;
1017 ams_data -> y = y;
1018 ams_data -> z = z;
1019 return hypre_error_flag;
1020 }
1021
1022 /*--------------------------------------------------------------------------
1023 * hypre_AMSSetEdgeConstantVectors
1024 *
1025 * Set the vectors Gx, Gy and Gz which give the representations of
1026 * the constant vector fields (1,0,0), (0,1,0) and (0,0,1) in the
1027 * edge element basis.
1028 *
1029 * Either SetCoordinateVectors or SetEdgeConstantVectors should be
1030 * called before hypre_AMSSetup()!
1031 *--------------------------------------------------------------------------*/
1032
hypre_AMSSetEdgeConstantVectors(void * solver,hypre_ParVector * Gx,hypre_ParVector * Gy,hypre_ParVector * Gz)1033 HYPRE_Int hypre_AMSSetEdgeConstantVectors(void *solver,
1034 hypre_ParVector *Gx,
1035 hypre_ParVector *Gy,
1036 hypre_ParVector *Gz)
1037 {
1038 hypre_AMSData *ams_data = (hypre_AMSData *) solver;
1039 ams_data -> Gx = Gx;
1040 ams_data -> Gy = Gy;
1041 ams_data -> Gz = Gz;
1042 return hypre_error_flag;
1043 }
1044
1045 /*--------------------------------------------------------------------------
1046 * hypre_AMSSetInterpolations
1047 *
1048 * Set the (components of) the Nedelec interpolation matrix Pi=[Pix,Piy,Piz].
1049 *
1050 * This function is generally intended to be used only for high-order Nedelec
1051 * discretizations (in the lowest order case, Pi is constructed internally in
 * AMS from the discrete gradient matrix and the coordinates of the vertices),
1053 * though it can also be used in the lowest-order case or for other types of
1054 * discretizations (e.g. ones based on the second family of Nedelec elements).
1055 *
1056 * By definition, Pi is the matrix representation of the linear operator that
1057 * interpolates (high-order) vector nodal finite elements into the (high-order)
1058 * Nedelec space. The component matrices are defined as Pix phi = Pi (phi,0,0)
1059 * and similarly for Piy and Piz. Note that all these operators depend on the
1060 * choice of the basis and degrees of freedom in the high-order spaces.
1061 *
1062 * The column numbering of Pi should be node-based, i.e. the x/y/z components of
1063 * the first node (vertex or high-order dof) should be listed first, followed by
1064 * the x/y/z components of the second node and so on (see the documentation of
1065 * HYPRE_BoomerAMGSetDofFunc).
1066 *
1067 * If used, this function should be called before hypre_AMSSetup() and there is
1068 * no need to provide the vertex coordinates. Furthermore, only one of the sets
1069 * {Pi} and {Pix,Piy,Piz} needs to be specified (though it is OK to provide
1070 * both). If Pix is NULL, then scalar Pi-based AMS cycles, i.e. those with
1071 * cycle_type > 10, will be unavailable. Similarly, AMS cycles based on
1072 * monolithic Pi (cycle_type < 10) require that Pi is not NULL.
1073 *--------------------------------------------------------------------------*/
1074
hypre_AMSSetInterpolations(void * solver,hypre_ParCSRMatrix * Pi,hypre_ParCSRMatrix * Pix,hypre_ParCSRMatrix * Piy,hypre_ParCSRMatrix * Piz)1075 HYPRE_Int hypre_AMSSetInterpolations(void *solver,
1076 hypre_ParCSRMatrix *Pi,
1077 hypre_ParCSRMatrix *Pix,
1078 hypre_ParCSRMatrix *Piy,
1079 hypre_ParCSRMatrix *Piz)
1080 {
1081 hypre_AMSData *ams_data = (hypre_AMSData *) solver;
1082 ams_data -> Pi = Pi;
1083 ams_data -> Pix = Pix;
1084 ams_data -> Piy = Piy;
1085 ams_data -> Piz = Piz;
1086 ams_data -> owns_Pi = 0;
1087 return hypre_error_flag;
1088 }
1089
1090 /*--------------------------------------------------------------------------
1091 * hypre_AMSSetAlphaPoissonMatrix
1092 *
1093 * Set the matrix corresponding to the Poisson problem with coefficient
1094 * alpha (the curl-curl term coefficient in the Maxwell problem).
1095 *
1096 * If this function is called, the coarse space solver on the range
1097 * of Pi^T is a block-diagonal version of A_Pi. If this function is not
1098 * called, the coarse space solver on the range of Pi^T is constructed
1099 * as Pi^T A Pi in hypre_AMSSetup().
1100 *--------------------------------------------------------------------------*/
1101
hypre_AMSSetAlphaPoissonMatrix(void * solver,hypre_ParCSRMatrix * A_Pi)1102 HYPRE_Int hypre_AMSSetAlphaPoissonMatrix(void *solver,
1103 hypre_ParCSRMatrix *A_Pi)
1104 {
1105 hypre_AMSData *ams_data = (hypre_AMSData *) solver;
1106 ams_data -> A_Pi = A_Pi;
1107
1108 /* Penalize the eliminated degrees of freedom */
1109 hypre_ParCSRMatrixSetDiagRows(A_Pi, HYPRE_REAL_MAX);
1110
1111 /* Make sure that the first entry in each row is the diagonal one. */
1112 /* hypre_CSRMatrixReorder(hypre_ParCSRMatrixDiag(A_Pi)); */
1113
1114 return hypre_error_flag;
1115 }
1116
1117 /*--------------------------------------------------------------------------
1118 * hypre_AMSSetBetaPoissonMatrix
1119 *
1120 * Set the matrix corresponding to the Poisson problem with coefficient
1121 * beta (the mass term coefficient in the Maxwell problem).
1122 *
1123 * This function call is optional - if not given, the Poisson matrix will
1124 * be computed in hypre_AMSSetup(). If the given matrix is NULL, we assume
1125 * that beta is 0 and use two-level (instead of three-level) methods.
1126 *--------------------------------------------------------------------------*/
1127
hypre_AMSSetBetaPoissonMatrix(void * solver,hypre_ParCSRMatrix * A_G)1128 HYPRE_Int hypre_AMSSetBetaPoissonMatrix(void *solver,
1129 hypre_ParCSRMatrix *A_G)
1130 {
1131 hypre_AMSData *ams_data = (hypre_AMSData *) solver;
1132 ams_data -> A_G = A_G;
1133 if (!A_G)
1134 ams_data -> beta_is_zero = 1;
1135 else
1136 {
1137 /* Penalize the eliminated degrees of freedom */
1138 hypre_ParCSRMatrixSetDiagRows(A_G, HYPRE_REAL_MAX);
1139
1140 /* Make sure that the first entry in each row is the diagonal one. */
1141 /* hypre_CSRMatrixReorder(hypre_ParCSRMatrixDiag(A_G)); */
1142 }
1143 return hypre_error_flag;
1144 }
1145
1146 /*--------------------------------------------------------------------------
1147 * hypre_AMSSetInteriorNodes
1148 *
1149 * Set the list of nodes which are interior to the zero-conductivity region.
1150 * A node is interior if interior_nodes[i] == 1.0.
1151 *
1152 * Should be called before hypre_AMSSetup()!
1153 *--------------------------------------------------------------------------*/
1154
hypre_AMSSetInteriorNodes(void * solver,hypre_ParVector * interior_nodes)1155 HYPRE_Int hypre_AMSSetInteriorNodes(void *solver,
1156 hypre_ParVector *interior_nodes)
1157 {
1158 hypre_AMSData *ams_data = (hypre_AMSData *) solver;
1159 ams_data -> interior_nodes = interior_nodes;
1160 return hypre_error_flag;
1161 }
1162
1163 /*--------------------------------------------------------------------------
1164 * hypre_AMSSetProjectionFrequency
1165 *
1166 * How often to project the r.h.s. onto the compatible sub-space Ker(G0^T),
1167 * when iterating with the solver.
1168 *
1169 * The default value is every 5th iteration.
1170 *--------------------------------------------------------------------------*/
1171
hypre_AMSSetProjectionFrequency(void * solver,HYPRE_Int projection_frequency)1172 HYPRE_Int hypre_AMSSetProjectionFrequency(void *solver,
1173 HYPRE_Int projection_frequency)
1174 {
1175 hypre_AMSData *ams_data = (hypre_AMSData *) solver;
1176 ams_data -> projection_frequency = projection_frequency;
1177 return hypre_error_flag;
1178 }
1179
1180 /*--------------------------------------------------------------------------
1181 * hypre_AMSSetMaxIter
1182 *
1183 * Set the maximum number of iterations in the three-level method.
1184 * The default value is 20. To use the AMS solver as a preconditioner,
1185 * set maxit to 1, tol to 0.0 and print_level to 0.
1186 *--------------------------------------------------------------------------*/
1187
hypre_AMSSetMaxIter(void * solver,HYPRE_Int maxit)1188 HYPRE_Int hypre_AMSSetMaxIter(void *solver,
1189 HYPRE_Int maxit)
1190 {
1191 hypre_AMSData *ams_data = (hypre_AMSData *) solver;
1192 ams_data -> maxit = maxit;
1193 return hypre_error_flag;
1194 }
1195
1196 /*--------------------------------------------------------------------------
1197 * hypre_AMSSetTol
1198 *
1199 * Set the convergence tolerance (if the method is used as a solver).
1200 * The default value is 1e-6.
1201 *--------------------------------------------------------------------------*/
1202
hypre_AMSSetTol(void * solver,HYPRE_Real tol)1203 HYPRE_Int hypre_AMSSetTol(void *solver,
1204 HYPRE_Real tol)
1205 {
1206 hypre_AMSData *ams_data = (hypre_AMSData *) solver;
1207 ams_data -> tol = tol;
1208 return hypre_error_flag;
1209 }
1210
1211 /*--------------------------------------------------------------------------
1212 * hypre_AMSSetCycleType
1213 *
1214 * Choose which three-level solver to use. Possible values are:
1215 *
1216 * 1 = 3-level multipl. solver (01210) <-- small solution time
1217 * 2 = 3-level additive solver (0+1+2)
1218 * 3 = 3-level multipl. solver (02120)
1219 * 4 = 3-level additive solver (010+2)
1220 * 5 = 3-level multipl. solver (0102010) <-- small solution time
1221 * 6 = 3-level additive solver (1+020)
1222 * 7 = 3-level multipl. solver (0201020) <-- small number of iterations
1223 * 8 = 3-level additive solver (0(1+2)0) <-- small solution time
1224 * 9 = 3-level multipl. solver (01210) with discrete divergence
1225 * 11 = 5-level multipl. solver (013454310) <-- small solution time, memory
1226 * 12 = 5-level additive solver (0+1+3+4+5)
1227 * 13 = 5-level multipl. solver (034515430) <-- small solution time, memory
1228 * 14 = 5-level additive solver (01(3+4+5)10)
1229 * 20 = 2-level multipl. solver (0[12]0)
1230 *
1231 * 0 = a Hiptmair-like smoother (010)
1232 *
1233 * The default value is 1.
1234 *--------------------------------------------------------------------------*/
1235
hypre_AMSSetCycleType(void * solver,HYPRE_Int cycle_type)1236 HYPRE_Int hypre_AMSSetCycleType(void *solver,
1237 HYPRE_Int cycle_type)
1238 {
1239 hypre_AMSData *ams_data = (hypre_AMSData *) solver;
1240 ams_data -> cycle_type = cycle_type;
1241 return hypre_error_flag;
1242 }
1243
1244 /*--------------------------------------------------------------------------
1245 * hypre_AMSSetPrintLevel
1246 *
1247 * Control how much information is printed during the solution iterations.
 * The default value is 1 (print residual norm at each step).
1249 *--------------------------------------------------------------------------*/
1250
hypre_AMSSetPrintLevel(void * solver,HYPRE_Int print_level)1251 HYPRE_Int hypre_AMSSetPrintLevel(void *solver,
1252 HYPRE_Int print_level)
1253 {
1254 hypre_AMSData *ams_data = (hypre_AMSData *) solver;
1255 ams_data -> print_level = print_level;
1256 return hypre_error_flag;
1257 }
1258
1259 /*--------------------------------------------------------------------------
1260 * hypre_AMSSetSmoothingOptions
1261 *
1262 * Set relaxation parameters for A. Default values: 2, 1, 1.0, 1.0.
1263 *--------------------------------------------------------------------------*/
1264
hypre_AMSSetSmoothingOptions(void * solver,HYPRE_Int A_relax_type,HYPRE_Int A_relax_times,HYPRE_Real A_relax_weight,HYPRE_Real A_omega)1265 HYPRE_Int hypre_AMSSetSmoothingOptions(void *solver,
1266 HYPRE_Int A_relax_type,
1267 HYPRE_Int A_relax_times,
1268 HYPRE_Real A_relax_weight,
1269 HYPRE_Real A_omega)
1270 {
1271 hypre_AMSData *ams_data = (hypre_AMSData *) solver;
1272 ams_data -> A_relax_type = A_relax_type;
1273 ams_data -> A_relax_times = A_relax_times;
1274 ams_data -> A_relax_weight = A_relax_weight;
1275 ams_data -> A_omega = A_omega;
1276 return hypre_error_flag;
1277 }
1278 /*--------------------------------------------------------------------------
1279 * hypre_AMSSetChebySmoothingOptions
1280 * AB: note: this could be added to the above,
1281 * but I didn't want to change parameter list)
 * Set parameters for the Chebyshev smoother for A. Default values: 2, 0.3.
1283 *--------------------------------------------------------------------------*/
1284
hypre_AMSSetChebySmoothingOptions(void * solver,HYPRE_Int A_cheby_order,HYPRE_Int A_cheby_fraction)1285 HYPRE_Int hypre_AMSSetChebySmoothingOptions(void *solver,
1286 HYPRE_Int A_cheby_order,
1287 HYPRE_Int A_cheby_fraction)
1288 {
1289 hypre_AMSData *ams_data = (hypre_AMSData *) solver;
1290 ams_data -> A_cheby_order = A_cheby_order;
1291 ams_data -> A_cheby_fraction = A_cheby_fraction;
1292
1293 return hypre_error_flag;
1294 }
1295 /*--------------------------------------------------------------------------
1296 * hypre_AMSSetAlphaAMGOptions
1297 *
1298 * Set AMG parameters for B_Pi. Default values: 10, 1, 3, 0.25, 0, 0.
1299 *--------------------------------------------------------------------------*/
1300
hypre_AMSSetAlphaAMGOptions(void * solver,HYPRE_Int B_Pi_coarsen_type,HYPRE_Int B_Pi_agg_levels,HYPRE_Int B_Pi_relax_type,HYPRE_Real B_Pi_theta,HYPRE_Int B_Pi_interp_type,HYPRE_Int B_Pi_Pmax)1301 HYPRE_Int hypre_AMSSetAlphaAMGOptions(void *solver,
1302 HYPRE_Int B_Pi_coarsen_type,
1303 HYPRE_Int B_Pi_agg_levels,
1304 HYPRE_Int B_Pi_relax_type,
1305 HYPRE_Real B_Pi_theta,
1306 HYPRE_Int B_Pi_interp_type,
1307 HYPRE_Int B_Pi_Pmax)
1308 {
1309 hypre_AMSData *ams_data = (hypre_AMSData *) solver;
1310 ams_data -> B_Pi_coarsen_type = B_Pi_coarsen_type;
1311 ams_data -> B_Pi_agg_levels = B_Pi_agg_levels;
1312 ams_data -> B_Pi_relax_type = B_Pi_relax_type;
1313 ams_data -> B_Pi_theta = B_Pi_theta;
1314 ams_data -> B_Pi_interp_type = B_Pi_interp_type;
1315 ams_data -> B_Pi_Pmax = B_Pi_Pmax;
1316 return hypre_error_flag;
1317 }
1318
1319 /*--------------------------------------------------------------------------
1320 * hypre_AMSSetAlphaAMGCoarseRelaxType
1321 *
1322 * Set the AMG coarsest level relaxation for B_Pi. Default value: 8.
1323 *--------------------------------------------------------------------------*/
1324
hypre_AMSSetAlphaAMGCoarseRelaxType(void * solver,HYPRE_Int B_Pi_coarse_relax_type)1325 HYPRE_Int hypre_AMSSetAlphaAMGCoarseRelaxType(void *solver,
1326 HYPRE_Int B_Pi_coarse_relax_type)
1327 {
1328 hypre_AMSData *ams_data = (hypre_AMSData *)solver;
1329 ams_data -> B_Pi_coarse_relax_type = B_Pi_coarse_relax_type;
1330 return hypre_error_flag;
1331 }
1332
1333 /*--------------------------------------------------------------------------
1334 * hypre_AMSSetBetaAMGOptions
1335 *
1336 * Set AMG parameters for B_G. Default values: 10, 1, 3, 0.25, 0, 0.
1337 *--------------------------------------------------------------------------*/
1338
hypre_AMSSetBetaAMGOptions(void * solver,HYPRE_Int B_G_coarsen_type,HYPRE_Int B_G_agg_levels,HYPRE_Int B_G_relax_type,HYPRE_Real B_G_theta,HYPRE_Int B_G_interp_type,HYPRE_Int B_G_Pmax)1339 HYPRE_Int hypre_AMSSetBetaAMGOptions(void *solver,
1340 HYPRE_Int B_G_coarsen_type,
1341 HYPRE_Int B_G_agg_levels,
1342 HYPRE_Int B_G_relax_type,
1343 HYPRE_Real B_G_theta,
1344 HYPRE_Int B_G_interp_type,
1345 HYPRE_Int B_G_Pmax)
1346 {
1347 hypre_AMSData *ams_data = (hypre_AMSData *) solver;
1348 ams_data -> B_G_coarsen_type = B_G_coarsen_type;
1349 ams_data -> B_G_agg_levels = B_G_agg_levels;
1350 ams_data -> B_G_relax_type = B_G_relax_type;
1351 ams_data -> B_G_theta = B_G_theta;
1352 ams_data -> B_G_interp_type = B_G_interp_type;
1353 ams_data -> B_G_Pmax = B_G_Pmax;
1354 return hypre_error_flag;
1355 }
1356
1357 /*--------------------------------------------------------------------------
1358 * hypre_AMSSetBetaAMGCoarseRelaxType
1359 *
1360 * Set the AMG coarsest level relaxation for B_G. Default value: 8.
1361 *--------------------------------------------------------------------------*/
1362
hypre_AMSSetBetaAMGCoarseRelaxType(void * solver,HYPRE_Int B_G_coarse_relax_type)1363 HYPRE_Int hypre_AMSSetBetaAMGCoarseRelaxType(void *solver,
1364 HYPRE_Int B_G_coarse_relax_type)
1365 {
1366 hypre_AMSData *ams_data = (hypre_AMSData *) solver;
1367 ams_data -> B_G_coarse_relax_type = B_G_coarse_relax_type;
1368 return hypre_error_flag;
1369 }
1370
1371 /*--------------------------------------------------------------------------
1372 * hypre_AMSComputePi
1373 *
1374 * Construct the Pi interpolation matrix, which maps the space of vector
1375 * linear finite elements to the space of edge finite elements.
1376 *
1377 * The construction is based on the fact that Pi = [Pi_x, Pi_y, Pi_z],
1378 * where each block has the same sparsity structure as G, and the entries
1379 * can be computed from the vectors Gx, Gy, Gz.
1380 *--------------------------------------------------------------------------*/
1381
1382 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
__global__ void
hypreCUDAKernel_AMSComputePi_copy1(HYPRE_Int nnz,
                                   HYPRE_Int dim,
                                   HYPRE_Int *j_in,
                                   HYPRE_Int *j_out)
{
   /* One thread per nonzero of G: expand each column index j_in[i] into
      the dim consecutive (node-based) column indices of Pi,
      dim*j_in[i] + d for d = 0..dim-1, written at positions dim*i + d. */
   const HYPRE_Int i = hypre_cuda_get_grid_thread_id<1,1>();

   if (i < nnz)
   {
      /* Each input nonzero produces dim output entries starting at dim*i. */
      const HYPRE_Int j = dim * i;

      for (HYPRE_Int d = 0; d < dim; d++)
      {
         j_out[j+d] = dim * read_only_load(&j_in[i]) + d;
      }
   }
}
1401
__global__ void
hypreCUDAKernel_AMSComputePi_copy2(HYPRE_Int nrows,
                                   HYPRE_Int dim,
                                   HYPRE_Int *i_in,
                                   HYPRE_Real *data_in,
                                   HYPRE_Real *Gx_data,
                                   HYPRE_Real *Gy_data,
                                   HYPRE_Real *Gz_data,
                                   HYPRE_Real *data_out)
{
   /* One warp per row of G: for every nonzero j in the row, write the dim
      interleaved data entries of Pi, data_out[j*dim + d] = v * G_d[i],
      where v = |data_in[j]|/2, or 1.0 when data_in is NULL. */
   const HYPRE_Int i = hypre_cuda_get_grid_warp_id<1,1>();

   if (i >= nrows)
   {
      return;
   }

   const HYPRE_Int lane_id = hypre_cuda_get_lane_id<1>();
   HYPRE_Int j, istart, iend;
   HYPRE_Real t, G[3], *Gdata[3];

   /* Only the first 'dim' pointers are dereferenced below; for dim < 3
      the remaining ones may be NULL/unused. */
   Gdata[0] = Gx_data;
   Gdata[1] = Gy_data;
   Gdata[2] = Gz_data;

   /* Lanes 0 and 1 read the row pointers [istart, iend) ... */
   if (lane_id < 2)
   {
      j = read_only_load(i_in + i + lane_id);
   }

   /* ... and broadcast them to the whole warp; 'j' is meaningful only on
      lanes 0/1, and the shuffles read exactly those lanes. */
   istart = __shfl_sync(HYPRE_WARP_FULL_MASK, j, 0);
   iend = __shfl_sync(HYPRE_WARP_FULL_MASK, j, 1);

   /* Lanes 0..dim-1 each load one component of (Gx,Gy,Gz) at row i ... */
   if (lane_id < dim)
   {
      t = read_only_load(Gdata[lane_id] + i);
   }

   /* ... and every lane gathers all dim components into G[]; 't' is read
      only from lanes < dim, so its value elsewhere is never used. */
   for (HYPRE_Int d = 0; d < dim; d++)
   {
      G[d] = __shfl_sync(HYPRE_WARP_FULL_MASK, t, d);
   }

   /* Warp-strided sweep over the row's nonzeros. */
   for (j = istart + lane_id; j < iend; j += HYPRE_WARP_SIZE)
   {
      const HYPRE_Real v = data_in ? fabs(read_only_load(&data_in[j])) * 0.5 : 1.0;
      const HYPRE_Int k = j * dim;

      for (HYPRE_Int d = 0; d < dim; d++)
      {
         data_out[k+d] = v * G[d];
      }
   }
}
1456
1457 #endif
1458
hypre_AMSComputePi(hypre_ParCSRMatrix * A,hypre_ParCSRMatrix * G,hypre_ParVector * Gx,hypre_ParVector * Gy,hypre_ParVector * Gz,HYPRE_Int dim,hypre_ParCSRMatrix ** Pi_ptr)1459 HYPRE_Int hypre_AMSComputePi(hypre_ParCSRMatrix *A,
1460 hypre_ParCSRMatrix *G,
1461 hypre_ParVector *Gx,
1462 hypre_ParVector *Gy,
1463 hypre_ParVector *Gz,
1464 HYPRE_Int dim,
1465 hypre_ParCSRMatrix **Pi_ptr)
1466 {
1467 hypre_ParCSRMatrix *Pi;
1468
1469 /* Compute Pi = [Pi_x, Pi_y, Pi_z] */
1470 {
1471 HYPRE_Int i, j, d;
1472
1473 HYPRE_Real *Gx_data, *Gy_data, *Gz_data;
1474
1475 MPI_Comm comm = hypre_ParCSRMatrixComm(G);
1476 HYPRE_BigInt global_num_rows = hypre_ParCSRMatrixGlobalNumRows(G);
1477 HYPRE_BigInt global_num_cols = dim*hypre_ParCSRMatrixGlobalNumCols(G);
1478 HYPRE_BigInt *row_starts = hypre_ParCSRMatrixRowStarts(G);
1479 HYPRE_BigInt *col_starts;
1480 HYPRE_Int col_starts_size;
1481 HYPRE_Int num_cols_offd = dim*hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(G));
1482 HYPRE_Int num_nonzeros_diag = dim*hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixDiag(G));
1483 HYPRE_Int num_nonzeros_offd = dim*hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixOffd(G));
1484 HYPRE_BigInt *col_starts_G = hypre_ParCSRMatrixColStarts(G);
1485
1486 col_starts_size = 2;
1487 col_starts = hypre_TAlloc(HYPRE_BigInt, col_starts_size, HYPRE_MEMORY_HOST);
1488 for (i = 0; i < col_starts_size; i++)
1489 col_starts[i] = (HYPRE_BigInt)dim * col_starts_G[i];
1490
1491 Pi = hypre_ParCSRMatrixCreate(comm,
1492 global_num_rows,
1493 global_num_cols,
1494 row_starts,
1495 col_starts,
1496 num_cols_offd,
1497 num_nonzeros_diag,
1498 num_nonzeros_offd);
1499
1500 hypre_ParCSRMatrixOwnsData(Pi) = 1;
1501 hypre_ParCSRMatrixInitialize(Pi);
1502 hypre_TFree(col_starts, HYPRE_MEMORY_HOST);
1503
1504 Gx_data = hypre_VectorData(hypre_ParVectorLocalVector(Gx));
1505 if (dim >= 2)
1506 Gy_data = hypre_VectorData(hypre_ParVectorLocalVector(Gy));
1507 if (dim == 3)
1508 Gz_data = hypre_VectorData(hypre_ParVectorLocalVector(Gz));
1509
1510 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
1511 HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy2( hypre_ParCSRMatrixMemoryLocation(G),
1512 hypre_ParCSRMatrixMemoryLocation(Pi) );
1513 #endif
1514
1515 /* Fill-in the diagonal part */
1516 {
1517 hypre_CSRMatrix *G_diag = hypre_ParCSRMatrixDiag(G);
1518 HYPRE_Int *G_diag_I = hypre_CSRMatrixI(G_diag);
1519 HYPRE_Int *G_diag_J = hypre_CSRMatrixJ(G_diag);
1520 HYPRE_Real *G_diag_data = hypre_CSRMatrixData(G_diag);
1521
1522 HYPRE_Int G_diag_nrows = hypre_CSRMatrixNumRows(G_diag);
1523 HYPRE_Int G_diag_nnz = hypre_CSRMatrixNumNonzeros(G_diag);
1524
1525 hypre_CSRMatrix *Pi_diag = hypre_ParCSRMatrixDiag(Pi);
1526 HYPRE_Int *Pi_diag_I = hypre_CSRMatrixI(Pi_diag);
1527 HYPRE_Int *Pi_diag_J = hypre_CSRMatrixJ(Pi_diag);
1528 HYPRE_Real *Pi_diag_data = hypre_CSRMatrixData(Pi_diag);
1529
1530 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
1531 if (exec == HYPRE_EXEC_DEVICE)
1532 {
1533 HYPRE_THRUST_CALL( transform,
1534 G_diag_I,
1535 G_diag_I + G_diag_nrows + 1,
1536 Pi_diag_I,
1537 dim * _1 );
1538
1539 dim3 bDim = hypre_GetDefaultCUDABlockDimension();
1540 dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_diag_nnz, "thread", bDim);
1541
1542 HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
1543 G_diag_nnz, dim, G_diag_J, Pi_diag_J );
1544
1545 gDim = hypre_GetDefaultCUDAGridDimension(G_diag_nrows, "warp", bDim);
1546
1547 HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
1548 G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data,
1549 Pi_diag_data );
1550 }
1551 else
1552 #endif
1553 {
1554 for (i = 0; i < G_diag_nrows+1; i++)
1555 Pi_diag_I[i] = dim * G_diag_I[i];
1556
1557 for (i = 0; i < G_diag_nnz; i++)
1558 for (d = 0; d < dim; d++)
1559 Pi_diag_J[dim*i+d] = dim*G_diag_J[i]+d;
1560
1561 for (i = 0; i < G_diag_nrows; i++)
1562 for (j = G_diag_I[i]; j < G_diag_I[i+1]; j++)
1563 {
1564 *Pi_diag_data++ = fabs(G_diag_data[j]) * 0.5 * Gx_data[i];
1565 if (dim >= 2)
1566 *Pi_diag_data++ = fabs(G_diag_data[j]) * 0.5 * Gy_data[i];
1567 if (dim == 3)
1568 *Pi_diag_data++ = fabs(G_diag_data[j]) * 0.5 * Gz_data[i];
1569 }
1570 }
1571 }
1572
1573 /* Fill-in the off-diagonal part */
1574 {
1575 hypre_CSRMatrix *G_offd = hypre_ParCSRMatrixOffd(G);
1576 HYPRE_Int *G_offd_I = hypre_CSRMatrixI(G_offd);
1577 HYPRE_Int *G_offd_J = hypre_CSRMatrixJ(G_offd);
1578 HYPRE_Real *G_offd_data = hypre_CSRMatrixData(G_offd);
1579
1580 HYPRE_Int G_offd_nrows = hypre_CSRMatrixNumRows(G_offd);
1581 HYPRE_Int G_offd_ncols = hypre_CSRMatrixNumCols(G_offd);
1582 HYPRE_Int G_offd_nnz = hypre_CSRMatrixNumNonzeros(G_offd);
1583
1584 hypre_CSRMatrix *Pi_offd = hypre_ParCSRMatrixOffd(Pi);
1585 HYPRE_Int *Pi_offd_I = hypre_CSRMatrixI(Pi_offd);
1586 HYPRE_Int *Pi_offd_J = hypre_CSRMatrixJ(Pi_offd);
1587 HYPRE_Real *Pi_offd_data = hypre_CSRMatrixData(Pi_offd);
1588
1589 HYPRE_BigInt *G_cmap = hypre_ParCSRMatrixColMapOffd(G);
1590 HYPRE_BigInt *Pi_cmap = hypre_ParCSRMatrixColMapOffd(Pi);
1591
1592 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
1593 if (exec == HYPRE_EXEC_DEVICE)
1594 {
1595 if (G_offd_ncols)
1596 {
1597 HYPRE_THRUST_CALL( transform,
1598 G_offd_I,
1599 G_offd_I + G_offd_nrows + 1,
1600 Pi_offd_I,
1601 dim * _1 );
1602 }
1603
1604 dim3 bDim = hypre_GetDefaultCUDABlockDimension();
1605 dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_offd_nnz, "thread", bDim);
1606
1607 HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
1608 G_offd_nnz, dim, G_offd_J, Pi_offd_J );
1609
1610 gDim = hypre_GetDefaultCUDAGridDimension(G_offd_nrows, "warp", bDim);
1611
1612 HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
1613 G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data,
1614 Pi_offd_data );
1615 }
1616 else
1617 #endif
1618 {
1619 if (G_offd_ncols)
1620 for (i = 0; i < G_offd_nrows+1; i++)
1621 Pi_offd_I[i] = dim * G_offd_I[i];
1622
1623 for (i = 0; i < G_offd_nnz; i++)
1624 for (d = 0; d < dim; d++)
1625 Pi_offd_J[dim*i+d] = dim*G_offd_J[i]+d;
1626
1627 for (i = 0; i < G_offd_nrows; i++)
1628 for (j = G_offd_I[i]; j < G_offd_I[i+1]; j++)
1629 {
1630 *Pi_offd_data++ = fabs(G_offd_data[j]) * 0.5 * Gx_data[i];
1631 if (dim >= 2)
1632 *Pi_offd_data++ = fabs(G_offd_data[j]) * 0.5 * Gy_data[i];
1633 if (dim == 3)
1634 *Pi_offd_data++ = fabs(G_offd_data[j]) * 0.5 * Gz_data[i];
1635 }
1636 }
1637
1638 for (i = 0; i < G_offd_ncols; i++)
1639 for (d = 0; d < dim; d++)
1640 Pi_cmap[dim*i+d] = (HYPRE_BigInt)dim * G_cmap[i] + (HYPRE_BigInt)d;
1641 }
1642 }
1643
1644 *Pi_ptr = Pi;
1645
1646 return hypre_error_flag;
1647 }
1648
1649 /*--------------------------------------------------------------------------
1650 * hypre_AMSComputePixyz
1651 *
1652 * Construct the components Pix, Piy, Piz of the interpolation matrix Pi,
1653 * which maps the space of vector linear finite elements to the space of
1654 * edge finite elements.
1655 *
1656 * The construction is based on the fact that each component has the same
1657 * sparsity structure as G, and the entries can be computed from the vectors
1658 * Gx, Gy, Gz.
1659 *--------------------------------------------------------------------------*/
1660
1661 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
__global__ void
hypreCUDAKernel_AMSComputePixyz_copy(HYPRE_Int nrows,
                                     HYPRE_Int dim,
                                     HYPRE_Int *i_in,
                                     HYPRE_Real *data_in,
                                     HYPRE_Real *Gx_data,
                                     HYPRE_Real *Gy_data,
                                     HYPRE_Real *Gz_data,
                                     HYPRE_Real *data_x_out,
                                     HYPRE_Real *data_y_out,
                                     HYPRE_Real *data_z_out )
{
   /* One warp per row: unlike the Pi kernel, the dim components are
      written to SEPARATE output arrays at the same position j,
      Odata[d][j] = v * G_d[i], with v = |data_in[j]|/2 (1.0 if NULL). */
   const HYPRE_Int i = hypre_cuda_get_grid_warp_id<1,1>();

   if (i >= nrows)
   {
      return;
   }

   const HYPRE_Int lane_id = hypre_cuda_get_lane_id<1>();
   HYPRE_Int j, istart, iend;
   HYPRE_Real t, G[3], *Gdata[3], *Odata[3];

   /* Only the first 'dim' input/output pointers are dereferenced below. */
   Gdata[0] = Gx_data;
   Gdata[1] = Gy_data;
   Gdata[2] = Gz_data;

   Odata[0] = data_x_out;
   Odata[1] = data_y_out;
   Odata[2] = data_z_out;

   /* Lanes 0 and 1 read the row pointers [istart, iend) ... */
   if (lane_id < 2)
   {
      j = read_only_load(i_in + i + lane_id);
   }

   /* ... and broadcast them to the whole warp; the shuffles read only
      lanes 0/1, where 'j' is defined. */
   istart = __shfl_sync(HYPRE_WARP_FULL_MASK, j, 0);
   iend = __shfl_sync(HYPRE_WARP_FULL_MASK, j, 1);

   /* Lanes 0..dim-1 each load one component of (Gx,Gy,Gz) at row i ... */
   if (lane_id < dim)
   {
      t = read_only_load(Gdata[lane_id] + i);
   }

   /* ... and every lane gathers all dim components; 't' is read only
      from lanes < dim. */
   for (HYPRE_Int d = 0; d < dim; d++)
   {
      G[d] = __shfl_sync(HYPRE_WARP_FULL_MASK, t, d);
   }

   /* Warp-strided sweep over the row's nonzeros. */
   for (j = istart + lane_id; j < iend; j += HYPRE_WARP_SIZE)
   {
      const HYPRE_Real v = data_in ? fabs(read_only_load(&data_in[j])) * 0.5 : 1.0;

      for (HYPRE_Int d = 0; d < dim; d++)
      {
         Odata[d][j] = v * G[d];
      }
   }
}
1721 #endif
1722
hypre_AMSComputePixyz(hypre_ParCSRMatrix * A,hypre_ParCSRMatrix * G,hypre_ParVector * Gx,hypre_ParVector * Gy,hypre_ParVector * Gz,HYPRE_Int dim,hypre_ParCSRMatrix ** Pix_ptr,hypre_ParCSRMatrix ** Piy_ptr,hypre_ParCSRMatrix ** Piz_ptr)1723 HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
1724 hypre_ParCSRMatrix *G,
1725 hypre_ParVector *Gx,
1726 hypre_ParVector *Gy,
1727 hypre_ParVector *Gz,
1728 HYPRE_Int dim,
1729 hypre_ParCSRMatrix **Pix_ptr,
1730 hypre_ParCSRMatrix **Piy_ptr,
1731 hypre_ParCSRMatrix **Piz_ptr)
1732 {
1733 hypre_ParCSRMatrix *Pix, *Piy, *Piz;
1734
1735 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
1736 HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(G) );
1737 #endif
1738
1739 /* Compute Pix, Piy, Piz */
1740 {
1741 HYPRE_Int i, j;
1742
1743 HYPRE_Real *Gx_data, *Gy_data, *Gz_data;
1744
1745 MPI_Comm comm = hypre_ParCSRMatrixComm(G);
1746 HYPRE_BigInt global_num_rows = hypre_ParCSRMatrixGlobalNumRows(G);
1747 HYPRE_BigInt global_num_cols = hypre_ParCSRMatrixGlobalNumCols(G);
1748 HYPRE_BigInt *row_starts = hypre_ParCSRMatrixRowStarts(G);
1749 HYPRE_BigInt *col_starts = hypre_ParCSRMatrixColStarts(G);
1750 HYPRE_Int num_cols_offd = hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(G));
1751 HYPRE_Int num_nonzeros_diag = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixDiag(G));
1752 HYPRE_Int num_nonzeros_offd = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixOffd(G));
1753
1754 Pix = hypre_ParCSRMatrixCreate(comm,
1755 global_num_rows,
1756 global_num_cols,
1757 row_starts,
1758 col_starts,
1759 num_cols_offd,
1760 num_nonzeros_diag,
1761 num_nonzeros_offd);
1762 hypre_ParCSRMatrixOwnsData(Pix) = 1;
1763 hypre_ParCSRMatrixInitialize(Pix);
1764
1765 if (dim >= 2)
1766 {
1767 Piy = hypre_ParCSRMatrixCreate(comm,
1768 global_num_rows,
1769 global_num_cols,
1770 row_starts,
1771 col_starts,
1772 num_cols_offd,
1773 num_nonzeros_diag,
1774 num_nonzeros_offd);
1775 hypre_ParCSRMatrixOwnsData(Piy) = 1;
1776 hypre_ParCSRMatrixInitialize(Piy);
1777 }
1778
1779 if (dim == 3)
1780 {
1781 Piz = hypre_ParCSRMatrixCreate(comm,
1782 global_num_rows,
1783 global_num_cols,
1784 row_starts,
1785 col_starts,
1786 num_cols_offd,
1787 num_nonzeros_diag,
1788 num_nonzeros_offd);
1789 hypre_ParCSRMatrixOwnsData(Piz) = 1;
1790 hypre_ParCSRMatrixInitialize(Piz);
1791 }
1792
1793 Gx_data = hypre_VectorData(hypre_ParVectorLocalVector(Gx));
1794 if (dim >= 2)
1795 Gy_data = hypre_VectorData(hypre_ParVectorLocalVector(Gy));
1796 if (dim == 3)
1797 Gz_data = hypre_VectorData(hypre_ParVectorLocalVector(Gz));
1798
1799 /* Fill-in the diagonal part */
1800 if (dim == 3)
1801 {
1802 hypre_CSRMatrix *G_diag = hypre_ParCSRMatrixDiag(G);
1803 HYPRE_Int *G_diag_I = hypre_CSRMatrixI(G_diag);
1804 HYPRE_Int *G_diag_J = hypre_CSRMatrixJ(G_diag);
1805 HYPRE_Real *G_diag_data = hypre_CSRMatrixData(G_diag);
1806
1807 HYPRE_Int G_diag_nrows = hypre_CSRMatrixNumRows(G_diag);
1808 HYPRE_Int G_diag_nnz = hypre_CSRMatrixNumNonzeros(G_diag);
1809
1810 hypre_CSRMatrix *Pix_diag = hypre_ParCSRMatrixDiag(Pix);
1811 HYPRE_Int *Pix_diag_I = hypre_CSRMatrixI(Pix_diag);
1812 HYPRE_Int *Pix_diag_J = hypre_CSRMatrixJ(Pix_diag);
1813 HYPRE_Real *Pix_diag_data = hypre_CSRMatrixData(Pix_diag);
1814
1815 hypre_CSRMatrix *Piy_diag = hypre_ParCSRMatrixDiag(Piy);
1816 HYPRE_Int *Piy_diag_I = hypre_CSRMatrixI(Piy_diag);
1817 HYPRE_Int *Piy_diag_J = hypre_CSRMatrixJ(Piy_diag);
1818 HYPRE_Real *Piy_diag_data = hypre_CSRMatrixData(Piy_diag);
1819
1820 hypre_CSRMatrix *Piz_diag = hypre_ParCSRMatrixDiag(Piz);
1821 HYPRE_Int *Piz_diag_I = hypre_CSRMatrixI(Piz_diag);
1822 HYPRE_Int *Piz_diag_J = hypre_CSRMatrixJ(Piz_diag);
1823 HYPRE_Real *Piz_diag_data = hypre_CSRMatrixData(Piz_diag);
1824
1825 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
1826 if (exec == HYPRE_EXEC_DEVICE)
1827 {
1828 HYPRE_THRUST_CALL( copy_n,
1829 thrust::make_zip_iterator(thrust::make_tuple(G_diag_I, G_diag_I, G_diag_I)),
1830 G_diag_nrows + 1,
1831 thrust::make_zip_iterator(thrust::make_tuple(Pix_diag_I, Piy_diag_I, Piz_diag_I)) );
1832
1833 HYPRE_THRUST_CALL( copy_n,
1834 thrust::make_zip_iterator(thrust::make_tuple(G_diag_J, G_diag_J, G_diag_J)),
1835 G_diag_nnz,
1836 thrust::make_zip_iterator(thrust::make_tuple(Pix_diag_J, Piy_diag_J, Piz_diag_J)) );
1837
1838 dim3 bDim = hypre_GetDefaultCUDABlockDimension();
1839 dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_diag_nrows, "warp", bDim);
1840
1841 HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
1842 G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data,
1843 Pix_diag_data, Piy_diag_data, Piz_diag_data );
1844 }
1845 else
1846 #endif
1847 {
1848 for (i = 0; i < G_diag_nrows+1; i++)
1849 {
1850 Pix_diag_I[i] = G_diag_I[i];
1851 Piy_diag_I[i] = G_diag_I[i];
1852 Piz_diag_I[i] = G_diag_I[i];
1853 }
1854
1855 for (i = 0; i < G_diag_nnz; i++)
1856 {
1857 Pix_diag_J[i] = G_diag_J[i];
1858 Piy_diag_J[i] = G_diag_J[i];
1859 Piz_diag_J[i] = G_diag_J[i];
1860 }
1861
1862 for (i = 0; i < G_diag_nrows; i++)
1863 for (j = G_diag_I[i]; j < G_diag_I[i+1]; j++)
1864 {
1865 *Pix_diag_data++ = fabs(G_diag_data[j]) * 0.5 * Gx_data[i];
1866 *Piy_diag_data++ = fabs(G_diag_data[j]) * 0.5 * Gy_data[i];
1867 *Piz_diag_data++ = fabs(G_diag_data[j]) * 0.5 * Gz_data[i];
1868 }
1869 }
1870 }
1871 else if (dim == 2)
1872 {
1873 hypre_CSRMatrix *G_diag = hypre_ParCSRMatrixDiag(G);
1874 HYPRE_Int *G_diag_I = hypre_CSRMatrixI(G_diag);
1875 HYPRE_Int *G_diag_J = hypre_CSRMatrixJ(G_diag);
1876 HYPRE_Real *G_diag_data = hypre_CSRMatrixData(G_diag);
1877
1878 HYPRE_Int G_diag_nrows = hypre_CSRMatrixNumRows(G_diag);
1879 HYPRE_Int G_diag_nnz = hypre_CSRMatrixNumNonzeros(G_diag);
1880
1881 hypre_CSRMatrix *Pix_diag = hypre_ParCSRMatrixDiag(Pix);
1882 HYPRE_Int *Pix_diag_I = hypre_CSRMatrixI(Pix_diag);
1883 HYPRE_Int *Pix_diag_J = hypre_CSRMatrixJ(Pix_diag);
1884 HYPRE_Real *Pix_diag_data = hypre_CSRMatrixData(Pix_diag);
1885
1886 hypre_CSRMatrix *Piy_diag = hypre_ParCSRMatrixDiag(Piy);
1887 HYPRE_Int *Piy_diag_I = hypre_CSRMatrixI(Piy_diag);
1888 HYPRE_Int *Piy_diag_J = hypre_CSRMatrixJ(Piy_diag);
1889 HYPRE_Real *Piy_diag_data = hypre_CSRMatrixData(Piy_diag);
1890
1891 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
1892 if (exec == HYPRE_EXEC_DEVICE)
1893 {
1894 HYPRE_THRUST_CALL( copy_n,
1895 thrust::make_zip_iterator(thrust::make_tuple(G_diag_I, G_diag_I)),
1896 G_diag_nrows + 1,
1897 thrust::make_zip_iterator(thrust::make_tuple(Pix_diag_I, Piy_diag_I)) );
1898
1899 HYPRE_THRUST_CALL( copy_n,
1900 thrust::make_zip_iterator(thrust::make_tuple(G_diag_J, G_diag_J)),
1901 G_diag_nnz,
1902 thrust::make_zip_iterator(thrust::make_tuple(Pix_diag_J, Piy_diag_J)) );
1903
1904 dim3 bDim = hypre_GetDefaultCUDABlockDimension();
1905 dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_diag_nrows, "warp", bDim);
1906
1907 HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
1908 G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, NULL,
1909 Pix_diag_data, Piy_diag_data, NULL );
1910 }
1911 else
1912 #endif
1913 {
1914 for (i = 0; i < G_diag_nrows+1; i++)
1915 {
1916 Pix_diag_I[i] = G_diag_I[i];
1917 Piy_diag_I[i] = G_diag_I[i];
1918 }
1919
1920 for (i = 0; i < G_diag_nnz; i++)
1921 {
1922 Pix_diag_J[i] = G_diag_J[i];
1923 Piy_diag_J[i] = G_diag_J[i];
1924 }
1925
1926 for (i = 0; i < G_diag_nrows; i++)
1927 for (j = G_diag_I[i]; j < G_diag_I[i+1]; j++)
1928 {
1929 *Pix_diag_data++ = fabs(G_diag_data[j]) * 0.5 * Gx_data[i];
1930 *Piy_diag_data++ = fabs(G_diag_data[j]) * 0.5 * Gy_data[i];
1931 }
1932 }
1933 }
1934 else
1935 {
1936 hypre_CSRMatrix *G_diag = hypre_ParCSRMatrixDiag(G);
1937 HYPRE_Int *G_diag_I = hypre_CSRMatrixI(G_diag);
1938 HYPRE_Int *G_diag_J = hypre_CSRMatrixJ(G_diag);
1939 HYPRE_Real *G_diag_data = hypre_CSRMatrixData(G_diag);
1940
1941 HYPRE_Int G_diag_nrows = hypre_CSRMatrixNumRows(G_diag);
1942 HYPRE_Int G_diag_nnz = hypre_CSRMatrixNumNonzeros(G_diag);
1943
1944 hypre_CSRMatrix *Pix_diag = hypre_ParCSRMatrixDiag(Pix);
1945 HYPRE_Int *Pix_diag_I = hypre_CSRMatrixI(Pix_diag);
1946 HYPRE_Int *Pix_diag_J = hypre_CSRMatrixJ(Pix_diag);
1947 HYPRE_Real *Pix_diag_data = hypre_CSRMatrixData(Pix_diag);
1948
1949 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
1950 if (exec == HYPRE_EXEC_DEVICE)
1951 {
1952 HYPRE_THRUST_CALL( copy_n,
1953 G_diag_I,
1954 G_diag_nrows + 1,
1955 Pix_diag_I );
1956
1957 HYPRE_THRUST_CALL( copy_n,
1958 G_diag_J,
1959 G_diag_nnz,
1960 Pix_diag_J );
1961
1962 dim3 bDim = hypre_GetDefaultCUDABlockDimension();
1963 dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_diag_nrows, "warp", bDim);
1964
1965 HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
1966 G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, NULL, NULL,
1967 Pix_diag_data, NULL, NULL );
1968 }
1969 else
1970 #endif
1971 {
1972 for (i = 0; i < G_diag_nrows+1; i++)
1973 {
1974 Pix_diag_I[i] = G_diag_I[i];
1975 }
1976
1977 for (i = 0; i < G_diag_nnz; i++)
1978 {
1979 Pix_diag_J[i] = G_diag_J[i];
1980 }
1981
1982 for (i = 0; i < G_diag_nrows; i++)
1983 for (j = G_diag_I[i]; j < G_diag_I[i+1]; j++)
1984 {
1985 *Pix_diag_data++ = fabs(G_diag_data[j]) * 0.5 * Gx_data[i];
1986 }
1987 }
1988 }
1989
1990
1991 /* Fill-in the off-diagonal part */
1992 if (dim == 3)
1993 {
1994 hypre_CSRMatrix *G_offd = hypre_ParCSRMatrixOffd(G);
1995 HYPRE_Int *G_offd_I = hypre_CSRMatrixI(G_offd);
1996 HYPRE_Int *G_offd_J = hypre_CSRMatrixJ(G_offd);
1997 HYPRE_Real *G_offd_data = hypre_CSRMatrixData(G_offd);
1998
1999 HYPRE_Int G_offd_nrows = hypre_CSRMatrixNumRows(G_offd);
2000 HYPRE_Int G_offd_ncols = hypre_CSRMatrixNumCols(G_offd);
2001 HYPRE_Int G_offd_nnz = hypre_CSRMatrixNumNonzeros(G_offd);
2002
2003 hypre_CSRMatrix *Pix_offd = hypre_ParCSRMatrixOffd(Pix);
2004 HYPRE_Int *Pix_offd_I = hypre_CSRMatrixI(Pix_offd);
2005 HYPRE_Int *Pix_offd_J = hypre_CSRMatrixJ(Pix_offd);
2006 HYPRE_Real *Pix_offd_data = hypre_CSRMatrixData(Pix_offd);
2007
2008 hypre_CSRMatrix *Piy_offd = hypre_ParCSRMatrixOffd(Piy);
2009 HYPRE_Int *Piy_offd_I = hypre_CSRMatrixI(Piy_offd);
2010 HYPRE_Int *Piy_offd_J = hypre_CSRMatrixJ(Piy_offd);
2011 HYPRE_Real *Piy_offd_data = hypre_CSRMatrixData(Piy_offd);
2012
2013 hypre_CSRMatrix *Piz_offd = hypre_ParCSRMatrixOffd(Piz);
2014 HYPRE_Int *Piz_offd_I = hypre_CSRMatrixI(Piz_offd);
2015 HYPRE_Int *Piz_offd_J = hypre_CSRMatrixJ(Piz_offd);
2016 HYPRE_Real *Piz_offd_data = hypre_CSRMatrixData(Piz_offd);
2017
2018 HYPRE_BigInt *G_cmap = hypre_ParCSRMatrixColMapOffd(G);
2019 HYPRE_BigInt *Pix_cmap = hypre_ParCSRMatrixColMapOffd(Pix);
2020 HYPRE_BigInt *Piy_cmap = hypre_ParCSRMatrixColMapOffd(Piy);
2021 HYPRE_BigInt *Piz_cmap = hypre_ParCSRMatrixColMapOffd(Piz);
2022
2023 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
2024 if (exec == HYPRE_EXEC_DEVICE)
2025 {
2026 if (G_offd_ncols)
2027 {
2028 HYPRE_THRUST_CALL( copy_n,
2029 thrust::make_zip_iterator(thrust::make_tuple(G_offd_I, G_offd_I, G_offd_I)),
2030 G_offd_nrows + 1,
2031 thrust::make_zip_iterator(thrust::make_tuple(Pix_offd_I, Piy_offd_I, Piz_offd_I)) );
2032 }
2033
2034 HYPRE_THRUST_CALL( copy_n,
2035 thrust::make_zip_iterator(thrust::make_tuple(G_offd_J, G_offd_J, G_offd_J)),
2036 G_offd_nnz,
2037 thrust::make_zip_iterator(thrust::make_tuple(Pix_offd_J, Piy_offd_J, Piz_offd_J)) );
2038
2039 dim3 bDim = hypre_GetDefaultCUDABlockDimension();
2040 dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_offd_nrows, "warp", bDim);
2041
2042 HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
2043 G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data,
2044 Pix_offd_data, Piy_offd_data, Piz_offd_data );
2045 }
2046 else
2047 #endif
2048 {
2049 if (G_offd_ncols)
2050 for (i = 0; i < G_offd_nrows+1; i++)
2051 {
2052 Pix_offd_I[i] = G_offd_I[i];
2053 Piy_offd_I[i] = G_offd_I[i];
2054 Piz_offd_I[i] = G_offd_I[i];
2055 }
2056
2057 for (i = 0; i < G_offd_nnz; i++)
2058 {
2059 Pix_offd_J[i] = G_offd_J[i];
2060 Piy_offd_J[i] = G_offd_J[i];
2061 Piz_offd_J[i] = G_offd_J[i];
2062 }
2063
2064 for (i = 0; i < G_offd_nrows; i++)
2065 for (j = G_offd_I[i]; j < G_offd_I[i+1]; j++)
2066 {
2067 *Pix_offd_data++ = fabs(G_offd_data[j]) * 0.5 * Gx_data[i];
2068 *Piy_offd_data++ = fabs(G_offd_data[j]) * 0.5 * Gy_data[i];
2069 *Piz_offd_data++ = fabs(G_offd_data[j]) * 0.5 * Gz_data[i];
2070 }
2071 }
2072
2073 for (i = 0; i < G_offd_ncols; i++)
2074 {
2075 Pix_cmap[i] = G_cmap[i];
2076 Piy_cmap[i] = G_cmap[i];
2077 Piz_cmap[i] = G_cmap[i];
2078 }
2079 }
2080 else if (dim == 2)
2081 {
2082 hypre_CSRMatrix *G_offd = hypre_ParCSRMatrixOffd(G);
2083 HYPRE_Int *G_offd_I = hypre_CSRMatrixI(G_offd);
2084 HYPRE_Int *G_offd_J = hypre_CSRMatrixJ(G_offd);
2085 HYPRE_Real *G_offd_data = hypre_CSRMatrixData(G_offd);
2086
2087 HYPRE_Int G_offd_nrows = hypre_CSRMatrixNumRows(G_offd);
2088 HYPRE_Int G_offd_ncols = hypre_CSRMatrixNumCols(G_offd);
2089 HYPRE_Int G_offd_nnz = hypre_CSRMatrixNumNonzeros(G_offd);
2090
2091 hypre_CSRMatrix *Pix_offd = hypre_ParCSRMatrixOffd(Pix);
2092 HYPRE_Int *Pix_offd_I = hypre_CSRMatrixI(Pix_offd);
2093 HYPRE_Int *Pix_offd_J = hypre_CSRMatrixJ(Pix_offd);
2094 HYPRE_Real *Pix_offd_data = hypre_CSRMatrixData(Pix_offd);
2095
2096 hypre_CSRMatrix *Piy_offd = hypre_ParCSRMatrixOffd(Piy);
2097 HYPRE_Int *Piy_offd_I = hypre_CSRMatrixI(Piy_offd);
2098 HYPRE_Int *Piy_offd_J = hypre_CSRMatrixJ(Piy_offd);
2099 HYPRE_Real *Piy_offd_data = hypre_CSRMatrixData(Piy_offd);
2100
2101 HYPRE_BigInt *G_cmap = hypre_ParCSRMatrixColMapOffd(G);
2102 HYPRE_BigInt *Pix_cmap = hypre_ParCSRMatrixColMapOffd(Pix);
2103 HYPRE_BigInt *Piy_cmap = hypre_ParCSRMatrixColMapOffd(Piy);
2104
2105 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
2106 if (exec == HYPRE_EXEC_DEVICE)
2107 {
2108 if (G_offd_ncols)
2109 {
2110 HYPRE_THRUST_CALL( copy_n,
2111 thrust::make_zip_iterator(thrust::make_tuple(G_offd_I, G_offd_I)),
2112 G_offd_nrows + 1,
2113 thrust::make_zip_iterator(thrust::make_tuple(Pix_offd_I, Piy_offd_I)) );
2114 }
2115
2116 HYPRE_THRUST_CALL( copy_n,
2117 thrust::make_zip_iterator(thrust::make_tuple(G_offd_J, G_offd_J)),
2118 G_offd_nnz,
2119 thrust::make_zip_iterator(thrust::make_tuple(Pix_offd_J, Piy_offd_J)) );
2120
2121 dim3 bDim = hypre_GetDefaultCUDABlockDimension();
2122 dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_offd_nrows, "warp", bDim);
2123
2124 HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
2125 G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, NULL,
2126 Pix_offd_data, Piy_offd_data, NULL );
2127 }
2128 else
2129 #endif
2130 {
2131 if (G_offd_ncols)
2132 for (i = 0; i < G_offd_nrows+1; i++)
2133 {
2134 Pix_offd_I[i] = G_offd_I[i];
2135 Piy_offd_I[i] = G_offd_I[i];
2136 }
2137
2138 for (i = 0; i < G_offd_nnz; i++)
2139 {
2140 Pix_offd_J[i] = G_offd_J[i];
2141 Piy_offd_J[i] = G_offd_J[i];
2142 }
2143
2144 for (i = 0; i < G_offd_nrows; i++)
2145 for (j = G_offd_I[i]; j < G_offd_I[i+1]; j++)
2146 {
2147 *Pix_offd_data++ = fabs(G_offd_data[j]) * 0.5 * Gx_data[i];
2148 *Piy_offd_data++ = fabs(G_offd_data[j]) * 0.5 * Gy_data[i];
2149 }
2150 }
2151
2152 for (i = 0; i < G_offd_ncols; i++)
2153 {
2154 Pix_cmap[i] = G_cmap[i];
2155 Piy_cmap[i] = G_cmap[i];
2156 }
2157 }
2158 else
2159 {
2160 hypre_CSRMatrix *G_offd = hypre_ParCSRMatrixOffd(G);
2161 HYPRE_Int *G_offd_I = hypre_CSRMatrixI(G_offd);
2162 HYPRE_Int *G_offd_J = hypre_CSRMatrixJ(G_offd);
2163 HYPRE_Real *G_offd_data = hypre_CSRMatrixData(G_offd);
2164
2165 HYPRE_Int G_offd_nrows = hypre_CSRMatrixNumRows(G_offd);
2166 HYPRE_Int G_offd_ncols = hypre_CSRMatrixNumCols(G_offd);
2167 HYPRE_Int G_offd_nnz = hypre_CSRMatrixNumNonzeros(G_offd);
2168
2169 hypre_CSRMatrix *Pix_offd = hypre_ParCSRMatrixOffd(Pix);
2170 HYPRE_Int *Pix_offd_I = hypre_CSRMatrixI(Pix_offd);
2171 HYPRE_Int *Pix_offd_J = hypre_CSRMatrixJ(Pix_offd);
2172 HYPRE_Real *Pix_offd_data = hypre_CSRMatrixData(Pix_offd);
2173
2174 HYPRE_BigInt *G_cmap = hypre_ParCSRMatrixColMapOffd(G);
2175 HYPRE_BigInt *Pix_cmap = hypre_ParCSRMatrixColMapOffd(Pix);
2176
2177 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
2178 if (exec == HYPRE_EXEC_DEVICE)
2179 {
2180 if (G_offd_ncols)
2181 {
2182 HYPRE_THRUST_CALL( copy_n,
2183 G_offd_I,
2184 G_offd_nrows + 1,
2185 Pix_offd_I );
2186 }
2187
2188 HYPRE_THRUST_CALL( copy_n,
2189 G_offd_J,
2190 G_offd_nnz,
2191 Pix_offd_J );
2192
2193 dim3 bDim = hypre_GetDefaultCUDABlockDimension();
2194 dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_offd_nrows, "warp", bDim);
2195
2196 HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
2197 G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, NULL, NULL,
2198 Pix_offd_data, NULL, NULL );
2199 }
2200 else
2201 #endif
2202 {
2203 if (G_offd_ncols)
2204 for (i = 0; i < G_offd_nrows+1; i++)
2205 {
2206 Pix_offd_I[i] = G_offd_I[i];
2207 }
2208
2209 for (i = 0; i < G_offd_nnz; i++)
2210 {
2211 Pix_offd_J[i] = G_offd_J[i];
2212 }
2213
2214 for (i = 0; i < G_offd_nrows; i++)
2215 for (j = G_offd_I[i]; j < G_offd_I[i+1]; j++)
2216 {
2217 *Pix_offd_data++ = fabs(G_offd_data[j]) * 0.5 * Gx_data[i];
2218 }
2219 }
2220
2221 for (i = 0; i < G_offd_ncols; i++)
2222 {
2223 Pix_cmap[i] = G_cmap[i];
2224 }
2225 }
2226 }
2227
2228 *Pix_ptr = Pix;
2229 if (dim >= 2)
2230 *Piy_ptr = Piy;
2231 if (dim == 3)
2232 *Piz_ptr = Piz;
2233
2234 return hypre_error_flag;
2235 }
2236
2237 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
__global__ void
hypreCUDAKernel_AMSComputeGPi_copy2(HYPRE_Int nrows,
                                    HYPRE_Int dim,
                                    HYPRE_Int *i_in,
                                    HYPRE_Real *data_in,
                                    HYPRE_Real *Gx_data,
                                    HYPRE_Real *Gy_data,
                                    HYPRE_Real *Gz_data,
                                    HYPRE_Real *data_out)
{
   /* Fill the interleaved values of GPi = [G, Pi] from the values of G.
    *
    * One warp processes one CSR row. For each nonzero u = G(i,j) of row i,
    * dim consecutive slots of data_out are written: the G value itself,
    * followed by |u| * 0.5 * G{x,y,z}_data[i] for the dim-1 vector
    * components. Here `dim` already counts the G block, so it is in
    * {2, 3, 4}; only Gdata[0 .. dim-2] are ever dereferenced, which is why
    * Gy_data/Gz_data may be NULL for smaller dimensions.
    *
    * nrows   - number of local CSR rows
    * i_in    - CSR row-pointer array of G (length nrows+1)
    * data_in - CSR values of G
    * data_out- GPi values, dim entries per input nonzero (interleaved)
    */
   const HYPRE_Int i = hypre_cuda_get_grid_warp_id<1,1>();

   if (i >= nrows)
   {
      return;
   }

   const HYPRE_Int lane_id = hypre_cuda_get_lane_id<1>();
   HYPRE_Int j, istart, iend;
   HYPRE_Real t, G[3], *Gdata[3];

   Gdata[0] = Gx_data;
   Gdata[1] = Gy_data;
   Gdata[2] = Gz_data;

   /* lanes 0 and 1 load the row's [begin, end) offsets; the shuffles below
      broadcast them to the whole warp (j is unused in lanes >= 2) */
   if (lane_id < 2)
   {
      j = read_only_load(i_in + i + lane_id);
   }

   istart = __shfl_sync(HYPRE_WARP_FULL_MASK, j, 0);
   iend = __shfl_sync(HYPRE_WARP_FULL_MASK, j, 1);

   /* lane d loads the d-th vector component at row i; the loop broadcasts
      each component so every lane holds G[0 .. dim-2] */
   if (lane_id < dim - 1)
   {
      t = read_only_load(Gdata[lane_id] + i);
   }

   for (HYPRE_Int d = 0; d < dim - 1; d++)
   {
      G[d] = __shfl_sync(HYPRE_WARP_FULL_MASK, t, d);
   }

   /* warp-cooperative sweep over the row's nonzeros */
   for (j = istart + lane_id; j < iend; j += HYPRE_WARP_SIZE)
   {
      const HYPRE_Real u = read_only_load(&data_in[j]);
      const HYPRE_Real v = fabs(u) * 0.5;
      const HYPRE_Int k = j * dim;

      data_out[k] = u;
      for (HYPRE_Int d = 0; d < dim - 1; d++)
      {
         data_out[k+d+1] = v * G[d];
      }
   }
}
2294 #endif
2295
2296 /*--------------------------------------------------------------------------
2297 * hypre_AMSComputeGPi
2298 *
2299 * Construct the matrix [G,Pi] which can be considered an interpolation
2300 * matrix from S_h^4 (4 copies of the scalar linear finite element space)
2301 * to the edge finite elements space.
2302 *--------------------------------------------------------------------------*/
2303
HYPRE_Int hypre_AMSComputeGPi(hypre_ParCSRMatrix *A,        /* edge stiffness matrix (not referenced) */
                              hypre_ParCSRMatrix *G,        /* discrete gradient matrix */
                              hypre_ParVector *Gx,          /* x-coefficients of Pi */
                              hypre_ParVector *Gy,          /* y-coefficients (dim >= 2 only) */
                              hypre_ParVector *Gz,          /* z-coefficients (dim == 3 only) */
                              HYPRE_Int dim,                /* spatial dimension: 1, 2 or 3 */
                              hypre_ParCSRMatrix **GPi_ptr) /* output: owned by the caller */
{
   hypre_ParCSRMatrix *GPi;

   /* Take into account G: from now on dim also counts the G block,
      i.e. dim is in {2, 3, 4} */
   dim++;

   /* Compute GPi = [G, Pi_x, Pi_y, Pi_z]: the dim entries generated by each
      nonzero of G are stored consecutively, G value first */
   {
      HYPRE_Int i, j, d;

      /* NULL-initialize the unused components: they are passed by value to
         the CUDA kernel launches below even when dim < 4, and reading an
         uninitialized local is undefined behavior */
      HYPRE_Real *Gx_data, *Gy_data = NULL, *Gz_data = NULL;

      MPI_Comm comm = hypre_ParCSRMatrixComm(G);
      HYPRE_BigInt global_num_rows = hypre_ParCSRMatrixGlobalNumRows(G);
      HYPRE_BigInt global_num_cols = dim*hypre_ParCSRMatrixGlobalNumCols(G);
      HYPRE_BigInt *row_starts = hypre_ParCSRMatrixRowStarts(G);
      HYPRE_BigInt *col_starts;
      HYPRE_Int col_starts_size;
      HYPRE_Int num_cols_offd = dim*hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(G));
      HYPRE_Int num_nonzeros_diag = dim*hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixDiag(G));
      HYPRE_Int num_nonzeros_offd = dim*hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixOffd(G));
      HYPRE_BigInt *col_starts_G = hypre_ParCSRMatrixColStarts(G);

      /* each column of G expands into dim columns of GPi */
      col_starts_size = 2;
      col_starts = hypre_TAlloc(HYPRE_BigInt, col_starts_size, HYPRE_MEMORY_HOST);
      for (i = 0; i < col_starts_size; i++)
      {
         col_starts[i] = (HYPRE_BigInt) dim * col_starts_G[i];
      }

      GPi = hypre_ParCSRMatrixCreate(comm,
                                     global_num_rows,
                                     global_num_cols,
                                     row_starts,
                                     col_starts,
                                     num_cols_offd,
                                     num_nonzeros_diag,
                                     num_nonzeros_offd);

      hypre_ParCSRMatrixOwnsData(GPi) = 1;
      hypre_ParCSRMatrixInitialize(GPi);

      Gx_data = hypre_VectorData(hypre_ParVectorLocalVector(Gx));
      if (dim >= 3)
      {
         Gy_data = hypre_VectorData(hypre_ParVectorLocalVector(Gy));
      }
      if (dim == 4)
      {
         Gz_data = hypre_VectorData(hypre_ParVectorLocalVector(Gz));
      }

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
      HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy2( hypre_ParCSRMatrixMemoryLocation(G),
                                                         hypre_ParCSRMatrixMemoryLocation(GPi) );
#endif

      /* Fill-in the diagonal part */
      {
         hypre_CSRMatrix *G_diag = hypre_ParCSRMatrixDiag(G);
         HYPRE_Int *G_diag_I = hypre_CSRMatrixI(G_diag);
         HYPRE_Int *G_diag_J = hypre_CSRMatrixJ(G_diag);
         HYPRE_Real *G_diag_data = hypre_CSRMatrixData(G_diag);

         HYPRE_Int G_diag_nrows = hypre_CSRMatrixNumRows(G_diag);
         HYPRE_Int G_diag_nnz = hypre_CSRMatrixNumNonzeros(G_diag);

         hypre_CSRMatrix *GPi_diag = hypre_ParCSRMatrixDiag(GPi);
         HYPRE_Int *GPi_diag_I = hypre_CSRMatrixI(GPi_diag);
         HYPRE_Int *GPi_diag_J = hypre_CSRMatrixJ(GPi_diag);
         HYPRE_Real *GPi_diag_data = hypre_CSRMatrixData(GPi_diag);

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
         if (exec == HYPRE_EXEC_DEVICE)
         {
            /* GPi_diag_I[i] = dim * G_diag_I[i] */
            HYPRE_THRUST_CALL( transform,
                               G_diag_I,
                               G_diag_I + G_diag_nrows + 1,
                               GPi_diag_I,
                               dim * _1 );

            dim3 bDim = hypre_GetDefaultCUDABlockDimension();
            dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_diag_nnz, "thread", bDim);

            /* expand each column index into dim consecutive indices */
            HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
                               G_diag_nnz, dim, G_diag_J, GPi_diag_J );

            gDim = hypre_GetDefaultCUDAGridDimension(G_diag_nrows, "warp", bDim);

            HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim,
                               G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data,
                               GPi_diag_data );
         }
         else
#endif
         {
            for (i = 0; i < G_diag_nrows+1; i++)
            {
               GPi_diag_I[i] = dim * G_diag_I[i];
            }

            for (i = 0; i < G_diag_nnz; i++)
               for (d = 0; d < dim; d++)
               {
                  GPi_diag_J[dim*i+d] = dim*G_diag_J[i]+d;
               }

            /* per nonzero: the G value, then the scaled Pi components */
            for (i = 0; i < G_diag_nrows; i++)
               for (j = G_diag_I[i]; j < G_diag_I[i+1]; j++)
               {
                  *GPi_diag_data++ = G_diag_data[j];
                  *GPi_diag_data++ = fabs(G_diag_data[j]) * 0.5 * Gx_data[i];
                  if (dim >= 3)
                  {
                     *GPi_diag_data++ = fabs(G_diag_data[j]) * 0.5 * Gy_data[i];
                  }
                  if (dim == 4)
                  {
                     *GPi_diag_data++ = fabs(G_diag_data[j]) * 0.5 * Gz_data[i];
                  }
               }
         }
      }

      /* Fill-in the off-diagonal part */
      {
         hypre_CSRMatrix *G_offd = hypre_ParCSRMatrixOffd(G);
         HYPRE_Int *G_offd_I = hypre_CSRMatrixI(G_offd);
         HYPRE_Int *G_offd_J = hypre_CSRMatrixJ(G_offd);
         HYPRE_Real *G_offd_data = hypre_CSRMatrixData(G_offd);

         HYPRE_Int G_offd_nrows = hypre_CSRMatrixNumRows(G_offd);
         HYPRE_Int G_offd_ncols = hypre_CSRMatrixNumCols(G_offd);
         HYPRE_Int G_offd_nnz = hypre_CSRMatrixNumNonzeros(G_offd);

         hypre_CSRMatrix *GPi_offd = hypre_ParCSRMatrixOffd(GPi);
         HYPRE_Int *GPi_offd_I = hypre_CSRMatrixI(GPi_offd);
         HYPRE_Int *GPi_offd_J = hypre_CSRMatrixJ(GPi_offd);
         HYPRE_Real *GPi_offd_data = hypre_CSRMatrixData(GPi_offd);

         HYPRE_BigInt *G_cmap = hypre_ParCSRMatrixColMapOffd(G);
         HYPRE_BigInt *GPi_cmap = hypre_ParCSRMatrixColMapOffd(GPi);

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
         if (exec == HYPRE_EXEC_DEVICE)
         {
            /* the row-pointer copy is skipped for ranks with no offd part */
            if (G_offd_ncols)
            {
               HYPRE_THRUST_CALL( transform,
                                  G_offd_I,
                                  G_offd_I + G_offd_nrows + 1,
                                  GPi_offd_I,
                                  dim * _1 );
            }

            dim3 bDim = hypre_GetDefaultCUDABlockDimension();
            dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_offd_nnz, "thread", bDim);

            HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
                               G_offd_nnz, dim, G_offd_J, GPi_offd_J );

            gDim = hypre_GetDefaultCUDAGridDimension(G_offd_nrows, "warp", bDim);

            HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim,
                               G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data,
                               GPi_offd_data );
         }
         else
#endif
         {
            if (G_offd_ncols)
               for (i = 0; i < G_offd_nrows+1; i++)
               {
                  GPi_offd_I[i] = dim * G_offd_I[i];
               }

            for (i = 0; i < G_offd_nnz; i++)
               for (d = 0; d < dim; d++)
               {
                  GPi_offd_J[dim*i+d] = dim*G_offd_J[i]+d;
               }

            for (i = 0; i < G_offd_nrows; i++)
               for (j = G_offd_I[i]; j < G_offd_I[i+1]; j++)
               {
                  *GPi_offd_data++ = G_offd_data[j];
                  *GPi_offd_data++ = fabs(G_offd_data[j]) * 0.5 * Gx_data[i];
                  if (dim >= 3)
                  {
                     *GPi_offd_data++ = fabs(G_offd_data[j]) * 0.5 * Gy_data[i];
                  }
                  if (dim == 4)
                  {
                     *GPi_offd_data++ = fabs(G_offd_data[j]) * 0.5 * Gz_data[i];
                  }
               }
         }

         /* expand the off-diagonal column map accordingly */
         for (i = 0; i < G_offd_ncols; i++)
            for (d = 0; d < dim; d++)
            {
               GPi_cmap[dim*i+d] = dim*G_cmap[i]+d;
            }
      }

   }

   *GPi_ptr = GPi;

   return hypre_error_flag;
}
2497
2498 /*--------------------------------------------------------------------------
2499 * hypre_AMSSetup
2500 *
2501 * Construct the AMS solver components.
2502 *
2503 * The following functions need to be called before hypre_AMSSetup():
2504 * - hypre_AMSSetDimension() (if solving a 2D problem)
2505 * - hypre_AMSSetDiscreteGradient()
 * - hypre_AMSSetCoordinateVectors() or hypre_AMSSetEdgeConstantVectors()
2507 *--------------------------------------------------------------------------*/
2508 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
__global__ void
hypreCUDAKernel_FixInterNodes( HYPRE_Int nrows,
                               HYPRE_Int *G0t_diag_i,
                               HYPRE_Complex *G0t_diag_data,
                               HYPRE_Int *G0t_offd_i,
                               HYPRE_Complex *G0t_offd_data,
                               HYPRE_Real *interior_nodes_data)
{
   /* Zero out the rows of G0^t (both diag and offd parts) for every node
    * whose interior_nodes marker is not exactly 1.0, i.e. eliminate the
    * non-interior (nonzero-conductivity) nodes. One warp handles one row.
    *
    * nrows               - number of local rows of G0^t
    * G0t_diag_i/offd_i   - CSR row pointers of the diag/offd parts
    * G0t_diag/offd_data  - CSR values to be zeroed
    * interior_nodes_data - per-node marker; 1.0 means "keep this row"
    */
   HYPRE_Int row_i = hypre_cuda_get_grid_warp_id<1,1>();

   if (row_i >= nrows)
   {
      return;
   }

   HYPRE_Int lane = hypre_cuda_get_lane_id<1>();
   HYPRE_Int not1 = 0;

   /* lane 0 tests the marker; the shuffle broadcasts the verdict */
   if (lane == 0)
   {
      not1 = read_only_load(&interior_nodes_data[row_i]) != 1.0;
   }

   not1 = __shfl_sync(HYPRE_WARP_FULL_MASK, not1, 0);

   /* interior node: row is left untouched */
   if (!not1)
   {
      return;
   }

   HYPRE_Int p1, q1, p2 = 0, q2 = 0;
   bool nonempty_offd = G0t_offd_data != NULL;

   /* lanes 0/1 load the [begin, end) offsets; shuffles broadcast them
      (p1/p2 are unused in lanes >= 2 before the broadcast) */
   if (lane < 2)
   {
      p1 = read_only_load(G0t_diag_i + row_i + lane);
      if (nonempty_offd)
      {
         p2 = read_only_load(G0t_offd_i + row_i + lane);
      }
   }

   q1 = __shfl_sync(HYPRE_WARP_FULL_MASK, p1, 1);
   p1 = __shfl_sync(HYPRE_WARP_FULL_MASK, p1, 0);
   if (nonempty_offd)
   {
      q2 = __shfl_sync(HYPRE_WARP_FULL_MASK, p2, 1);
      p2 = __shfl_sync(HYPRE_WARP_FULL_MASK, p2, 0);
   }

   /* warp-cooperative zeroing; the offd loop is empty when p2 == q2 == 0 */
   for (HYPRE_Int j = p1 + lane; j < q1; j += HYPRE_WARP_SIZE)
   {
      G0t_diag_data[j] = 0.0;
   }
   for (HYPRE_Int j = p2 + lane; j < q2; j += HYPRE_WARP_SIZE)
   {
      G0t_offd_data[j] = 0.0;
   }
}
2568
__global__ void
hypreCUDAKernel_AMSSetupScaleGGt( HYPRE_Int Gt_num_rows,
                                  HYPRE_Int *Gt_diag_i,
                                  HYPRE_Int *Gt_diag_j,
                                  HYPRE_Real *Gt_diag_data,
                                  HYPRE_Int *Gt_offd_i,
                                  HYPRE_Real *Gt_offd_data,
                                  HYPRE_Real *Gx_data,
                                  HYPRE_Real *Gy_data,
                                  HYPRE_Real *Gz_data )
{
   /* Scale each row of G^t by a local mesh-size estimate h2: the average of
    * Gx^2 + Gy^2 + Gz^2 over the row's diag-part column indices. Both the
    * diag and offd values of the row are multiplied by h2.
    * One warp handles one row.
    *
    * NOTE(review): Gx/Gy/Gz are all dereferenced unconditionally, so this
    * kernel appears to assume a 3D setup (all three components present) —
    * confirm against the caller.
    */
   HYPRE_Int row_i = hypre_cuda_get_grid_warp_id<1,1>();

   if (row_i >= Gt_num_rows)
   {
      return;
   }

   HYPRE_Int lane = hypre_cuda_get_lane_id<1>();
   HYPRE_Real h2 = 0.0;
   HYPRE_Int ne, p1, q1, p2 = 0, q2 = 0;

   /* lanes 0/1 load the diag [begin, end) offsets; shuffles broadcast them */
   if (lane < 2)
   {
      p1 = read_only_load(Gt_diag_i + row_i + lane);
   }
   q1 = __shfl_sync(HYPRE_WARP_FULL_MASK, p1, 1);
   p1 = __shfl_sync(HYPRE_WARP_FULL_MASK, p1, 0);
   ne = q1 - p1;

   /* empty row: nothing to scale (also avoids division by zero below) */
   if (ne == 0)
   {
      return;
   }

   if (Gt_offd_data != NULL)
   {
      if (lane < 2)
      {
         p2 = read_only_load(Gt_offd_i + row_i + lane);
      }
      q2 = __shfl_sync(HYPRE_WARP_FULL_MASK, p2, 1);
      p2 = __shfl_sync(HYPRE_WARP_FULL_MASK, p2, 0);
   }

   /* accumulate ||G e||^2 over the row's diag entries */
   for (HYPRE_Int j = p1 + lane; j < q1; j += HYPRE_WARP_SIZE)
   {
      const HYPRE_Int k = read_only_load(&Gt_diag_j[j]);
      const HYPRE_Real Gx = read_only_load(&Gx_data[k]);
      const HYPRE_Real Gy = read_only_load(&Gy_data[k]);
      const HYPRE_Real Gz = read_only_load(&Gz_data[k]);

      h2 += Gx*Gx + Gy*Gy + Gz*Gz;
   }

   /* every lane gets the row average */
   h2 = warp_allreduce_sum(h2) / ne;

   for (HYPRE_Int j = p1 + lane; j < q1; j += HYPRE_WARP_SIZE)
   {
      Gt_diag_data[j] *= h2;
   }

   /* empty loop (p2 == q2 == 0) when there is no offd part */
   for (HYPRE_Int j = p2 + lane; j < q2; j += HYPRE_WARP_SIZE)
   {
      Gt_offd_data[j] *= h2;
   }
}
2636 #endif
2637
hypre_AMSSetup(void * solver,hypre_ParCSRMatrix * A,hypre_ParVector * b,hypre_ParVector * x)2638 HYPRE_Int hypre_AMSSetup(void *solver,
2639 hypre_ParCSRMatrix *A,
2640 hypre_ParVector *b,
2641 hypre_ParVector *x)
2642 {
2643 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
2644 HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) );
2645 #endif
2646
2647 hypre_AMSData *ams_data = (hypre_AMSData *) solver;
2648
2649 HYPRE_Int input_info = 0;
2650
2651 ams_data -> A = A;
2652
2653 /* Modifications for problems with zero-conductivity regions */
2654 if (ams_data -> interior_nodes)
2655 {
2656 hypre_ParCSRMatrix *G0t, *Aorig = A;
2657
2658 /* Make sure that multiple Setup()+Solve() give identical results */
2659 ams_data -> solve_counter = 0;
2660
2661 /* Construct the discrete gradient matrix for the zero-conductivity region
2662 by eliminating the zero-conductivity nodes from G^t. The range of G0
2663 represents the kernel of A, i.e. the gradients of nodal basis functions
2664 supported in zero-conductivity regions. */
2665 hypre_ParCSRMatrixTranspose(ams_data -> G, &G0t, 1);
2666
2667 {
2668 HYPRE_Int i, j;
2669 HYPRE_Int nv = hypre_ParCSRMatrixNumCols(ams_data -> G);
2670 hypre_CSRMatrix *G0td = hypre_ParCSRMatrixDiag(G0t);
2671 HYPRE_Int *G0tdI = hypre_CSRMatrixI(G0td);
2672 HYPRE_Real *G0tdA = hypre_CSRMatrixData(G0td);
2673 hypre_CSRMatrix *G0to = hypre_ParCSRMatrixOffd(G0t);
2674 HYPRE_Int *G0toI = hypre_CSRMatrixI(G0to);
2675 HYPRE_Real *G0toA = hypre_CSRMatrixData(G0to);
2676 HYPRE_Real *interior_nodes_data=hypre_VectorData(
2677 hypre_ParVectorLocalVector((hypre_ParVector*) ams_data -> interior_nodes));
2678
2679 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
2680 if (exec == HYPRE_EXEC_DEVICE)
2681 {
2682 dim3 bDim = hypre_GetDefaultCUDABlockDimension();
2683 dim3 gDim = hypre_GetDefaultCUDAGridDimension(nv, "warp", bDim);
2684 HYPRE_CUDA_LAUNCH( hypreCUDAKernel_FixInterNodes, gDim, bDim,
2685 nv, G0tdI, G0tdA, G0toI, G0toA, interior_nodes_data );
2686 }
2687 else
2688 #endif
2689 {
2690 for (i = 0; i < nv; i++)
2691 {
2692 if (interior_nodes_data[i] != 1)
2693 {
2694 for (j = G0tdI[i]; j < G0tdI[i+1]; j++)
2695 G0tdA[j] = 0.0;
2696 if (G0toI)
2697 for (j = G0toI[i]; j < G0toI[i+1]; j++)
2698 G0toA[j] = 0.0;
2699 }
2700 }
2701 }
2702 }
2703 hypre_ParCSRMatrixTranspose(G0t, & ams_data -> G0, 1);
2704
2705 /* Construct the subspace matrix A_G0 = G0^T G0 */
2706 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
2707 if (exec == HYPRE_EXEC_DEVICE)
2708 {
2709 ams_data -> A_G0 = hypre_ParCSRMatMat(G0t, ams_data -> G0);
2710 }
2711 else
2712 #endif
2713 {
2714 ams_data -> A_G0 = hypre_ParMatmul(G0t, ams_data -> G0);
2715 }
2716 hypre_ParCSRMatrixFixZeroRows(ams_data -> A_G0);
2717
2718 /* Create AMG solver for A_G0 */
2719 HYPRE_BoomerAMGCreate(&ams_data -> B_G0);
2720 HYPRE_BoomerAMGSetCoarsenType(ams_data -> B_G0, ams_data -> B_G_coarsen_type);
2721 HYPRE_BoomerAMGSetAggNumLevels(ams_data -> B_G0, ams_data -> B_G_agg_levels);
2722 HYPRE_BoomerAMGSetRelaxType(ams_data -> B_G0, ams_data -> B_G_relax_type);
2723 HYPRE_BoomerAMGSetNumSweeps(ams_data -> B_G0, 1);
2724 HYPRE_BoomerAMGSetMaxLevels(ams_data -> B_G0, 25);
2725 HYPRE_BoomerAMGSetTol(ams_data -> B_G0, 0.0);
2726 HYPRE_BoomerAMGSetMaxIter(ams_data -> B_G0, 3); /* use just a few V-cycles */
2727 HYPRE_BoomerAMGSetStrongThreshold(ams_data -> B_G0, ams_data -> B_G_theta);
2728 HYPRE_BoomerAMGSetInterpType(ams_data -> B_G0, ams_data -> B_G_interp_type);
2729 HYPRE_BoomerAMGSetPMaxElmts(ams_data -> B_G0, ams_data -> B_G_Pmax);
2730 HYPRE_BoomerAMGSetMinCoarseSize(ams_data -> B_G0, 2); /* don't coarsen to 0 */
2731 /* Generally, don't use exact solve on the coarsest level (matrix may be singular) */
2732 HYPRE_BoomerAMGSetCycleRelaxType(ams_data -> B_G0, ams_data -> B_G_coarse_relax_type, 3);
2733 HYPRE_BoomerAMGSetup(ams_data -> B_G0,
2734 (HYPRE_ParCSRMatrix)ams_data -> A_G0,
2735 0, 0);
2736
2737 /* Construct the preconditioner for ams_data->A = A + G0 G0^T.
2738 NOTE: this can be optimized significantly by taking into account that
2739 the sparsity pattern of A is subset of the sparsity pattern of G0 G0^T */
2740 {
2741 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
2742 hypre_ParCSRMatrix *A;
2743 if (exec == HYPRE_EXEC_DEVICE)
2744 {
2745 A = hypre_ParCSRMatMat(ams_data -> G0, G0t);
2746 }
2747 else
2748 #endif
2749 {
2750 A = hypre_ParMatmul(ams_data -> G0, G0t);
2751 }
2752 hypre_ParCSRMatrix *B = Aorig;
2753 hypre_ParCSRMatrix **C_ptr = &ams_data -> A;
2754 hypre_ParCSRMatrix *C;
2755 HYPRE_Real factor, lfactor;
2756 /* scale (penalize) G0 G0^T before adding it to the matrix */
2757 {
2758 HYPRE_Int i;
2759 HYPRE_Int B_num_rows = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(B));
2760 HYPRE_Real *B_diag_data = hypre_CSRMatrixData(hypre_ParCSRMatrixDiag(B));
2761 HYPRE_Real *B_offd_data = hypre_CSRMatrixData(hypre_ParCSRMatrixOffd(B));
2762 HYPRE_Int *B_diag_i = hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(B));
2763 HYPRE_Int *B_offd_i = hypre_CSRMatrixI(hypre_ParCSRMatrixOffd(B));
2764 lfactor = -1;
2765 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
2766 if (exec == HYPRE_EXEC_DEVICE)
2767 {
2768 HYPRE_Int nnz_diag = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixDiag(B));
2769 HYPRE_Int nnz_offd = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixOffd(B));
2770 #if defined(HYPRE_DEBUG)
2771 HYPRE_Int nnz;
2772 hypre_TMemcpy(&nnz, &B_diag_i[B_num_rows], HYPRE_Int, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);
2773 hypre_assert(nnz == nnz_diag);
2774 hypre_TMemcpy(&nnz, &B_offd_i[B_num_rows], HYPRE_Int, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);
2775 hypre_assert(nnz == nnz_offd);
2776 #endif
2777 if (nnz_diag)
2778 {
2779 lfactor = HYPRE_THRUST_CALL( reduce,
2780 thrust::make_transform_iterator(B_diag_data, absolute_value<HYPRE_Real>()),
2781 thrust::make_transform_iterator(B_diag_data + nnz_diag, absolute_value<HYPRE_Real>()),
2782 -1.0,
2783 thrust::maximum<HYPRE_Real>() );
2784 }
2785
2786 if (nnz_offd)
2787 {
2788 lfactor = HYPRE_THRUST_CALL( reduce,
2789 thrust::make_transform_iterator(B_offd_data, absolute_value<HYPRE_Real>()),
2790 thrust::make_transform_iterator(B_offd_data + nnz_offd, absolute_value<HYPRE_Real>()),
2791 lfactor,
2792 thrust::maximum<HYPRE_Real>() );
2793
2794 }
2795 }
2796 else
2797 #endif
2798 {
2799 for (i = 0; i < B_diag_i[B_num_rows]; i++)
2800 if (fabs(B_diag_data[i]) > lfactor)
2801 lfactor = fabs(B_diag_data[i]);
2802 for (i = 0; i < B_offd_i[B_num_rows]; i++)
2803 if (fabs(B_offd_data[i]) > lfactor)
2804 lfactor = fabs(B_offd_data[i]);
2805 }
2806
2807 lfactor *= 1e-10; /* scaling factor: max|A_ij|*1e-10 */
2808 hypre_MPI_Allreduce(&lfactor, &factor, 1, HYPRE_MPI_REAL, hypre_MPI_MAX, hypre_ParCSRMatrixComm(A));
2809 }
2810
2811 hypre_ParCSRMatrixAdd(factor, A, 1.0, B, &C);
2812
2813 /*hypre_CSRMatrix *A_local, *B_local, *C_local, *C_tmp;
2814
2815 MPI_Comm comm = hypre_ParCSRMatrixComm(A);
2816 HYPRE_BigInt global_num_rows = hypre_ParCSRMatrixGlobalNumRows(A);
2817 HYPRE_BigInt global_num_cols = hypre_ParCSRMatrixGlobalNumCols(A);
2818 HYPRE_BigInt *row_starts = hypre_ParCSRMatrixRowStarts(A);
2819 HYPRE_BigInt *col_starts = hypre_ParCSRMatrixColStarts(A);
2820 HYPRE_Int A_num_cols_offd = hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(A));
2821 HYPRE_Int A_num_nonzeros_diag = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixDiag(A));
2822 HYPRE_Int A_num_nonzeros_offd = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixOffd(A));
2823 HYPRE_Int B_num_cols_offd = hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(B));
2824 HYPRE_Int B_num_nonzeros_diag = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixDiag(B));
2825 HYPRE_Int B_num_nonzeros_offd = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixOffd(B));
2826
2827 A_local = hypre_MergeDiagAndOffd(A);
2828 B_local = hypre_MergeDiagAndOffd(B);*/
2829 /* scale (penalize) G0 G0^T before adding it to the matrix */
2830 /*{
2831 HYPRE_Int i, nnz = hypre_CSRMatrixNumNonzeros(A_local);
2832 HYPRE_Real *data = hypre_CSRMatrixData(A_local);
2833 HYPRE_Real *dataB = hypre_CSRMatrixData(B_local);
2834 HYPRE_Int nnzB = hypre_CSRMatrixNumNonzeros(B_local);
2835 HYPRE_Real factor, lfactor;
2836 lfactor = -1;
2837 for (i = 0; i < nnzB; i++)
2838 if (fabs(dataB[i]) > lfactor)
2839 lfactor = fabs(dataB[i]);
2840 lfactor *= 1e-10;
2841 hypre_MPI_Allreduce(&lfactor, &factor, 1, HYPRE_MPI_REAL, hypre_MPI_MAX,
2842 hypre_ParCSRMatrixComm(A));
2843 for (i = 0; i < nnz; i++)
2844 data[i] *= factor;
2845 }
2846 C_tmp = hypre_CSRMatrixBigAdd(A_local, B_local);
2847 C_local = hypre_CSRMatrixBigDeleteZeros(C_tmp,0.0);
2848 if (C_local)
2849 hypre_CSRMatrixDestroy(C_tmp);
2850 else
2851 C_local = C_tmp;
2852
2853 C = hypre_ParCSRMatrixCreate (comm,
2854 global_num_rows,
2855 global_num_cols,
2856 row_starts,
2857 col_starts,
2858 A_num_cols_offd + B_num_cols_offd,
2859 A_num_nonzeros_diag + B_num_nonzeros_diag,
2860 A_num_nonzeros_offd + B_num_nonzeros_offd);
2861 GenerateDiagAndOffd(C_local, C,
2862 hypre_ParCSRMatrixFirstColDiag(A),
2863 hypre_ParCSRMatrixLastColDiag(A));
2864
2865 hypre_CSRMatrixDestroy(A_local);
2866 hypre_CSRMatrixDestroy(B_local);
2867 hypre_CSRMatrixDestroy(C_local);
2868 */
2869
2870 hypre_ParCSRMatrixDestroy(A);
2871
2872 *C_ptr = C;
2873 }
2874
2875 hypre_ParCSRMatrixDestroy(G0t);
2876 }
2877
2878 /* Make sure that the first entry in each row is the diagonal one. */
2879 /* hypre_CSRMatrixReorder(hypre_ParCSRMatrixDiag(ams_data -> A)); */
2880
2881 /* Compute the l1 norm of the rows of A */
2882 if (ams_data -> A_relax_type >= 1 && ams_data -> A_relax_type <= 4)
2883 {
2884 HYPRE_Real *l1_norm_data = NULL;
2885
2886 hypre_ParCSRComputeL1Norms(ams_data -> A, ams_data -> A_relax_type, NULL, &l1_norm_data);
2887
2888 ams_data -> A_l1_norms = hypre_SeqVectorCreate(hypre_ParCSRMatrixNumRows(ams_data -> A));
2889 hypre_VectorData(ams_data -> A_l1_norms) = l1_norm_data;
2890 hypre_SeqVectorInitialize_v2(ams_data -> A_l1_norms, hypre_ParCSRMatrixMemoryLocation(ams_data -> A));
2891 }
2892
2893 /* Chebyshev? */
2894 if (ams_data -> A_relax_type == 16)
2895 {
2896 hypre_ParCSRMaxEigEstimateCG(ams_data->A, 1, 10,
2897 &ams_data->A_max_eig_est,
2898 &ams_data->A_min_eig_est);
2899 }
2900
2901 /* If not given, compute Gx, Gy and Gz */
2902 {
2903 if (ams_data -> x != NULL &&
2904 (ams_data -> dim == 1 || ams_data -> y != NULL) &&
2905 (ams_data -> dim <= 2 || ams_data -> z != NULL))
2906 input_info = 1;
2907
2908 if (ams_data -> Gx != NULL &&
2909 (ams_data -> dim == 1 || ams_data -> Gy != NULL) &&
2910 (ams_data -> dim <= 2 || ams_data -> Gz != NULL))
2911 input_info = 2;
2912
2913 if (input_info == 1)
2914 {
2915 ams_data -> Gx = hypre_ParVectorInRangeOf(ams_data -> G);
2916 hypre_ParCSRMatrixMatvec (1.0, ams_data -> G, ams_data -> x, 0.0, ams_data -> Gx);
2917 if (ams_data -> dim >= 2)
2918 {
2919 ams_data -> Gy = hypre_ParVectorInRangeOf(ams_data -> G);
2920 hypre_ParCSRMatrixMatvec (1.0, ams_data -> G, ams_data -> y, 0.0, ams_data -> Gy);
2921 }
2922 if (ams_data -> dim == 3)
2923 {
2924 ams_data -> Gz = hypre_ParVectorInRangeOf(ams_data -> G);
2925 hypre_ParCSRMatrixMatvec (1.0, ams_data -> G, ams_data -> z, 0.0, ams_data -> Gz);
2926 }
2927 }
2928 }
2929
2930 if (ams_data -> Pi == NULL && ams_data -> Pix == NULL)
2931 {
2932 if (ams_data -> cycle_type == 20)
2933 /* Construct the combined interpolation matrix [G,Pi] */
2934 hypre_AMSComputeGPi(ams_data -> A,
2935 ams_data -> G,
2936 ams_data -> Gx,
2937 ams_data -> Gy,
2938 ams_data -> Gz,
2939 ams_data -> dim,
2940 &ams_data -> Pi);
2941 else if (ams_data -> cycle_type > 10)
2942 /* Construct Pi{x,y,z} instead of Pi = [Pix,Piy,Piz] */
2943 hypre_AMSComputePixyz(ams_data -> A,
2944 ams_data -> G,
2945 ams_data -> Gx,
2946 ams_data -> Gy,
2947 ams_data -> Gz,
2948 ams_data -> dim,
2949 &ams_data -> Pix,
2950 &ams_data -> Piy,
2951 &ams_data -> Piz);
2952 else
2953 /* Construct the Pi interpolation matrix */
2954 hypre_AMSComputePi(ams_data -> A,
2955 ams_data -> G,
2956 ams_data -> Gx,
2957 ams_data -> Gy,
2958 ams_data -> Gz,
2959 ams_data -> dim,
2960 &ams_data -> Pi);
2961 }
2962
2963 /* Keep Gx, Gy and Gz only if use the method with discrete divergence
2964 stabilization (where we use them to compute the local mesh size). */
2965 if (input_info == 1 && ams_data -> cycle_type != 9)
2966 {
2967 hypre_ParVectorDestroy(ams_data -> Gx);
2968 if (ams_data -> dim >= 2)
2969 hypre_ParVectorDestroy(ams_data -> Gy);
2970 if (ams_data -> dim == 3)
2971 hypre_ParVectorDestroy(ams_data -> Gz);
2972 }
2973
2974 /* Create the AMG solver on the range of G^T */
2975 if (!ams_data -> beta_is_zero && ams_data -> cycle_type != 20)
2976 {
2977 HYPRE_BoomerAMGCreate(&ams_data -> B_G);
2978 HYPRE_BoomerAMGSetCoarsenType(ams_data -> B_G, ams_data -> B_G_coarsen_type);
2979 HYPRE_BoomerAMGSetAggNumLevels(ams_data -> B_G, ams_data -> B_G_agg_levels);
2980 HYPRE_BoomerAMGSetRelaxType(ams_data -> B_G, ams_data -> B_G_relax_type);
2981 HYPRE_BoomerAMGSetNumSweeps(ams_data -> B_G, 1);
2982 HYPRE_BoomerAMGSetMaxLevels(ams_data -> B_G, 25);
2983 HYPRE_BoomerAMGSetTol(ams_data -> B_G, 0.0);
2984 HYPRE_BoomerAMGSetMaxIter(ams_data -> B_G, 1);
2985 HYPRE_BoomerAMGSetStrongThreshold(ams_data -> B_G, ams_data -> B_G_theta);
2986 HYPRE_BoomerAMGSetInterpType(ams_data -> B_G, ams_data -> B_G_interp_type);
2987 HYPRE_BoomerAMGSetPMaxElmts(ams_data -> B_G, ams_data -> B_G_Pmax);
2988 HYPRE_BoomerAMGSetMinCoarseSize(ams_data -> B_G, 2); /* don't coarsen to 0 */
2989
2990 /* Generally, don't use exact solve on the coarsest level (matrix may be singular) */
2991 HYPRE_BoomerAMGSetCycleRelaxType(ams_data -> B_G, ams_data -> B_G_coarse_relax_type, 3);
2992
2993 if (ams_data -> cycle_type == 0)
2994 HYPRE_BoomerAMGSetMaxLevels(ams_data -> B_G, 2);
2995
2996 /* If not given, construct the coarse space matrix by RAP */
2997 if (!ams_data -> A_G)
2998 {
2999 if (!hypre_ParCSRMatrixCommPkg(ams_data -> G))
3000 {
3001 hypre_MatvecCommPkgCreate(ams_data -> G);
3002 }
3003
3004 if (!hypre_ParCSRMatrixCommPkg(ams_data -> A))
3005 {
3006 hypre_MatvecCommPkgCreate(ams_data -> A);
3007 }
3008
3009 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
3010 if (exec == HYPRE_EXEC_DEVICE)
3011 {
3012 ams_data -> A_G = hypre_ParCSRMatrixRAPKT(ams_data -> G,
3013 ams_data -> A,
3014 ams_data -> G, 1);
3015 }
3016 else
3017 #endif
3018 {
3019 hypre_BoomerAMGBuildCoarseOperator(ams_data -> G,
3020 ams_data -> A,
3021 ams_data -> G,
3022 &ams_data -> A_G);
3023 }
3024
3025 /* Make sure that A_G has no zero rows (this can happen
3026 if beta is zero in part of the domain). */
3027 hypre_ParCSRMatrixFixZeroRows(ams_data -> A_G);
3028 ams_data -> owns_A_G = 1;
3029 }
3030
3031 HYPRE_BoomerAMGSetup(ams_data -> B_G,
3032 (HYPRE_ParCSRMatrix)ams_data -> A_G,
3033 NULL, NULL);
3034 }
3035
3036 if (ams_data -> cycle_type > 10 && ams_data -> cycle_type != 20)
3037 /* Create the AMG solvers on the range of Pi{x,y,z}^T */
3038 {
3039 HYPRE_BoomerAMGCreate(&ams_data -> B_Pix);
3040 HYPRE_BoomerAMGSetCoarsenType(ams_data -> B_Pix, ams_data -> B_Pi_coarsen_type);
3041 HYPRE_BoomerAMGSetAggNumLevels(ams_data -> B_Pix, ams_data -> B_Pi_agg_levels);
3042 HYPRE_BoomerAMGSetRelaxType(ams_data -> B_Pix, ams_data -> B_Pi_relax_type);
3043 HYPRE_BoomerAMGSetNumSweeps(ams_data -> B_Pix, 1);
3044 HYPRE_BoomerAMGSetMaxLevels(ams_data -> B_Pix, 25);
3045 HYPRE_BoomerAMGSetTol(ams_data -> B_Pix, 0.0);
3046 HYPRE_BoomerAMGSetMaxIter(ams_data -> B_Pix, 1);
3047 HYPRE_BoomerAMGSetStrongThreshold(ams_data -> B_Pix, ams_data -> B_Pi_theta);
3048 HYPRE_BoomerAMGSetInterpType(ams_data -> B_Pix, ams_data -> B_Pi_interp_type);
3049 HYPRE_BoomerAMGSetPMaxElmts(ams_data -> B_Pix, ams_data -> B_Pi_Pmax);
3050 HYPRE_BoomerAMGSetMinCoarseSize(ams_data -> B_Pix, 2);
3051
3052 HYPRE_BoomerAMGCreate(&ams_data -> B_Piy);
3053 HYPRE_BoomerAMGSetCoarsenType(ams_data -> B_Piy, ams_data -> B_Pi_coarsen_type);
3054 HYPRE_BoomerAMGSetAggNumLevels(ams_data -> B_Piy, ams_data -> B_Pi_agg_levels);
3055 HYPRE_BoomerAMGSetRelaxType(ams_data -> B_Piy, ams_data -> B_Pi_relax_type);
3056 HYPRE_BoomerAMGSetNumSweeps(ams_data -> B_Piy, 1);
3057 HYPRE_BoomerAMGSetMaxLevels(ams_data -> B_Piy, 25);
3058 HYPRE_BoomerAMGSetTol(ams_data -> B_Piy, 0.0);
3059 HYPRE_BoomerAMGSetMaxIter(ams_data -> B_Piy, 1);
3060 HYPRE_BoomerAMGSetStrongThreshold(ams_data -> B_Piy, ams_data -> B_Pi_theta);
3061 HYPRE_BoomerAMGSetInterpType(ams_data -> B_Piy, ams_data -> B_Pi_interp_type);
3062 HYPRE_BoomerAMGSetPMaxElmts(ams_data -> B_Piy, ams_data -> B_Pi_Pmax);
3063 HYPRE_BoomerAMGSetMinCoarseSize(ams_data -> B_Piy, 2);
3064
3065 HYPRE_BoomerAMGCreate(&ams_data -> B_Piz);
3066 HYPRE_BoomerAMGSetCoarsenType(ams_data -> B_Piz, ams_data -> B_Pi_coarsen_type);
3067 HYPRE_BoomerAMGSetAggNumLevels(ams_data -> B_Piz, ams_data -> B_Pi_agg_levels);
3068 HYPRE_BoomerAMGSetRelaxType(ams_data -> B_Piz, ams_data -> B_Pi_relax_type);
3069 HYPRE_BoomerAMGSetNumSweeps(ams_data -> B_Piz, 1);
3070 HYPRE_BoomerAMGSetMaxLevels(ams_data -> B_Piz, 25);
3071 HYPRE_BoomerAMGSetTol(ams_data -> B_Piz, 0.0);
3072 HYPRE_BoomerAMGSetMaxIter(ams_data -> B_Piz, 1);
3073 HYPRE_BoomerAMGSetStrongThreshold(ams_data -> B_Piz, ams_data -> B_Pi_theta);
3074 HYPRE_BoomerAMGSetInterpType(ams_data -> B_Piz, ams_data -> B_Pi_interp_type);
3075 HYPRE_BoomerAMGSetPMaxElmts(ams_data -> B_Piz, ams_data -> B_Pi_Pmax);
3076 HYPRE_BoomerAMGSetMinCoarseSize(ams_data -> B_Piz, 2);
3077
3078 /* Generally, don't use exact solve on the coarsest level (matrices may be singular) */
3079 HYPRE_BoomerAMGSetCycleRelaxType(ams_data -> B_Pix, ams_data -> B_Pi_coarse_relax_type, 3);
3080 HYPRE_BoomerAMGSetCycleRelaxType(ams_data -> B_Piy, ams_data -> B_Pi_coarse_relax_type, 3);
3081 HYPRE_BoomerAMGSetCycleRelaxType(ams_data -> B_Piz, ams_data -> B_Pi_coarse_relax_type, 3);
3082
3083 if (ams_data -> cycle_type == 0)
3084 {
3085 HYPRE_BoomerAMGSetMaxLevels(ams_data -> B_Pix, 2);
3086 HYPRE_BoomerAMGSetMaxLevels(ams_data -> B_Piy, 2);
3087 HYPRE_BoomerAMGSetMaxLevels(ams_data -> B_Piz, 2);
3088 }
3089
3090 /* Construct the coarse space matrices by RAP */
3091 if (!hypre_ParCSRMatrixCommPkg(ams_data -> Pix))
3092 {
3093 hypre_MatvecCommPkgCreate(ams_data -> Pix);
3094 }
3095
3096 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
3097 if (exec == HYPRE_EXEC_DEVICE)
3098 {
3099 ams_data -> A_Pix = hypre_ParCSRMatrixRAPKT(ams_data -> Pix, ams_data -> A, ams_data -> Pix, 1);
3100 }
3101 else
3102 #endif
3103 {
3104 hypre_BoomerAMGBuildCoarseOperator(ams_data -> Pix,
3105 ams_data -> A,
3106 ams_data -> Pix,
3107 &ams_data -> A_Pix);
3108 }
3109
3110 /* Make sure that A_Pix has no zero rows (this can happen
3111 for some kinds of boundary conditions with contact). */
3112 hypre_ParCSRMatrixFixZeroRows(ams_data -> A_Pix);
3113
3114 HYPRE_BoomerAMGSetup(ams_data -> B_Pix,
3115 (HYPRE_ParCSRMatrix)ams_data -> A_Pix,
3116 NULL, NULL);
3117
3118 if (ams_data -> Piy)
3119 {
3120 if (!hypre_ParCSRMatrixCommPkg(ams_data -> Piy))
3121 {
3122 hypre_MatvecCommPkgCreate(ams_data -> Piy);
3123 }
3124
3125 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
3126 if (exec == HYPRE_EXEC_DEVICE)
3127 {
3128 ams_data -> A_Piy = hypre_ParCSRMatrixRAPKT(ams_data -> Piy,
3129 ams_data -> A,
3130 ams_data -> Piy, 1);
3131 }
3132 else
3133 #endif
3134 {
3135 hypre_BoomerAMGBuildCoarseOperator(ams_data -> Piy,
3136 ams_data -> A,
3137 ams_data -> Piy,
3138 &ams_data -> A_Piy);
3139 }
3140
3141 /* Make sure that A_Piy has no zero rows (this can happen
3142 for some kinds of boundary conditions with contact). */
3143 hypre_ParCSRMatrixFixZeroRows(ams_data -> A_Piy);
3144
3145 HYPRE_BoomerAMGSetup(ams_data -> B_Piy,
3146 (HYPRE_ParCSRMatrix)ams_data -> A_Piy,
3147 NULL, NULL);
3148 }
3149
3150 if (ams_data -> Piz)
3151 {
3152 if (!hypre_ParCSRMatrixCommPkg(ams_data -> Piz))
3153 {
3154 hypre_MatvecCommPkgCreate(ams_data -> Piz);
3155 }
3156
3157 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
3158 if (exec == HYPRE_EXEC_DEVICE)
3159 {
3160 ams_data -> A_Piz = hypre_ParCSRMatrixRAPKT(ams_data -> Piz,
3161 ams_data -> A,
3162 ams_data -> Piz, 1);
3163 }
3164 else
3165 #endif
3166 {
3167 hypre_BoomerAMGBuildCoarseOperator(ams_data -> Piz,
3168 ams_data -> A,
3169 ams_data -> Piz,
3170 &ams_data -> A_Piz);
3171 }
3172
3173 /* Make sure that A_Piz has no zero rows (this can happen
3174 for some kinds of boundary conditions with contact). */
3175 hypre_ParCSRMatrixFixZeroRows(ams_data -> A_Piz);
3176
3177 HYPRE_BoomerAMGSetup(ams_data -> B_Piz,
3178 (HYPRE_ParCSRMatrix)ams_data -> A_Piz,
3179 NULL, NULL);
3180 }
3181 }
3182 else
3183 /* Create the AMG solver on the range of Pi^T */
3184 {
3185 HYPRE_BoomerAMGCreate(&ams_data -> B_Pi);
3186 HYPRE_BoomerAMGSetCoarsenType(ams_data -> B_Pi, ams_data -> B_Pi_coarsen_type);
3187 HYPRE_BoomerAMGSetAggNumLevels(ams_data -> B_Pi, ams_data -> B_Pi_agg_levels);
3188 HYPRE_BoomerAMGSetRelaxType(ams_data -> B_Pi, ams_data -> B_Pi_relax_type);
3189 HYPRE_BoomerAMGSetNumSweeps(ams_data -> B_Pi, 1);
3190 HYPRE_BoomerAMGSetMaxLevels(ams_data -> B_Pi, 25);
3191 HYPRE_BoomerAMGSetTol(ams_data -> B_Pi, 0.0);
3192 HYPRE_BoomerAMGSetMaxIter(ams_data -> B_Pi, 1);
3193 HYPRE_BoomerAMGSetStrongThreshold(ams_data -> B_Pi, ams_data -> B_Pi_theta);
3194 HYPRE_BoomerAMGSetInterpType(ams_data -> B_Pi, ams_data -> B_Pi_interp_type);
3195 HYPRE_BoomerAMGSetPMaxElmts(ams_data -> B_Pi, ams_data -> B_Pi_Pmax);
3196 HYPRE_BoomerAMGSetMinCoarseSize(ams_data -> B_Pi, 2); /* don't coarsen to 0 */
3197
3198 /* Generally, don't use exact solve on the coarsest level (matrix may be singular) */
3199 HYPRE_BoomerAMGSetCycleRelaxType(ams_data -> B_Pi, ams_data -> B_Pi_coarse_relax_type, 3);
3200
3201 if (ams_data -> cycle_type == 0)
3202 HYPRE_BoomerAMGSetMaxLevels(ams_data -> B_Pi, 2);
3203
3204 /* If not given, construct the coarse space matrix by RAP and
3205 notify BoomerAMG that this is a dim x dim block system. */
3206 if (!ams_data -> A_Pi)
3207 {
3208 if (!hypre_ParCSRMatrixCommPkg(ams_data -> Pi))
3209 {
3210 hypre_MatvecCommPkgCreate(ams_data -> Pi);
3211 }
3212
3213 if (!hypre_ParCSRMatrixCommPkg(ams_data -> A))
3214 {
3215 hypre_MatvecCommPkgCreate(ams_data -> A);
3216 }
3217
3218 if (ams_data -> cycle_type == 9)
3219 {
3220 /* Add a discrete divergence term to A before computing Pi^t A Pi */
3221 {
3222 hypre_ParCSRMatrix *Gt, *GGt, *ApGGt;
3223 hypre_ParCSRMatrixTranspose(ams_data -> G, &Gt, 1);
3224
3225 /* scale GGt by h^2 */
3226 {
3227 HYPRE_Real h2;
3228 HYPRE_Int i, j, k, ne;
3229
3230 hypre_CSRMatrix *Gt_diag = hypre_ParCSRMatrixDiag(Gt);
3231 HYPRE_Int Gt_num_rows = hypre_CSRMatrixNumRows(Gt_diag);
3232 HYPRE_Int *Gt_diag_I = hypre_CSRMatrixI(Gt_diag);
3233 HYPRE_Int *Gt_diag_J = hypre_CSRMatrixJ(Gt_diag);
3234 HYPRE_Real *Gt_diag_data = hypre_CSRMatrixData(Gt_diag);
3235
3236 hypre_CSRMatrix *Gt_offd = hypre_ParCSRMatrixOffd(Gt);
3237 HYPRE_Int *Gt_offd_I = hypre_CSRMatrixI(Gt_offd);
3238 HYPRE_Real *Gt_offd_data = hypre_CSRMatrixData(Gt_offd);
3239
3240 HYPRE_Real *Gx_data = hypre_VectorData(hypre_ParVectorLocalVector(ams_data -> Gx));
3241 HYPRE_Real *Gy_data = hypre_VectorData(hypre_ParVectorLocalVector(ams_data -> Gy));
3242 HYPRE_Real *Gz_data = hypre_VectorData(hypre_ParVectorLocalVector(ams_data -> Gz));
3243
3244 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
3245 if (exec == HYPRE_EXEC_DEVICE)
3246 {
3247 dim3 bDim = hypre_GetDefaultCUDABlockDimension();
3248 dim3 gDim = hypre_GetDefaultCUDAGridDimension(Gt_num_rows, "warp", bDim);
3249 HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSSetupScaleGGt, gDim, bDim,
3250 Gt_num_rows, Gt_diag_I, Gt_diag_J, Gt_diag_data, Gt_offd_I, Gt_offd_data,
3251 Gx_data, Gy_data, Gz_data );
3252 }
3253 else
3254 #endif
3255 {
3256 for (i = 0; i < Gt_num_rows; i++)
3257 {
3258 /* determine the characteristic mesh size for vertex i */
3259 h2 = 0.0;
3260 ne = 0;
3261 for (j = Gt_diag_I[i]; j < Gt_diag_I[i+1]; j++)
3262 {
3263 k = Gt_diag_J[j];
3264 h2 += Gx_data[k]*Gx_data[k]+Gy_data[k]*Gy_data[k]+Gz_data[k]*Gz_data[k];
3265 ne++;
3266 }
3267
3268 if (ne != 0)
3269 {
3270 h2 /= ne;
3271 for (j = Gt_diag_I[i]; j < Gt_diag_I[i+1]; j++)
3272 Gt_diag_data[j] *= h2;
3273 for (j = Gt_offd_I[i]; j < Gt_offd_I[i+1]; j++)
3274 Gt_offd_data[j] *= h2;
3275 }
3276 }
3277 }
3278 }
3279
3280 /* we only needed Gx, Gy and Gz to compute the local mesh size */
3281 if (input_info == 1)
3282 {
3283 hypre_ParVectorDestroy(ams_data -> Gx);
3284 if (ams_data -> dim >= 2)
3285 hypre_ParVectorDestroy(ams_data -> Gy);
3286 if (ams_data -> dim == 3)
3287 hypre_ParVectorDestroy(ams_data -> Gz);
3288 }
3289
3290 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
3291 if (exec == HYPRE_EXEC_DEVICE)
3292 {
3293 GGt = hypre_ParCSRMatMat(ams_data -> G, Gt);
3294 }
3295 #endif
3296 else
3297 {
3298 GGt = hypre_ParMatmul(ams_data -> G, Gt);
3299 }
3300 hypre_ParCSRMatrixDestroy(Gt);
3301
3302 /* hypre_ParCSRMatrixAdd(GGt, A, &ams_data -> A); */
3303 hypre_ParCSRMatrixAdd(1.0, GGt, 1.0, ams_data -> A, &ApGGt);
3304 /*{
3305 hypre_ParCSRMatrix *A = GGt;
3306 hypre_ParCSRMatrix *B = ams_data -> A;
3307 hypre_ParCSRMatrix **C_ptr = &ApGGt;
3308
3309 hypre_ParCSRMatrix *C;
3310 hypre_CSRMatrix *A_local, *B_local, *C_local;
3311
3312 MPI_Comm comm = hypre_ParCSRMatrixComm(A);
3313 HYPRE_BigInt global_num_rows = hypre_ParCSRMatrixGlobalNumRows(A);
3314 HYPRE_BigInt global_num_cols = hypre_ParCSRMatrixGlobalNumCols(A);
3315 HYPRE_BigInt *row_starts = hypre_ParCSRMatrixRowStarts(A);
3316 HYPRE_BigInt *col_starts = hypre_ParCSRMatrixColStarts(A);
3317 HYPRE_Int A_num_cols_offd = hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(A));
3318 HYPRE_Int A_num_nonzeros_diag = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixDiag(A));
3319 HYPRE_Int A_num_nonzeros_offd = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixOffd(A));
3320 HYPRE_Int B_num_cols_offd = hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(B));
3321 HYPRE_Int B_num_nonzeros_diag = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixDiag(B));
3322 HYPRE_Int B_num_nonzeros_offd = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixOffd(B));
3323
3324 A_local = hypre_MergeDiagAndOffd(A);
3325 B_local = hypre_MergeDiagAndOffd(B);
3326 C_local = hypre_CSRMatrixBigAdd(A_local, B_local);
3327 hypre_CSRMatrixBigJtoJ(C_local);
3328
3329 C = hypre_ParCSRMatrixCreate (comm,
3330 global_num_rows,
3331 global_num_cols,
3332 row_starts,
3333 col_starts,
3334 A_num_cols_offd + B_num_cols_offd,
3335 A_num_nonzeros_diag + B_num_nonzeros_diag,
3336 A_num_nonzeros_offd + B_num_nonzeros_offd);
3337 GenerateDiagAndOffd(C_local, C,
3338 hypre_ParCSRMatrixFirstColDiag(A),
3339 hypre_ParCSRMatrixLastColDiag(A));
3340
3341 hypre_CSRMatrixDestroy(A_local);
3342 hypre_CSRMatrixDestroy(B_local);
3343 hypre_CSRMatrixDestroy(C_local);
3344
3345 *C_ptr = C;
3346 }*/
3347
3348 hypre_ParCSRMatrixDestroy(GGt);
3349
3350 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
3351 if (exec == HYPRE_EXEC_DEVICE)
3352 {
3353 ams_data -> A_Pi = hypre_ParCSRMatrixRAPKT(ams_data -> Pi, ApGGt, ams_data -> Pi, 1);
3354 }
3355 else
3356 #endif
3357 {
3358 hypre_BoomerAMGBuildCoarseOperator(ams_data -> Pi,
3359 ApGGt,
3360 ams_data -> Pi,
3361 &ams_data -> A_Pi);
3362 }
3363 }
3364 }
3365 else
3366 {
3367 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
3368 if (exec == HYPRE_EXEC_DEVICE)
3369 {
3370 ams_data -> A_Pi = hypre_ParCSRMatrixRAPKT(ams_data -> Pi, ams_data -> A, ams_data -> Pi, 1);
3371 }
3372 else
3373 #endif
3374 {
3375 hypre_BoomerAMGBuildCoarseOperator(ams_data -> Pi,
3376 ams_data -> A,
3377 ams_data -> Pi,
3378 &ams_data -> A_Pi);
3379 }
3380 }
3381
3382 ams_data -> owns_A_Pi = 1;
3383
3384 if (ams_data -> cycle_type != 20)
3385 HYPRE_BoomerAMGSetNumFunctions(ams_data -> B_Pi, ams_data -> dim);
3386 else
3387 HYPRE_BoomerAMGSetNumFunctions(ams_data -> B_Pi, ams_data -> dim + 1);
3388 /* HYPRE_BoomerAMGSetNodal(ams_data -> B_Pi, 1); */
3389 }
3390
3391 /* Make sure that A_Pi has no zero rows (this can happen for
3392 some kinds of boundary conditions with contact). */
3393 hypre_ParCSRMatrixFixZeroRows(ams_data -> A_Pi);
3394
3395 HYPRE_BoomerAMGSetup(ams_data -> B_Pi,
3396 (HYPRE_ParCSRMatrix)ams_data -> A_Pi,
3397 0, 0);
3398 }
3399
3400 /* Allocate temporary vectors */
3401 ams_data -> r0 = hypre_ParVectorInRangeOf(ams_data -> A);
3402 ams_data -> g0 = hypre_ParVectorInRangeOf(ams_data -> A);
3403 if (ams_data -> A_G)
3404 {
3405 ams_data -> r1 = hypre_ParVectorInRangeOf(ams_data -> A_G);
3406 ams_data -> g1 = hypre_ParVectorInRangeOf(ams_data -> A_G);
3407 }
3408 if (ams_data -> r1 == NULL && ams_data -> A_Pix)
3409 {
3410 ams_data -> r1 = hypre_ParVectorInRangeOf(ams_data -> A_Pix);
3411 ams_data -> g1 = hypre_ParVectorInRangeOf(ams_data -> A_Pix);
3412 }
3413 if (ams_data -> Pi)
3414 {
3415 ams_data -> r2 = hypre_ParVectorInDomainOf(ams_data -> Pi);
3416 ams_data -> g2 = hypre_ParVectorInDomainOf(ams_data -> Pi);
3417 }
3418
3419 return hypre_error_flag;
3420 }
3421
3422 /*--------------------------------------------------------------------------
3423 * hypre_AMSSolve
3424 *
3425 * Solve the system A x = b.
3426 *--------------------------------------------------------------------------*/
3427
HYPRE_Int hypre_AMSSolve(void *solver,
                         hypre_ParCSRMatrix *A,
                         hypre_ParVector *b,
                         hypre_ParVector *x)
{
   hypre_AMSData *ams_data = (hypre_AMSData *) solver;

   HYPRE_Int i, my_id = -1;
   /* relative_resid is pre-initialized so it is well-defined even when
      maxit == 1 and no residual norms are computed in the loop below */
   HYPRE_Real r0_norm, r_norm, b_norm, relative_resid = 0, old_resid;

   char cycle[30];
   hypre_ParCSRMatrix *Ai[5], *Pi[5];
   HYPRE_Solver Bi[5];
   HYPRE_PtrToSolverFcn HBi[5];
   hypre_ParVector *ri[5], *gi[5];
   HYPRE_Int needZ = 0;

   hypre_ParVector *z = ams_data -> zz;

   /* Subspace data arrays consumed by hypre_ParCSRSubspacePrec. Index i
      corresponds to digit (i+1) in the cycle string built below (digit 0
      means fine-grid relaxation): 1 = G (gradient space), 2 = Pi (combined
      nodal interpolation), 3/4/5 = Pix/Piy/Piz (component interpolations).
      Entries not built during setup are simply NULL and their digits are
      never referenced by the corresponding cycle strings. */
   Ai[0] = ams_data -> A_G;   Pi[0] = ams_data -> G;
   Ai[1] = ams_data -> A_Pi;  Pi[1] = ams_data -> Pi;
   Ai[2] = ams_data -> A_Pix; Pi[2] = ams_data -> Pix;
   Ai[3] = ams_data -> A_Piy; Pi[3] = ams_data -> Piy;
   Ai[4] = ams_data -> A_Piz; Pi[4] = ams_data -> Piz;

   /* AMG solvers for the subspace problems; the Pi-space solve uses the
      block (nodal) variant since A_Pi is a dim x dim block system */
   Bi[0] = ams_data -> B_G;   HBi[0] = (HYPRE_PtrToSolverFcn) hypre_BoomerAMGSolve;
   Bi[1] = ams_data -> B_Pi;  HBi[1] = (HYPRE_PtrToSolverFcn) hypre_BoomerAMGBlockSolve;
   Bi[2] = ams_data -> B_Pix; HBi[2] = (HYPRE_PtrToSolverFcn) hypre_BoomerAMGSolve;
   Bi[3] = ams_data -> B_Piy; HBi[3] = (HYPRE_PtrToSolverFcn) hypre_BoomerAMGSolve;
   Bi[4] = ams_data -> B_Piz; HBi[4] = (HYPRE_PtrToSolverFcn) hypre_BoomerAMGSolve;

   /* Temporary residual/correction vectors per subspace. The Pix/Piy/Piz
      spaces share r1/g1 with the G space (same vertex-sized layout);
      r2/g2 are sized for the domain of Pi. */
   ri[0] = ams_data -> r1; gi[0] = ams_data -> g1;
   ri[1] = ams_data -> r2; gi[1] = ams_data -> g2;
   ri[2] = ams_data -> r1; gi[2] = ams_data -> g1;
   ri[3] = ams_data -> r1; gi[3] = ams_data -> g1;
   ri[4] = ams_data -> r1; gi[4] = ams_data -> g1;

   /* may need to create an additional temporary vector for relaxation */
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
   HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) );

   if (exec == HYPRE_EXEC_DEVICE)
   {
      /* on the device, GS/SSOR-type (2, 4) and Chebyshev (16) smoothers need z */
      needZ = ams_data -> A_relax_type == 2 || ams_data -> A_relax_type == 4 || ams_data -> A_relax_type == 16;
   }
   else
#endif
   {
      /* on the host, z is needed for threaded relaxation or Chebyshev (16) */
      needZ = hypre_NumThreads() > 1 || ams_data -> A_relax_type == 16;
   }

   if (needZ && !z)
   {
      /* lazily allocate the scratch vector and cache it in ams_data for reuse */
      z = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A),
                                hypre_ParCSRMatrixGlobalNumRows(A),
                                hypre_ParCSRMatrixRowStarts(A));
      hypre_ParVectorInitialize(z);
      ams_data -> zz = z;
   }

   if (ams_data -> print_level > 0)
      hypre_MPI_Comm_rank(hypre_ParCSRMatrixComm(A), &my_id);

   /* Compatible subspace projection for problems with zero-conductivity regions.
      Note that this modifies the input (r.h.s.) vector b! */
   if ( (ams_data -> B_G0) &&
        (++ams_data->solve_counter % ( ams_data -> projection_frequency ) == 0) )
   {
      /* hypre_printf("Projecting onto the compatible subspace...\n"); */
      hypre_AMSProjectOutGradients(ams_data, b);
   }

   /* Select the cycle string interpreted by hypre_ParCSRSubspacePrec:
      each digit names a subspace correction (0 = fine-grid smoothing,
      1..5 = the Ai/Pi entries above), '(' saves the residual and '+'
      switches the following corrections to additive mode. When beta == 0
      the gradient-space solve (digit 1) is omitted. */
   if (ams_data -> beta_is_zero)
   {
      switch (ams_data -> cycle_type)
      {
         case 0:
            hypre_sprintf(cycle,"%s","0");
            break;
         case 1:
         case 3:
         case 5:
         case 7:
         default:
            hypre_sprintf(cycle,"%s","020");
            break;
         case 2:
         case 4:
         case 6:
         case 8:
            hypre_sprintf(cycle,"%s","(0+2)");
            break;
         case 11:
         case 13:
            hypre_sprintf(cycle,"%s","0345430");
            break;
         case 12:
            hypre_sprintf(cycle,"%s","(0+3+4+5)");
            break;
         case 14:
            hypre_sprintf(cycle,"%s","0(+3+4+5)0");
            break;
      }
   }
   else
   {
      switch (ams_data -> cycle_type)
      {
         case 0:
            hypre_sprintf(cycle,"%s","010");
            break;
         case 1:
         default:
            hypre_sprintf(cycle,"%s","01210");
            break;
         case 2:
            hypre_sprintf(cycle,"%s","(0+1+2)");
            break;
         case 3:
            hypre_sprintf(cycle,"%s","02120");
            break;
         case 4:
            hypre_sprintf(cycle,"%s","(010+2)");
            break;
         case 5:
            hypre_sprintf(cycle,"%s","0102010");
            break;
         case 6:
            hypre_sprintf(cycle,"%s","(020+1)");
            break;
         case 7:
            hypre_sprintf(cycle,"%s","0201020");
            break;
         case 8:
            hypre_sprintf(cycle,"%s","0(+1+2)0");
            break;
         case 9:
            hypre_sprintf(cycle,"%s","01210");
            break;
         case 11:
            hypre_sprintf(cycle,"%s","013454310");
            break;
         case 12:
            hypre_sprintf(cycle,"%s","(0+1+3+4+5)");
            break;
         case 13:
            hypre_sprintf(cycle,"%s","034515430");
            break;
         case 14:
            hypre_sprintf(cycle,"%s","01(+3+4+5)10");
            break;
         case 20:
            hypre_sprintf(cycle,"%s","020");
            break;
      }
   }

   /* Main iteration: apply the subspace preconditioner up to maxit times.
      Residual norms are tracked (and optionally printed) only when more
      than one iteration is requested. */
   for (i = 0; i < ams_data -> maxit; i++)
   {
      /* Compute initial residual norms */
      if (ams_data -> maxit > 1 && i == 0)
      {
         /* r0 = b - A x */
         hypre_ParVectorCopy(b, ams_data -> r0);
         hypre_ParCSRMatrixMatvec(-1.0, ams_data -> A, x, 1.0, ams_data -> r0);
         r_norm = sqrt(hypre_ParVectorInnerProd(ams_data -> r0,ams_data -> r0));
         r0_norm = r_norm;
         b_norm = sqrt(hypre_ParVectorInnerProd(b, b));
         if (b_norm)
            relative_resid = r_norm / b_norm;
         else
            relative_resid = r_norm;  /* zero r.h.s.: fall back to absolute norm */
         if (my_id == 0 && ams_data -> print_level > 0)
         {
            hypre_printf("                                            relative\n");
            hypre_printf("               residual        factor       residual\n");
            hypre_printf("               --------        ------       --------\n");
            hypre_printf("    Initial    %e                 %e\n",
                         r_norm, relative_resid);
         }
      }

      /* Apply the preconditioner */
      hypre_ParCSRSubspacePrec(ams_data -> A,
                               ams_data -> A_relax_type,
                               ams_data -> A_relax_times,
                               ams_data -> A_l1_norms ? hypre_VectorData(ams_data -> A_l1_norms) : NULL,
                               ams_data -> A_relax_weight,
                               ams_data -> A_omega,
                               ams_data -> A_max_eig_est,
                               ams_data -> A_min_eig_est,
                               ams_data -> A_cheby_order,
                               ams_data -> A_cheby_fraction,
                               Ai, Bi, HBi, Pi, ri, gi,
                               b, x,
                               ams_data -> r0,
                               ams_data -> g0,
                               cycle,
                               z);

      /* Compute new residual norms */
      if (ams_data -> maxit > 1)
      {
         old_resid = r_norm;
         hypre_ParVectorCopy(b, ams_data -> r0);
         hypre_ParCSRMatrixMatvec(-1.0, ams_data -> A, x, 1.0, ams_data -> r0);
         r_norm = sqrt(hypre_ParVectorInnerProd(ams_data -> r0,ams_data -> r0));
         if (b_norm)
            relative_resid = r_norm / b_norm;
         else
            relative_resid = r_norm;
         if (my_id == 0 && ams_data -> print_level > 0)
            hypre_printf("    Cycle %2d   %e    %f     %e \n",
                         i+1, r_norm, r_norm / old_resid, relative_resid);
      }

      if (relative_resid < ams_data -> tol)
      {
         /* count the sweep that achieved the tolerance before exiting */
         i++;
         break;
      }
   }

   /* i >= 1 here whenever maxit > 1, so neither the division by i nor the
      use of r0_norm below can involve an unset value */
   if (my_id == 0 && ams_data -> print_level > 0 && ams_data -> maxit > 1)
      hypre_printf("\n\n Average Convergence Factor = %f\n\n",
                   pow((r_norm/r0_norm),(1.0/(HYPRE_Real) i)));

   ams_data -> num_iterations = i;
   ams_data -> rel_resid_norm = relative_resid;

   /* report non-convergence only when a positive tolerance was requested */
   if (ams_data -> num_iterations == ams_data -> maxit && ams_data -> tol > 0.0)
      hypre_error(HYPRE_ERROR_CONV);

   return hypre_error_flag;
}
3662
3663 /*--------------------------------------------------------------------------
3664 * hypre_ParCSRSubspacePrec
3665 *
3666 * General subspace preconditioner for A0 y = x, based on ParCSR storage.
3667 *
3668 * P[i] and A[i] are the interpolation and coarse grid matrices for
3669 * the (i+1)'th subspace. B[i] is an AMG solver for A[i]. r[i] and g[i]
3670 * are temporary vectors. A0_* are the fine grid smoothing parameters.
3671 *
3672 * The default mode is multiplicative, '+' changes the next correction
3673 * to additive, based on residual computed at '('.
3674 *--------------------------------------------------------------------------*/
3675
hypre_ParCSRSubspacePrec(hypre_ParCSRMatrix * A0,HYPRE_Int A0_relax_type,HYPRE_Int A0_relax_times,HYPRE_Real * A0_l1_norms,HYPRE_Real A0_relax_weight,HYPRE_Real A0_omega,HYPRE_Real A0_max_eig_est,HYPRE_Real A0_min_eig_est,HYPRE_Int A0_cheby_order,HYPRE_Real A0_cheby_fraction,hypre_ParCSRMatrix ** A,HYPRE_Solver * B,HYPRE_PtrToSolverFcn * HB,hypre_ParCSRMatrix ** P,hypre_ParVector ** r,hypre_ParVector ** g,hypre_ParVector * x,hypre_ParVector * y,hypre_ParVector * r0,hypre_ParVector * g0,char * cycle,hypre_ParVector * z)3676 HYPRE_Int hypre_ParCSRSubspacePrec(/* fine space matrix */
3677 hypre_ParCSRMatrix *A0,
3678 /* relaxation parameters */
3679 HYPRE_Int A0_relax_type,
3680 HYPRE_Int A0_relax_times,
3681 HYPRE_Real *A0_l1_norms,
3682 HYPRE_Real A0_relax_weight,
3683 HYPRE_Real A0_omega,
3684 HYPRE_Real A0_max_eig_est,
3685 HYPRE_Real A0_min_eig_est,
3686 HYPRE_Int A0_cheby_order,
3687 HYPRE_Real A0_cheby_fraction,
3688 /* subspace matrices */
3689 hypre_ParCSRMatrix **A,
3690 /* subspace preconditioners */
3691 HYPRE_Solver *B,
3692 /* hypre solver functions for B */
3693 HYPRE_PtrToSolverFcn *HB,
3694 /* subspace interpolations */
3695 hypre_ParCSRMatrix **P,
3696 /* temporary subspace vectors */
3697 hypre_ParVector **r,
3698 hypre_ParVector **g,
3699 /* right-hand side */
3700 hypre_ParVector *x,
3701 /* current approximation */
3702 hypre_ParVector *y,
3703 /* current residual */
3704 hypre_ParVector *r0,
3705 /* temporary vector */
3706 hypre_ParVector *g0,
3707 char *cycle,
3708 /* temporary vector */
3709 hypre_ParVector *z)
3710 {
3711 char *op;
3712 HYPRE_Int use_saved_residual = 0;
3713
3714 for (op = cycle; *op != '\0'; op++)
3715 {
3716 /* do nothing */
3717 if (*op == ')')
3718 continue;
3719
3720 /* compute the residual: r = x - Ay */
3721 else if (*op == '(')
3722 {
3723 hypre_ParVectorCopy(x,r0);
3724 hypre_ParCSRMatrixMatvec(-1.0, A0, y, 1.0, r0);
3725 }
3726
3727 /* switch to additive correction */
3728 else if (*op == '+')
3729 {
3730 use_saved_residual = 1;
3731 continue;
3732 }
3733
3734 /* smooth: y += S (x - Ay) */
3735 else if (*op == '0')
3736 {
3737 hypre_ParCSRRelax(A0, x,
3738 A0_relax_type,
3739 A0_relax_times,
3740 A0_l1_norms,
3741 A0_relax_weight,
3742 A0_omega,
3743 A0_max_eig_est,
3744 A0_min_eig_est,
3745 A0_cheby_order,
3746 A0_cheby_fraction,
3747 y, g0, z);
3748 }
3749
3750 /* subspace correction: y += P B^{-1} P^t r */
3751 else
3752 {
3753 HYPRE_Int i = *op - '1';
3754 if (i < 0)
3755 hypre_error_in_arg(16);
3756
3757 /* skip empty subspaces */
3758 if (!A[i]) continue;
3759
3760 /* compute the residual? */
3761 if (use_saved_residual)
3762 {
3763 use_saved_residual = 0;
3764 hypre_ParCSRMatrixMatvecT(1.0, P[i], r0, 0.0, r[i]);
3765 }
3766 else
3767 {
3768 hypre_ParVectorCopy(x,g0);
3769 hypre_ParCSRMatrixMatvec(-1.0, A0, y, 1.0, g0);
3770 hypre_ParCSRMatrixMatvecT(1.0, P[i], g0, 0.0, r[i]);
3771 }
3772
3773 hypre_ParVectorSetConstantValues(g[i], 0.0);
3774 (*HB[i]) (B[i], (HYPRE_Matrix)A[i],
3775 (HYPRE_Vector)r[i], (HYPRE_Vector)g[i]);
3776 hypre_ParCSRMatrixMatvec(1.0, P[i], g[i], 0.0, g0);
3777 hypre_ParVectorAxpy(1.0, g0, y);
3778 }
3779 }
3780
3781 return hypre_error_flag;
3782 }
3783
3784 /*--------------------------------------------------------------------------
3785 * hypre_AMSGetNumIterations
3786 *
3787 * Get the number of AMS iterations.
3788 *--------------------------------------------------------------------------*/
3789
HYPRE_Int hypre_AMSGetNumIterations(void *solver,
                                    HYPRE_Int *num_iterations)
{
   /* Report the iteration count recorded by the most recent AMS solve. */
   *num_iterations = ((hypre_AMSData *) solver) -> num_iterations;
   return hypre_error_flag;
}
3797
3798 /*--------------------------------------------------------------------------
3799 * hypre_AMSGetFinalRelativeResidualNorm
3800 *
3801 * Get the final relative residual norm in AMS.
3802 *--------------------------------------------------------------------------*/
3803
hypre_AMSGetFinalRelativeResidualNorm(void * solver,HYPRE_Real * rel_resid_norm)3804 HYPRE_Int hypre_AMSGetFinalRelativeResidualNorm(void *solver,
3805 HYPRE_Real *rel_resid_norm)
3806 {
3807 hypre_AMSData *ams_data = (hypre_AMSData *) solver;
3808 *rel_resid_norm = ams_data -> rel_resid_norm;
3809 return hypre_error_flag;
3810 }
3811
3812 /*--------------------------------------------------------------------------
3813 * hypre_AMSProjectOutGradients
3814 *
3815 * For problems with zero-conductivity regions, project the vector onto the
3816 * compatible subspace: x = (I - G0 (G0^t G0)^{-1} G0^T) x, where G0 is the
3817 * discrete gradient restricted to the interior nodes of the regions with
3818 * zero conductivity. This ensures that x is orthogonal to the gradients in
3819 * the range of G0.
3820 *
3821 * This function is typically called after the solution iteration is complete,
3822 * in order to facilitate the visualization of the computed field. Without it
3823 * the values in the zero-conductivity regions contain kernel components.
3824 *--------------------------------------------------------------------------*/
3825
HYPRE_Int hypre_AMSProjectOutGradients(void *solver,
                                       hypre_ParVector *x)
{
   hypre_AMSData *ams_data = (hypre_AMSData *) solver;

   /* No-op unless a solver for the interior discrete gradient G0 was set up */
   if (ams_data -> B_G0)
   {
      /* r1 = G0^t x : restrict x to the interior-node (gradient) space */
      hypre_ParCSRMatrixMatvecT(1.0, ams_data -> G0, x, 0.0, ams_data -> r1);
      /* g1 ~= (G0^t G0)^{-1} r1, approximated by one AMG solve with B_G0
         (A_G0 is presumably G0^t G0 -- see the header comment above) */
      hypre_ParVectorSetConstantValues(ams_data -> g1, 0.0);
      hypre_BoomerAMGSolve(ams_data -> B_G0, ams_data -> A_G0, ams_data -> r1, ams_data -> g1);
      /* g0 = G0 g1 : prolongate the gradient component back to edge space */
      hypre_ParCSRMatrixMatvec(1.0, ams_data -> G0, ams_data -> g1, 0.0, ams_data -> g0);
      /* x -= g0 : remove the kernel (gradient) component from x */
      hypre_ParVectorAxpy(-1.0, ams_data -> g0, x);
   }

   return hypre_error_flag;
}
3842
3843 /*--------------------------------------------------------------------------
3844 * hypre_AMSConstructDiscreteGradient
3845 *
3846 * Construct and return the lowest-order discrete gradient matrix G, based on:
 * - a matrix on the edges (e.g. the stiffness matrix A)
3848 * - a vector on the vertices (e.g. the x coordinates)
3849 * - the array edge_vertex, which lists the global indexes of the
3850 * vertices of the local edges.
3851 *
3852 * We assume that edge_vertex lists the edge vertices consecutively,
 * and that the orientation of all edges is consistent. More specifically:
3854 * If edge_orientation = 1, the edges are already oriented.
3855 * If edge_orientation = 2, the orientation of edge i depends only on the
3856 * sign of edge_vertex[2*i+1] - edge_vertex[2*i].
3857 *--------------------------------------------------------------------------*/
3858
HYPRE_Int hypre_AMSConstructDiscreteGradient(hypre_ParCSRMatrix *A,
                                             hypre_ParVector *x_coord,
                                             HYPRE_BigInt *edge_vertex,
                                             HYPRE_Int edge_orientation,
                                             hypre_ParCSRMatrix **G_ptr)
{
   hypre_ParCSRMatrix *G;

   HYPRE_Int nedges;

   /* One row of G per (local) edge of A */
   nedges = hypre_ParCSRMatrixNumRows(A);

   /* Construct the local part of G based on edge_vertex and the edge
      and vertex partitionings from A and x_coord */
   {
      HYPRE_Int i, *I = hypre_CTAlloc(HYPRE_Int, nedges+1, HYPRE_MEMORY_HOST);
      HYPRE_Real *data = hypre_CTAlloc(HYPRE_Real, 2*nedges, HYPRE_MEMORY_HOST);
      hypre_CSRMatrix *local = hypre_CSRMatrixCreate (nedges,
                                                      hypre_ParVectorGlobalSize(x_coord),
                                                      2*nedges);

      /* Each edge row has exactly two entries: -1/+1 at its two vertices */
      for (i = 0; i <= nedges; i++)
         I[i] = 2*i;

      if (edge_orientation == 1)
      {
         /* Assume that the edges are already oriented */
         for (i = 0; i < 2*nedges; i+=2)
         {
            data[i]   = -1.0;
            data[i+1] =  1.0;
         }
      }
      else if (edge_orientation == 2)
      {
         /* Assume that the edge orientation is based on the vertex indexes */
         for (i = 0; i < 2*nedges; i+=2)
         {
            if (edge_vertex[i] < edge_vertex[i+1])
            {
               data[i]   = -1.0;
               data[i+1] =  1.0;
            }
            else
            {
               data[i]   =  1.0;
               data[i+1] = -1.0;
            }
         }
      }
      else
      {
         /* Unsupported orientation: previously the error was recorded but
            execution continued and returned an all-zero gradient matrix.
            Clean up the local allocations and bail out instead; *G_ptr is
            left untouched and the error flag carries the failure. */
         hypre_TFree(I, HYPRE_MEMORY_HOST);
         hypre_TFree(data, HYPRE_MEMORY_HOST);
         hypre_CSRMatrixDestroy(local);
         hypre_error_in_arg(4);
         return hypre_error_flag;
      }

      hypre_CSRMatrixI(local) = I;
      hypre_CSRMatrixBigJ(local) = edge_vertex;
      hypre_CSRMatrixData(local) = data;

      hypre_CSRMatrixRownnz(local) = NULL;
      hypre_CSRMatrixOwnsData(local) = 1;
      hypre_CSRMatrixNumRownnz(local) = nedges;

      /* Generate the discrete gradient matrix: edge partitioning from A,
         vertex (column) partitioning from x_coord */
      G = hypre_ParCSRMatrixCreate(hypre_ParCSRMatrixComm(A),
                                   hypre_ParCSRMatrixGlobalNumRows(A),
                                   hypre_ParVectorGlobalSize(x_coord),
                                   hypre_ParCSRMatrixRowStarts(A),
                                   hypre_ParVectorPartitioning(x_coord),
                                   0, 0, 0);
      /* NOTE(review): BigJtoJ converts the BigJ array (= the caller's
         edge_vertex) to the J array in place, and the later destroy of
         'local' releases it -- confirm that callers treat edge_vertex as
         consumed by this routine. */
      hypre_CSRMatrixBigJtoJ(local);
      GenerateDiagAndOffd(local, G,
                          hypre_ParVectorFirstIndex(x_coord),
                          hypre_ParVectorLastIndex(x_coord));


      /* Account for empty rows in G. These may appear when A includes only
         the interior (non-Dirichlet b.c.) edges. */
      {
         hypre_CSRMatrix *G_diag = hypre_ParCSRMatrixDiag(G);
         G_diag->num_cols = hypre_VectorSize(hypre_ParVectorLocalVector(x_coord));
      }

      /* Free the local matrix */
      hypre_CSRMatrixDestroy(local);
   }

   *G_ptr = G;

   return hypre_error_flag;
}
3950
3951 /*--------------------------------------------------------------------------
3952 * hypre_AMSFEISetup
3953 *
3954 * Construct an AMS solver object based on the following data:
3955 *
3956 * A - the edge element stiffness matrix
3957 * num_vert - number of vertices (nodes) in the processor
3958 * num_local_vert - number of vertices owned by the processor
3959 * vert_number - global indexes of the vertices in the processor
3960 * vert_coord - coordinates of the vertices in the processor
3961 * num_edges - number of edges owned by the processor
3962 * edge_vertex - the vertices of the edges owned by the processor.
3963 * Vertices are in local numbering (the same as in
3964 * vert_number), and edge orientation is always from
3965 * the first to the second vertex.
3966 *
3967 * Here we distinguish between vertices that belong to elements in the
3968 * current processor, and the subset of these vertices that is owned by
3969 * the processor.
3970 *
3971 * This function is written specifically for input from the FEI and should
3972 * be called before hypre_AMSSetup().
3973 *--------------------------------------------------------------------------*/
3974
HYPRE_Int hypre_AMSFEISetup(void *solver,
                            hypre_ParCSRMatrix *A,
                            hypre_ParVector *b,
                            hypre_ParVector *x,
                            HYPRE_Int num_vert,
                            HYPRE_Int num_local_vert,
                            HYPRE_BigInt *vert_number,
                            HYPRE_Real *vert_coord,
                            HYPRE_Int num_edges,
                            HYPRE_BigInt *edge_vertex)
{
   hypre_AMSData *ams_data = (hypre_AMSData *) solver;

   HYPRE_Int i, j;

   hypre_ParCSRMatrix *G;
   hypre_ParVector *x_coord, *y_coord, *z_coord;
   HYPRE_Real *x_data, *y_data, *z_data;

   MPI_Comm comm = hypre_ParCSRMatrixComm(A);
   HYPRE_BigInt *vert_part, num_global_vert;
   HYPRE_BigInt vert_start, vert_end;
   HYPRE_BigInt big_local_vert = (HYPRE_BigInt) num_local_vert;

   /* Find the processor partitioning of the vertices: a prefix sum of
      the owned-vertex counts gives this processor's global vertex range
      [vert_part[0], vert_part[1]); the allreduce gives the global total */
   vert_part = hypre_TAlloc(HYPRE_BigInt, 2, HYPRE_MEMORY_HOST);
   hypre_MPI_Scan(&big_local_vert, &vert_part[1], 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
   vert_part[0] = vert_part[1] - big_local_vert;
   hypre_MPI_Allreduce(&big_local_vert, &num_global_vert, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);

   /* Construct hypre parallel vectors for the vertex coordinates
      (ownership of the data passes to the vectors, which are stored in
      ams_data below and freed by hypre_AMSFEIDestroy) */
   x_coord = hypre_ParVectorCreate(comm, num_global_vert, vert_part);
   hypre_ParVectorInitialize(x_coord);
   hypre_ParVectorOwnsData(x_coord) = 1;
   x_data = hypre_VectorData(hypre_ParVectorLocalVector(x_coord));

   y_coord = hypre_ParVectorCreate(comm, num_global_vert, vert_part);
   hypre_ParVectorInitialize(y_coord);
   hypre_ParVectorOwnsData(y_coord) = 1;
   y_data = hypre_VectorData(hypre_ParVectorLocalVector(y_coord));

   z_coord = hypre_ParVectorCreate(comm, num_global_vert, vert_part);
   hypre_ParVectorInitialize(z_coord);
   hypre_ParVectorOwnsData(z_coord) = 1;
   z_data = hypre_VectorData(hypre_ParVectorLocalVector(z_coord));

   vert_start = hypre_ParVectorFirstIndex(x_coord);
   vert_end = hypre_ParVectorLastIndex(x_coord);

   /* Save coordinates of locally owned vertices (vert_coord is packed as
      x,y,z triples per vertex; vertices in the processor but not owned by
      it are skipped here) */
   for (i = 0; i < num_vert; i++)
   {
      if (vert_number[i] >= vert_start && vert_number[i] <= vert_end)
      {
         j = (HYPRE_Int)(vert_number[i] - vert_start);
         x_data[j] = vert_coord[3*i];
         y_data[j] = vert_coord[3*i+1];
         z_data[j] = vert_coord[3*i+2];
      }
   }

   /* Change vertex numbers from local to global
      (NOTE: this modifies the caller's edge_vertex array in place) */
   for (i = 0; i < 2*num_edges; i++)
      edge_vertex[i] = vert_number[edge_vertex[i]];

   /* Construct the local part of G based on edge_vertex */
   {
      /* HYPRE_Int num_edges = hypre_ParCSRMatrixNumRows(A); */
      HYPRE_Int *I = hypre_CTAlloc(HYPRE_Int, num_edges+1, HYPRE_MEMORY_HOST);
      HYPRE_Real *data = hypre_CTAlloc(HYPRE_Real, 2*num_edges, HYPRE_MEMORY_HOST);
      hypre_CSRMatrix *local = hypre_CSRMatrixCreate (num_edges,
                                                      num_global_vert,
                                                      2*num_edges);

      /* Each edge row has exactly two entries (its two vertices) */
      for (i = 0; i <= num_edges; i++)
         I[i] = 2*i;

      /* Assume that the edge orientation is based on the vertex indexes */
      for (i = 0; i < 2*num_edges; i+=2)
      {
         data[i] = 1.0;
         data[i+1] = -1.0;
      }

      hypre_CSRMatrixI(local) = I;
      hypre_CSRMatrixBigJ(local) = edge_vertex;
      hypre_CSRMatrixData(local) = data;

      hypre_CSRMatrixRownnz(local) = NULL;
      hypre_CSRMatrixOwnsData(local) = 1;
      hypre_CSRMatrixNumRownnz(local) = num_edges;

      /* Row (edge) partitioning from A, column (vertex) partitioning from
         the scan above */
      G = hypre_ParCSRMatrixCreate(comm,
                                   hypre_ParCSRMatrixGlobalNumRows(A),
                                   num_global_vert,
                                   hypre_ParCSRMatrixRowStarts(A),
                                   vert_part,
                                   0, 0, 0);
      /* NOTE(review): BigJtoJ converts (and may free) the BigJ array,
         i.e. the caller's edge_vertex -- confirm callers treat it as
         consumed by this routine */
      hypre_CSRMatrixBigJtoJ(local);
      GenerateDiagAndOffd(local, G, vert_start, vert_end);

      //hypre_CSRMatrixJ(local) = NULL;
      hypre_CSRMatrixDestroy(local);
   }

   /* Hand the constructed objects to the AMS solver; they are released
      by hypre_AMSFEIDestroy() */
   ams_data -> G = G;

   ams_data -> x = x_coord;
   ams_data -> y = y_coord;
   ams_data -> z = z_coord;

   return hypre_error_flag;
}
4088
4089 /*--------------------------------------------------------------------------
4090 * hypre_AMSFEIDestroy
4091 *
4092 * Free the additional memory allocated in hypre_AMSFEISetup().
4093 *
4094 * This function is written specifically for input from the FEI and should
4095 * be called before hypre_AMSDestroy().
4096 *--------------------------------------------------------------------------*/
4097
hypre_AMSFEIDestroy(void * solver)4098 HYPRE_Int hypre_AMSFEIDestroy(void *solver)
4099 {
4100 hypre_AMSData *ams_data = (hypre_AMSData *) solver;
4101
4102 if (ams_data -> G)
4103 hypre_ParCSRMatrixDestroy(ams_data -> G);
4104
4105 if (ams_data -> x)
4106 hypre_ParVectorDestroy(ams_data -> x);
4107 if (ams_data -> y)
4108 hypre_ParVectorDestroy(ams_data -> y);
4109 if (ams_data -> z)
4110 hypre_ParVectorDestroy(ams_data -> z);
4111
4112 return hypre_error_flag;
4113 }
4114
4115 /*--------------------------------------------------------------------------
 * hypre_ParCSRComputeL1NormsThreads
4117 *
4118 * Compute the l1 norms of the rows of a given matrix, depending on
4119 * the option parameter:
4120 *
4121 * option 1 = Compute the l1 norm of the rows
4122 * option 2 = Compute the l1 norm of the (processor) off-diagonal
4123 * part of the rows plus the diagonal of A
4124 * option 3 = Compute the l2 norm^2 of the rows
4125 * option 4 = Truncated version of option 2 based on Remark 6.2 in "Multigrid
4126 * Smoothers for Ultra-Parallel Computing"
4127 *
4128 * The above computations are done in a CF manner, whenever the provided
4129 * cf_marker is not NULL.
4130 *--------------------------------------------------------------------------*/
4131
HYPRE_Int hypre_ParCSRComputeL1NormsThreads(hypre_ParCSRMatrix *A,
                                            HYPRE_Int option,
                                            HYPRE_Int num_threads,
                                            HYPRE_Int *cf_marker,
                                            HYPRE_Real **l1_norm_ptr)
{
   HYPRE_Int i, j, k;
   HYPRE_Int num_rows = hypre_ParCSRMatrixNumRows(A);

   hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
   HYPRE_Int *A_diag_I = hypre_CSRMatrixI(A_diag);
   HYPRE_Int *A_diag_J = hypre_CSRMatrixJ(A_diag);
   HYPRE_Real *A_diag_data = hypre_CSRMatrixData(A_diag);

   hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
   HYPRE_Int *A_offd_I = hypre_CSRMatrixI(A_offd);
   HYPRE_Int *A_offd_J = hypre_CSRMatrixJ(A_offd);
   HYPRE_Real *A_offd_data = hypre_CSRMatrixData(A_offd);
   HYPRE_Int num_cols_offd = hypre_CSRMatrixNumCols(A_offd);

   HYPRE_Real diag;
   /* NOTE(review): l1_norm is allocated at A's memory location but filled
      by host loops below -- confirm callers only use this path for host
      matrices */
   HYPRE_Real *l1_norm = hypre_TAlloc(HYPRE_Real, num_rows, hypre_ParCSRMatrixMemoryLocation(A));
   HYPRE_Int ii, ns, ne, rest, size;

   HYPRE_Int *cf_marker_offd = NULL;
   HYPRE_Int cf_diag;

   /* collect the cf marker data from other procs */
   if (cf_marker != NULL)
   {
      HYPRE_Int index;
      HYPRE_Int num_sends;
      HYPRE_Int start;
      HYPRE_Int *int_buf_data = NULL;

      hypre_ParCSRCommPkg *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
      hypre_ParCSRCommHandle *comm_handle;

      if (num_cols_offd)
         cf_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_offd, HYPRE_MEMORY_HOST);
      num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
      if (hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends))
         int_buf_data = hypre_CTAlloc(HYPRE_Int,
                                      hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends), HYPRE_MEMORY_HOST);
      /* pack the local cf markers requested by the neighbors */
      index = 0;
      for (i = 0; i < num_sends; i++)
      {
         start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
         for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
         {
            int_buf_data[index++] = cf_marker[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
         }
      }
      comm_handle = hypre_ParCSRCommHandleCreate(11, comm_pkg, int_buf_data,
                                                 cf_marker_offd);
      hypre_ParCSRCommHandleDestroy(comm_handle);
      hypre_TFree(int_buf_data, HYPRE_MEMORY_HOST);
   }

#ifdef HYPRE_USING_OPENMP
#pragma omp parallel for private(i,ii,j,k,ns,ne,rest,size,diag,cf_diag) HYPRE_SMP_SCHEDULE
#endif
   for (k = 0; k < num_threads; k++)
   {
      /* partition the rows into num_threads contiguous chunks [ns, ne),
         distributing the remainder over the first 'rest' threads */
      size = num_rows/num_threads;
      rest = num_rows - size*num_threads;
      if (k < rest)
      {
         ns = k*size+k;
         ne = (k+1)*size+k+1;
      }
      else
      {
         ns = k*size+rest;
         ne = (k+1)*size+rest;
      }

      if (option == 1)
      {
         /* l1 norm of the full row (optionally restricted to entries
            with the same CF marker as the row) */
         for (i = ns; i < ne; i++)
         {
            l1_norm[i] = 0.0;
            if (cf_marker == NULL)
            {
               /* Add the l1 norm of the diag part of the ith row */
               for (j = A_diag_I[i]; j < A_diag_I[i+1]; j++)
                  l1_norm[i] += fabs(A_diag_data[j]);
               /* Add the l1 norm of the offd part of the ith row */
               if (num_cols_offd)
               {
                  for (j = A_offd_I[i]; j < A_offd_I[i+1]; j++)
                     l1_norm[i] += fabs(A_offd_data[j]);
               }
            }
            else
            {
               cf_diag = cf_marker[i];
               /* Add the CF l1 norm of the diag part of the ith row */
               for (j = A_diag_I[i]; j < A_diag_I[i+1]; j++)
                  if (cf_diag == cf_marker[A_diag_J[j]])
                     l1_norm[i] += fabs(A_diag_data[j]);
               /* Add the CF l1 norm of the offd part of the ith row */
               if (num_cols_offd)
               {
                  for (j = A_offd_I[i]; j < A_offd_I[i+1]; j++)
                     if (cf_diag == cf_marker_offd[A_offd_J[j]])
                        l1_norm[i] += fabs(A_offd_data[j]);
               }
            }
         }
      }
      else if (option == 2)
      {
         /* diagonal entry plus the l1 norm of the off-thread part of
            the row (entries outside [ns, ne) and all offd entries) */
         for (i = ns; i < ne; i++)
         {
            l1_norm[i] = 0.0;
            if (cf_marker == NULL)
            {
               /* Add the diagonal and the local off-thread part of the ith row */
               for (j = A_diag_I[i]; j < A_diag_I[i+1]; j++)
               {
                  ii = A_diag_J[j];
                  if (ii == i || ii < ns || ii >= ne)
                     l1_norm[i] += fabs(A_diag_data[j]);
               }
               /* Add the l1 norm of the offd part of the ith row */
               if (num_cols_offd)
               {
                  for (j = A_offd_I[i]; j < A_offd_I[i+1]; j++)
                     l1_norm[i] += fabs(A_offd_data[j]);
               }
            }
            else
            {
               cf_diag = cf_marker[i];
               /* Add the diagonal and the local off-thread part of the ith row */
               for (j = A_diag_I[i]; j < A_diag_I[i+1]; j++)
               {
                  ii = A_diag_J[j];
                  if ((ii == i || ii < ns || ii >= ne) &&
                      (cf_diag == cf_marker[A_diag_J[j]]))
                     l1_norm[i] += fabs(A_diag_data[j]);
               }
               /* Add the CF l1 norm of the offd part of the ith row */
               if (num_cols_offd)
               {
                  for (j = A_offd_I[i]; j < A_offd_I[i+1]; j++)
                     if (cf_diag == cf_marker_offd[A_offd_J[j]])
                        l1_norm[i] += fabs(A_offd_data[j]);
               }
            }
         }
      }
      else if (option == 3)
      {
         /* squared l2 norm of the full row (no CF variant) */
         for (i = ns; i < ne; i++)
         {
            l1_norm[i] = 0.0;
            for (j = A_diag_I[i]; j < A_diag_I[i+1]; j++)
               l1_norm[i] += A_diag_data[j] * A_diag_data[j];
            if (num_cols_offd)
               for (j = A_offd_I[i]; j < A_offd_I[i+1]; j++)
                  l1_norm[i] += A_offd_data[j] * A_offd_data[j];
         }
      }
      else if (option == 4)
      {
         /* truncated version of option 2 (Remark 6.2): off-thread and
            offd contributions are halved, and rows close to diagonally
            dominant fall back to the plain diagonal */
         for (i = ns; i < ne; i++)
         {
            l1_norm[i] = 0.0;
            /* Reset per row: 'diag' is only assigned when this row's
               diagonal entry is found (and, with a cf_marker, admitted
               by the CF filter). Without this reset a row lacking such
               an entry would reuse a stale value from a previous row
               in the truncation test below. */
            diag = 0.0;
            if (cf_marker == NULL)
            {
               /* Add the diagonal and the local off-thread part of the ith row */
               for (j = A_diag_I[i]; j < A_diag_I[i+1]; j++)
               {
                  ii = A_diag_J[j];
                  if (ii == i || ii < ns || ii >= ne)
                  {
                     if (ii == i)
                     {
                        diag = fabs(A_diag_data[j]);
                        l1_norm[i] += fabs(A_diag_data[j]);
                     }
                     else
                        l1_norm[i] += 0.5*fabs(A_diag_data[j]);
                  }
               }
               /* Add the l1 norm of the offd part of the ith row */
               if (num_cols_offd)
               {
                  for (j = A_offd_I[i]; j < A_offd_I[i+1]; j++)
                     l1_norm[i] += 0.5*fabs(A_offd_data[j]);
               }
            }
            else
            {
               cf_diag = cf_marker[i];
               /* Add the diagonal and the local off-thread part of the ith row */
               for (j = A_diag_I[i]; j < A_diag_I[i+1]; j++)
               {
                  ii = A_diag_J[j];
                  if ((ii == i || ii < ns || ii >= ne) &&
                      (cf_diag == cf_marker[A_diag_J[j]]))
                  {
                     if (ii == i)
                     {
                        diag = fabs(A_diag_data[j]);
                        l1_norm[i] += fabs(A_diag_data[j]);
                     }
                     else
                        l1_norm[i] += 0.5*fabs(A_diag_data[j]);
                  }
               }
               /* Add the CF l1 norm of the offd part of the ith row */
               if (num_cols_offd)
               {
                  for (j = A_offd_I[i]; j < A_offd_I[i+1]; j++)
                     if (cf_diag == cf_marker_offd[A_offd_J[j]])
                        l1_norm[i] += 0.5*fabs(A_offd_data[j]);
               }
            }

            /* Truncate according to Remark 6.2 */
            if (l1_norm[i] <= 4.0/3.0*diag)
               l1_norm[i] = diag;
         }
      }

      else if (option == 5) /*stores diagonal of A for Jacobi using matvec, rlx 7 */
      {
         /* Set the diag element
            NOTE(review): this reads the first entry of each row, which
            assumes the diagonal is stored first -- confirm this holds for
            all matrices reaching this option */
         for (i = ns; i < ne; i++)
         {
            l1_norm[i] = A_diag_data[A_diag_I[i]];
            if (l1_norm[i] == 0) l1_norm[i] = 1.0;
         }
      }

      if (option < 5)
      {
         /* Handle negative definite matrices */
         for (i = ns; i < ne; i++)
            if (A_diag_data[A_diag_I[i]] < 0)
               l1_norm[i] = -l1_norm[i];

         /* A zero scaling factor would break the smoother; flag it */
         for (i = ns; i < ne; i++)
            /* if (fabs(l1_norm[i]) < DBL_EPSILON) */
            if (fabs(l1_norm[i]) == 0.0)
            {
               hypre_error_in_arg(1);
               break;
            }
      }

   }

   hypre_TFree(cf_marker_offd, HYPRE_MEMORY_HOST);

   *l1_norm_ptr = l1_norm;

   return hypre_error_flag;
}
4394