1 /******************************************************************************
2 * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other
3 * HYPRE Project Developers. See the top-level COPYRIGHT file for details.
4 *
5 * SPDX-License-Identifier: (Apache-2.0 OR MIT)
6 ******************************************************************************/
7
8 #include "_hypre_parcsr_ls.h"
9 #include "_hypre_utilities.hpp"
10
11 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
12
13 __global__ void hypre_BoomerAMGBuildDirInterp_getnnz( HYPRE_Int nr_of_rows, HYPRE_Int *S_diag_i, HYPRE_Int *S_diag_j, HYPRE_Int *S_offd_i, HYPRE_Int *S_offd_j, HYPRE_Int *CF_marker, HYPRE_Int *CF_marker_offd, HYPRE_Int num_functions, HYPRE_Int *dof_func, HYPRE_Int *dof_func_offd, HYPRE_Int *P_diag_i, HYPRE_Int *P_offd_i);
14
15 __global__ void hypre_BoomerAMGBuildDirInterp_getcoef( HYPRE_Int nr_of_rows, HYPRE_Int *A_diag_i, HYPRE_Int *A_diag_j, HYPRE_Real *A_diag_data, HYPRE_Int *A_offd_i, HYPRE_Int *A_offd_j, HYPRE_Real *A_offd_data, HYPRE_Int *Soc_diag_j, HYPRE_Int *Soc_offd_j, HYPRE_Int *CF_marker, HYPRE_Int *CF_marker_offd, HYPRE_Int num_functions, HYPRE_Int *dof_func, HYPRE_Int *dof_func_offd, HYPRE_Int *P_diag_i, HYPRE_Int *P_diag_j, HYPRE_Real *P_diag_data, HYPRE_Int *P_offd_i, HYPRE_Int *P_offd_j, HYPRE_Real *P_offd_data, HYPRE_Int *fine_to_coarse );
16
17 __global__ void hypre_BoomerAMGBuildDirInterp_getcoef_v2( HYPRE_Int nr_of_rows, HYPRE_Int *A_diag_i, HYPRE_Int *A_diag_j, HYPRE_Real *A_diag_data, HYPRE_Int *A_offd_i, HYPRE_Int *A_offd_j, HYPRE_Real *A_offd_data, HYPRE_Int *Soc_diag_j, HYPRE_Int *Soc_offd_j, HYPRE_Int *CF_marker, HYPRE_Int *CF_marker_offd, HYPRE_Int num_functions, HYPRE_Int *dof_func, HYPRE_Int *dof_func_offd, HYPRE_Int *P_diag_i, HYPRE_Int *P_diag_j, HYPRE_Real *P_diag_data, HYPRE_Int *P_offd_i, HYPRE_Int *P_offd_j, HYPRE_Real *P_offd_data, HYPRE_Int *fine_to_coarse );
18
19 __global__ void
20 hypre_BoomerAMGBuildInterpOnePnt_getnnz( HYPRE_Int nr_of_rows, HYPRE_Int *A_diag_i, HYPRE_Int *A_strong_diag_j, HYPRE_Complex *A_diag_a, HYPRE_Int *A_offd_i, HYPRE_Int *A_strong_offd_j, HYPRE_Complex *A_offd_a, HYPRE_Int *CF_marker, HYPRE_Int *CF_marker_offd, HYPRE_Int *diag_compress_marker, HYPRE_Int *offd_compress_marker, HYPRE_Int *P_diag_i, HYPRE_Int *P_diag_j, HYPRE_Int *P_offd_i, HYPRE_Int *P_offd_j);
21
22 /*---------------------------------------------------------------------------
23 * hypre_BoomerAMGBuildDirInterp
24 *--------------------------------------------------------------------------*/
25
26 HYPRE_Int
hypre_BoomerAMGBuildDirInterpDevice(hypre_ParCSRMatrix * A,HYPRE_Int * CF_marker,hypre_ParCSRMatrix * S,HYPRE_BigInt * num_cpts_global,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int debug_flag,HYPRE_Real trunc_factor,HYPRE_Int max_elmts,HYPRE_Int interp_type,hypre_ParCSRMatrix ** P_ptr)27 hypre_BoomerAMGBuildDirInterpDevice( hypre_ParCSRMatrix *A,
28 HYPRE_Int *CF_marker,
29 hypre_ParCSRMatrix *S,
30 HYPRE_BigInt *num_cpts_global,
31 HYPRE_Int num_functions,
32 HYPRE_Int *dof_func,
33 HYPRE_Int debug_flag,
34 HYPRE_Real trunc_factor,
35 HYPRE_Int max_elmts,
36 HYPRE_Int interp_type,
37 hypre_ParCSRMatrix **P_ptr)
38 {
39 MPI_Comm comm = hypre_ParCSRMatrixComm(A);
40 hypre_ParCSRCommPkg *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
41 hypre_ParCSRCommHandle *comm_handle;
42
43 hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
44 HYPRE_Real *A_diag_data = hypre_CSRMatrixData(A_diag);
45 HYPRE_Int *A_diag_i = hypre_CSRMatrixI(A_diag);
46 HYPRE_Int *A_diag_j = hypre_CSRMatrixJ(A_diag);
47
48 hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
49 HYPRE_Real *A_offd_data = hypre_CSRMatrixData(A_offd);
50 HYPRE_Int *A_offd_i = hypre_CSRMatrixI(A_offd);
51 HYPRE_Int *A_offd_j = hypre_CSRMatrixJ(A_offd);
52 HYPRE_Int num_cols_A_offd = hypre_CSRMatrixNumCols(A_offd);
53 // HYPRE_BigInt *col_map_offd_A = hypre_ParCSRMatrixColMapOffd(A);
54
55 HYPRE_Int n_fine = hypre_CSRMatrixNumRows(A_diag);
56
57 hypre_BoomerAMGMakeSocFromSDevice(A, S);
58
59 hypre_CSRMatrix *S_diag = hypre_ParCSRMatrixDiag(S);
60 HYPRE_Int *S_diag_i = hypre_CSRMatrixI(S_diag);
61 HYPRE_Int *S_diag_j = hypre_CSRMatrixJ(S_diag);
62
63 hypre_CSRMatrix *S_offd = hypre_ParCSRMatrixOffd(S);
64 HYPRE_Int *S_offd_i = hypre_CSRMatrixI(S_offd);
65 HYPRE_Int *S_offd_j = hypre_CSRMatrixJ(S_offd);
66
67 hypre_ParCSRMatrix *P;
68 HYPRE_Int *tmp_map_offd_h = NULL;
69
70 HYPRE_Int *CF_marker_offd = NULL;
71 HYPRE_Int *dof_func_offd = NULL;
72 HYPRE_Int *dof_func_dev = NULL;
73
74 hypre_CSRMatrix *P_diag;
75 hypre_CSRMatrix *P_offd;
76 HYPRE_Real *P_diag_data;
77 HYPRE_Int *P_diag_i;
78 HYPRE_Int *P_diag_j;
79 HYPRE_Real *P_offd_data;
80 HYPRE_Int *P_offd_i;
81 HYPRE_Int *P_offd_j;
82 HYPRE_Int P_diag_size, P_offd_size;
83
84 HYPRE_Int *fine_to_coarse_d;
85 HYPRE_Int *fine_to_coarse_h;
86 HYPRE_BigInt total_global_cpts;
87 HYPRE_Int num_cols_P_offd = 0;
88
89 HYPRE_Int i;
90 HYPRE_Int j;
91 HYPRE_Int start;
92 HYPRE_Int my_id;
93 HYPRE_Int num_procs;
94 HYPRE_Int num_sends;
95 HYPRE_Int index;
96 HYPRE_Int *int_buf_data;
97
98 HYPRE_Real wall_time; /* for debugging instrumentation */
99
100 HYPRE_MemoryLocation memory_location = hypre_ParCSRMatrixMemoryLocation(A);
101
102 hypre_MPI_Comm_size(comm, &num_procs);
103 hypre_MPI_Comm_rank(comm,&my_id);
104
105 if (my_id == (num_procs -1))
106 {
107 total_global_cpts = num_cpts_global[1];
108 }
109 hypre_MPI_Bcast( &total_global_cpts, 1, HYPRE_MPI_BIG_INT, num_procs-1, comm);
110
111 if (!comm_pkg)
112 {
113 hypre_MatvecCommPkgCreate(A);
114 comm_pkg = hypre_ParCSRMatrixCommPkg(A);
115 }
116 if (debug_flag == 4)
117 {
118 wall_time = time_getWallclockSeconds();
119 }
120
121 /* 1. Communicate CF_marker to/from other processors */
122 if (num_cols_A_offd)
123 {
124 CF_marker_offd = hypre_TAlloc(HYPRE_Int, num_cols_A_offd, HYPRE_MEMORY_DEVICE);
125 }
126
127 num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
128 int_buf_data = hypre_TAlloc(HYPRE_Int, hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends), HYPRE_MEMORY_DEVICE);
129 hypre_ParCSRCommPkgCopySendMapElmtsToDevice(comm_pkg);
130 HYPRE_THRUST_CALL( gather,
131 hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg),
132 hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg) + hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
133 CF_marker,
134 int_buf_data );
135
136 comm_handle = hypre_ParCSRCommHandleCreate_v2(11, comm_pkg, HYPRE_MEMORY_DEVICE, int_buf_data,
137 HYPRE_MEMORY_DEVICE, CF_marker_offd);
138 hypre_ParCSRCommHandleDestroy(comm_handle);
139
140 if (num_functions > 1)
141 {
142 /* 2. Communicate dof_func to/from other processors */
143 if (num_cols_A_offd > 0)
144 {
145 dof_func_offd = hypre_TAlloc(HYPRE_Int, num_cols_A_offd, HYPRE_MEMORY_DEVICE);
146 }
147
148 index = 0;
149 for (i = 0; i < num_sends; i++)
150 {
151 start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
152 for (j=start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
153 {
154 int_buf_data[index++] = dof_func[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
155 }
156 }
157 comm_handle = hypre_ParCSRCommHandleCreate_v2(11, comm_pkg, HYPRE_MEMORY_HOST, int_buf_data,
158 HYPRE_MEMORY_DEVICE, dof_func_offd);
159 hypre_ParCSRCommHandleDestroy(comm_handle);
160
161 dof_func_dev = hypre_TAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_DEVICE);
162 hypre_TMemcpy(dof_func_dev, dof_func, HYPRE_Int, n_fine, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
163 }
164
165 if (debug_flag == 4)
166 {
167 wall_time = time_getWallclockSeconds() - wall_time;
168 hypre_printf("Proc = %d Interp: Comm 1 CF_marker = %f\n", my_id, wall_time);
169 fflush(NULL);
170 }
171
172 /* 3. Figure out the size of the interpolation matrix, P, i.e., compute P_diag_i and P_offd_i */
173 /* Also, compute fine_to_coarse array: When i is a coarse point, fine_to_coarse[i] will hold a */
174 /* corresponding coarse point index in the range 0..n_coarse-1 */
175 P_diag_i = hypre_TAlloc(HYPRE_Int, n_fine+1, memory_location);
176 P_offd_i = hypre_TAlloc(HYPRE_Int, n_fine+1, memory_location);
177
178 dim3 bDim = hypre_GetDefaultCUDABlockDimension();
179 dim3 gDim = hypre_GetDefaultCUDAGridDimension(n_fine, "warp", bDim);
180
181 HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildDirInterp_getnnz, gDim, bDim,
182 n_fine, S_diag_i, S_diag_j, S_offd_i, S_offd_j,
183 CF_marker, CF_marker_offd, num_functions,
184 dof_func_dev, dof_func_offd, P_diag_i, P_offd_i);
185
186 /* The scans will transform P_diag_i and P_offd_i to the CSR I-vectors */
187 hypreDevice_IntegerExclusiveScan(n_fine+1, P_diag_i);
188 hypreDevice_IntegerExclusiveScan(n_fine+1, P_offd_i);
189
190 fine_to_coarse_d = hypre_TAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_DEVICE);
191 /* The scan will make fine_to_coarse[i] for i a coarse point hold a
192 * coarse point index in the range from 0 to n_coarse-1 */
193 HYPRE_THRUST_CALL( exclusive_scan,
194 thrust::make_transform_iterator(CF_marker, is_nonnegative<HYPRE_Int>()),
195 thrust::make_transform_iterator(CF_marker + n_fine, is_nonnegative<HYPRE_Int>()),
196 fine_to_coarse_d,
197 HYPRE_Int(0) ); /* *MUST* pass init value since input and output types diff. */
198
199 /* 4. Compute the CSR arrays P_diag_j, P_diag_data, P_offd_j, and P_offd_data */
200 /* P_diag_i and P_offd_i are now known, first allocate the remaining CSR arrays of P */
201 hypre_TMemcpy(&P_diag_size, &P_diag_i[n_fine], HYPRE_Int, 1, HYPRE_MEMORY_HOST, memory_location);
202 hypre_TMemcpy(&P_offd_size, &P_offd_i[n_fine], HYPRE_Int, 1, HYPRE_MEMORY_HOST, memory_location);
203
204 P_diag_j = hypre_TAlloc(HYPRE_Int, P_diag_size, memory_location);
205 P_diag_data = hypre_TAlloc(HYPRE_Real, P_diag_size, memory_location);
206
207 P_offd_j = hypre_TAlloc(HYPRE_Int, P_offd_size, memory_location);
208 P_offd_data = hypre_TAlloc(HYPRE_Real, P_offd_size, memory_location);
209
210 if (interp_type == 3)
211 {
212 HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildDirInterp_getcoef, gDim, bDim,
213 n_fine, A_diag_i, A_diag_j, A_diag_data,
214 A_offd_i, A_offd_j, A_offd_data,
215 hypre_ParCSRMatrixSocDiagJ(S),
216 hypre_ParCSRMatrixSocOffdJ(S),
217 CF_marker, CF_marker_offd,
218 num_functions, dof_func_dev, dof_func_offd,
219 P_diag_i, P_diag_j, P_diag_data,
220 P_offd_i, P_offd_j, P_offd_data,
221 fine_to_coarse_d );
222 }
223 else
224 {
225 HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildDirInterp_getcoef_v2, gDim, bDim,
226 n_fine, A_diag_i, A_diag_j, A_diag_data,
227 A_offd_i, A_offd_j, A_offd_data,
228 hypre_ParCSRMatrixSocDiagJ(S),
229 hypre_ParCSRMatrixSocOffdJ(S),
230 CF_marker, CF_marker_offd,
231 num_functions, dof_func_dev, dof_func_offd,
232 P_diag_i, P_diag_j, P_diag_data,
233 P_offd_i, P_offd_j, P_offd_data,
234 fine_to_coarse_d );
235 }
236
237 /* !!!! Free them here */
238 /*
239 hypre_TFree(hypre_ParCSRMatrixSocDiagJ(S), HYPRE_MEMORY_DEVICE);
240 hypre_TFree(hypre_ParCSRMatrixSocOffdJ(S), HYPRE_MEMORY_DEVICE);
241 */
242
243 HYPRE_THRUST_CALL(replace, CF_marker, CF_marker + n_fine, -3, -1);
244
245 /* 5. Construct the result as a ParCSRMatrix. At this point, P's column indices */
246 /* are defined with A's enumeration of columns */
247
248 P = hypre_ParCSRMatrixCreate(comm,
249 hypre_ParCSRMatrixGlobalNumRows(A),
250 total_global_cpts,
251 hypre_ParCSRMatrixColStarts(A),
252 num_cpts_global,
253 0,
254 P_diag_size,
255 P_offd_size);
256
257 P_diag = hypre_ParCSRMatrixDiag(P);
258 hypre_CSRMatrixData(P_diag) = P_diag_data;
259 hypre_CSRMatrixI(P_diag) = P_diag_i;
260 hypre_CSRMatrixJ(P_diag) = P_diag_j;
261
262 P_offd = hypre_ParCSRMatrixOffd(P);
263 hypre_CSRMatrixData(P_offd) = P_offd_data;
264 hypre_CSRMatrixI(P_offd) = P_offd_i;
265 hypre_CSRMatrixJ(P_offd) = P_offd_j;
266
267 hypre_CSRMatrixMemoryLocation(P_diag) = memory_location;
268 hypre_CSRMatrixMemoryLocation(P_offd) = memory_location;
269
270 /* 6. Compress P, removing coefficients smaller than trunc_factor * Max, and */
271 /* make sure no row has more than max_elmts elements */
272
273 if (trunc_factor != 0.0 || max_elmts > 0)
274 {
275 hypre_BoomerAMGInterpTruncationDevice(P, trunc_factor, max_elmts);
276
277 P_offd_i = hypre_CSRMatrixI(P_offd);
278 P_offd_j = hypre_CSRMatrixJ(P_offd);
279 P_offd_size = hypre_CSRMatrixNumNonzeros(P_offd);
280 /* hypre_TMemcpy(&P_offd_size, &P_offd_i[n_fine], HYPRE_Int, 1, HYPRE_MEMORY_HOST, memory_location); */
281 }
282
283 /* 7. Translate P_offd's column indices from the values inherited from A_offd to a 0,1,2,3,... enumeration, */
284 /* and construct the col_map array that translates these into the global 0..c-1 enumeration */
285 if (P_offd_size)
286 {
287 /* Array P_marker has length equal to the number of A's offd columns+1, and will */
288 /* store a translation code from A_offd's local column numbers to P_offd's local column numbers */
289 /* Example: if A_offd has 6 columns, locally 0,1,..,5, and points 1 and 4 are coarse points, then
290 P_marker=[0,1,0,0,1,0,0], */
291
292 /* First, set P_marker[i] to 1 if A's column i is also present in P, otherwise P_marker[i] is 0 */
293 HYPRE_Int *P_marker = hypre_TAlloc(HYPRE_Int, num_cols_A_offd, HYPRE_MEMORY_DEVICE);
294 HYPRE_Int *P_colids = hypre_TAlloc(HYPRE_Int, hypre_max(P_offd_size, num_cols_A_offd), HYPRE_MEMORY_DEVICE);
295
296 hypre_TMemcpy(P_colids, P_offd_j, HYPRE_Int, P_offd_size, HYPRE_MEMORY_DEVICE, memory_location);
297 /* sort and unique */
298 HYPRE_THRUST_CALL(sort, P_colids, P_colids + P_offd_size);
299 HYPRE_Int *new_end = HYPRE_THRUST_CALL(unique, P_colids, P_colids + P_offd_size);
300
301 num_cols_P_offd = new_end - P_colids;
302
303 HYPRE_THRUST_CALL(fill_n, P_marker, num_cols_A_offd, 0);
304 hypreDevice_ScatterConstant(P_marker, num_cols_P_offd, P_colids, 1);
305
306 /* Because P's columns correspond to P_marker[i]=1 (and =0 otherwise), the scan below will return */
307 /* an enumeration of P's columns 0,1,... at the corresponding locations in P_marker. */
308 /* P_marker[num_cols_A_offd] will contain num_cols_P_offd, so sum reduction above could */
309 /* have been replaced by reading the last element of P_marker. */
310 HYPRE_THRUST_CALL(exclusive_scan, P_marker, P_marker + num_cols_A_offd, P_colids);
311 /* Example: P_marker becomes [0,0,1,1,1,2] so that P_marker[1]=0, P_marker[4]=1 */
312
313 /* Do the re-enumeration, P_offd_j are mapped, using P_marker as map */
314 HYPRE_THRUST_CALL(gather, P_offd_j, P_offd_j + P_offd_size, P_colids, P_offd_j);
315
316 /* Create and define array tmp_map_offd. This array is the inverse of the P_marker mapping, */
317 /* Example: num_cols_P_offd=2, tmp_map_offd[0] = 1, tmp_map_offd[1]=4 */
318 /* P_colids is large enough to hold */
319 new_end = HYPRE_THRUST_CALL(copy_if,
320 thrust::make_counting_iterator(0),
321 thrust::make_counting_iterator(num_cols_A_offd),
322 P_marker,
323 P_colids,
324 thrust::identity<HYPRE_Int>());
325 hypre_assert(new_end - P_colids == num_cols_P_offd);
326
327 tmp_map_offd_h = hypre_TAlloc(HYPRE_Int, num_cols_P_offd, HYPRE_MEMORY_HOST);
328 hypre_TMemcpy(tmp_map_offd_h, P_colids, HYPRE_Int, num_cols_P_offd, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);
329
330 hypre_TFree(P_colids, HYPRE_MEMORY_DEVICE);
331 hypre_TFree(P_marker, HYPRE_MEMORY_DEVICE);
332 }
333
334 /* 8. P_offd_j now has a 0,1,2,3... local column index enumeration. */
335 /* tmp_map_offd contains the index mapping from P's offd local columns to A's offd local columns.*/
336 /* Below routine is in parcsr_ls/par_rap_communication.c. It sets col_map_offd in P, */
337 /* comm_pkg in P, and perhaps more members of P ??? */
338
339 fine_to_coarse_h = hypre_TAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_HOST);
340 hypre_TMemcpy(fine_to_coarse_h, fine_to_coarse_d, HYPRE_Int, n_fine, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);
341
342 hypre_ParCSRMatrixColMapOffd(P) = hypre_CTAlloc(HYPRE_BigInt, num_cols_P_offd, HYPRE_MEMORY_HOST);
343 hypre_CSRMatrixNumCols(P_offd) = num_cols_P_offd;
344
345 hypre_GetCommPkgRTFromCommPkgA(P, A, fine_to_coarse_h, tmp_map_offd_h);
346
347 *P_ptr = P;
348
349 hypre_TFree(CF_marker_offd, HYPRE_MEMORY_DEVICE);
350 hypre_TFree(dof_func_offd, HYPRE_MEMORY_DEVICE);
351 hypre_TFree(dof_func_dev, HYPRE_MEMORY_DEVICE);
352 hypre_TFree(int_buf_data, HYPRE_MEMORY_DEVICE);
353 hypre_TFree(fine_to_coarse_d, HYPRE_MEMORY_DEVICE);
354 hypre_TFree(fine_to_coarse_h, HYPRE_MEMORY_HOST);
355 hypre_TFree(tmp_map_offd_h, HYPRE_MEMORY_HOST);
356
357 return hypre_error_flag;
358 }
359
360
361 /*-----------------------------------------------------------------------*/
362 __global__ void
hypre_BoomerAMGBuildDirInterp_getnnz(HYPRE_Int nr_of_rows,HYPRE_Int * S_diag_i,HYPRE_Int * S_diag_j,HYPRE_Int * S_offd_i,HYPRE_Int * S_offd_j,HYPRE_Int * CF_marker,HYPRE_Int * CF_marker_offd,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int * dof_func_offd,HYPRE_Int * P_diag_i,HYPRE_Int * P_offd_i)363 hypre_BoomerAMGBuildDirInterp_getnnz( HYPRE_Int nr_of_rows,
364 HYPRE_Int *S_diag_i,
365 HYPRE_Int *S_diag_j,
366 HYPRE_Int *S_offd_i,
367 HYPRE_Int *S_offd_j,
368 HYPRE_Int *CF_marker,
369 HYPRE_Int *CF_marker_offd,
370 HYPRE_Int num_functions,
371 HYPRE_Int *dof_func,
372 HYPRE_Int *dof_func_offd,
373 HYPRE_Int *P_diag_i,
374 HYPRE_Int *P_offd_i)
375 {
376 /*-----------------------------------------------------------------------*/
377 /* Determine size of interpolation matrix, P
378
379 If A is of size m x m, then P will be of size m x c where c is the
380 number of coarse points.
381
382 It is assumed that S have the same global column enumeration as A
383
384 Input: nr_of_rows - Number of rows in matrix (local in processor)
385 S_diag_i, S_diag_j - CSR representation of S_diag
386 S_offd_i, S_offd_j - CSR representation of S_offd
387 num_function - Number of degrees of freedom per grid point
388 dof_func - vector of length nr_of_rows, indicating the degree of freedom of vector element.
389 dof_func_offd - vector over ncols of A_offd, indicating the degree of freedom.
390
391 Output: P_diag_i - Vector where P_diag_i[i] holds the number of non-zero elements of P_diag on row i.
392 P_offd_i - Vector where P_offd_i[i] holds the number of non-zero elements of P_offd on row i.
393 fine_to_coarse - Vector of length nr_of_rows.
394 fine_to_coarse[i] is set to 1 if i is a coarse pt.
395 Eventually, fine_to_coarse[j] will map A's column j
396 to a re-enumerated column index in matrix P.
397 */
398 /*-----------------------------------------------------------------------*/
399
400 HYPRE_Int i = hypre_cuda_get_grid_warp_id<1,1>();
401
402 if (i >= nr_of_rows)
403 {
404 return;
405 }
406
407 HYPRE_Int p, q, dof_func_i;
408 HYPRE_Int jPd = 0, jPo = 0;
409 HYPRE_Int lane = hypre_cuda_get_lane_id<1>();
410
411 if (lane == 0)
412 {
413 p = read_only_load(CF_marker + i);
414 }
415 p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0);
416
417 /*--------------------------------------------------------------------
418 * If i is a C-point, interpolation is the identity.
419 *--------------------------------------------------------------------*/
420 if (p >= 0)
421 {
422 if (lane == 0)
423 {
424 P_diag_i[i] = 1;
425 P_offd_i[i] = 0;
426 }
427 return;
428 }
429
430 /*--------------------------------------------------------------------
431 * If i is an F-point, interpolation is from the C-points that
432 * strongly influence i.
433 *--------------------------------------------------------------------*/
434 if (num_functions > 1 && dof_func != NULL)
435 {
436 if (lane == 0)
437 {
438 dof_func_i = read_only_load(&dof_func[i]);
439 }
440 dof_func_i = __shfl_sync(HYPRE_WARP_FULL_MASK, dof_func_i, 0);
441 }
442
443 /* diag part */
444 if (lane < 2)
445 {
446 p = read_only_load(S_diag_i + i + lane);
447 }
448 q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1);
449 p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0);
450
451 for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE)
452 {
453 if (j < q)
454 {
455 const HYPRE_Int col = read_only_load(&S_diag_j[j]);
456 if ( read_only_load(&CF_marker[col]) > 0 && (num_functions == 1 || read_only_load(&dof_func[col]) == dof_func_i) )
457 {
458 jPd++;
459 }
460 }
461 }
462 jPd = warp_reduce_sum(jPd);
463
464 /* offd part */
465 if (lane < 2)
466 {
467 p = read_only_load(S_offd_i + i + lane);
468 }
469 q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1);
470 p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0);
471
472 for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE)
473 {
474 if (j < q)
475 {
476 const HYPRE_Int tmp = read_only_load(&S_offd_j[j]);
477 const HYPRE_Int col = tmp;
478 if ( read_only_load(&CF_marker_offd[col]) > 0 && (num_functions == 1 || read_only_load(&dof_func_offd[col]) == dof_func_i) )
479 {
480 jPo++;
481 }
482 }
483 }
484 jPo = warp_reduce_sum(jPo);
485
486 if (lane == 0)
487 {
488 P_diag_i[i] = jPd;
489 P_offd_i[i] = jPo;
490 }
491 }
492
493 /*-----------------------------------------------------------------------*
494 *-----------------------------------------------------------------------*/
495 __global__ void
hypre_BoomerAMGBuildDirInterp_getcoef(HYPRE_Int nr_of_rows,HYPRE_Int * A_diag_i,HYPRE_Int * A_diag_j,HYPRE_Real * A_diag_data,HYPRE_Int * A_offd_i,HYPRE_Int * A_offd_j,HYPRE_Real * A_offd_data,HYPRE_Int * Soc_diag_j,HYPRE_Int * Soc_offd_j,HYPRE_Int * CF_marker,HYPRE_Int * CF_marker_offd,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int * dof_func_offd,HYPRE_Int * P_diag_i,HYPRE_Int * P_diag_j,HYPRE_Real * P_diag_data,HYPRE_Int * P_offd_i,HYPRE_Int * P_offd_j,HYPRE_Real * P_offd_data,HYPRE_Int * fine_to_coarse)496 hypre_BoomerAMGBuildDirInterp_getcoef( HYPRE_Int nr_of_rows,
497 HYPRE_Int *A_diag_i,
498 HYPRE_Int *A_diag_j,
499 HYPRE_Real *A_diag_data,
500 HYPRE_Int *A_offd_i,
501 HYPRE_Int *A_offd_j,
502 HYPRE_Real *A_offd_data,
503 HYPRE_Int *Soc_diag_j,
504 HYPRE_Int *Soc_offd_j,
505 HYPRE_Int *CF_marker,
506 HYPRE_Int *CF_marker_offd,
507 HYPRE_Int num_functions,
508 HYPRE_Int *dof_func,
509 HYPRE_Int *dof_func_offd,
510 HYPRE_Int *P_diag_i,
511 HYPRE_Int *P_diag_j,
512 HYPRE_Real *P_diag_data,
513 HYPRE_Int *P_offd_i,
514 HYPRE_Int *P_offd_j,
515 HYPRE_Real *P_offd_data,
516 HYPRE_Int *fine_to_coarse )
517 {
518 /*-----------------------------------------------------------------------*/
519 /* Compute interpolation matrix, P
520
521 Input: nr_of_rows - Number of rows in matrix (local in processor)
522 A_diag_i, A_diag_j, A_diag_data - CSR representation of A_diag
523 A_offd_i, A_offd_j, A_offd_data - CSR representation of A_offd
524 S_diag_i, S_diag_j - CSR representation of S_diag
525 S_offd_i, S_offd_j - CSR representation of S_offd
526 CF_marker - Coarse/Fine flags for indices (rows) in this processor
527 CF_marker_offd - Coarse/Fine flags for indices (rows) not in this processor
528 num_function - Number of degrees of freedom per grid point
529 dof_func - vector over nonzero elements of A_diag, indicating the degree of freedom
530 dof_func_offd - vector over nonzero elements of A_offd, indicating the degree of freedom
531 fine_to_coarse - Vector of length nr_of_rows-1.
532
533 Output: P_diag_j - Column indices in CSR representation of P_diag
534 P_diag_data - Matrix elements in CSR representation of P_diag
535 P_offd_j - Column indices in CSR representation of P_offd
536 P_offd_data - Matrix elements in CSR representation of P_diag
537 */
538 /*-----------------------------------------------------------------------*/
539
540 HYPRE_Int i = hypre_cuda_get_grid_warp_id<1,1>();
541
542 if (i >= nr_of_rows)
543 {
544 return;
545 }
546
547 HYPRE_Int lane = hypre_cuda_get_lane_id<1>();
548
549 HYPRE_Int k, dof_func_i;
550
551 if (lane == 0)
552 {
553 k = read_only_load(CF_marker + i);
554 }
555 k = __shfl_sync(HYPRE_WARP_FULL_MASK, k, 0);
556
557 /*--------------------------------------------------------------------
558 * If i is a C-point, interpolation is the identity.
559 *--------------------------------------------------------------------*/
560 if (k > 0)
561 {
562 if (lane == 0)
563 {
564 const HYPRE_Int ind = read_only_load(&P_diag_i[i]);
565 P_diag_j[ind] = read_only_load(&fine_to_coarse[i]);
566 P_diag_data[ind] = 1.0;
567 }
568
569 return;
570 }
571
572 /*--------------------------------------------------------------------
573 * Point is f-point, use direct interpolation
574 *--------------------------------------------------------------------*/
575 if (num_functions > 1 && dof_func != NULL)
576 {
577 if (lane == 0)
578 {
579 dof_func_i = read_only_load(&dof_func[i]);
580 }
581 dof_func_i = __shfl_sync(HYPRE_WARP_FULL_MASK, dof_func_i, 0);
582 }
583
584 HYPRE_Real diagonal = 0.0, sum_N_pos = 0.0, sum_N_neg = 0.0, sum_P_pos = 0.0, sum_P_neg = 0.0;
585
586 /* diag part */
587 HYPRE_Int p_diag_A, q_diag_A, p_diag_P, q_diag_P;
588 if (lane < 2)
589 {
590 p_diag_A = read_only_load(A_diag_i + i + lane);
591 p_diag_P = read_only_load(P_diag_i + i + lane);
592 }
593 q_diag_A = __shfl_sync(HYPRE_WARP_FULL_MASK, p_diag_A, 1);
594 p_diag_A = __shfl_sync(HYPRE_WARP_FULL_MASK, p_diag_A, 0);
595 q_diag_P = __shfl_sync(HYPRE_WARP_FULL_MASK, p_diag_P, 1);
596 p_diag_P = __shfl_sync(HYPRE_WARP_FULL_MASK, p_diag_P, 0);
597
598 k = p_diag_P;
599 for (HYPRE_Int j = p_diag_A + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q_diag_A); j += HYPRE_WARP_SIZE)
600 {
601 HYPRE_Int col, sum, pos;
602 HYPRE_Int is_SC = 0; /* if is a Strong-C */
603 HYPRE_Complex val;
604
605 if (j < q_diag_A)
606 {
607 col = read_only_load(&A_diag_j[j]);
608
609 if (i == col)
610 {
611 diagonal = read_only_load(&A_diag_data[j]);
612 }
613 else if ( num_functions == 1 || read_only_load(&dof_func[col]) == dof_func_i )
614 {
615 val = read_only_load(&A_diag_data[j]);
616
617 if (val > 0.0)
618 {
619 sum_N_pos += val;
620 }
621 else
622 {
623 sum_N_neg += val;
624 }
625
626 is_SC = read_only_load(&Soc_diag_j[j]) > -1 && read_only_load(&CF_marker[col]) > 0;
627
628 if (is_SC)
629 {
630 if (val > 0.0)
631 {
632 sum_P_pos += val;
633 }
634 else
635 {
636 sum_P_neg += val;
637 }
638 }
639 }
640 }
641
642 pos = warp_prefix_sum(lane, is_SC, sum);
643
644 if (is_SC)
645 {
646 P_diag_data[k + pos] = val;
647 P_diag_j[k + pos] = read_only_load(&fine_to_coarse[col]);
648 }
649 k += sum;
650 }
651
652 hypre_device_assert(k == q_diag_P);
653
654 /* offd part */
655 HYPRE_Int p_offd_A, q_offd_A, p_offd_P, q_offd_P;
656 if (lane < 2)
657 {
658 p_offd_A = read_only_load(A_offd_i + i + lane);
659 p_offd_P = read_only_load(P_offd_i + i + lane);
660 }
661 q_offd_A = __shfl_sync(HYPRE_WARP_FULL_MASK, p_offd_A, 1);
662 p_offd_A = __shfl_sync(HYPRE_WARP_FULL_MASK, p_offd_A, 0);
663 q_offd_P = __shfl_sync(HYPRE_WARP_FULL_MASK, p_offd_P, 1);
664 p_offd_P = __shfl_sync(HYPRE_WARP_FULL_MASK, p_offd_P, 0);
665
666 k = p_offd_P;
667 for (HYPRE_Int j = p_offd_A + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q_offd_A); j += HYPRE_WARP_SIZE)
668 {
669 HYPRE_Int col, sum, pos;
670 HYPRE_Int is_SC = 0; /* if is a Strong-C */
671 HYPRE_Complex val;
672
673 if (j < q_offd_A)
674 {
675 col = read_only_load(&A_offd_j[j]);
676
677 if ( num_functions == 1 || read_only_load(&dof_func_offd[col]) == dof_func_i )
678 {
679 val = read_only_load(&A_offd_data[j]);
680
681 if (val > 0.0)
682 {
683 sum_N_pos += val;
684 }
685 else
686 {
687 sum_N_neg += val;
688 }
689
690 is_SC = read_only_load(&Soc_offd_j[j]) > -1 && read_only_load(&CF_marker_offd[col]) > 0;
691
692 if (is_SC)
693 {
694 if (val > 0.0)
695 {
696 sum_P_pos += val;
697 }
698 else
699 {
700 sum_P_neg += val;
701 }
702 }
703 }
704 }
705
706 pos = warp_prefix_sum(lane, is_SC, sum);
707
708 if (is_SC)
709 {
710 P_offd_data[k + pos] = val;
711 P_offd_j[k + pos] = col;
712 }
713 k += sum;
714 }
715
716 hypre_device_assert(k == q_offd_P);
717
718 diagonal = warp_allreduce_sum(diagonal);
719 sum_N_pos = warp_allreduce_sum(sum_N_pos);
720 sum_N_neg = warp_allreduce_sum(sum_N_neg);
721 sum_P_pos = warp_allreduce_sum(sum_P_pos);
722 sum_P_neg = warp_allreduce_sum(sum_P_neg);
723
724 HYPRE_Complex alfa = 1.0, beta = 1.0;
725
726 if (sum_P_neg)
727 {
728 alfa = sum_N_neg / (sum_P_neg * diagonal);
729 }
730
731 if (sum_P_pos)
732 {
733 beta = sum_N_pos / (sum_P_pos * diagonal);
734 }
735
736 for (HYPRE_Int j = p_diag_P + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q_diag_P); j += HYPRE_WARP_SIZE)
737 {
738 /* if (P_diag_data[j] > 0.0)
739 P_diag_data[j] *= -beta;
740 else
741 P_diag_data[j] *= -alfa; */
742 if (j < q_diag_P)
743 {
744 P_diag_data[j] *= (P_diag_data[j] > 0.0) * (alfa-beta) - alfa;
745 }
746 }
747
748 for (HYPRE_Int j = p_offd_P + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q_offd_P); j += HYPRE_WARP_SIZE)
749 {
750 /* if (P_offd_data[indp]> 0)
751 P_offd_data[indp] *= -beta;
752 else
753 P_offd_data[indp] *= -alfa; */
754 if (j < q_offd_P)
755 {
756 P_offd_data[j] *= (P_offd_data[j] > 0.0) * (alfa-beta) - alfa;
757 }
758 }
759 }
760
761 /*-----------------------------------------------------------------------*
762 *-----------------------------------------------------------------------*/
763 __global__ void
hypre_BoomerAMGBuildDirInterp_getcoef_v2(HYPRE_Int nr_of_rows,HYPRE_Int * A_diag_i,HYPRE_Int * A_diag_j,HYPRE_Real * A_diag_data,HYPRE_Int * A_offd_i,HYPRE_Int * A_offd_j,HYPRE_Real * A_offd_data,HYPRE_Int * Soc_diag_j,HYPRE_Int * Soc_offd_j,HYPRE_Int * CF_marker,HYPRE_Int * CF_marker_offd,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int * dof_func_offd,HYPRE_Int * P_diag_i,HYPRE_Int * P_diag_j,HYPRE_Real * P_diag_data,HYPRE_Int * P_offd_i,HYPRE_Int * P_offd_j,HYPRE_Real * P_offd_data,HYPRE_Int * fine_to_coarse)764 hypre_BoomerAMGBuildDirInterp_getcoef_v2( HYPRE_Int nr_of_rows,
765 HYPRE_Int *A_diag_i,
766 HYPRE_Int *A_diag_j,
767 HYPRE_Real *A_diag_data,
768 HYPRE_Int *A_offd_i,
769 HYPRE_Int *A_offd_j,
770 HYPRE_Real *A_offd_data,
771 HYPRE_Int *Soc_diag_j,
772 HYPRE_Int *Soc_offd_j,
773 HYPRE_Int *CF_marker,
774 HYPRE_Int *CF_marker_offd,
775 HYPRE_Int num_functions,
776 HYPRE_Int *dof_func,
777 HYPRE_Int *dof_func_offd,
778 HYPRE_Int *P_diag_i,
779 HYPRE_Int *P_diag_j,
780 HYPRE_Real *P_diag_data,
781 HYPRE_Int *P_offd_i,
782 HYPRE_Int *P_offd_j,
783 HYPRE_Real *P_offd_data,
784 HYPRE_Int *fine_to_coarse )
785 {
786 /*-----------------------------------------------------------------------*/
787 /* Compute interpolation matrix, P
788
789 Input: nr_of_rows - Number of rows in matrix (local in processor)
790 A_diag_i, A_diag_j, A_diag_data - CSR representation of A_diag
791 A_offd_i, A_offd_j, A_offd_data - CSR representation of A_offd
792 S_diag_i, S_diag_j - CSR representation of S_diag
793 S_offd_i, S_offd_j - CSR representation of S_offd
794 CF_marker - Coarse/Fine flags for indices (rows) in this processor
795 CF_marker_offd - Coarse/Fine flags for indices (rows) not in this processor
796 num_function - Number of degrees of freedom per grid point
797 dof_func - vector over nonzero elements of A_diag, indicating the degree of freedom
798 dof_func_offd - vector over nonzero elements of A_offd, indicating the degree of freedom
799 fine_to_coarse - Vector of length nr_of_rows-1.
800
801 Output: P_diag_j - Column indices in CSR representation of P_diag
802 P_diag_data - Matrix elements in CSR representation of P_diag
803 P_offd_j - Column indices in CSR representation of P_offd
804 P_offd_data - Matrix elements in CSR representation of P_diag
805 */
806 /*-----------------------------------------------------------------------*/
807
808 HYPRE_Int i = hypre_cuda_get_grid_warp_id<1,1>();
809
810 if (i >= nr_of_rows)
811 {
812 return;
813 }
814
815 HYPRE_Int lane = hypre_cuda_get_lane_id<1>();
816
817 HYPRE_Int k, dof_func_i;
818
819 if (lane == 0)
820 {
821 k = read_only_load(CF_marker + i);
822 }
823 k = __shfl_sync(HYPRE_WARP_FULL_MASK, k, 0);
824
825 /*--------------------------------------------------------------------
826 * If i is a C-point, interpolation is the identity.
827 *--------------------------------------------------------------------*/
828 if (k > 0)
829 {
830 if (lane == 0)
831 {
832 const HYPRE_Int ind = read_only_load(&P_diag_i[i]);
833 P_diag_j[ind] = read_only_load(&fine_to_coarse[i]);
834 P_diag_data[ind] = 1.0;
835 }
836
837 return;
838 }
839
840 /*--------------------------------------------------------------------
841 * Point is f-point, use direct interpolation
842 *--------------------------------------------------------------------*/
843 if (num_functions > 1 && dof_func != NULL)
844 {
845 if (lane == 0)
846 {
847 dof_func_i = read_only_load(&dof_func[i]);
848 }
849 dof_func_i = __shfl_sync(HYPRE_WARP_FULL_MASK, dof_func_i, 0);
850 }
851
852 HYPRE_Real diagonal = 0.0, sum_F = 0.0;
853
854 /* diag part */
855 HYPRE_Int p_diag_A, q_diag_A, p_diag_P, q_diag_P;
856 if (lane < 2)
857 {
858 p_diag_A = read_only_load(A_diag_i + i + lane);
859 p_diag_P = read_only_load(P_diag_i + i + lane);
860 }
861 q_diag_A = __shfl_sync(HYPRE_WARP_FULL_MASK, p_diag_A, 1);
862 p_diag_A = __shfl_sync(HYPRE_WARP_FULL_MASK, p_diag_A, 0);
863 q_diag_P = __shfl_sync(HYPRE_WARP_FULL_MASK, p_diag_P, 1);
864 p_diag_P = __shfl_sync(HYPRE_WARP_FULL_MASK, p_diag_P, 0);
865
866 k = p_diag_P;
867 for (HYPRE_Int j = p_diag_A + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q_diag_A); j += HYPRE_WARP_SIZE)
868 {
869 HYPRE_Int col, sum, pos;
870 HYPRE_Int is_SC = 0; /* if is a Strong-C */
871 HYPRE_Complex val;
872
873 if (j < q_diag_A)
874 {
875 col = read_only_load(&A_diag_j[j]);
876
877 if (i == col)
878 {
879 diagonal = read_only_load(&A_diag_data[j]);
880 }
881 else if ( num_functions == 1 || read_only_load(&dof_func[col]) == dof_func_i )
882 {
883 val = read_only_load(&A_diag_data[j]);
884 if (read_only_load(&Soc_diag_j[j]) > -1)
885 {
886 if (read_only_load(&CF_marker[col]) > 0)
887 {
888 is_SC = 1;
889 }
890 else
891 {
892 sum_F += val;
893 }
894 }
895 else
896 {
897 diagonal += val;
898 }
899 }
900 }
901
902 pos = warp_prefix_sum(lane, is_SC, sum);
903
904 if (is_SC)
905 {
906 P_diag_data[k + pos] = val;
907 P_diag_j[k + pos] = read_only_load(&fine_to_coarse[col]);
908 }
909 k += sum;
910 }
911
912 hypre_device_assert(k == q_diag_P);
913
914 /* offd part */
915 HYPRE_Int p_offd_A, q_offd_A, p_offd_P, q_offd_P;
916 if (lane < 2)
917 {
918 p_offd_A = read_only_load(A_offd_i + i + lane);
919 p_offd_P = read_only_load(P_offd_i + i + lane);
920 }
921 q_offd_A = __shfl_sync(HYPRE_WARP_FULL_MASK, p_offd_A, 1);
922 p_offd_A = __shfl_sync(HYPRE_WARP_FULL_MASK, p_offd_A, 0);
923 q_offd_P = __shfl_sync(HYPRE_WARP_FULL_MASK, p_offd_P, 1);
924 p_offd_P = __shfl_sync(HYPRE_WARP_FULL_MASK, p_offd_P, 0);
925
926 k = p_offd_P;
927 for (HYPRE_Int j = p_offd_A + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q_offd_A); j += HYPRE_WARP_SIZE)
928 {
929 HYPRE_Int col, sum, pos;
930 HYPRE_Int is_SC = 0; /* if is a Strong-C */
931 HYPRE_Complex val;
932
933 if (j < q_offd_A)
934 {
935 col = read_only_load(&A_offd_j[j]);
936
937 if ( num_functions == 1 || read_only_load(&dof_func_offd[col]) == dof_func_i )
938 {
939 val = read_only_load(&A_offd_data[j]);
940 if (read_only_load(&Soc_offd_j[j]) > -1)
941 {
942 if (read_only_load(&CF_marker_offd[col]) > 0)
943 {
944 is_SC = 1;
945 }
946 else
947 {
948 sum_F += val;
949 }
950 }
951 else
952 {
953 diagonal += val;
954 }
955 }
956 }
957
958 pos = warp_prefix_sum(lane, is_SC, sum);
959
960 if (is_SC)
961 {
962 P_offd_data[k + pos] = val;
963 P_offd_j[k + pos] = col;
964 }
965 k += sum;
966 }
967
968 hypre_device_assert(k == q_offd_P);
969
970 diagonal = warp_allreduce_sum(diagonal);
971 sum_F = warp_allreduce_sum(sum_F);
972
973 HYPRE_Complex beta = sum_F / (q_diag_P - p_diag_P + q_offd_P - p_offd_P);
974
975 for (HYPRE_Int j = p_diag_P + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q_diag_P); j += HYPRE_WARP_SIZE)
976 {
977 /* if (P_diag_data[j] > 0.0)
978 P_diag_data[j] *= -beta;
979 else
980 P_diag_data[j] *= -alfa; */
981 if (j < q_diag_P)
982 {
983 P_diag_data[j] = -(P_diag_data[j] + beta) / diagonal;
984 }
985 }
986
987 for (HYPRE_Int j = p_offd_P + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q_offd_P); j += HYPRE_WARP_SIZE)
988 {
989 /* if (P_offd_data[indp]> 0)
990 P_offd_data[indp] *= -beta;
991 else
992 P_offd_data[indp] *= -alfa; */
993 if (j < q_offd_P)
994 {
995 P_offd_data[j] = -(P_offd_data[j] + beta) / diagonal;
996 }
997 }
998 }
999
1000 HYPRE_Int
hypre_BoomerAMGBuildInterpOnePntDevice(hypre_ParCSRMatrix * A,HYPRE_Int * CF_marker,hypre_ParCSRMatrix * S,HYPRE_BigInt * num_cpts_global,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int debug_flag,hypre_ParCSRMatrix ** P_ptr)1001 hypre_BoomerAMGBuildInterpOnePntDevice( hypre_ParCSRMatrix *A,
1002 HYPRE_Int *CF_marker,
1003 hypre_ParCSRMatrix *S,
1004 HYPRE_BigInt *num_cpts_global,
1005 HYPRE_Int num_functions,
1006 HYPRE_Int *dof_func,
1007 HYPRE_Int debug_flag,
1008 hypre_ParCSRMatrix **P_ptr)
1009 {
1010 MPI_Comm comm = hypre_ParCSRMatrixComm(A);
1011 hypre_ParCSRCommPkg *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
1012 hypre_ParCSRCommHandle *comm_handle;
1013
1014 hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
1015 HYPRE_Int *A_diag_i = hypre_CSRMatrixI(A_diag);
1016 HYPRE_Int *A_strong_diag_j = hypre_ParCSRMatrixSocDiagJ(S);
1017 HYPRE_Complex *A_diag_a = hypre_CSRMatrixData(A_diag);
1018
1019 hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
1020 HYPRE_Int *A_offd_i = hypre_CSRMatrixI(A_offd);
1021 HYPRE_Int *A_strong_offd_j = hypre_ParCSRMatrixSocOffdJ(S);
1022 HYPRE_Complex *A_offd_a = hypre_CSRMatrixData(A_offd);
1023
1024 HYPRE_Int num_cols_A_offd = hypre_CSRMatrixNumCols(A_offd);
1025
1026 /* Interpolation matrix P */
1027 hypre_ParCSRMatrix *P;
1028 /* csr's */
1029 hypre_CSRMatrix *P_diag;
1030 hypre_CSRMatrix *P_offd;
1031 /* arrays */
1032 HYPRE_Real *P_diag_data;
1033 HYPRE_Int *P_diag_i;
1034 HYPRE_Int *P_diag_j;
1035 HYPRE_Int *P_diag_j_temp;
1036 HYPRE_Int *P_diag_j_temp_compressed;
1037 HYPRE_Real *P_offd_data;
1038 HYPRE_Int *P_offd_i;
1039 HYPRE_Int *P_offd_j;
1040 HYPRE_Int *P_offd_j_temp;
1041 HYPRE_Int *P_offd_j_temp_compressed;
1042 HYPRE_Int num_cols_P_offd;
1043 HYPRE_BigInt *col_map_offd_P = NULL;
1044 HYPRE_BigInt *col_map_offd_P_device = NULL;
1045 /* CF marker off-diag part */
1046 HYPRE_Int *CF_marker_offd = NULL;
1047 /* nnz */
1048 HYPRE_Int nnz_diag, nnz_offd;
1049 /* local size */
1050 HYPRE_Int n_fine = hypre_CSRMatrixNumRows(A_diag);
1051 /* fine to coarse mapping: diag part and offd part */
1052 HYPRE_Int *fine_to_coarse;
1053 HYPRE_BigInt *fine_to_coarse_offd = NULL;
1054 HYPRE_BigInt total_global_cpts, my_first_cpt;
1055 HYPRE_Int my_id, num_procs;
1056 HYPRE_Int num_sends;
1057 HYPRE_Int *int_buf_data = NULL;
1058 HYPRE_BigInt *big_int_buf_data = NULL;
1059 //HYPRE_Int col_start = hypre_ParCSRMatrixFirstRowIndex(A);
1060 //HYPRE_Int col_end = col_start + n_fine;
1061 /* arrays for compressing P_diag and P_offd col indices and data */
1062 HYPRE_Int *diag_compress_marker;
1063 HYPRE_Int *offd_compress_marker;
1064
1065 hypre_MPI_Comm_size(comm, &num_procs);
1066 hypre_MPI_Comm_rank(comm,&my_id);
1067
1068 my_first_cpt = num_cpts_global[0];
1069 if (my_id == (num_procs -1)) total_global_cpts = num_cpts_global[1];
1070 hypre_MPI_Bcast(&total_global_cpts, 1, HYPRE_MPI_BIG_INT, num_procs-1, comm);
1071
1072 /* fine to coarse mapping */
1073 fine_to_coarse = hypre_TAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_DEVICE);
1074 HYPRE_THRUST_CALL( exclusive_scan,
1075 thrust::make_transform_iterator(CF_marker, is_nonnegative<HYPRE_Int>()),
1076 thrust::make_transform_iterator(CF_marker + n_fine, is_nonnegative<HYPRE_Int>()),
1077 fine_to_coarse,
1078 HYPRE_Int(0) ); /* *MUST* pass init value since input and output types diff. */
1079
1080 /*-------------------------------------------------------------------
1081 * Get the CF_marker data for the off-processor columns
1082 *-------------------------------------------------------------------*/
1083 if (num_cols_A_offd)
1084 {
1085 CF_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_A_offd, HYPRE_MEMORY_DEVICE);
1086 }
1087 /* if CommPkg of A is not present, create it */
1088 if (!comm_pkg)
1089 {
1090 hypre_MatvecCommPkgCreate(A);
1091 comm_pkg = hypre_ParCSRMatrixCommPkg(A);
1092 }
1093 /* number of sends to do (number of procs) */
1094 num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
1095 /* send buffer, of size send_map_starts[num_sends]),
1096 * i.e., number of entries to send */
1097 int_buf_data = hypre_CTAlloc(HYPRE_Int, hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends), HYPRE_MEMORY_DEVICE);
1098
1099 /* copy CF markers of elements to send to buffer */
1100 HYPRE_THRUST_CALL( gather,
1101 hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg),
1102 hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg) +
1103 hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
1104 CF_marker,
1105 int_buf_data );
1106 /* create a handle to start communication. 11: for integer */
1107 comm_handle = hypre_ParCSRCommHandleCreate_v2(11, comm_pkg, HYPRE_MEMORY_DEVICE, int_buf_data, HYPRE_MEMORY_DEVICE, CF_marker_offd);
1108 /* destroy the handle to finish communication */
1109 hypre_ParCSRCommHandleDestroy(comm_handle);
1110 hypre_TFree(int_buf_data, HYPRE_MEMORY_DEVICE);
1111
1112 /*-----------------------------------------------------------------------
1113 * First Pass: Determine size of P and fill in fine_to_coarse mapping,
1114 * and find the most strongly influencing C-pt for each F-pt
1115 *-----------------------------------------------------------------------*/
1116
1117 P_diag_i = hypre_CTAlloc(HYPRE_Int, n_fine+1, HYPRE_MEMORY_DEVICE);
1118 P_offd_i = hypre_CTAlloc(HYPRE_Int, n_fine+1, HYPRE_MEMORY_DEVICE);
1119
1120 diag_compress_marker = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_DEVICE);
1121 offd_compress_marker = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_DEVICE);
1122
1123 /* Overallocate here and compress later */
1124 P_diag_j_temp = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_DEVICE);
1125 P_offd_j_temp = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_DEVICE);
1126
1127 dim3 bDim = hypre_GetDefaultCUDABlockDimension();
1128 dim3 gDim = hypre_GetDefaultCUDAGridDimension(n_fine, "warp", bDim);
1129
1130 HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildInterpOnePnt_getnnz, gDim, bDim,
1131 n_fine, A_diag_i, A_strong_diag_j, A_diag_a, A_offd_i, A_strong_offd_j,
1132 A_offd_a, CF_marker, CF_marker_offd, diag_compress_marker,
1133 offd_compress_marker, P_diag_i, P_diag_j_temp, P_offd_i, P_offd_j_temp);
1134
1135 /*-----------------------------------------------------------------------
1136 * Send and receive fine_to_coarse info.
1137 *-----------------------------------------------------------------------*/
1138 fine_to_coarse_offd = hypre_CTAlloc(HYPRE_BigInt, num_cols_A_offd,HYPRE_MEMORY_DEVICE);
1139 big_int_buf_data = hypre_CTAlloc(HYPRE_BigInt, hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends), HYPRE_MEMORY_DEVICE);
1140 HYPRE_THRUST_CALL( gather,
1141 hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg),
1142 hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg) +
1143 hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
1144 fine_to_coarse,
1145 big_int_buf_data );
1146 HYPRE_THRUST_CALL( transform,
1147 big_int_buf_data,
1148 big_int_buf_data + hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
1149 thrust::make_constant_iterator(my_first_cpt),
1150 big_int_buf_data,
1151 thrust::plus<HYPRE_BigInt>() );
1152 comm_handle = hypre_ParCSRCommHandleCreate_v2(21, comm_pkg, HYPRE_MEMORY_DEVICE, big_int_buf_data, HYPRE_MEMORY_DEVICE, fine_to_coarse_offd);
1153 hypre_ParCSRCommHandleDestroy(comm_handle);
1154 hypre_TFree(big_int_buf_data, HYPRE_MEMORY_DEVICE);
1155
1156 /*-----------------------------------------------------------------------
1157 * Fill values and finish setting up P.
1158 *-----------------------------------------------------------------------*/
1159
1160 /* scan P_diag_i (which has number of nonzeros in each row) to get row indices */
1161 hypreDevice_IntegerExclusiveScan(n_fine+1, P_diag_i);
1162 hypreDevice_IntegerExclusiveScan(n_fine+1, P_offd_i);
1163
1164 /* get the number of nonzeros and allocate column index and data arrays */
1165 hypre_TMemcpy(&nnz_diag, &P_diag_i[n_fine], HYPRE_Int, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);
1166 hypre_TMemcpy(&nnz_offd, &P_offd_i[n_fine], HYPRE_Int, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);
1167
1168 P_diag_j = hypre_TAlloc(HYPRE_Int, nnz_diag, HYPRE_MEMORY_DEVICE);
1169 P_diag_data = hypre_TAlloc(HYPRE_Real, nnz_diag, HYPRE_MEMORY_DEVICE);
1170
1171
1172 P_offd_j = hypre_TAlloc(HYPRE_Int, nnz_offd, HYPRE_MEMORY_DEVICE);
1173 P_offd_data = hypre_TAlloc(HYPRE_Real, nnz_offd, HYPRE_MEMORY_DEVICE);
1174
1175 /* set data values to 1.0 */
1176 HYPRE_THRUST_CALL( fill_n,
1177 P_diag_data,
1178 nnz_diag,
1179 1.0 );
1180 HYPRE_THRUST_CALL( fill_n,
1181 P_offd_data,
1182 nnz_offd,
1183 1.0 );
1184
1185 /* compress temporary column indices */
1186 P_diag_j_temp_compressed = hypre_TAlloc(HYPRE_Int, nnz_diag, HYPRE_MEMORY_DEVICE);
1187 P_offd_j_temp_compressed = hypre_TAlloc(HYPRE_Int, nnz_offd, HYPRE_MEMORY_DEVICE);
1188
1189 HYPRE_THRUST_CALL( copy_if,
1190 P_diag_j_temp,
1191 P_diag_j_temp + n_fine,
1192 diag_compress_marker,
1193 P_diag_j_temp_compressed,
1194 equal<HYPRE_Int>(1) );
1195 HYPRE_THRUST_CALL( copy_if,
1196 P_offd_j_temp,
1197 P_offd_j_temp + n_fine,
1198 offd_compress_marker,
1199 P_offd_j_temp_compressed,
1200 equal<HYPRE_Int>(1) );
1201
1202 /* map the diag column indices */
1203 HYPRE_THRUST_CALL( gather,
1204 P_diag_j_temp_compressed,
1205 P_diag_j_temp_compressed + nnz_diag,
1206 fine_to_coarse,
1207 P_diag_j );
1208
1209 /* mark the offd indices for P as a subset of offd indices of A */
1210 HYPRE_Int *mark_P_offd_idx = hypre_CTAlloc(HYPRE_Int, num_cols_A_offd, HYPRE_MEMORY_DEVICE);
1211 // note that scatter is usually not safe if the same index appears more than once in the map,
1212 // but here we are just scattering constant values, so this is safe
1213 HYPRE_THRUST_CALL( scatter,
1214 thrust::make_constant_iterator(1),
1215 thrust::make_constant_iterator(1) + nnz_offd,
1216 P_offd_j_temp_compressed,
1217 mark_P_offd_idx );
1218 num_cols_P_offd = HYPRE_THRUST_CALL(reduce, mark_P_offd_idx, mark_P_offd_idx + num_cols_A_offd);
1219
1220 /* get a mapping from P offd indices to A offd indices */
1221 /* offd_map_P_to_A[ P offd idx ] = A offd idx */
1222 HYPRE_Int *offd_map_P_to_A = hypre_CTAlloc(HYPRE_Int, num_cols_P_offd, HYPRE_MEMORY_DEVICE);
1223 HYPRE_THRUST_CALL( copy_if,
1224 thrust::make_counting_iterator(0),
1225 thrust::make_counting_iterator(num_cols_A_offd),
1226 mark_P_offd_idx,
1227 offd_map_P_to_A,
1228 equal<HYPRE_Int>(1) );
1229 hypre_TFree(mark_P_offd_idx, HYPRE_MEMORY_DEVICE);
1230
1231 /* also get an inverse mapping from A offd indices to P offd indices */
1232 /* offd_map_A_to_P[ A offd idx ] = -1 if not a P idx, else P offd idx */
1233 HYPRE_Int *offd_map_A_to_P = hypre_TAlloc(HYPRE_Int, num_cols_A_offd, HYPRE_MEMORY_DEVICE);
1234 HYPRE_THRUST_CALL( fill_n,
1235 offd_map_A_to_P,
1236 num_cols_A_offd,
1237 -1 );
1238 HYPRE_THRUST_CALL( scatter,
1239 thrust::make_counting_iterator(0),
1240 thrust::make_counting_iterator(num_cols_P_offd),
1241 offd_map_P_to_A,
1242 offd_map_A_to_P );
1243
1244 /* use inverse mapping above to map P_offd_j */
1245 HYPRE_THRUST_CALL( gather,
1246 P_offd_j_temp_compressed,
1247 P_offd_j_temp_compressed + nnz_offd,
1248 offd_map_A_to_P,
1249 P_offd_j );
1250 hypre_TFree(P_offd_j_temp_compressed, HYPRE_MEMORY_DEVICE);
1251 hypre_TFree(offd_map_A_to_P, HYPRE_MEMORY_DEVICE);
1252
1253 /* setup col_map_offd for P */
1254 col_map_offd_P_device = hypre_CTAlloc(HYPRE_BigInt, num_cols_P_offd, HYPRE_MEMORY_DEVICE);
1255 col_map_offd_P = hypre_CTAlloc(HYPRE_BigInt, num_cols_P_offd, HYPRE_MEMORY_HOST);
1256 HYPRE_THRUST_CALL( gather,
1257 offd_map_P_to_A,
1258 offd_map_P_to_A + num_cols_P_offd,
1259 fine_to_coarse_offd,
1260 col_map_offd_P_device);
1261 hypre_TMemcpy(col_map_offd_P, col_map_offd_P_device, HYPRE_BigInt, num_cols_P_offd, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);
1262 hypre_TFree(offd_map_P_to_A, HYPRE_MEMORY_DEVICE);
1263 hypre_TFree(col_map_offd_P_device, HYPRE_MEMORY_DEVICE);
1264
1265 /* Now, we should have everything of Parcsr matrix P */
1266 P = hypre_ParCSRMatrixCreate(comm,
1267 hypre_ParCSRMatrixGlobalNumCols(A), /* global num of rows */
1268 total_global_cpts, /* global num of cols */
1269 hypre_ParCSRMatrixColStarts(A), /* row_starts */
1270 num_cpts_global, /* col_starts */
1271 num_cols_P_offd, /* num cols offd */
1272 nnz_diag,
1273 nnz_offd);
1274
1275 P_diag = hypre_ParCSRMatrixDiag(P);
1276 hypre_CSRMatrixData(P_diag) = P_diag_data;
1277 hypre_CSRMatrixI(P_diag) = P_diag_i;
1278 hypre_CSRMatrixJ(P_diag) = P_diag_j;
1279
1280 P_offd = hypre_ParCSRMatrixOffd(P);
1281 hypre_CSRMatrixData(P_offd) = P_offd_data;
1282 hypre_CSRMatrixI(P_offd) = P_offd_i;
1283 hypre_CSRMatrixJ(P_offd) = P_offd_j;
1284
1285 hypre_ParCSRMatrixColMapOffd(P) = col_map_offd_P;
1286
1287 /* create CommPkg of P */
1288 hypre_MatvecCommPkgCreate(P);
1289
1290 *P_ptr = P;
1291
1292 /* free workspace */
1293 hypre_TFree(CF_marker_offd, HYPRE_MEMORY_DEVICE);
1294 hypre_TFree(fine_to_coarse, HYPRE_MEMORY_DEVICE);
1295 hypre_TFree(fine_to_coarse_offd, HYPRE_MEMORY_DEVICE);
1296
1297 return hypre_error_flag;
1298 }
1299
1300 /*-----------------------------------------------------------------------*/
1301 __global__ void
hypre_BoomerAMGBuildInterpOnePnt_getnnz(HYPRE_Int nr_of_rows,HYPRE_Int * A_diag_i,HYPRE_Int * A_strong_diag_j,HYPRE_Complex * A_diag_a,HYPRE_Int * A_offd_i,HYPRE_Int * A_strong_offd_j,HYPRE_Complex * A_offd_a,HYPRE_Int * CF_marker,HYPRE_Int * CF_marker_offd,HYPRE_Int * diag_compress_marker,HYPRE_Int * offd_compress_marker,HYPRE_Int * P_diag_i,HYPRE_Int * P_diag_j,HYPRE_Int * P_offd_i,HYPRE_Int * P_offd_j)1302 hypre_BoomerAMGBuildInterpOnePnt_getnnz( HYPRE_Int nr_of_rows,
1303 HYPRE_Int *A_diag_i,
1304 HYPRE_Int *A_strong_diag_j,
1305 HYPRE_Complex *A_diag_a,
1306 HYPRE_Int *A_offd_i,
1307 HYPRE_Int *A_strong_offd_j,
1308 HYPRE_Complex *A_offd_a,
1309 HYPRE_Int *CF_marker,
1310 HYPRE_Int *CF_marker_offd,
1311 HYPRE_Int *diag_compress_marker,
1312 HYPRE_Int *offd_compress_marker,
1313 HYPRE_Int *P_diag_i,
1314 HYPRE_Int *P_diag_j,
1315 HYPRE_Int *P_offd_i,
1316 HYPRE_Int *P_offd_j)
1317 {
1318 /*-----------------------------------------------------------------------*/
1319 /* Determine size of interpolation matrix, P
1320
1321 If A is of size m x m, then P will be of size m x c where c is the
1322 number of coarse points.
1323
1324 It is assumed that S have the same global column enumeration as A
1325
1326 Input: nr_of_rows - Number of rows in matrix (local in processor)
1327 A_diag_i, A_strong_diag_j, - Arrays associated with ParCSRMatrix A
1328 A_diag_a, A_offd_i, where the column indices are taken from S
1329 A_strong_offd_j, A_offd_a and mark weak connections with negative indices
1330 CF_maker - coarse/fine marker for on-processor points
1331 CF_maker_offd - coarse/fine marker for off-processor connections
1332
1333 Output: P_diag_i - Vector where P_diag_i[i] holds the number of non-zero elements of P_diag on row i (will be 1).
1334 P_diag_i - Vector where P_diag_j[i] holds a temporary, uncompressed column indices for P_diag.
1335 P_offd_i - Vector where P_offd_i[i] holds the number of non-zero elements of P_offd on row i (will be 1).
1336 P_offd_i - Vector where P_offd_j[i] holds a temporary, uncompressed column indices for P_offd.
1337 diag_compress_marker - Array of 0s and 1s used to compress P_diag col indices and data.
1338 offd_compress_marker - Array of 0s and 1s used to compress P_offd col indices and data.
1339 */
1340 /*-----------------------------------------------------------------------*/
1341
1342 HYPRE_Int i = hypre_cuda_get_grid_warp_id<1,1>();
1343
1344 if (i >= nr_of_rows)
1345 {
1346 return;
1347 }
1348
1349 HYPRE_Int p, q;
1350 HYPRE_Int max_j_diag = -1, max_j_offd = -1;
1351 HYPRE_Int lane = hypre_cuda_get_lane_id<1>();
1352 HYPRE_Real max_diag = -1.0, max_offd = -1.0;
1353 HYPRE_Real warp_max_diag = -1.0, warp_max_offd = -1.0;
1354
1355 if (lane == 0)
1356 {
1357 p = read_only_load(CF_marker + i);
1358 }
1359 p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0);
1360
1361 /*--------------------------------------------------------------------
1362 * If i is a C-point, interpolation is the identity.
1363 *--------------------------------------------------------------------*/
1364 if (p >= 0)
1365 {
1366 if (lane == 0)
1367 {
1368 P_diag_i[i] = 1;
1369 P_diag_j[i] = i;
1370 diag_compress_marker[i] = 1;
1371 }
1372 return;
1373 }
1374
1375 /*--------------------------------------------------------------------
1376 * If i is an F-point, find strongest connected C-point,
1377 * which could be in diag or offd.
1378 *--------------------------------------------------------------------*/
1379
1380 /* diag part */
1381 if (lane < 2)
1382 {
1383 p = read_only_load(A_diag_i + i + lane);
1384 }
1385 q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1);
1386 p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0);
1387
1388 for (HYPRE_Int j = p + lane; j < q; j += HYPRE_WARP_SIZE)
1389 {
1390 /* column indices are negative for weak connections */
1391 const HYPRE_Int col = read_only_load(&A_strong_diag_j[j]);
1392 if (col >= 0)
1393 {
1394 const HYPRE_Complex val = fabs( read_only_load(&A_diag_a[j]) );
1395 if ( read_only_load(&CF_marker[col]) > 0 && val > max_diag )
1396 {
1397 max_diag = val;
1398 max_j_diag = col;
1399 }
1400 }
1401 }
1402 warp_max_diag = warp_allreduce_max(max_diag);
1403
1404 /* offd part */
1405 if (lane < 2)
1406 {
1407 p = read_only_load(A_offd_i + i + lane);
1408 }
1409 q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1);
1410 p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0);
1411
1412 for (HYPRE_Int j = p + lane; j < q; j += HYPRE_WARP_SIZE)
1413 {
1414 const HYPRE_Int col = read_only_load(&A_strong_offd_j[j]);
1415 /* column indices are negative for weak connections */
1416 if (col >= 0)
1417 {
1418 const HYPRE_Complex val = fabs( read_only_load(&A_offd_a[j]) );
1419 if ( read_only_load(&CF_marker_offd[col]) > 0 && val > max_offd )
1420 {
1421 max_offd = val;
1422 max_j_offd = col;
1423 }
1424 }
1425 }
1426 warp_max_offd = warp_allreduce_max(max_offd);
1427
1428 /*--------------------------------------------------------------------
1429 * If no max found, then there is no strongly connected C-point,
1430 * and this will be a zero row
1431 *--------------------------------------------------------------------*/
1432
1433 if (warp_max_offd < 0 && warp_max_diag < 0)
1434 {
1435 return;
1436 }
1437
1438 /*--------------------------------------------------------------------
1439 * Otherwise, find the column index in either diag or offd
1440 *--------------------------------------------------------------------*/
1441
1442 if (warp_max_offd > warp_max_diag)
1443 {
1444 if (warp_max_offd != max_offd)
1445 {
1446 max_j_offd = -1;
1447 }
1448 max_j_offd = warp_reduce_max(max_j_offd);
1449 if (lane == 0)
1450 {
1451 P_offd_i[i] = 1;
1452 P_offd_j[i] = max_j_offd;
1453 offd_compress_marker[i] = 1;
1454 }
1455 }
1456 else
1457 {
1458 if (warp_max_diag != max_diag)
1459 {
1460 max_j_diag = -1;
1461 }
1462 max_j_diag = warp_reduce_max(max_j_diag);
1463 if (lane == 0)
1464 {
1465 P_diag_i[i] = 1;
1466 P_diag_j[i] = max_j_diag;
1467 diag_compress_marker[i] = 1;
1468 }
1469 }
1470 }
1471
1472 #endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
1473