1 /******************************************************************************
2  * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other
3  * HYPRE Project Developers. See the top-level COPYRIGHT file for details.
4  *
5  * SPDX-License-Identifier: (Apache-2.0 OR MIT)
6  ******************************************************************************/
7 
8 #include "_hypre_parcsr_ls.h"
9 #include "_hypre_utilities.hpp"
10 
11 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
12 
13 __global__ void hypre_BoomerAMGBuildDirInterp_getnnz( HYPRE_Int nr_of_rows, HYPRE_Int *S_diag_i, HYPRE_Int *S_diag_j, HYPRE_Int *S_offd_i, HYPRE_Int *S_offd_j, HYPRE_Int *CF_marker, HYPRE_Int *CF_marker_offd, HYPRE_Int num_functions, HYPRE_Int *dof_func, HYPRE_Int *dof_func_offd, HYPRE_Int *P_diag_i, HYPRE_Int *P_offd_i);
14 
15 __global__ void hypre_BoomerAMGBuildDirInterp_getcoef( HYPRE_Int nr_of_rows, HYPRE_Int *A_diag_i, HYPRE_Int *A_diag_j, HYPRE_Real *A_diag_data, HYPRE_Int *A_offd_i, HYPRE_Int *A_offd_j, HYPRE_Real *A_offd_data, HYPRE_Int *Soc_diag_j, HYPRE_Int *Soc_offd_j, HYPRE_Int *CF_marker, HYPRE_Int *CF_marker_offd, HYPRE_Int num_functions, HYPRE_Int *dof_func, HYPRE_Int *dof_func_offd, HYPRE_Int *P_diag_i, HYPRE_Int *P_diag_j, HYPRE_Real *P_diag_data, HYPRE_Int *P_offd_i, HYPRE_Int *P_offd_j, HYPRE_Real *P_offd_data, HYPRE_Int *fine_to_coarse );
16 
17 __global__ void hypre_BoomerAMGBuildDirInterp_getcoef_v2( HYPRE_Int nr_of_rows, HYPRE_Int *A_diag_i, HYPRE_Int *A_diag_j, HYPRE_Real *A_diag_data, HYPRE_Int *A_offd_i, HYPRE_Int *A_offd_j, HYPRE_Real *A_offd_data, HYPRE_Int *Soc_diag_j, HYPRE_Int *Soc_offd_j, HYPRE_Int *CF_marker, HYPRE_Int *CF_marker_offd, HYPRE_Int num_functions, HYPRE_Int *dof_func, HYPRE_Int *dof_func_offd, HYPRE_Int *P_diag_i, HYPRE_Int *P_diag_j, HYPRE_Real *P_diag_data, HYPRE_Int *P_offd_i, HYPRE_Int *P_offd_j, HYPRE_Real *P_offd_data, HYPRE_Int *fine_to_coarse );
18 
19 __global__ void
20 hypre_BoomerAMGBuildInterpOnePnt_getnnz( HYPRE_Int nr_of_rows, HYPRE_Int *A_diag_i, HYPRE_Int *A_strong_diag_j, HYPRE_Complex *A_diag_a, HYPRE_Int *A_offd_i, HYPRE_Int *A_strong_offd_j, HYPRE_Complex *A_offd_a, HYPRE_Int *CF_marker, HYPRE_Int *CF_marker_offd, HYPRE_Int *diag_compress_marker, HYPRE_Int *offd_compress_marker, HYPRE_Int *P_diag_i, HYPRE_Int *P_diag_j, HYPRE_Int *P_offd_i, HYPRE_Int *P_offd_j);
21 
22 /*---------------------------------------------------------------------------
23  * hypre_BoomerAMGBuildDirInterp
24  *--------------------------------------------------------------------------*/
25 
26 HYPRE_Int
hypre_BoomerAMGBuildDirInterpDevice(hypre_ParCSRMatrix * A,HYPRE_Int * CF_marker,hypre_ParCSRMatrix * S,HYPRE_BigInt * num_cpts_global,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int debug_flag,HYPRE_Real trunc_factor,HYPRE_Int max_elmts,HYPRE_Int interp_type,hypre_ParCSRMatrix ** P_ptr)27 hypre_BoomerAMGBuildDirInterpDevice( hypre_ParCSRMatrix   *A,
28                                      HYPRE_Int            *CF_marker,
29                                      hypre_ParCSRMatrix   *S,
30                                      HYPRE_BigInt         *num_cpts_global,
31                                      HYPRE_Int             num_functions,
32                                      HYPRE_Int            *dof_func,
33                                      HYPRE_Int             debug_flag,
34                                      HYPRE_Real            trunc_factor,
35                                      HYPRE_Int             max_elmts,
36                                      HYPRE_Int             interp_type,
37                                      hypre_ParCSRMatrix  **P_ptr)
38 {
39    MPI_Comm                comm     = hypre_ParCSRMatrixComm(A);
40    hypre_ParCSRCommPkg    *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
41    hypre_ParCSRCommHandle *comm_handle;
42 
43    hypre_CSRMatrix *A_diag      = hypre_ParCSRMatrixDiag(A);
44    HYPRE_Real      *A_diag_data = hypre_CSRMatrixData(A_diag);
45    HYPRE_Int       *A_diag_i    = hypre_CSRMatrixI(A_diag);
46    HYPRE_Int       *A_diag_j    = hypre_CSRMatrixJ(A_diag);
47 
48    hypre_CSRMatrix *A_offd      = hypre_ParCSRMatrixOffd(A);
49    HYPRE_Real      *A_offd_data = hypre_CSRMatrixData(A_offd);
50    HYPRE_Int       *A_offd_i    = hypre_CSRMatrixI(A_offd);
51    HYPRE_Int       *A_offd_j    = hypre_CSRMatrixJ(A_offd);
52    HYPRE_Int        num_cols_A_offd = hypre_CSRMatrixNumCols(A_offd);
53    //   HYPRE_BigInt   *col_map_offd_A = hypre_ParCSRMatrixColMapOffd(A);
54 
55    HYPRE_Int        n_fine = hypre_CSRMatrixNumRows(A_diag);
56 
57    hypre_BoomerAMGMakeSocFromSDevice(A, S);
58 
59    hypre_CSRMatrix *S_diag   = hypre_ParCSRMatrixDiag(S);
60    HYPRE_Int       *S_diag_i = hypre_CSRMatrixI(S_diag);
61    HYPRE_Int       *S_diag_j = hypre_CSRMatrixJ(S_diag);
62 
63    hypre_CSRMatrix *S_offd   = hypre_ParCSRMatrixOffd(S);
64    HYPRE_Int       *S_offd_i = hypre_CSRMatrixI(S_offd);
65    HYPRE_Int       *S_offd_j = hypre_CSRMatrixJ(S_offd);
66 
67    hypre_ParCSRMatrix *P;
68    HYPRE_Int          *tmp_map_offd_h = NULL;
69 
70    HYPRE_Int       *CF_marker_offd = NULL;
71    HYPRE_Int       *dof_func_offd = NULL;
72    HYPRE_Int       *dof_func_dev = NULL;
73 
74    hypre_CSRMatrix *P_diag;
75    hypre_CSRMatrix *P_offd;
76    HYPRE_Real      *P_diag_data;
77    HYPRE_Int       *P_diag_i;
78    HYPRE_Int       *P_diag_j;
79    HYPRE_Real      *P_offd_data;
80    HYPRE_Int       *P_offd_i;
81    HYPRE_Int       *P_offd_j;
82    HYPRE_Int        P_diag_size, P_offd_size;
83 
84    HYPRE_Int       *fine_to_coarse_d;
85    HYPRE_Int       *fine_to_coarse_h;
86    HYPRE_BigInt     total_global_cpts;
87    HYPRE_Int        num_cols_P_offd = 0;
88 
89    HYPRE_Int        i;
90    HYPRE_Int        j;
91    HYPRE_Int        start;
92    HYPRE_Int        my_id;
93    HYPRE_Int        num_procs;
94    HYPRE_Int        num_sends;
95    HYPRE_Int        index;
96    HYPRE_Int       *int_buf_data;
97 
98    HYPRE_Real       wall_time;  /* for debugging instrumentation  */
99 
100    HYPRE_MemoryLocation memory_location = hypre_ParCSRMatrixMemoryLocation(A);
101 
102    hypre_MPI_Comm_size(comm, &num_procs);
103    hypre_MPI_Comm_rank(comm,&my_id);
104 
105    if (my_id == (num_procs -1))
106    {
107       total_global_cpts = num_cpts_global[1];
108    }
109    hypre_MPI_Bcast( &total_global_cpts, 1, HYPRE_MPI_BIG_INT, num_procs-1, comm);
110 
111    if (!comm_pkg)
112    {
113       hypre_MatvecCommPkgCreate(A);
114       comm_pkg = hypre_ParCSRMatrixCommPkg(A);
115    }
116    if (debug_flag == 4)
117    {
118       wall_time = time_getWallclockSeconds();
119    }
120 
121    /* 1. Communicate CF_marker to/from other processors */
122    if (num_cols_A_offd)
123    {
124       CF_marker_offd = hypre_TAlloc(HYPRE_Int, num_cols_A_offd, HYPRE_MEMORY_DEVICE);
125    }
126 
127    num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
128    int_buf_data = hypre_TAlloc(HYPRE_Int, hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends), HYPRE_MEMORY_DEVICE);
129    hypre_ParCSRCommPkgCopySendMapElmtsToDevice(comm_pkg);
130    HYPRE_THRUST_CALL( gather,
131                       hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg),
132                       hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg) + hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
133                       CF_marker,
134                       int_buf_data );
135 
136    comm_handle = hypre_ParCSRCommHandleCreate_v2(11, comm_pkg, HYPRE_MEMORY_DEVICE, int_buf_data,
137                                                  HYPRE_MEMORY_DEVICE, CF_marker_offd);
138    hypre_ParCSRCommHandleDestroy(comm_handle);
139 
140    if (num_functions > 1)
141    {
142       /* 2. Communicate dof_func to/from other processors */
143       if (num_cols_A_offd > 0)
144       {
145          dof_func_offd = hypre_TAlloc(HYPRE_Int, num_cols_A_offd, HYPRE_MEMORY_DEVICE);
146       }
147 
148       index = 0;
149       for (i = 0; i < num_sends; i++)
150       {
151          start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
152          for (j=start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
153          {
154             int_buf_data[index++] = dof_func[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
155          }
156       }
157       comm_handle = hypre_ParCSRCommHandleCreate_v2(11, comm_pkg, HYPRE_MEMORY_HOST, int_buf_data,
158                                                     HYPRE_MEMORY_DEVICE, dof_func_offd);
159       hypre_ParCSRCommHandleDestroy(comm_handle);
160 
161       dof_func_dev = hypre_TAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_DEVICE);
162       hypre_TMemcpy(dof_func_dev, dof_func, HYPRE_Int, n_fine, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
163    }
164 
165    if (debug_flag == 4)
166    {
167       wall_time = time_getWallclockSeconds() - wall_time;
168       hypre_printf("Proc = %d     Interp: Comm 1 CF_marker =    %f\n", my_id, wall_time);
169       fflush(NULL);
170    }
171 
172    /* 3. Figure out the size of the interpolation matrix, P, i.e., compute P_diag_i and P_offd_i */
173    /*    Also, compute fine_to_coarse array: When i is a coarse point, fine_to_coarse[i] will hold a  */
174    /*    corresponding coarse point index in the range 0..n_coarse-1 */
175    P_diag_i = hypre_TAlloc(HYPRE_Int, n_fine+1, memory_location);
176    P_offd_i = hypre_TAlloc(HYPRE_Int, n_fine+1, memory_location);
177 
178    dim3 bDim = hypre_GetDefaultCUDABlockDimension();
179    dim3 gDim = hypre_GetDefaultCUDAGridDimension(n_fine, "warp", bDim);
180 
181    HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildDirInterp_getnnz, gDim, bDim,
182                       n_fine, S_diag_i, S_diag_j, S_offd_i, S_offd_j,
183                       CF_marker, CF_marker_offd, num_functions,
184                       dof_func_dev, dof_func_offd, P_diag_i, P_offd_i);
185 
186    /* The scans will transform P_diag_i and P_offd_i to the CSR I-vectors */
187    hypreDevice_IntegerExclusiveScan(n_fine+1, P_diag_i);
188    hypreDevice_IntegerExclusiveScan(n_fine+1, P_offd_i);
189 
190    fine_to_coarse_d = hypre_TAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_DEVICE);
191    /* The scan will make fine_to_coarse[i] for i a coarse point hold a
192     * coarse point index in the range from 0 to n_coarse-1 */
193    HYPRE_THRUST_CALL( exclusive_scan,
194                       thrust::make_transform_iterator(CF_marker,          is_nonnegative<HYPRE_Int>()),
195                       thrust::make_transform_iterator(CF_marker + n_fine, is_nonnegative<HYPRE_Int>()),
196                       fine_to_coarse_d,
197                       HYPRE_Int(0) ); /* *MUST* pass init value since input and output types diff. */
198 
199    /* 4. Compute the CSR arrays P_diag_j, P_diag_data, P_offd_j, and P_offd_data */
200    /*    P_diag_i and P_offd_i are now known, first allocate the remaining CSR arrays of P */
201    hypre_TMemcpy(&P_diag_size, &P_diag_i[n_fine], HYPRE_Int, 1, HYPRE_MEMORY_HOST, memory_location);
202    hypre_TMemcpy(&P_offd_size, &P_offd_i[n_fine], HYPRE_Int, 1, HYPRE_MEMORY_HOST, memory_location);
203 
204    P_diag_j    = hypre_TAlloc(HYPRE_Int,  P_diag_size, memory_location);
205    P_diag_data = hypre_TAlloc(HYPRE_Real, P_diag_size, memory_location);
206 
207    P_offd_j    = hypre_TAlloc(HYPRE_Int,  P_offd_size, memory_location);
208    P_offd_data = hypre_TAlloc(HYPRE_Real, P_offd_size, memory_location);
209 
210    if (interp_type == 3)
211    {
212       HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildDirInterp_getcoef, gDim, bDim,
213                          n_fine, A_diag_i, A_diag_j, A_diag_data,
214                          A_offd_i, A_offd_j, A_offd_data,
215                          hypre_ParCSRMatrixSocDiagJ(S),
216                          hypre_ParCSRMatrixSocOffdJ(S),
217                          CF_marker, CF_marker_offd,
218                          num_functions, dof_func_dev, dof_func_offd,
219                          P_diag_i, P_diag_j, P_diag_data,
220                          P_offd_i, P_offd_j, P_offd_data,
221                          fine_to_coarse_d );
222    }
223    else
224    {
225       HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildDirInterp_getcoef_v2, gDim, bDim,
226                          n_fine, A_diag_i, A_diag_j, A_diag_data,
227                          A_offd_i, A_offd_j, A_offd_data,
228                          hypre_ParCSRMatrixSocDiagJ(S),
229                          hypre_ParCSRMatrixSocOffdJ(S),
230                          CF_marker, CF_marker_offd,
231                          num_functions, dof_func_dev, dof_func_offd,
232                          P_diag_i, P_diag_j, P_diag_data,
233                          P_offd_i, P_offd_j, P_offd_data,
234                          fine_to_coarse_d );
235    }
236 
237    /* !!!! Free them here */
238    /*
239    hypre_TFree(hypre_ParCSRMatrixSocDiagJ(S), HYPRE_MEMORY_DEVICE);
240    hypre_TFree(hypre_ParCSRMatrixSocOffdJ(S), HYPRE_MEMORY_DEVICE);
241    */
242 
243    HYPRE_THRUST_CALL(replace, CF_marker, CF_marker + n_fine, -3, -1);
244 
245    /* 5. Construct the result as a ParCSRMatrix. At this point, P's column indices */
246    /*    are defined with A's enumeration of columns */
247 
248    P = hypre_ParCSRMatrixCreate(comm,
249                                 hypre_ParCSRMatrixGlobalNumRows(A),
250                                 total_global_cpts,
251                                 hypre_ParCSRMatrixColStarts(A),
252                                 num_cpts_global,
253                                 0,
254                                 P_diag_size,
255                                 P_offd_size);
256 
257    P_diag = hypre_ParCSRMatrixDiag(P);
258    hypre_CSRMatrixData(P_diag) = P_diag_data;
259    hypre_CSRMatrixI(P_diag)    = P_diag_i;
260    hypre_CSRMatrixJ(P_diag)    = P_diag_j;
261 
262    P_offd = hypre_ParCSRMatrixOffd(P);
263    hypre_CSRMatrixData(P_offd) = P_offd_data;
264    hypre_CSRMatrixI(P_offd)    = P_offd_i;
265    hypre_CSRMatrixJ(P_offd)    = P_offd_j;
266 
267    hypre_CSRMatrixMemoryLocation(P_diag) = memory_location;
268    hypre_CSRMatrixMemoryLocation(P_offd) = memory_location;
269 
270    /* 6. Compress P, removing coefficients smaller than trunc_factor * Max, and */
271    /*    make sure no row has more than max_elmts elements */
272 
273    if (trunc_factor != 0.0 || max_elmts > 0)
274    {
275       hypre_BoomerAMGInterpTruncationDevice(P, trunc_factor, max_elmts);
276 
277       P_offd_i    = hypre_CSRMatrixI(P_offd);
278       P_offd_j    = hypre_CSRMatrixJ(P_offd);
279       P_offd_size = hypre_CSRMatrixNumNonzeros(P_offd);
280       /* hypre_TMemcpy(&P_offd_size, &P_offd_i[n_fine], HYPRE_Int, 1, HYPRE_MEMORY_HOST, memory_location); */
281    }
282 
283    /* 7. Translate P_offd's column indices from the values inherited from A_offd to a 0,1,2,3,... enumeration, */
284    /*    and construct the col_map array that translates these into the global 0..c-1 enumeration */
285    if (P_offd_size)
286    {
287       /* Array P_marker has length equal to the number of A's offd columns+1, and will */
288       /* store a translation code from A_offd's local column numbers to P_offd's local column numbers */
289       /* Example: if A_offd has 6 columns, locally 0,1,..,5, and points 1 and 4 are coarse points, then
290          P_marker=[0,1,0,0,1,0,0], */
291 
292       /* First,  set P_marker[i] to 1 if A's column i is also present in P, otherwise P_marker[i] is 0 */
293       HYPRE_Int *P_marker = hypre_TAlloc(HYPRE_Int, num_cols_A_offd, HYPRE_MEMORY_DEVICE);
294       HYPRE_Int *P_colids = hypre_TAlloc(HYPRE_Int, hypre_max(P_offd_size, num_cols_A_offd), HYPRE_MEMORY_DEVICE);
295 
296       hypre_TMemcpy(P_colids, P_offd_j, HYPRE_Int, P_offd_size, HYPRE_MEMORY_DEVICE, memory_location);
297       /* sort and unique */
298       HYPRE_THRUST_CALL(sort, P_colids, P_colids + P_offd_size);
299       HYPRE_Int *new_end = HYPRE_THRUST_CALL(unique, P_colids, P_colids + P_offd_size);
300 
301       num_cols_P_offd = new_end - P_colids;
302 
303       HYPRE_THRUST_CALL(fill_n, P_marker, num_cols_A_offd, 0);
304       hypreDevice_ScatterConstant(P_marker, num_cols_P_offd, P_colids, 1);
305 
306       /* Because P's columns correspond to P_marker[i]=1 (and =0 otherwise), the scan below will return  */
307       /* an enumeration of P's columns 0,1,... at the corresponding locations in P_marker. */
308       /* P_marker[num_cols_A_offd] will contain num_cols_P_offd, so sum reduction above could  */
309       /* have been replaced by reading the last element of P_marker. */
310       HYPRE_THRUST_CALL(exclusive_scan, P_marker, P_marker + num_cols_A_offd, P_colids);
311       /* Example: P_marker becomes [0,0,1,1,1,2] so that P_marker[1]=0, P_marker[4]=1  */
312 
313       /* Do the re-enumeration, P_offd_j are mapped, using P_marker as map  */
314       HYPRE_THRUST_CALL(gather, P_offd_j, P_offd_j + P_offd_size, P_colids, P_offd_j);
315 
316       /* Create and define array tmp_map_offd. This array is the inverse of the P_marker mapping, */
317       /* Example: num_cols_P_offd=2, tmp_map_offd[0] = 1, tmp_map_offd[1]=4  */
318       /* P_colids is large enough to hold */
319       new_end = HYPRE_THRUST_CALL(copy_if,
320                                   thrust::make_counting_iterator(0),
321                                   thrust::make_counting_iterator(num_cols_A_offd),
322                                   P_marker,
323                                   P_colids,
324                                   thrust::identity<HYPRE_Int>());
325       hypre_assert(new_end - P_colids == num_cols_P_offd);
326 
327       tmp_map_offd_h = hypre_TAlloc(HYPRE_Int, num_cols_P_offd, HYPRE_MEMORY_HOST);
328       hypre_TMemcpy(tmp_map_offd_h, P_colids, HYPRE_Int, num_cols_P_offd, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);
329 
330       hypre_TFree(P_colids, HYPRE_MEMORY_DEVICE);
331       hypre_TFree(P_marker, HYPRE_MEMORY_DEVICE);
332    }
333 
334    /* 8. P_offd_j now has a 0,1,2,3... local column index enumeration. */
335    /*    tmp_map_offd contains the index mapping from P's offd local columns to A's offd local columns.*/
336    /*    Below routine is in parcsr_ls/par_rap_communication.c. It sets col_map_offd in P, */
337    /*    comm_pkg in P, and perhaps more members of P ??? */
338 
339    fine_to_coarse_h = hypre_TAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_HOST);
340    hypre_TMemcpy(fine_to_coarse_h, fine_to_coarse_d, HYPRE_Int, n_fine, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);
341 
342    hypre_ParCSRMatrixColMapOffd(P) = hypre_CTAlloc(HYPRE_BigInt, num_cols_P_offd, HYPRE_MEMORY_HOST);
343    hypre_CSRMatrixNumCols(P_offd)  = num_cols_P_offd;
344 
345    hypre_GetCommPkgRTFromCommPkgA(P, A, fine_to_coarse_h, tmp_map_offd_h);
346 
347    *P_ptr = P;
348 
349    hypre_TFree(CF_marker_offd,   HYPRE_MEMORY_DEVICE);
350    hypre_TFree(dof_func_offd,    HYPRE_MEMORY_DEVICE);
351    hypre_TFree(dof_func_dev,     HYPRE_MEMORY_DEVICE);
352    hypre_TFree(int_buf_data,     HYPRE_MEMORY_DEVICE);
353    hypre_TFree(fine_to_coarse_d, HYPRE_MEMORY_DEVICE);
354    hypre_TFree(fine_to_coarse_h, HYPRE_MEMORY_HOST);
355    hypre_TFree(tmp_map_offd_h,   HYPRE_MEMORY_HOST);
356 
357    return hypre_error_flag;
358 }
359 
360 
361 /*-----------------------------------------------------------------------*/
362  __global__ void
hypre_BoomerAMGBuildDirInterp_getnnz(HYPRE_Int nr_of_rows,HYPRE_Int * S_diag_i,HYPRE_Int * S_diag_j,HYPRE_Int * S_offd_i,HYPRE_Int * S_offd_j,HYPRE_Int * CF_marker,HYPRE_Int * CF_marker_offd,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int * dof_func_offd,HYPRE_Int * P_diag_i,HYPRE_Int * P_offd_i)363 hypre_BoomerAMGBuildDirInterp_getnnz( HYPRE_Int  nr_of_rows,
364                                       HYPRE_Int *S_diag_i,
365                                       HYPRE_Int *S_diag_j,
366                                       HYPRE_Int *S_offd_i,
367                                       HYPRE_Int *S_offd_j,
368                                       HYPRE_Int *CF_marker,
369                                       HYPRE_Int *CF_marker_offd,
370                                       HYPRE_Int  num_functions,
371                                       HYPRE_Int *dof_func,
372                                       HYPRE_Int *dof_func_offd,
373                                       HYPRE_Int *P_diag_i,
374                                       HYPRE_Int *P_offd_i)
375 {
376    /*-----------------------------------------------------------------------*/
377    /* Determine size of interpolation matrix, P
378 
379       If A is of size m x m, then P will be of size m x c where c is the
380       number of coarse points.
381 
382       It is assumed that S have the same global column enumeration as A
383 
384       Input: nr_of_rows         - Number of rows in matrix (local in processor)
385              S_diag_i, S_diag_j - CSR representation of S_diag
386              S_offd_i, S_offd_j - CSR representation of S_offd
387              num_function  - Number of degrees of freedom per grid point
388              dof_func      - vector of length nr_of_rows, indicating the degree of freedom of vector element.
389              dof_func_offd - vector over ncols of A_offd, indicating the degree of freedom.
390 
391       Output: P_diag_i       - Vector where P_diag_i[i] holds the number of non-zero elements of P_diag on row i.
392               P_offd_i       - Vector where P_offd_i[i] holds the number of non-zero elements of P_offd on row i.
393               fine_to_coarse - Vector of length nr_of_rows.
394                                fine_to_coarse[i] is set to 1 if i is a coarse pt.
395                                Eventually, fine_to_coarse[j] will map A's column j
396                                to a re-enumerated column index in matrix P.
397     */
398    /*-----------------------------------------------------------------------*/
399 
400    HYPRE_Int i = hypre_cuda_get_grid_warp_id<1,1>();
401 
402    if (i >= nr_of_rows)
403    {
404       return;
405    }
406 
407    HYPRE_Int p, q, dof_func_i;
408    HYPRE_Int jPd = 0, jPo = 0;
409    HYPRE_Int lane = hypre_cuda_get_lane_id<1>();
410 
411    if (lane == 0)
412    {
413       p = read_only_load(CF_marker + i);
414    }
415    p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0);
416 
417    /*--------------------------------------------------------------------
418     *  If i is a C-point, interpolation is the identity.
419     *--------------------------------------------------------------------*/
420    if (p >= 0)
421    {
422       if (lane == 0)
423       {
424          P_diag_i[i] = 1;
425          P_offd_i[i] = 0;
426       }
427       return;
428    }
429 
430    /*--------------------------------------------------------------------
431     *  If i is an F-point, interpolation is from the C-points that
432     *  strongly influence i.
433     *--------------------------------------------------------------------*/
434    if (num_functions > 1 && dof_func != NULL)
435    {
436       if (lane == 0)
437       {
438          dof_func_i = read_only_load(&dof_func[i]);
439       }
440       dof_func_i = __shfl_sync(HYPRE_WARP_FULL_MASK, dof_func_i, 0);
441    }
442 
443    /* diag part */
444    if (lane < 2)
445    {
446       p = read_only_load(S_diag_i + i + lane);
447    }
448    q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1);
449    p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0);
450 
451    for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE)
452    {
453       if (j < q)
454       {
455          const HYPRE_Int col = read_only_load(&S_diag_j[j]);
456          if ( read_only_load(&CF_marker[col]) > 0 && (num_functions == 1 || read_only_load(&dof_func[col]) == dof_func_i) )
457          {
458             jPd++;
459          }
460       }
461    }
462    jPd = warp_reduce_sum(jPd);
463 
464    /* offd part */
465    if (lane < 2)
466    {
467       p = read_only_load(S_offd_i + i + lane);
468    }
469    q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1);
470    p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0);
471 
472    for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE)
473    {
474       if (j < q)
475       {
476          const HYPRE_Int tmp = read_only_load(&S_offd_j[j]);
477          const HYPRE_Int col = tmp;
478          if ( read_only_load(&CF_marker_offd[col]) > 0 && (num_functions == 1 || read_only_load(&dof_func_offd[col]) == dof_func_i) )
479          {
480             jPo++;
481          }
482       }
483    }
484    jPo = warp_reduce_sum(jPo);
485 
486    if (lane == 0)
487    {
488       P_diag_i[i] = jPd;
489       P_offd_i[i] = jPo;
490    }
491 }
492 
493 /*-----------------------------------------------------------------------*
494  *-----------------------------------------------------------------------*/
495  __global__ void
hypre_BoomerAMGBuildDirInterp_getcoef(HYPRE_Int nr_of_rows,HYPRE_Int * A_diag_i,HYPRE_Int * A_diag_j,HYPRE_Real * A_diag_data,HYPRE_Int * A_offd_i,HYPRE_Int * A_offd_j,HYPRE_Real * A_offd_data,HYPRE_Int * Soc_diag_j,HYPRE_Int * Soc_offd_j,HYPRE_Int * CF_marker,HYPRE_Int * CF_marker_offd,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int * dof_func_offd,HYPRE_Int * P_diag_i,HYPRE_Int * P_diag_j,HYPRE_Real * P_diag_data,HYPRE_Int * P_offd_i,HYPRE_Int * P_offd_j,HYPRE_Real * P_offd_data,HYPRE_Int * fine_to_coarse)496 hypre_BoomerAMGBuildDirInterp_getcoef( HYPRE_Int   nr_of_rows,
497                                        HYPRE_Int  *A_diag_i,
498                                        HYPRE_Int  *A_diag_j,
499                                        HYPRE_Real *A_diag_data,
500                                        HYPRE_Int  *A_offd_i,
501                                        HYPRE_Int  *A_offd_j,
502                                        HYPRE_Real *A_offd_data,
503                                        HYPRE_Int  *Soc_diag_j,
504                                        HYPRE_Int  *Soc_offd_j,
505                                        HYPRE_Int  *CF_marker,
506                                        HYPRE_Int  *CF_marker_offd,
507                                        HYPRE_Int   num_functions,
508                                        HYPRE_Int  *dof_func,
509                                        HYPRE_Int  *dof_func_offd,
510                                        HYPRE_Int  *P_diag_i,
511                                        HYPRE_Int  *P_diag_j,
512                                        HYPRE_Real *P_diag_data,
513                                        HYPRE_Int  *P_offd_i,
514                                        HYPRE_Int  *P_offd_j,
515                                        HYPRE_Real *P_offd_data,
516                                        HYPRE_Int  *fine_to_coarse )
517 {
518    /*-----------------------------------------------------------------------*/
519    /* Compute interpolation matrix, P
520 
521       Input: nr_of_rows - Number of rows in matrix (local in processor)
522              A_diag_i, A_diag_j, A_diag_data - CSR representation of A_diag
523              A_offd_i, A_offd_j, A_offd_data - CSR representation of A_offd
524              S_diag_i, S_diag_j - CSR representation of S_diag
525              S_offd_i, S_offd_j - CSR representation of S_offd
526              CF_marker          - Coarse/Fine flags for indices (rows) in this processor
527              CF_marker_offd     - Coarse/Fine flags for indices (rows) not in this processor
528              num_function  - Number of degrees of freedom per grid point
529              dof_func      - vector over nonzero elements of A_diag, indicating the degree of freedom
530              dof_func_offd - vector over nonzero elements of A_offd, indicating the degree of freedom
531              fine_to_coarse - Vector of length nr_of_rows-1.
532 
533       Output: P_diag_j         - Column indices in CSR representation of P_diag
534               P_diag_data      - Matrix elements in CSR representation of P_diag
535               P_offd_j         - Column indices in CSR representation of P_offd
536               P_offd_data      - Matrix elements in CSR representation of P_diag
537    */
538    /*-----------------------------------------------------------------------*/
539 
540    HYPRE_Int i = hypre_cuda_get_grid_warp_id<1,1>();
541 
542    if (i >= nr_of_rows)
543    {
544       return;
545    }
546 
547    HYPRE_Int lane = hypre_cuda_get_lane_id<1>();
548 
549    HYPRE_Int k, dof_func_i;
550 
551    if (lane == 0)
552    {
553       k = read_only_load(CF_marker + i);
554    }
555    k = __shfl_sync(HYPRE_WARP_FULL_MASK, k, 0);
556 
557    /*--------------------------------------------------------------------
558     *  If i is a C-point, interpolation is the identity.
559     *--------------------------------------------------------------------*/
560    if (k > 0)
561    {
562       if (lane == 0)
563       {
564          const HYPRE_Int ind = read_only_load(&P_diag_i[i]);
565          P_diag_j[ind]       = read_only_load(&fine_to_coarse[i]);
566          P_diag_data[ind]    = 1.0;
567       }
568 
569       return;
570    }
571 
572    /*--------------------------------------------------------------------
573     *  Point is f-point, use direct interpolation
574     *--------------------------------------------------------------------*/
575    if (num_functions > 1 && dof_func != NULL)
576    {
577       if (lane == 0)
578       {
579          dof_func_i = read_only_load(&dof_func[i]);
580       }
581       dof_func_i = __shfl_sync(HYPRE_WARP_FULL_MASK, dof_func_i, 0);
582    }
583 
584    HYPRE_Real diagonal = 0.0, sum_N_pos = 0.0, sum_N_neg = 0.0, sum_P_pos = 0.0, sum_P_neg = 0.0;
585 
586    /* diag part */
587    HYPRE_Int p_diag_A, q_diag_A, p_diag_P, q_diag_P;
588    if (lane < 2)
589    {
590       p_diag_A = read_only_load(A_diag_i + i + lane);
591       p_diag_P = read_only_load(P_diag_i + i + lane);
592    }
593    q_diag_A = __shfl_sync(HYPRE_WARP_FULL_MASK, p_diag_A, 1);
594    p_diag_A = __shfl_sync(HYPRE_WARP_FULL_MASK, p_diag_A, 0);
595    q_diag_P = __shfl_sync(HYPRE_WARP_FULL_MASK, p_diag_P, 1);
596    p_diag_P = __shfl_sync(HYPRE_WARP_FULL_MASK, p_diag_P, 0);
597 
598    k = p_diag_P;
599    for (HYPRE_Int j = p_diag_A + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q_diag_A); j += HYPRE_WARP_SIZE)
600    {
601       HYPRE_Int col, sum, pos;
602       HYPRE_Int is_SC = 0; /* if is a Strong-C */
603       HYPRE_Complex val;
604 
605       if (j < q_diag_A)
606       {
607          col = read_only_load(&A_diag_j[j]);
608 
609          if (i == col)
610          {
611             diagonal = read_only_load(&A_diag_data[j]);
612          }
613          else if ( num_functions == 1 || read_only_load(&dof_func[col]) == dof_func_i )
614          {
615             val = read_only_load(&A_diag_data[j]);
616 
617             if (val > 0.0)
618             {
619                sum_N_pos += val;
620             }
621             else
622             {
623                sum_N_neg += val;
624             }
625 
626             is_SC = read_only_load(&Soc_diag_j[j]) > -1 && read_only_load(&CF_marker[col]) > 0;
627 
628             if (is_SC)
629             {
630                if (val > 0.0)
631                {
632                   sum_P_pos += val;
633                }
634                else
635                {
636                   sum_P_neg += val;
637                }
638             }
639          }
640       }
641 
642       pos = warp_prefix_sum(lane, is_SC, sum);
643 
644       if (is_SC)
645       {
646          P_diag_data[k + pos] = val;
647          P_diag_j[k + pos] = read_only_load(&fine_to_coarse[col]);
648       }
649       k += sum;
650    }
651 
652    hypre_device_assert(k == q_diag_P);
653 
654    /* offd part */
655    HYPRE_Int p_offd_A, q_offd_A, p_offd_P, q_offd_P;
656    if (lane < 2)
657    {
658       p_offd_A = read_only_load(A_offd_i + i + lane);
659       p_offd_P = read_only_load(P_offd_i + i + lane);
660    }
661    q_offd_A = __shfl_sync(HYPRE_WARP_FULL_MASK, p_offd_A, 1);
662    p_offd_A = __shfl_sync(HYPRE_WARP_FULL_MASK, p_offd_A, 0);
663    q_offd_P = __shfl_sync(HYPRE_WARP_FULL_MASK, p_offd_P, 1);
664    p_offd_P = __shfl_sync(HYPRE_WARP_FULL_MASK, p_offd_P, 0);
665 
666    k = p_offd_P;
667    for (HYPRE_Int j = p_offd_A + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q_offd_A); j += HYPRE_WARP_SIZE)
668    {
669       HYPRE_Int col, sum, pos;
670       HYPRE_Int is_SC = 0; /* if is a Strong-C */
671       HYPRE_Complex val;
672 
673       if (j < q_offd_A)
674       {
675          col = read_only_load(&A_offd_j[j]);
676 
677          if ( num_functions == 1 || read_only_load(&dof_func_offd[col]) == dof_func_i )
678          {
679             val = read_only_load(&A_offd_data[j]);
680 
681             if (val > 0.0)
682             {
683                sum_N_pos += val;
684             }
685             else
686             {
687                sum_N_neg += val;
688             }
689 
690             is_SC = read_only_load(&Soc_offd_j[j]) > -1 && read_only_load(&CF_marker_offd[col]) > 0;
691 
692             if (is_SC)
693             {
694                if (val > 0.0)
695                {
696                   sum_P_pos += val;
697                }
698                else
699                {
700                   sum_P_neg += val;
701                }
702             }
703          }
704       }
705 
706       pos = warp_prefix_sum(lane, is_SC, sum);
707 
708       if (is_SC)
709       {
710          P_offd_data[k + pos] = val;
711          P_offd_j[k + pos] = col;
712       }
713       k += sum;
714    }
715 
716    hypre_device_assert(k == q_offd_P);
717 
718    diagonal  = warp_allreduce_sum(diagonal);
719    sum_N_pos = warp_allreduce_sum(sum_N_pos);
720    sum_N_neg = warp_allreduce_sum(sum_N_neg);
721    sum_P_pos = warp_allreduce_sum(sum_P_pos);
722    sum_P_neg = warp_allreduce_sum(sum_P_neg);
723 
724    HYPRE_Complex alfa = 1.0, beta = 1.0;
725 
726    if (sum_P_neg)
727    {
728       alfa = sum_N_neg / (sum_P_neg * diagonal);
729    }
730 
731    if (sum_P_pos)
732    {
733       beta = sum_N_pos / (sum_P_pos * diagonal);
734    }
735 
736    for (HYPRE_Int j = p_diag_P + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q_diag_P); j += HYPRE_WARP_SIZE)
737    {
738       /* if (P_diag_data[j] > 0.0)
739             P_diag_data[j] *= -beta;
740          else
741             P_diag_data[j] *= -alfa; */
742       if (j < q_diag_P)
743       {
744          P_diag_data[j] *= (P_diag_data[j] > 0.0) * (alfa-beta) - alfa;
745       }
746    }
747 
748    for (HYPRE_Int j = p_offd_P + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q_offd_P); j += HYPRE_WARP_SIZE)
749    {
750       /* if (P_offd_data[indp]> 0)
751             P_offd_data[indp] *= -beta;
752          else
753             P_offd_data[indp] *= -alfa; */
754       if (j < q_offd_P)
755       {
756          P_offd_data[j] *= (P_offd_data[j] > 0.0) * (alfa-beta) - alfa;
757       }
758    }
759 }
760 
761 /*-----------------------------------------------------------------------*
762  *-----------------------------------------------------------------------*/
763  __global__ void
hypre_BoomerAMGBuildDirInterp_getcoef_v2(HYPRE_Int nr_of_rows,HYPRE_Int * A_diag_i,HYPRE_Int * A_diag_j,HYPRE_Real * A_diag_data,HYPRE_Int * A_offd_i,HYPRE_Int * A_offd_j,HYPRE_Real * A_offd_data,HYPRE_Int * Soc_diag_j,HYPRE_Int * Soc_offd_j,HYPRE_Int * CF_marker,HYPRE_Int * CF_marker_offd,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int * dof_func_offd,HYPRE_Int * P_diag_i,HYPRE_Int * P_diag_j,HYPRE_Real * P_diag_data,HYPRE_Int * P_offd_i,HYPRE_Int * P_offd_j,HYPRE_Real * P_offd_data,HYPRE_Int * fine_to_coarse)764 hypre_BoomerAMGBuildDirInterp_getcoef_v2( HYPRE_Int   nr_of_rows,
765                                           HYPRE_Int  *A_diag_i,
766                                           HYPRE_Int  *A_diag_j,
767                                           HYPRE_Real *A_diag_data,
768                                           HYPRE_Int  *A_offd_i,
769                                           HYPRE_Int  *A_offd_j,
770                                           HYPRE_Real *A_offd_data,
771                                           HYPRE_Int  *Soc_diag_j,
772                                           HYPRE_Int  *Soc_offd_j,
773                                           HYPRE_Int  *CF_marker,
774                                           HYPRE_Int  *CF_marker_offd,
775                                           HYPRE_Int   num_functions,
776                                           HYPRE_Int  *dof_func,
777                                           HYPRE_Int  *dof_func_offd,
778                                           HYPRE_Int  *P_diag_i,
779                                           HYPRE_Int  *P_diag_j,
780                                           HYPRE_Real *P_diag_data,
781                                           HYPRE_Int  *P_offd_i,
782                                           HYPRE_Int  *P_offd_j,
783                                           HYPRE_Real *P_offd_data,
784                                           HYPRE_Int  *fine_to_coarse )
785 {
786    /*-----------------------------------------------------------------------*/
787    /* Compute interpolation matrix, P
788 
789       Input: nr_of_rows - Number of rows in matrix (local in processor)
790              A_diag_i, A_diag_j, A_diag_data - CSR representation of A_diag
791              A_offd_i, A_offd_j, A_offd_data - CSR representation of A_offd
792              S_diag_i, S_diag_j - CSR representation of S_diag
793              S_offd_i, S_offd_j - CSR representation of S_offd
794              CF_marker          - Coarse/Fine flags for indices (rows) in this processor
795              CF_marker_offd     - Coarse/Fine flags for indices (rows) not in this processor
796              num_function  - Number of degrees of freedom per grid point
797              dof_func      - vector over nonzero elements of A_diag, indicating the degree of freedom
798              dof_func_offd - vector over nonzero elements of A_offd, indicating the degree of freedom
799              fine_to_coarse - Vector of length nr_of_rows-1.
800 
801       Output: P_diag_j         - Column indices in CSR representation of P_diag
802               P_diag_data      - Matrix elements in CSR representation of P_diag
803               P_offd_j         - Column indices in CSR representation of P_offd
804               P_offd_data      - Matrix elements in CSR representation of P_diag
805    */
806    /*-----------------------------------------------------------------------*/
807 
808    HYPRE_Int i = hypre_cuda_get_grid_warp_id<1,1>();
809 
810    if (i >= nr_of_rows)
811    {
812       return;
813    }
814 
815    HYPRE_Int lane = hypre_cuda_get_lane_id<1>();
816 
817    HYPRE_Int k, dof_func_i;
818 
819    if (lane == 0)
820    {
821       k = read_only_load(CF_marker + i);
822    }
823    k = __shfl_sync(HYPRE_WARP_FULL_MASK, k, 0);
824 
825    /*--------------------------------------------------------------------
826     *  If i is a C-point, interpolation is the identity.
827     *--------------------------------------------------------------------*/
828    if (k > 0)
829    {
830       if (lane == 0)
831       {
832          const HYPRE_Int ind = read_only_load(&P_diag_i[i]);
833          P_diag_j[ind]       = read_only_load(&fine_to_coarse[i]);
834          P_diag_data[ind]    = 1.0;
835       }
836 
837       return;
838    }
839 
840    /*--------------------------------------------------------------------
841     *  Point is f-point, use direct interpolation
842     *--------------------------------------------------------------------*/
843    if (num_functions > 1 && dof_func != NULL)
844    {
845       if (lane == 0)
846       {
847          dof_func_i = read_only_load(&dof_func[i]);
848       }
849       dof_func_i = __shfl_sync(HYPRE_WARP_FULL_MASK, dof_func_i, 0);
850    }
851 
852    HYPRE_Real diagonal = 0.0, sum_F = 0.0;
853 
854    /* diag part */
855    HYPRE_Int p_diag_A, q_diag_A, p_diag_P, q_diag_P;
856    if (lane < 2)
857    {
858       p_diag_A = read_only_load(A_diag_i + i + lane);
859       p_diag_P = read_only_load(P_diag_i + i + lane);
860    }
861    q_diag_A = __shfl_sync(HYPRE_WARP_FULL_MASK, p_diag_A, 1);
862    p_diag_A = __shfl_sync(HYPRE_WARP_FULL_MASK, p_diag_A, 0);
863    q_diag_P = __shfl_sync(HYPRE_WARP_FULL_MASK, p_diag_P, 1);
864    p_diag_P = __shfl_sync(HYPRE_WARP_FULL_MASK, p_diag_P, 0);
865 
866    k = p_diag_P;
867    for (HYPRE_Int j = p_diag_A + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q_diag_A); j += HYPRE_WARP_SIZE)
868    {
869       HYPRE_Int col, sum, pos;
870       HYPRE_Int is_SC = 0; /* if is a Strong-C */
871       HYPRE_Complex val;
872 
873       if (j < q_diag_A)
874       {
875          col = read_only_load(&A_diag_j[j]);
876 
877          if (i == col)
878          {
879             diagonal = read_only_load(&A_diag_data[j]);
880          }
881          else if ( num_functions == 1 || read_only_load(&dof_func[col]) == dof_func_i )
882          {
883             val = read_only_load(&A_diag_data[j]);
884             if (read_only_load(&Soc_diag_j[j]) > -1)
885             {
886                if (read_only_load(&CF_marker[col]) > 0)
887                {
888                   is_SC = 1;
889                }
890                else
891                {
892                   sum_F += val;
893                }
894             }
895             else
896             {
897                diagonal += val;
898             }
899          }
900       }
901 
902       pos = warp_prefix_sum(lane, is_SC, sum);
903 
904       if (is_SC)
905       {
906          P_diag_data[k + pos] = val;
907          P_diag_j[k + pos] = read_only_load(&fine_to_coarse[col]);
908       }
909       k += sum;
910    }
911 
912    hypre_device_assert(k == q_diag_P);
913 
914    /* offd part */
915    HYPRE_Int p_offd_A, q_offd_A, p_offd_P, q_offd_P;
916    if (lane < 2)
917    {
918       p_offd_A = read_only_load(A_offd_i + i + lane);
919       p_offd_P = read_only_load(P_offd_i + i + lane);
920    }
921    q_offd_A = __shfl_sync(HYPRE_WARP_FULL_MASK, p_offd_A, 1);
922    p_offd_A = __shfl_sync(HYPRE_WARP_FULL_MASK, p_offd_A, 0);
923    q_offd_P = __shfl_sync(HYPRE_WARP_FULL_MASK, p_offd_P, 1);
924    p_offd_P = __shfl_sync(HYPRE_WARP_FULL_MASK, p_offd_P, 0);
925 
926    k = p_offd_P;
927    for (HYPRE_Int j = p_offd_A + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q_offd_A); j += HYPRE_WARP_SIZE)
928    {
929       HYPRE_Int col, sum, pos;
930       HYPRE_Int is_SC = 0; /* if is a Strong-C */
931       HYPRE_Complex val;
932 
933       if (j < q_offd_A)
934       {
935          col = read_only_load(&A_offd_j[j]);
936 
937          if ( num_functions == 1 || read_only_load(&dof_func_offd[col]) == dof_func_i )
938          {
939             val = read_only_load(&A_offd_data[j]);
940             if (read_only_load(&Soc_offd_j[j]) > -1)
941             {
942                if (read_only_load(&CF_marker_offd[col]) > 0)
943                {
944                   is_SC = 1;
945                }
946                else
947                {
948                   sum_F += val;
949                }
950             }
951             else
952             {
953                diagonal += val;
954             }
955          }
956       }
957 
958       pos = warp_prefix_sum(lane, is_SC, sum);
959 
960       if (is_SC)
961       {
962          P_offd_data[k + pos] = val;
963          P_offd_j[k + pos] = col;
964       }
965       k += sum;
966    }
967 
968    hypre_device_assert(k == q_offd_P);
969 
970    diagonal  = warp_allreduce_sum(diagonal);
971    sum_F     = warp_allreduce_sum(sum_F);
972 
973    HYPRE_Complex beta = sum_F / (q_diag_P - p_diag_P + q_offd_P - p_offd_P);
974 
975    for (HYPRE_Int j = p_diag_P + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q_diag_P); j += HYPRE_WARP_SIZE)
976    {
977       /* if (P_diag_data[j] > 0.0)
978             P_diag_data[j] *= -beta;
979          else
980             P_diag_data[j] *= -alfa; */
981       if (j < q_diag_P)
982       {
983          P_diag_data[j] = -(P_diag_data[j] + beta) / diagonal;
984       }
985    }
986 
987    for (HYPRE_Int j = p_offd_P + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q_offd_P); j += HYPRE_WARP_SIZE)
988    {
989       /* if (P_offd_data[indp]> 0)
990             P_offd_data[indp] *= -beta;
991          else
992             P_offd_data[indp] *= -alfa; */
993       if (j < q_offd_P)
994       {
995          P_offd_data[j] = -(P_offd_data[j] + beta) / diagonal;
996       }
997    }
998 }
999 
1000 HYPRE_Int
hypre_BoomerAMGBuildInterpOnePntDevice(hypre_ParCSRMatrix * A,HYPRE_Int * CF_marker,hypre_ParCSRMatrix * S,HYPRE_BigInt * num_cpts_global,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int debug_flag,hypre_ParCSRMatrix ** P_ptr)1001 hypre_BoomerAMGBuildInterpOnePntDevice( hypre_ParCSRMatrix  *A,
1002                                         HYPRE_Int           *CF_marker,
1003                                         hypre_ParCSRMatrix  *S,
1004                                         HYPRE_BigInt        *num_cpts_global,
1005                                         HYPRE_Int            num_functions,
1006                                         HYPRE_Int           *dof_func,
1007                                         HYPRE_Int            debug_flag,
1008                                         hypre_ParCSRMatrix **P_ptr)
1009 {
1010    MPI_Comm                 comm     = hypre_ParCSRMatrixComm(A);
1011    hypre_ParCSRCommPkg     *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
1012    hypre_ParCSRCommHandle  *comm_handle;
1013 
1014    hypre_CSRMatrix         *A_diag          = hypre_ParCSRMatrixDiag(A);
1015    HYPRE_Int               *A_diag_i        = hypre_CSRMatrixI(A_diag);
1016    HYPRE_Int               *A_strong_diag_j = hypre_ParCSRMatrixSocDiagJ(S);
1017    HYPRE_Complex           *A_diag_a        = hypre_CSRMatrixData(A_diag);
1018 
1019    hypre_CSRMatrix         *A_offd          = hypre_ParCSRMatrixOffd(A);
1020    HYPRE_Int               *A_offd_i        = hypre_CSRMatrixI(A_offd);
1021    HYPRE_Int               *A_strong_offd_j = hypre_ParCSRMatrixSocOffdJ(S);
1022    HYPRE_Complex           *A_offd_a        = hypre_CSRMatrixData(A_offd);
1023 
1024    HYPRE_Int                num_cols_A_offd = hypre_CSRMatrixNumCols(A_offd);
1025 
1026    /* Interpolation matrix P */
1027    hypre_ParCSRMatrix      *P;
1028    /* csr's */
1029    hypre_CSRMatrix    *P_diag;
1030    hypre_CSRMatrix    *P_offd;
1031    /* arrays */
1032    HYPRE_Real         *P_diag_data;
1033    HYPRE_Int          *P_diag_i;
1034    HYPRE_Int          *P_diag_j;
1035    HYPRE_Int          *P_diag_j_temp;
1036    HYPRE_Int          *P_diag_j_temp_compressed;
1037    HYPRE_Real         *P_offd_data;
1038    HYPRE_Int          *P_offd_i;
1039    HYPRE_Int          *P_offd_j;
1040    HYPRE_Int          *P_offd_j_temp;
1041    HYPRE_Int          *P_offd_j_temp_compressed;
1042    HYPRE_Int           num_cols_P_offd;
1043    HYPRE_BigInt       *col_map_offd_P = NULL;
1044    HYPRE_BigInt       *col_map_offd_P_device = NULL;
1045    /* CF marker off-diag part */
1046    HYPRE_Int          *CF_marker_offd = NULL;
1047    /* nnz */
1048    HYPRE_Int           nnz_diag, nnz_offd;
1049    /* local size */
1050    HYPRE_Int           n_fine = hypre_CSRMatrixNumRows(A_diag);
1051    /* fine to coarse mapping: diag part and offd part */
1052    HYPRE_Int          *fine_to_coarse;
1053    HYPRE_BigInt       *fine_to_coarse_offd = NULL;
1054    HYPRE_BigInt        total_global_cpts, my_first_cpt;
1055    HYPRE_Int           my_id, num_procs;
1056    HYPRE_Int           num_sends;
1057    HYPRE_Int          *int_buf_data = NULL;
1058    HYPRE_BigInt       *big_int_buf_data = NULL;
1059    //HYPRE_Int col_start = hypre_ParCSRMatrixFirstRowIndex(A);
1060    //HYPRE_Int col_end   = col_start + n_fine;
1061    /* arrays for compressing P_diag and P_offd col indices and data */
1062    HYPRE_Int          *diag_compress_marker;
1063    HYPRE_Int          *offd_compress_marker;
1064 
1065    hypre_MPI_Comm_size(comm, &num_procs);
1066    hypre_MPI_Comm_rank(comm,&my_id);
1067 
1068    my_first_cpt = num_cpts_global[0];
1069    if (my_id == (num_procs -1)) total_global_cpts = num_cpts_global[1];
1070    hypre_MPI_Bcast(&total_global_cpts, 1, HYPRE_MPI_BIG_INT, num_procs-1, comm);
1071 
1072    /* fine to coarse mapping */
1073    fine_to_coarse = hypre_TAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_DEVICE);
1074    HYPRE_THRUST_CALL( exclusive_scan,
1075                       thrust::make_transform_iterator(CF_marker,          is_nonnegative<HYPRE_Int>()),
1076                       thrust::make_transform_iterator(CF_marker + n_fine, is_nonnegative<HYPRE_Int>()),
1077                       fine_to_coarse,
1078                       HYPRE_Int(0) ); /* *MUST* pass init value since input and output types diff. */
1079 
1080    /*-------------------------------------------------------------------
1081     * Get the CF_marker data for the off-processor columns
1082     *-------------------------------------------------------------------*/
1083    if (num_cols_A_offd)
1084    {
1085       CF_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_A_offd, HYPRE_MEMORY_DEVICE);
1086    }
1087    /* if CommPkg of A is not present, create it */
1088    if (!comm_pkg)
1089    {
1090       hypre_MatvecCommPkgCreate(A);
1091       comm_pkg = hypre_ParCSRMatrixCommPkg(A);
1092    }
1093    /* number of sends to do (number of procs) */
1094    num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
1095    /* send buffer, of size send_map_starts[num_sends]),
1096     * i.e., number of entries to send */
1097    int_buf_data = hypre_CTAlloc(HYPRE_Int, hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends), HYPRE_MEMORY_DEVICE);
1098 
1099    /* copy CF markers of elements to send to buffer */
1100    HYPRE_THRUST_CALL( gather,
1101                       hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg),
1102                       hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg) +
1103                       hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
1104                       CF_marker,
1105                       int_buf_data );
1106    /* create a handle to start communication. 11: for integer */
1107    comm_handle = hypre_ParCSRCommHandleCreate_v2(11, comm_pkg, HYPRE_MEMORY_DEVICE, int_buf_data, HYPRE_MEMORY_DEVICE, CF_marker_offd);
1108    /* destroy the handle to finish communication */
1109    hypre_ParCSRCommHandleDestroy(comm_handle);
1110    hypre_TFree(int_buf_data, HYPRE_MEMORY_DEVICE);
1111 
1112    /*-----------------------------------------------------------------------
1113     *  First Pass: Determine size of P and fill in fine_to_coarse mapping,
1114     *  and find the most strongly influencing C-pt for each F-pt
1115     *-----------------------------------------------------------------------*/
1116 
1117    P_diag_i = hypre_CTAlloc(HYPRE_Int, n_fine+1, HYPRE_MEMORY_DEVICE);
1118    P_offd_i = hypre_CTAlloc(HYPRE_Int, n_fine+1, HYPRE_MEMORY_DEVICE);
1119 
1120    diag_compress_marker = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_DEVICE);
1121    offd_compress_marker = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_DEVICE);
1122 
1123    /* Overallocate here and compress later */
1124    P_diag_j_temp = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_DEVICE);
1125    P_offd_j_temp = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_DEVICE);
1126 
1127    dim3 bDim = hypre_GetDefaultCUDABlockDimension();
1128    dim3 gDim = hypre_GetDefaultCUDAGridDimension(n_fine, "warp", bDim);
1129 
1130    HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildInterpOnePnt_getnnz, gDim, bDim,
1131                       n_fine, A_diag_i, A_strong_diag_j, A_diag_a, A_offd_i, A_strong_offd_j,
1132                       A_offd_a, CF_marker, CF_marker_offd, diag_compress_marker,
1133                       offd_compress_marker, P_diag_i, P_diag_j_temp, P_offd_i, P_offd_j_temp);
1134 
1135    /*-----------------------------------------------------------------------
1136     *  Send and receive fine_to_coarse info.
1137     *-----------------------------------------------------------------------*/
1138    fine_to_coarse_offd = hypre_CTAlloc(HYPRE_BigInt, num_cols_A_offd,HYPRE_MEMORY_DEVICE);
1139    big_int_buf_data = hypre_CTAlloc(HYPRE_BigInt, hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends), HYPRE_MEMORY_DEVICE);
1140    HYPRE_THRUST_CALL( gather,
1141                       hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg),
1142                       hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg) +
1143                       hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
1144                       fine_to_coarse,
1145                       big_int_buf_data );
1146    HYPRE_THRUST_CALL( transform,
1147                       big_int_buf_data,
1148                       big_int_buf_data + hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
1149                       thrust::make_constant_iterator(my_first_cpt),
1150                       big_int_buf_data,
1151                       thrust::plus<HYPRE_BigInt>() );
1152    comm_handle = hypre_ParCSRCommHandleCreate_v2(21, comm_pkg, HYPRE_MEMORY_DEVICE, big_int_buf_data, HYPRE_MEMORY_DEVICE, fine_to_coarse_offd);
1153    hypre_ParCSRCommHandleDestroy(comm_handle);
1154    hypre_TFree(big_int_buf_data, HYPRE_MEMORY_DEVICE);
1155 
1156    /*-----------------------------------------------------------------------
1157     *  Fill values and finish setting up P.
1158     *-----------------------------------------------------------------------*/
1159 
1160    /* scan P_diag_i (which has number of nonzeros in each row) to get row indices */
1161    hypreDevice_IntegerExclusiveScan(n_fine+1, P_diag_i);
1162    hypreDevice_IntegerExclusiveScan(n_fine+1, P_offd_i);
1163 
1164    /* get the number of nonzeros and allocate column index and data arrays */
1165    hypre_TMemcpy(&nnz_diag, &P_diag_i[n_fine], HYPRE_Int, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);
1166    hypre_TMemcpy(&nnz_offd, &P_offd_i[n_fine], HYPRE_Int, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);
1167 
1168    P_diag_j    = hypre_TAlloc(HYPRE_Int,  nnz_diag, HYPRE_MEMORY_DEVICE);
1169    P_diag_data = hypre_TAlloc(HYPRE_Real, nnz_diag, HYPRE_MEMORY_DEVICE);
1170 
1171 
1172    P_offd_j    = hypre_TAlloc(HYPRE_Int,  nnz_offd, HYPRE_MEMORY_DEVICE);
1173    P_offd_data = hypre_TAlloc(HYPRE_Real, nnz_offd, HYPRE_MEMORY_DEVICE);
1174 
1175    /* set data values to 1.0 */
1176    HYPRE_THRUST_CALL( fill_n,
1177                       P_diag_data,
1178                       nnz_diag,
1179                       1.0 );
1180    HYPRE_THRUST_CALL( fill_n,
1181                       P_offd_data,
1182                       nnz_offd,
1183                       1.0 );
1184 
1185    /* compress temporary column indices */
1186    P_diag_j_temp_compressed = hypre_TAlloc(HYPRE_Int, nnz_diag, HYPRE_MEMORY_DEVICE);
1187    P_offd_j_temp_compressed = hypre_TAlloc(HYPRE_Int, nnz_offd, HYPRE_MEMORY_DEVICE);
1188 
1189    HYPRE_THRUST_CALL( copy_if,
1190                       P_diag_j_temp,
1191                       P_diag_j_temp + n_fine,
1192                       diag_compress_marker,
1193                       P_diag_j_temp_compressed,
1194                       equal<HYPRE_Int>(1) );
1195    HYPRE_THRUST_CALL( copy_if,
1196                       P_offd_j_temp,
1197                       P_offd_j_temp + n_fine,
1198                       offd_compress_marker,
1199                       P_offd_j_temp_compressed,
1200                       equal<HYPRE_Int>(1) );
1201 
1202    /* map the diag column indices */
1203    HYPRE_THRUST_CALL( gather,
1204                       P_diag_j_temp_compressed,
1205                       P_diag_j_temp_compressed + nnz_diag,
1206                       fine_to_coarse,
1207                       P_diag_j );
1208 
1209    /* mark the offd indices for P as a subset of offd indices of A */
1210    HYPRE_Int *mark_P_offd_idx = hypre_CTAlloc(HYPRE_Int, num_cols_A_offd, HYPRE_MEMORY_DEVICE);
1211    // note that scatter is usually not safe if the same index appears more than once in the map,
1212    // but here we are just scattering constant values, so this is safe
1213    HYPRE_THRUST_CALL( scatter,
1214                       thrust::make_constant_iterator(1),
1215                       thrust::make_constant_iterator(1) + nnz_offd,
1216                       P_offd_j_temp_compressed,
1217                       mark_P_offd_idx );
1218    num_cols_P_offd = HYPRE_THRUST_CALL(reduce, mark_P_offd_idx, mark_P_offd_idx + num_cols_A_offd);
1219 
1220    /* get a mapping from P offd indices to A offd indices */
1221    /* offd_map_P_to_A[ P offd idx ] = A offd idx */
1222    HYPRE_Int *offd_map_P_to_A = hypre_CTAlloc(HYPRE_Int, num_cols_P_offd, HYPRE_MEMORY_DEVICE);
1223    HYPRE_THRUST_CALL( copy_if,
1224                       thrust::make_counting_iterator(0),
1225                       thrust::make_counting_iterator(num_cols_A_offd),
1226                       mark_P_offd_idx,
1227                       offd_map_P_to_A,
1228                       equal<HYPRE_Int>(1) );
1229    hypre_TFree(mark_P_offd_idx, HYPRE_MEMORY_DEVICE);
1230 
1231    /* also get an inverse mapping from A offd indices to P offd indices */
1232    /* offd_map_A_to_P[ A offd idx ] = -1 if not a P idx, else P offd idx */
1233    HYPRE_Int *offd_map_A_to_P = hypre_TAlloc(HYPRE_Int, num_cols_A_offd, HYPRE_MEMORY_DEVICE);
1234    HYPRE_THRUST_CALL( fill_n,
1235                       offd_map_A_to_P,
1236                       num_cols_A_offd,
1237                       -1 );
1238    HYPRE_THRUST_CALL( scatter,
1239                       thrust::make_counting_iterator(0),
1240                       thrust::make_counting_iterator(num_cols_P_offd),
1241                       offd_map_P_to_A,
1242                       offd_map_A_to_P );
1243 
1244    /* use inverse mapping above to map P_offd_j */
1245    HYPRE_THRUST_CALL( gather,
1246                       P_offd_j_temp_compressed,
1247                       P_offd_j_temp_compressed + nnz_offd,
1248                       offd_map_A_to_P,
1249                       P_offd_j );
1250    hypre_TFree(P_offd_j_temp_compressed, HYPRE_MEMORY_DEVICE);
1251    hypre_TFree(offd_map_A_to_P, HYPRE_MEMORY_DEVICE);
1252 
1253    /* setup col_map_offd for P */
1254    col_map_offd_P_device = hypre_CTAlloc(HYPRE_BigInt, num_cols_P_offd, HYPRE_MEMORY_DEVICE);
1255    col_map_offd_P = hypre_CTAlloc(HYPRE_BigInt, num_cols_P_offd, HYPRE_MEMORY_HOST);
1256    HYPRE_THRUST_CALL( gather,
1257                       offd_map_P_to_A,
1258                       offd_map_P_to_A + num_cols_P_offd,
1259                       fine_to_coarse_offd,
1260                       col_map_offd_P_device);
1261    hypre_TMemcpy(col_map_offd_P, col_map_offd_P_device, HYPRE_BigInt, num_cols_P_offd, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);
1262    hypre_TFree(offd_map_P_to_A, HYPRE_MEMORY_DEVICE);
1263    hypre_TFree(col_map_offd_P_device, HYPRE_MEMORY_DEVICE);
1264 
1265    /* Now, we should have everything of Parcsr matrix P */
1266    P = hypre_ParCSRMatrixCreate(comm,
1267                                 hypre_ParCSRMatrixGlobalNumCols(A), /* global num of rows */
1268                                 total_global_cpts, /* global num of cols */
1269                                 hypre_ParCSRMatrixColStarts(A), /* row_starts */
1270                                 num_cpts_global, /* col_starts */
1271                                 num_cols_P_offd, /* num cols offd */
1272                                 nnz_diag,
1273                                 nnz_offd);
1274 
1275    P_diag = hypre_ParCSRMatrixDiag(P);
1276    hypre_CSRMatrixData(P_diag) = P_diag_data;
1277    hypre_CSRMatrixI(P_diag)    = P_diag_i;
1278    hypre_CSRMatrixJ(P_diag)    = P_diag_j;
1279 
1280    P_offd = hypre_ParCSRMatrixOffd(P);
1281    hypre_CSRMatrixData(P_offd) = P_offd_data;
1282    hypre_CSRMatrixI(P_offd)    = P_offd_i;
1283    hypre_CSRMatrixJ(P_offd)    = P_offd_j;
1284 
1285    hypre_ParCSRMatrixColMapOffd(P) = col_map_offd_P;
1286 
1287    /* create CommPkg of P */
1288    hypre_MatvecCommPkgCreate(P);
1289 
1290    *P_ptr = P;
1291 
1292    /* free workspace */
1293    hypre_TFree(CF_marker_offd, HYPRE_MEMORY_DEVICE);
1294    hypre_TFree(fine_to_coarse, HYPRE_MEMORY_DEVICE);
1295    hypre_TFree(fine_to_coarse_offd, HYPRE_MEMORY_DEVICE);
1296 
1297    return hypre_error_flag;
1298 }
1299 
1300 /*-----------------------------------------------------------------------*/
1301 __global__ void
hypre_BoomerAMGBuildInterpOnePnt_getnnz(HYPRE_Int nr_of_rows,HYPRE_Int * A_diag_i,HYPRE_Int * A_strong_diag_j,HYPRE_Complex * A_diag_a,HYPRE_Int * A_offd_i,HYPRE_Int * A_strong_offd_j,HYPRE_Complex * A_offd_a,HYPRE_Int * CF_marker,HYPRE_Int * CF_marker_offd,HYPRE_Int * diag_compress_marker,HYPRE_Int * offd_compress_marker,HYPRE_Int * P_diag_i,HYPRE_Int * P_diag_j,HYPRE_Int * P_offd_i,HYPRE_Int * P_offd_j)1302 hypre_BoomerAMGBuildInterpOnePnt_getnnz( HYPRE_Int      nr_of_rows,
1303                                          HYPRE_Int     *A_diag_i,
1304                                          HYPRE_Int     *A_strong_diag_j,
1305                                          HYPRE_Complex *A_diag_a,
1306                                          HYPRE_Int     *A_offd_i,
1307                                          HYPRE_Int     *A_strong_offd_j,
1308                                          HYPRE_Complex *A_offd_a,
1309                                          HYPRE_Int     *CF_marker,
1310                                          HYPRE_Int     *CF_marker_offd,
1311                                          HYPRE_Int     *diag_compress_marker,
1312                                          HYPRE_Int     *offd_compress_marker,
1313                                          HYPRE_Int     *P_diag_i,
1314                                          HYPRE_Int     *P_diag_j,
1315                                          HYPRE_Int     *P_offd_i,
1316                                          HYPRE_Int     *P_offd_j)
1317 {
1318    /*-----------------------------------------------------------------------*/
1319    /* Determine size of interpolation matrix, P
1320 
1321       If A is of size m x m, then P will be of size m x c where c is the
1322       number of coarse points.
1323 
1324       It is assumed that S have the same global column enumeration as A
1325 
1326       Input: nr_of_rows                  - Number of rows in matrix (local in processor)
1327              A_diag_i, A_strong_diag_j,  - Arrays associated with ParCSRMatrix A
1328              A_diag_a, A_offd_i,           where the column indices are taken from S
1329              A_strong_offd_j, A_offd_a     and mark weak connections with negative indices
1330              CF_maker                    - coarse/fine marker for on-processor points
1331              CF_maker_offd               - coarse/fine marker for off-processor connections
1332 
1333       Output: P_diag_i             - Vector where P_diag_i[i] holds the number of non-zero elements of P_diag on row i (will be 1).
1334               P_diag_i             - Vector where P_diag_j[i] holds a temporary, uncompressed column indices for P_diag.
1335               P_offd_i             - Vector where P_offd_i[i] holds the number of non-zero elements of P_offd on row i (will be 1).
1336               P_offd_i             - Vector where P_offd_j[i] holds a temporary, uncompressed column indices for P_offd.
1337               diag_compress_marker - Array of 0s and 1s used to compress P_diag col indices and data.
1338               offd_compress_marker - Array of 0s and 1s used to compress P_offd col indices and data.
1339     */
1340    /*-----------------------------------------------------------------------*/
1341 
1342    HYPRE_Int i = hypre_cuda_get_grid_warp_id<1,1>();
1343 
1344    if (i >= nr_of_rows)
1345    {
1346       return;
1347    }
1348 
1349    HYPRE_Int p, q;
1350    HYPRE_Int max_j_diag = -1, max_j_offd = -1;
1351    HYPRE_Int lane = hypre_cuda_get_lane_id<1>();
1352    HYPRE_Real max_diag = -1.0, max_offd = -1.0;
1353    HYPRE_Real warp_max_diag = -1.0, warp_max_offd = -1.0;
1354 
1355    if (lane == 0)
1356    {
1357       p = read_only_load(CF_marker + i);
1358    }
1359    p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0);
1360 
1361    /*--------------------------------------------------------------------
1362     *  If i is a C-point, interpolation is the identity.
1363     *--------------------------------------------------------------------*/
1364    if (p >= 0)
1365    {
1366       if (lane == 0)
1367       {
1368          P_diag_i[i] = 1;
1369          P_diag_j[i] = i;
1370          diag_compress_marker[i] = 1;
1371       }
1372       return;
1373    }
1374 
1375    /*--------------------------------------------------------------------
1376     *  If i is an F-point, find strongest connected C-point,
1377     *  which could be in diag or offd.
1378     *--------------------------------------------------------------------*/
1379 
1380    /* diag part */
1381    if (lane < 2)
1382    {
1383       p = read_only_load(A_diag_i + i + lane);
1384    }
1385    q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1);
1386    p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0);
1387 
1388    for (HYPRE_Int j = p + lane; j < q; j += HYPRE_WARP_SIZE)
1389    {
1390       /* column indices are negative for weak connections */
1391       const HYPRE_Int col = read_only_load(&A_strong_diag_j[j]);
1392       if (col >= 0)
1393       {
1394          const HYPRE_Complex val = fabs( read_only_load(&A_diag_a[j]) );
1395          if ( read_only_load(&CF_marker[col]) > 0 && val > max_diag )
1396          {
1397             max_diag = val;
1398             max_j_diag = col;
1399          }
1400       }
1401    }
1402    warp_max_diag = warp_allreduce_max(max_diag);
1403 
1404    /* offd part */
1405    if (lane < 2)
1406    {
1407       p = read_only_load(A_offd_i + i + lane);
1408    }
1409    q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1);
1410    p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0);
1411 
1412    for (HYPRE_Int j = p + lane; j < q; j += HYPRE_WARP_SIZE)
1413    {
1414       const HYPRE_Int col = read_only_load(&A_strong_offd_j[j]);
1415       /* column indices are negative for weak connections */
1416       if (col >= 0)
1417       {
1418          const HYPRE_Complex val = fabs( read_only_load(&A_offd_a[j]) );
1419          if ( read_only_load(&CF_marker_offd[col]) > 0 && val > max_offd )
1420          {
1421             max_offd = val;
1422             max_j_offd = col;
1423          }
1424       }
1425    }
1426    warp_max_offd = warp_allreduce_max(max_offd);
1427 
1428    /*--------------------------------------------------------------------
1429     *  If no max found, then there is no strongly connected C-point,
1430     *  and this will be a zero row
1431     *--------------------------------------------------------------------*/
1432 
1433    if (warp_max_offd < 0 && warp_max_diag < 0)
1434    {
1435       return;
1436    }
1437 
1438    /*--------------------------------------------------------------------
1439     *  Otherwise, find the column index in either diag or offd
1440     *--------------------------------------------------------------------*/
1441 
1442    if (warp_max_offd > warp_max_diag)
1443    {
1444       if (warp_max_offd != max_offd)
1445       {
1446          max_j_offd = -1;
1447       }
1448       max_j_offd = warp_reduce_max(max_j_offd);
1449       if (lane == 0)
1450       {
1451          P_offd_i[i] = 1;
1452          P_offd_j[i] = max_j_offd;
1453          offd_compress_marker[i] = 1;
1454       }
1455    }
1456    else
1457    {
1458       if (warp_max_diag != max_diag)
1459       {
1460          max_j_diag = -1;
1461       }
1462       max_j_diag = warp_reduce_max(max_j_diag);
1463       if (lane == 0)
1464       {
1465          P_diag_i[i] = 1;
1466          P_diag_j[i] = max_j_diag;
1467          diag_compress_marker[i] = 1;
1468       }
1469    }
1470 }
1471 
1472 #endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
1473