1 /******************************************************************************
2  * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other
3  * HYPRE Project Developers. See the top-level COPYRIGHT file for details.
4  *
5  * SPDX-License-Identifier: (Apache-2.0 OR MIT)
6  ******************************************************************************/
7 
8 #include "_hypre_utilities.h"
9 #include "_hypre_utilities.hpp"
10 
11 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
12 
13 /*
14  * The architecture identification macro __CUDA_ARCH__ is assigned a three-digit value string xy0
15  * (ending in a literal 0) during each nvcc compilation stage 1 that compiles for compute_xy.
16  * This macro can be used in the implementation of GPU functions for determining the virtual architecture
17  * for which it is currently being compiled. The host code (the non-GPU code) must not depend on it.
18  * Note that compute_XX refers to a PTX version and sm_XX refers to a cubin version.
19 */
20 __global__ void
21 hypreCUDAKernel_CompileFlagSafetyCheck(hypre_int *cuda_arch_compile)
22 {
23 #if defined(__CUDA_ARCH__)
24    cuda_arch_compile[0] = __CUDA_ARCH__;
25 #endif
26 }
27 
28 /*
29  * Assume this function is called inside HYPRE_Init(), at a place where we do not want to
30  * activate memory pooling, so we do not use hypre's memory model to allocate and free.
31  * See the commented-out code below (do not delete it).
32 */
33 void hypre_CudaCompileFlagCheck()
34 {
35   // This is really only defined for CUDA and not for HIP
36 #if defined(HYPRE_USING_CUDA)
37 
38    HYPRE_Int device = hypre_HandleCudaDevice(hypre_handle());
39 
40    struct cudaDeviceProp props;
41    cudaGetDeviceProperties(&props, device);
42    hypre_int cuda_arch_actual = props.major*100 + props.minor*10;
43    hypre_int cuda_arch_compile = -1;
44    dim3 gDim(1,1,1), bDim(1,1,1);
45 
46    hypre_int *cuda_arch_compile_d = NULL;
47    //cuda_arch_compile_d = hypre_TAlloc(hypre_int, 1, HYPRE_MEMORY_DEVICE);
48    HYPRE_CUDA_CALL( cudaMalloc(&cuda_arch_compile_d, sizeof(hypre_int)) );
49    hypre_TMemcpy(cuda_arch_compile_d, &cuda_arch_compile, hypre_int, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
50    HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CompileFlagSafetyCheck, gDim, bDim, cuda_arch_compile_d );
51    hypre_TMemcpy(&cuda_arch_compile, cuda_arch_compile_d, hypre_int, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);
52    //hypre_TFree(cuda_arch_compile_d, HYPRE_MEMORY_DEVICE);
53    HYPRE_CUDA_CALL( cudaFree(cuda_arch_compile_d) );
54 
55    /* HYPRE_CUDA_CALL(cudaDeviceSynchronize()); */
56 
57    if (-1 == cuda_arch_compile)
58    {
59       hypre_error_w_msg(1, "hypre error: no proper cuda_arch found");
60    }
61    else if (cuda_arch_actual != cuda_arch_compile)
62    {
63       char msg[256];
64       hypre_sprintf(msg, "hypre warning: Compile 'arch=compute_' does not match device arch %d", cuda_arch_actual);
65       hypre_error_w_msg(1, msg);
66       /*
67       hypre_printf("%s\n", msg);
68       hypre_MPI_Abort(hypre_MPI_COMM_WORLD, -1);
69       */
70    }
71 
72 #endif // defined(HYPRE_USING_CUDA)
73 }
74 
75 dim3
76 hypre_GetDefaultCUDABlockDimension()
77 {
78    dim3 bDim(512, 1, 1);
79 
80    return bDim;
81 }
82 
83 dim3
84 hypre_GetDefaultCUDAGridDimension( HYPRE_Int n,
85                                    const char *granularity,
86                                    dim3 bDim )
87 {
88    HYPRE_Int num_blocks = 0;
89    HYPRE_Int num_threads_per_block = bDim.x * bDim.y * bDim.z;
90 
91    if (granularity[0] == 't')
92    {
93       num_blocks = (n + num_threads_per_block - 1) / num_threads_per_block;
94    }
95    else if (granularity[0] == 'w')
96    {
97       HYPRE_Int num_warps_per_block = num_threads_per_block >> HYPRE_WARP_BITSHIFT;
98 
99       hypre_assert(num_warps_per_block * HYPRE_WARP_SIZE == num_threads_per_block);
100 
101       num_blocks = (n + num_warps_per_block - 1) / num_warps_per_block;
102    }
103    else
104    {
105       hypre_printf("Error %s %d: Unknown granularity !\n", __FILE__, __LINE__);
106       hypre_assert(0);
107    }
108 
109    dim3 gDim(num_blocks, 1, 1);
110 
111    return gDim;
112 }
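
/* A minimal usage sketch (kept under "#if 0"; not part of the library): how the two
 * helpers above are typically combined with HYPRE_CUDA_LAUNCH. "thread" granularity
 * assigns one thread per work item; "warp" granularity assigns one warp per work item
 * (e.g., one warp per matrix row). The function and variable names are hypothetical. */
#if 0
void example_launch_configuration(HYPRE_Int n)
{
   dim3 bDim = hypre_GetDefaultCUDABlockDimension();              /* 512 threads per block */

   /* one thread per item: ceil(n / 512) blocks */
   dim3 gDim_thread = hypre_GetDefaultCUDAGridDimension(n, "thread", bDim);

   /* one warp per item: ceil(n / (512 / HYPRE_WARP_SIZE)) blocks */
   dim3 gDim_warp = hypre_GetDefaultCUDAGridDimension(n, "warp", bDim);

   /* e.g., HYPRE_CUDA_LAUNCH( some_kernel, gDim_thread, bDim, n, ... ); */
}
#endif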
113 
114 /**
115  * Get the NNZ of each row listed in d_row_indices and store the results in d_rownnz
116  * All pointers are device pointers.
117  * d_rownnz can be the same as d_row_indices
118  */
119 __global__ void
120 hypreCUDAKernel_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_ia, HYPRE_Int *d_offd_ia,
121                           HYPRE_Int *d_rownnz)
122 {
123    const HYPRE_Int global_thread_id = hypre_cuda_get_grid_thread_id<1,1>();
124 
125    if (global_thread_id < nrows)
126    {
127       HYPRE_Int i;
128 
129       if (d_row_indices)
130       {
131          i = read_only_load(&d_row_indices[global_thread_id]);
132       }
133       else
134       {
135          i = global_thread_id;
136       }
137 
138       d_rownnz[global_thread_id] = read_only_load(&d_diag_ia[i+1]) - read_only_load(&d_diag_ia[i]) +
139                                    read_only_load(&d_offd_ia[i+1]) - read_only_load(&d_offd_ia[i]);
140    }
141 }
142 
143 /* special case: if d_row_indices == NULL, it means d_row_indices=[0,1,...,nrows-1] */
144 HYPRE_Int
145 hypreDevice_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_ia, HYPRE_Int *d_offd_ia,
146                       HYPRE_Int *d_rownnz)
147 {
148    const dim3 bDim = hypre_GetDefaultCUDABlockDimension();
149    const dim3 gDim = hypre_GetDefaultCUDAGridDimension(nrows, "thread", bDim);
150 
151    /* trivial case */
152    if (nrows <= 0)
153    {
154       return hypre_error_flag;
155    }
156 
157    HYPRE_CUDA_LAUNCH( hypreCUDAKernel_GetRowNnz, gDim, bDim, nrows, d_row_indices, d_diag_ia, d_offd_ia, d_rownnz );
158 
159    return hypre_error_flag;
160 }
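
/* A minimal usage sketch (under "#if 0", not compiled): counting the nonzeros of a few
 * selected rows given the device CSR row pointers of the diag and offd parts. The
 * function name, the row list, and the sizes are hypothetical. */
#if 0
void example_row_nnz(HYPRE_Int *d_diag_i, HYPRE_Int *d_offd_i)
{
   HYPRE_Int  nrows = 3;
   HYPRE_Int  h_rows[3] = {0, 5, 9};   /* local row indices of interest (illustrative) */
   HYPRE_Int *d_rows   = hypre_TAlloc(HYPRE_Int, nrows, HYPRE_MEMORY_DEVICE);
   HYPRE_Int *d_rownnz = hypre_TAlloc(HYPRE_Int, nrows, HYPRE_MEMORY_DEVICE);

   hypre_TMemcpy(d_rows, h_rows, HYPRE_Int, nrows, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);

   /* d_rownnz[k] = nnz in diag row h_rows[k] + nnz in offd row h_rows[k] */
   hypreDevice_GetRowNnz(nrows, d_rows, d_diag_i, d_offd_i, d_rownnz);

   hypre_TFree(d_rows,   HYPRE_MEMORY_DEVICE);
   hypre_TFree(d_rownnz, HYPRE_MEMORY_DEVICE);
}
#endif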
161 
162 __global__ void
163 hypreCUDAKernel_CopyParCSRRows(HYPRE_Int      nrows,
164                                HYPRE_Int     *d_row_indices,
165                                HYPRE_Int      has_offd,
166                                HYPRE_BigInt   first_col,
167                                HYPRE_BigInt  *d_col_map_offd_A,
168                                HYPRE_Int     *d_diag_i,
169                                HYPRE_Int     *d_diag_j,
170                                HYPRE_Complex *d_diag_a,
171                                HYPRE_Int     *d_offd_i,
172                                HYPRE_Int     *d_offd_j,
173                                HYPRE_Complex *d_offd_a,
174                                HYPRE_Int     *d_ib,
175                                HYPRE_BigInt  *d_jb,
176                                HYPRE_Complex *d_ab)
177 {
178    const HYPRE_Int global_warp_id = hypre_cuda_get_grid_warp_id<1,1>();
179 
180    if (global_warp_id >= nrows)
181    {
182       return;
183    }
184 
185    /* lane id inside the warp */
186    const HYPRE_Int lane_id = hypre_cuda_get_lane_id<1>();
187    HYPRE_Int i, j, k, p, row, istart, iend, bstart;
188 
189    /* diag part */
190    if (lane_id < 2)
191    {
192       /* row index to work on */
193       if (d_row_indices)
194       {
195          row = read_only_load(d_row_indices + global_warp_id);
196       }
197       else
198       {
199          row = global_warp_id;
200       }
201       /* start/end position of the row */
202       j = read_only_load(d_diag_i + row + lane_id);
203       /* start position of b */
204       k = d_ib ? read_only_load(d_ib + global_warp_id) : 0;
205    }
206    istart = __shfl_sync(HYPRE_WARP_FULL_MASK, j, 0);
207    iend   = __shfl_sync(HYPRE_WARP_FULL_MASK, j, 1);
208    bstart = __shfl_sync(HYPRE_WARP_FULL_MASK, k, 0);
209 
210    p = bstart - istart;
211    for (i = istart + lane_id; i < iend; i += HYPRE_WARP_SIZE)
212    {
213       d_jb[p+i] = read_only_load(d_diag_j + i) + first_col;
214       if (d_ab)
215       {
216          d_ab[p+i] = read_only_load(d_diag_a + i);
217       }
218    }
219 
220    if (!has_offd)
221    {
222       return;
223    }
224 
225    /* offd part */
226    if (lane_id < 2)
227    {
228       j = read_only_load(d_offd_i + row + lane_id);
229    }
230    bstart += iend - istart;
231    istart = __shfl_sync(HYPRE_WARP_FULL_MASK, j, 0);
232    iend   = __shfl_sync(HYPRE_WARP_FULL_MASK, j, 1);
233 
234    p = bstart - istart;
235    for (i = istart + lane_id; i < iend; i += HYPRE_WARP_SIZE)
236    {
237       if (d_col_map_offd_A)
238       {
239          d_jb[p+i] = d_col_map_offd_A[read_only_load(d_offd_j + i)];
240       }
241       else
242       {
243          d_jb[p+i] = -1 - read_only_load(d_offd_j + i);
244       }
245 
246       if (d_ab)
247       {
248          d_ab[p+i] = read_only_load(d_offd_a + i);
249       }
250    }
251 }
252 
253 /* B = A(row_indices, :) */
254 /* Note: d_ib is an input vector of length (nrows + 1), or nrows if the last
255  *       entry (the total nnz) is omitted; it contains the row pointers, i.e., the
256  *       start positions at which each row is written into d_jb and d_ab.
257  *       The column indices in B are global indices, i.e., BigJ. */
258 /* Special cases:
259  *    if d_row_indices == NULL, it means d_row_indices=[0,1,...,nrows-1]
260  *    If col_map_offd_A == NULL, use (-1 - d_offd_j) as column id
261  *    If nrows == 1 and d_ib == NULL, it means d_ib[0] = 0 */
262 HYPRE_Int
263 hypreDevice_CopyParCSRRows(HYPRE_Int      nrows,
264                            HYPRE_Int     *d_row_indices,
265                            HYPRE_Int      job,
266                            HYPRE_Int      has_offd,
267                            HYPRE_BigInt   first_col,
268                            HYPRE_BigInt  *d_col_map_offd_A,
269                            HYPRE_Int     *d_diag_i,
270                            HYPRE_Int     *d_diag_j,
271                            HYPRE_Complex *d_diag_a,
272                            HYPRE_Int     *d_offd_i,
273                            HYPRE_Int     *d_offd_j,
274                            HYPRE_Complex *d_offd_a,
275                            HYPRE_Int     *d_ib,
276                            HYPRE_BigInt  *d_jb,
277                            HYPRE_Complex *d_ab)
278 {
279    /* trivial case */
280    if (nrows <= 0)
281    {
282       return hypre_error_flag;
283    }
284 
285    hypre_assert(!(nrows > 1 && d_ib == NULL));
286 
287    const dim3 bDim = hypre_GetDefaultCUDABlockDimension();
288    const dim3 gDim = hypre_GetDefaultCUDAGridDimension(nrows, "warp", bDim);
289 
290    /*
291    if (job == 2)
292    {
293    }
294    */
295 
296    HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CopyParCSRRows, gDim, bDim,
297                       nrows, d_row_indices, has_offd, first_col, d_col_map_offd_A,
298                       d_diag_i, d_diag_j, d_diag_a,
299                       d_offd_i, d_offd_j, d_offd_a,
300                       d_ib, d_jb, d_ab );
301 
302    return hypre_error_flag;
303 }
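
/* A usage sketch (under "#if 0", not compiled): extracting a set of rows into
 * (d_ib, d_jb, d_ab) with global column indices, following the typical pattern
 * GetRowNnz -> exclusive scan -> CopyParCSRRows. The function name and arguments are
 * hypothetical; the diag/offd CSR arrays and col_map_offd are assumed to be on device. */
#if 0
void example_copy_rows(HYPRE_Int nrows, HYPRE_Int *d_rows, HYPRE_BigInt first_col,
                       HYPRE_BigInt *d_col_map_offd_A,
                       HYPRE_Int *d_diag_i, HYPRE_Int *d_diag_j, HYPRE_Complex *d_diag_a,
                       HYPRE_Int *d_offd_i, HYPRE_Int *d_offd_j, HYPRE_Complex *d_offd_a)
{
   /* row pointers of the extracted block B: row sizes first, then an exclusive scan
    * over nrows+1 entries (the initial value of the last entry does not affect the result) */
   HYPRE_Int *d_ib = hypre_TAlloc(HYPRE_Int, nrows + 1, HYPRE_MEMORY_DEVICE);
   hypreDevice_GetRowNnz(nrows, d_rows, d_diag_i, d_offd_i, d_ib);
   hypreDevice_IntegerExclusiveScan(nrows + 1, d_ib);

   /* total nnz of B is the last entry of d_ib */
   HYPRE_Int nnzB;
   hypre_TMemcpy(&nnzB, d_ib + nrows, HYPRE_Int, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);

   HYPRE_BigInt  *d_jb = hypre_TAlloc(HYPRE_BigInt,  nnzB, HYPRE_MEMORY_DEVICE);
   HYPRE_Complex *d_ab = hypre_TAlloc(HYPRE_Complex, nnzB, HYPRE_MEMORY_DEVICE);

   /* job is currently unused by the kernel; has_offd = 1 copies the offd part too */
   hypreDevice_CopyParCSRRows(nrows, d_rows, 1, 1, first_col, d_col_map_offd_A,
                              d_diag_i, d_diag_j, d_diag_a,
                              d_offd_i, d_offd_j, d_offd_a,
                              d_ib, d_jb, d_ab);

   /* ... use (d_ib, d_jb, d_ab), then hypre_TFree the three arrays ... */
}
#endif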
304 
305 HYPRE_Int
306 hypreDevice_IntegerReduceSum(HYPRE_Int n, HYPRE_Int *d_i)
307 {
308    return HYPRE_THRUST_CALL(reduce, d_i, d_i + n);
309 }
310 
311 HYPRE_Int
312 hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i)
313 {
314    HYPRE_THRUST_CALL(inclusive_scan, d_i, d_i + n, d_i);
315 
316    return hypre_error_flag;
317 }
318 
319 HYPRE_Int
320 hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i)
321 {
322    HYPRE_THRUST_CALL(exclusive_scan, d_i, d_i + n, d_i);
323 
324    return hypre_error_flag;
325 }
326 
327 HYPRE_Int
328 hypreDevice_Scalen(HYPRE_Complex *d_x, size_t n, HYPRE_Complex v)
329 {
330    HYPRE_THRUST_CALL( transform, d_x, d_x + n, d_x, v * _1 );
331 
332    return hypre_error_flag;
333 }
334 
335 HYPRE_Int
336 hypreDevice_Filln(HYPRE_Complex *d_x, size_t n, HYPRE_Complex v)
337 {
338    HYPRE_THRUST_CALL( fill_n, d_x, n, v);
339 
340    return hypre_error_flag;
341 }
342 
343 HYPRE_Int
344 hypreDevice_BigIntFilln(HYPRE_BigInt *d_x, size_t n, HYPRE_BigInt v)
345 {
346    HYPRE_THRUST_CALL( fill_n, d_x, n, v);
347 
348    return hypre_error_flag;
349 }
350 
351 struct hypre_empty_row_functor
352 {
353    // This is needed for clang
354    typedef bool result_type;
355 
356    __device__
357    bool operator()(const thrust::tuple<HYPRE_Int, HYPRE_Int>& t) const
358    {
359       const HYPRE_Int a = thrust::get<0>(t);
360       const HYPRE_Int b = thrust::get<1>(t);
361 
362       return a != b;
363    }
364 };
365 
366 HYPRE_Int*
367 hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr)
368 {
369    /* trivial case */
370    if (nrows <= 0 || nnz <= 0)
371    {
372       return NULL;
373    }
374 
375    HYPRE_Int *d_row_ind = hypre_TAlloc(HYPRE_Int, nnz, HYPRE_MEMORY_DEVICE);
376 
377    hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnz, d_row_ptr, d_row_ind);
378 
379    return d_row_ind;
380 }
381 
382 HYPRE_Int
383 hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, HYPRE_Int *d_row_ind)
384 {
385    /* trivial case */
386    if (nrows <= 0 || nnz <= 0)
387    {
388       return hypre_error_flag;
389    }
390 
391    HYPRE_THRUST_CALL( fill, d_row_ind, d_row_ind + nnz, 0 );
392 
393    HYPRE_THRUST_CALL( scatter_if,
394                       thrust::counting_iterator<HYPRE_Int>(0),
395                       thrust::counting_iterator<HYPRE_Int>(nrows),
396                       d_row_ptr,
397                       thrust::make_transform_iterator( thrust::make_zip_iterator(thrust::make_tuple(d_row_ptr, d_row_ptr+1)),
398                                                        hypre_empty_row_functor() ),
399                       d_row_ind );
400 
401    HYPRE_THRUST_CALL( inclusive_scan, d_row_ind, d_row_ind + nnz, d_row_ind, thrust::maximum<HYPRE_Int>());
402 
403    return hypre_error_flag;
404 }
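
/* Worked example of the expansion above (values chosen purely for illustration):
 *    nrows = 4, nnz = 6, d_row_ptr = [0, 2, 2, 5, 6]
 *    fill:          d_row_ind = [0, 0, 0, 0, 0, 0]
 *    scatter_if:    row r is written to position d_row_ptr[r] only if the row is
 *                   non-empty (d_row_ptr[r] != d_row_ptr[r+1]), giving
 *                   d_row_ind = [0, 0, 2, 0, 0, 3]      (empty row 1 is skipped)
 *    inclusive_scan with maximum():
 *                   d_row_ind = [0, 0, 2, 2, 2, 3]
 * which is the row index (COO-style) of each nonzero. */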
405 
406 /* Input: d_row_num, of size nrows, contains the row indices, which can be BigInt or Int
407  * Output: d_row_ind */
408 template <typename T>
409 HYPRE_Int
410 hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, T *d_row_num, T *d_row_ind)
411 {
412    /* trivial case */
413    if (nrows <= 0)
414    {
415       return hypre_error_flag;
416    }
417 
418    HYPRE_Int *map = hypre_TAlloc(HYPRE_Int, nnz, HYPRE_MEMORY_DEVICE);
419 
420    hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnz, d_row_ptr, map);
421 
422    HYPRE_THRUST_CALL(gather, map, map + nnz, d_row_num, d_row_ind);
423 
424    hypre_TFree(map, HYPRE_MEMORY_DEVICE);
425 
426    return hypre_error_flag;
427 }
428 
429 template HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, HYPRE_Int *d_row_num, HYPRE_Int *d_row_ind);
430 #if defined(HYPRE_MIXEDINT)
431 template HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, HYPRE_BigInt *d_row_num, HYPRE_BigInt *d_row_ind);
432 #endif
433 
434 HYPRE_Int*
435 hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind)
436 {
437    HYPRE_Int *d_row_ptr = hypre_TAlloc(HYPRE_Int, nrows+1, HYPRE_MEMORY_DEVICE);
438 
439    HYPRE_THRUST_CALL( lower_bound,
440                       d_row_ind, d_row_ind + nnz,
441                       thrust::counting_iterator<HYPRE_Int>(0),
442                       thrust::counting_iterator<HYPRE_Int>(nrows+1),
443                       d_row_ptr);
444 
445    return d_row_ptr;
446 }
447 
448 HYPRE_Int
449 hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, HYPRE_Int *d_row_ptr)
450 {
451    HYPRE_THRUST_CALL( lower_bound,
452                       d_row_ind, d_row_ind + nnz,
453                       thrust::counting_iterator<HYPRE_Int>(0),
454                       thrust::counting_iterator<HYPRE_Int>(nrows+1),
455                       d_row_ptr);
456 
457    return hypre_error_flag;
458 }
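
/* Worked example of the inverse conversion above (illustrative values; d_row_ind must
 * be sorted):
 *    nrows = 4, nnz = 6, d_row_ind = [0, 0, 2, 2, 2, 3]
 *    lower_bound of the values 0, 1, 2, 3, 4 in d_row_ind gives
 *    d_row_ptr = [0, 2, 2, 5, 6]
 * i.e., d_row_ptr[r] is the position of the first nonzero whose row index is >= r,
 * which recovers the CSR row pointer, including empty rows. */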
459 
460 __global__ void
461 hypreCUDAKernel_ScatterAddTrivial(HYPRE_Int n, HYPRE_Real *x, HYPRE_Int *map, HYPRE_Real *y)
462 {
463    for (HYPRE_Int i = 0; i < n; i++)
464    {
465       x[map[i]] += y[i];
466    }
467 }
468 
469 /* x[map[i]] += y[i]; the same index cannot appear more than once in map */
470 __global__ void
471 hypreCUDAKernel_ScatterAdd(HYPRE_Int n, HYPRE_Real *x, HYPRE_Int *map, HYPRE_Real *y)
472 {
473    HYPRE_Int global_thread_id = hypre_cuda_get_grid_thread_id<1,1>();
474 
475    if (global_thread_id < n)
476    {
477       x[map[global_thread_id]] += y[global_thread_id];
478    }
479 }
480 
481 /* Generalized Scatter-and-Add
482  * for i = 0 : ny-1, x[map[i]] += y[i];
483  * Note: An index is allowed to appear more than once in map.
484  *       The contents of y will be destroyed.
485  *       When work != NULL, it must be at least of size [2*sizeof(HYPRE_Int)+sizeof(HYPRE_Complex)]*ny
486  */
487 HYPRE_Int
488 hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Real *y, char *work)
489 {
490    if (ny <= 0)
491    {
492       return hypre_error_flag;
493    }
494 
495    if (ny <= 2)
496    {
497       /* trivial cases: ny = 1, 2 */
498       dim3 bDim = 1;
499       dim3 gDim = 1;
500       HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ScatterAddTrivial, gDim, bDim, ny, x, map, y );
501    }
502    else
503    {
504       /* general cases */
505       HYPRE_Int *map2, *reduced_map, reduced_n;
506       HYPRE_Real *reduced_y;
507 
508       if (work)
509       {
510          map2 = (HYPRE_Int *) work;
511          reduced_map = map2 + ny;
512          reduced_y = (HYPRE_Real *) (reduced_map + ny);
513       }
514       else
515       {
516          map2        = hypre_TAlloc(HYPRE_Int,  ny, HYPRE_MEMORY_DEVICE);
517          reduced_map = hypre_TAlloc(HYPRE_Int,  ny, HYPRE_MEMORY_DEVICE);
518          reduced_y   = hypre_TAlloc(HYPRE_Real, ny, HYPRE_MEMORY_DEVICE);
519       }
520 
521       hypre_TMemcpy(map2, map, HYPRE_Int, ny, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE);
522 
523       HYPRE_THRUST_CALL(sort_by_key, map2, map2 + ny, y);
524 
525       thrust::pair<HYPRE_Int*, HYPRE_Real*> new_end = HYPRE_THRUST_CALL( reduce_by_key,
526                                                                          map2,
527                                                                          map2 + ny,
528                                                                          y,
529                                                                          reduced_map,
530                                                                          reduced_y );
531 
532       reduced_n = new_end.first - reduced_map;
533 
534       hypre_assert(reduced_n == new_end.second - reduced_y);
535 
536       dim3 bDim = hypre_GetDefaultCUDABlockDimension();
537       dim3 gDim = hypre_GetDefaultCUDAGridDimension(reduced_n, "thread", bDim);
538 
539       HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ScatterAdd, gDim, bDim,
540                          reduced_n, x, reduced_map, reduced_y );
541 
542       if (!work)
543       {
544          hypre_TFree(map2, HYPRE_MEMORY_DEVICE);
545          hypre_TFree(reduced_map, HYPRE_MEMORY_DEVICE);
546          hypre_TFree(reduced_y, HYPRE_MEMORY_DEVICE);
547       }
548    }
549 
550    return hypre_error_flag;
551 }
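
/* Worked example of the general path above (illustrative values):
 *    ny = 5, map = [3, 0, 3, 1, 0], y = [a, b, c, d, e]
 *    sort_by_key (key = the copy of map in map2):
 *        map2 = [0, 0, 1, 3, 3],  y = [b, e, d, a, c]
 *    reduce_by_key:
 *        reduced_map = [0, 1, 3], reduced_y = [b+e, d, a+c], reduced_n = 3
 *    ScatterAdd kernel (each target index now appears exactly once):
 *        x[0] += b+e;   x[1] += d;   x[3] += a+c;
 * The sort and reduction run in the scratch space (work, or freshly allocated arrays),
 * which is why the contents of y are destroyed. */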
552 
553 /* x[map[i]] = v */
554 template <typename T>
555 __global__ void
556 hypreCUDAKernel_ScatterConstant(T *x, HYPRE_Int n, HYPRE_Int *map, T v)
557 {
558    HYPRE_Int global_thread_id = hypre_cuda_get_grid_thread_id<1,1>();
559 
560    if (global_thread_id < n)
561    {
562       x[map[global_thread_id]] = v;
563    }
564 }
565 
566 /* x[map[i]] = v
567  * n is length of map
568  * TODO: thrust? */
569 template <typename T>
570 HYPRE_Int
571 hypreDevice_ScatterConstant(T *x, HYPRE_Int n, HYPRE_Int *map, T v)
572 {
573    /* trivial case */
574    if (n <= 0)
575    {
576       return hypre_error_flag;
577    }
578 
579    dim3 bDim = hypre_GetDefaultCUDABlockDimension();
580    dim3 gDim = hypre_GetDefaultCUDAGridDimension(n, "thread", bDim);
581 
582    HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ScatterConstant, gDim, bDim, x, n, map, v );
583 
584    return hypre_error_flag;
585 }
586 
587 template HYPRE_Int hypreDevice_ScatterConstant(HYPRE_Int     *x, HYPRE_Int n, HYPRE_Int *map, HYPRE_Int     v);
588 template HYPRE_Int hypreDevice_ScatterConstant(HYPRE_Complex *x, HYPRE_Int n, HYPRE_Int *map, HYPRE_Complex v);
589 
590 __global__ void
591 hypreCUDAKernel_IVAXPY(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Complex *y)
592 {
593    HYPRE_Int i = hypre_cuda_get_grid_thread_id<1,1>();
594 
595    if (i < n)
596    {
597       y[i] += x[i] / a[i];
598    }
599 }
600 
601 /* Inverse Vector AXPY: y[i] = x[i] / a[i] + y[i] */
602 HYPRE_Int
603 hypreDevice_IVAXPY(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Complex *y)
604 {
605    /* trivial case */
606    if (n <= 0)
607    {
608       return hypre_error_flag;
609    }
610 
611    dim3 bDim = hypre_GetDefaultCUDABlockDimension();
612    dim3 gDim = hypre_GetDefaultCUDAGridDimension(n, "thread", bDim);
613 
614    HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IVAXPY, gDim, bDim, n, a, x, y );
615 
616    return hypre_error_flag;
617 }
618 
619 __global__ void
620 hypreCUDAKernel_IVAXPYMarked(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Complex *y, HYPRE_Int *marker, HYPRE_Int marker_val)
621 {
622    HYPRE_Int i = hypre_cuda_get_grid_thread_id<1,1>();
623 
624    if (i < n)
625    {
626       if (marker[i] == marker_val)
627       {
628          y[i] += x[i] / a[i];
629       }
630    }
631 }
632 
633 /* Inverse Vector AXPY: y[i] = x[i] / a[i] + y[i] */
634 HYPRE_Int
635 hypreDevice_IVAXPYMarked(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Complex *y, HYPRE_Int *marker, HYPRE_Int marker_val)
636 {
637    /* trivial case */
638    if (n <= 0)
639    {
640       return hypre_error_flag;
641    }
642 
643    dim3 bDim = hypre_GetDefaultCUDABlockDimension();
644    dim3 gDim = hypre_GetDefaultCUDAGridDimension(n, "thread", bDim);
645 
646    HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IVAXPYMarked, gDim, bDim, n, a, x, y, marker, marker_val );
647 
648    return hypre_error_flag;
649 }
650 
651 __global__ void
652 hypreCUDAKernel_DiagScaleVector(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, HYPRE_Complex *x, HYPRE_Complex beta, HYPRE_Complex *y)
653 {
654    HYPRE_Int i = hypre_cuda_get_grid_thread_id<1,1>();
655 
656    if (i < n)
657    {
658       if (beta != 0.0)
659       {
660          y[i] = x[i] / A_data[A_i[i]] + beta * y[i];
661       }
662       else
663       {
664          y[i] = x[i] / A_data[A_i[i]];
665       }
666    }
667 }
668 
669 /* y = diag(A) \ x + beta y
670  * Note: Assume A_i[i] points to the ith diagonal entry of A */
671 HYPRE_Int
672 hypreDevice_DiagScaleVector(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, HYPRE_Complex *x, HYPRE_Complex beta, HYPRE_Complex *y)
673 {
674    /* trivial case */
675    if (n <= 0)
676    {
677       return hypre_error_flag;
678    }
679 
680    dim3 bDim = hypre_GetDefaultCUDABlockDimension();
681    dim3 gDim = hypre_GetDefaultCUDAGridDimension(n, "thread", bDim);
682 
683    HYPRE_CUDA_LAUNCH( hypreCUDAKernel_DiagScaleVector, gDim, bDim, n, A_i, A_data, x, beta, y );
684 
685    return hypre_error_flag;
686 }
687 
688 __global__ void
689 hypreCUDAKernel_DiagScaleVector2(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, HYPRE_Complex *x, HYPRE_Complex beta, HYPRE_Complex *y, HYPRE_Complex *z)
690 {
691    HYPRE_Int i = hypre_cuda_get_grid_thread_id<1,1>();
692 
693    if (i < n)
694    {
695       HYPRE_Complex t = x[i] / A_data[A_i[i]];
696       y[i] = t;
697       z[i] += beta*t;
698    }
699 }
700 
701 /* y = diag(A) \ x
702  * z = beta * (diag(A) \ x) + z
703  * Note: Assume A_i[i] points to the ith diagonal entry of A */
704 HYPRE_Int
705 hypreDevice_DiagScaleVector2(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, HYPRE_Complex *x, HYPRE_Complex beta, HYPRE_Complex *y, HYPRE_Complex *z)
706 {
707    /* trivial case */
708    if (n <= 0)
709    {
710       return hypre_error_flag;
711    }
712 
713    dim3 bDim = hypre_GetDefaultCUDABlockDimension();
714    dim3 gDim = hypre_GetDefaultCUDAGridDimension(n, "thread", bDim);
715 
716    HYPRE_CUDA_LAUNCH( hypreCUDAKernel_DiagScaleVector2, gDim, bDim, n, A_i, A_data, x, beta, y, z );
717 
718    return hypre_error_flag;
719 }
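
/* A usage sketch (under "#if 0", not compiled): a damped-Jacobi-style update
 * y <- y + w * D^{-1} r built on the helper above, assuming A_i[i] points at the
 * i-th diagonal entry as required. The function and variable names are hypothetical. */
#if 0
void example_damped_jacobi_update(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data,
                                  HYPRE_Complex *d_r, HYPRE_Complex w,
                                  HYPRE_Complex *d_tmp, HYPRE_Complex *d_y)
{
   /* d_tmp = D^{-1} r   and   d_y += w * d_tmp, in one kernel */
   hypreDevice_DiagScaleVector2(n, A_i, A_data, d_r, w, d_tmp, d_y);
}
#endif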
720 
721 __global__ void
722 hypreCUDAKernel_BigToSmallCopy(      HYPRE_Int*    __restrict__ tgt,
723                                const HYPRE_BigInt* __restrict__ src,
724                                      HYPRE_Int                  size)
725 {
726    HYPRE_Int i = hypre_cuda_get_grid_thread_id<1,1>();
727 
728    if (i < size)
729    {
730       tgt[i] = src[i];
731    }
732 }
733 
734 HYPRE_Int
735 hypreDevice_BigToSmallCopy(HYPRE_Int *tgt, const HYPRE_BigInt *src, HYPRE_Int size)
736 {
737    dim3 bDim = hypre_GetDefaultCUDABlockDimension();
738    dim3 gDim = hypre_GetDefaultCUDAGridDimension(size, "thread", bDim);
739 
740    HYPRE_CUDA_LAUNCH( hypreCUDAKernel_BigToSmallCopy, gDim, bDim, tgt, src, size);
741 
742    return hypre_error_flag;
743 }
744 
745 
746 /* https://github.com/OrangeOwlSolutions/Thrust/blob/master/Sort_by_key_with_tuple_key.cu */
747 /* opt: 0, (a,b) < (a',b') iff a < a' or (a = a' and  b  <  b')  [normal tuple comparison]
748  *      1, (a,b) < (a',b') iff a < a' or (a = a' and |b| > |b'|) [used in dropping small entries]
749  *      2, (a,b) < (a',b') iff a < a' or (a = a' and (b == a or b < b') and b' != a') [used in putting diagonal first]
750  */
751 template <typename T1, typename T2, typename T3>
752 HYPRE_Int
753 hypreDevice_StableSortByTupleKey(HYPRE_Int N, T1 *keys1, T2 *keys2, T3 *vals, HYPRE_Int opt)
754 {
755    auto begin_keys = thrust::make_zip_iterator(thrust::make_tuple(keys1,     keys2));
756    auto end_keys   = thrust::make_zip_iterator(thrust::make_tuple(keys1 + N, keys2 + N));
757 
758    if (opt == 0)
759    {
760       HYPRE_THRUST_CALL(stable_sort_by_key, begin_keys, end_keys, vals, thrust::less< thrust::tuple<T1, T2> >());
761    }
762    else if (opt == 1)
763    {
764       HYPRE_THRUST_CALL(stable_sort_by_key, begin_keys, end_keys, vals, TupleComp2<T1,T2>());
765    }
766    else if (opt == 2)
767    {
768       HYPRE_THRUST_CALL(stable_sort_by_key, begin_keys, end_keys, vals, TupleComp3<T1,T2>());
769    }
770 
771    return hypre_error_flag;
772 }
773 
774 template HYPRE_Int hypreDevice_StableSortByTupleKey(HYPRE_Int N, HYPRE_Int *keys1, HYPRE_Int  *keys2, HYPRE_Int     *vals, HYPRE_Int opt);
775 template HYPRE_Int hypreDevice_StableSortByTupleKey(HYPRE_Int N, HYPRE_Int *keys1, HYPRE_Real *keys2, HYPRE_Int     *vals, HYPRE_Int opt);
776 template HYPRE_Int hypreDevice_StableSortByTupleKey(HYPRE_Int N, HYPRE_Int *keys1, HYPRE_Int  *keys2, HYPRE_Complex *vals, HYPRE_Int opt);
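
/* Worked example of opt == 2 (diagonal first) for the instantiations above
 * (illustrative values); keys are (row, col) pairs of CSR-style triplets:
 *    keys1 = [1, 1, 1, 0, 0], keys2 = [2, 1, 0, 1, 0], vals = [a, b, c, d, e]
 * stable_sort_by_key with TupleComp3 orders by row first and, within a row, puts the
 * entry whose column equals the row (the diagonal) first, then the rest by column:
 *    keys1 = [0, 0, 1, 1, 1], keys2 = [0, 1, 1, 0, 2], vals = [e, d, b, c, a]
 */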
777 
778 /* opt:
779  *      0, (a,b) < (a',b') iff a < a' or (a = a' and  b  <  b')                       [normal tuple comparison]
780  *      2, (a,b) < (a',b') iff a < a' or (a = a' and (b == a or b < b') and b' != a') [used in assembly to put diagonal first]
781  */
782 template <typename T1, typename T2, typename T3, typename T4>
783 HYPRE_Int
784 hypreDevice_StableSortTupleByTupleKey(HYPRE_Int N, T1 *keys1, T2 *keys2, T3 *vals1, T4 *vals2, HYPRE_Int opt)
785 {
786    auto begin_keys = thrust::make_zip_iterator(thrust::make_tuple(keys1,     keys2));
787    auto end_keys   = thrust::make_zip_iterator(thrust::make_tuple(keys1 + N, keys2 + N));
788    auto begin_vals = thrust::make_zip_iterator(thrust::make_tuple(vals1,     vals2));
789 
790    if (opt == 0)
791    {
792       HYPRE_THRUST_CALL(stable_sort_by_key, begin_keys, end_keys, begin_vals, thrust::less< thrust::tuple<T1, T2> >());
793    }
794    else if (opt == 2)
795    {
796       HYPRE_THRUST_CALL(stable_sort_by_key, begin_keys, end_keys, begin_vals, TupleComp3<T1,T2>());
797    }
798 
799    return hypre_error_flag;
800 }
801 
802 template HYPRE_Int hypreDevice_StableSortTupleByTupleKey(HYPRE_Int N, HYPRE_Int *keys1, HYPRE_Int *keys2, char *vals1, HYPRE_Complex *vals2, HYPRE_Int opt);
803 #if defined(HYPRE_MIXEDINT)
804 template HYPRE_Int hypreDevice_StableSortTupleByTupleKey(HYPRE_Int N, HYPRE_BigInt *keys1, HYPRE_BigInt *keys2, char *vals1, HYPRE_Complex *vals2, HYPRE_Int opt);
805 #endif
806 
807 template <typename T1, typename T2, typename T3>
808 HYPRE_Int
809 hypreDevice_ReduceByTupleKey(HYPRE_Int N, T1 *keys1_in,  T2 *keys2_in,  T3 *vals_in,
810                                           T1 *keys1_out, T2 *keys2_out, T3 *vals_out)
811 {
812    auto begin_keys_in  = thrust::make_zip_iterator(thrust::make_tuple(keys1_in,     keys2_in    ));
813    auto end_keys_in    = thrust::make_zip_iterator(thrust::make_tuple(keys1_in + N, keys2_in + N));
814    auto begin_keys_out = thrust::make_zip_iterator(thrust::make_tuple(keys1_out,    keys2_out   ));
815    thrust::equal_to< thrust::tuple<T1, T2> > pred;
816    thrust::plus<T3> func;
817 
818    auto new_end = HYPRE_THRUST_CALL(reduce_by_key, begin_keys_in, end_keys_in, vals_in, begin_keys_out, vals_out, pred, func);
819 
820    return new_end.second - vals_out;
821 }
822 
823 template HYPRE_Int hypreDevice_ReduceByTupleKey(HYPRE_Int N, HYPRE_Int *keys1_in, HYPRE_Int *keys2_in, HYPRE_Complex *vals_in, HYPRE_Int *keys1_out, HYPRE_Int *keys2_out, HYPRE_Complex *vals_out);
824 
825 #endif // #if defined(HYPRE_USING_CUDA)  || defined(HYPRE_USING_HIP)
826 
827 #if defined(HYPRE_USING_CUSPARSE)
828 /*
829  * @brief Determines the associated CudaDataType for the HYPRE_Complex typedef
830  * @return Returns the cuda data type corresponding to HYPRE_Complex
831  *
832  * @todo Should be known compile time
833  * @todo Support more sizes
834  * @todo Support complex
835  * @warning Only works for Single and Double precision
836  * @note Perhaps some typedefs should be added where HYPRE_Complex is typedef'd
837  */
838 cudaDataType
839 hypre_HYPREComplexToCudaDataType()
840 {
841    /*
842    if (sizeof(char)*CHAR_BIT != 8)
843    {
844       hypre_error_w_msg(HYPRE_ERROR_GENERIC, "ERROR:  Unsupported char size");
845       hypre_assert(false);
846    }
847    */
848 #if defined(HYPRE_COMPLEX)
849    return CUDA_C_64F;
850 #else
851 #if defined(HYPRE_SINGLE)
852    hypre_assert(sizeof(HYPRE_Complex) == 4);
853    return CUDA_R_32F;
854 #elif defined(HYPRE_LONG_DOUBLE)
855 #error "Long Double is not supported on GPUs"
856 #else
857    hypre_assert(sizeof(HYPRE_Complex) == 8);
858    return CUDA_R_64F;
859 #endif
860 #endif // #if defined(HYPRE_COMPLEX)
861 }
862 
863 /*
864  * @brief Determines the associated cusparseIndexType_t for the HYPRE_Int typedef
865  */
866 cusparseIndexType_t
867 hypre_HYPREIntToCusparseIndexType()
868 {
869    /*
870    if(sizeof(char)*CHAR_BIT!=8)
871    {
872       hypre_error_w_msg(HYPRE_ERROR_GENERIC, "ERROR:  Unsupported char size");
873       hypre_assert(false);
874    }
875    */
876 
877 #if defined(HYPRE_BIGINT)
878    hypre_assert(sizeof(HYPRE_Int) == 8);
879    return CUSPARSE_INDEX_64I;
880 #else
881    hypre_assert(sizeof(HYPRE_Int) == 4);
882    return CUSPARSE_INDEX_32I;
883 #endif
884 }
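
/* A usage sketch (under "#if 0", not compiled): the two helpers above typically supply
 * the type arguments of the cuSPARSE generic API, e.g. when wrapping a device CSR
 * matrix in a cusparseSpMatDescr_t. This assumes the generic SpMat API is available
 * (CUDA 10.1+); the function and variable names are hypothetical. */
#if 0
void example_create_csr_descr(HYPRE_Int nrows, HYPRE_Int ncols, HYPRE_Int nnz,
                              HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Complex *d_a,
                              cusparseSpMatDescr_t *matA_p)
{
   HYPRE_CUSPARSE_CALL( cusparseCreateCsr(matA_p, nrows, ncols, nnz,
                                          d_ia, d_ja, d_a,
                                          hypre_HYPREIntToCusparseIndexType(),   /* row offsets */
                                          hypre_HYPREIntToCusparseIndexType(),   /* column indices */
                                          CUSPARSE_INDEX_BASE_ZERO,
                                          hypre_HYPREComplexToCudaDataType()) ); /* values */
}
#endif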
885 #endif // #if defined(HYPRE_USING_CUSPARSE)
886 
887 #if defined(HYPRE_USING_GPU)
888 
889 #if defined(HYPRE_USING_DEVICE_OPENMP)
890 cudaStream_t
891 #elif defined(HYPRE_USING_CUDA)
892 cudaStream_t
893 #elif defined(HYPRE_USING_HIP)
894 hipStream_t
895 #endif
896 hypre_CudaDataCudaStream(hypre_CudaData *data, HYPRE_Int i)
897 {
898 #if defined(HYPRE_USING_DEVICE_OPENMP)
899    cudaStream_t stream = 0;
900 #elif defined(HYPRE_USING_CUDA)
901    cudaStream_t stream = 0;
902 #elif defined(HYPRE_USING_HIP)
903    hipStream_t stream = 0;
904 #endif
905 
906 #if defined(HYPRE_USING_CUDA_STREAMS)
907    if (i >= HYPRE_MAX_NUM_STREAMS)
908    {
909       /* return the default stream, i.e., the NULL stream */
910       /*
911       hypre_printf("CUDA stream %d exceeds the max number %d\n",
912                    i, HYPRE_MAX_NUM_STREAMS);
913       */
914       return NULL;
915    }
916 
917    if (data->cuda_streams[i])
918    {
919       return data->cuda_streams[i];
920    }
921 
922 #if defined(HYPRE_USING_DEVICE_OPENMP)
923    HYPRE_CUDA_CALL(cudaStreamCreateWithFlags(&stream, cudaStreamDefault));
924 #elif defined(HYPRE_USING_CUDA)
925    //HYPRE_CUDA_CALL(cudaStreamCreateWithFlags(&stream,cudaStreamNonBlocking));
926    HYPRE_CUDA_CALL(cudaStreamCreateWithFlags(&stream, cudaStreamDefault));
927 #elif defined(HYPRE_USING_HIP)
928    HYPRE_HIP_CALL(hipStreamCreateWithFlags(&stream, hipStreamDefault));
929 #endif
930 
931    data->cuda_streams[i] = stream;
932 #endif
933 
934    return stream;
935 }
936 
937 #if defined(HYPRE_USING_DEVICE_OPENMP)
938 cudaStream_t
939 #elif defined(HYPRE_USING_CUDA)
940 cudaStream_t
941 #elif defined(HYPRE_USING_HIP)
942 hipStream_t
943 #endif
944 hypre_CudaDataCudaComputeStream(hypre_CudaData *data)
945 {
946    return hypre_CudaDataCudaStream(data,
947                                    hypre_CudaDataCudaComputeStreamNum(data));
948 }
949 
950 #if defined(HYPRE_USING_CURAND)
951 curandGenerator_t
952 hypre_CudaDataCurandGenerator(hypre_CudaData *data)
953 {
954    if (data->curand_generator)
955    {
956       return data->curand_generator;
957    }
958 
959    curandGenerator_t gen;
960    HYPRE_CURAND_CALL( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) );
961    HYPRE_CURAND_CALL( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) );
962    HYPRE_CURAND_CALL( curandSetStream(gen, hypre_CudaDataCudaComputeStream(data)) );
963 
964    data->curand_generator = gen;
965 
966    return gen;
967 }
968 
969 /* T = float or hypre_double */
970 template <typename T>
971 HYPRE_Int
972 hypre_CurandUniform_core( HYPRE_Int          n,
973                           T                 *urand,
974                           HYPRE_Int          set_seed,
975                           hypre_ulonglongint seed,
976                           HYPRE_Int          set_offset,
977                           hypre_ulonglongint offset)
978 {
979    curandGenerator_t gen = hypre_HandleCurandGenerator(hypre_handle());
980 
981    if (set_seed)
982    {
983       HYPRE_CURAND_CALL( curandSetPseudoRandomGeneratorSeed(gen, seed) );
984    }
985 
986    if (set_offset)
987    {
988       HYPRE_CURAND_CALL( curandSetGeneratorOffset(gen, offset) );
989    }
990 
991    if (sizeof(T) == sizeof(hypre_double))
992    {
993       HYPRE_CURAND_CALL( curandGenerateUniformDouble(gen, (hypre_double *) urand, n) );
994    }
995    else if (sizeof(T) == sizeof(float))
996    {
997       HYPRE_CURAND_CALL( curandGenerateUniform(gen, (float *) urand, n) );
998    }
999 
1000    return hypre_error_flag;
1001 }
1002 #endif /* #if defined(HYPRE_USING_CURAND) */
1003 
1004 #if defined(HYPRE_USING_ROCRAND)
1005 rocrand_generator
1006 hypre_CudaDataCurandGenerator(hypre_CudaData *data)
1007 {
1008    if (data->curand_generator)
1009    {
1010       return data->curand_generator;
1011    }
1012 
1013    rocrand_generator gen;
1014    HYPRE_ROCRAND_CALL( rocrand_create_generator(&gen, ROCRAND_RNG_PSEUDO_DEFAULT) );
1015    HYPRE_ROCRAND_CALL( rocrand_set_seed(gen, 1234ULL) );
1016    HYPRE_ROCRAND_CALL( rocrand_set_stream(gen, hypre_CudaDataCudaComputeStream(data)) );
1017 
1018    data->curand_generator = gen;
1019 
1020    return gen;
1021 }
1022 
1023 template <typename T>
1024 HYPRE_Int
1025 hypre_CurandUniform_core( HYPRE_Int          n,
1026                           T                 *urand,
1027                           HYPRE_Int          set_seed,
1028                           hypre_ulonglongint seed,
1029                           HYPRE_Int          set_offset,
1030                           hypre_ulonglongint offset)
1031 {
1032    hypre_GpuProfilingPushRange("hypre_CurandUniform_core");
1033 
1034    rocrand_generator gen = hypre_HandleCurandGenerator(hypre_handle());
1035 
1036    if (set_seed)
1037    {
1038       HYPRE_ROCRAND_CALL( rocrand_set_seed(gen, seed) );
1039    }
1040 
1041    if (set_offset)
1042    {
1043       HYPRE_ROCRAND_CALL( rocrand_set_offset(gen, offset) );
1044    }
1045 
1046    if (sizeof(T) == sizeof(hypre_double))
1047    {
1048       HYPRE_ROCRAND_CALL( rocrand_generate_uniform_double(gen, (hypre_double *) urand, n) );
1049    }
1050    else if (sizeof(T) == sizeof(float))
1051    {
1052       HYPRE_ROCRAND_CALL( rocrand_generate_uniform(gen, (float *) urand, n) );
1053    }
1054 
1055    hypre_GpuProfilingPopRange();
1056 
1057    return hypre_error_flag;
1058 }
1059 #endif /* #if defined(HYPRE_USING_ROCRAND) */
1060 
1061 #if defined(HYPRE_USING_CURAND) || defined(HYPRE_USING_ROCRAND)
1062 
1063 HYPRE_Int
1064 hypre_CurandUniform( HYPRE_Int          n,
1065                      HYPRE_Real        *urand,
1066                      HYPRE_Int          set_seed,
1067                      hypre_ulonglongint seed,
1068                      HYPRE_Int          set_offset,
1069                      hypre_ulonglongint offset)
1070 {
1071    return hypre_CurandUniform_core(n, urand, set_seed, seed, set_offset, offset);
1072 }
1073 
1074 HYPRE_Int
1075 hypre_CurandUniformSingle( HYPRE_Int          n,
1076                            float             *urand,
1077                            HYPRE_Int          set_seed,
1078                            hypre_ulonglongint seed,
1079                            HYPRE_Int          set_offset,
1080                            hypre_ulonglongint offset)
1081 {
1082    return hypre_CurandUniform_core(n, urand, set_seed, seed, set_offset, offset);
1083 }
1084 
1085 #endif /* #if defined(HYPRE_USING_CURAND) || defined(HYPRE_USING_ROCRAND) */
1086 
1087 #if defined(HYPRE_USING_CUBLAS)
1088 cublasHandle_t
1089 hypre_CudaDataCublasHandle(hypre_CudaData *data)
1090 {
1091    if (data->cublas_handle)
1092    {
1093       return data->cublas_handle;
1094    }
1095 
1096    cublasHandle_t handle;
1097    HYPRE_CUBLAS_CALL( cublasCreate(&handle) );
1098 
1099    HYPRE_CUBLAS_CALL( cublasSetStream(handle, hypre_CudaDataCudaComputeStream(data)) );
1100 
1101    data->cublas_handle = handle;
1102 
1103    return handle;
1104 }
1105 #endif
1106 
1107 #if defined(HYPRE_USING_CUSPARSE)
1108 cusparseHandle_t
1109 hypre_CudaDataCusparseHandle(hypre_CudaData *data)
1110 {
1111    if (data->cusparse_handle)
1112    {
1113       return data->cusparse_handle;
1114    }
1115 
1116    cusparseHandle_t handle;
1117    HYPRE_CUSPARSE_CALL( cusparseCreate(&handle) );
1118 
1119    HYPRE_CUSPARSE_CALL( cusparseSetStream(handle, hypre_CudaDataCudaComputeStream(data)) );
1120 
1121    data->cusparse_handle = handle;
1122 
1123    return handle;
1124 }
1125 #endif // defined(HYPRE_USING_CUSPARSE)
1126 
1127 
1128 #if defined(HYPRE_USING_ROCSPARSE)
1129 rocsparse_handle
1130 hypre_CudaDataCusparseHandle(hypre_CudaData *data)
1131 {
1132    if (data->cusparse_handle)
1133    {
1134       return data->cusparse_handle;
1135    }
1136 
1137    rocsparse_handle handle;
1138    HYPRE_ROCSPARSE_CALL( rocsparse_create_handle(&handle) );
1139 
1140    HYPRE_ROCSPARSE_CALL( rocsparse_set_stream(handle, hypre_CudaDataCudaComputeStream(data)) );
1141 
1142    data->cusparse_handle = handle;
1143 
1144    return handle;
1145 }
1146 #endif // defined(HYPRE_USING_ROCSPARSE)
1147 
1148 
1149 
1150 hypre_CudaData*
1151 hypre_CudaDataCreate()
1152 {
1153    hypre_CudaData *data = hypre_CTAlloc(hypre_CudaData, 1, HYPRE_MEMORY_HOST);
1154 
1155    hypre_CudaDataCudaDevice(data)            = 0;
1156    hypre_CudaDataCudaComputeStreamNum(data)  = 0;
1157 
1158    /* SpGeMM */
1159 #if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE)
1160    hypre_CudaDataSpgemmUseCusparse(data) = 1;
1161 #else
1162    hypre_CudaDataSpgemmUseCusparse(data) = 0;
1163 #endif
1164 
1165    hypre_CudaDataSpgemmAlgorithm(data)                = 1;
1166    /* 1: naive overestimate, 2: naive underestimate, 3: Cohen's algorithm */
1167    hypre_CudaDataSpgemmRownnzEstimateMethod(data)     = 3;
1168    hypre_CudaDataSpgemmRownnzEstimateNsamples(data)   = 32;
1169    hypre_CudaDataSpgemmRownnzEstimateMultFactor(data) = 1.5;
1170    hypre_CudaDataSpgemmHashType(data)                 = 'D';
1171 
1172    /* pmis */
1173 #if defined(HYPRE_USING_CURAND) || defined(HYPRE_USING_ROCRAND)
1174    hypre_CudaDataUseGpuRand(data) = 1;
1175 #else
1176    hypre_CudaDataUseGpuRand(data) = 0;
1177 #endif
1178 
1179    /* device pool */
1180 #ifdef HYPRE_USING_DEVICE_POOL
1181    hypre_CudaDataCubBinGrowth(data)      = 8u;
1182    hypre_CudaDataCubMinBin(data)         = 1u;
1183    hypre_CudaDataCubMaxBin(data)         = (hypre_uint) -1;
1184    hypre_CudaDataCubMaxCachedBytes(data) = (size_t) -1;
1185    hypre_CudaDataCubDevAllocator(data)   = NULL;
1186    hypre_CudaDataCubUvmAllocator(data)   = NULL;
1187 #endif
1188 
1189    return data;
1190 }
1191 
1192 void
1193 hypre_CudaDataDestroy(hypre_CudaData *data)
1194 {
1195    if (!data)
1196    {
1197       return;
1198    }
1199 
1200    hypre_TFree(hypre_CudaDataCudaReduceBuffer(data),     HYPRE_MEMORY_DEVICE);
1201    hypre_TFree(hypre_CudaDataStructCommRecvBuffer(data), HYPRE_MEMORY_DEVICE);
1202    hypre_TFree(hypre_CudaDataStructCommSendBuffer(data), HYPRE_MEMORY_DEVICE);
1203 
1204 #if defined(HYPRE_USING_CURAND)
1205    if (data->curand_generator)
1206    {
1207       HYPRE_CURAND_CALL( curandDestroyGenerator(data->curand_generator) );
1208    }
1209 #endif
1210 
1211 #if defined(HYPRE_USING_ROCRAND)
1212    if (data->curand_generator)
1213    {
1214       HYPRE_ROCRAND_CALL( rocrand_destroy_generator(data->curand_generator) );
1215    }
1216 #endif
1217 
1218 #if defined(HYPRE_USING_CUBLAS)
1219    if (data->cublas_handle)
1220    {
1221       HYPRE_CUBLAS_CALL( cublasDestroy(data->cublas_handle) );
1222    }
1223 #endif
1224 
1225 #if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE)
1226    if (data->cusparse_handle)
1227    {
1228 #if defined(HYPRE_USING_CUSPARSE)
1229       HYPRE_CUSPARSE_CALL( cusparseDestroy(data->cusparse_handle) );
1230 #elif defined(HYPRE_USING_ROCSPARSE)
1231       HYPRE_ROCSPARSE_CALL( rocsparse_destroy_handle(data->cusparse_handle) );
1232 #endif
1233    }
1234 #endif // #if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE)
1235 
1236    for (HYPRE_Int i = 0; i < HYPRE_MAX_NUM_STREAMS; i++)
1237    {
1238       if (data->cuda_streams[i])
1239       {
1240 #if defined(HYPRE_USING_DEVICE_OPENMP)
1241          HYPRE_CUDA_CALL( cudaStreamDestroy(data->cuda_streams[i]) );
1242 #elif defined(HYPRE_USING_CUDA)
1243          HYPRE_CUDA_CALL( cudaStreamDestroy(data->cuda_streams[i]) );
1244 #elif defined(HYPRE_USING_HIP)
1245          HYPRE_HIP_CALL( hipStreamDestroy(data->cuda_streams[i]) );
1246 #endif
1247       }
1248    }
1249 
1250 #ifdef HYPRE_USING_DEVICE_POOL
1251    hypre_CudaDataCubCachingAllocatorDestroy(data);
1252 #endif
1253 
1254    hypre_TFree(data, HYPRE_MEMORY_HOST);
1255 }
1256 
1257 HYPRE_Int
1258 hypre_SyncCudaDevice(hypre_Handle *hypre_handle)
1259 {
1260 #if defined(HYPRE_USING_DEVICE_OPENMP)
1261    HYPRE_CUDA_CALL( cudaDeviceSynchronize() );
1262 #elif defined(HYPRE_USING_CUDA)
1263    HYPRE_CUDA_CALL( cudaDeviceSynchronize() );
1264 #elif defined(HYPRE_USING_HIP)
1265    HYPRE_HIP_CALL( hipDeviceSynchronize() );
1266 #endif
1267    return hypre_error_flag;
1268 }
1269 
1270 HYPRE_Int
1271 hypre_ResetCudaDevice(hypre_Handle *hypre_handle)
1272 {
1273 #if defined(HYPRE_USING_CUDA)
1274    cudaDeviceReset();
1275 #elif defined(HYPRE_USING_HIP)
1276    hipDeviceReset();
1277 #endif
1278    return hypre_error_flag;
1279 }
1280 
1281 /* synchronize the Hypre compute stream
1282  * action: 0: set sync stream to false
1283  *         1: set sync stream to true
1284  *         2: restore sync stream to default
1285  *         3: return the current value of cuda_compute_stream_sync
1286  *         4: sync stream based on cuda_compute_stream_sync
1287  */
1288 HYPRE_Int
1289 hypre_SyncCudaComputeStream_core(HYPRE_Int     action,
1290                                  hypre_Handle *hypre_handle,
1291                                  HYPRE_Int    *cuda_compute_stream_sync_ptr)
1292 {
1293    /* with UVM, the default is to sync at kernel completion, since the host is also
1294     * able to touch GPU memory */
1295 #if defined(HYPRE_USING_UNIFIED_MEMORY)
1296    static const HYPRE_Int cuda_compute_stream_sync_default = 1;
1297 #else
1298    static const HYPRE_Int cuda_compute_stream_sync_default = 0;
1299 #endif
1300 
1301    /* this controls whether to synchronize the stream after computations */
1302    static HYPRE_Int cuda_compute_stream_sync = cuda_compute_stream_sync_default;
1303 
1304    switch (action)
1305    {
1306       case 0:
1307          cuda_compute_stream_sync = 0;
1308          break;
1309       case 1:
1310          cuda_compute_stream_sync = 1;
1311          break;
1312       case 2:
1313          cuda_compute_stream_sync = cuda_compute_stream_sync_default;
1314          break;
1315       case 3:
1316          *cuda_compute_stream_sync_ptr = cuda_compute_stream_sync;
1317          break;
1318       case 4:
1319 #if defined(HYPRE_USING_DEVICE_OPENMP)
1320          HYPRE_CUDA_CALL( cudaDeviceSynchronize() );
1321 #else
1322          if (cuda_compute_stream_sync)
1323          {
1324 #if defined(HYPRE_USING_CUDA)
1325             HYPRE_CUDA_CALL( cudaStreamSynchronize(hypre_HandleCudaComputeStream(hypre_handle)) );
1326 #elif defined(HYPRE_USING_HIP)
1327             HYPRE_HIP_CALL( hipStreamSynchronize(hypre_HandleCudaComputeStream(hypre_handle)) );
1328 #endif
1329          }
1330 #endif
1331          break;
1332       default:
1333          hypre_printf("hypre_SyncCudaComputeStream_core invalid action\n");
1334          hypre_error_in_arg(1);
1335    }
1336 
1337    return hypre_error_flag;
1338 }
1339 
1340 HYPRE_Int
1341 hypre_SetSyncCudaCompute(HYPRE_Int action)
1342 {
1343    /* convert to 1/0 */
1344    action = action != 0;
1345    hypre_SyncCudaComputeStream_core(action, NULL, NULL);
1346 
1347    return hypre_error_flag;
1348 }
1349 
1350 HYPRE_Int
1351 hypre_RestoreSyncCudaCompute()
1352 {
1353    hypre_SyncCudaComputeStream_core(2, NULL, NULL);
1354 
1355    return hypre_error_flag;
1356 }
1357 
1358 HYPRE_Int
1359 hypre_GetSyncCudaCompute(HYPRE_Int *cuda_compute_stream_sync_ptr)
1360 {
1361    hypre_SyncCudaComputeStream_core(3, NULL, cuda_compute_stream_sync_ptr);
1362 
1363    return hypre_error_flag;
1364 }
1365 
1366 HYPRE_Int
1367 hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle)
1368 {
1369    hypre_SyncCudaComputeStream_core(4, hypre_handle, NULL);
1370 
1371    return hypre_error_flag;
1372 }
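
/* A usage sketch (under "#if 0", not compiled): how the wrappers above are typically
 * combined to run a region of kernels asynchronously and synchronize once at the end
 * (action 3, hypre_GetSyncCudaCompute, can be used to query the current setting).
 * The function name is hypothetical. */
#if 0
void example_async_region(hypre_Handle *handle)
{
   hypre_SetSyncCudaCompute(0);          /* action 0: do not sync after each kernel */

   /* ... launch a sequence of device kernels on the compute stream ... */

   hypre_RestoreSyncCudaCompute();       /* action 2: restore the default behavior */
   hypre_SyncCudaComputeStream(handle);  /* action 4: sync the stream if syncing is enabled */
}
#endif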
1373 
1374 #endif // #if defined(HYPRE_USING_GPU)
1375 
1376 
1377 /* This function is supposed to be used in the test drivers to mimic
1378  * users' GPU binding approaches.
1379  * It is supposed to be called before HYPRE_Init,
1380  * so that HYPRE_Init can pick up the desired device id.
1381  */
1382 HYPRE_Int
1383 hypre_bind_device( HYPRE_Int myid,
1384                    HYPRE_Int nproc,
1385                    MPI_Comm  comm )
1386 {
1387 #ifdef HYPRE_USING_GPU
1388    /* proc id (rank) on the running node */
1389    HYPRE_Int myNodeid;
1390    /* num of procs (size) on the node */
1391    HYPRE_Int NodeSize;
1392    /* num of devices seen */
1393    hypre_int nDevices;
1394    /* device id to bind to */
1395    hypre_int device_id;
1396 
1397    hypre_MPI_Comm node_comm;
1398    hypre_MPI_Comm_split_type( comm, hypre_MPI_COMM_TYPE_SHARED,
1399                               myid, hypre_MPI_INFO_NULL, &node_comm );
1400    hypre_MPI_Comm_rank(node_comm, &myNodeid);
1401    hypre_MPI_Comm_size(node_comm, &NodeSize);
1402    hypre_MPI_Comm_free(&node_comm);
1403 
1404    /* get number of devices on this node */
1405    hypre_GetDeviceCount(&nDevices);
1406 
1407    /* set device */
1408    device_id = myNodeid % nDevices;
1409    hypre_SetDevice(device_id, NULL);
1410 
1411 #if defined(HYPRE_DEBUG) && defined(HYPRE_PRINT_ERRORS)
1412    hypre_printf("Proc [global %d/%d, local %d/%d] can see %d GPUs and is running on %d\n",
1413                 myid, nproc, myNodeid, NodeSize, nDevices, device_id);
1414 #endif
1415 
1416 #endif /* #ifdef HYPRE_USING_GPU */
1417 
1418    return hypre_error_flag;
1419 }
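
/* A usage sketch (under "#if 0", not compiled): a test driver would typically bind the
 * device before initializing hypre, as described above. The driver skeleton below is
 * hypothetical. */
#if 0
int example_driver_main(int argc, char *argv[])
{
   int myid, nproc;

   MPI_Init(&argc, &argv);
   MPI_Comm_rank(MPI_COMM_WORLD, &myid);
   MPI_Comm_size(MPI_COMM_WORLD, &nproc);

   hypre_bind_device(myid, nproc, MPI_COMM_WORLD);   /* choose the GPU for this rank */
   HYPRE_Init();

   /* ... set up and solve ... */

   HYPRE_Finalize();
   MPI_Finalize();

   return 0;
}
#endif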
1420 
1421