1 /******************************************************************************
2 * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other
3 * HYPRE Project Developers. See the top-level COPYRIGHT file for details.
4 *
5 * SPDX-License-Identifier: (Apache-2.0 OR MIT)
6 ******************************************************************************/
7
8 #include "_hypre_utilities.h"
9 #include "_hypre_utilities.hpp"
10
11 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
12
13 /*
14 * The architecture identification macro __CUDA_ARCH__ is assigned a three-digit value string xy0
15 * (ending in a literal 0) during each nvcc compilation stage 1 that compiles for compute_xy.
16 * This macro can be used in the implementation of GPU functions for determining the virtual architecture
17 * for which it is currently being compiled. The host code (the non-GPU code) must not depend on it.
18 * Note that compute_XX refers to a PTX version and sm_XX refers to a cubin version.
19 */
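/* For example, compiling with "-gencode arch=compute_70,code=sm_70" makes
 * __CUDA_ARCH__ equal to 700 in the device-side compilation pass; the kernel and
 * the check below compare this value against the compute capability of the device
 * that is actually in use. */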
20 __global__ void
21 hypreCUDAKernel_CompileFlagSafetyCheck(hypre_int *cuda_arch_compile)
22 {
23 #if defined(__CUDA_ARCH__)
24 cuda_arch_compile[0] = __CUDA_ARCH__;
25 #endif
26 }
27
28 /*
29 * Assume this function is called inside HYPRE_Init(), at a place where we do not want to
30 * activate memory pooling, so we do not use hypre's memory model to Alloc and Free.
31  * See the commented-out code below (and do not delete it)
32 */
33 void hypre_CudaCompileFlagCheck()
34 {
35 // This is really only defined for CUDA and not for HIP
36 #if defined(HYPRE_USING_CUDA)
37
38 HYPRE_Int device = hypre_HandleCudaDevice(hypre_handle());
39
40 struct cudaDeviceProp props;
41 cudaGetDeviceProperties(&props, device);
42 hypre_int cuda_arch_actual = props.major*100 + props.minor*10;
43 hypre_int cuda_arch_compile = -1;
44 dim3 gDim(1,1,1), bDim(1,1,1);
45
46 hypre_int *cuda_arch_compile_d = NULL;
47 //cuda_arch_compile_d = hypre_TAlloc(hypre_int, 1, HYPRE_MEMORY_DEVICE);
48 HYPRE_CUDA_CALL( cudaMalloc(&cuda_arch_compile_d, sizeof(hypre_int)) );
49 hypre_TMemcpy(cuda_arch_compile_d, &cuda_arch_compile, hypre_int, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
50 HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CompileFlagSafetyCheck, gDim, bDim, cuda_arch_compile_d );
51 hypre_TMemcpy(&cuda_arch_compile, cuda_arch_compile_d, hypre_int, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);
52 //hypre_TFree(cuda_arch_compile_d, HYPRE_MEMORY_DEVICE);
53 HYPRE_CUDA_CALL( cudaFree(cuda_arch_compile_d) );
54
55 /* HYPRE_CUDA_CALL(cudaDeviceSynchronize()); */
56
57 if (-1 == cuda_arch_compile)
58 {
59 hypre_error_w_msg(1, "hypre error: no proper cuda_arch found");
60 }
61 else if (cuda_arch_actual != cuda_arch_compile)
62 {
63 char msg[256];
64 hypre_sprintf(msg, "hypre warning: Compile 'arch=compute_' does not match device arch %d", cuda_arch_actual);
65 hypre_error_w_msg(1, msg);
66 /*
67 hypre_printf("%s\n", msg);
68 hypre_MPI_Abort(hypre_MPI_COMM_WORLD, -1);
69 */
70 }
71
72 #endif // defined(HYPRE_USING_CUDA)
73 }
74
75 dim3
76 hypre_GetDefaultCUDABlockDimension()
77 {
78 dim3 bDim(512, 1, 1);
79
80 return bDim;
81 }
82
83 dim3
84 hypre_GetDefaultCUDAGridDimension( HYPRE_Int n,
85 const char *granularity,
86 dim3 bDim )
87 {
88 HYPRE_Int num_blocks = 0;
89 HYPRE_Int num_threads_per_block = bDim.x * bDim.y * bDim.z;
90
91 if (granularity[0] == 't')
92 {
93 num_blocks = (n + num_threads_per_block - 1) / num_threads_per_block;
94 }
95 else if (granularity[0] == 'w')
96 {
97 HYPRE_Int num_warps_per_block = num_threads_per_block >> HYPRE_WARP_BITSHIFT;
98
99 hypre_assert(num_warps_per_block * HYPRE_WARP_SIZE == num_threads_per_block);
100
101 num_blocks = (n + num_warps_per_block - 1) / num_warps_per_block;
102 }
103 else
104 {
105 hypre_printf("Error %s %d: Unknown granularity !\n", __FILE__, __LINE__);
106 hypre_assert(0);
107 }
108
109 dim3 gDim(num_blocks, 1, 1);
110
111 return gDim;
112 }
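/* Worked example for the helper above: with n = 1000 rows and the default
 * 512-thread block, "thread" granularity gives (1000 + 511) / 512 = 2 blocks
 * (one thread per row), while "warp" granularity with a 32-thread warp gives
 * 512 / 32 = 16 warps per block and hence (1000 + 15) / 16 = 63 blocks
 * (one warp per row). */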
113
114 /**
115  * Get NNZ of each row in d_row_indices and store the results in d_rownnz
116 * All pointers are device pointers.
117 * d_rownnz can be the same as d_row_indices
118 */
119 __global__ void
120 hypreCUDAKernel_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_ia, HYPRE_Int *d_offd_ia,
121 HYPRE_Int *d_rownnz)
122 {
123 const HYPRE_Int global_thread_id = hypre_cuda_get_grid_thread_id<1,1>();
124
125 if (global_thread_id < nrows)
126 {
127 HYPRE_Int i;
128
129 if (d_row_indices)
130 {
131 i = read_only_load(&d_row_indices[global_thread_id]);
132 }
133 else
134 {
135 i = global_thread_id;
136 }
137
138 d_rownnz[global_thread_id] = read_only_load(&d_diag_ia[i+1]) - read_only_load(&d_diag_ia[i]) +
139 read_only_load(&d_offd_ia[i+1]) - read_only_load(&d_offd_ia[i]);
140 }
141 }
142
143 /* special case: if d_row_indices == NULL, it means d_row_indices=[0,1,...,nrows-1] */
144 HYPRE_Int
145 hypreDevice_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_ia, HYPRE_Int *d_offd_ia,
146 HYPRE_Int *d_rownnz)
147 {
148 const dim3 bDim = hypre_GetDefaultCUDABlockDimension();
149 const dim3 gDim = hypre_GetDefaultCUDAGridDimension(nrows, "thread", bDim);
150
151 /* trivial case */
152 if (nrows <= 0)
153 {
154 return hypre_error_flag;
155 }
156
157 HYPRE_CUDA_LAUNCH( hypreCUDAKernel_GetRowNnz, gDim, bDim, nrows, d_row_indices, d_diag_ia, d_offd_ia, d_rownnz );
158
159 return hypre_error_flag;
160 }
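/* Worked example: with nrows = 2, d_row_indices = NULL, d_diag_ia = {0, 2, 4}
 * and d_offd_ia = {0, 1, 1}, the result is d_rownnz = {3, 2}; row 0 has
 * 2 diag + 1 offd entries and row 1 has 2 diag + 0 offd entries. */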
161
162 __global__ void
163 hypreCUDAKernel_CopyParCSRRows(HYPRE_Int nrows,
164 HYPRE_Int *d_row_indices,
165 HYPRE_Int has_offd,
166 HYPRE_BigInt first_col,
167 HYPRE_BigInt *d_col_map_offd_A,
168 HYPRE_Int *d_diag_i,
169 HYPRE_Int *d_diag_j,
170 HYPRE_Complex *d_diag_a,
171 HYPRE_Int *d_offd_i,
172 HYPRE_Int *d_offd_j,
173 HYPRE_Complex *d_offd_a,
174 HYPRE_Int *d_ib,
175 HYPRE_BigInt *d_jb,
176 HYPRE_Complex *d_ab)
177 {
178 const HYPRE_Int global_warp_id = hypre_cuda_get_grid_warp_id<1,1>();
179
180 if (global_warp_id >= nrows)
181 {
182 return;
183 }
184
185 /* lane id inside the warp */
186 const HYPRE_Int lane_id = hypre_cuda_get_lane_id<1>();
187 HYPRE_Int i, j, k, p, row, istart, iend, bstart;
188
189 /* diag part */
190 if (lane_id < 2)
191 {
192 /* row index to work on */
193 if (d_row_indices)
194 {
195 row = read_only_load(d_row_indices + global_warp_id);
196 }
197 else
198 {
199 row = global_warp_id;
200 }
201 /* start/end position of the row */
202 j = read_only_load(d_diag_i + row + lane_id);
203 /* start position of b */
204 k = d_ib ? read_only_load(d_ib + global_warp_id) : 0;
205 }
206 istart = __shfl_sync(HYPRE_WARP_FULL_MASK, j, 0);
207 iend = __shfl_sync(HYPRE_WARP_FULL_MASK, j, 1);
208 bstart = __shfl_sync(HYPRE_WARP_FULL_MASK, k, 0);
209
210 p = bstart - istart;
211 for (i = istart + lane_id; i < iend; i += HYPRE_WARP_SIZE)
212 {
213 d_jb[p+i] = read_only_load(d_diag_j + i) + first_col;
214 if (d_ab)
215 {
216 d_ab[p+i] = read_only_load(d_diag_a + i);
217 }
218 }
219
220 if (!has_offd)
221 {
222 return;
223 }
224
225 /* offd part */
226 if (lane_id < 2)
227 {
228 j = read_only_load(d_offd_i + row + lane_id);
229 }
230 bstart += iend - istart;
231 istart = __shfl_sync(HYPRE_WARP_FULL_MASK, j, 0);
232 iend = __shfl_sync(HYPRE_WARP_FULL_MASK, j, 1);
233
234 p = bstart - istart;
235 for (i = istart + lane_id; i < iend; i += HYPRE_WARP_SIZE)
236 {
237 if (d_col_map_offd_A)
238 {
239 d_jb[p+i] = d_col_map_offd_A[read_only_load(d_offd_j + i)];
240 }
241 else
242 {
243 d_jb[p+i] = -1 - read_only_load(d_offd_j + i);
244 }
245
246 if (d_ab)
247 {
248 d_ab[p+i] = read_only_load(d_offd_a + i);
249 }
250 }
251 }
252
253 /* B = A(row_indices, :) */
254 /* Note: d_ib is an input vector of length (nrows + 1), or nrows if the last
255  * entry (nnz) is omitted, that contains row ptrs, i.e., the start positions
256  * where to put the rows in d_jb and d_ab.
257  * The col indices in B are global indices, i.e., BigJ */
258 /* Special cases:
259 * if d_row_indices == NULL, it means d_row_indices=[0,1,...,nrows-1]
260 * If col_map_offd_A == NULL, use (-1 - d_offd_j) as column id
261 * If nrows == 1 and d_ib == NULL, it means d_ib[0] = 0 */
262 HYPRE_Int
263 hypreDevice_CopyParCSRRows(HYPRE_Int nrows,
264 HYPRE_Int *d_row_indices,
265 HYPRE_Int job,
266 HYPRE_Int has_offd,
267 HYPRE_BigInt first_col,
268 HYPRE_BigInt *d_col_map_offd_A,
269 HYPRE_Int *d_diag_i,
270 HYPRE_Int *d_diag_j,
271 HYPRE_Complex *d_diag_a,
272 HYPRE_Int *d_offd_i,
273 HYPRE_Int *d_offd_j,
274 HYPRE_Complex *d_offd_a,
275 HYPRE_Int *d_ib,
276 HYPRE_BigInt *d_jb,
277 HYPRE_Complex *d_ab)
278 {
279 /* trivial case */
280 if (nrows <= 0)
281 {
282 return hypre_error_flag;
283 }
284
285 hypre_assert(!(nrows > 1 && d_ib == NULL));
286
287 const dim3 bDim = hypre_GetDefaultCUDABlockDimension();
288 const dim3 gDim = hypre_GetDefaultCUDAGridDimension(nrows, "warp", bDim);
289
290 /*
291 if (job == 2)
292 {
293 }
294 */
295
296 HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CopyParCSRRows, gDim, bDim,
297 nrows, d_row_indices, has_offd, first_col, d_col_map_offd_A,
298 d_diag_i, d_diag_j, d_diag_a,
299 d_offd_i, d_offd_j, d_offd_a,
300 d_ib, d_jb, d_ab );
301
302 return hypre_error_flag;
303 }
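/* Usage sketch (illustrative only; d_rows, d_jb, d_ab, etc. are assumed device
 * arrays, not names used elsewhere in hypre): extract rows 3 and 7 of a ParCSR
 * matrix into global-index/value buffers. d_ib must already hold the output row
 * pointers, e.g., {0, nnz(row 3)}, obtained from hypreDevice_GetRowNnz followed
 * by hypreDevice_IntegerExclusiveScan; the job argument is currently unused.
 *
 *    HYPRE_Int rows[2] = {3, 7};   // copied to the device array d_rows
 *    hypreDevice_CopyParCSRRows(2, d_rows, 1, has_offd, first_col, d_col_map_offd_A,
 *                               d_diag_i, d_diag_j, d_diag_a,
 *                               d_offd_i, d_offd_j, d_offd_a,
 *                               d_ib, d_jb, d_ab);
 */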
304
305 HYPRE_Int
306 hypreDevice_IntegerReduceSum(HYPRE_Int n, HYPRE_Int *d_i)
307 {
308 return HYPRE_THRUST_CALL(reduce, d_i, d_i + n);
309 }
310
311 HYPRE_Int
312 hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i)
313 {
314 HYPRE_THRUST_CALL(inclusive_scan, d_i, d_i + n, d_i);
315
316 return hypre_error_flag;
317 }
318
319 HYPRE_Int
320 hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i)
321 {
322 HYPRE_THRUST_CALL(exclusive_scan, d_i, d_i + n, d_i);
323
324 return hypre_error_flag;
325 }
326
327 HYPRE_Int
328 hypreDevice_Scalen(HYPRE_Complex *d_x, size_t n, HYPRE_Complex v)
329 {
330 HYPRE_THRUST_CALL( transform, d_x, d_x + n, d_x, v * _1 );
331
332 return hypre_error_flag;
333 }
334
335 HYPRE_Int
336 hypreDevice_Filln(HYPRE_Complex *d_x, size_t n, HYPRE_Complex v)
337 {
338 HYPRE_THRUST_CALL( fill_n, d_x, n, v);
339
340 return hypre_error_flag;
341 }
342
343 HYPRE_Int
344 hypreDevice_BigIntFilln(HYPRE_BigInt *d_x, size_t n, HYPRE_BigInt v)
345 {
346 HYPRE_THRUST_CALL( fill_n, d_x, n, v);
347
348 return hypre_error_flag;
349 }
350
351 struct hypre_empty_row_functor
352 {
353 // This is needed for clang
354 typedef bool result_type;
355
356 __device__
357 bool operator()(const thrust::tuple<HYPRE_Int, HYPRE_Int>& t) const
358 {
359 const HYPRE_Int a = thrust::get<0>(t);
360 const HYPRE_Int b = thrust::get<1>(t);
361
362 return a != b;
363 }
364 };
365
366 HYPRE_Int*
367 hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr)
368 {
369 /* trivial case */
370 if (nrows <= 0 || nnz <= 0)
371 {
372 return NULL;
373 }
374
375 HYPRE_Int *d_row_ind = hypre_TAlloc(HYPRE_Int, nnz, HYPRE_MEMORY_DEVICE);
376
377 hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnz, d_row_ptr, d_row_ind);
378
379 return d_row_ind;
380 }
381
382 HYPRE_Int
383 hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, HYPRE_Int *d_row_ind)
384 {
385 /* trivial case */
386 if (nrows <= 0 || nnz <= 0)
387 {
388 return hypre_error_flag;
389 }
390
391 HYPRE_THRUST_CALL( fill, d_row_ind, d_row_ind + nnz, 0 );
392
393 HYPRE_THRUST_CALL( scatter_if,
394 thrust::counting_iterator<HYPRE_Int>(0),
395 thrust::counting_iterator<HYPRE_Int>(nrows),
396 d_row_ptr,
397 thrust::make_transform_iterator( thrust::make_zip_iterator(thrust::make_tuple(d_row_ptr, d_row_ptr+1)),
398 hypre_empty_row_functor() ),
399 d_row_ind );
400
401 HYPRE_THRUST_CALL( inclusive_scan, d_row_ind, d_row_ind + nnz, d_row_ind, thrust::maximum<HYPRE_Int>());
402
403 return hypre_error_flag;
404 }
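/* Worked example: d_row_ptr = {0, 2, 2, 5} (3 rows, 5 nonzeros). After the fill
 * and scatter_if the work array is {0, 0, 2, 0, 0} (row 1 is empty, so its start
 * is not scattered), and the inclusive max-scan turns it into the row indices
 * d_row_ind = {0, 0, 2, 2, 2}. */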
405
406 /* Input: d_row_num, of size nrows, contains the row indices, which can be BigInt or Int
407 * Output: d_row_ind */
408 template <typename T>
409 HYPRE_Int
410 hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, T *d_row_num, T *d_row_ind)
411 {
412 /* trivial case */
413 if (nrows <= 0)
414 {
415 return hypre_error_flag;
416 }
417
418 HYPRE_Int *map = hypre_TAlloc(HYPRE_Int, nnz, HYPRE_MEMORY_DEVICE);
419
420 hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnz, d_row_ptr, map);
421
422 HYPRE_THRUST_CALL(gather, map, map + nnz, d_row_num, d_row_ind);
423
424 hypre_TFree(map, HYPRE_MEMORY_DEVICE);
425
426 return hypre_error_flag;
427 }
428
429 template HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, HYPRE_Int *d_row_num, HYPRE_Int *d_row_ind);
430 #if defined(HYPRE_MIXEDINT)
431 template HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, HYPRE_BigInt *d_row_num, HYPRE_BigInt *d_row_ind);
432 #endif
433
434 HYPRE_Int*
435 hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind)
436 {
437 HYPRE_Int *d_row_ptr = hypre_TAlloc(HYPRE_Int, nrows+1, HYPRE_MEMORY_DEVICE);
438
439 HYPRE_THRUST_CALL( lower_bound,
440 d_row_ind, d_row_ind + nnz,
441 thrust::counting_iterator<HYPRE_Int>(0),
442 thrust::counting_iterator<HYPRE_Int>(nrows+1),
443 d_row_ptr);
444
445 return d_row_ptr;
446 }
447
448 HYPRE_Int
449 hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, HYPRE_Int *d_row_ptr)
450 {
451 HYPRE_THRUST_CALL( lower_bound,
452 d_row_ind, d_row_ind + nnz,
453 thrust::counting_iterator<HYPRE_Int>(0),
454 thrust::counting_iterator<HYPRE_Int>(nrows+1),
455 d_row_ptr);
456
457 return hypre_error_flag;
458 }
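/* Worked example: with nrows = 3 and sorted d_row_ind = {0, 0, 2, 2, 2}, the
 * lower_bound of {0, 1, 2, 3} in d_row_ind gives d_row_ptr = {0, 2, 2, 5},
 * i.e., the inverse of the row-pointer expansion above. */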
459
460 __global__ void
461 hypreCUDAKernel_ScatterAddTrivial(HYPRE_Int n, HYPRE_Real *x, HYPRE_Int *map, HYPRE_Real *y)
462 {
463 for (HYPRE_Int i = 0; i < n; i++)
464 {
465 x[map[i]] += y[i];
466 }
467 }
468
469 /* x[map[i]] += y[i], same index cannot appear more than once in map */
470 __global__ void
471 hypreCUDAKernel_ScatterAdd(HYPRE_Int n, HYPRE_Real *x, HYPRE_Int *map, HYPRE_Real *y)
472 {
473 HYPRE_Int global_thread_id = hypre_cuda_get_grid_thread_id<1,1>();
474
475 if (global_thread_id < n)
476 {
477 x[map[global_thread_id]] += y[global_thread_id];
478 }
479 }
480
481 /* Generalized Scatter-and-Add
482 * for i = 0 : ny-1, x[map[i]] += y[i];
483 * Note: An index is allowed to appear more than once in map
484 * Content in y will be destroyed
485  * When work != NULL, work must be at least of size [2*sizeof(HYPRE_Int)+sizeof(HYPRE_Complex)]*ny
486 */
487 HYPRE_Int
488 hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Real *y, char *work)
489 {
490 if (ny <= 0)
491 {
492 return hypre_error_flag;
493 }
494
495 if (ny <= 2)
496 {
497 /* trivial cases, n = 1, 2 */
498 dim3 bDim = 1;
499 dim3 gDim = 1;
500 HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ScatterAddTrivial, gDim, bDim, ny, x, map, y );
501 }
502 else
503 {
504 /* general cases */
505 HYPRE_Int *map2, *reduced_map, reduced_n;
506 HYPRE_Real *reduced_y;
507
508 if (work)
509 {
510 map2 = (HYPRE_Int *) work;
511 reduced_map = map2 + ny;
512 reduced_y = (HYPRE_Real *) (reduced_map + ny);
513 }
514 else
515 {
516 map2 = hypre_TAlloc(HYPRE_Int, ny, HYPRE_MEMORY_DEVICE);
517 reduced_map = hypre_TAlloc(HYPRE_Int, ny, HYPRE_MEMORY_DEVICE);
518 reduced_y = hypre_TAlloc(HYPRE_Real, ny, HYPRE_MEMORY_DEVICE);
519 }
520
521 hypre_TMemcpy(map2, map, HYPRE_Int, ny, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE);
522
523 HYPRE_THRUST_CALL(sort_by_key, map2, map2 + ny, y);
524
525 thrust::pair<HYPRE_Int*, HYPRE_Real*> new_end = HYPRE_THRUST_CALL( reduce_by_key,
526 map2,
527 map2 + ny,
528 y,
529 reduced_map,
530 reduced_y );
531
532 reduced_n = new_end.first - reduced_map;
533
534 hypre_assert(reduced_n == new_end.second - reduced_y);
535
536 dim3 bDim = hypre_GetDefaultCUDABlockDimension();
537 dim3 gDim = hypre_GetDefaultCUDAGridDimension(reduced_n, "thread", bDim);
538
539 HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ScatterAdd, gDim, bDim,
540 reduced_n, x, reduced_map, reduced_y );
541
542 if (!work)
543 {
544 hypre_TFree(map2, HYPRE_MEMORY_DEVICE);
545 hypre_TFree(reduced_map, HYPRE_MEMORY_DEVICE);
546 hypre_TFree(reduced_y, HYPRE_MEMORY_DEVICE);
547 }
548 }
549
550 return hypre_error_flag;
551 }
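/* Worked example: x = {10, 20, 30}, ny = 4, map = {2, 0, 2, 0}, y = {1, 2, 3, 4}.
 * After sort_by_key and reduce_by_key, reduced_map = {0, 2} and reduced_y = {6, 4},
 * so the scatter-add kernel produces x = {16, 20, 34}. */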
552
553 /* x[map[i]] = v */
554 template <typename T>
555 __global__ void
556 hypreCUDAKernel_ScatterConstant(T *x, HYPRE_Int n, HYPRE_Int *map, T v)
557 {
558 HYPRE_Int global_thread_id = hypre_cuda_get_grid_thread_id<1,1>();
559
560 if (global_thread_id < n)
561 {
562 x[map[global_thread_id]] = v;
563 }
564 }
565
566 /* x[map[i]] = v
567 * n is length of map
568 * TODO: thrust? */
569 template <typename T>
570 HYPRE_Int
571 hypreDevice_ScatterConstant(T *x, HYPRE_Int n, HYPRE_Int *map, T v)
572 {
573 /* trivial case */
574 if (n <= 0)
575 {
576 return hypre_error_flag;
577 }
578
579 dim3 bDim = hypre_GetDefaultCUDABlockDimension();
580 dim3 gDim = hypre_GetDefaultCUDAGridDimension(n, "thread", bDim);
581
582 HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ScatterConstant, gDim, bDim, x, n, map, v );
583
584 return hypre_error_flag;
585 }
586
587 template HYPRE_Int hypreDevice_ScatterConstant(HYPRE_Int *x, HYPRE_Int n, HYPRE_Int *map, HYPRE_Int v);
588 template HYPRE_Int hypreDevice_ScatterConstant(HYPRE_Complex *x, HYPRE_Int n, HYPRE_Int *map, HYPRE_Complex v);
589
590 __global__ void
591 hypreCUDAKernel_IVAXPY(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Complex *y)
592 {
593 HYPRE_Int i = hypre_cuda_get_grid_thread_id<1,1>();
594
595 if (i < n)
596 {
597 y[i] += x[i] / a[i];
598 }
599 }
600
601 /* Inverse Vector AXPY: y[i] = x[i] / a[i] + y[i] */
602 HYPRE_Int
603 hypreDevice_IVAXPY(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Complex *y)
604 {
605 /* trivial case */
606 if (n <= 0)
607 {
608 return hypre_error_flag;
609 }
610
611 dim3 bDim = hypre_GetDefaultCUDABlockDimension();
612 dim3 gDim = hypre_GetDefaultCUDAGridDimension(n, "thread", bDim);
613
614 HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IVAXPY, gDim, bDim, n, a, x, y );
615
616 return hypre_error_flag;
617 }
618
619 __global__ void
620 hypreCUDAKernel_IVAXPYMarked(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Complex *y, HYPRE_Int *marker, HYPRE_Int marker_val)
621 {
622 HYPRE_Int i = hypre_cuda_get_grid_thread_id<1,1>();
623
624 if (i < n)
625 {
626 if (marker[i] == marker_val)
627 {
628 y[i] += x[i] / a[i];
629 }
630 }
631 }
632
633 /* Inverse Vector AXPY: y[i] = x[i] / a[i] + y[i] */
634 HYPRE_Int
635 hypreDevice_IVAXPYMarked(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Complex *y, HYPRE_Int *marker, HYPRE_Int marker_val)
636 {
637 /* trivial case */
638 if (n <= 0)
639 {
640 return hypre_error_flag;
641 }
642
643 dim3 bDim = hypre_GetDefaultCUDABlockDimension();
644 dim3 gDim = hypre_GetDefaultCUDAGridDimension(n, "thread", bDim);
645
646 HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IVAXPYMarked, gDim, bDim, n, a, x, y, marker, marker_val );
647
648 return hypre_error_flag;
649 }
650
651 __global__ void
652 hypreCUDAKernel_DiagScaleVector(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, HYPRE_Complex *x, HYPRE_Complex beta, HYPRE_Complex *y)
653 {
654 HYPRE_Int i = hypre_cuda_get_grid_thread_id<1,1>();
655
656 if (i < n)
657 {
658 if (beta != 0.0)
659 {
660 y[i] = x[i] / A_data[A_i[i]] + beta * y[i];
661 }
662 else
663 {
664 y[i] = x[i] / A_data[A_i[i]];
665 }
666 }
667 }
668
669 /* y = diag(A) \ x + beta y
670 * Note: Assume A_i[i] points to the ith diagonal entry of A */
671 HYPRE_Int
672 hypreDevice_DiagScaleVector(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, HYPRE_Complex *x, HYPRE_Complex beta, HYPRE_Complex *y)
673 {
674 /* trivial case */
675 if (n <= 0)
676 {
677 return hypre_error_flag;
678 }
679
680 dim3 bDim = hypre_GetDefaultCUDABlockDimension();
681 dim3 gDim = hypre_GetDefaultCUDAGridDimension(n, "thread", bDim);
682
683 HYPRE_CUDA_LAUNCH( hypreCUDAKernel_DiagScaleVector, gDim, bDim, n, A_i, A_data, x, beta, y );
684
685 return hypre_error_flag;
686 }
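/* Usage sketch (illustrative only; d_A_i, d_A_data, d_r and d_u are assumed
 * names): one unweighted Jacobi-type update u := u + D^{-1} r, where d_r already
 * holds the residual and the diagonal entry is stored first in each row of the
 * diag CSR part:
 *
 *    hypreDevice_DiagScaleVector(n, d_A_i, d_A_data, d_r, 1.0, d_u);
 */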
687
688 __global__ void
689 hypreCUDAKernel_DiagScaleVector2(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, HYPRE_Complex *x, HYPRE_Complex beta, HYPRE_Complex *y, HYPRE_Complex *z)
690 {
691 HYPRE_Int i = hypre_cuda_get_grid_thread_id<1,1>();
692
693 if (i < n)
694 {
695 HYPRE_Complex t = x[i] / A_data[A_i[i]];
696 y[i] = t;
697 z[i] += beta*t;
698 }
699 }
700
701 /* y = diag(A) \ x
702 * z = beta * (diag(A) \ x) + z
703 * Note: Assume A_i[i] points to the ith diagonal entry of A */
704 HYPRE_Int
705 hypreDevice_DiagScaleVector2(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, HYPRE_Complex *x, HYPRE_Complex beta, HYPRE_Complex *y, HYPRE_Complex *z)
706 {
707 /* trivial case */
708 if (n <= 0)
709 {
710 return hypre_error_flag;
711 }
712
713 dim3 bDim = hypre_GetDefaultCUDABlockDimension();
714 dim3 gDim = hypre_GetDefaultCUDAGridDimension(n, "thread", bDim);
715
716 HYPRE_CUDA_LAUNCH( hypreCUDAKernel_DiagScaleVector2, gDim, bDim, n, A_i, A_data, x, beta, y, z );
717
718 return hypre_error_flag;
719 }
720
721 __global__ void
722 hypreCUDAKernel_BigToSmallCopy( HYPRE_Int* __restrict__ tgt,
723 const HYPRE_BigInt* __restrict__ src,
724 HYPRE_Int size)
725 {
726 HYPRE_Int i = hypre_cuda_get_grid_thread_id<1,1>();
727
728 if (i < size)
729 {
730 tgt[i] = src[i];
731 }
732 }
733
734 HYPRE_Int
735 hypreDevice_BigToSmallCopy(HYPRE_Int *tgt, const HYPRE_BigInt *src, HYPRE_Int size)
736 {
737 dim3 bDim = hypre_GetDefaultCUDABlockDimension();
738 dim3 gDim = hypre_GetDefaultCUDAGridDimension(size, "thread", bDim);
739
740 HYPRE_CUDA_LAUNCH( hypreCUDAKernel_BigToSmallCopy, gDim, bDim, tgt, src, size);
741
742 return hypre_error_flag;
743 }
744
745
746 /* https://github.com/OrangeOwlSolutions/Thrust/blob/master/Sort_by_key_with_tuple_key.cu */
747 /* opt: 0, (a,b) < (a',b') iff a < a' or (a = a' and b < b') [normal tuple comp]
748 * 1, (a,b) < (a',b') iff a < a' or (a = a' and |b| > |b'|) [used in dropping small entries]
749 * 2, (a,b) < (a',b') iff a < a' or (a = a' and (b == a or b < b') and b' != a') [used in putting diagonal first]
750 */
751 template <typename T1, typename T2, typename T3>
752 HYPRE_Int
753 hypreDevice_StableSortByTupleKey(HYPRE_Int N, T1 *keys1, T2 *keys2, T3 *vals, HYPRE_Int opt)
754 {
755 auto begin_keys = thrust::make_zip_iterator(thrust::make_tuple(keys1, keys2));
756 auto end_keys = thrust::make_zip_iterator(thrust::make_tuple(keys1 + N, keys2 + N));
757
758 if (opt == 0)
759 {
760 HYPRE_THRUST_CALL(stable_sort_by_key, begin_keys, end_keys, vals, thrust::less< thrust::tuple<T1, T2> >());
761 }
762 else if (opt == 1)
763 {
764 HYPRE_THRUST_CALL(stable_sort_by_key, begin_keys, end_keys, vals, TupleComp2<T1,T2>());
765 }
766 else if (opt == 2)
767 {
768 HYPRE_THRUST_CALL(stable_sort_by_key, begin_keys, end_keys, vals, TupleComp3<T1,T2>());
769 }
770
771 return hypre_error_flag;
772 }
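/* Worked example for opt = 2: sorting the COO entries of a 2x2 matrix with
 * keys1 (rows) = {1, 0, 1, 0}, keys2 (cols) = {0, 1, 1, 0}, vals = {c, b, d, a}
 * gives keys1 = {0, 0, 1, 1}, keys2 = {0, 1, 1, 0}, vals = {a, b, d, c}:
 * rows are ascending and the diagonal entry comes first within each row. */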
773
774 template HYPRE_Int hypreDevice_StableSortByTupleKey(HYPRE_Int N, HYPRE_Int *keys1, HYPRE_Int *keys2, HYPRE_Int *vals, HYPRE_Int opt);
775 template HYPRE_Int hypreDevice_StableSortByTupleKey(HYPRE_Int N, HYPRE_Int *keys1, HYPRE_Real *keys2, HYPRE_Int *vals, HYPRE_Int opt);
776 template HYPRE_Int hypreDevice_StableSortByTupleKey(HYPRE_Int N, HYPRE_Int *keys1, HYPRE_Int *keys2, HYPRE_Complex *vals, HYPRE_Int opt);
777
778 /* opt:
779  * 0, (a,b) < (a',b') iff a < a' or (a = a' and b < b') [normal tuple comp]
780 * 2, (a,b) < (a',b') iff a < a' or (a = a' and (b == a or b < b') and b' != a') [used in assembly to put diagonal first]
781 */
782 template <typename T1, typename T2, typename T3, typename T4>
783 HYPRE_Int
784 hypreDevice_StableSortTupleByTupleKey(HYPRE_Int N, T1 *keys1, T2 *keys2, T3 *vals1, T4 *vals2, HYPRE_Int opt)
785 {
786 auto begin_keys = thrust::make_zip_iterator(thrust::make_tuple(keys1, keys2));
787 auto end_keys = thrust::make_zip_iterator(thrust::make_tuple(keys1 + N, keys2 + N));
788 auto begin_vals = thrust::make_zip_iterator(thrust::make_tuple(vals1, vals2));
789
790 if (opt == 0)
791 {
792 HYPRE_THRUST_CALL(stable_sort_by_key, begin_keys, end_keys, begin_vals, thrust::less< thrust::tuple<T1, T2> >());
793 }
794 else if (opt == 2)
795 {
796 HYPRE_THRUST_CALL(stable_sort_by_key, begin_keys, end_keys, begin_vals, TupleComp3<T1,T2>());
797 }
798
799 return hypre_error_flag;
800 }
801
802 template HYPRE_Int hypreDevice_StableSortTupleByTupleKey(HYPRE_Int N, HYPRE_Int *keys1, HYPRE_Int *keys2, char *vals1, HYPRE_Complex *vals2, HYPRE_Int opt);
803 #if defined(HYPRE_MIXEDINT)
804 template HYPRE_Int hypreDevice_StableSortTupleByTupleKey(HYPRE_Int N, HYPRE_BigInt *keys1, HYPRE_BigInt *keys2, char *vals1, HYPRE_Complex *vals2, HYPRE_Int opt);
805 #endif
806
807 template <typename T1, typename T2, typename T3>
808 HYPRE_Int
809 hypreDevice_ReduceByTupleKey(HYPRE_Int N, T1 *keys1_in, T2 *keys2_in, T3 *vals_in,
810 T1 *keys1_out, T2 *keys2_out, T3 *vals_out)
811 {
812 auto begin_keys_in = thrust::make_zip_iterator(thrust::make_tuple(keys1_in, keys2_in ));
813 auto end_keys_in = thrust::make_zip_iterator(thrust::make_tuple(keys1_in + N, keys2_in + N));
814 auto begin_keys_out = thrust::make_zip_iterator(thrust::make_tuple(keys1_out, keys2_out ));
815 thrust::equal_to< thrust::tuple<T1, T2> > pred;
816 thrust::plus<T3> func;
817
818 auto new_end = HYPRE_THRUST_CALL(reduce_by_key, begin_keys_in, end_keys_in, vals_in, begin_keys_out, vals_out, pred, func);
819
820 return new_end.second - vals_out;
821 }
822
823 template HYPRE_Int hypreDevice_ReduceByTupleKey(HYPRE_Int N, HYPRE_Int *keys1_in, HYPRE_Int *keys2_in, HYPRE_Complex *vals_in, HYPRE_Int *keys1_out, HYPRE_Int *keys2_out, HYPRE_Complex *vals_out);
824
825 #endif // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
826
827 #if defined(HYPRE_USING_CUSPARSE)
828 /*
829 * @brief Determines the associated CudaDataType for the HYPRE_Complex typedef
830  * @return Returns the cuda data type corresponding to HYPRE_Complex
831 *
832  * @todo Should be known at compile time
833 * @todo Support more sizes
834 * @todo Support complex
835 * @warning Only works for Single and Double precision
836 * @note Perhaps some typedefs should be added where HYPRE_Complex is typedef'd
837 */
838 cudaDataType
839 hypre_HYPREComplexToCudaDataType()
840 {
841 /*
842 if (sizeof(char)*CHAR_BIT != 8)
843 {
844 hypre_error_w_msg(HYPRE_ERROR_GENERIC, "ERROR: Unsupported char size");
845 hypre_assert(false);
846 }
847 */
848 #if defined(HYPRE_COMPLEX)
849 return CUDA_C_64F;
850 #else
851 #if defined(HYPRE_SINGLE)
852 hypre_assert(sizeof(HYPRE_Complex) == 4);
853 return CUDA_R_32F;
854 #elif defined(HYPRE_LONG_DOUBLE)
855 #error "Long Double is not supported on GPUs"
856 #else
857 hypre_assert(sizeof(HYPRE_Complex) == 8);
858 return CUDA_R_64F;
859 #endif
860 #endif // #if defined(HYPRE_COMPLEX)
861 }
862
863 /*
864 * @brief Determines the associated cusparseIndexType_t for the HYPRE_Int typedef
865 */
866 cusparseIndexType_t
867 hypre_HYPREIntToCusparseIndexType()
868 {
869 /*
870 if(sizeof(char)*CHAR_BIT!=8)
871 {
872 hypre_error_w_msg(HYPRE_ERROR_GENERIC, "ERROR: Unsupported char size");
873 hypre_assert(false);
874 }
875 */
876
877 #if defined(HYPRE_BIGINT)
878 hypre_assert(sizeof(HYPRE_Int) == 8);
879 return CUSPARSE_INDEX_64I;
880 #else
881 hypre_assert(sizeof(HYPRE_Int) == 4);
882 return CUSPARSE_INDEX_32I;
883 #endif
884 }
885 #endif // #if defined(HYPRE_USING_CUSPARSE)
886
887 #if defined(HYPRE_USING_GPU)
888
889 #if defined(HYPRE_USING_DEVICE_OPENMP)
890 cudaStream_t
891 #elif defined(HYPRE_USING_CUDA)
892 cudaStream_t
893 #elif defined(HYPRE_USING_HIP)
894 hipStream_t
895 #endif
896 hypre_CudaDataCudaStream(hypre_CudaData *data, HYPRE_Int i)
897 {
898 #if defined(HYPRE_USING_DEVICE_OPENMP)
899 cudaStream_t stream = 0;
900 #elif defined(HYPRE_USING_CUDA)
901 cudaStream_t stream = 0;
902 #elif defined(HYPRE_USING_HIP)
903 hipStream_t stream = 0;
904 #endif
905
906 #if defined(HYPRE_USING_CUDA_STREAMS)
907 if (i >= HYPRE_MAX_NUM_STREAMS)
908 {
909 /* return the default stream, i.e., the NULL stream */
910 /*
911 hypre_printf("CUDA stream %d exceeds the max number %d\n",
912 i, HYPRE_MAX_NUM_STREAMS);
913 */
914 return NULL;
915 }
916
917 if (data->cuda_streams[i])
918 {
919 return data->cuda_streams[i];
920 }
921
922 #if defined(HYPRE_USING_DEVICE_OPENMP)
923 HYPRE_CUDA_CALL(cudaStreamCreateWithFlags(&stream, cudaStreamDefault));
924 #elif defined(HYPRE_USING_CUDA)
925 //HYPRE_CUDA_CALL(cudaStreamCreateWithFlags(&stream,cudaStreamNonBlocking));
926 HYPRE_CUDA_CALL(cudaStreamCreateWithFlags(&stream, cudaStreamDefault));
927 #elif defined(HYPRE_USING_HIP)
928 HYPRE_HIP_CALL(hipStreamCreateWithFlags(&stream, hipStreamDefault));
929 #endif
930
931 data->cuda_streams[i] = stream;
932 #endif
933
934 return stream;
935 }
936
937 #if defined(HYPRE_USING_DEVICE_OPENMP)
938 cudaStream_t
939 #elif defined(HYPRE_USING_CUDA)
940 cudaStream_t
941 #elif defined(HYPRE_USING_HIP)
942 hipStream_t
943 #endif
944 hypre_CudaDataCudaComputeStream(hypre_CudaData *data)
945 {
946 return hypre_CudaDataCudaStream(data,
947 hypre_CudaDataCudaComputeStreamNum(data));
948 }
949
950 #if defined(HYPRE_USING_CURAND)
951 curandGenerator_t
952 hypre_CudaDataCurandGenerator(hypre_CudaData *data)
953 {
954 if (data->curand_generator)
955 {
956 return data->curand_generator;
957 }
958
959 curandGenerator_t gen;
960 HYPRE_CURAND_CALL( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) );
961 HYPRE_CURAND_CALL( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) );
962 HYPRE_CURAND_CALL( curandSetStream(gen, hypre_CudaDataCudaComputeStream(data)) );
963
964 data->curand_generator = gen;
965
966 return gen;
967 }
968
969 /* T = float or hypre_double */
970 template <typename T>
971 HYPRE_Int
972 hypre_CurandUniform_core( HYPRE_Int n,
973 T *urand,
974 HYPRE_Int set_seed,
975 hypre_ulonglongint seed,
976 HYPRE_Int set_offset,
977 hypre_ulonglongint offset)
978 {
979 curandGenerator_t gen = hypre_HandleCurandGenerator(hypre_handle());
980
981 if (set_seed)
982 {
983 HYPRE_CURAND_CALL( curandSetPseudoRandomGeneratorSeed(gen, seed) );
984 }
985
986 if (set_offset)
987 {
988 HYPRE_CURAND_CALL( curandSetGeneratorOffset(gen, offset) );
989 }
990
991 if (sizeof(T) == sizeof(hypre_double))
992 {
993 HYPRE_CURAND_CALL( curandGenerateUniformDouble(gen, (hypre_double *) urand, n) );
994 }
995 else if (sizeof(T) == sizeof(float))
996 {
997 HYPRE_CURAND_CALL( curandGenerateUniform(gen, (float *) urand, n) );
998 }
999
1000 return hypre_error_flag;
1001 }
1002 #endif /* #if defined(HYPRE_USING_CURAND) */
1003
1004 #if defined(HYPRE_USING_ROCRAND)
1005 rocrand_generator
1006 hypre_CudaDataCurandGenerator(hypre_CudaData *data)
1007 {
1008 if (data->curand_generator)
1009 {
1010 return data->curand_generator;
1011 }
1012
1013 rocrand_generator gen;
1014 HYPRE_ROCRAND_CALL( rocrand_create_generator(&gen, ROCRAND_RNG_PSEUDO_DEFAULT) );
1015 HYPRE_ROCRAND_CALL( rocrand_set_seed(gen, 1234ULL) );
1016 HYPRE_ROCRAND_CALL( rocrand_set_stream(gen, hypre_CudaDataCudaComputeStream(data)) );
1017
1018 data->curand_generator = gen;
1019
1020 return gen;
1021 }
1022
1023 template <typename T>
1024 HYPRE_Int
1025 hypre_CurandUniform_core( HYPRE_Int n,
1026 T *urand,
1027 HYPRE_Int set_seed,
1028 hypre_ulonglongint seed,
1029 HYPRE_Int set_offset,
1030 hypre_ulonglongint offset)
1031 {
1032 hypre_GpuProfilingPushRange("hypre_CurandUniform_core");
1033
1034 rocrand_generator gen = hypre_HandleCurandGenerator(hypre_handle());
1035
1036 if (set_seed)
1037 {
1038 HYPRE_ROCRAND_CALL( rocrand_set_seed(gen, seed) );
1039 }
1040
1041 if (set_offset)
1042 {
1043 HYPRE_ROCRAND_CALL( rocrand_set_offset(gen, offset) );
1044 }
1045
1046 if (sizeof(T) == sizeof(hypre_double))
1047 {
1048 HYPRE_ROCRAND_CALL( rocrand_generate_uniform_double(gen, (hypre_double *) urand, n) );
1049 }
1050 else if (sizeof(T) == sizeof(float))
1051 {
1052 HYPRE_ROCRAND_CALL( rocrand_generate_uniform(gen, (float *) urand, n) );
1053 }
1054
1055 hypre_GpuProfilingPopRange();
1056
1057 return hypre_error_flag;
1058 }
1059 #endif /* #if defined(HYPRE_USING_ROCRAND) */
1060
1061 #if defined(HYPRE_USING_CURAND) || defined(HYPRE_USING_ROCRAND)
1062
1063 HYPRE_Int
1064 hypre_CurandUniform( HYPRE_Int n,
1065 HYPRE_Real *urand,
1066 HYPRE_Int set_seed,
1067 hypre_ulonglongint seed,
1068 HYPRE_Int set_offset,
1069 hypre_ulonglongint offset)
1070 {
1071 return hypre_CurandUniform_core(n, urand, set_seed, seed, set_offset, offset);
1072 }
1073
1074 HYPRE_Int
1075 hypre_CurandUniformSingle( HYPRE_Int n,
1076 float *urand,
1077 HYPRE_Int set_seed,
1078 hypre_ulonglongint seed,
1079 HYPRE_Int set_offset,
1080 hypre_ulonglongint offset)
1081 {
1082 return hypre_CurandUniform_core(n, urand, set_seed, seed, set_offset, offset);
1083 }
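/* Usage sketch (illustrative only; d_urand is an assumed name): fill a device
 * array of length n with uniform random numbers, keeping the generator's current
 * seed and offset:
 *
 *    hypre_CurandUniform(n, d_urand, 0, 0, 0, 0);
 */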
1084
1085 #endif /* #if defined(HYPRE_USING_CURAND) || defined(HYPRE_USING_ROCRAND) */
1086
1087 #if defined(HYPRE_USING_CUBLAS)
1088 cublasHandle_t
1089 hypre_CudaDataCublasHandle(hypre_CudaData *data)
1090 {
1091 if (data->cublas_handle)
1092 {
1093 return data->cublas_handle;
1094 }
1095
1096 cublasHandle_t handle;
1097 HYPRE_CUBLAS_CALL( cublasCreate(&handle) );
1098
1099 HYPRE_CUBLAS_CALL( cublasSetStream(handle, hypre_CudaDataCudaComputeStream(data)) );
1100
1101 data->cublas_handle = handle;
1102
1103 return handle;
1104 }
1105 #endif
1106
1107 #if defined(HYPRE_USING_CUSPARSE)
1108 cusparseHandle_t
1109 hypre_CudaDataCusparseHandle(hypre_CudaData *data)
1110 {
1111 if (data->cusparse_handle)
1112 {
1113 return data->cusparse_handle;
1114 }
1115
1116 cusparseHandle_t handle;
1117 HYPRE_CUSPARSE_CALL( cusparseCreate(&handle) );
1118
1119 HYPRE_CUSPARSE_CALL( cusparseSetStream(handle, hypre_CudaDataCudaComputeStream(data)) );
1120
1121 data->cusparse_handle = handle;
1122
1123 return handle;
1124 }
1125 #endif // defined(HYPRE_USING_CUSPARSE)
1126
1127
1128 #if defined(HYPRE_USING_ROCSPARSE)
1129 rocsparse_handle
1130 hypre_CudaDataCusparseHandle(hypre_CudaData *data)
1131 {
1132 if (data->cusparse_handle)
1133 {
1134 return data->cusparse_handle;
1135 }
1136
1137 rocsparse_handle handle;
1138 HYPRE_ROCSPARSE_CALL( rocsparse_create_handle(&handle) );
1139
1140 HYPRE_ROCSPARSE_CALL( rocsparse_set_stream(handle, hypre_CudaDataCudaComputeStream(data)) );
1141
1142 data->cusparse_handle = handle;
1143
1144 return handle;
1145 }
1146 #endif // defined(HYPRE_USING_ROCSPARSE)
1147
1148
1149
1150 hypre_CudaData*
1151 hypre_CudaDataCreate()
1152 {
1153 hypre_CudaData *data = hypre_CTAlloc(hypre_CudaData, 1, HYPRE_MEMORY_HOST);
1154
1155 hypre_CudaDataCudaDevice(data) = 0;
1156 hypre_CudaDataCudaComputeStreamNum(data) = 0;
1157
1158 /* SpGeMM */
1159 #if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE)
1160 hypre_CudaDataSpgemmUseCusparse(data) = 1;
1161 #else
1162 hypre_CudaDataSpgemmUseCusparse(data) = 0;
1163 #endif
1164
1165 hypre_CudaDataSpgemmAlgorithm(data) = 1;
1166 /* 1: naive overestimate, 2: naive underestimate, 3: Cohen's algorithm */
1167 hypre_CudaDataSpgemmRownnzEstimateMethod(data) = 3;
1168 hypre_CudaDataSpgemmRownnzEstimateNsamples(data) = 32;
1169 hypre_CudaDataSpgemmRownnzEstimateMultFactor(data) = 1.5;
1170 hypre_CudaDataSpgemmHashType(data) = 'D';
1171
1172 /* pmis */
1173 #if defined(HYPRE_USING_CURAND) || defined(HYPRE_USING_ROCRAND)
1174 hypre_CudaDataUseGpuRand(data) = 1;
1175 #else
1176 hypre_CudaDataUseGpuRand(data) = 0;
1177 #endif
1178
1179 /* device pool */
1180 #ifdef HYPRE_USING_DEVICE_POOL
1181 hypre_CudaDataCubBinGrowth(data) = 8u;
1182 hypre_CudaDataCubMinBin(data) = 1u;
1183 hypre_CudaDataCubMaxBin(data) = (hypre_uint) -1;
1184 hypre_CudaDataCubMaxCachedBytes(data) = (size_t) -1;
1185 hypre_CudaDataCubDevAllocator(data) = NULL;
1186 hypre_CudaDataCubUvmAllocator(data) = NULL;
1187 #endif
1188
1189 return data;
1190 }
1191
1192 void
1193 hypre_CudaDataDestroy(hypre_CudaData *data)
1194 {
1195 if (!data)
1196 {
1197 return;
1198 }
1199
1200 hypre_TFree(hypre_CudaDataCudaReduceBuffer(data), HYPRE_MEMORY_DEVICE);
1201 hypre_TFree(hypre_CudaDataStructCommRecvBuffer(data), HYPRE_MEMORY_DEVICE);
1202 hypre_TFree(hypre_CudaDataStructCommSendBuffer(data), HYPRE_MEMORY_DEVICE);
1203
1204 #if defined(HYPRE_USING_CURAND)
1205 if (data->curand_generator)
1206 {
1207 HYPRE_CURAND_CALL( curandDestroyGenerator(data->curand_generator) );
1208 }
1209 #endif
1210
1211 #if defined(HYPRE_USING_ROCRAND)
1212 if (data->curand_generator)
1213 {
1214 HYPRE_ROCRAND_CALL( rocrand_destroy_generator(data->curand_generator) );
1215 }
1216 #endif
1217
1218 #if defined(HYPRE_USING_CUBLAS)
1219 if (data->cublas_handle)
1220 {
1221 HYPRE_CUBLAS_CALL( cublasDestroy(data->cublas_handle) );
1222 }
1223 #endif
1224
1225 #if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE)
1226 if (data->cusparse_handle)
1227 {
1228 #if defined(HYPRE_USING_CUSPARSE)
1229 HYPRE_CUSPARSE_CALL( cusparseDestroy(data->cusparse_handle) );
1230 #elif defined(HYPRE_USING_ROCSPARSE)
1231 HYPRE_ROCSPARSE_CALL( rocsparse_destroy_handle(data->cusparse_handle) );
1232 #endif
1233 }
1234 #endif // #if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE)
1235
1236 for (HYPRE_Int i = 0; i < HYPRE_MAX_NUM_STREAMS; i++)
1237 {
1238 if (data->cuda_streams[i])
1239 {
1240 #if defined(HYPRE_USING_DEVICE_OPENMP)
1241 HYPRE_CUDA_CALL( cudaStreamDestroy(data->cuda_streams[i]) );
1242 #elif defined(HYPRE_USING_CUDA)
1243 HYPRE_CUDA_CALL( cudaStreamDestroy(data->cuda_streams[i]) );
1244 #elif defined(HYPRE_USING_HIP)
1245 HYPRE_HIP_CALL( hipStreamDestroy(data->cuda_streams[i]) );
1246 #endif
1247 }
1248 }
1249
1250 #ifdef HYPRE_USING_DEVICE_POOL
1251 hypre_CudaDataCubCachingAllocatorDestroy(data);
1252 #endif
1253
1254 hypre_TFree(data, HYPRE_MEMORY_HOST);
1255 }
1256
1257 HYPRE_Int
1258 hypre_SyncCudaDevice(hypre_Handle *hypre_handle)
1259 {
1260 #if defined(HYPRE_USING_DEVICE_OPENMP)
1261 HYPRE_CUDA_CALL( cudaDeviceSynchronize() );
1262 #elif defined(HYPRE_USING_CUDA)
1263 HYPRE_CUDA_CALL( cudaDeviceSynchronize() );
1264 #elif defined(HYPRE_USING_HIP)
1265 HYPRE_HIP_CALL( hipDeviceSynchronize() );
1266 #endif
1267 return hypre_error_flag;
1268 }
1269
1270 HYPRE_Int
1271 hypre_ResetCudaDevice(hypre_Handle *hypre_handle)
1272 {
1273 #if defined(HYPRE_USING_CUDA)
1274 cudaDeviceReset();
1275 #elif defined(HYPRE_USING_HIP)
1276 hipDeviceReset();
1277 #endif
1278 return hypre_error_flag;
1279 }
1280
1281 /* synchronize the Hypre compute stream
1282 * action: 0: set sync stream to false
1283 * 1: set sync stream to true
1284 * 2: restore sync stream to default
1285 * 3: return the current value of cuda_compute_stream_sync
1286 * 4: sync stream based on cuda_compute_stream_sync
1287 */
1288 HYPRE_Int
1289 hypre_SyncCudaComputeStream_core(HYPRE_Int action,
1290 hypre_Handle *hypre_handle,
1291 HYPRE_Int *cuda_compute_stream_sync_ptr)
1292 {
1293 /* with UVM the default is to sync at kernel completion, since the host is also able to
1294 * touch GPU memory */
1295 #if defined(HYPRE_USING_UNIFIED_MEMORY)
1296 static const HYPRE_Int cuda_compute_stream_sync_default = 1;
1297 #else
1298 static const HYPRE_Int cuda_compute_stream_sync_default = 0;
1299 #endif
1300
1301 /* this controls whether to synchronize the stream after computations */
1302 static HYPRE_Int cuda_compute_stream_sync = cuda_compute_stream_sync_default;
1303
1304 switch (action)
1305 {
1306 case 0:
1307 cuda_compute_stream_sync = 0;
1308 break;
1309 case 1:
1310 cuda_compute_stream_sync = 1;
1311 break;
1312 case 2:
1313 cuda_compute_stream_sync = cuda_compute_stream_sync_default;
1314 break;
1315 case 3:
1316 *cuda_compute_stream_sync_ptr = cuda_compute_stream_sync;
1317 break;
1318 case 4:
1319 #if defined(HYPRE_USING_DEVICE_OPENMP)
1320 HYPRE_CUDA_CALL( cudaDeviceSynchronize() );
1321 #else
1322 if (cuda_compute_stream_sync)
1323 {
1324 #if defined(HYPRE_USING_CUDA)
1325 HYPRE_CUDA_CALL( cudaStreamSynchronize(hypre_HandleCudaComputeStream(hypre_handle)) );
1326 #elif defined(HYPRE_USING_HIP)
1327 HYPRE_HIP_CALL( hipStreamSynchronize(hypre_HandleCudaComputeStream(hypre_handle)) );
1328 #endif
1329 }
1330 #endif
1331 break;
1332 default:
1333 hypre_printf("hypre_SyncCudaComputeStream_core invalid action\n");
1334 hypre_error_in_arg(1);
1335 }
1336
1337 return hypre_error_flag;
1338 }
1339
1340 HYPRE_Int
1341 hypre_SetSyncCudaCompute(HYPRE_Int action)
1342 {
1343 /* convert to 1/0 */
1344 action = action != 0;
1345 hypre_SyncCudaComputeStream_core(action, NULL, NULL);
1346
1347 return hypre_error_flag;
1348 }
1349
1350 HYPRE_Int
1351 hypre_RestoreSyncCudaCompute()
1352 {
1353 hypre_SyncCudaComputeStream_core(2, NULL, NULL);
1354
1355 return hypre_error_flag;
1356 }
1357
1358 HYPRE_Int
1359 hypre_GetSyncCudaCompute(HYPRE_Int *cuda_compute_stream_sync_ptr)
1360 {
1361 hypre_SyncCudaComputeStream_core(3, NULL, cuda_compute_stream_sync_ptr);
1362
1363 return hypre_error_flag;
1364 }
1365
1366 HYPRE_Int
1367 hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle)
1368 {
1369 hypre_SyncCudaComputeStream_core(4, hypre_handle, NULL);
1370
1371 return hypre_error_flag;
1372 }
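/* Usage sketch (illustrative only): temporarily force stream synchronization
 * around a device operation and then restore the previous setting:
 *
 *    HYPRE_Int sync_saved;
 *    hypre_GetSyncCudaCompute(&sync_saved);
 *    hypre_SetSyncCudaCompute(1);
 *    // ... device work issued through hypre ...
 *    hypre_SyncCudaComputeStream(hypre_handle());
 *    hypre_SetSyncCudaCompute(sync_saved);
 */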
1373
1374 #endif // #if defined(HYPRE_USING_GPU)
1375
1376
1377 /* This function is supposed to be used in the test drivers to mimic
1378 * users' GPU binding approaches
1379 * It is supposed to be called before HYPRE_Init,
1380  * so that HYPRE_Init can pick up the desired device id
1381 */
1382 HYPRE_Int
1383 hypre_bind_device( HYPRE_Int myid,
1384 HYPRE_Int nproc,
1385 MPI_Comm comm )
1386 {
1387 #ifdef HYPRE_USING_GPU
1388 /* proc id (rank) on the running node */
1389 HYPRE_Int myNodeid;
1390 /* num of procs (size) on the node */
1391 HYPRE_Int NodeSize;
1392 /* num of devices seen */
1393 hypre_int nDevices;
1394 /* device id to bind to */
1395 hypre_int device_id;
1396
1397 hypre_MPI_Comm node_comm;
1398 hypre_MPI_Comm_split_type( comm, hypre_MPI_COMM_TYPE_SHARED,
1399 myid, hypre_MPI_INFO_NULL, &node_comm );
1400 hypre_MPI_Comm_rank(node_comm, &myNodeid);
1401 hypre_MPI_Comm_size(node_comm, &NodeSize);
1402 hypre_MPI_Comm_free(&node_comm);
1403
1404 /* get number of devices on this node */
1405 hypre_GetDeviceCount(&nDevices);
1406
1407 /* set device */
1408 device_id = myNodeid % nDevices;
1409 hypre_SetDevice(device_id, NULL);
1410
1411 #if defined(HYPRE_DEBUG) && defined(HYPRE_PRINT_ERRORS)
1412 hypre_printf("Proc [global %d/%d, local %d/%d] can see %d GPUs and is running on %d\n",
1413 myid, nproc, myNodeid, NodeSize, nDevices, device_id);
1414 #endif
1415
1416 #endif /* #ifdef HYPRE_USING_GPU */
1417
1418 return hypre_error_flag;
1419 }
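/* Usage sketch (illustrative only): a test driver binding one GPU per rank
 * before initializing hypre:
 *
 *    hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid);
 *    hypre_MPI_Comm_size(hypre_MPI_COMM_WORLD, &num_procs);
 *    hypre_bind_device(myid, num_procs, hypre_MPI_COMM_WORLD);
 *    HYPRE_Init();
 */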
1420
1421