1 /*
2 * -----------------------------------------------------------------
3 * Programmer(s): Cody J. Balos @ LLNL
4 * -----------------------------------------------------------------
5 * SUNDIALS Copyright Start
6 * Copyright (c) 2002-2021, Lawrence Livermore National Security
7 * and Southern Methodist University.
8 * All rights reserved.
9 *
10 * See the top-level LICENSE and NOTICE files for details.
11 *
12 * SPDX-License-Identifier: BSD-3-Clause
13 * SUNDIALS Copyright End
14 * -----------------------------------------------------------------
15 * This is the header file is for the cuSPARSE implementation of the
16 * SUNMATRIX module.
17 * -----------------------------------------------------------------
18 */
19
20 #include <stdio.h>
21 #include <stdlib.h>
22
23 #include <sunmemory/sunmemory_cuda.h>
24 #include <sunmatrix/sunmatrix_cusparse.h>
25
26 #include "sundials_cuda.h"
27 #include "sundials_debug.h"
28 #include "cusparse_kernels.cuh"
29
30
31 /* Use the namespace for the kernels */
32 using namespace sundials::sunmatrix_cusparse;
33
34 /* Constants */
35 #define ZERO RCONST(0.0)
36 #define ONE RCONST(1.0)
37
38 /* Private function prototypes */
39 static booleantype SMCompatible_cuSparse(SUNMatrix, SUNMatrix);
40 static SUNMatrix SUNMatrix_cuSparse_NewEmpty();
41 #if CUDART_VERSION >= 11000
42 static cusparseStatus_t CreateSpMatDescr(SUNMatrix, cusparseSpMatDescr_t*);
43 #endif
44
45 /* Macros for handling the different function names based on precision */
46 #if defined(SUNDIALS_DOUBLE_PRECISION)
47 #define cusparseXcsrmv cusparseDcsrmv
48 #define CUDA_R_XF CUDA_R_64F
49 #elif defined(SUNDIALS_SINGLE_PRECISION)
50 #define cusparseXcsrmv cusparseScsrmv
51 #define CUDA_R_XF CUDA_R_32F
52 #endif
53
54 /* Content accessor macros */
55 #define SMCU_CONTENT(A) ( (SUNMatrix_Content_cuSparse)(A->content) )
56 #define SMCU_ROWS(A) ( SMCU_CONTENT(A)->M )
57 #define SMCU_COLUMNS(A) ( SMCU_CONTENT(A)->N )
58 #define SMCU_NNZ(A) ( SMCU_CONTENT(A)->NNZ )
59 #define SMCU_NBLOCKS(A) ( SMCU_CONTENT(A)->nblocks )
60 #define SMCU_BLOCKROWS(A) ( SMCU_CONTENT(A)->blockrows )
61 #define SMCU_BLOCKCOLS(A) ( SMCU_CONTENT(A)->blockcols )
62 #define SMCU_BLOCKNNZ(A) ( SMCU_CONTENT(A)->blocknnz )
63 #define SMCU_NP(A) ( SMCU_CONTENT(A)->NP )
64 #define SMCU_SPARSETYPE(A) ( SMCU_CONTENT(A)->sparse_type )
65 #define SMCU_OWNMATD(A) ( SMCU_CONTENT(A)->own_matd )
66 #define SMCU_OWNEXEC(A) ( SMCU_CONTENT(A)->own_exec )
67 #define SMCU_DATA(A) ( SMCU_CONTENT(A)->data )
68 #define SMCU_DATAp(A) ( (realtype*)SMCU_CONTENT(A)->data->ptr )
69 #define SMCU_INDEXVALS(A) ( SMCU_CONTENT(A)->colind )
70 #define SMCU_INDEXPTRS(A) ( SMCU_CONTENT(A)->rowptrs )
71 #define SMCU_INDEXVALSp(A) ( (int*) SMCU_CONTENT(A)->colind->ptr )
72 #define SMCU_INDEXPTRSp(A) ( (int*) SMCU_CONTENT(A)->rowptrs->ptr )
73 #define SMCU_MEMHELP(A) ( SMCU_CONTENT(A)->mem_helper )
74 #define SMCU_MATDESCR(A) ( SMCU_CONTENT(A)->mat_descr )
75 #define SMCU_CUSPHANDLE(A) ( SMCU_CONTENT(A)->cusp_handle )
76 #define SMCU_FIXEDPATTERN(A)( SMCU_CONTENT(A)->fixed_pattern )
77 #define SMCU_EXECPOLICY(A) ( SMCU_CONTENT(A)->exec_policy )
78
79
80 /* ------------------------------------------------------------------
81 * Default execution policy definition.
82 *
83 * This policy tries to help us leverage the structure of the matrix.
84 * It will choose block sizes which are a multiple of the warp size,
85 * and it will choose a grid size to such that all work elements are
86 * covered.
87 * ------------------------------------------------------------------ */
88
89 class SUNCuSparseMatrixExecPolicy : public SUNCudaExecPolicy
90 {
91 public:
SUNCuSparseMatrixExecPolicy(const cudaStream_t stream=0)92 SUNCuSparseMatrixExecPolicy(const cudaStream_t stream = 0)
93 : stream_(stream)
94 {}
95
SUNCuSparseMatrixExecPolicy(const SUNCuSparseMatrixExecPolicy & ex)96 SUNCuSparseMatrixExecPolicy(const SUNCuSparseMatrixExecPolicy& ex)
97 : stream_(ex.stream_)
98 {}
99
gridSize(size_t numWorkElements,size_t blockDim=0) const100 virtual size_t gridSize(size_t numWorkElements, size_t blockDim = 0) const
101 {
102 return(numWorkElements + blockDim - 1)/blockDim;
103 }
104
blockSize(size_t numWorkElements=0,size_t gridDim=0) const105 virtual size_t blockSize(size_t numWorkElements = 0, size_t gridDim = 0) const
106 {
107 return(max_block_size(CUDA_WARP_SIZE*(numWorkElements + CUDA_WARP_SIZE - 1)/CUDA_WARP_SIZE));
108 }
109
stream() const110 virtual const cudaStream_t* stream() const
111 {
112 return(&stream_);
113 }
114
clone() const115 virtual CudaExecPolicy* clone() const
116 {
117 return(static_cast<CudaExecPolicy*>(new SUNCuSparseMatrixExecPolicy(*this)));
118 }
119
max_block_size(int val)120 static size_t max_block_size(int val)
121 {
122 return((val > MAX_CUDA_BLOCKSIZE) ? MAX_CUDA_BLOCKSIZE : val );
123 }
124
125 private:
126 const cudaStream_t stream_;
127 };
128
129 /* ------------------------------------------------------------------
130 * Constructors.
131 * ------------------------------------------------------------------ */
132
SUNMatrix_cuSparse_NewCSR(int M,int N,int NNZ,cusparseHandle_t cusp)133 SUNMatrix SUNMatrix_cuSparse_NewCSR(int M, int N, int NNZ, cusparseHandle_t cusp)
134 {
135 SUNMemory d_colind, d_rowptr, d_values;
136 int alloc_fail = 0;
137
138 /* return with NULL matrix on illegal input */
139 if ( (M <= 0) || (N <= 0) || (NNZ < 0) )
140 {
141 SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_NewCSR_cuSparse: illegal value(s) for M, N, or NNZ\n");
142 return(NULL);
143 }
144
145 SUNMatrix A = SUNMatrix_cuSparse_NewEmpty();
146 if (A == NULL)
147 {
148 SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_NewCSR_cuSparse: SUNMatrix_cuSparse_NewEmpty returned NULL\n");
149 return(NULL);
150 }
151
152 SMCU_MEMHELP(A) = SUNMemoryHelper_Cuda();
153 if (SMCU_MEMHELP(A) == NULL)
154 {
155 SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_NewCSR_cuSparse: SUNMemoryHelper_Cuda returned NULL\n");
156 SUNMatDestroy(A);
157 return(NULL);
158 }
159
160 /* Allocate device memory for the matrix */
161 alloc_fail += SUNMemoryHelper_Alloc(SMCU_MEMHELP(A), &d_colind,
162 sizeof(int)*NNZ, SUNMEMTYPE_DEVICE);
163 alloc_fail += SUNMemoryHelper_Alloc(SMCU_MEMHELP(A), &d_rowptr,
164 sizeof(int)*(M+1), SUNMEMTYPE_DEVICE);
165 alloc_fail += SUNMemoryHelper_Alloc(SMCU_MEMHELP(A), &d_values,
166 sizeof(realtype)*NNZ, SUNMEMTYPE_DEVICE);
167 if (alloc_fail)
168 {
169 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_colind);
170 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_rowptr);
171 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_values);
172 SUNMatDestroy(A);
173 return(NULL);
174 }
175
176 /* Choose sensible defaults */
177 cusparseStatus_t cusparse_status = CUSPARSE_STATUS_SUCCESS;
178 cusparseMatDescr_t mat_descr;
179 cusparse_status = cusparseCreateMatDescr(&mat_descr);
180 if (!SUNDIALS_CUSPARSE_VERIFY(cusparse_status))
181 {
182 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_colind);
183 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_rowptr);
184 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_values);
185 SUNMatDestroy(A);
186 return(NULL);
187 }
188
189 cusparse_status = cusparseSetMatType(mat_descr, CUSPARSE_MATRIX_TYPE_GENERAL);
190 if (!SUNDIALS_CUSPARSE_VERIFY(cusparse_status))
191 {
192 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_colind);
193 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_rowptr);
194 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_values);
195 cusparseDestroyMatDescr(mat_descr);
196 SUNMatDestroy(A);
197 return(NULL);
198 }
199
200 cusparse_status = cusparseSetMatIndexBase(mat_descr, CUSPARSE_INDEX_BASE_ZERO);
201 if (!SUNDIALS_CUSPARSE_VERIFY(cusparse_status))
202 {
203 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_colind);
204 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_rowptr);
205 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_values);
206 cusparseDestroyMatDescr(mat_descr);
207 SUNMatDestroy(A);
208 return(NULL);
209 }
210
211 cudaStream_t stream;
212 if (!SUNDIALS_CUSPARSE_VERIFY(cusparseGetStream(cusp, &stream)))
213 {
214 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_colind);
215 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_rowptr);
216 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_values);
217 cusparseDestroyMatDescr(mat_descr);
218 SUNMatDestroy(A);
219 return(NULL);
220 }
221
222 /* Fill the content */
223 SMCU_CONTENT(A)->M = M;
224 SMCU_CONTENT(A)->N = N;
225 SMCU_CONTENT(A)->NNZ = NNZ;
226 SMCU_CONTENT(A)->nblocks = 1;
227 SMCU_CONTENT(A)->blockrows = M;
228 SMCU_CONTENT(A)->blockcols = N;
229 SMCU_CONTENT(A)->blocknnz = NNZ;
230 SMCU_CONTENT(A)->own_matd = SUNTRUE;
231 SMCU_CONTENT(A)->own_exec = SUNTRUE;
232 SMCU_CONTENT(A)->matvec_issetup = SUNFALSE;
233 SMCU_CONTENT(A)->fixed_pattern = SUNFALSE;
234 SMCU_CONTENT(A)->sparse_type = SUNMAT_CUSPARSE_CSR;
235 SMCU_CONTENT(A)->colind = d_colind;
236 SMCU_CONTENT(A)->rowptrs = d_rowptr;
237 SMCU_CONTENT(A)->data = d_values;
238 SMCU_CONTENT(A)->mat_descr = mat_descr;
239 SMCU_CONTENT(A)->cusp_handle = cusp;
240 SMCU_CONTENT(A)->exec_policy = new SUNCuSparseMatrixExecPolicy(stream);
241
242 #if CUDART_VERSION >= 11000
243 cusparseSpMatDescr_t spmat_descr;
244 if (!SUNDIALS_CUSPARSE_VERIFY(CreateSpMatDescr(A, &spmat_descr)))
245 {
246 SUNMatDestroy(A);
247 return(NULL);
248 }
249 SMCU_CONTENT(A)->spmat_descr = spmat_descr;
250 SMCU_CONTENT(A)->dBufferMem = NULL;
251 SMCU_CONTENT(A)->bufferSize = 0;
252 SMCU_CONTENT(A)->vecX = NULL;
253 SMCU_CONTENT(A)->vecY = NULL;
254 #endif
255
256 return A;
257 }
258
259
SUNMatrix_cuSparse_MakeCSR(cusparseMatDescr_t mat_descr,int M,int N,int NNZ,int * rowptrs,int * colind,realtype * data,cusparseHandle_t cusp)260 SUNMatrix SUNMatrix_cuSparse_MakeCSR(cusparseMatDescr_t mat_descr, int M, int N, int NNZ,
261 int *rowptrs , int *colind , realtype *data,
262 cusparseHandle_t cusp)
263 {
264 /* return with NULL matrix on illegal input */
265 if ( (M <= 0) || (N <= 0) || (NNZ < 0) )
266 {
267 SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_MakeCSR_cuSparse: illegal value(s) for M, N, or NNZ\n");
268 return(NULL);
269 }
270
271 if ( (rowptrs == NULL) || (colind == NULL) || (data == NULL) )
272 {
273 SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_MakeCSR_cuSparse: rowptrs, colind, or data is NULL\n");
274 return(NULL);
275 }
276
277 if (cusparseGetMatIndexBase(mat_descr) != CUSPARSE_INDEX_BASE_ZERO)
278 {
279 SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_MakeCSR_cuSparse: the cusparseMatDescr_t must have index base CUSPARSE_INDEX_BASE_ZERO\n");
280 return(NULL);
281 }
282
283 SUNMatrix A = SUNMatrix_cuSparse_NewEmpty();
284 if (A == NULL)
285 {
286 SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_MakeCSR_cuSparse: SUNMatrix_cuSparse_NewEmpty returned NULL\n");
287 return(NULL);
288 }
289
290 SMCU_MEMHELP(A) = SUNMemoryHelper_Cuda();
291 if (SMCU_MEMHELP(A) == NULL)
292 {
293 SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_NewCSR_cuSparse: SUNMemoryHelper_Cuda returned NULL\n");
294 SUNMatDestroy(A);
295 return(NULL);
296 }
297
298 cudaStream_t stream;
299 if (!SUNDIALS_CUSPARSE_VERIFY(cusparseGetStream(cusp, &stream)))
300 {
301 SUNMatDestroy(A);
302 return(NULL);
303 }
304
305 /* Fill content */
306 SMCU_CONTENT(A)->M = M;
307 SMCU_CONTENT(A)->N = N;
308 SMCU_CONTENT(A)->NNZ = NNZ;
309 SMCU_CONTENT(A)->nblocks = 1;
310 SMCU_CONTENT(A)->blockrows = M;
311 SMCU_CONTENT(A)->blockcols = N;
312 SMCU_CONTENT(A)->blocknnz = NNZ;
313 SMCU_CONTENT(A)->own_matd = SUNFALSE;
314 SMCU_CONTENT(A)->own_exec = SUNTRUE;
315 SMCU_CONTENT(A)->matvec_issetup = SUNFALSE;
316 SMCU_CONTENT(A)->fixed_pattern = SUNFALSE;
317 SMCU_CONTENT(A)->sparse_type = SUNMAT_CUSPARSE_CSR;
318 SMCU_CONTENT(A)->colind = SUNMemoryHelper_Wrap(colind, SUNMEMTYPE_DEVICE);
319 SMCU_CONTENT(A)->rowptrs = SUNMemoryHelper_Wrap(rowptrs, SUNMEMTYPE_DEVICE);
320 SMCU_CONTENT(A)->data = SUNMemoryHelper_Wrap(data, SUNMEMTYPE_DEVICE);
321 SMCU_CONTENT(A)->mat_descr = mat_descr;
322 SMCU_CONTENT(A)->cusp_handle = cusp;
323
324 SMCU_CONTENT(A)->exec_policy = new SUNCuSparseMatrixExecPolicy(stream);
325
326 if (SMCU_CONTENT(A)->colind == NULL ||
327 SMCU_CONTENT(A)->rowptrs == NULL ||
328 SMCU_CONTENT(A)->data == NULL)
329 {
330 SUNMatDestroy(A);
331 return(NULL);
332 }
333
334 #if CUDART_VERSION >= 11000
335 cusparseSpMatDescr_t spmat_descr;
336 if (!SUNDIALS_CUSPARSE_VERIFY(CreateSpMatDescr(A, &spmat_descr)))
337 {
338 SUNMatDestroy(A);
339 return(NULL);
340 }
341 SMCU_CONTENT(A)->spmat_descr = spmat_descr;
342 SMCU_CONTENT(A)->dBufferMem = NULL;
343 SMCU_CONTENT(A)->bufferSize = 0;
344 SMCU_CONTENT(A)->vecX = NULL;
345 SMCU_CONTENT(A)->vecY = NULL;
346 #endif
347
348 return(A);
349 }
350
351
SUNMatrix_cuSparse_NewBlockCSR(int nblocks,int blockrows,int blockcols,int blocknnz,cusparseHandle_t cusp)352 SUNMatrix SUNMatrix_cuSparse_NewBlockCSR(int nblocks, int blockrows, int blockcols, int blocknnz, cusparseHandle_t cusp)
353 {
354 SUNMemory d_colind, d_rowptr, d_values;
355 int M, N, NNZ;
356 int alloc_fail = 0;
357
358 /* Return with NULL matrix on illegal input */
359 if (blockrows != blockcols)
360 {
361 SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_cuSparse_NewBlockCSR: matrix must be square for the BCSR format\n");
362 return(NULL);
363 }
364
365 M = nblocks * blockrows;
366 N = M;
367 NNZ = nblocks * blocknnz;
368
369 /* Return with NULL matrix on illegal input */
370 if ( (M <= 0) || (N <= 0) || (NNZ < 0) )
371 {
372 SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_cuSparse_NewBlockCSR: illegal value(s) for M, N, or NNZ\n");
373 return(NULL);
374 }
375
376 /* Allocate the SUNMatrix object */
377 SUNMatrix A = SUNMatrix_cuSparse_NewEmpty();
378 if (A == NULL)
379 {
380 SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_cuSparse_NewBlockCSR: SUNMatrix_cuSparse_NewEmpty returned NULL\n");
381 return(NULL);
382 }
383
384 SMCU_MEMHELP(A) = SUNMemoryHelper_Cuda();
385 if (SMCU_MEMHELP(A) == NULL)
386 {
387 SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_NewCSR_cuSparse: SUNMemoryHelper_Cuda returned NULL\n");
388 SUNMatDestroy(A);
389 return(NULL);
390 }
391
392 /* Allocate device memory for the matrix */
393 alloc_fail += SUNMemoryHelper_Alloc(SMCU_MEMHELP(A), &d_colind,
394 sizeof(int)*blocknnz, SUNMEMTYPE_DEVICE);
395 alloc_fail += SUNMemoryHelper_Alloc(SMCU_MEMHELP(A), &d_rowptr,
396 sizeof(int)*(blockrows + 1),
397 SUNMEMTYPE_DEVICE);
398 alloc_fail += SUNMemoryHelper_Alloc(SMCU_MEMHELP(A), &d_values,
399 sizeof(realtype)*blocknnz*nblocks,
400 SUNMEMTYPE_DEVICE);
401 if (alloc_fail)
402 {
403 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_colind);
404 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_rowptr);
405 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_values);
406 SUNMatDestroy(A);
407 return(NULL);
408 }
409
410 /* Choose sensible defaults */
411 cusparseStatus_t cusparse_status = CUSPARSE_STATUS_SUCCESS;
412 cusparseMatDescr_t mat_descr;
413 cusparse_status = cusparseCreateMatDescr(&mat_descr);
414 if (!SUNDIALS_CUSPARSE_VERIFY(cusparse_status))
415 {
416 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_colind);
417 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_rowptr);
418 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_values);
419 SUNMatDestroy(A);
420 return(NULL);
421 }
422
423 cusparse_status = cusparseSetMatType(mat_descr, CUSPARSE_MATRIX_TYPE_GENERAL);
424 if (!SUNDIALS_CUSPARSE_VERIFY(cusparse_status))
425 {
426 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_colind);
427 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_rowptr);
428 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_values);
429 cusparseDestroyMatDescr(mat_descr);
430 SUNMatDestroy(A);
431 return(NULL);
432 }
433
434 cusparse_status = cusparseSetMatIndexBase(mat_descr, CUSPARSE_INDEX_BASE_ZERO);
435 if (!SUNDIALS_CUSPARSE_VERIFY(cusparse_status))
436 {
437 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_colind);
438 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_rowptr);
439 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_values);
440 cusparseDestroyMatDescr(mat_descr);
441 SUNMatDestroy(A);
442 return(NULL);
443 }
444
445 cudaStream_t stream;
446 if (!SUNDIALS_CUSPARSE_VERIFY(cusparseGetStream(cusp, &stream)))
447 {
448 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_colind);
449 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_rowptr);
450 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_values);
451 cusparseDestroyMatDescr(mat_descr);
452 SUNMatDestroy(A);
453 return(NULL);
454 }
455
456 /* Fill the content */
457 SMCU_CONTENT(A)->M = M;
458 SMCU_CONTENT(A)->N = N;
459 SMCU_CONTENT(A)->NNZ = NNZ;
460 SMCU_CONTENT(A)->nblocks = nblocks;
461 SMCU_CONTENT(A)->blockrows = blockrows;
462 SMCU_CONTENT(A)->blockcols = blockrows;
463 SMCU_CONTENT(A)->blocknnz = blocknnz;
464 SMCU_CONTENT(A)->own_matd = SUNTRUE;
465 SMCU_CONTENT(A)->own_exec = SUNTRUE;
466 SMCU_CONTENT(A)->matvec_issetup = SUNFALSE;
467 SMCU_CONTENT(A)->cusp_handle = cusp;
468 SMCU_CONTENT(A)->fixed_pattern = SUNFALSE;
469 SMCU_CONTENT(A)->sparse_type = SUNMAT_CUSPARSE_BCSR;
470 SMCU_CONTENT(A)->colind = d_colind;
471 SMCU_CONTENT(A)->rowptrs = d_rowptr;
472 SMCU_CONTENT(A)->data = d_values;
473 SMCU_CONTENT(A)->mat_descr = mat_descr;
474 SMCU_CONTENT(A)->exec_policy = new SUNCuSparseMatrixExecPolicy(stream);
475
476 #if CUDART_VERSION >= 11000
477 cusparseSpMatDescr_t spmat_descr;
478 if (!SUNDIALS_CUSPARSE_VERIFY(CreateSpMatDescr(A, &spmat_descr)))
479 {
480 SUNMatDestroy(A);
481 return(NULL);
482 }
483 SMCU_CONTENT(A)->spmat_descr = spmat_descr;
484 SMCU_CONTENT(A)->dBufferMem = NULL;
485 SMCU_CONTENT(A)->bufferSize = 0;
486 SMCU_CONTENT(A)->vecX = NULL;
487 SMCU_CONTENT(A)->vecY = NULL;
488 #endif
489
490 return(A);
491 }
492
493 /* ------------------------------------------------------------------
494 * Implementation specific routines.
495 * ------------------------------------------------------------------ */
496
SUNMatrix_cuSparse_SparseType(SUNMatrix A)497 int SUNMatrix_cuSparse_SparseType(SUNMatrix A)
498 {
499 if (SUNMatGetID(A) == SUNMATRIX_CUSPARSE)
500 return(SMCU_SPARSETYPE(A));
501 else
502 return(SUNMAT_ILL_INPUT);
503 }
504
SUNMatrix_cuSparse_Rows(SUNMatrix A)505 int SUNMatrix_cuSparse_Rows(SUNMatrix A)
506 {
507 if (SUNMatGetID(A) == SUNMATRIX_CUSPARSE)
508 return(SMCU_ROWS(A));
509 else
510 return(SUNMAT_ILL_INPUT);
511 }
512
SUNMatrix_cuSparse_Columns(SUNMatrix A)513 int SUNMatrix_cuSparse_Columns(SUNMatrix A)
514 {
515 if (SUNMatGetID(A) == SUNMATRIX_CUSPARSE)
516 return(SMCU_COLUMNS(A));
517 else
518 return(SUNMAT_ILL_INPUT);
519 }
520
SUNMatrix_cuSparse_NNZ(SUNMatrix A)521 int SUNMatrix_cuSparse_NNZ(SUNMatrix A)
522 {
523 if (SUNMatGetID(A) == SUNMATRIX_CUSPARSE)
524 return(SMCU_NNZ(A));
525 else
526 return(SUNMAT_ILL_INPUT);
527 }
528
SUNMatrix_cuSparse_IndexPointers(SUNMatrix A)529 int* SUNMatrix_cuSparse_IndexPointers(SUNMatrix A)
530 {
531 if (SUNMatGetID(A) == SUNMATRIX_CUSPARSE)
532 return(SMCU_INDEXPTRSp(A));
533 else
534 return(NULL);
535 }
536
SUNMatrix_cuSparse_IndexValues(SUNMatrix A)537 int* SUNMatrix_cuSparse_IndexValues(SUNMatrix A)
538 {
539 if (SUNMatGetID(A) == SUNMATRIX_CUSPARSE)
540 return(SMCU_INDEXVALSp(A));
541 else
542 return(NULL);
543 }
544
SUNMatrix_cuSparse_Data(SUNMatrix A)545 realtype* SUNMatrix_cuSparse_Data(SUNMatrix A)
546 {
547 if (SUNMatGetID(A) == SUNMATRIX_CUSPARSE)
548 return(SMCU_DATAp(A));
549 else
550 return(NULL);
551 }
552
SUNMatrix_cuSparse_NumBlocks(SUNMatrix A)553 int SUNMatrix_cuSparse_NumBlocks(SUNMatrix A)
554 {
555 if (SUNMatGetID(A) == SUNMATRIX_CUSPARSE)
556 return(SMCU_NBLOCKS(A));
557 else
558 return(SUNMAT_ILL_INPUT);
559 }
560
SUNMatrix_cuSparse_BlockRows(SUNMatrix A)561 int SUNMatrix_cuSparse_BlockRows(SUNMatrix A)
562 {
563 if (SUNMatGetID(A) == SUNMATRIX_CUSPARSE)
564 return(SMCU_BLOCKROWS(A));
565 else
566 return(SUNMAT_ILL_INPUT);
567 }
568
SUNMatrix_cuSparse_BlockColumns(SUNMatrix A)569 int SUNMatrix_cuSparse_BlockColumns(SUNMatrix A)
570 {
571 if (SUNMatGetID(A) == SUNMATRIX_CUSPARSE)
572 return(SMCU_BLOCKCOLS(A));
573 else
574 return(SUNMAT_ILL_INPUT);
575 }
576
SUNMatrix_cuSparse_BlockNNZ(SUNMatrix A)577 int SUNMatrix_cuSparse_BlockNNZ(SUNMatrix A)
578 {
579 if (SUNMatGetID(A) == SUNMATRIX_CUSPARSE)
580 return(SMCU_BLOCKNNZ(A));
581 else
582 return(SUNMAT_ILL_INPUT);
583 }
584
SUNMatrix_cuSparse_BlockData(SUNMatrix A,int blockidx)585 realtype* SUNMatrix_cuSparse_BlockData(SUNMatrix A, int blockidx)
586 {
587 realtype *matdata;
588 int offset;
589
590 if (SUNMatGetID(A) != SUNMATRIX_CUSPARSE)
591 return(NULL);
592
593 if (blockidx >= SMCU_NBLOCKS(A))
594 return(NULL);
595
596 matdata = SMCU_DATAp(A);
597 offset = SMCU_BLOCKNNZ(A)*blockidx;
598
599 return(&matdata[offset]);
600 }
601
SUNMatrix_cuSparse_MatDescr(SUNMatrix A)602 cusparseMatDescr_t SUNMatrix_cuSparse_MatDescr(SUNMatrix A)
603 {
604 if (SUNMatGetID(A) == SUNMATRIX_CUSPARSE)
605 return(SMCU_MATDESCR(A));
606 else
607 return(NULL);
608 }
609
SUNMatrix_cuSparse_SetFixedPattern(SUNMatrix A,booleantype yesno)610 int SUNMatrix_cuSparse_SetFixedPattern(SUNMatrix A, booleantype yesno)
611 {
612 if (SUNMatGetID(A) != SUNMATRIX_CUSPARSE)
613 return(SUNMAT_ILL_INPUT);
614
615 SMCU_FIXEDPATTERN(A) = yesno;
616
617 return(SUNMAT_SUCCESS);
618 }
619
620
SUNMatrix_cuSparse_SetKernelExecPolicy(SUNMatrix A,SUNCudaExecPolicy * exec_policy)621 int SUNMatrix_cuSparse_SetKernelExecPolicy(SUNMatrix A, SUNCudaExecPolicy* exec_policy)
622 {
623 if (SUNMatGetID(A) != SUNMATRIX_CUSPARSE || exec_policy == NULL)
624 return(SUNMAT_ILL_INPUT);
625
626 if (SMCU_OWNEXEC(A)) delete SMCU_EXECPOLICY(A);
627 SMCU_EXECPOLICY(A) = exec_policy;
628
629 SMCU_OWNEXEC(A) = SUNFALSE;
630
631 return(SUNMAT_SUCCESS);
632 }
633
634
SUNMatrix_cuSparse_CopyToDevice(SUNMatrix dA,realtype * h_data,int * h_idxptrs,int * h_idxvals)635 int SUNMatrix_cuSparse_CopyToDevice(SUNMatrix dA, realtype* h_data,
636 int* h_idxptrs, int* h_idxvals)
637 {
638 int retval;
639 SUNMemory _h_data, _h_idxptrs, _h_idxvals;
640 const cudaStream_t* stream;
641 int nidxvals, nidxptrs;
642
643 if (SUNMatGetID(dA) != SUNMATRIX_CUSPARSE)
644 return(SUNMAT_ILL_INPUT);
645
646 stream = SMCU_EXECPOLICY(dA)->stream();
647
648 if (h_data != NULL)
649 {
650 _h_data = SUNMemoryHelper_Wrap(h_data, SUNMEMTYPE_HOST);
651 retval = SUNMemoryHelper_CopyAsync(SMCU_MEMHELP(dA),
652 SMCU_DATA(dA),
653 _h_data,
654 SMCU_NNZ(dA)*sizeof(realtype),
655 (void*) stream);
656 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(dA), _h_data);
657 if (retval != 0) return(SUNMAT_OPERATION_FAIL);
658 }
659
660 switch(SMCU_SPARSETYPE(dA))
661 {
662 case SUNMAT_CUSPARSE_CSR:
663 nidxptrs = SMCU_ROWS(dA)+1;
664 nidxvals = SMCU_NNZ(dA);
665 break;
666 case SUNMAT_CUSPARSE_BCSR:
667 nidxptrs = SMCU_BLOCKROWS(dA)+1;
668 nidxvals = SMCU_BLOCKNNZ(dA);
669 break;
670 default:
671 SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_cuSparse_CopyToDevice: unrecognized sparse type\n");
672 return(SUNMAT_ILL_INPUT);
673 }
674
675 if (h_idxptrs != NULL)
676 {
677 _h_idxptrs = SUNMemoryHelper_Wrap(h_idxptrs, SUNMEMTYPE_HOST);
678 retval = SUNMemoryHelper_CopyAsync(SMCU_MEMHELP(dA),
679 SMCU_INDEXPTRS(dA),
680 _h_idxptrs,
681 nidxptrs*sizeof(int),
682 (void*) stream);
683 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(dA), _h_idxptrs);
684 if (retval != 0) return(SUNMAT_OPERATION_FAIL);
685 }
686
687 if (h_idxvals != NULL)
688 {
689 _h_idxvals = SUNMemoryHelper_Wrap(h_idxvals, SUNMEMTYPE_HOST);
690 retval = SUNMemoryHelper_CopyAsync(SMCU_MEMHELP(dA),
691 SMCU_INDEXVALS(dA),
692 _h_idxvals,
693 nidxvals*sizeof(int),
694 (void*) stream);
695 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(dA), _h_idxvals);
696 if (retval != 0) return(SUNMAT_OPERATION_FAIL);
697 }
698
699 return(SUNMAT_SUCCESS);
700 }
701
702
SUNMatrix_cuSparse_CopyFromDevice(SUNMatrix dA,realtype * h_data,int * h_idxptrs,int * h_idxvals)703 int SUNMatrix_cuSparse_CopyFromDevice(SUNMatrix dA, realtype* h_data,
704 int* h_idxptrs, int* h_idxvals)
705 {
706 int retval;
707 SUNMemory _h_data, _h_idxptrs, _h_idxvals;
708 const cudaStream_t* stream;
709 int nidxvals, nidxptrs;
710
711 if (SUNMatGetID(dA) != SUNMATRIX_CUSPARSE)
712 return(SUNMAT_ILL_INPUT);
713
714 stream = SMCU_EXECPOLICY(dA)->stream();
715
716 if (h_data != NULL)
717 {
718 _h_data = SUNMemoryHelper_Wrap(h_data, SUNMEMTYPE_HOST);
719 retval = SUNMemoryHelper_CopyAsync(SMCU_MEMHELP(dA),
720 _h_data,
721 SMCU_DATA(dA),
722 SMCU_NNZ(dA)*sizeof(realtype),
723 (void*) stream);
724 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(dA), _h_data);
725 if (retval != 0) return(SUNMAT_OPERATION_FAIL);
726 }
727
728
729 switch(SMCU_SPARSETYPE(dA))
730 {
731 case SUNMAT_CUSPARSE_CSR:
732 nidxptrs = SMCU_ROWS(dA)+1;
733 nidxvals = SMCU_NNZ(dA);
734 case SUNMAT_CUSPARSE_BCSR:
735 nidxptrs = SMCU_BLOCKROWS(dA)+1;
736 nidxvals = SMCU_BLOCKNNZ(dA);
737 }
738
739 if (h_idxptrs != NULL)
740 {
741 _h_idxptrs = SUNMemoryHelper_Wrap(h_idxptrs, SUNMEMTYPE_HOST);
742 retval = SUNMemoryHelper_CopyAsync(SMCU_MEMHELP(dA),
743 _h_idxptrs,
744 SMCU_INDEXPTRS(dA),
745 nidxptrs*sizeof(int),
746 (void*) stream);
747 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(dA), _h_idxptrs);
748 if (retval != 0) return(SUNMAT_OPERATION_FAIL);
749 }
750
751 if (h_idxvals != NULL)
752 {
753 _h_idxvals = SUNMemoryHelper_Wrap(h_idxvals, SUNMEMTYPE_HOST);
754 retval = SUNMemoryHelper_CopyAsync(SMCU_MEMHELP(dA),
755 _h_idxvals,
756 SMCU_INDEXVALS(dA),
757 nidxvals*sizeof(int),
758 (void*) stream);
759 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(dA), _h_idxvals);
760 if (retval != 0) return(SUNMAT_OPERATION_FAIL);
761 }
762
763
764 return(SUNMAT_SUCCESS);
765 }
766
767 /*
768 * -----------------------------------------------------------------
769 * implementation of matrix operations
770 * -----------------------------------------------------------------
771 */
772
773
SUNMatGetID_cuSparse(SUNMatrix A)774 SUNMatrix_ID SUNMatGetID_cuSparse(SUNMatrix A)
775 {
776 return(SUNMATRIX_CUSPARSE);
777 }
778
779 /* Returns a new matrix allocated to have the same structure as A,
780 but it does not copy any nonzeros, column vals, or row pointers. */
SUNMatClone_cuSparse(SUNMatrix A)781 SUNMatrix SUNMatClone_cuSparse(SUNMatrix A)
782 {
783 SUNMatrix B;
784
785 switch (SMCU_SPARSETYPE(A))
786 {
787 case SUNMAT_CUSPARSE_CSR:
788 B = SUNMatrix_cuSparse_NewCSR(SMCU_ROWS(A), SMCU_COLUMNS(A), SMCU_NNZ(A),
789 SMCU_CUSPHANDLE(A));
790 break;
791 case SUNMAT_CUSPARSE_BCSR:
792 B = SUNMatrix_cuSparse_NewBlockCSR(SMCU_NBLOCKS(A), SMCU_BLOCKROWS(A), SMCU_BLOCKCOLS(A),
793 SMCU_BLOCKNNZ(A), SMCU_CUSPHANDLE(A));
794 break;
795 default:
796 SUNDIALS_DEBUG_PRINT("ERROR in SUNMatClone_cuSparse: sparse type not recognized\n");
797 B = NULL;
798 }
799
800 SMCU_FIXEDPATTERN(B) = SMCU_FIXEDPATTERN(A);
801 delete SMCU_EXECPOLICY(B);
802 SMCU_EXECPOLICY(B) = SMCU_EXECPOLICY(A)->clone();
803
804 return(B);
805 }
806
807
808 /* Deallocates the SUNMatrix object and all data it owns */
SUNMatDestroy_cuSparse(SUNMatrix A)809 void SUNMatDestroy_cuSparse(SUNMatrix A)
810 {
811 if (A == NULL) return;
812
813 /* free content */
814 if (A->content != NULL)
815 {
816 if (SMCU_MEMHELP(A))
817 {
818 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), SMCU_DATA(A));
819 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), SMCU_INDEXPTRS(A));
820 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), SMCU_INDEXVALS(A));
821 }
822 else
823 {
824 SUNDIALS_DEBUG_PRINT("WARNING in SUNMatDestroy_cuSparse: mem_helper was NULL when trying to dealloc data, this could result in a memory leak\n");
825 }
826
827 if (SMCU_OWNMATD(A))
828 {
829 /* free cusparseMatDescr_t */
830 SUNDIALS_CUSPARSE_VERIFY( cusparseDestroyMatDescr(SMCU_MATDESCR(A)) );
831 }
832
833 #if CUDART_VERSION >= 11000
834 SUNDIALS_CUSPARSE_VERIFY( cusparseDestroyDnVec(SMCU_CONTENT(A)->vecX) );
835 SUNDIALS_CUSPARSE_VERIFY( cusparseDestroyDnVec(SMCU_CONTENT(A)->vecY) );
836 SUNDIALS_CUSPARSE_VERIFY( cusparseDestroySpMat(SMCU_CONTENT(A)->spmat_descr) );
837 SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), SMCU_CONTENT(A)->dBufferMem);
838 #endif
839
840 if (SMCU_EXECPOLICY(A) && SMCU_OWNEXEC(A))
841 {
842 delete SMCU_EXECPOLICY(A);
843 SMCU_EXECPOLICY(A) = NULL;
844 }
845
846 SUNMemoryHelper_Destroy(SMCU_MEMHELP(A));
847
848 /* free content struct */
849 free(A->content);
850 A->content = NULL;
851 }
852
853 /* free ops and matrix */
854 if (A->ops) { free(A->ops); A->ops = NULL; }
855 free(A); A = NULL;
856
857 return;
858 }
859
860
861 /* Performs A_ij = 0 */
SUNMatZero_cuSparse(SUNMatrix A)862 int SUNMatZero_cuSparse(SUNMatrix A)
863 {
864 cudaError_t cuerr;
865 cudaStream_t stream;
866
867 stream = *SMCU_EXECPOLICY(A)->stream();
868
869 /* set all data to zero */
870 cuerr = cudaMemsetAsync(SMCU_DATAp(A), 0, SMCU_NNZ(A)*sizeof(realtype), stream);
871 if (!SUNDIALS_CUDA_VERIFY(cuerr)) return(SUNMAT_OPERATION_FAIL);
872
873 /* set all rowptrs to zero unless the sparsity pattern is fixed */
874 if (!SMCU_FIXEDPATTERN(A))
875 {
876 cuerr = cudaMemsetAsync(SMCU_INDEXPTRSp(A), 0,
877 (SMCU_BLOCKROWS(A)+1)*sizeof(int),
878 stream);
879 if (!SUNDIALS_CUDA_VERIFY(cuerr)) return(SUNMAT_OPERATION_FAIL);
880
881 /* set all colind to zero */
882 cuerr = cudaMemsetAsync(SMCU_INDEXVALSp(A), 0,
883 SMCU_BLOCKNNZ(A)*sizeof(int),
884 stream);
885 if (!SUNDIALS_CUDA_VERIFY(cuerr)) return(SUNMAT_OPERATION_FAIL);
886 }
887
888 return(SUNMAT_SUCCESS);
889 }
890
891
892 /* Copies the nonzeros, column vals, and row pointers into dst */
SUNMatCopy_cuSparse(SUNMatrix src,SUNMatrix dst)893 int SUNMatCopy_cuSparse(SUNMatrix src, SUNMatrix dst)
894 {
895 int retval;
896 const cudaStream_t* stream;
897
898 /* Verify that src and dst are compatible */
899 if (!SMCompatible_cuSparse(src, dst))
900 return(SUNMAT_ILL_INPUT);
901
902 stream = SMCU_EXECPOLICY(src)->stream();
903
904 /* Ensure that dst is allocated with at least as
905 much memory as we have nonzeros in src */
906 if (SMCU_NNZ(dst) < SMCU_NNZ(src))
907 {
908 SUNDIALS_DEBUG_PRINT("ERROR in SUNMatCopy_cuSparse: the destination matrix has less nonzeros than the source\n");
909 return(SUNMAT_ILL_INPUT);
910 }
911
912 /* Zero out dst so that copy works correctly */
913 if (SUNMatZero_cuSparse(dst) != SUNMAT_SUCCESS)
914 {
915 SUNDIALS_DEBUG_PRINT("ERROR in SUNMatCopy_cuSparse: SUNMatZero_cuSparse failed\n");
916 return(SUNMAT_OPERATION_FAIL);
917 }
918
919 /* Copy the data over */
920 retval = SUNMemoryHelper_CopyAsync(SMCU_MEMHELP(src),
921 SMCU_DATA(dst),
922 SMCU_DATA(src),
923 SMCU_NNZ(src)*sizeof(realtype),
924 (void*) stream);
925 if (retval) return(SUNMAT_OPERATION_FAIL);
926
927 /* Copy the row pointers over */
928 retval = SUNMemoryHelper_CopyAsync(SMCU_MEMHELP(src),
929 SMCU_INDEXPTRS(dst),
930 SMCU_INDEXPTRS(src),
931 (SMCU_BLOCKROWS(src)+1)*sizeof(int),
932 (void*) stream);
933 if (retval) return(SUNMAT_OPERATION_FAIL);
934
935 /* Copy the column indices over */
936 retval = SUNMemoryHelper_CopyAsync(SMCU_MEMHELP(src),
937 SMCU_INDEXVALS(dst),
938 SMCU_INDEXVALS(src),
939 SMCU_BLOCKNNZ(src)*sizeof(int),
940 (void*) stream);
941 if (retval) return(SUNMAT_OPERATION_FAIL);
942
943 return(SUNMAT_SUCCESS);
944 }
945
946
947 /* Performs A = cA + I. Requires the diagonal to be allocated already. */
SUNMatScaleAddI_cuSparse(realtype c,SUNMatrix A)948 int SUNMatScaleAddI_cuSparse(realtype c, SUNMatrix A)
949 {
950 unsigned threadsPerBlock, gridSize;
951 cudaStream_t stream = *SMCU_EXECPOLICY(A)->stream();
952
953 switch (SMCU_SPARSETYPE(A))
954 {
955 case SUNMAT_CUSPARSE_CSR:
956 /* Choose the grid size to be the number of rows in the matrix,
957 and then choose threadsPerBlock to be a multiple of the warp size
958 that results in enough threads to have one per 2 columns. */
959 threadsPerBlock = SMCU_EXECPOLICY(A)->blockSize(SMCU_COLUMNS(A)/2);
960 gridSize = SMCU_EXECPOLICY(A)->gridSize(SMCU_ROWS(A)*SMCU_COLUMNS(A)/2, threadsPerBlock);
961 scaleAddIKernelCSR<realtype, int>
962 <<<gridSize, threadsPerBlock, 0, stream>>>(SMCU_ROWS(A),
963 c,
964 SMCU_DATAp(A),
965 SMCU_INDEXPTRSp(A),
966 SMCU_INDEXVALSp(A));
967 break;
968 case SUNMAT_CUSPARSE_BCSR:
969 /* Choose the grid size to be the number of blocks in the matrix,
970 and then choose threadsPerBlock to be a multiple of the warp size
971 that results in enough threads to have one per row of the block. */
972 threadsPerBlock = SMCU_EXECPOLICY(A)->blockSize(SMCU_BLOCKROWS(A));
973 gridSize = SMCU_EXECPOLICY(A)->gridSize(SMCU_NBLOCKS(A)*SMCU_BLOCKROWS(A), threadsPerBlock);
974 scaleAddIKernelBCSR<realtype, int>
975 <<<gridSize, threadsPerBlock, 0, stream>>>(SMCU_BLOCKROWS(A),
976 SMCU_NBLOCKS(A),
977 SMCU_BLOCKNNZ(A),
978 c,
979 SMCU_DATAp(A),
980 SMCU_INDEXPTRSp(A),
981 SMCU_INDEXVALSp(A));
982 break;
983 default:
984 SUNDIALS_DEBUG_PRINT("ERROR in SUNMatScaleAddI_cuSparse: sparse type not recognized\n");
985 return(SUNMAT_ILL_INPUT);
986 }
987
988 #ifdef SUNDIALS_DEBUG_CUDA_LASTERROR
989 cudaDeviceSynchronize();
990 if (!SUNDIALS_CUDA_VERIFY(cudaGetLastError())) return(SUNMAT_OPERATION_FAIL);
991 #endif
992
993 return(SUNMAT_SUCCESS);
994 }
995
996
997 /* Performs A = cA + B */
SUNMatScaleAdd_cuSparse(realtype c,SUNMatrix A,SUNMatrix B)998 int SUNMatScaleAdd_cuSparse(realtype c, SUNMatrix A, SUNMatrix B)
999 {
1000 cudaStream_t stream;
1001 unsigned threadsPerBlock, gridSize;
1002
1003 if (!SMCompatible_cuSparse(A, B))
1004 {
1005 SUNDIALS_DEBUG_PRINT("ERROR in SUNMatScaleAdd_cuSparse: SUNMatScaleAdd_cuSparse failed\n");
1006 return(SUNMAT_ILL_INPUT);
1007 }
1008
1009 stream = *SMCU_EXECPOLICY(A)->stream();
1010
1011 switch (SMCU_SPARSETYPE(A))
1012 {
1013 case SUNMAT_CUSPARSE_CSR:
1014 /* Choose the grid size to be the number of rows in the matrix,
1015 and then choose threadsPerBlock to be a multiple of the warp size
1016 that results in enough threads to have one per 2 columns. */
1017 threadsPerBlock = SMCU_EXECPOLICY(A)->blockSize(SMCU_COLUMNS(A)/2);
1018 gridSize = SMCU_EXECPOLICY(A)->gridSize(SMCU_ROWS(A)*SMCU_COLUMNS(A)/2, threadsPerBlock);
1019 scaleAddKernelCSR<realtype, int>
1020 <<<gridSize, threadsPerBlock, 0, stream>>>(SMCU_NNZ(A),
1021 c,
1022 SMCU_DATAp(A),
1023 SMCU_DATAp(B));
1024 break;
1025 case SUNMAT_CUSPARSE_BCSR:
1026 /* Choose the grid size to be the number of blocks in the matrix,
1027 and then choose threadsPerBlock to be a multiple of the warp size
1028 that results in enough threads to have one per row of the block. */
1029 threadsPerBlock = SMCU_EXECPOLICY(A)->blockSize(SMCU_BLOCKROWS(A));
1030 gridSize = SMCU_EXECPOLICY(A)->gridSize(SMCU_NBLOCKS(A)*SMCU_BLOCKROWS(A), threadsPerBlock);
1031 scaleAddKernelCSR<realtype, int>
1032 <<<gridSize, threadsPerBlock, 0, stream>>>(SMCU_NNZ(A),
1033 c,
1034 SMCU_DATAp(A),
1035 SMCU_DATAp(B));
1036 break;
1037 default:
1038 SUNDIALS_DEBUG_PRINT("ERROR in SUNMatScaleAdd_cuSparse: sparse type not recognized\n");
1039 return(SUNMAT_ILL_INPUT);
1040 }
1041
1042 #ifdef SUNDIALS_DEBUG_CUDA_LASTERROR
1043 cudaDeviceSynchronize();
1044 if (!SUNDIALS_CUDA_VERIFY(cudaGetLastError())) return(SUNMAT_OPERATION_FAIL);
1045 #endif
1046
1047 return(SUNMAT_SUCCESS);
1048 }
1049
1050 /* Setup buffers needed for Matvec */
SUNMatMatvecSetup_cuSparse(SUNMatrix A)1051 int SUNMatMatvecSetup_cuSparse(SUNMatrix A)
1052 {
1053 #if CUDART_VERSION >= 11000
1054 realtype placeholder[1];
1055 const realtype one = ONE;
1056
1057 /* Check if setup has already been done */
1058 if (!(SMCU_CONTENT(A)->matvec_issetup))
1059 {
1060 SUNDIALS_CUSPARSE_VERIFY( cusparseCreateDnVec(&SMCU_CONTENT(A)->vecX,
1061 SMCU_COLUMNS(A),
1062 placeholder, CUDA_R_XF) );
1063 SUNDIALS_CUSPARSE_VERIFY( cusparseCreateDnVec(&SMCU_CONTENT(A)->vecY,
1064 SMCU_ROWS(A),
1065 placeholder, CUDA_R_XF) );
1066
1067 SUNDIALS_CUSPARSE_VERIFY(
1068 cusparseSpMV_bufferSize(SMCU_CUSPHANDLE(A),
1069 CUSPARSE_OPERATION_NON_TRANSPOSE,
1070 &one, SMCU_CONTENT(A)->spmat_descr,
1071 SMCU_CONTENT(A)->vecX, &one, SMCU_CONTENT(A)->vecY,
1072 CUDA_R_XF, CUSPARSE_MV_ALG_DEFAULT,
1073 &SMCU_CONTENT(A)->bufferSize) );
1074
1075 if ( SUNMemoryHelper_Alloc(SMCU_MEMHELP(A), &SMCU_CONTENT(A)->dBufferMem,
1076 SMCU_CONTENT(A)->bufferSize, SUNMEMTYPE_DEVICE) )
1077 return(SUNMAT_OPERATION_FAIL);
1078 }
1079 #endif
1080 SMCU_CONTENT(A)->matvec_issetup = SUNTRUE;
1081 return(SUNMAT_SUCCESS);
1082 }
1083
1084 /* Perform y = Ax */
SUNMatMatvec_cuSparse(SUNMatrix A,N_Vector x,N_Vector y)1085 int SUNMatMatvec_cuSparse(SUNMatrix A, N_Vector x, N_Vector y)
1086 {
1087 /* Verify that the dimensions of A, x, and y agree */
1088 if ( (SMCU_COLUMNS(A) != N_VGetLength(x)) ||
1089 (SMCU_ROWS(A) != N_VGetLength(y)) )
1090 {
1091 SUNDIALS_DEBUG_PRINT("ERROR in SUNMatMatvec_cuSparse: dimensions do not agree\n");
1092 return(SUNMAT_ILL_INPUT);
1093 }
1094
1095 realtype *d_xdata = N_VGetDeviceArrayPointer(x);
1096 realtype *d_ydata = N_VGetDeviceArrayPointer(y);
1097
1098 if (SMCU_SPARSETYPE(A) == SUNMAT_CUSPARSE_CSR)
1099 {
1100 const realtype one = ONE;
1101
1102 /* Zero result vector */
1103 N_VConst(ZERO, y);
1104
1105 #if CUDART_VERSION >= 11000
1106 {
1107 /* Setup matvec if it has not been done yet */
1108 if (!SMCU_CONTENT(A)->matvec_issetup && SUNMatMatvecSetup_cuSparse(A))
1109 {
1110 return(SUNMAT_OPERATION_FAIL);
1111 }
1112
1113 SUNDIALS_CUSPARSE_VERIFY( cusparseDnVecSetValues(SMCU_CONTENT(A)->vecX,
1114 d_xdata) );
1115 SUNDIALS_CUSPARSE_VERIFY( cusparseDnVecSetValues(SMCU_CONTENT(A)->vecY,
1116 d_ydata) );
1117
1118 SUNDIALS_CUSPARSE_VERIFY( cusparseSpMV(SMCU_CUSPHANDLE(A),
1119 CUSPARSE_OPERATION_NON_TRANSPOSE,
1120 &one, SMCU_CONTENT(A)->spmat_descr,
1121 SMCU_CONTENT(A)->vecX, &one,
1122 SMCU_CONTENT(A)->vecY, CUDA_R_XF,
1123 CUSPARSE_MV_ALG_DEFAULT,
1124 SMCU_CONTENT(A)->dBufferMem->ptr) );
1125 }
1126 #else
1127 SUNDIALS_CUSPARSE_VERIFY(
1128 cusparseXcsrmv(SMCU_CUSPHANDLE(A), CUSPARSE_OPERATION_NON_TRANSPOSE,
1129 SMCU_ROWS(A), SMCU_COLUMNS(A), SMCU_NNZ(A),
1130 &one, SMCU_MATDESCR(A), SMCU_DATAp(A), SMCU_INDEXPTRSp(A),
1131 SMCU_INDEXVALSp(A), d_xdata, &one, d_ydata) );
1132 #endif
1133 }
1134 else if (SMCU_SPARSETYPE(A) == SUNMAT_CUSPARSE_BCSR)
1135 {
1136 cudaStream_t stream;
1137 unsigned gridSize, threadsPerBlock;
1138
1139 stream = *SMCU_EXECPOLICY(A)->stream();
1140
1141 /* Choose the grid size to be the number of blocks in the matrix,
1142 and then choose threadsPerBlock to be a multiple of the warp size
1143 that results in enough threads to have one per row of the block. */
1144 threadsPerBlock = SMCU_EXECPOLICY(A)->blockSize(SMCU_COLUMNS(A)/2);
1145 gridSize = SMCU_EXECPOLICY(A)->gridSize(SMCU_ROWS(A)*SMCU_COLUMNS(A)/2, threadsPerBlock);
1146 matvecBCSR<realtype, int>
1147 <<<gridSize, threadsPerBlock, 0, stream>>>(SMCU_BLOCKROWS(A),
1148 SMCU_NBLOCKS(A),
1149 SMCU_BLOCKNNZ(A),
1150 SMCU_DATAp(A),
1151 SMCU_INDEXPTRSp(A),
1152 SMCU_INDEXVALSp(A),
1153 d_xdata,
1154 d_ydata);
1155 }
1156 else
1157 {
1158 SUNDIALS_DEBUG_PRINT("ERROR in SUNMatMatvec_cuSparse: sparse type not recognized\n");
1159 return(SUNMAT_ILL_INPUT);
1160 }
1161
1162 #ifdef SUNDIALS_DEBUG_CUDA_LASTERROR
1163 cudaDeviceSynchronize();
1164 if (!SUNDIALS_CUDA_VERIFY(cudaGetLastError())) return(SUNMAT_OPERATION_FAIL);
1165 #endif
1166
1167 return(SUNMAT_SUCCESS);
1168 }
1169
1170
1171 /*
1172 * =================================================================
1173 * private functions
1174 * =================================================================
1175 */
1176
1177
1178 /* -----------------------------------------------------------------
1179 * Function to check compatibility of two sparse SUNMatrix objects
1180 */
SMCompatible_cuSparse(SUNMatrix A,SUNMatrix B)1181 static booleantype SMCompatible_cuSparse(SUNMatrix A, SUNMatrix B)
1182 {
1183 /* both matrices must be sparse */
1184 if ( (SUNMatGetID(A) != SUNMATRIX_CUSPARSE) ||
1185 (SUNMatGetID(B) != SUNMATRIX_CUSPARSE) )
1186 return(SUNFALSE);
1187
1188 /* both matrices must have the same shape and sparsity type */
1189 if (SMCU_ROWS(A) != SMCU_ROWS(B))
1190 return(SUNFALSE);
1191 if (SMCU_COLUMNS(A) != SMCU_COLUMNS(B))
1192 return(SUNFALSE);
1193 if (SMCU_SPARSETYPE(A) != SMCU_SPARSETYPE(B))
1194 return(SUNFALSE);
1195
1196 return(SUNTRUE);
1197 }
1198
1199 /* -----------------------------------------------------------------
1200 * Function to create empty SUNMatrix with ops attached and
1201 * the content structure allocated.
1202 */
SUNMatrix_cuSparse_NewEmpty()1203 SUNMatrix SUNMatrix_cuSparse_NewEmpty()
1204 {
1205 /* Create an empty matrix object */
1206 SUNMatrix A = NULL;
1207 A = SUNMatNewEmpty();
1208 if (A == NULL)
1209 {
1210 SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_cuSparse_NewEmpty: SUNMatNewEmpty failed\n");
1211 return(NULL);
1212 }
1213
1214 /* Attach operations */
1215 A->ops->getid = SUNMatGetID_cuSparse;
1216 A->ops->clone = SUNMatClone_cuSparse;
1217 A->ops->destroy = SUNMatDestroy_cuSparse;
1218 A->ops->zero = SUNMatZero_cuSparse;
1219 A->ops->copy = SUNMatCopy_cuSparse;
1220 A->ops->scaleadd = SUNMatScaleAdd_cuSparse;
1221 A->ops->scaleaddi = SUNMatScaleAddI_cuSparse;
1222 A->ops->matvecsetup = SUNMatMatvecSetup_cuSparse;
1223 A->ops->matvec = SUNMatMatvec_cuSparse;
1224
1225 /* Create content */
1226 SUNMatrix_Content_cuSparse content = NULL;
1227 content = (SUNMatrix_Content_cuSparse) malloc(sizeof *content);
1228 if (content == NULL)
1229 {
1230 SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_cuSparse_NewEmpty: failed to malloc content\n");
1231 SUNMatDestroy(A);
1232 return(NULL);
1233 }
1234
1235 /* Attach content */
1236 A->content = content;
1237 content->mem_helper = NULL;
1238
1239 return(A);
1240 }
1241
1242 #if CUDART_VERSION >= 11000
CreateSpMatDescr(SUNMatrix A,cusparseSpMatDescr_t * spmat_descr)1243 cusparseStatus_t CreateSpMatDescr(SUNMatrix A, cusparseSpMatDescr_t *spmat_descr)
1244 {
1245 /* CUDA 11 introduced the "Generic API" and removed the cusparseXcsrmv that
1246 works on the old cusparseMatDescr_t and raw data arrays. However,
1247 cuSolverSp stuff requires the cusparseMatDescr_t still. So, we have to
1248 create this cusparseSpMatDescr_t *and* the cusparseMatDescr_t. */
1249 return(cusparseCreateCsr(spmat_descr, SMCU_ROWS(A), SMCU_COLUMNS(A),
1250 SMCU_NNZ(A), SMCU_INDEXPTRSp(A),
1251 SMCU_INDEXVALSp(A), SMCU_DATAp(A),
1252 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
1253 CUSPARSE_INDEX_BASE_ZERO, CUDA_R_XF));
1254 }
1255 #endif
1256