1 /*
2  * -----------------------------------------------------------------
3  * Programmer(s): Cody J. Balos @ LLNL
4  * -----------------------------------------------------------------
5  * SUNDIALS Copyright Start
6  * Copyright (c) 2002-2021, Lawrence Livermore National Security
7  * and Southern Methodist University.
8  * All rights reserved.
9  *
10  * See the top-level LICENSE and NOTICE files for details.
11  *
12  * SPDX-License-Identifier: BSD-3-Clause
13  * SUNDIALS Copyright End
14  * -----------------------------------------------------------------
15  * This is the header file is for the cuSPARSE implementation of the
16  * SUNMATRIX module.
17  * -----------------------------------------------------------------
18  */
19 
20 #include <stdio.h>
21 #include <stdlib.h>
22 
23 #include <sunmemory/sunmemory_cuda.h>
24 #include <sunmatrix/sunmatrix_cusparse.h>
25 
26 #include "sundials_cuda.h"
27 #include "sundials_debug.h"
28 #include "cusparse_kernels.cuh"
29 
30 
31 /* Use the namespace for the kernels */
32 using namespace sundials::sunmatrix_cusparse;
33 
34 /* Constants */
35 #define ZERO RCONST(0.0)
36 #define ONE  RCONST(1.0)
37 
38 /* Private function prototypes */
39 static booleantype SMCompatible_cuSparse(SUNMatrix, SUNMatrix);
40 static SUNMatrix SUNMatrix_cuSparse_NewEmpty();
41 #if CUDART_VERSION >= 11000
42 static cusparseStatus_t CreateSpMatDescr(SUNMatrix, cusparseSpMatDescr_t*);
43 #endif
44 
45 /* Macros for handling the different function names based on precision */
46 #if defined(SUNDIALS_DOUBLE_PRECISION)
47 #define cusparseXcsrmv cusparseDcsrmv
48 #define CUDA_R_XF CUDA_R_64F
49 #elif defined(SUNDIALS_SINGLE_PRECISION)
50 #define cusparseXcsrmv cusparseScsrmv
51 #define CUDA_R_XF CUDA_R_32F
52 #endif
53 
54 /* Content accessor macros */
55 #define SMCU_CONTENT(A)     ( (SUNMatrix_Content_cuSparse)(A->content) )
56 #define SMCU_ROWS(A)        ( SMCU_CONTENT(A)->M )
57 #define SMCU_COLUMNS(A)     ( SMCU_CONTENT(A)->N )
58 #define SMCU_NNZ(A)         ( SMCU_CONTENT(A)->NNZ )
59 #define SMCU_NBLOCKS(A)     ( SMCU_CONTENT(A)->nblocks )
60 #define SMCU_BLOCKROWS(A)   ( SMCU_CONTENT(A)->blockrows )
61 #define SMCU_BLOCKCOLS(A)   ( SMCU_CONTENT(A)->blockcols )
62 #define SMCU_BLOCKNNZ(A)    ( SMCU_CONTENT(A)->blocknnz )
63 #define SMCU_NP(A)          ( SMCU_CONTENT(A)->NP )
64 #define SMCU_SPARSETYPE(A)  ( SMCU_CONTENT(A)->sparse_type )
65 #define SMCU_OWNMATD(A)     ( SMCU_CONTENT(A)->own_matd )
66 #define SMCU_OWNEXEC(A)     ( SMCU_CONTENT(A)->own_exec )
67 #define SMCU_DATA(A)        ( SMCU_CONTENT(A)->data )
68 #define SMCU_DATAp(A)       ( (realtype*)SMCU_CONTENT(A)->data->ptr )
69 #define SMCU_INDEXVALS(A)   ( SMCU_CONTENT(A)->colind )
70 #define SMCU_INDEXPTRS(A)   ( SMCU_CONTENT(A)->rowptrs )
71 #define SMCU_INDEXVALSp(A)  ( (int*) SMCU_CONTENT(A)->colind->ptr )
72 #define SMCU_INDEXPTRSp(A)  ( (int*) SMCU_CONTENT(A)->rowptrs->ptr )
73 #define SMCU_MEMHELP(A)     ( SMCU_CONTENT(A)->mem_helper )
74 #define SMCU_MATDESCR(A)    ( SMCU_CONTENT(A)->mat_descr )
75 #define SMCU_CUSPHANDLE(A)  ( SMCU_CONTENT(A)->cusp_handle )
76 #define SMCU_FIXEDPATTERN(A)( SMCU_CONTENT(A)->fixed_pattern )
77 #define SMCU_EXECPOLICY(A)  ( SMCU_CONTENT(A)->exec_policy )
78 
79 
80 /* ------------------------------------------------------------------
81  * Default execution policy definition.
82  *
83  * This policy tries to help us leverage the structure of the matrix.
84  * It will choose block sizes which are a multiple of the warp size,
85  * and it will choose a grid size to such that all work elements are
86  * covered.
87  * ------------------------------------------------------------------ */
88 
89 class SUNCuSparseMatrixExecPolicy : public SUNCudaExecPolicy
90 {
91 public:
SUNCuSparseMatrixExecPolicy(const cudaStream_t stream=0)92   SUNCuSparseMatrixExecPolicy(const cudaStream_t stream = 0)
93     : stream_(stream)
94   {}
95 
SUNCuSparseMatrixExecPolicy(const SUNCuSparseMatrixExecPolicy & ex)96   SUNCuSparseMatrixExecPolicy(const SUNCuSparseMatrixExecPolicy& ex)
97     : stream_(ex.stream_)
98   {}
99 
gridSize(size_t numWorkElements,size_t blockDim=0) const100   virtual size_t gridSize(size_t numWorkElements, size_t blockDim = 0) const
101   {
102     return(numWorkElements + blockDim - 1)/blockDim;
103   }
104 
blockSize(size_t numWorkElements=0,size_t gridDim=0) const105   virtual size_t blockSize(size_t numWorkElements = 0, size_t gridDim = 0) const
106   {
107     return(max_block_size(CUDA_WARP_SIZE*(numWorkElements + CUDA_WARP_SIZE - 1)/CUDA_WARP_SIZE));
108   }
109 
stream() const110   virtual const cudaStream_t* stream() const
111   {
112     return(&stream_);
113   }
114 
clone() const115   virtual CudaExecPolicy* clone() const
116   {
117     return(static_cast<CudaExecPolicy*>(new SUNCuSparseMatrixExecPolicy(*this)));
118   }
119 
max_block_size(int val)120   static size_t max_block_size(int val)
121   {
122     return((val > MAX_CUDA_BLOCKSIZE) ? MAX_CUDA_BLOCKSIZE : val );
123   }
124 
125 private:
126   const cudaStream_t stream_;
127 };
128 
129 /* ------------------------------------------------------------------
130  * Constructors.
131  * ------------------------------------------------------------------ */
132 
SUNMatrix_cuSparse_NewCSR(int M,int N,int NNZ,cusparseHandle_t cusp)133 SUNMatrix SUNMatrix_cuSparse_NewCSR(int M, int N, int NNZ, cusparseHandle_t cusp)
134 {
135   SUNMemory d_colind, d_rowptr, d_values;
136   int alloc_fail = 0;
137 
138   /* return with NULL matrix on illegal input */
139   if ( (M <= 0) || (N <= 0) || (NNZ < 0) )
140   {
141     SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_NewCSR_cuSparse: illegal value(s) for M, N, or NNZ\n");
142     return(NULL);
143   }
144 
145   SUNMatrix A = SUNMatrix_cuSparse_NewEmpty();
146   if (A == NULL)
147   {
148     SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_NewCSR_cuSparse: SUNMatrix_cuSparse_NewEmpty returned NULL\n");
149     return(NULL);
150   }
151 
152   SMCU_MEMHELP(A) = SUNMemoryHelper_Cuda();
153   if (SMCU_MEMHELP(A) == NULL)
154   {
155     SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_NewCSR_cuSparse: SUNMemoryHelper_Cuda returned NULL\n");
156     SUNMatDestroy(A);
157     return(NULL);
158   }
159 
160   /* Allocate device memory for the matrix */
161   alloc_fail += SUNMemoryHelper_Alloc(SMCU_MEMHELP(A), &d_colind,
162                                       sizeof(int)*NNZ, SUNMEMTYPE_DEVICE);
163   alloc_fail += SUNMemoryHelper_Alloc(SMCU_MEMHELP(A), &d_rowptr,
164                                       sizeof(int)*(M+1), SUNMEMTYPE_DEVICE);
165   alloc_fail += SUNMemoryHelper_Alloc(SMCU_MEMHELP(A), &d_values,
166                                       sizeof(realtype)*NNZ, SUNMEMTYPE_DEVICE);
167   if (alloc_fail)
168   {
169     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_colind);
170     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_rowptr);
171     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_values);
172     SUNMatDestroy(A);
173     return(NULL);
174   }
175 
176   /* Choose sensible defaults */
177   cusparseStatus_t cusparse_status = CUSPARSE_STATUS_SUCCESS;
178   cusparseMatDescr_t mat_descr;
179   cusparse_status = cusparseCreateMatDescr(&mat_descr);
180   if (!SUNDIALS_CUSPARSE_VERIFY(cusparse_status))
181   {
182     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_colind);
183     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_rowptr);
184     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_values);
185     SUNMatDestroy(A);
186     return(NULL);
187   }
188 
189   cusparse_status = cusparseSetMatType(mat_descr, CUSPARSE_MATRIX_TYPE_GENERAL);
190   if (!SUNDIALS_CUSPARSE_VERIFY(cusparse_status))
191   {
192     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_colind);
193     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_rowptr);
194     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_values);
195     cusparseDestroyMatDescr(mat_descr);
196     SUNMatDestroy(A);
197     return(NULL);
198   }
199 
200   cusparse_status = cusparseSetMatIndexBase(mat_descr, CUSPARSE_INDEX_BASE_ZERO);
201   if (!SUNDIALS_CUSPARSE_VERIFY(cusparse_status))
202   {
203     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_colind);
204     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_rowptr);
205     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_values);
206     cusparseDestroyMatDescr(mat_descr);
207     SUNMatDestroy(A);
208     return(NULL);
209   }
210 
211   cudaStream_t stream;
212   if (!SUNDIALS_CUSPARSE_VERIFY(cusparseGetStream(cusp, &stream)))
213   {
214     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_colind);
215     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_rowptr);
216     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_values);
217     cusparseDestroyMatDescr(mat_descr);
218     SUNMatDestroy(A);
219     return(NULL);
220   }
221 
222   /* Fill the content */
223   SMCU_CONTENT(A)->M              = M;
224   SMCU_CONTENT(A)->N              = N;
225   SMCU_CONTENT(A)->NNZ            = NNZ;
226   SMCU_CONTENT(A)->nblocks        = 1;
227   SMCU_CONTENT(A)->blockrows      = M;
228   SMCU_CONTENT(A)->blockcols      = N;
229   SMCU_CONTENT(A)->blocknnz       = NNZ;
230   SMCU_CONTENT(A)->own_matd       = SUNTRUE;
231   SMCU_CONTENT(A)->own_exec       = SUNTRUE;
232   SMCU_CONTENT(A)->matvec_issetup = SUNFALSE;
233   SMCU_CONTENT(A)->fixed_pattern  = SUNFALSE;
234   SMCU_CONTENT(A)->sparse_type    = SUNMAT_CUSPARSE_CSR;
235   SMCU_CONTENT(A)->colind         = d_colind;
236   SMCU_CONTENT(A)->rowptrs        = d_rowptr;
237   SMCU_CONTENT(A)->data           = d_values;
238   SMCU_CONTENT(A)->mat_descr      = mat_descr;
239   SMCU_CONTENT(A)->cusp_handle    = cusp;
240   SMCU_CONTENT(A)->exec_policy    = new SUNCuSparseMatrixExecPolicy(stream);
241 
242 #if CUDART_VERSION >= 11000
243   cusparseSpMatDescr_t spmat_descr;
244   if (!SUNDIALS_CUSPARSE_VERIFY(CreateSpMatDescr(A, &spmat_descr)))
245   {
246     SUNMatDestroy(A);
247     return(NULL);
248   }
249   SMCU_CONTENT(A)->spmat_descr = spmat_descr;
250   SMCU_CONTENT(A)->dBufferMem  = NULL;
251   SMCU_CONTENT(A)->bufferSize  = 0;
252   SMCU_CONTENT(A)->vecX        = NULL;
253   SMCU_CONTENT(A)->vecY        = NULL;
254 #endif
255 
256   return A;
257 }
258 
259 
SUNMatrix_cuSparse_MakeCSR(cusparseMatDescr_t mat_descr,int M,int N,int NNZ,int * rowptrs,int * colind,realtype * data,cusparseHandle_t cusp)260 SUNMatrix SUNMatrix_cuSparse_MakeCSR(cusparseMatDescr_t mat_descr, int M, int N, int NNZ,
261                                      int *rowptrs , int *colind , realtype *data,
262                                      cusparseHandle_t cusp)
263 {
264   /* return with NULL matrix on illegal input */
265   if ( (M <= 0) || (N <= 0) || (NNZ < 0) )
266   {
267     SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_MakeCSR_cuSparse: illegal value(s) for M, N, or NNZ\n");
268     return(NULL);
269   }
270 
271   if ( (rowptrs == NULL) || (colind == NULL) || (data == NULL) )
272   {
273     SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_MakeCSR_cuSparse: rowptrs, colind, or data is NULL\n");
274     return(NULL);
275   }
276 
277   if (cusparseGetMatIndexBase(mat_descr) != CUSPARSE_INDEX_BASE_ZERO)
278   {
279     SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_MakeCSR_cuSparse: the cusparseMatDescr_t must have index base CUSPARSE_INDEX_BASE_ZERO\n");
280     return(NULL);
281   }
282 
283   SUNMatrix A = SUNMatrix_cuSparse_NewEmpty();
284   if (A == NULL)
285   {
286     SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_MakeCSR_cuSparse: SUNMatrix_cuSparse_NewEmpty returned NULL\n");
287     return(NULL);
288   }
289 
290   SMCU_MEMHELP(A) = SUNMemoryHelper_Cuda();
291   if (SMCU_MEMHELP(A) == NULL)
292   {
293     SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_NewCSR_cuSparse: SUNMemoryHelper_Cuda returned NULL\n");
294     SUNMatDestroy(A);
295     return(NULL);
296   }
297 
298   cudaStream_t stream;
299   if (!SUNDIALS_CUSPARSE_VERIFY(cusparseGetStream(cusp, &stream)))
300   {
301     SUNMatDestroy(A);
302     return(NULL);
303   }
304 
305   /* Fill content */
306   SMCU_CONTENT(A)->M              = M;
307   SMCU_CONTENT(A)->N              = N;
308   SMCU_CONTENT(A)->NNZ            = NNZ;
309   SMCU_CONTENT(A)->nblocks        = 1;
310   SMCU_CONTENT(A)->blockrows      = M;
311   SMCU_CONTENT(A)->blockcols      = N;
312   SMCU_CONTENT(A)->blocknnz       = NNZ;
313   SMCU_CONTENT(A)->own_matd       = SUNFALSE;
314   SMCU_CONTENT(A)->own_exec       = SUNTRUE;
315   SMCU_CONTENT(A)->matvec_issetup = SUNFALSE;
316   SMCU_CONTENT(A)->fixed_pattern  = SUNFALSE;
317   SMCU_CONTENT(A)->sparse_type    = SUNMAT_CUSPARSE_CSR;
318   SMCU_CONTENT(A)->colind         = SUNMemoryHelper_Wrap(colind, SUNMEMTYPE_DEVICE);
319   SMCU_CONTENT(A)->rowptrs        = SUNMemoryHelper_Wrap(rowptrs, SUNMEMTYPE_DEVICE);
320   SMCU_CONTENT(A)->data           = SUNMemoryHelper_Wrap(data, SUNMEMTYPE_DEVICE);
321   SMCU_CONTENT(A)->mat_descr      = mat_descr;
322   SMCU_CONTENT(A)->cusp_handle    = cusp;
323 
324   SMCU_CONTENT(A)->exec_policy   = new SUNCuSparseMatrixExecPolicy(stream);
325 
326   if (SMCU_CONTENT(A)->colind == NULL ||
327       SMCU_CONTENT(A)->rowptrs == NULL ||
328       SMCU_CONTENT(A)->data == NULL)
329   {
330     SUNMatDestroy(A);
331     return(NULL);
332   }
333 
334 #if CUDART_VERSION >= 11000
335   cusparseSpMatDescr_t spmat_descr;
336   if (!SUNDIALS_CUSPARSE_VERIFY(CreateSpMatDescr(A, &spmat_descr)))
337   {
338     SUNMatDestroy(A);
339     return(NULL);
340   }
341   SMCU_CONTENT(A)->spmat_descr = spmat_descr;
342   SMCU_CONTENT(A)->dBufferMem  = NULL;
343   SMCU_CONTENT(A)->bufferSize  = 0;
344   SMCU_CONTENT(A)->vecX        = NULL;
345   SMCU_CONTENT(A)->vecY        = NULL;
346 #endif
347 
348   return(A);
349 }
350 
351 
SUNMatrix_cuSparse_NewBlockCSR(int nblocks,int blockrows,int blockcols,int blocknnz,cusparseHandle_t cusp)352 SUNMatrix SUNMatrix_cuSparse_NewBlockCSR(int nblocks, int blockrows, int blockcols, int blocknnz, cusparseHandle_t cusp)
353 {
354   SUNMemory d_colind, d_rowptr, d_values;
355   int M, N, NNZ;
356   int alloc_fail = 0;
357 
358   /* Return with NULL matrix on illegal input */
359   if (blockrows != blockcols)
360   {
361     SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_cuSparse_NewBlockCSR: matrix must be square for the BCSR format\n");
362     return(NULL);
363   }
364 
365   M   = nblocks * blockrows;
366   N   = M;
367   NNZ = nblocks * blocknnz;
368 
369   /* Return with NULL matrix on illegal input */
370   if ( (M <= 0) || (N <= 0) || (NNZ < 0) )
371   {
372     SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_cuSparse_NewBlockCSR: illegal value(s) for M, N, or NNZ\n");
373     return(NULL);
374   }
375 
376   /* Allocate the SUNMatrix object */
377   SUNMatrix A = SUNMatrix_cuSparse_NewEmpty();
378   if (A == NULL)
379   {
380     SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_cuSparse_NewBlockCSR: SUNMatrix_cuSparse_NewEmpty returned NULL\n");
381     return(NULL);
382   }
383 
384   SMCU_MEMHELP(A) = SUNMemoryHelper_Cuda();
385   if (SMCU_MEMHELP(A) == NULL)
386   {
387     SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_NewCSR_cuSparse: SUNMemoryHelper_Cuda returned NULL\n");
388     SUNMatDestroy(A);
389     return(NULL);
390   }
391 
392   /* Allocate device memory for the matrix */
393   alloc_fail += SUNMemoryHelper_Alloc(SMCU_MEMHELP(A), &d_colind,
394                                       sizeof(int)*blocknnz, SUNMEMTYPE_DEVICE);
395   alloc_fail += SUNMemoryHelper_Alloc(SMCU_MEMHELP(A), &d_rowptr,
396                                       sizeof(int)*(blockrows + 1),
397                                       SUNMEMTYPE_DEVICE);
398   alloc_fail += SUNMemoryHelper_Alloc(SMCU_MEMHELP(A), &d_values,
399                                       sizeof(realtype)*blocknnz*nblocks,
400                                       SUNMEMTYPE_DEVICE);
401   if (alloc_fail)
402   {
403     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_colind);
404     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_rowptr);
405     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_values);
406     SUNMatDestroy(A);
407     return(NULL);
408   }
409 
410   /* Choose sensible defaults */
411   cusparseStatus_t cusparse_status = CUSPARSE_STATUS_SUCCESS;
412   cusparseMatDescr_t mat_descr;
413   cusparse_status = cusparseCreateMatDescr(&mat_descr);
414   if (!SUNDIALS_CUSPARSE_VERIFY(cusparse_status))
415   {
416     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_colind);
417     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_rowptr);
418     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_values);
419     SUNMatDestroy(A);
420     return(NULL);
421   }
422 
423   cusparse_status = cusparseSetMatType(mat_descr, CUSPARSE_MATRIX_TYPE_GENERAL);
424   if (!SUNDIALS_CUSPARSE_VERIFY(cusparse_status))
425   {
426     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_colind);
427     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_rowptr);
428     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_values);
429     cusparseDestroyMatDescr(mat_descr);
430     SUNMatDestroy(A);
431     return(NULL);
432   }
433 
434   cusparse_status = cusparseSetMatIndexBase(mat_descr, CUSPARSE_INDEX_BASE_ZERO);
435   if (!SUNDIALS_CUSPARSE_VERIFY(cusparse_status))
436   {
437     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_colind);
438     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_rowptr);
439     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_values);
440     cusparseDestroyMatDescr(mat_descr);
441     SUNMatDestroy(A);
442     return(NULL);
443   }
444 
445   cudaStream_t stream;
446   if (!SUNDIALS_CUSPARSE_VERIFY(cusparseGetStream(cusp, &stream)))
447   {
448     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_colind);
449     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_rowptr);
450     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), d_values);
451     cusparseDestroyMatDescr(mat_descr);
452     SUNMatDestroy(A);
453     return(NULL);
454   }
455 
456   /* Fill the content */
457   SMCU_CONTENT(A)->M              = M;
458   SMCU_CONTENT(A)->N              = N;
459   SMCU_CONTENT(A)->NNZ            = NNZ;
460   SMCU_CONTENT(A)->nblocks        = nblocks;
461   SMCU_CONTENT(A)->blockrows      = blockrows;
462   SMCU_CONTENT(A)->blockcols      = blockrows;
463   SMCU_CONTENT(A)->blocknnz       = blocknnz;
464   SMCU_CONTENT(A)->own_matd       = SUNTRUE;
465   SMCU_CONTENT(A)->own_exec       = SUNTRUE;
466   SMCU_CONTENT(A)->matvec_issetup = SUNFALSE;
467   SMCU_CONTENT(A)->cusp_handle    = cusp;
468   SMCU_CONTENT(A)->fixed_pattern  = SUNFALSE;
469   SMCU_CONTENT(A)->sparse_type    = SUNMAT_CUSPARSE_BCSR;
470   SMCU_CONTENT(A)->colind         = d_colind;
471   SMCU_CONTENT(A)->rowptrs        = d_rowptr;
472   SMCU_CONTENT(A)->data           = d_values;
473   SMCU_CONTENT(A)->mat_descr      = mat_descr;
474   SMCU_CONTENT(A)->exec_policy    = new SUNCuSparseMatrixExecPolicy(stream);
475 
476 #if CUDART_VERSION >= 11000
477   cusparseSpMatDescr_t spmat_descr;
478   if (!SUNDIALS_CUSPARSE_VERIFY(CreateSpMatDescr(A, &spmat_descr)))
479   {
480     SUNMatDestroy(A);
481     return(NULL);
482   }
483   SMCU_CONTENT(A)->spmat_descr = spmat_descr;
484   SMCU_CONTENT(A)->dBufferMem  = NULL;
485   SMCU_CONTENT(A)->bufferSize  = 0;
486   SMCU_CONTENT(A)->vecX        = NULL;
487   SMCU_CONTENT(A)->vecY        = NULL;
488 #endif
489 
490   return(A);
491 }
492 
493 /* ------------------------------------------------------------------
494  * Implementation specific routines.
495  * ------------------------------------------------------------------ */
496 
SUNMatrix_cuSparse_SparseType(SUNMatrix A)497 int SUNMatrix_cuSparse_SparseType(SUNMatrix A)
498 {
499   if (SUNMatGetID(A) == SUNMATRIX_CUSPARSE)
500     return(SMCU_SPARSETYPE(A));
501   else
502     return(SUNMAT_ILL_INPUT);
503 }
504 
SUNMatrix_cuSparse_Rows(SUNMatrix A)505 int SUNMatrix_cuSparse_Rows(SUNMatrix A)
506 {
507   if (SUNMatGetID(A) == SUNMATRIX_CUSPARSE)
508     return(SMCU_ROWS(A));
509   else
510     return(SUNMAT_ILL_INPUT);
511 }
512 
SUNMatrix_cuSparse_Columns(SUNMatrix A)513 int SUNMatrix_cuSparse_Columns(SUNMatrix A)
514 {
515   if (SUNMatGetID(A) == SUNMATRIX_CUSPARSE)
516     return(SMCU_COLUMNS(A));
517   else
518     return(SUNMAT_ILL_INPUT);
519 }
520 
SUNMatrix_cuSparse_NNZ(SUNMatrix A)521 int SUNMatrix_cuSparse_NNZ(SUNMatrix A)
522 {
523   if (SUNMatGetID(A) == SUNMATRIX_CUSPARSE)
524     return(SMCU_NNZ(A));
525   else
526     return(SUNMAT_ILL_INPUT);
527 }
528 
SUNMatrix_cuSparse_IndexPointers(SUNMatrix A)529 int* SUNMatrix_cuSparse_IndexPointers(SUNMatrix A)
530 {
531   if (SUNMatGetID(A) == SUNMATRIX_CUSPARSE)
532     return(SMCU_INDEXPTRSp(A));
533   else
534     return(NULL);
535 }
536 
SUNMatrix_cuSparse_IndexValues(SUNMatrix A)537 int* SUNMatrix_cuSparse_IndexValues(SUNMatrix A)
538 {
539   if (SUNMatGetID(A) == SUNMATRIX_CUSPARSE)
540     return(SMCU_INDEXVALSp(A));
541   else
542     return(NULL);
543 }
544 
SUNMatrix_cuSparse_Data(SUNMatrix A)545 realtype* SUNMatrix_cuSparse_Data(SUNMatrix A)
546 {
547   if (SUNMatGetID(A) == SUNMATRIX_CUSPARSE)
548     return(SMCU_DATAp(A));
549   else
550     return(NULL);
551 }
552 
SUNMatrix_cuSparse_NumBlocks(SUNMatrix A)553 int SUNMatrix_cuSparse_NumBlocks(SUNMatrix A)
554 {
555   if (SUNMatGetID(A) == SUNMATRIX_CUSPARSE)
556     return(SMCU_NBLOCKS(A));
557   else
558     return(SUNMAT_ILL_INPUT);
559 }
560 
SUNMatrix_cuSparse_BlockRows(SUNMatrix A)561 int SUNMatrix_cuSparse_BlockRows(SUNMatrix A)
562 {
563   if (SUNMatGetID(A) == SUNMATRIX_CUSPARSE)
564     return(SMCU_BLOCKROWS(A));
565   else
566     return(SUNMAT_ILL_INPUT);
567 }
568 
SUNMatrix_cuSparse_BlockColumns(SUNMatrix A)569 int SUNMatrix_cuSparse_BlockColumns(SUNMatrix A)
570 {
571   if (SUNMatGetID(A) == SUNMATRIX_CUSPARSE)
572     return(SMCU_BLOCKCOLS(A));
573   else
574     return(SUNMAT_ILL_INPUT);
575 }
576 
SUNMatrix_cuSparse_BlockNNZ(SUNMatrix A)577 int SUNMatrix_cuSparse_BlockNNZ(SUNMatrix A)
578 {
579   if (SUNMatGetID(A) == SUNMATRIX_CUSPARSE)
580     return(SMCU_BLOCKNNZ(A));
581   else
582     return(SUNMAT_ILL_INPUT);
583 }
584 
SUNMatrix_cuSparse_BlockData(SUNMatrix A,int blockidx)585 realtype* SUNMatrix_cuSparse_BlockData(SUNMatrix A, int blockidx)
586 {
587   realtype *matdata;
588   int offset;
589 
590   if (SUNMatGetID(A) != SUNMATRIX_CUSPARSE)
591     return(NULL);
592 
593   if (blockidx >= SMCU_NBLOCKS(A))
594     return(NULL);
595 
596   matdata = SMCU_DATAp(A);
597   offset = SMCU_BLOCKNNZ(A)*blockidx;
598 
599   return(&matdata[offset]);
600 }
601 
SUNMatrix_cuSparse_MatDescr(SUNMatrix A)602 cusparseMatDescr_t SUNMatrix_cuSparse_MatDescr(SUNMatrix A)
603 {
604   if (SUNMatGetID(A) == SUNMATRIX_CUSPARSE)
605     return(SMCU_MATDESCR(A));
606   else
607     return(NULL);
608 }
609 
SUNMatrix_cuSparse_SetFixedPattern(SUNMatrix A,booleantype yesno)610 int SUNMatrix_cuSparse_SetFixedPattern(SUNMatrix A, booleantype yesno)
611 {
612   if (SUNMatGetID(A) != SUNMATRIX_CUSPARSE)
613     return(SUNMAT_ILL_INPUT);
614 
615   SMCU_FIXEDPATTERN(A) = yesno;
616 
617   return(SUNMAT_SUCCESS);
618 }
619 
620 
SUNMatrix_cuSparse_SetKernelExecPolicy(SUNMatrix A,SUNCudaExecPolicy * exec_policy)621 int SUNMatrix_cuSparse_SetKernelExecPolicy(SUNMatrix A, SUNCudaExecPolicy* exec_policy)
622 {
623   if (SUNMatGetID(A) != SUNMATRIX_CUSPARSE || exec_policy == NULL)
624     return(SUNMAT_ILL_INPUT);
625 
626   if (SMCU_OWNEXEC(A)) delete SMCU_EXECPOLICY(A);
627   SMCU_EXECPOLICY(A) = exec_policy;
628 
629   SMCU_OWNEXEC(A) = SUNFALSE;
630 
631   return(SUNMAT_SUCCESS);
632 }
633 
634 
SUNMatrix_cuSparse_CopyToDevice(SUNMatrix dA,realtype * h_data,int * h_idxptrs,int * h_idxvals)635 int SUNMatrix_cuSparse_CopyToDevice(SUNMatrix dA, realtype* h_data,
636                                     int* h_idxptrs, int* h_idxvals)
637 {
638   int retval;
639   SUNMemory _h_data, _h_idxptrs, _h_idxvals;
640   const cudaStream_t* stream;
641   int nidxvals, nidxptrs;
642 
643   if (SUNMatGetID(dA) != SUNMATRIX_CUSPARSE)
644     return(SUNMAT_ILL_INPUT);
645 
646   stream  = SMCU_EXECPOLICY(dA)->stream();
647 
648   if (h_data != NULL)
649   {
650     _h_data = SUNMemoryHelper_Wrap(h_data, SUNMEMTYPE_HOST);
651     retval  = SUNMemoryHelper_CopyAsync(SMCU_MEMHELP(dA),
652                                         SMCU_DATA(dA),
653                                         _h_data,
654                                         SMCU_NNZ(dA)*sizeof(realtype),
655                                         (void*) stream);
656     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(dA), _h_data);
657     if (retval != 0) return(SUNMAT_OPERATION_FAIL);
658   }
659 
660   switch(SMCU_SPARSETYPE(dA))
661   {
662     case SUNMAT_CUSPARSE_CSR:
663       nidxptrs = SMCU_ROWS(dA)+1;
664       nidxvals = SMCU_NNZ(dA);
665       break;
666     case SUNMAT_CUSPARSE_BCSR:
667       nidxptrs = SMCU_BLOCKROWS(dA)+1;
668       nidxvals = SMCU_BLOCKNNZ(dA);
669       break;
670     default:
671       SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_cuSparse_CopyToDevice: unrecognized sparse type\n");
672       return(SUNMAT_ILL_INPUT);
673   }
674 
675   if (h_idxptrs != NULL)
676   {
677     _h_idxptrs = SUNMemoryHelper_Wrap(h_idxptrs, SUNMEMTYPE_HOST);
678     retval = SUNMemoryHelper_CopyAsync(SMCU_MEMHELP(dA),
679                                        SMCU_INDEXPTRS(dA),
680                                        _h_idxptrs,
681                                        nidxptrs*sizeof(int),
682                                        (void*) stream);
683     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(dA), _h_idxptrs);
684     if (retval != 0) return(SUNMAT_OPERATION_FAIL);
685   }
686 
687   if (h_idxvals != NULL)
688   {
689     _h_idxvals = SUNMemoryHelper_Wrap(h_idxvals, SUNMEMTYPE_HOST);
690     retval = SUNMemoryHelper_CopyAsync(SMCU_MEMHELP(dA),
691                                        SMCU_INDEXVALS(dA),
692                                        _h_idxvals,
693                                        nidxvals*sizeof(int),
694                                        (void*) stream);
695     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(dA), _h_idxvals);
696     if (retval != 0) return(SUNMAT_OPERATION_FAIL);
697   }
698 
699   return(SUNMAT_SUCCESS);
700 }
701 
702 
SUNMatrix_cuSparse_CopyFromDevice(SUNMatrix dA,realtype * h_data,int * h_idxptrs,int * h_idxvals)703 int SUNMatrix_cuSparse_CopyFromDevice(SUNMatrix dA, realtype* h_data,
704                                       int* h_idxptrs, int* h_idxvals)
705 {
706   int retval;
707   SUNMemory _h_data, _h_idxptrs, _h_idxvals;
708   const cudaStream_t* stream;
709   int nidxvals, nidxptrs;
710 
711   if (SUNMatGetID(dA) != SUNMATRIX_CUSPARSE)
712     return(SUNMAT_ILL_INPUT);
713 
714   stream = SMCU_EXECPOLICY(dA)->stream();
715 
716   if (h_data != NULL)
717   {
718     _h_data = SUNMemoryHelper_Wrap(h_data, SUNMEMTYPE_HOST);
719     retval  = SUNMemoryHelper_CopyAsync(SMCU_MEMHELP(dA),
720                                         _h_data,
721                                         SMCU_DATA(dA),
722                                         SMCU_NNZ(dA)*sizeof(realtype),
723                                         (void*) stream);
724     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(dA), _h_data);
725     if (retval != 0) return(SUNMAT_OPERATION_FAIL);
726   }
727 
728 
729   switch(SMCU_SPARSETYPE(dA))
730   {
731     case SUNMAT_CUSPARSE_CSR:
732       nidxptrs = SMCU_ROWS(dA)+1;
733       nidxvals = SMCU_NNZ(dA);
734     case SUNMAT_CUSPARSE_BCSR:
735       nidxptrs = SMCU_BLOCKROWS(dA)+1;
736       nidxvals = SMCU_BLOCKNNZ(dA);
737   }
738 
739   if (h_idxptrs != NULL)
740   {
741     _h_idxptrs = SUNMemoryHelper_Wrap(h_idxptrs, SUNMEMTYPE_HOST);
742     retval = SUNMemoryHelper_CopyAsync(SMCU_MEMHELP(dA),
743                                        _h_idxptrs,
744                                        SMCU_INDEXPTRS(dA),
745                                        nidxptrs*sizeof(int),
746                                        (void*) stream);
747     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(dA), _h_idxptrs);
748     if (retval != 0) return(SUNMAT_OPERATION_FAIL);
749   }
750 
751   if (h_idxvals != NULL)
752   {
753     _h_idxvals = SUNMemoryHelper_Wrap(h_idxvals, SUNMEMTYPE_HOST);
754     retval = SUNMemoryHelper_CopyAsync(SMCU_MEMHELP(dA),
755                                        _h_idxvals,
756                                        SMCU_INDEXVALS(dA),
757                                        nidxvals*sizeof(int),
758                                        (void*) stream);
759     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(dA), _h_idxvals);
760     if (retval != 0) return(SUNMAT_OPERATION_FAIL);
761   }
762 
763 
764   return(SUNMAT_SUCCESS);
765 }
766 
767 /*
768  * -----------------------------------------------------------------
769  * implementation of matrix operations
770  * -----------------------------------------------------------------
771  */
772 
773 
SUNMatGetID_cuSparse(SUNMatrix A)774 SUNMatrix_ID SUNMatGetID_cuSparse(SUNMatrix A)
775 {
776   return(SUNMATRIX_CUSPARSE);
777 }
778 
779 /* Returns a new matrix allocated to have the same structure as A,
780    but it does not copy any nonzeros, column vals, or row pointers. */
SUNMatClone_cuSparse(SUNMatrix A)781 SUNMatrix SUNMatClone_cuSparse(SUNMatrix A)
782 {
783   SUNMatrix B;
784 
785   switch (SMCU_SPARSETYPE(A))
786   {
787     case SUNMAT_CUSPARSE_CSR:
788       B = SUNMatrix_cuSparse_NewCSR(SMCU_ROWS(A), SMCU_COLUMNS(A), SMCU_NNZ(A),
789                                     SMCU_CUSPHANDLE(A));
790       break;
791     case SUNMAT_CUSPARSE_BCSR:
792       B = SUNMatrix_cuSparse_NewBlockCSR(SMCU_NBLOCKS(A), SMCU_BLOCKROWS(A), SMCU_BLOCKCOLS(A),
793                                          SMCU_BLOCKNNZ(A), SMCU_CUSPHANDLE(A));
794       break;
795     default:
796       SUNDIALS_DEBUG_PRINT("ERROR in SUNMatClone_cuSparse: sparse type not recognized\n");
797       B = NULL;
798   }
799 
800   SMCU_FIXEDPATTERN(B) = SMCU_FIXEDPATTERN(A);
801   delete SMCU_EXECPOLICY(B);
802   SMCU_EXECPOLICY(B) = SMCU_EXECPOLICY(A)->clone();
803 
804   return(B);
805 }
806 
807 
808 /* Deallocates the SUNMatrix object and all data it owns */
SUNMatDestroy_cuSparse(SUNMatrix A)809 void SUNMatDestroy_cuSparse(SUNMatrix A)
810 {
811   if (A == NULL) return;
812 
813   /* free content */
814   if (A->content != NULL)
815   {
816     if (SMCU_MEMHELP(A))
817     {
818       SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), SMCU_DATA(A));
819       SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), SMCU_INDEXPTRS(A));
820       SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), SMCU_INDEXVALS(A));
821     }
822     else
823     {
824       SUNDIALS_DEBUG_PRINT("WARNING in SUNMatDestroy_cuSparse: mem_helper was NULL when trying to dealloc data, this could result in a memory leak\n");
825     }
826 
827     if (SMCU_OWNMATD(A))
828     {
829       /* free cusparseMatDescr_t */
830       SUNDIALS_CUSPARSE_VERIFY( cusparseDestroyMatDescr(SMCU_MATDESCR(A)) );
831     }
832 
833 #if CUDART_VERSION >= 11000
834     SUNDIALS_CUSPARSE_VERIFY( cusparseDestroyDnVec(SMCU_CONTENT(A)->vecX) );
835     SUNDIALS_CUSPARSE_VERIFY( cusparseDestroyDnVec(SMCU_CONTENT(A)->vecY) );
836     SUNDIALS_CUSPARSE_VERIFY( cusparseDestroySpMat(SMCU_CONTENT(A)->spmat_descr) );
837     SUNMemoryHelper_Dealloc(SMCU_MEMHELP(A), SMCU_CONTENT(A)->dBufferMem);
838 #endif
839 
840     if (SMCU_EXECPOLICY(A) && SMCU_OWNEXEC(A))
841     {
842       delete SMCU_EXECPOLICY(A);
843       SMCU_EXECPOLICY(A) = NULL;
844     }
845 
846     SUNMemoryHelper_Destroy(SMCU_MEMHELP(A));
847 
848     /* free content struct */
849     free(A->content);
850     A->content = NULL;
851   }
852 
853   /* free ops and matrix */
854   if (A->ops) { free(A->ops); A->ops = NULL; }
855   free(A); A = NULL;
856 
857   return;
858 }
859 
860 
861 /* Performs A_ij = 0 */
SUNMatZero_cuSparse(SUNMatrix A)862 int SUNMatZero_cuSparse(SUNMatrix A)
863 {
864   cudaError_t cuerr;
865   cudaStream_t stream;
866 
867   stream = *SMCU_EXECPOLICY(A)->stream();
868 
869   /* set all data to zero */
870   cuerr = cudaMemsetAsync(SMCU_DATAp(A), 0, SMCU_NNZ(A)*sizeof(realtype), stream);
871   if (!SUNDIALS_CUDA_VERIFY(cuerr)) return(SUNMAT_OPERATION_FAIL);
872 
873   /* set all rowptrs to zero unless the sparsity pattern is fixed */
874   if (!SMCU_FIXEDPATTERN(A))
875   {
876     cuerr = cudaMemsetAsync(SMCU_INDEXPTRSp(A), 0,
877                             (SMCU_BLOCKROWS(A)+1)*sizeof(int),
878                             stream);
879     if (!SUNDIALS_CUDA_VERIFY(cuerr)) return(SUNMAT_OPERATION_FAIL);
880 
881     /* set all colind to zero */
882     cuerr = cudaMemsetAsync(SMCU_INDEXVALSp(A), 0,
883                             SMCU_BLOCKNNZ(A)*sizeof(int),
884                             stream);
885     if (!SUNDIALS_CUDA_VERIFY(cuerr)) return(SUNMAT_OPERATION_FAIL);
886   }
887 
888   return(SUNMAT_SUCCESS);
889 }
890 
891 
892 /* Copies the nonzeros, column vals, and row pointers into dst */
SUNMatCopy_cuSparse(SUNMatrix src,SUNMatrix dst)893 int SUNMatCopy_cuSparse(SUNMatrix src, SUNMatrix dst)
894 {
895   int retval;
896   const cudaStream_t* stream;
897 
898   /* Verify that src and dst are compatible */
899   if (!SMCompatible_cuSparse(src, dst))
900     return(SUNMAT_ILL_INPUT);
901 
902   stream = SMCU_EXECPOLICY(src)->stream();
903 
904   /* Ensure that dst is allocated with at least as
905      much memory as we have nonzeros in src */
906   if (SMCU_NNZ(dst) < SMCU_NNZ(src))
907   {
908     SUNDIALS_DEBUG_PRINT("ERROR in SUNMatCopy_cuSparse: the destination matrix has less nonzeros than the source\n");
909     return(SUNMAT_ILL_INPUT);
910   }
911 
912   /* Zero out dst so that copy works correctly */
913   if (SUNMatZero_cuSparse(dst) != SUNMAT_SUCCESS)
914   {
915     SUNDIALS_DEBUG_PRINT("ERROR in SUNMatCopy_cuSparse: SUNMatZero_cuSparse failed\n");
916     return(SUNMAT_OPERATION_FAIL);
917   }
918 
919   /* Copy the data over */
920   retval = SUNMemoryHelper_CopyAsync(SMCU_MEMHELP(src),
921                                      SMCU_DATA(dst),
922                                      SMCU_DATA(src),
923                                      SMCU_NNZ(src)*sizeof(realtype),
924                                      (void*) stream);
925   if (retval) return(SUNMAT_OPERATION_FAIL);
926 
927   /* Copy the row pointers over */
928   retval = SUNMemoryHelper_CopyAsync(SMCU_MEMHELP(src),
929                                      SMCU_INDEXPTRS(dst),
930                                      SMCU_INDEXPTRS(src),
931                                      (SMCU_BLOCKROWS(src)+1)*sizeof(int),
932                                      (void*) stream);
933   if (retval) return(SUNMAT_OPERATION_FAIL);
934 
935   /* Copy the column indices over */
936   retval = SUNMemoryHelper_CopyAsync(SMCU_MEMHELP(src),
937                                      SMCU_INDEXVALS(dst),
938                                      SMCU_INDEXVALS(src),
939                                      SMCU_BLOCKNNZ(src)*sizeof(int),
940                                      (void*) stream);
941   if (retval) return(SUNMAT_OPERATION_FAIL);
942 
943   return(SUNMAT_SUCCESS);
944 }
945 
946 
947 /* Performs A = cA + I. Requires the diagonal to be allocated already. */
SUNMatScaleAddI_cuSparse(realtype c,SUNMatrix A)948 int SUNMatScaleAddI_cuSparse(realtype c, SUNMatrix A)
949 {
950   unsigned threadsPerBlock, gridSize;
951   cudaStream_t stream = *SMCU_EXECPOLICY(A)->stream();
952 
953   switch (SMCU_SPARSETYPE(A))
954   {
955     case SUNMAT_CUSPARSE_CSR:
956       /* Choose the grid size to be the number of rows in the matrix,
957         and then choose threadsPerBlock to be a multiple of the warp size
958         that results in enough threads to have one per 2 columns. */
959       threadsPerBlock = SMCU_EXECPOLICY(A)->blockSize(SMCU_COLUMNS(A)/2);
960       gridSize = SMCU_EXECPOLICY(A)->gridSize(SMCU_ROWS(A)*SMCU_COLUMNS(A)/2, threadsPerBlock);
961       scaleAddIKernelCSR<realtype, int>
962         <<<gridSize, threadsPerBlock, 0, stream>>>(SMCU_ROWS(A),
963                                                    c,
964                                                    SMCU_DATAp(A),
965                                                    SMCU_INDEXPTRSp(A),
966                                                    SMCU_INDEXVALSp(A));
967       break;
968     case SUNMAT_CUSPARSE_BCSR:
969       /* Choose the grid size to be the number of blocks in the matrix,
970          and then choose threadsPerBlock to be a multiple of the warp size
971          that results in enough threads to have one per row of the block. */
972       threadsPerBlock = SMCU_EXECPOLICY(A)->blockSize(SMCU_BLOCKROWS(A));
973       gridSize = SMCU_EXECPOLICY(A)->gridSize(SMCU_NBLOCKS(A)*SMCU_BLOCKROWS(A), threadsPerBlock);
974       scaleAddIKernelBCSR<realtype, int>
975         <<<gridSize, threadsPerBlock, 0, stream>>>(SMCU_BLOCKROWS(A),
976                                                    SMCU_NBLOCKS(A),
977                                                    SMCU_BLOCKNNZ(A),
978                                                    c,
979                                                    SMCU_DATAp(A),
980                                                    SMCU_INDEXPTRSp(A),
981                                                    SMCU_INDEXVALSp(A));
982       break;
983     default:
984       SUNDIALS_DEBUG_PRINT("ERROR in SUNMatScaleAddI_cuSparse: sparse type not recognized\n");
985       return(SUNMAT_ILL_INPUT);
986   }
987 
988 #ifdef SUNDIALS_DEBUG_CUDA_LASTERROR
989   cudaDeviceSynchronize();
990   if (!SUNDIALS_CUDA_VERIFY(cudaGetLastError())) return(SUNMAT_OPERATION_FAIL);
991 #endif
992 
993   return(SUNMAT_SUCCESS);
994 }
995 
996 
997 /* Performs A = cA + B */
SUNMatScaleAdd_cuSparse(realtype c,SUNMatrix A,SUNMatrix B)998 int SUNMatScaleAdd_cuSparse(realtype c, SUNMatrix A, SUNMatrix B)
999 {
1000   cudaStream_t stream;
1001   unsigned threadsPerBlock, gridSize;
1002 
1003   if (!SMCompatible_cuSparse(A, B))
1004   {
1005     SUNDIALS_DEBUG_PRINT("ERROR in SUNMatScaleAdd_cuSparse: SUNMatScaleAdd_cuSparse failed\n");
1006     return(SUNMAT_ILL_INPUT);
1007   }
1008 
1009   stream = *SMCU_EXECPOLICY(A)->stream();
1010 
1011   switch (SMCU_SPARSETYPE(A))
1012   {
1013     case SUNMAT_CUSPARSE_CSR:
1014       /* Choose the grid size to be the number of rows in the matrix,
1015         and then choose threadsPerBlock to be a multiple of the warp size
1016         that results in enough threads to have one per 2 columns. */
1017       threadsPerBlock = SMCU_EXECPOLICY(A)->blockSize(SMCU_COLUMNS(A)/2);
1018       gridSize = SMCU_EXECPOLICY(A)->gridSize(SMCU_ROWS(A)*SMCU_COLUMNS(A)/2, threadsPerBlock);
1019       scaleAddKernelCSR<realtype, int>
1020         <<<gridSize, threadsPerBlock, 0, stream>>>(SMCU_NNZ(A),
1021                                                    c,
1022                                                    SMCU_DATAp(A),
1023                                                    SMCU_DATAp(B));
1024       break;
1025     case SUNMAT_CUSPARSE_BCSR:
1026       /* Choose the grid size to be the number of blocks in the matrix,
1027          and then choose threadsPerBlock to be a multiple of the warp size
1028          that results in enough threads to have one per row of the block. */
1029       threadsPerBlock = SMCU_EXECPOLICY(A)->blockSize(SMCU_BLOCKROWS(A));
1030       gridSize = SMCU_EXECPOLICY(A)->gridSize(SMCU_NBLOCKS(A)*SMCU_BLOCKROWS(A), threadsPerBlock);
1031       scaleAddKernelCSR<realtype, int>
1032         <<<gridSize, threadsPerBlock, 0, stream>>>(SMCU_NNZ(A),
1033                                                    c,
1034                                                    SMCU_DATAp(A),
1035                                                    SMCU_DATAp(B));
1036       break;
1037     default:
1038       SUNDIALS_DEBUG_PRINT("ERROR in SUNMatScaleAdd_cuSparse: sparse type not recognized\n");
1039       return(SUNMAT_ILL_INPUT);
1040   }
1041 
1042 #ifdef SUNDIALS_DEBUG_CUDA_LASTERROR
1043   cudaDeviceSynchronize();
1044   if (!SUNDIALS_CUDA_VERIFY(cudaGetLastError())) return(SUNMAT_OPERATION_FAIL);
1045 #endif
1046 
1047   return(SUNMAT_SUCCESS);
1048 }
1049 
1050 /* Setup buffers needed for Matvec */
SUNMatMatvecSetup_cuSparse(SUNMatrix A)1051 int SUNMatMatvecSetup_cuSparse(SUNMatrix A)
1052 {
1053 #if CUDART_VERSION >= 11000
1054   realtype placeholder[1];
1055   const realtype one = ONE;
1056 
1057   /* Check if setup has already been done */
1058   if (!(SMCU_CONTENT(A)->matvec_issetup))
1059   {
1060     SUNDIALS_CUSPARSE_VERIFY( cusparseCreateDnVec(&SMCU_CONTENT(A)->vecX,
1061                                                   SMCU_COLUMNS(A),
1062                                                   placeholder, CUDA_R_XF) );
1063     SUNDIALS_CUSPARSE_VERIFY( cusparseCreateDnVec(&SMCU_CONTENT(A)->vecY,
1064                                                   SMCU_ROWS(A),
1065                                                   placeholder, CUDA_R_XF) );
1066 
1067     SUNDIALS_CUSPARSE_VERIFY(
1068       cusparseSpMV_bufferSize(SMCU_CUSPHANDLE(A),
1069                               CUSPARSE_OPERATION_NON_TRANSPOSE,
1070                               &one, SMCU_CONTENT(A)->spmat_descr,
1071                               SMCU_CONTENT(A)->vecX, &one, SMCU_CONTENT(A)->vecY,
1072                               CUDA_R_XF, CUSPARSE_MV_ALG_DEFAULT,
1073                               &SMCU_CONTENT(A)->bufferSize) );
1074 
1075     if ( SUNMemoryHelper_Alloc(SMCU_MEMHELP(A), &SMCU_CONTENT(A)->dBufferMem,
1076                                SMCU_CONTENT(A)->bufferSize, SUNMEMTYPE_DEVICE) )
1077       return(SUNMAT_OPERATION_FAIL);
1078   }
1079 #endif
1080   SMCU_CONTENT(A)->matvec_issetup = SUNTRUE;
1081   return(SUNMAT_SUCCESS);
1082 }
1083 
1084 /* Perform y = Ax */
SUNMatMatvec_cuSparse(SUNMatrix A,N_Vector x,N_Vector y)1085 int SUNMatMatvec_cuSparse(SUNMatrix A, N_Vector x, N_Vector y)
1086 {
1087   /* Verify that the dimensions of A, x, and y agree */
1088   if ( (SMCU_COLUMNS(A) != N_VGetLength(x)) ||
1089        (SMCU_ROWS(A) != N_VGetLength(y)) )
1090   {
1091     SUNDIALS_DEBUG_PRINT("ERROR in SUNMatMatvec_cuSparse: dimensions do not agree\n");
1092     return(SUNMAT_ILL_INPUT);
1093   }
1094 
1095   realtype *d_xdata = N_VGetDeviceArrayPointer(x);
1096   realtype *d_ydata = N_VGetDeviceArrayPointer(y);
1097 
1098   if (SMCU_SPARSETYPE(A) == SUNMAT_CUSPARSE_CSR)
1099   {
1100     const realtype one = ONE;
1101 
1102     /* Zero result vector */
1103     N_VConst(ZERO, y);
1104 
1105 #if CUDART_VERSION >= 11000
1106     {
1107       /* Setup matvec if it has not been done yet */
1108       if (!SMCU_CONTENT(A)->matvec_issetup && SUNMatMatvecSetup_cuSparse(A))
1109       {
1110         return(SUNMAT_OPERATION_FAIL);
1111       }
1112 
1113       SUNDIALS_CUSPARSE_VERIFY( cusparseDnVecSetValues(SMCU_CONTENT(A)->vecX,
1114                                                        d_xdata) );
1115       SUNDIALS_CUSPARSE_VERIFY( cusparseDnVecSetValues(SMCU_CONTENT(A)->vecY,
1116                                                        d_ydata) );
1117 
1118       SUNDIALS_CUSPARSE_VERIFY( cusparseSpMV(SMCU_CUSPHANDLE(A),
1119                                              CUSPARSE_OPERATION_NON_TRANSPOSE,
1120                                              &one, SMCU_CONTENT(A)->spmat_descr,
1121                                              SMCU_CONTENT(A)->vecX, &one,
1122                                              SMCU_CONTENT(A)->vecY, CUDA_R_XF,
1123                                              CUSPARSE_MV_ALG_DEFAULT,
1124                                              SMCU_CONTENT(A)->dBufferMem->ptr) );
1125     }
1126 #else
1127     SUNDIALS_CUSPARSE_VERIFY(
1128       cusparseXcsrmv(SMCU_CUSPHANDLE(A), CUSPARSE_OPERATION_NON_TRANSPOSE,
1129                      SMCU_ROWS(A), SMCU_COLUMNS(A), SMCU_NNZ(A),
1130                      &one, SMCU_MATDESCR(A), SMCU_DATAp(A), SMCU_INDEXPTRSp(A),
1131                      SMCU_INDEXVALSp(A), d_xdata, &one, d_ydata) );
1132 #endif
1133   }
1134   else if (SMCU_SPARSETYPE(A) == SUNMAT_CUSPARSE_BCSR)
1135   {
1136     cudaStream_t stream;
1137     unsigned gridSize, threadsPerBlock;
1138 
1139     stream = *SMCU_EXECPOLICY(A)->stream();
1140 
1141     /* Choose the grid size to be the number of blocks in the matrix,
1142        and then choose threadsPerBlock to be a multiple of the warp size
1143        that results in enough threads to have one per row of the block. */
1144     threadsPerBlock = SMCU_EXECPOLICY(A)->blockSize(SMCU_COLUMNS(A)/2);
1145     gridSize = SMCU_EXECPOLICY(A)->gridSize(SMCU_ROWS(A)*SMCU_COLUMNS(A)/2, threadsPerBlock);
1146     matvecBCSR<realtype, int>
1147       <<<gridSize, threadsPerBlock, 0, stream>>>(SMCU_BLOCKROWS(A),
1148                                                  SMCU_NBLOCKS(A),
1149                                                  SMCU_BLOCKNNZ(A),
1150                                                  SMCU_DATAp(A),
1151                                                  SMCU_INDEXPTRSp(A),
1152                                                  SMCU_INDEXVALSp(A),
1153                                                  d_xdata,
1154                                                  d_ydata);
1155   }
1156   else
1157   {
1158     SUNDIALS_DEBUG_PRINT("ERROR in SUNMatMatvec_cuSparse: sparse type not recognized\n");
1159     return(SUNMAT_ILL_INPUT);
1160   }
1161 
1162 #ifdef SUNDIALS_DEBUG_CUDA_LASTERROR
1163     cudaDeviceSynchronize();
1164     if (!SUNDIALS_CUDA_VERIFY(cudaGetLastError())) return(SUNMAT_OPERATION_FAIL);
1165 #endif
1166 
1167   return(SUNMAT_SUCCESS);
1168 }
1169 
1170 
1171 /*
1172  * =================================================================
1173  * private functions
1174  * =================================================================
1175  */
1176 
1177 
1178 /* -----------------------------------------------------------------
1179  * Function to check compatibility of two sparse SUNMatrix objects
1180  */
SMCompatible_cuSparse(SUNMatrix A,SUNMatrix B)1181 static booleantype SMCompatible_cuSparse(SUNMatrix A, SUNMatrix B)
1182 {
1183   /* both matrices must be sparse */
1184   if ( (SUNMatGetID(A) != SUNMATRIX_CUSPARSE) ||
1185        (SUNMatGetID(B) != SUNMATRIX_CUSPARSE) )
1186     return(SUNFALSE);
1187 
1188   /* both matrices must have the same shape and sparsity type */
1189   if (SMCU_ROWS(A) != SMCU_ROWS(B))
1190     return(SUNFALSE);
1191   if (SMCU_COLUMNS(A) != SMCU_COLUMNS(B))
1192     return(SUNFALSE);
1193   if (SMCU_SPARSETYPE(A) != SMCU_SPARSETYPE(B))
1194     return(SUNFALSE);
1195 
1196   return(SUNTRUE);
1197 }
1198 
1199 /* -----------------------------------------------------------------
1200  * Function to create empty SUNMatrix with ops attached and
1201  * the content structure allocated.
1202  */
SUNMatrix_cuSparse_NewEmpty()1203 SUNMatrix SUNMatrix_cuSparse_NewEmpty()
1204 {
1205   /* Create an empty matrix object */
1206   SUNMatrix A = NULL;
1207   A = SUNMatNewEmpty();
1208   if (A == NULL)
1209   {
1210     SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_cuSparse_NewEmpty: SUNMatNewEmpty failed\n");
1211     return(NULL);
1212   }
1213 
1214   /* Attach operations */
1215   A->ops->getid       = SUNMatGetID_cuSparse;
1216   A->ops->clone       = SUNMatClone_cuSparse;
1217   A->ops->destroy     = SUNMatDestroy_cuSparse;
1218   A->ops->zero        = SUNMatZero_cuSparse;
1219   A->ops->copy        = SUNMatCopy_cuSparse;
1220   A->ops->scaleadd    = SUNMatScaleAdd_cuSparse;
1221   A->ops->scaleaddi   = SUNMatScaleAddI_cuSparse;
1222   A->ops->matvecsetup = SUNMatMatvecSetup_cuSparse;
1223   A->ops->matvec      = SUNMatMatvec_cuSparse;
1224 
1225   /* Create content */
1226   SUNMatrix_Content_cuSparse content = NULL;
1227   content = (SUNMatrix_Content_cuSparse) malloc(sizeof *content);
1228   if (content == NULL)
1229   {
1230     SUNDIALS_DEBUG_PRINT("ERROR in SUNMatrix_cuSparse_NewEmpty: failed to malloc content\n");
1231     SUNMatDestroy(A);
1232     return(NULL);
1233   }
1234 
1235   /* Attach content */
1236   A->content = content;
1237   content->mem_helper = NULL;
1238 
1239   return(A);
1240 }
1241 
1242 #if CUDART_VERSION >= 11000
CreateSpMatDescr(SUNMatrix A,cusparseSpMatDescr_t * spmat_descr)1243 cusparseStatus_t CreateSpMatDescr(SUNMatrix A, cusparseSpMatDescr_t *spmat_descr)
1244 {
1245   /* CUDA 11 introduced the "Generic API" and removed the cusparseXcsrmv that
1246     works on the old cusparseMatDescr_t and raw data arrays. However,
1247     cuSolverSp stuff requires the cusparseMatDescr_t still. So, we have to
1248     create this cusparseSpMatDescr_t *and* the cusparseMatDescr_t. */
1249   return(cusparseCreateCsr(spmat_descr, SMCU_ROWS(A), SMCU_COLUMNS(A),
1250                            SMCU_NNZ(A), SMCU_INDEXPTRSp(A),
1251                            SMCU_INDEXVALSp(A), SMCU_DATAp(A),
1252                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
1253                            CUSPARSE_INDEX_BASE_ZERO, CUDA_R_XF));
1254 }
1255 #endif
1256