1 /*
2 Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library.
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16
17 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
19 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc., we list them in
20    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command-line options for them (see the illustration following these tables).
21
22 typedef enum {
23 CUSPARSE_MV_ALG_DEFAULT = 0,
24 CUSPARSE_COOMV_ALG = 1,
25 CUSPARSE_CSRMV_ALG1 = 2,
26 CUSPARSE_CSRMV_ALG2 = 3
27 } cusparseSpMVAlg_t;
28
29 typedef enum {
30 CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
31 CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1,
32 CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2,
33 CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3,
34 CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4,
35 CUSPARSE_SPMM_ALG_DEFAULT = 0,
36 CUSPARSE_SPMM_COO_ALG1 = 1,
37 CUSPARSE_SPMM_COO_ALG2 = 2,
38 CUSPARSE_SPMM_COO_ALG3 = 3,
39 CUSPARSE_SPMM_COO_ALG4 = 5,
40 CUSPARSE_SPMM_CSR_ALG1 = 4,
41 CUSPARSE_SPMM_CSR_ALG2 = 6,
42 } cusparseSpMMAlg_t;
43
44 typedef enum {
45     CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
46     CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
47 } cusparseCsr2CscAlg_t;
48 */
49 const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
50 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
51 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
52 #endif
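/* A small illustration of how these tables are consumed (a sketch; the convention is value
   names listed in 0-based enum order, then the enum type name, an option prefix, and a
   terminating 0, as PetscOptionsEnum() expects): the command-line option

     -mat_cusparse_spmv_alg CSRMV_ALG1

   matches entry 2 of MatCUSPARSESpMVAlgorithms[], so the parsed PetscEnum value is 2, which
   must coincide with CUSPARSE_CSRMV_ALG1 = 2 above. The consistency checks in
   MatSetFromOptions_SeqAIJCUSPARSE() below guard exactly this assumption. */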
53
54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57
58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61
62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
68 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
69 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
71 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
72 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
73 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
74
75 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
76 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
77 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
78 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
79 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
80 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
81
82 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
83 {
84 cusparseStatus_t stat;
85 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
86
87 PetscFunctionBegin;
88 if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
89 cusparsestruct->stream = stream;
90 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
91 PetscFunctionReturn(0);
92 }
93
94 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
95 {
96 cusparseStatus_t stat;
97 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
98
99 PetscFunctionBegin;
100 if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
101 if (cusparsestruct->handle != handle) {
102 if (cusparsestruct->handle) {
103 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
104 }
105 cusparsestruct->handle = handle;
106 }
107 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
108 PetscFunctionReturn(0);
109 }
110
111 PetscErrorCode MatCUSPARSEClearHandle(Mat A)
112 {
113 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
114
115 PetscFunctionBegin;
116 if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
117 if (cusparsestruct->handle) cusparsestruct->handle = 0;
118 PetscFunctionReturn(0);
119 }
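/* A minimal usage sketch for the two setters above (assumes the caller owns the stream and
   handle, and that A has already been set up as a MATSEQAIJCUSPARSE matrix; error checking
   omitted):

     cudaStream_t     stream;
     cusparseHandle_t handle;
     cudaStreamCreate(&stream);
     cusparseCreate(&handle);
     MatCUSPARSESetHandle(A,handle);   // A adopts and reuses this cuSPARSE handle
     MatCUSPARSESetStream(A,stream);   // cuSPARSE calls on A are issued on this stream
*/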
120
121 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
122 {
123 PetscFunctionBegin;
124 *type = MATSOLVERCUSPARSE;
125 PetscFunctionReturn(0);
126 }
127
128 /*MC
129   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
130   on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
131   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
132   performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
133   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
134   algorithms are not recommended. This class does NOT support direct solver operations.
135
136 Level: beginner
137
138 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
139 M*/
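/* A minimal usage sketch (option and routine names follow standard PETSc conventions):

     ./app -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse

   or, programmatically,

     PC pc;
     KSPGetPC(ksp,&pc);
     PCSetType(pc,PCILU);
     PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE);
*/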
140
141 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
142 {
143 PetscErrorCode ierr;
144 PetscInt n = A->rmap->n;
145
146 PetscFunctionBegin;
147 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
148 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
149 (*B)->factortype = ftype;
150 (*B)->useordering = PETSC_TRUE;
151 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
152
153 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
154 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
155 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
156 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
157 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
158 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
159 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
160 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
161
162 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
163 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
164 PetscFunctionReturn(0);
165 }
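/* A sketch of driving the factorization interface registered above directly, rather than
   through a PC (error checking omitted; the ordering choice is illustrative only):

     Mat           F;
     IS            isrow,iscol;
     MatFactorInfo info;

     MatFactorInfoInitialize(&info);
     MatGetOrdering(A,MATORDERINGND,&isrow,&iscol);
     MatGetFactor(A,MATSOLVERCUSPARSE,MAT_FACTOR_ILU,&F);
     MatILUFactorSymbolic(F,A,isrow,iscol,&info);
     MatLUFactorNumeric(F,A,&info);
     MatSolve(F,b,x);
*/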
166
167 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
168 {
169 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
170
171 PetscFunctionBegin;
172 switch (op) {
173 case MAT_CUSPARSE_MULT:
174 cusparsestruct->format = format;
175 break;
176 case MAT_CUSPARSE_ALL:
177 cusparsestruct->format = format;
178 break;
179 default:
180 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
181 }
182 PetscFunctionReturn(0);
183 }
184
185 /*@
186 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
187 operation. Only the MatMult operation can use different GPU storage formats
188 for MPIAIJCUSPARSE matrices.
189 Not Collective
190
191 Input Parameters:
192 + A - Matrix of type SEQAIJCUSPARSE
193 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
194 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
195
198 Level: intermediate
199
200 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
201 @*/
202 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
203 {
204 PetscErrorCode ierr;
205
206 PetscFunctionBegin;
207 PetscValidHeaderSpecific(A, MAT_CLASSID,1);
208 ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
209 PetscFunctionReturn(0);
210 }
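/* A minimal usage sketch (MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are only available with CUDA
   toolkits older than 11, as noted elsewhere in this file):

     MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_ELL);   // SpMV only
     MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_CSR);    // all operations

   The same settings are reachable from the command line via -mat_cusparse_mult_storage_format
   and -mat_cusparse_storage_format, parsed in MatSetFromOptions_SeqAIJCUSPARSE() below. */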
211
212 /*@
213    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
214
215 Collective on mat
216
217 Input Parameters:
218 + A - Matrix of type SEQAIJCUSPARSE
219 - transgen - the boolean flag
220
221 Level: intermediate
222
223 .seealso: MATSEQAIJCUSPARSE
224 @*/
225 PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
226 {
227 PetscErrorCode ierr;
228 PetscBool flg;
229
230 PetscFunctionBegin;
231 PetscValidHeaderSpecific(A,MAT_CLASSID,1);
232 ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
233 if (flg) {
234 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
235
236 if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
237 cusp->transgen = transgen;
238     if (!transgen) { /* need to destroy the transpose matrix if present to prevent logic errors if transgen is set to true later */
239 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
240 }
241 }
242 PetscFunctionReturn(0);
243 }
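/* A minimal usage sketch:

     MatSeqAIJCUSPARSESetGenerateTranspose(A,PETSC_TRUE);  // build and cache A^T on the GPU
     MatMultTranspose(A,x,y);                              // reuses the cached transpose

   The same flag is reachable from the command line via -mat_cusparse_transgen, parsed in
   MatSetFromOptions_SeqAIJCUSPARSE() below. */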
244
245 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
246 {
247 PetscErrorCode ierr;
248 MatCUSPARSEStorageFormat format;
249 PetscBool flg;
250 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
251
252 PetscFunctionBegin;
253 ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
254 if (A->factortype == MAT_FACTOR_NONE) {
255 PetscBool transgen = cusparsestruct->transgen;
256
257 ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
258 if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}
259
260 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
261 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
262 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
263
264 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
265 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
266 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
267 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
268 cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
269 ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
270 "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
271 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
272 if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
273
274 cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
275 ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
276 "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
277 if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
278
279 cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
280 ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
281 "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
282 if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
283 #endif
284 }
285 ierr = PetscOptionsTail();CHKERRQ(ierr);
286 PetscFunctionReturn(0);
287 }
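/* An example invocation exercising the options parsed above (a sketch; the algorithm names
   are the entries of the tables defined at the top of this file):

     ./app -mat_type aijcusparse -mat_cusparse_transgen \
           -mat_cusparse_storage_format csr \
           -mat_cusparse_spmv_alg CSRMV_ALG1 -mat_cusparse_spmm_alg CSR_ALG1 \
           -mat_cusparse_csr2csc_alg ALG1
*/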
288
289 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
290 {
291 PetscErrorCode ierr;
292
293 PetscFunctionBegin;
294 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
295 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
296 PetscFunctionReturn(0);
297 }
298
299 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
300 {
301 PetscErrorCode ierr;
302
303 PetscFunctionBegin;
304 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
305 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
306 PetscFunctionReturn(0);
307 }
308
309 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
310 {
311 PetscErrorCode ierr;
312
313 PetscFunctionBegin;
314 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
315 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
316 PetscFunctionReturn(0);
317 }
318
319 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
320 {
321 PetscErrorCode ierr;
322
323 PetscFunctionBegin;
324 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
325 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
326 PetscFunctionReturn(0);
327 }
328
329 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
330 {
331 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
332 PetscInt n = A->rmap->n;
333 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
334 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
335 cusparseStatus_t stat;
336 const PetscInt *ai = a->i,*aj = a->j,*vi;
337 const MatScalar *aa = a->a,*v;
338 PetscInt *AiLo, *AjLo;
339 PetscScalar *AALo;
340 PetscInt i,nz, nzLower, offset, rowOffset;
341 PetscErrorCode ierr;
342 cudaError_t cerr;
343
344 PetscFunctionBegin;
345 if (!n) PetscFunctionReturn(0);
346 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
347 try {
348 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
349 nzLower=n+ai[n]-ai[1];
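      /* For example (a small illustration of the count as used here): with n=3 and
         ai = {0,0,1,3}, rows 1 and 2 contribute ai[n]-ai[1] = 3 strictly lower entries and
         every row contributes one unit diagonal entry, so nzLower = 3 + 3 = 6. */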
350
351 /* Allocate Space for the lower triangular matrix */
352 cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
353 cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
354 cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
355
356 /* Fill the lower triangular matrix */
357 AiLo[0] = (PetscInt) 0;
358 AiLo[n] = nzLower;
359 AjLo[0] = (PetscInt) 0;
360 AALo[0] = (MatScalar) 1.0;
361 v = aa;
362 vi = aj;
363 offset = 1;
364 rowOffset= 1;
365 for (i=1; i<n; i++) {
366 nz = ai[i+1] - ai[i];
367 /* additional 1 for the term on the diagonal */
368 AiLo[i] = rowOffset;
369 rowOffset += nz+1;
370
371 ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
372 ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
373
374 offset += nz;
375 AjLo[offset] = (PetscInt) i;
376 AALo[offset] = (MatScalar) 1.0;
377 offset += 1;
378
379 v += nz;
380 vi += nz;
381 }
382
383 /* allocate space for the triangular factor information */
384 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
385
386 /* Create the matrix description */
387 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
388 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
389 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
390 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
391 #else
392 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
393 #endif
394 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
395 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
396
397 /* set the operation */
398 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
399
400 /* set the matrix */
401 loTriFactor->csrMat = new CsrMatrix;
402 loTriFactor->csrMat->num_rows = n;
403 loTriFactor->csrMat->num_cols = n;
404 loTriFactor->csrMat->num_entries = nzLower;
405
406 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
407 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
408
409 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
410 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
411
412 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
413 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
414
415 /* Create the solve analysis information */
416 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
417 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
418 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
419 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
420 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
421 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
422 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
423 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
424 #endif
425
426 /* perform the solve analysis */
427 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
428 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
429 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
430 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
431 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
432 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
433 #endif
434 );CHKERRCUSPARSE(stat);
435
436 /* assign the pointer. Is this really necessary? */
437 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
438
439 cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
440 cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
441 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
442 ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
443 } catch(char *ex) {
444 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
445 }
446 }
447 PetscFunctionReturn(0);
448 }
449
450 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
451 {
452 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
453 PetscInt n = A->rmap->n;
454 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
455 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
456 cusparseStatus_t stat;
457 const PetscInt *aj = a->j,*adiag = a->diag,*vi;
458 const MatScalar *aa = a->a,*v;
459 PetscInt *AiUp, *AjUp;
460 PetscScalar *AAUp;
461 PetscInt i,nz, nzUpper, offset;
462 PetscErrorCode ierr;
463 cudaError_t cerr;
464
465 PetscFunctionBegin;
466 if (!n) PetscFunctionReturn(0);
467 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
468 try {
469 /* next, figure out the number of nonzeros in the upper triangular matrix. */
470 nzUpper = adiag[0]-adiag[n];
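      /* For example (a small illustration of the layout assumed here): row i of U occupies
         positions adiag[i+1]+1 .. adiag[i] of aa/aj, with the diagonal stored at adiag[i];
         with n=2 and adiag = {4,2,1}, row 1 holds one entry (position 2) and row 0 holds two
         (positions 3..4), so nzUpper = adiag[0]-adiag[n] = 4-1 = 3. */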
471
472 /* Allocate Space for the upper triangular matrix */
473 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
474 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
475 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
476
477 /* Fill the upper triangular matrix */
478 AiUp[0]=(PetscInt) 0;
479 AiUp[n]=nzUpper;
480 offset = nzUpper;
481 for (i=n-1; i>=0; i--) {
482 v = aa + adiag[i+1] + 1;
483 vi = aj + adiag[i+1] + 1;
484
485 /* number of elements NOT on the diagonal */
486 nz = adiag[i] - adiag[i+1]-1;
487
488 /* decrement the offset */
489 offset -= (nz+1);
490
491 /* first, set the diagonal elements */
492 AjUp[offset] = (PetscInt) i;
493 AAUp[offset] = (MatScalar)1./v[nz];
494 AiUp[i] = AiUp[i+1] - (nz+1);
495
496 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
497 ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
498 }
499
500 /* allocate space for the triangular factor information */
501 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
502
503 /* Create the matrix description */
504 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
505 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
506 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
507 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
508 #else
509 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
510 #endif
511 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
512 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
513
514 /* set the operation */
515 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
516
517 /* set the matrix */
518 upTriFactor->csrMat = new CsrMatrix;
519 upTriFactor->csrMat->num_rows = n;
520 upTriFactor->csrMat->num_cols = n;
521 upTriFactor->csrMat->num_entries = nzUpper;
522
523 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
524 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
525
526 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
527 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
528
529 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
530 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
531
532 /* Create the solve analysis information */
533 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
534 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
535 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
536 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
537 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
538 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
539 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
540 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
541 #endif
542
543 /* perform the solve analysis */
544 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
545 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
546 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
547 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
548 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
549 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
550 #endif
551 );CHKERRCUSPARSE(stat);
552
553 /* assign the pointer. Is this really necessary? */
554 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
555
556 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
557 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
558 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
559 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
560 } catch(char *ex) {
561 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
562 }
563 }
564 PetscFunctionReturn(0);
565 }
566
567 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
568 {
569 PetscErrorCode ierr;
570 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
571 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
572 IS isrow = a->row,iscol = a->icol;
573 PetscBool row_identity,col_identity;
574 const PetscInt *r,*c;
575 PetscInt n = A->rmap->n;
576
577 PetscFunctionBegin;
578 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
579 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
580 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
581
582 cusparseTriFactors->workVector = new THRUSTARRAY(n);
583 cusparseTriFactors->nnz=a->nz;
584
585 A->offloadmask = PETSC_OFFLOAD_BOTH;
586 /* lower triangular indices */
587 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
588 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
589 if (!row_identity) {
590 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
591 cusparseTriFactors->rpermIndices->assign(r, r+n);
592 }
593 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
594
595 /* upper triangular indices */
596 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
597 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
598 if (!col_identity) {
599 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
600 cusparseTriFactors->cpermIndices->assign(c, c+n);
601 }
602
603 if (!row_identity && !col_identity) {
604 ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
605   } else if (!row_identity) {
606     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
607   } else if (!col_identity) {
608 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
609 }
610
611 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
612 PetscFunctionReturn(0);
613 }
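/* A sketch of how the permutation arrays set up above are used (this assumes the behaviour of
   MatSolve_SeqAIJCUSPARSE later in this file); conceptually, for each entry k:

     tmp[k]      = b[rperm[k]];   // gather the right-hand side into factored row order
     ...                          // lower, then upper, triangular solves on tmp
     x[cperm[k]] = tmp[k];        // scatter the result back to the original column order
*/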
614
615 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
616 {
617 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
618 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
619 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
620 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
621 cusparseStatus_t stat;
622 PetscErrorCode ierr;
623 cudaError_t cerr;
624 PetscInt *AiUp, *AjUp;
625 PetscScalar *AAUp;
626 PetscScalar *AALo;
627 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
628 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data;
629 const PetscInt *ai = b->i,*aj = b->j,*vj;
630 const MatScalar *aa = b->a,*v;
631
632 PetscFunctionBegin;
633 if (!n) PetscFunctionReturn(0);
634 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
635 try {
636 /* Allocate Space for the upper triangular matrix */
637 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
638 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
639 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
640 cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
641
642 /* Fill the upper triangular matrix */
643 AiUp[0]=(PetscInt) 0;
644 AiUp[n]=nzUpper;
645 offset = 0;
646 for (i=0; i<n; i++) {
647 /* set the pointers */
648 v = aa + ai[i];
649 vj = aj + ai[i];
650 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
651
652 /* first, set the diagonal elements */
653 AjUp[offset] = (PetscInt) i;
654 AAUp[offset] = (MatScalar)1.0/v[nz];
655 AiUp[i] = offset;
656 AALo[offset] = (MatScalar)1.0/v[nz];
657
658 offset+=1;
659 if (nz>0) {
660 ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
661 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
662 for (j=offset; j<offset+nz; j++) {
663 AAUp[j] = -AAUp[j];
664 AALo[j] = AAUp[j]/v[nz];
665 }
666 offset+=nz;
667 }
668 }
669
670 /* allocate space for the triangular factor information */
671 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
672
673 /* Create the matrix description */
674 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
675 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
676 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
677 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
678 #else
679 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
680 #endif
681 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
682 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
683
684 /* set the matrix */
685 upTriFactor->csrMat = new CsrMatrix;
686 upTriFactor->csrMat->num_rows = A->rmap->n;
687 upTriFactor->csrMat->num_cols = A->cmap->n;
688 upTriFactor->csrMat->num_entries = a->nz;
689
690 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
691 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
692
693 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
694 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
695
696 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
697 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
698
699 /* set the operation */
700 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
701
702 /* Create the solve analysis information */
703 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
704 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
705 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
706 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
707 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
708 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
709 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
710 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
711 #endif
712
713 /* perform the solve analysis */
714 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
715 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
716 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
717 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
718 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
719 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
720 #endif
721 );CHKERRCUSPARSE(stat);
722
723 /* assign the pointer. Is this really necessary? */
724 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
725
726 /* allocate space for the triangular factor information */
727 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
728
729 /* Create the matrix description */
730 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
731 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
732 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
733 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
734 #else
735 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
736 #endif
737 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
738 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
739
740 /* set the operation */
741 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
742
743 /* set the matrix */
744 loTriFactor->csrMat = new CsrMatrix;
745 loTriFactor->csrMat->num_rows = A->rmap->n;
746 loTriFactor->csrMat->num_cols = A->cmap->n;
747 loTriFactor->csrMat->num_entries = a->nz;
748
749 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
750 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
751
752 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
753 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
754
755 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
756 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
757 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
758
759 /* Create the solve analysis information */
760 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
761 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
762 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
763 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
764 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
765 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
766 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
767 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
768 #endif
769
770 /* perform the solve analysis */
771 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
772 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
773 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
774 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
775 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
776 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
777 #endif
778 );CHKERRCUSPARSE(stat);
779
780 /* assign the pointer. Is this really necessary? */
781 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
782
783 A->offloadmask = PETSC_OFFLOAD_BOTH;
784 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
785 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
786 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
787 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
788 } catch(char *ex) {
789 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
790 }
791 }
792 PetscFunctionReturn(0);
793 }
794
795 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
796 {
797 PetscErrorCode ierr;
798 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
799 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
800 IS ip = a->row;
801 const PetscInt *rip;
802 PetscBool perm_identity;
803 PetscInt n = A->rmap->n;
804
805 PetscFunctionBegin;
806 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
807 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
808 cusparseTriFactors->workVector = new THRUSTARRAY(n);
809 cusparseTriFactors->nnz=(a->nz-n)*2 + n;
810
811 /* lower triangular indices */
812 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
813 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
814 if (!perm_identity) {
815 IS iip;
816 const PetscInt *irip;
817
818 ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
819 ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
820 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
821 cusparseTriFactors->rpermIndices->assign(rip, rip+n);
822 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
823 cusparseTriFactors->cpermIndices->assign(irip, irip+n);
824 ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
825 ierr = ISDestroy(&iip);CHKERRQ(ierr);
826 ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
827 }
828 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
829 PetscFunctionReturn(0);
830 }
831
832 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
833 {
834 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
835 IS isrow = b->row,iscol = b->col;
836 PetscBool row_identity,col_identity;
837 PetscErrorCode ierr;
838
839 PetscFunctionBegin;
840 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
841 B->offloadmask = PETSC_OFFLOAD_CPU;
842 /* determine which version of MatSolve needs to be used. */
843 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
844 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
845 if (row_identity && col_identity) {
846 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
847 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
848 B->ops->matsolve = NULL;
849 B->ops->matsolvetranspose = NULL;
850 } else {
851 B->ops->solve = MatSolve_SeqAIJCUSPARSE;
852 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
853 B->ops->matsolve = NULL;
854 B->ops->matsolvetranspose = NULL;
855 }
856
857 /* get the triangular factors */
858 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
859 PetscFunctionReturn(0);
860 }
861
862 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
863 {
864 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
865 IS ip = b->row;
866 PetscBool perm_identity;
867 PetscErrorCode ierr;
868
869 PetscFunctionBegin;
870 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
871 B->offloadmask = PETSC_OFFLOAD_CPU;
872 /* determine which version of MatSolve needs to be used. */
873 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
874 if (perm_identity) {
875 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
876 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
877 B->ops->matsolve = NULL;
878 B->ops->matsolvetranspose = NULL;
879 } else {
880 B->ops->solve = MatSolve_SeqAIJCUSPARSE;
881 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
882 B->ops->matsolve = NULL;
883 B->ops->matsolvetranspose = NULL;
884 }
885
886 /* get the triangular factors */
887 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
888 PetscFunctionReturn(0);
889 }
890
891 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
892 {
893 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
894 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
895 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
896 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
897 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
898 cusparseStatus_t stat;
899 cusparseIndexBase_t indexBase;
900 cusparseMatrixType_t matrixType;
901 cusparseFillMode_t fillMode;
902 cusparseDiagType_t diagType;
903
904 PetscFunctionBegin;
905
906 /*********************************************/
907 /* Now the Transpose of the Lower Tri Factor */
908 /*********************************************/
909
910 /* allocate space for the transpose of the lower triangular factor */
911 loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
912
913 /* set the matrix descriptors of the lower triangular factor */
914 matrixType = cusparseGetMatType(loTriFactor->descr);
915 indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
916 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
917 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
918 diagType = cusparseGetMatDiagType(loTriFactor->descr);
919
920 /* Create the matrix description */
921 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
922 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
923 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
924 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
925 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
926
927 /* set the operation */
928 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
929
930 /* allocate GPU space for the CSC of the lower triangular factor*/
931 loTriFactorT->csrMat = new CsrMatrix;
932 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
933 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
934 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
935 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
936 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
937 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
938
939 /* compute the transpose of the lower triangular factor, i.e. the CSC */
940 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
941 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
942 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
943 loTriFactor->csrMat->values->data().get(),
944 loTriFactor->csrMat->row_offsets->data().get(),
945 loTriFactor->csrMat->column_indices->data().get(),
946 loTriFactorT->csrMat->values->data().get(),
947 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
948 CUSPARSE_ACTION_NUMERIC,indexBase,
949 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
950 cudaError_t cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
951 #endif
952
953 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
954 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
955 loTriFactor->csrMat->values->data().get(),
956 loTriFactor->csrMat->row_offsets->data().get(),
957 loTriFactor->csrMat->column_indices->data().get(),
958 loTriFactorT->csrMat->values->data().get(),
959 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
960 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
961 CUSPARSE_ACTION_NUMERIC, indexBase,
962 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
963 #else
964 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
965 CUSPARSE_ACTION_NUMERIC, indexBase
966 #endif
967 );CHKERRCUSPARSE(stat);
968
969 /* Create the solve analysis information */
970 stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
971 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
972 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
973 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
974 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
975 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
976 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
977 cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
978 #endif
979
980 /* perform the solve analysis */
981 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
982 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
983 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
984 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
985 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
986 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
987 #endif
988 );CHKERRCUSPARSE(stat);
989
990 /* assign the pointer. Is this really necessary? */
991 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
992
993 /*********************************************/
994 /* Now the Transpose of the Upper Tri Factor */
995 /*********************************************/
996
997 /* allocate space for the transpose of the upper triangular factor */
998 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
999
1000 /* set the matrix descriptors of the upper triangular factor */
1001 matrixType = cusparseGetMatType(upTriFactor->descr);
1002 indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1003 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1004 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1005 diagType = cusparseGetMatDiagType(upTriFactor->descr);
1006
1007 /* Create the matrix description */
1008 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1009 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1010 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1011 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1012 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1013
1014 /* set the operation */
1015 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1016
1017 /* allocate GPU space for the CSC of the upper triangular factor*/
1018 upTriFactorT->csrMat = new CsrMatrix;
1019 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
1020 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
1021 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
1022 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1023 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1024 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1025
1026 /* compute the transpose of the upper triangular factor, i.e. the CSC */
1027 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1028 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1029 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1030 upTriFactor->csrMat->values->data().get(),
1031 upTriFactor->csrMat->row_offsets->data().get(),
1032 upTriFactor->csrMat->column_indices->data().get(),
1033 upTriFactorT->csrMat->values->data().get(),
1034 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1035 CUSPARSE_ACTION_NUMERIC,indexBase,
1036 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1037 cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1038 #endif
1039
1040 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1041 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1042 upTriFactor->csrMat->values->data().get(),
1043 upTriFactor->csrMat->row_offsets->data().get(),
1044 upTriFactor->csrMat->column_indices->data().get(),
1045 upTriFactorT->csrMat->values->data().get(),
1046 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1047 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1048 CUSPARSE_ACTION_NUMERIC, indexBase,
1049 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
1050 #else
1051 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1052 CUSPARSE_ACTION_NUMERIC, indexBase
1053 #endif
1054 );CHKERRCUSPARSE(stat);
1055
1056 /* Create the solve analysis information */
1057 stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1058 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1059 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1060 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1061 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1062 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1063 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1064 cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1065 #endif
1066
1067 /* perform the solve analysis */
1068 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1069 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1070 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1071 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
1072 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1073 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1074 #endif
1075 );CHKERRCUSPARSE(stat);
1076
1077 /* assign the pointer. Is this really necessary? */
1078 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1079 PetscFunctionReturn(0);
1080 }
1081
1082 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
1083 {
1084 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1085 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1086 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1087 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1088 cusparseStatus_t stat;
1089 cusparseIndexBase_t indexBase;
1090 cudaError_t err;
1091 PetscErrorCode ierr;
1092
1093 PetscFunctionBegin;
1094 if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
1095 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1096 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1097 /* create cusparse matrix */
1098 matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1099 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1100 indexBase = cusparseGetMatIndexBase(matstruct->descr);
1101 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
1102 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1103
1104 /* set alpha and beta */
1105 err = cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar));CHKERRCUDA(err);
1106 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1107 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1108 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1109 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1110 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1111 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1112
1113 if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1114 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1115 CsrMatrix *matrixT= new CsrMatrix;
1116 matrixT->num_rows = A->cmap->n;
1117 matrixT->num_cols = A->rmap->n;
1118 matrixT->num_entries = a->nz;
1119 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1120 matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1121 matrixT->values = new THRUSTARRAY(a->nz);
1122
1123 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
1124 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1125
1126 /* compute the transpose, i.e. the CSC */
1127 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1128 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1129 A->cmap->n, matrix->num_entries,
1130 matrix->values->data().get(),
1131 cusparsestruct->rowoffsets_gpu->data().get(),
1132 matrix->column_indices->data().get(),
1133 matrixT->values->data().get(),
1134 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1135 CUSPARSE_ACTION_NUMERIC,indexBase,
1136 cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1137 err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
1138 #endif
1139
1140 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1141 A->cmap->n, matrix->num_entries,
1142 matrix->values->data().get(),
1143 cusparsestruct->rowoffsets_gpu->data().get(),
1144 matrix->column_indices->data().get(),
1145 matrixT->values->data().get(),
1146 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1147 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1148 CUSPARSE_ACTION_NUMERIC,indexBase,
1149 cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1150 #else
1151 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1152 CUSPARSE_ACTION_NUMERIC, indexBase
1153 #endif
1154 );CHKERRCUSPARSE(stat);
1155 matstructT->mat = matrixT;
1156
1157 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1158 stat = cusparseCreateCsr(&matstructT->matDescr,
1159 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1160 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1161 matrixT->values->data().get(),
1162 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1163 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1164 #endif
1165 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1166 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1167 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1168 #else
1169 CsrMatrix *temp = new CsrMatrix;
1170 CsrMatrix *tempT = new CsrMatrix;
1171 /* First convert HYB to CSR */
1172 temp->num_rows = A->rmap->n;
1173 temp->num_cols = A->cmap->n;
1174 temp->num_entries = a->nz;
1175 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1176 temp->column_indices = new THRUSTINTARRAY32(a->nz);
1177 temp->values = new THRUSTARRAY(a->nz);
1178
1179 stat = cusparse_hyb2csr(cusparsestruct->handle,
1180 matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1181 temp->values->data().get(),
1182 temp->row_offsets->data().get(),
1183 temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1184
1185 /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1186 tempT->num_rows = A->rmap->n;
1187 tempT->num_cols = A->cmap->n;
1188 tempT->num_entries = a->nz;
1189 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1190 tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1191 tempT->values = new THRUSTARRAY(a->nz);
1192
1193 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1194 temp->num_cols, temp->num_entries,
1195 temp->values->data().get(),
1196 temp->row_offsets->data().get(),
1197 temp->column_indices->data().get(),
1198 tempT->values->data().get(),
1199 tempT->column_indices->data().get(),
1200 tempT->row_offsets->data().get(),
1201 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1202
1203 /* Last, convert CSC to HYB */
1204 cusparseHybMat_t hybMat;
1205 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1206 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1207 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1208 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1209 matstructT->descr, tempT->values->data().get(),
1210 tempT->row_offsets->data().get(),
1211 tempT->column_indices->data().get(),
1212 hybMat, 0, partition);CHKERRCUSPARSE(stat);
1213
1214 /* assign the pointer */
1215 matstructT->mat = hybMat;
1216 /* delete temporaries */
1217 if (tempT) {
1218 if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1219 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1220 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1221 delete (CsrMatrix*) tempT;
1222 }
1223 if (temp) {
1224 if (temp->values) delete (THRUSTARRAY*) temp->values;
1225 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1226 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1227 delete (CsrMatrix*) temp;
1228 }
1229 #endif
1230 }
1231 err = WaitForCUDA();CHKERRCUDA(err);
1232 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1233 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1234 /* the compressed row indices are not used for matTranspose */
1235 matstructT->cprowIndices = NULL;
1236 /* assign the pointer */
1237 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1238 PetscFunctionReturn(0);
1239 }
1240
1241 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1242 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1243 {
1244 PetscInt n = xx->map->n;
1245 const PetscScalar *barray;
1246 PetscScalar *xarray;
1247 thrust::device_ptr<const PetscScalar> bGPU;
1248 thrust::device_ptr<PetscScalar> xGPU;
1249 cusparseStatus_t stat;
1250 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1251 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1252 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1253 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1254 PetscErrorCode ierr;
1255 cudaError_t cerr;
1256
1257 PetscFunctionBegin;
1258 /* Analyze the matrix and create the transpose ... on the fly */
1259 if (!loTriFactorT && !upTriFactorT) {
1260 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1261 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1262 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1263 }
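  /* With A = L*U, the transposed system A^T x = b reads U^T (L^T x) = b, so the (already transposed)
     upper factor is applied first and the lower factor second */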
1264
1265 /* Get the GPU pointers */
1266 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1267 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1268 xGPU = thrust::device_pointer_cast(xarray);
1269 bGPU = thrust::device_pointer_cast(barray);
1270
1271 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1272 /* First, reorder with the row permutation */
1273 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1274 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1275 xGPU);
1276
1277 /* Next, solve U */
1278 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1279 upTriFactorT->csrMat->num_rows,
1280 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1281 upTriFactorT->csrMat->num_entries,
1282 #endif
1283 &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1284 upTriFactorT->csrMat->values->data().get(),
1285 upTriFactorT->csrMat->row_offsets->data().get(),
1286 upTriFactorT->csrMat->column_indices->data().get(),
1287 upTriFactorT->solveInfo,
1288 xarray, tempGPU->data().get()
1289 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1290 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1291 #endif
1292 );CHKERRCUSPARSE(stat);
1293
1294 /* Then, solve L */
1295 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1296 loTriFactorT->csrMat->num_rows,
1297 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1298 loTriFactorT->csrMat->num_entries,
1299 #endif
1300 &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1301 loTriFactorT->csrMat->values->data().get(),
1302 loTriFactorT->csrMat->row_offsets->data().get(),
1303 loTriFactorT->csrMat->column_indices->data().get(),
1304 loTriFactorT->solveInfo,
1305 tempGPU->data().get(), xarray
1306 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1307 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1308 #endif
1309 );CHKERRCUSPARSE(stat);
1310
1311 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1312 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1313 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1314 tempGPU->begin());
1315
1316 /* Copy the temporary to the full solution. */
1317 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1318
1319 /* restore */
1320 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1321 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1322 cerr = WaitForCUDA();CHKERRCUDA(cerr);
1323 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1324 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1325 PetscFunctionReturn(0);
1326 }
1327
1328 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1329 {
1330 const PetscScalar *barray;
1331 PetscScalar *xarray;
1332 cusparseStatus_t stat;
1333 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1334 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1335 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1336 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1337 PetscErrorCode ierr;
1338 cudaError_t cerr;
1339
1340 PetscFunctionBegin;
1341 /* Analyze the matrix and create the transpose ... on the fly */
1342 if (!loTriFactorT && !upTriFactorT) {
1343 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1344 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1345 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1346 }
1347
1348 /* Get the GPU pointers */
1349 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1350 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1351
1352 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1353 /* First, solve U */
1354 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1355 upTriFactorT->csrMat->num_rows,
1356 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1357 upTriFactorT->csrMat->num_entries,
1358 #endif
1359 &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1360 upTriFactorT->csrMat->values->data().get(),
1361 upTriFactorT->csrMat->row_offsets->data().get(),
1362 upTriFactorT->csrMat->column_indices->data().get(),
1363 upTriFactorT->solveInfo,
1364 barray, tempGPU->data().get()
1365 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1366 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1367 #endif
1368 );CHKERRCUSPARSE(stat);
1369
1370 /* Then, solve L */
1371 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1372 loTriFactorT->csrMat->num_rows,
1373 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1374 loTriFactorT->csrMat->num_entries,
1375 #endif
1376 &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1377 loTriFactorT->csrMat->values->data().get(),
1378 loTriFactorT->csrMat->row_offsets->data().get(),
1379 loTriFactorT->csrMat->column_indices->data().get(),
1380 loTriFactorT->solveInfo,
1381 tempGPU->data().get(), xarray
1382 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1383 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1384 #endif
1385 );CHKERRCUSPARSE(stat);
1386
1387 /* restore */
1388 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1389 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1390 cerr = WaitForCUDA();CHKERRCUDA(cerr);
1391 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1392 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1393 PetscFunctionReturn(0);
1394 }
1395
1396 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1397 {
1398 const PetscScalar *barray;
1399 PetscScalar *xarray;
1400 thrust::device_ptr<const PetscScalar> bGPU;
1401 thrust::device_ptr<PetscScalar> xGPU;
1402 cusparseStatus_t stat;
1403 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1404 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1405 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1406 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1407 PetscErrorCode ierr;
1408 cudaError_t cerr;
1409
1410 PetscFunctionBegin;
1411
1412 /* Get the GPU pointers */
1413 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1414 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1415 xGPU = thrust::device_pointer_cast(xarray);
1416 bGPU = thrust::device_pointer_cast(barray);
1417
1418 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1419 /* First, reorder with the row permutation */
1420 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1421 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1422 tempGPU->begin());
1423
1424 /* Next, solve L */
1425 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1426 loTriFactor->csrMat->num_rows,
1427 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1428 loTriFactor->csrMat->num_entries,
1429 #endif
1430 &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1431 loTriFactor->csrMat->values->data().get(),
1432 loTriFactor->csrMat->row_offsets->data().get(),
1433 loTriFactor->csrMat->column_indices->data().get(),
1434 loTriFactor->solveInfo,
1435 tempGPU->data().get(), xarray
1436 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1437 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1438 #endif
1439 );CHKERRCUSPARSE(stat);
1440
1441 /* Then, solve U */
1442 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1443 upTriFactor->csrMat->num_rows,
1444 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1445 upTriFactor->csrMat->num_entries,
1446 #endif
1447 &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1448 upTriFactor->csrMat->values->data().get(),
1449 upTriFactor->csrMat->row_offsets->data().get(),
1450 upTriFactor->csrMat->column_indices->data().get(),
1451 upTriFactor->solveInfo,
1452 xarray, tempGPU->data().get()
1453 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1454 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1455 #endif
1456 );CHKERRCUSPARSE(stat);
1457
1458 /* Last, reorder with the column permutation */
1459 thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1460 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1461 xGPU);
1462
1463 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1464 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1465 cerr = WaitForCUDA();CHKERRCUDA(cerr);
1466 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1467 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1468 PetscFunctionReturn(0);
1469 }
1470
1471 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1472 {
1473 const PetscScalar *barray;
1474 PetscScalar *xarray;
1475 cusparseStatus_t stat;
1476 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1477 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1478 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1479 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1480 PetscErrorCode ierr;
1481 cudaError_t cerr;
1482
1483 PetscFunctionBegin;
1484 /* Get the GPU pointers */
1485 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1486 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1487
1488 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1489 /* First, solve L */
1490 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1491 loTriFactor->csrMat->num_rows,
1492 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1493 loTriFactor->csrMat->num_entries,
1494 #endif
1495 &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1496 loTriFactor->csrMat->values->data().get(),
1497 loTriFactor->csrMat->row_offsets->data().get(),
1498 loTriFactor->csrMat->column_indices->data().get(),
1499 loTriFactor->solveInfo,
1500 barray, tempGPU->data().get()
1501 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1502 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1503 #endif
1504 );CHKERRCUSPARSE(stat);
1505
1506 /* Next, solve U */
1507 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1508 upTriFactor->csrMat->num_rows,
1509 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1510 upTriFactor->csrMat->num_entries,
1511 #endif
1512 &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1513 upTriFactor->csrMat->values->data().get(),
1514 upTriFactor->csrMat->row_offsets->data().get(),
1515 upTriFactor->csrMat->column_indices->data().get(),
1516 upTriFactor->solveInfo,
1517 tempGPU->data().get(), xarray
1518 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1519 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1520 #endif
1521 );CHKERRCUSPARSE(stat);
1522
1523 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1524 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1525 cerr = WaitForCUDA();CHKERRCUDA(cerr);
1526 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1527 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1528 PetscFunctionReturn(0);
1529 }
1530
1531 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1532 {
1533 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1534 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1535 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1536 PetscInt m = A->rmap->n,*ii,*ridx,tmp;
1537 PetscErrorCode ierr;
1538 cusparseStatus_t stat;
1539 cudaError_t err;
1540
1541 PetscFunctionBegin;
1542 if (A->boundtocpu) PetscFunctionReturn(0);
1543 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1544 if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1545 /* Copy values only */
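      /* The nonzero pattern already on the GPU is still valid (same nonzerostate), so only the numerical
         values are re-uploaded; the cached transpose, if present, is refreshed below with another csr2csc */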
1546 CsrMatrix *matrix,*matrixT;
1547 matrix = (CsrMatrix*)cusparsestruct->mat->mat;
1548
1549 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1550 matrix->values->assign(a->a, a->a+a->nz);
1551 err = WaitForCUDA();CHKERRCUDA(err);
1552 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
1553 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1554
1555 /* Update matT if it was built before */
1556 if (cusparsestruct->matTranspose) {
1557 cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1558 matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
1559 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1560 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1561 A->cmap->n, matrix->num_entries,
1562 matrix->values->data().get(),
1563 cusparsestruct->rowoffsets_gpu->data().get(),
1564 matrix->column_indices->data().get(),
1565 matrixT->values->data().get(),
1566 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1567 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1568 CUSPARSE_ACTION_NUMERIC,indexBase,
1569 cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1570 #else
1571 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1572 CUSPARSE_ACTION_NUMERIC, indexBase
1573 #endif
1574 );CHKERRCUSPARSE(stat);
1575 err = WaitForCUDA();CHKERRCUDA(err);
1576 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1577 }
1578 } else {
1579 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1580 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
1581 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
1582 delete cusparsestruct->workVector;
1583 delete cusparsestruct->rowoffsets_gpu;
1584 try {
1585 if (a->compressedrow.use) {
1586 m = a->compressedrow.nrows;
1587 ii = a->compressedrow.i;
1588 ridx = a->compressedrow.rindex;
1589 } else {
1590 m = A->rmap->n;
1591 ii = a->i;
1592 ridx = NULL;
1593 }
1594 cusparsestruct->nrows = m;
1595
1596 /* create cusparse matrix */
1597 matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1598 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1599 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1600 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1601
1602 err = cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar));CHKERRCUDA(err);
1603 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1604 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1605 err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1606 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1607 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1608 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1609
1610 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1611 if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1612 /* set the matrix */
1613 CsrMatrix *mat= new CsrMatrix;
1614 mat->num_rows = m;
1615 mat->num_cols = A->cmap->n;
1616 mat->num_entries = a->nz;
1617 mat->row_offsets = new THRUSTINTARRAY32(m+1);
1618 mat->row_offsets->assign(ii, ii + m+1);
1619
1620 mat->column_indices = new THRUSTINTARRAY32(a->nz);
1621 mat->column_indices->assign(a->j, a->j+a->nz);
1622
1623 mat->values = new THRUSTARRAY(a->nz);
1624 mat->values->assign(a->a, a->a+a->nz);
1625
1626 /* assign the pointer */
1627 matstruct->mat = mat;
1628 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1629 if (mat->num_rows) { /* cusparse errors on empty matrices! */
1630 stat = cusparseCreateCsr(&matstruct->matDescr,
1631 mat->num_rows, mat->num_cols, mat->num_entries,
1632 mat->row_offsets->data().get(), mat->column_indices->data().get(),
1633 mat->values->data().get(),
1634 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1635 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1636 }
1637 #endif
1638 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1639 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1640 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1641 #else
1642 CsrMatrix *mat= new CsrMatrix;
1643 mat->num_rows = m;
1644 mat->num_cols = A->cmap->n;
1645 mat->num_entries = a->nz;
1646 mat->row_offsets = new THRUSTINTARRAY32(m+1);
1647 mat->row_offsets->assign(ii, ii + m+1);
1648
1649 mat->column_indices = new THRUSTINTARRAY32(a->nz);
1650 mat->column_indices->assign(a->j, a->j+a->nz);
1651
1652 mat->values = new THRUSTARRAY(a->nz);
1653 mat->values->assign(a->a, a->a+a->nz);
1654
1655 cusparseHybMat_t hybMat;
1656 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1657 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1658 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1659 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1660 matstruct->descr, mat->values->data().get(),
1661 mat->row_offsets->data().get(),
1662 mat->column_indices->data().get(),
1663 hybMat, 0, partition);CHKERRCUSPARSE(stat);
1664 /* assign the pointer */
1665 matstruct->mat = hybMat;
1666
1667 if (mat) {
1668 if (mat->values) delete (THRUSTARRAY*)mat->values;
1669 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1670 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1671 delete (CsrMatrix*)mat;
1672 }
1673 #endif
1674 }
1675
1676 /* assign the compressed row indices */
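        /* In compressed-row mode only the rows with nonzeros are represented, so products are computed
           into the shorter workVector and then scattered into the full result vector through cprowIndices
           (see MatMultAddKernel_SeqAIJCUSPARSE() below) */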
1677 if (a->compressedrow.use) {
1678 cusparsestruct->workVector = new THRUSTARRAY(m);
1679 matstruct->cprowIndices = new THRUSTINTARRAY(m);
1680 matstruct->cprowIndices->assign(ridx,ridx+m);
1681 tmp = m;
1682 } else {
1683 cusparsestruct->workVector = NULL;
1684 matstruct->cprowIndices = NULL;
1685 tmp = 0;
1686 }
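        /* Log host-to-device traffic: (m+1)+nz ints for row offsets and column indices, tmp PetscInts
           for cprowIndices, and 3+nz scalars for the alpha/beta constants plus the matrix values */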
1687 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1688
1689 /* assign the pointer */
1690 cusparsestruct->mat = matstruct;
1691 } catch(char *ex) {
1692 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1693 }
1694 err = WaitForCUDA();CHKERRCUDA(err);
1695 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1696 cusparsestruct->nonzerostate = A->nonzerostate;
1697 }
1698 A->offloadmask = PETSC_OFFLOAD_BOTH;
1699 }
1700 PetscFunctionReturn(0);
1701 }
1702
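/* Thrust functors applied through zip iterators in MatMultAddKernel_SeqAIJCUSPARSE():
   VecCUDAPlusEquals adds the first tuple element into the second (scatter-add of the work vector into
   the result), VecCUDAEqualsReverse assigns the second element to the first (gather of x into the work
   vector) */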
1703 struct VecCUDAPlusEquals
1704 {
1705 template <typename Tuple>
1706 __host__ __device__
1707 void operator()(Tuple t)
1708 {
1709 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1710 }
1711 };
1712
1713 struct VecCUDAEqualsReverse
1714 {
1715 template <typename Tuple>
1716 __host__ __device__
1717 void operator()(Tuple t)
1718 {
1719 thrust::get<0>(t) = thrust::get<1>(t);
1720 }
1721 };
1722
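/* Per-product scratch data attached to C->product for sparse * dense products: Bt optionally holds an
   explicit copy of B^T for the pre-CUDA-11 csrmm path (which cannot transpose B), and X is intermediate
   storage for the PtAP and RARt products */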
1723 struct MatMatCusparse {
1724 PetscBool cisdense;
1725 PetscScalar *Bt;
1726 Mat X;
1727 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1728 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */
1729 cusparseDnMatDescr_t matBDescr;
1730 cusparseDnMatDescr_t matCDescr;
1731 size_t spmmBufferSize;
1732 void *spmmBuffer;
1733 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
1734 #endif
1735 };
1736
1737 static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1738 {
1739 PetscErrorCode ierr;
1740 MatMatCusparse *mmdata = (MatMatCusparse *)data;
1741 cudaError_t cerr;
1742
1743 PetscFunctionBegin;
1744 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1745 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1746 cusparseStatus_t stat;
1747 if (mmdata->matBDescr) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat);}
1748 if (mmdata->matCDescr) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat);}
1749 if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
1750 #endif
1751 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
1752 ierr = PetscFree(data);CHKERRQ(ierr);
1753 PetscFunctionReturn(0);
1754 }
1755
1756 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1757
1758 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1759 {
1760 Mat_Product *product = C->product;
1761 Mat A,B;
1762 PetscInt m,n,blda,clda;
1763 PetscBool flg,biscuda;
1764 Mat_SeqAIJCUSPARSE *cusp;
1765 cusparseStatus_t stat;
1766 cusparseOperation_t opA;
1767 const PetscScalar *barray;
1768 PetscScalar *carray;
1769 PetscErrorCode ierr;
1770 MatMatCusparse *mmdata;
1771 Mat_SeqAIJCUSPARSEMultStruct *mat;
1772 CsrMatrix *csrmat;
1773 cudaError_t cerr;
1774
1775 PetscFunctionBegin;
1776 MatCheckProduct(C,1);
1777 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1778 mmdata = (MatMatCusparse*)product->data;
1779 A = product->A;
1780 B = product->B;
1781 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1782 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1783 /* Currently, MatSeqAIJCUSPARSECopyToGPU() does not copy when the matrix is bound to the CPU.
1784 Instead of silently accepting a wrong answer, we prefer to raise an error */
1785 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1786 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1787 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1788 switch (product->type) {
1789 case MATPRODUCT_AB:
1790 case MATPRODUCT_PtAP:
1791 mat = cusp->mat;
1792 opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1793 m = A->rmap->n;
1794 n = B->cmap->n;
1795 break;
1796 case MATPRODUCT_AtB:
1797 if (!cusp->transgen) {
1798 mat = cusp->mat;
1799 opA = CUSPARSE_OPERATION_TRANSPOSE;
1800 } else {
1801 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1802 mat = cusp->matTranspose;
1803 opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1804 }
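    /* With transpose generation enabled, an explicit (cached) copy of A^T is multiplied without
       transposition instead of running cuSPARSE in transpose mode */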
1805 m = A->cmap->n;
1806 n = B->cmap->n;
1807 break;
1808 case MATPRODUCT_ABt:
1809 case MATPRODUCT_RARt:
1810 mat = cusp->mat;
1811 opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1812 m = A->rmap->n;
1813 n = B->rmap->n;
1814 break;
1815 default:
1816 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1817 }
1818 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
1819 csrmat = (CsrMatrix*)mat->mat;
1820 /* if the user passed a CPU matrix, copy the data to the GPU */
1821 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
1822 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
1823 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
1824
1825 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
1826 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1827 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1828 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
1829 } else {
1830 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
1831 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
1832 }
1833
1834 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1835 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1836 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
1837 /* (re)allocate spmmBuffer if not initialized or LDAs are different */
1838 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
1839 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
1840 if (!mmdata->matBDescr) {
1841 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
1842 mmdata->Blda = blda;
1843 }
1844
1845 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
1846 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
1847 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
1848 mmdata->Clda = clda;
1849 }
1850
1851 if (!mat->matDescr) {
1852 stat = cusparseCreateCsr(&mat->matDescr,
1853 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
1854 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
1855 csrmat->values->data().get(),
1856 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1857 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1858 }
1859 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
1860 mat->matDescr,mmdata->matBDescr,mat->beta_zero,
1861 mmdata->matCDescr,cusparse_scalartype,
1862 cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat);
1863 if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
1864 cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr);
1865 mmdata->initialized = PETSC_TRUE;
1866 } else {
1867 /* to be safe, always update pointers of the mats */
1868 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
1869 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
1870 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
1871 }
1872
1873 /* do cusparseSpMM, which supports transpose on B */
1874 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
1875 mat->matDescr,mmdata->matBDescr,mat->beta_zero,
1876 mmdata->matCDescr,cusparse_scalartype,
1877 cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat);
1878 #else
1879 PetscInt k;
1880 /* cusparseXcsrmm does not support transpose on B */
1881 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1882 cublasHandle_t cublasv2handle;
1883 cublasStatus_t cerr;
1884
1885 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
1886 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
1887 B->cmap->n,B->rmap->n,
1888 &PETSC_CUSPARSE_ONE ,barray,blda,
1889 &PETSC_CUSPARSE_ZERO,barray,blda,
1890 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
1891 blda = B->cmap->n;
1892 k = B->cmap->n;
1893 } else {
1894 k = B->rmap->n;
1895 }
1896
1897 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
1898 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
1899 csrmat->num_entries,mat->alpha_one,mat->descr,
1900 csrmat->values->data().get(),
1901 csrmat->row_offsets->data().get(),
1902 csrmat->column_indices->data().get(),
1903 mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
1904 carray,clda);CHKERRCUSPARSE(stat);
1905 #endif
1906 cerr = WaitForCUDA();CHKERRCUDA(cerr);
1907 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1908 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
1909 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
1910 if (product->type == MATPRODUCT_RARt) {
1911 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1912 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
1913 } else if (product->type == MATPRODUCT_PtAP) {
1914 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1915 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
1916 } else {
1917 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
1918 }
1919 if (mmdata->cisdense) {
1920 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
1921 }
1922 if (!biscuda) {
1923 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
1924 }
1925 PetscFunctionReturn(0);
1926 }
1927
1928 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1929 {
1930 Mat_Product *product = C->product;
1931 Mat A,B;
1932 PetscInt m,n;
1933 PetscBool cisdense,flg;
1934 PetscErrorCode ierr;
1935 MatMatCusparse *mmdata;
1936 Mat_SeqAIJCUSPARSE *cusp;
1937
1938 PetscFunctionBegin;
1939 MatCheckProduct(C,1);
1940 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
1941 A = product->A;
1942 B = product->B;
1943 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1944 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1945 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1946 if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
1947 switch (product->type) {
1948 case MATPRODUCT_AB:
1949 m = A->rmap->n;
1950 n = B->cmap->n;
1951 break;
1952 case MATPRODUCT_AtB:
1953 m = A->cmap->n;
1954 n = B->cmap->n;
1955 break;
1956 case MATPRODUCT_ABt:
1957 m = A->rmap->n;
1958 n = B->rmap->n;
1959 break;
1960 case MATPRODUCT_PtAP:
1961 m = B->cmap->n;
1962 n = B->cmap->n;
1963 break;
1964 case MATPRODUCT_RARt:
1965 m = B->rmap->n;
1966 n = B->rmap->n;
1967 break;
1968 default:
1969 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1970 }
1971 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
1972 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
1973 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
1974 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);
1975
1976 /* product data */
1977 ierr = PetscNew(&mmdata);CHKERRQ(ierr);
1978 mmdata->cisdense = cisdense;
1979 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
1980 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
1981 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1982 cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
1983 }
1984 #endif
1985 /* for these products we need intermediate storage */
1986 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1987 ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
1988 ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
1989 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
1990 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
1991 } else {
1992 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
1993 }
1994 }
1995 C->product->data = mmdata;
1996 C->product->destroy = MatDestroy_MatMatCusparse;
1997
1998 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
1999 PetscFunctionReturn(0);
2000 }
2001
2002 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2003
2004 /* handles dense B */
2005 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
2006 {
2007 Mat_Product *product = C->product;
2008 PetscErrorCode ierr;
2009
2010 PetscFunctionBegin;
2011 MatCheckProduct(C,1);
2012 if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
2013 if (product->A->boundtocpu) {
2014 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
2015 PetscFunctionReturn(0);
2016 }
2017 switch (product->type) {
2018 case MATPRODUCT_AB:
2019 case MATPRODUCT_AtB:
2020 case MATPRODUCT_ABt:
2021 case MATPRODUCT_PtAP:
2022 case MATPRODUCT_RARt:
2023 C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2024 default:
2025 break;
2026 }
2027 PetscFunctionReturn(0);
2028 }
2029
2030 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2031 {
2032 PetscErrorCode ierr;
2033
2034 PetscFunctionBegin;
2035 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2036 PetscFunctionReturn(0);
2037 }
2038
2039 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2040 {
2041 PetscErrorCode ierr;
2042
2043 PetscFunctionBegin;
2044 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2045 PetscFunctionReturn(0);
2046 }
2047
2048 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2049 {
2050 PetscErrorCode ierr;
2051
2052 PetscFunctionBegin;
2053 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
2054 PetscFunctionReturn(0);
2055 }
2056
2057 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2058 {
2059 PetscErrorCode ierr;
2060
2061 PetscFunctionBegin;
2062 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
2063 PetscFunctionReturn(0);
2064 }
2065
2066 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2067 {
2068 PetscErrorCode ierr;
2069
2070 PetscFunctionBegin;
2071 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2072 PetscFunctionReturn(0);
2073 }
2074
2075 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2076 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2077 {
2078 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2079 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2080 Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2081 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr;
2082 PetscErrorCode ierr;
2083 cudaError_t cerr;
2084 cusparseStatus_t stat;
2085 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2086 PetscBool compressed;
2087 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2088 PetscInt nx,ny;
2089 #endif
2090
2091 PetscFunctionBegin;
2092 if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
2093 if (!a->nonzerorowcnt) {
2094 if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2095 PetscFunctionReturn(0);
2096 }
2097 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2098 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2099 if (!trans) {
2100 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2101 } else {
2102 if (herm || !cusparsestruct->transgen) {
2103 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2104 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2105 } else {
2106 if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
2107 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2108 }
2109 }
2110 /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2111 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2112
2113 try {
2114 ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2115 if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2116 else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2117
2118 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2119 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2120 /* z = A x + beta y.
2121 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2122 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2123 */
2124 xptr = xarray;
2125 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2126 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2127 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2128 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2129 allocated to accommodate different uses. So we get the length info directly from mat.
2130 */
2131 if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2132 CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2133 nx = mat->num_cols;
2134 ny = mat->num_rows;
2135 }
2136 #endif
2137 } else {
2138 /* z = A^T x + beta y
2139 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2140 Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2141 */
2142 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2143 dptr = zarray;
2144 beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2145 if (compressed) { /* Scatter x to work vector */
2146 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2147 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2148 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2149 VecCUDAEqualsReverse());
2150 }
2151 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2152 if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2153 CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2154 nx = mat->num_rows;
2155 ny = mat->num_cols;
2156 }
2157 #endif
2158 }
2159
2160 /* csr_spmv does y = alpha op(A) x + beta y */
2161 if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2162 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2163 if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2164 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2165 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2166 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2167 stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2168 matstruct->matDescr,
2169 matstruct->cuSpMV[opA].vecXDescr, beta,
2170 matstruct->cuSpMV[opA].vecYDescr,
2171 cusparse_scalartype,
2172 cusparsestruct->spmvAlg,
2173 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2174 cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2175
2176 matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2177 } else {
2178 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2179 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2180 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2181 }
2182
2183 stat = cusparseSpMV(cusparsestruct->handle, opA,
2184 matstruct->alpha_one,
2185 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
2186 matstruct->cuSpMV[opA].vecXDescr,
2187 beta,
2188 matstruct->cuSpMV[opA].vecYDescr,
2189 cusparse_scalartype,
2190 cusparsestruct->spmvAlg,
2191 matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2192 #else
2193 CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2194 stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2195 mat->num_rows, mat->num_cols,
2196 mat->num_entries, matstruct->alpha_one, matstruct->descr,
2197 mat->values->data().get(), mat->row_offsets->data().get(),
2198 mat->column_indices->data().get(), xptr, beta,
2199 dptr);CHKERRCUSPARSE(stat);
2200 #endif
2201 } else {
2202 if (cusparsestruct->nrows) {
2203 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2204 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2205 #else
2206 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2207 stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2208 matstruct->alpha_one, matstruct->descr, hybMat,
2209 xptr, beta,
2210 dptr);CHKERRCUSPARSE(stat);
2211 #endif
2212 }
2213 }
2214 cerr = WaitForCUDA();CHKERRCUDA(cerr);
2215 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2216
2217 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2218 if (yy) { /* MatMultAdd: zz = A*xx + yy */
2219 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2220 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2221 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2222 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2223 }
2224 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2225 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
2226 }
2227
2228 /* ScatterAdd the result from work vector into the full vector when A is compressed */
2229 if (compressed) {
2230 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2231
2232 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2233 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2234 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2235 VecCUDAPlusEquals());
2236 cerr = WaitForCUDA();CHKERRCUDA(cerr);
2237 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2238 }
2239 } else {
2240 if (yy && yy != zz) {
2241 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2242 }
2243 }
2244 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2245 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
2246 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
2247 } catch(char *ex) {
2248 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
2249 }
2250 if (yy) {
2251 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
2252 } else {
2253 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
2254 }
2255 PetscFunctionReturn(0);
2256 }
2257
2258 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2259 {
2260 PetscErrorCode ierr;
2261
2262 PetscFunctionBegin;
2263 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2264 PetscFunctionReturn(0);
2265 }
2266
2267 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
2268 {
2269 PetscErrorCode ierr;
2270 PetscSplitCSRDataStructure *d_mat = NULL, h_mat;
2271 PetscBool is_seq = PETSC_TRUE;
2272 PetscInt nnz_state = A->nonzerostate;
2273 PetscFunctionBegin;
2274 if (A->factortype == MAT_FACTOR_NONE) {
2275 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
2276 }
2277 if (d_mat) {
2278 cudaError_t err;
2279 ierr = PetscInfo(A,"Assemble device matrix\n");CHKERRQ(ierr);
2280 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
2281 nnz_state = h_mat.nonzerostate;
2282 is_seq = h_mat.seq;
2283 }
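  /* The device matrix state read back above (nonzerostate and seq flag) decides below whether the
     CPU-assembled data must be copied to the GPU or the GPU already holds the newest values */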
2284 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
2285 if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
2286 if (A->factortype == MAT_FACTOR_NONE && A->nonzerostate >= nnz_state && is_seq) { // assembled on CPU even though equipped for GPU
2287 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2288 } else if (nnz_state > A->nonzerostate) {
2289 A->offloadmask = PETSC_OFFLOAD_GPU;
2290 }
2291
2292 PetscFunctionReturn(0);
2293 }
2294
2295 /* --------------------------------------------------------------------------------*/
2296 /*@
2297 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
2298 (the default parallel PETSc format). This matrix will ultimately be pushed down
2299 to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
2300 assembly performance the user should preallocate the matrix storage by setting
2301 the parameter nz (or the array nnz). By setting these parameters accurately,
2302 performance during matrix assembly can be increased by more than a factor of 50.
2303
2304 Collective
2305
2306 Input Parameters:
2307 + comm - MPI communicator, set to PETSC_COMM_SELF
2308 . m - number of rows
2309 . n - number of columns
2310 . nz - number of nonzeros per row (same for all rows)
2311 - nnz - array containing the number of nonzeros in the various rows
2312 (possibly different for each row) or NULL
2313
2314 Output Parameter:
2315 . A - the matrix
2316
2317 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2318 MatXXXXSetPreallocation() paradigm instead of this routine directly, as sketched below.
2319 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
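
   A minimal sketch of that paradigm (error checking omitted; m, n, nz as in the parameters above):
.vb
      Mat A;
      MatCreate(PETSC_COMM_SELF,&A);
      MatSetSizes(A,m,n,m,n);
      MatSetType(A,MATSEQAIJCUSPARSE);
      MatSeqAIJSetPreallocation(A,nz,NULL);
.ve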
2320
2321 Notes:
2322 If nnz is given then nz is ignored
2323
2324 The AIJ format (also called the Yale sparse matrix format or
2325 compressed row storage), is fully compatible with standard Fortran 77
2326 storage. That is, the stored row and column indices can begin at
2327 either one (as in Fortran) or zero. See the users' manual for details.
2328
2329 Specify the preallocated storage with either nz or nnz (not both).
2330 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
2331 allocation. For large problems you MUST preallocate memory or you
2332 will get TERRIBLE performance, see the users' manual chapter on matrices.
2333
2334 By default, this format uses inodes (identical nodes) when possible, to
2335 improve numerical efficiency of matrix-vector products and solves. We
2336 search for consecutive rows with the same nonzero structure, thereby
2337 reusing matrix information to achieve increased efficiency.
2338
2339 Level: intermediate
2340
2341 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
2342 @*/
2343 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
2344 {
2345 PetscErrorCode ierr;
2346
2347 PetscFunctionBegin;
2348 ierr = MatCreate(comm,A);CHKERRQ(ierr);
2349 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
2350 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2351 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
2352 PetscFunctionReturn(0);
2353 }
2354
2355 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
2356 {
2357 PetscErrorCode ierr;
2358 PetscSplitCSRDataStructure *d_mat = NULL;
2359
2360 PetscFunctionBegin;
2361 if (A->factortype == MAT_FACTOR_NONE) {
2362 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
2363 ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
2364 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
2365 } else {
2366 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
2367 }
2368 if (d_mat) {
2369 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2370 cudaError_t err;
2371 PetscSplitCSRDataStructure h_mat;
2372 ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
2373 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
2374 if (h_mat.seq) {
2375 if (a->compressedrow.use) {
2376 err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
2377 }
2378 err = cudaFree(d_mat);CHKERRCUDA(err);
2379 }
2380 }
2381 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
2382 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
2383 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
2384 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
2385 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
2386 PetscFunctionReturn(0);
2387 }
2388
2389 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
2390 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
2391 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
2392 {
2393 PetscErrorCode ierr;
2394
2395 PetscFunctionBegin;
2396 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
2397 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
2398 PetscFunctionReturn(0);
2399 }
2400
2401 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
2402 {
2403 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2404 PetscErrorCode ierr;
2405
2406 PetscFunctionBegin;
2407 if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
2408 /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
2409 If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
2410 Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
2411 TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
2412 can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
2413 if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
  /* TODO: add support for this? */
  if (flg) {
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use  = flg;
  PetscFunctionReturn(0);
}
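
/*
   Usage sketch for the bind-to-CPU path described in the comment above. This is an illustrative
   fragment only, not part of this file; A is assumed to already be a MATSEQAIJCUSPARSE matrix and
   x, y compatible vectors owned by the caller. Binding routes subsequent MatMult()/MatMultAdd()
   calls through the host (SeqAIJ) kernels; unbinding restores the cuSPARSE kernels.

     ierr = MatBindToCPU(A,PETSC_TRUE);CHKERRQ(ierr);    // use the CPU (SeqAIJ) implementations
     ierr = MatMult(A,x,y);CHKERRQ(ierr);                // executed by MatMult_SeqAIJ on the host
     ierr = MatBindToCPU(A,PETSC_FALSE);CHKERRQ(ierr);   // switch back to the cuSPARSE implementations
*/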

static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscSplitCSRDataStructure *d_mat = NULL;
  PetscErrorCode             ierr;
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    d_mat = spptr->deviceMat;
  }
  if (d_mat) {
    Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data;
    PetscInt    n = A->rmap->n, nnz = a->i[n];
    cudaError_t err;
    PetscScalar *vals;
    ierr = PetscInfo(A,"Zero device matrix\n");CHKERRQ(ierr);
    err  = cudaMemcpy(&vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    err  = cudaMemset(vals, 0, (nnz)*sizeof(PetscScalar));CHKERRCUDA(err);
  }
  ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->zeroentries    = MatZeroEntries_SeqAIJCUSPARSE;

  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later and are
   no longer supported since CUDA 11.0.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult() and factors in MatSolve()) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult() only) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
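
/*
   Minimal usage sketch for MATSEQAIJCUSPARSE (an illustrative fragment, not part of this file):
   the matrix is created and assembled with the usual AIJ calls; the type selection routes MatMult()
   and friends through cuSPARSE. The names n, nz, row, col, val, x, and y below are hypothetical
   placeholders supplied by the caller.

     Mat            A;
     PetscErrorCode ierr;

     ierr = MatCreate(PETSC_COMM_SELF,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,n,n,n,n);CHKERRQ(ierr);
     ierr = MatSetType(A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatSeqAIJSetPreallocation(A,nz,NULL);CHKERRQ(ierr);
     ierr = MatSetValues(A,1,&row,1,&col,&val,INSERT_VALUES);CHKERRQ(ierr);   // inside an assembly loop
     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatMult(A,x,y);CHKERRQ(ierr);                                     // runs on the GPU through cuSPARSE
     ierr = MatDestroy(&A);CHKERRQ(ierr);
*/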

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);

PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    if ((handle = (*cusparsestruct)->handle)) {stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
#endif
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr)     { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cudaError_t cerr;
    if ((*trifactor)->solveBuffer)   {cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->csr2cscBuffer) {cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
#endif
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
#endif
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
#endif
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = 0;
    (*trifactors)->cpermIndices = 0;
    (*trifactors)->workVector   = 0;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    if ((handle = (*trifactors)->handle)) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
