1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
17 const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
19   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
20     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
21 
22   typedef enum {
23       CUSPARSE_MV_ALG_DEFAULT = 0,
24       CUSPARSE_COOMV_ALG      = 1,
25       CUSPARSE_CSRMV_ALG1     = 2,
26       CUSPARSE_CSRMV_ALG2     = 3
27   } cusparseSpMVAlg_t;
28 
29   typedef enum {
30       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
31       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
32       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
33       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
34       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
35       CUSPARSE_SPMM_ALG_DEFAULT = 0,
36       CUSPARSE_SPMM_COO_ALG1    = 1,
37       CUSPARSE_SPMM_COO_ALG2    = 2,
38       CUSPARSE_SPMM_COO_ALG3    = 3,
39       CUSPARSE_SPMM_COO_ALG4    = 5,
40       CUSPARSE_SPMM_CSR_ALG1    = 4,
41       CUSPARSE_SPMM_CSR_ALG2    = 6,
42   } cusparseSpMMAlg_t;
43 
44   typedef enum {
45       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
46       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
47   } cusparseCsr2CscAlg_t;
48   */
49   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
50   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
51   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
52 #endif
53 
54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57 
58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61 
62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
68 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
69 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
71 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
72 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
73 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
74 
75 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
76 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
77 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
78 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
79 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
80 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
81 
MatCUSPARSESetStream(Mat A,const cudaStream_t stream)82 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
83 {
84   cusparseStatus_t   stat;
85   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
86 
87   PetscFunctionBegin;
88   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
89   cusparsestruct->stream = stream;
90   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
91   PetscFunctionReturn(0);
92 }
93 
MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)94 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
95 {
96   cusparseStatus_t   stat;
97   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
98 
99   PetscFunctionBegin;
100   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
101   if (cusparsestruct->handle != handle) {
102     if (cusparsestruct->handle) {
103       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
104     }
105     cusparsestruct->handle = handle;
106   }
107   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
108   PetscFunctionReturn(0);
109 }
110 
MatCUSPARSEClearHandle(Mat A)111 PetscErrorCode MatCUSPARSEClearHandle(Mat A)
112 {
113   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
114 
115   PetscFunctionBegin;
116   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
117   if (cusparsestruct->handle) cusparsestruct->handle = 0;
118   PetscFunctionReturn(0);
119 }
120 
MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType * type)121 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
122 {
123   PetscFunctionBegin;
124   *type = MATSOLVERCUSPARSE;
125   PetscFunctionReturn(0);
126 }
127 
128 /*MC
129   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
130   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
131   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
132   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
133   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
134   algorithms are not recommended. This class does NOT support direct solver operations.
135 
136   Level: beginner
137 
138 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
139 M*/
140 
MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat * B)141 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
142 {
143   PetscErrorCode ierr;
144   PetscInt       n = A->rmap->n;
145 
146   PetscFunctionBegin;
147   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
148   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
149   (*B)->factortype = ftype;
150   (*B)->useordering = PETSC_TRUE;
151   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
152 
153   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
154     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
155     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
156     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
157   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
158     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
159     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
160   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
161 
162   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
163   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
164   PetscFunctionReturn(0);
165 }
166 
MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)167 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
168 {
169   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
170 
171   PetscFunctionBegin;
172   switch (op) {
173   case MAT_CUSPARSE_MULT:
174     cusparsestruct->format = format;
175     break;
176   case MAT_CUSPARSE_ALL:
177     cusparsestruct->format = format;
178     break;
179   default:
180     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
181   }
182   PetscFunctionReturn(0);
183 }
184 
185 /*@
186    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
187    operation. Only the MatMult operation can use different GPU storage formats
188    for MPIAIJCUSPARSE matrices.
189    Not Collective
190 
191    Input Parameters:
192 +  A - Matrix of type SEQAIJCUSPARSE
193 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
194 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
195 
196    Output Parameter:
197 
198    Level: intermediate
199 
200 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
201 @*/
MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)202 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
203 {
204   PetscErrorCode ierr;
205 
206   PetscFunctionBegin;
207   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
208   ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
209   PetscFunctionReturn(0);
210 }
211 
212 /*@
213    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the tranpose matrix before calling MatMultTranspose
214 
215    Collective on mat
216 
217    Input Parameters:
218 +  A - Matrix of type SEQAIJCUSPARSE
219 -  transgen - the boolean flag
220 
221    Level: intermediate
222 
223 .seealso: MATSEQAIJCUSPARSE
224 @*/
MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)225 PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
226 {
227   PetscErrorCode ierr;
228   PetscBool      flg;
229 
230   PetscFunctionBegin;
231   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
232   ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
233   if (flg) {
234     Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
235 
236     if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
237     cusp->transgen = transgen;
238     if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
239       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
240     }
241   }
242   PetscFunctionReturn(0);
243 }
244 
MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems * PetscOptionsObject,Mat A)245 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
246 {
247   PetscErrorCode           ierr;
248   MatCUSPARSEStorageFormat format;
249   PetscBool                flg;
250   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
251 
252   PetscFunctionBegin;
253   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
254   if (A->factortype == MAT_FACTOR_NONE) {
255     PetscBool transgen = cusparsestruct->transgen;
256 
257     ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
258     if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}
259 
260     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
261                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
262     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
263 
264     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
265                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
266     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
267    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
268     cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
269     ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
270                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
271     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
272     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
273 
274     cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
275     ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
276                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
277     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
278 
279     cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
280     ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
281                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
282     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
283    #endif
284   }
285   ierr = PetscOptionsTail();CHKERRQ(ierr);
286   PetscFunctionReturn(0);
287 }
288 
MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo * info)289 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
290 {
291   PetscErrorCode ierr;
292 
293   PetscFunctionBegin;
294   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
295   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
296   PetscFunctionReturn(0);
297 }
298 
MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo * info)299 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
300 {
301   PetscErrorCode ierr;
302 
303   PetscFunctionBegin;
304   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
305   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
306   PetscFunctionReturn(0);
307 }
308 
MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo * info)309 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
310 {
311   PetscErrorCode ierr;
312 
313   PetscFunctionBegin;
314   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
315   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
316   PetscFunctionReturn(0);
317 }
318 
MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo * info)319 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
320 {
321   PetscErrorCode ierr;
322 
323   PetscFunctionBegin;
324   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
325   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
326   PetscFunctionReturn(0);
327 }
328 
MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)329 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
330 {
331   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
332   PetscInt                          n = A->rmap->n;
333   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
334   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
335   cusparseStatus_t                  stat;
336   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
337   const MatScalar                   *aa = a->a,*v;
338   PetscInt                          *AiLo, *AjLo;
339   PetscScalar                       *AALo;
340   PetscInt                          i,nz, nzLower, offset, rowOffset;
341   PetscErrorCode                    ierr;
342   cudaError_t                       cerr;
343 
344   PetscFunctionBegin;
345   if (!n) PetscFunctionReturn(0);
346   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
347     try {
348       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
349       nzLower=n+ai[n]-ai[1];
350 
351       /* Allocate Space for the lower triangular matrix */
352       cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
353       cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
354       cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
355 
356       /* Fill the lower triangular matrix */
357       AiLo[0]  = (PetscInt) 0;
358       AiLo[n]  = nzLower;
359       AjLo[0]  = (PetscInt) 0;
360       AALo[0]  = (MatScalar) 1.0;
361       v        = aa;
362       vi       = aj;
363       offset   = 1;
364       rowOffset= 1;
365       for (i=1; i<n; i++) {
366         nz = ai[i+1] - ai[i];
367         /* additional 1 for the term on the diagonal */
368         AiLo[i]    = rowOffset;
369         rowOffset += nz+1;
370 
371         ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
372         ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
373 
374         offset      += nz;
375         AjLo[offset] = (PetscInt) i;
376         AALo[offset] = (MatScalar) 1.0;
377         offset      += 1;
378 
379         v  += nz;
380         vi += nz;
381       }
382 
383       /* allocate space for the triangular factor information */
384       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
385 
386       /* Create the matrix description */
387       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
388       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
389      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
390       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
391      #else
392       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
393      #endif
394       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
395       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
396 
397       /* set the operation */
398       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
399 
400       /* set the matrix */
401       loTriFactor->csrMat = new CsrMatrix;
402       loTriFactor->csrMat->num_rows = n;
403       loTriFactor->csrMat->num_cols = n;
404       loTriFactor->csrMat->num_entries = nzLower;
405 
406       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
407       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
408 
409       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
410       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
411 
412       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
413       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
414 
415       /* Create the solve analysis information */
416       stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
417     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
418       stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
419                                      loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
420                                      loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
421                                      loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
422                                      &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
423       cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
424     #endif
425 
426       /* perform the solve analysis */
427       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
428                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
429                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
430                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
431                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
432                                ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
433                              #endif
434                               );CHKERRCUSPARSE(stat);
435 
436       /* assign the pointer. Is this really necessary? */
437       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
438 
439       cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
440       cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
441       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
442       ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
443     } catch(char *ex) {
444       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
445     }
446   }
447   PetscFunctionReturn(0);
448 }
449 
MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)450 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
451 {
452   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
453   PetscInt                          n = A->rmap->n;
454   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
455   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
456   cusparseStatus_t                  stat;
457   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
458   const MatScalar                   *aa = a->a,*v;
459   PetscInt                          *AiUp, *AjUp;
460   PetscScalar                       *AAUp;
461   PetscInt                          i,nz, nzUpper, offset;
462   PetscErrorCode                    ierr;
463   cudaError_t                       cerr;
464 
465   PetscFunctionBegin;
466   if (!n) PetscFunctionReturn(0);
467   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
468     try {
469       /* next, figure out the number of nonzeros in the upper triangular matrix. */
470       nzUpper = adiag[0]-adiag[n];
471 
472       /* Allocate Space for the upper triangular matrix */
473       cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
474       cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
475       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
476 
477       /* Fill the upper triangular matrix */
478       AiUp[0]=(PetscInt) 0;
479       AiUp[n]=nzUpper;
480       offset = nzUpper;
481       for (i=n-1; i>=0; i--) {
482         v  = aa + adiag[i+1] + 1;
483         vi = aj + adiag[i+1] + 1;
484 
485         /* number of elements NOT on the diagonal */
486         nz = adiag[i] - adiag[i+1]-1;
487 
488         /* decrement the offset */
489         offset -= (nz+1);
490 
491         /* first, set the diagonal elements */
492         AjUp[offset] = (PetscInt) i;
493         AAUp[offset] = (MatScalar)1./v[nz];
494         AiUp[i]      = AiUp[i+1] - (nz+1);
495 
496         ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
497         ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
498       }
499 
500       /* allocate space for the triangular factor information */
501       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
502 
503       /* Create the matrix description */
504       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
505       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
506      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
507       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
508      #else
509       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
510      #endif
511       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
512       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
513 
514       /* set the operation */
515       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
516 
517       /* set the matrix */
518       upTriFactor->csrMat = new CsrMatrix;
519       upTriFactor->csrMat->num_rows = n;
520       upTriFactor->csrMat->num_cols = n;
521       upTriFactor->csrMat->num_entries = nzUpper;
522 
523       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
524       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
525 
526       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
527       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
528 
529       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
530       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
531 
532       /* Create the solve analysis information */
533       stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
534     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
535       stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
536                                    upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
537                                    upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
538                                    upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
539                                    &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
540       cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
541     #endif
542 
543       /* perform the solve analysis */
544       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
545                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
546                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
547                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
548                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
549                                ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
550                              #endif
551                               );CHKERRCUSPARSE(stat);
552 
553       /* assign the pointer. Is this really necessary? */
554       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
555 
556       cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
557       cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
558       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
559       ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
560     } catch(char *ex) {
561       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
562     }
563   }
564   PetscFunctionReturn(0);
565 }
566 
MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)567 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
568 {
569   PetscErrorCode               ierr;
570   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
571   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
572   IS                           isrow = a->row,iscol = a->icol;
573   PetscBool                    row_identity,col_identity;
574   const PetscInt               *r,*c;
575   PetscInt                     n = A->rmap->n;
576 
577   PetscFunctionBegin;
578   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
579   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
580   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
581 
582   cusparseTriFactors->workVector = new THRUSTARRAY(n);
583   cusparseTriFactors->nnz=a->nz;
584 
585   A->offloadmask = PETSC_OFFLOAD_BOTH;
586   /* lower triangular indices */
587   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
588   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
589   if (!row_identity) {
590     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
591     cusparseTriFactors->rpermIndices->assign(r, r+n);
592   }
593   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
594 
595   /* upper triangular indices */
596   ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
597   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
598   if (!col_identity) {
599     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
600     cusparseTriFactors->cpermIndices->assign(c, c+n);
601   }
602 
603   if (!row_identity && !col_identity) {
604     ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
605   } else if(!row_identity) {
606     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
607   } else if(!col_identity) {
608     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
609   }
610 
611   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
612   PetscFunctionReturn(0);
613 }
614 
MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)615 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
616 {
617   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
618   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
619   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
620   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
621   cusparseStatus_t                  stat;
622   PetscErrorCode                    ierr;
623   cudaError_t                       cerr;
624   PetscInt                          *AiUp, *AjUp;
625   PetscScalar                       *AAUp;
626   PetscScalar                       *AALo;
627   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
628   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
629   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
630   const MatScalar                   *aa = b->a,*v;
631 
632   PetscFunctionBegin;
633   if (!n) PetscFunctionReturn(0);
634   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
635     try {
636       /* Allocate Space for the upper triangular matrix */
637       cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
638       cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
639       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
640       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
641 
642       /* Fill the upper triangular matrix */
643       AiUp[0]=(PetscInt) 0;
644       AiUp[n]=nzUpper;
645       offset = 0;
646       for (i=0; i<n; i++) {
647         /* set the pointers */
648         v  = aa + ai[i];
649         vj = aj + ai[i];
650         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
651 
652         /* first, set the diagonal elements */
653         AjUp[offset] = (PetscInt) i;
654         AAUp[offset] = (MatScalar)1.0/v[nz];
655         AiUp[i]      = offset;
656         AALo[offset] = (MatScalar)1.0/v[nz];
657 
658         offset+=1;
659         if (nz>0) {
660           ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
661           ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
662           for (j=offset; j<offset+nz; j++) {
663             AAUp[j] = -AAUp[j];
664             AALo[j] = AAUp[j]/v[nz];
665           }
666           offset+=nz;
667         }
668       }
669 
670       /* allocate space for the triangular factor information */
671       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
672 
673       /* Create the matrix description */
674       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
675       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
676      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
677       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
678      #else
679       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
680      #endif
681       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
682       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
683 
684       /* set the matrix */
685       upTriFactor->csrMat = new CsrMatrix;
686       upTriFactor->csrMat->num_rows = A->rmap->n;
687       upTriFactor->csrMat->num_cols = A->cmap->n;
688       upTriFactor->csrMat->num_entries = a->nz;
689 
690       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
691       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
692 
693       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
694       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
695 
696       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
697       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
698 
699       /* set the operation */
700       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
701 
702       /* Create the solve analysis information */
703       stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
704     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
705       stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
706                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
707                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
708                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
709                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
710       cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
711     #endif
712 
713       /* perform the solve analysis */
714       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
715                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
716                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
717                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
718                               #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
719                                ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
720                               #endif
721                               );CHKERRCUSPARSE(stat);
722 
723       /* assign the pointer. Is this really necessary? */
724       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
725 
726       /* allocate space for the triangular factor information */
727       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
728 
729       /* Create the matrix description */
730       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
731       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
732      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
733       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
734      #else
735       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
736      #endif
737       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
738       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
739 
740       /* set the operation */
741       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
742 
743       /* set the matrix */
744       loTriFactor->csrMat = new CsrMatrix;
745       loTriFactor->csrMat->num_rows = A->rmap->n;
746       loTriFactor->csrMat->num_cols = A->cmap->n;
747       loTriFactor->csrMat->num_entries = a->nz;
748 
749       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
750       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
751 
752       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
753       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
754 
755       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
756       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
757       ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
758 
759       /* Create the solve analysis information */
760       stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
761     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
762       stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
763                                      loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
764                                      loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
765                                      loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
766                                      &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
767       cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
768     #endif
769 
770       /* perform the solve analysis */
771       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
772                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
773                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
774                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
775                               #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
776                                ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
777                               #endif
778                               );CHKERRCUSPARSE(stat);
779 
780       /* assign the pointer. Is this really necessary? */
781       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
782 
783       A->offloadmask = PETSC_OFFLOAD_BOTH;
784       cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
785       cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
786       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
787       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
788     } catch(char *ex) {
789       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
790     }
791   }
792   PetscFunctionReturn(0);
793 }
794 
MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)795 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
796 {
797   PetscErrorCode               ierr;
798   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
799   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
800   IS                           ip = a->row;
801   const PetscInt               *rip;
802   PetscBool                    perm_identity;
803   PetscInt                     n = A->rmap->n;
804 
805   PetscFunctionBegin;
806   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
807   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
808   cusparseTriFactors->workVector = new THRUSTARRAY(n);
809   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
810 
811   /* lower triangular indices */
812   ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
813   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
814   if (!perm_identity) {
815     IS             iip;
816     const PetscInt *irip;
817 
818     ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
819     ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
820     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
821     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
822     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
823     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
824     ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
825     ierr = ISDestroy(&iip);CHKERRQ(ierr);
826     ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
827   }
828   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
829   PetscFunctionReturn(0);
830 }
831 
MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo * info)832 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
833 {
834   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
835   IS             isrow = b->row,iscol = b->col;
836   PetscBool      row_identity,col_identity;
837   PetscErrorCode ierr;
838 
839   PetscFunctionBegin;
840   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
841   B->offloadmask = PETSC_OFFLOAD_CPU;
842   /* determine which version of MatSolve needs to be used. */
843   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
844   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
845   if (row_identity && col_identity) {
846     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
847     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
848     B->ops->matsolve = NULL;
849     B->ops->matsolvetranspose = NULL;
850   } else {
851     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
852     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
853     B->ops->matsolve = NULL;
854     B->ops->matsolvetranspose = NULL;
855   }
856 
857   /* get the triangular factors */
858   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
859   PetscFunctionReturn(0);
860 }
861 
MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo * info)862 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
863 {
864   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
865   IS             ip = b->row;
866   PetscBool      perm_identity;
867   PetscErrorCode ierr;
868 
869   PetscFunctionBegin;
870   ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
871   B->offloadmask = PETSC_OFFLOAD_CPU;
872   /* determine which version of MatSolve needs to be used. */
873   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
874   if (perm_identity) {
875     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
876     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
877     B->ops->matsolve = NULL;
878     B->ops->matsolvetranspose = NULL;
879   } else {
880     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
881     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
882     B->ops->matsolve = NULL;
883     B->ops->matsolvetranspose = NULL;
884   }
885 
886   /* get the triangular factors */
887   ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
888   PetscFunctionReturn(0);
889 }
890 
MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)891 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
892 {
893   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
894   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
895   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
896   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
897   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
898   cusparseStatus_t                  stat;
899   cusparseIndexBase_t               indexBase;
900   cusparseMatrixType_t              matrixType;
901   cusparseFillMode_t                fillMode;
902   cusparseDiagType_t                diagType;
903 
904   PetscFunctionBegin;
905 
906   /*********************************************/
907   /* Now the Transpose of the Lower Tri Factor */
908   /*********************************************/
909 
910   /* allocate space for the transpose of the lower triangular factor */
911   loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
912 
913   /* set the matrix descriptors of the lower triangular factor */
914   matrixType = cusparseGetMatType(loTriFactor->descr);
915   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
916   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
917     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
918   diagType = cusparseGetMatDiagType(loTriFactor->descr);
919 
920   /* Create the matrix description */
921   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
922   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
923   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
924   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
925   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
926 
927   /* set the operation */
928   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
929 
930   /* allocate GPU space for the CSC of the lower triangular factor*/
931   loTriFactorT->csrMat = new CsrMatrix;
932   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
933   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
934   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
935   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
936   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
937   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
938 
939   /* compute the transpose of the lower triangular factor, i.e. the CSC */
940 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
941   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
942                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
943                                        loTriFactor->csrMat->values->data().get(),
944                                        loTriFactor->csrMat->row_offsets->data().get(),
945                                        loTriFactor->csrMat->column_indices->data().get(),
946                                        loTriFactorT->csrMat->values->data().get(),
947                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
948                                        CUSPARSE_ACTION_NUMERIC,indexBase,
949                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
950   cudaError_t cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
951 #endif
952 
953   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
954                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
955                           loTriFactor->csrMat->values->data().get(),
956                           loTriFactor->csrMat->row_offsets->data().get(),
957                           loTriFactor->csrMat->column_indices->data().get(),
958                           loTriFactorT->csrMat->values->data().get(),
959                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
960                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
961                           CUSPARSE_ACTION_NUMERIC, indexBase,
962                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
963                         #else
964                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
965                           CUSPARSE_ACTION_NUMERIC, indexBase
966                         #endif
967                         );CHKERRCUSPARSE(stat);
968 
969   /* Create the solve analysis information */
970   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
971 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
972   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
973                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
974                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
975                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
976                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
977   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
978 #endif
979 
980   /* perform the solve analysis */
981   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
982                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
983                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
984                            loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
985                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
986                            ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
987                           #endif
988                           );CHKERRCUSPARSE(stat);
989 
990   /* assign the pointer. Is this really necessary? */
991   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
992 
993   /*********************************************/
994   /* Now the Transpose of the Upper Tri Factor */
995   /*********************************************/
996 
997   /* allocate space for the transpose of the upper triangular factor */
998   upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
999 
1000   /* set the matrix descriptors of the upper triangular factor */
1001   matrixType = cusparseGetMatType(upTriFactor->descr);
1002   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1003   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1004     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1005   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1006 
1007   /* Create the matrix description */
1008   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1009   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1010   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1011   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1012   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1013 
1014   /* set the operation */
1015   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1016 
1017   /* allocate GPU space for the CSC of the upper triangular factor*/
1018   upTriFactorT->csrMat = new CsrMatrix;
1019   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1020   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1021   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1022   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1023   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1024   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1025 
1026   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1027 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1028   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1029                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1030                                 upTriFactor->csrMat->values->data().get(),
1031                                 upTriFactor->csrMat->row_offsets->data().get(),
1032                                 upTriFactor->csrMat->column_indices->data().get(),
1033                                 upTriFactorT->csrMat->values->data().get(),
1034                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1035                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1036                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1037   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1038 #endif
1039 
1040   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1041                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1042                           upTriFactor->csrMat->values->data().get(),
1043                           upTriFactor->csrMat->row_offsets->data().get(),
1044                           upTriFactor->csrMat->column_indices->data().get(),
1045                           upTriFactorT->csrMat->values->data().get(),
1046                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1047                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1048                           CUSPARSE_ACTION_NUMERIC, indexBase,
1049                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
1050                         #else
1051                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1052                           CUSPARSE_ACTION_NUMERIC, indexBase
1053                         #endif
1054                         );CHKERRCUSPARSE(stat);
1055 
1056   /* Create the solve analysis information */
1057   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1058   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1059   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1060                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1061                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1062                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1063                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1064   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1065   #endif
1066 
1067   /* perform the solve analysis */
1068   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1069                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1070                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1071                            upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
1072                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1073                            ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1074                           #endif
1075                           );CHKERRCUSPARSE(stat);
1076 
1077   /* assign the pointer. Is this really necessary? */
1078   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1079   PetscFunctionReturn(0);
1080 }
1081 
MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)1082 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
1083 {
1084   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1085   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1086   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1087   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1088   cusparseStatus_t             stat;
1089   cusparseIndexBase_t          indexBase;
1090   cudaError_t                  err;
1091   PetscErrorCode               ierr;
1092 
1093   PetscFunctionBegin;
1094   if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
1095   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1096   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1097   /* create cusparse matrix */
1098   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1099   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1100   indexBase = cusparseGetMatIndexBase(matstruct->descr);
1101   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
1102   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1103 
1104   /* set alpha and beta */
1105   err = cudaMalloc((void **)&(matstructT->alpha_one),    sizeof(PetscScalar));CHKERRCUDA(err);
1106   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1107   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1108   err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1109   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1110   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1111   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1112 
1113   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1114     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1115     CsrMatrix *matrixT= new CsrMatrix;
1116     matrixT->num_rows = A->cmap->n;
1117     matrixT->num_cols = A->rmap->n;
1118     matrixT->num_entries = a->nz;
1119     matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1120     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1121     matrixT->values = new THRUSTARRAY(a->nz);
1122 
1123     cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
1124     cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1125 
1126     /* compute the transpose, i.e. the CSC */
1127    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1128     stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1129                                   A->cmap->n, matrix->num_entries,
1130                                   matrix->values->data().get(),
1131                                   cusparsestruct->rowoffsets_gpu->data().get(),
1132                                   matrix->column_indices->data().get(),
1133                                   matrixT->values->data().get(),
1134                                   matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1135                                   CUSPARSE_ACTION_NUMERIC,indexBase,
1136                                   cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1137     err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
1138    #endif
1139 
1140     stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1141                             A->cmap->n, matrix->num_entries,
1142                             matrix->values->data().get(),
1143                             cusparsestruct->rowoffsets_gpu->data().get(),
1144                             matrix->column_indices->data().get(),
1145                             matrixT->values->data().get(),
1146                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1147                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1148                             CUSPARSE_ACTION_NUMERIC,indexBase,
1149                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1150                           #else
1151                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1152                             CUSPARSE_ACTION_NUMERIC, indexBase
1153                           #endif
1154                            );CHKERRCUSPARSE(stat);
1155     matstructT->mat = matrixT;
1156 
1157    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1158     stat = cusparseCreateCsr(&matstructT->matDescr,
1159                              matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1160                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1161                              matrixT->values->data().get(),
1162                              CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1163                              indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1164    #endif
1165   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1166    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1167     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1168    #else
1169     CsrMatrix *temp  = new CsrMatrix;
1170     CsrMatrix *tempT = new CsrMatrix;
1171     /* First convert HYB to CSR */
1172     temp->num_rows = A->rmap->n;
1173     temp->num_cols = A->cmap->n;
1174     temp->num_entries = a->nz;
1175     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1176     temp->column_indices = new THRUSTINTARRAY32(a->nz);
1177     temp->values = new THRUSTARRAY(a->nz);
1178 
1179     stat = cusparse_hyb2csr(cusparsestruct->handle,
1180                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1181                             temp->values->data().get(),
1182                             temp->row_offsets->data().get(),
1183                             temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1184 
1185     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1186     tempT->num_rows = A->rmap->n;
1187     tempT->num_cols = A->cmap->n;
1188     tempT->num_entries = a->nz;
1189     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1190     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1191     tempT->values = new THRUSTARRAY(a->nz);
1192 
1193     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1194                             temp->num_cols, temp->num_entries,
1195                             temp->values->data().get(),
1196                             temp->row_offsets->data().get(),
1197                             temp->column_indices->data().get(),
1198                             tempT->values->data().get(),
1199                             tempT->column_indices->data().get(),
1200                             tempT->row_offsets->data().get(),
1201                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1202 
1203     /* Last, convert CSC to HYB */
1204     cusparseHybMat_t hybMat;
1205     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1206     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1207       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1208     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1209                             matstructT->descr, tempT->values->data().get(),
1210                             tempT->row_offsets->data().get(),
1211                             tempT->column_indices->data().get(),
1212                             hybMat, 0, partition);CHKERRCUSPARSE(stat);
1213 
1214     /* assign the pointer */
1215     matstructT->mat = hybMat;
1216     /* delete temporaries */
1217     if (tempT) {
1218       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1219       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1220       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1221       delete (CsrMatrix*) tempT;
1222     }
1223     if (temp) {
1224       if (temp->values) delete (THRUSTARRAY*) temp->values;
1225       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1226       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1227       delete (CsrMatrix*) temp;
1228     }
1229    #endif
1230   }
1231   err  = WaitForCUDA();CHKERRCUDA(err);
1232   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1233   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1234   /* the compressed row indices is not used for matTranspose */
1235   matstructT->cprowIndices = NULL;
1236   /* assign the pointer */
1237   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1238   PetscFunctionReturn(0);
1239 }
1240 
1241 /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)1242 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1243 {
1244   PetscInt                              n = xx->map->n;
1245   const PetscScalar                     *barray;
1246   PetscScalar                           *xarray;
1247   thrust::device_ptr<const PetscScalar> bGPU;
1248   thrust::device_ptr<PetscScalar>       xGPU;
1249   cusparseStatus_t                      stat;
1250   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1251   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1252   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1253   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1254   PetscErrorCode                        ierr;
1255   cudaError_t                           cerr;
1256 
1257   PetscFunctionBegin;
1258   /* Analyze the matrix and create the transpose ... on the fly */
1259   if (!loTriFactorT && !upTriFactorT) {
1260     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1261     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1262     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1263   }
1264 
1265   /* Get the GPU pointers */
1266   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1267   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1268   xGPU = thrust::device_pointer_cast(xarray);
1269   bGPU = thrust::device_pointer_cast(barray);
1270 
1271   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1272   /* First, reorder with the row permutation */
1273   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1274                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1275                xGPU);
1276 
1277   /* First, solve U */
1278   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1279                         upTriFactorT->csrMat->num_rows,
1280                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1281                         upTriFactorT->csrMat->num_entries,
1282                       #endif
1283                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1284                         upTriFactorT->csrMat->values->data().get(),
1285                         upTriFactorT->csrMat->row_offsets->data().get(),
1286                         upTriFactorT->csrMat->column_indices->data().get(),
1287                         upTriFactorT->solveInfo,
1288                         xarray, tempGPU->data().get()
1289                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1290                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1291                       #endif
1292                         );CHKERRCUSPARSE(stat);
1293 
1294   /* Then, solve L */
1295   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1296                         loTriFactorT->csrMat->num_rows,
1297                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1298                         loTriFactorT->csrMat->num_entries,
1299                       #endif
1300                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1301                         loTriFactorT->csrMat->values->data().get(),
1302                         loTriFactorT->csrMat->row_offsets->data().get(),
1303                         loTriFactorT->csrMat->column_indices->data().get(),
1304                         loTriFactorT->solveInfo,
1305                         tempGPU->data().get(), xarray
1306                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1307                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1308                       #endif
1309                         );CHKERRCUSPARSE(stat);
1310 
1311   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1312   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1313                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1314                tempGPU->begin());
1315 
1316   /* Copy the temporary to the full solution. */
1317   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1318 
1319   /* restore */
1320   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1321   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1322   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1323   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1324   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1325   PetscFunctionReturn(0);
1326 }
1327 
MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)1328 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1329 {
1330   const PetscScalar                 *barray;
1331   PetscScalar                       *xarray;
1332   cusparseStatus_t                  stat;
1333   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1334   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1335   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1336   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1337   PetscErrorCode                    ierr;
1338   cudaError_t                       cerr;
1339 
1340   PetscFunctionBegin;
1341   /* Analyze the matrix and create the transpose ... on the fly */
1342   if (!loTriFactorT && !upTriFactorT) {
1343     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1344     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1345     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1346   }
1347 
1348   /* Get the GPU pointers */
1349   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1350   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1351 
1352   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1353   /* First, solve U */
1354   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1355                         upTriFactorT->csrMat->num_rows,
1356                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1357                         upTriFactorT->csrMat->num_entries,
1358                       #endif
1359                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1360                         upTriFactorT->csrMat->values->data().get(),
1361                         upTriFactorT->csrMat->row_offsets->data().get(),
1362                         upTriFactorT->csrMat->column_indices->data().get(),
1363                         upTriFactorT->solveInfo,
1364                         barray, tempGPU->data().get()
1365                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1366                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1367                       #endif
1368                         );CHKERRCUSPARSE(stat);
1369 
1370   /* Then, solve L */
1371   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1372                         loTriFactorT->csrMat->num_rows,
1373                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1374                         loTriFactorT->csrMat->num_entries,
1375                       #endif
1376                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1377                         loTriFactorT->csrMat->values->data().get(),
1378                         loTriFactorT->csrMat->row_offsets->data().get(),
1379                         loTriFactorT->csrMat->column_indices->data().get(),
1380                         loTriFactorT->solveInfo,
1381                         tempGPU->data().get(), xarray
1382                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1383                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1384                       #endif
1385                         );CHKERRCUSPARSE(stat);
1386 
1387   /* restore */
1388   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1389   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1390   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1391   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1392   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1393   PetscFunctionReturn(0);
1394 }
1395 
MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)1396 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1397 {
1398   const PetscScalar                     *barray;
1399   PetscScalar                           *xarray;
1400   thrust::device_ptr<const PetscScalar> bGPU;
1401   thrust::device_ptr<PetscScalar>       xGPU;
1402   cusparseStatus_t                      stat;
1403   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1404   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1405   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1406   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1407   PetscErrorCode                        ierr;
1408   cudaError_t                           cerr;
1409 
1410   PetscFunctionBegin;
1411 
1412   /* Get the GPU pointers */
1413   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1414   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1415   xGPU = thrust::device_pointer_cast(xarray);
1416   bGPU = thrust::device_pointer_cast(barray);
1417 
1418   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1419   /* First, reorder with the row permutation */
1420   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1421                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1422                tempGPU->begin());
1423 
1424   /* Next, solve L */
1425   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1426                         loTriFactor->csrMat->num_rows,
1427                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1428                         loTriFactor->csrMat->num_entries,
1429                       #endif
1430                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1431                         loTriFactor->csrMat->values->data().get(),
1432                         loTriFactor->csrMat->row_offsets->data().get(),
1433                         loTriFactor->csrMat->column_indices->data().get(),
1434                         loTriFactor->solveInfo,
1435                         tempGPU->data().get(), xarray
1436                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1437                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1438                       #endif
1439                         );CHKERRCUSPARSE(stat);
1440 
1441   /* Then, solve U */
1442   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1443                         upTriFactor->csrMat->num_rows,
1444                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1445                         upTriFactor->csrMat->num_entries,
1446                       #endif
1447                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1448                         upTriFactor->csrMat->values->data().get(),
1449                         upTriFactor->csrMat->row_offsets->data().get(),
1450                         upTriFactor->csrMat->column_indices->data().get(),
1451                         upTriFactor->solveInfo,
1452                         xarray, tempGPU->data().get()
1453                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1454                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1455                       #endif
1456                         );CHKERRCUSPARSE(stat);
1457 
1458   /* Last, reorder with the column permutation */
1459   thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1460                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1461                xGPU);
1462 
1463   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1464   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1465   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1466   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1467   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1468   PetscFunctionReturn(0);
1469 }
1470 
MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)1471 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1472 {
1473   const PetscScalar                 *barray;
1474   PetscScalar                       *xarray;
1475   cusparseStatus_t                  stat;
1476   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1477   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1478   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1479   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1480   PetscErrorCode                    ierr;
1481   cudaError_t                       cerr;
1482 
1483   PetscFunctionBegin;
1484   /* Get the GPU pointers */
1485   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1486   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1487 
1488   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1489   /* First, solve L */
1490   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1491                         loTriFactor->csrMat->num_rows,
1492                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1493                         loTriFactor->csrMat->num_entries,
1494                       #endif
1495                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1496                         loTriFactor->csrMat->values->data().get(),
1497                         loTriFactor->csrMat->row_offsets->data().get(),
1498                         loTriFactor->csrMat->column_indices->data().get(),
1499                         loTriFactor->solveInfo,
1500                         barray, tempGPU->data().get()
1501                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1502                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1503                       #endif
1504                         );CHKERRCUSPARSE(stat);
1505 
1506   /* Next, solve U */
1507   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1508                         upTriFactor->csrMat->num_rows,
1509                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1510                         upTriFactor->csrMat->num_entries,
1511                       #endif
1512                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1513                         upTriFactor->csrMat->values->data().get(),
1514                         upTriFactor->csrMat->row_offsets->data().get(),
1515                         upTriFactor->csrMat->column_indices->data().get(),
1516                         upTriFactor->solveInfo,
1517                         tempGPU->data().get(), xarray
1518                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1519                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1520                       #endif
1521                         );CHKERRCUSPARSE(stat);
1522 
1523   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1524   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1525   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1526   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1527   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1528   PetscFunctionReturn(0);
1529 }
1530 
MatSeqAIJCUSPARSECopyToGPU(Mat A)1531 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1532 {
1533   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1534   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1535   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1536   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1537   PetscErrorCode               ierr;
1538   cusparseStatus_t             stat;
1539   cudaError_t                  err;
1540 
1541   PetscFunctionBegin;
1542   if (A->boundtocpu) PetscFunctionReturn(0);
1543   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1544     if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1545       /* Copy values only */
1546       CsrMatrix *matrix,*matrixT;
1547       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
1548 
1549       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1550       matrix->values->assign(a->a, a->a+a->nz);
1551       err  = WaitForCUDA();CHKERRCUDA(err);
1552       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
1553       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1554 
1555       /* Update matT when it was built before */
1556       if (cusparsestruct->matTranspose) {
1557         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1558         matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
1559         ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1560         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1561                             A->cmap->n, matrix->num_entries,
1562                             matrix->values->data().get(),
1563                             cusparsestruct->rowoffsets_gpu->data().get(),
1564                             matrix->column_indices->data().get(),
1565                             matrixT->values->data().get(),
1566                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1567                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1568                             CUSPARSE_ACTION_NUMERIC,indexBase,
1569                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1570                           #else
1571                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1572                             CUSPARSE_ACTION_NUMERIC, indexBase
1573                           #endif
1574                            );CHKERRCUSPARSE(stat);
1575         err  = WaitForCUDA();CHKERRCUDA(err);
1576         ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1577       }
1578     } else {
1579       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1580       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
1581       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
1582       delete cusparsestruct->workVector;
1583       delete cusparsestruct->rowoffsets_gpu;
1584       try {
1585         if (a->compressedrow.use) {
1586           m    = a->compressedrow.nrows;
1587           ii   = a->compressedrow.i;
1588           ridx = a->compressedrow.rindex;
1589         } else {
1590           m    = A->rmap->n;
1591           ii   = a->i;
1592           ridx = NULL;
1593         }
1594         cusparsestruct->nrows = m;
1595 
1596         /* create cusparse matrix */
1597         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1598         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1599         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1600         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1601 
1602         err = cudaMalloc((void **)&(matstruct->alpha_one),    sizeof(PetscScalar));CHKERRCUDA(err);
1603         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1604         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1605         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1606         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1607         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1608         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1609 
1610         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1611         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1612           /* set the matrix */
1613           CsrMatrix *mat= new CsrMatrix;
1614           mat->num_rows = m;
1615           mat->num_cols = A->cmap->n;
1616           mat->num_entries = a->nz;
1617           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1618           mat->row_offsets->assign(ii, ii + m+1);
1619 
1620           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1621           mat->column_indices->assign(a->j, a->j+a->nz);
1622 
1623           mat->values = new THRUSTARRAY(a->nz);
1624           mat->values->assign(a->a, a->a+a->nz);
1625 
1626           /* assign the pointer */
1627           matstruct->mat = mat;
1628          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1629           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1630             stat = cusparseCreateCsr(&matstruct->matDescr,
1631                                     mat->num_rows, mat->num_cols, mat->num_entries,
1632                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1633                                     mat->values->data().get(),
1634                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1635                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1636           }
1637          #endif
1638         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1639          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1640           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1641          #else
1642           CsrMatrix *mat= new CsrMatrix;
1643           mat->num_rows = m;
1644           mat->num_cols = A->cmap->n;
1645           mat->num_entries = a->nz;
1646           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1647           mat->row_offsets->assign(ii, ii + m+1);
1648 
1649           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1650           mat->column_indices->assign(a->j, a->j+a->nz);
1651 
1652           mat->values = new THRUSTARRAY(a->nz);
1653           mat->values->assign(a->a, a->a+a->nz);
1654 
1655           cusparseHybMat_t hybMat;
1656           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1657           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1658             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1659           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1660               matstruct->descr, mat->values->data().get(),
1661               mat->row_offsets->data().get(),
1662               mat->column_indices->data().get(),
1663               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1664           /* assign the pointer */
1665           matstruct->mat = hybMat;
1666 
1667           if (mat) {
1668             if (mat->values) delete (THRUSTARRAY*)mat->values;
1669             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1670             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1671             delete (CsrMatrix*)mat;
1672           }
1673          #endif
1674         }
1675 
1676         /* assign the compressed row indices */
1677         if (a->compressedrow.use) {
1678           cusparsestruct->workVector = new THRUSTARRAY(m);
1679           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1680           matstruct->cprowIndices->assign(ridx,ridx+m);
1681           tmp = m;
1682         } else {
1683           cusparsestruct->workVector = NULL;
1684           matstruct->cprowIndices    = NULL;
1685           tmp = 0;
1686         }
1687         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1688 
1689         /* assign the pointer */
1690         cusparsestruct->mat = matstruct;
1691       } catch(char *ex) {
1692         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1693       }
1694       err  = WaitForCUDA();CHKERRCUDA(err);
1695       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1696       cusparsestruct->nonzerostate = A->nonzerostate;
1697     }
1698     A->offloadmask = PETSC_OFFLOAD_BOTH;
1699   }
1700   PetscFunctionReturn(0);
1701 }
1702 
1703 struct VecCUDAPlusEquals
1704 {
1705   template <typename Tuple>
1706   __host__ __device__
operator ()VecCUDAPlusEquals1707   void operator()(Tuple t)
1708   {
1709     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1710   }
1711 };
1712 
1713 struct VecCUDAEqualsReverse
1714 {
1715   template <typename Tuple>
1716   __host__ __device__
operator ()VecCUDAEqualsReverse1717   void operator()(Tuple t)
1718   {
1719     thrust::get<0>(t) = thrust::get<1>(t);
1720   }
1721 };
1722 
1723 struct MatMatCusparse {
1724   PetscBool            cisdense;
1725   PetscScalar          *Bt;
1726   Mat                  X;
1727 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1728   PetscBool            initialized;   /* C = alpha op(A) op(B) + beta C */
1729   cusparseDnMatDescr_t matBDescr;
1730   cusparseDnMatDescr_t matCDescr;
1731   size_t               spmmBufferSize;
1732   void                 *spmmBuffer;
1733   PetscInt             Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
1734 #endif
1735 };
1736 
MatDestroy_MatMatCusparse(void * data)1737 static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1738 {
1739   PetscErrorCode ierr;
1740   MatMatCusparse *mmdata = (MatMatCusparse *)data;
1741   cudaError_t    cerr;
1742 
1743   PetscFunctionBegin;
1744   cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1745  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1746   cusparseStatus_t stat;
1747   if (mmdata->matBDescr)  {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat);}
1748   if (mmdata->matCDescr)  {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat);}
1749   if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
1750  #endif
1751   ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
1752   ierr = PetscFree(data);CHKERRQ(ierr);
1753   PetscFunctionReturn(0);
1754 }
1755 
1756 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1757 
MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)1758 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1759 {
1760   Mat_Product                  *product = C->product;
1761   Mat                          A,B;
1762   PetscInt                     m,n,blda,clda;
1763   PetscBool                    flg,biscuda;
1764   Mat_SeqAIJCUSPARSE           *cusp;
1765   cusparseStatus_t             stat;
1766   cusparseOperation_t          opA;
1767   const PetscScalar            *barray;
1768   PetscScalar                  *carray;
1769   PetscErrorCode               ierr;
1770   MatMatCusparse               *mmdata;
1771   Mat_SeqAIJCUSPARSEMultStruct *mat;
1772   CsrMatrix                    *csrmat;
1773   cudaError_t                  cerr;
1774 
1775   PetscFunctionBegin;
1776   MatCheckProduct(C,1);
1777   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1778   mmdata = (MatMatCusparse*)product->data;
1779   A    = product->A;
1780   B    = product->B;
1781   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1782   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1783   /* currently CopyToGpu does not copy if the matrix is bound to CPU
1784      Instead of silently accepting the wrong answer, I prefer to raise the error */
1785   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1786   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1787   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1788   switch (product->type) {
1789   case MATPRODUCT_AB:
1790   case MATPRODUCT_PtAP:
1791     mat = cusp->mat;
1792     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1793     m   = A->rmap->n;
1794     n   = B->cmap->n;
1795     break;
1796   case MATPRODUCT_AtB:
1797     if (!cusp->transgen) {
1798       mat = cusp->mat;
1799       opA = CUSPARSE_OPERATION_TRANSPOSE;
1800     } else {
1801       ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1802       mat  = cusp->matTranspose;
1803       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1804     }
1805     m = A->cmap->n;
1806     n = B->cmap->n;
1807     break;
1808   case MATPRODUCT_ABt:
1809   case MATPRODUCT_RARt:
1810     mat = cusp->mat;
1811     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1812     m   = A->rmap->n;
1813     n   = B->rmap->n;
1814     break;
1815   default:
1816     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1817   }
1818   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
1819   csrmat = (CsrMatrix*)mat->mat;
1820   /* if the user passed a CPU matrix, copy the data to the GPU */
1821   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
1822   if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
1823   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
1824 
1825   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
1826   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1827     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1828     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
1829   } else {
1830     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
1831     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
1832   }
1833 
1834   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1835  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1836   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
1837   /* (re)allcoate spmmBuffer if not initialized or LDAs are different */
1838   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
1839     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
1840     if (!mmdata->matBDescr) {
1841       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
1842       mmdata->Blda = blda;
1843     }
1844 
1845     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
1846     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
1847       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
1848       mmdata->Clda = clda;
1849     }
1850 
1851     if (!mat->matDescr) {
1852       stat = cusparseCreateCsr(&mat->matDescr,
1853                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
1854                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
1855                               csrmat->values->data().get(),
1856                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1857                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1858     }
1859     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
1860                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
1861                                    mmdata->matCDescr,cusparse_scalartype,
1862                                    cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat);
1863     if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
1864     cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr);
1865     mmdata->initialized = PETSC_TRUE;
1866   } else {
1867     /* to be safe, always update pointers of the mats */
1868     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
1869     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
1870     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
1871   }
1872 
1873   /* do cusparseSpMM, which supports transpose on B */
1874   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
1875                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
1876                       mmdata->matCDescr,cusparse_scalartype,
1877                       cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat);
1878  #else
1879   PetscInt k;
1880   /* cusparseXcsrmm does not support transpose on B */
1881   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1882     cublasHandle_t cublasv2handle;
1883     cublasStatus_t cerr;
1884 
1885     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
1886     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
1887                        B->cmap->n,B->rmap->n,
1888                        &PETSC_CUSPARSE_ONE ,barray,blda,
1889                        &PETSC_CUSPARSE_ZERO,barray,blda,
1890                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
1891     blda = B->cmap->n;
1892     k    = B->cmap->n;
1893   } else {
1894     k    = B->rmap->n;
1895   }
1896 
1897   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
1898   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
1899                            csrmat->num_entries,mat->alpha_one,mat->descr,
1900                            csrmat->values->data().get(),
1901                            csrmat->row_offsets->data().get(),
1902                            csrmat->column_indices->data().get(),
1903                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
1904                            carray,clda);CHKERRCUSPARSE(stat);
1905  #endif
1906   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1907   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1908   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
1909   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
1910   if (product->type == MATPRODUCT_RARt) {
1911     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1912     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
1913   } else if (product->type == MATPRODUCT_PtAP) {
1914     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1915     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
1916   } else {
1917     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
1918   }
1919   if (mmdata->cisdense) {
1920     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
1921   }
1922   if (!biscuda) {
1923     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
1924   }
1925   PetscFunctionReturn(0);
1926 }
1927 
MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)1928 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1929 {
1930   Mat_Product        *product = C->product;
1931   Mat                A,B;
1932   PetscInt           m,n;
1933   PetscBool          cisdense,flg;
1934   PetscErrorCode     ierr;
1935   MatMatCusparse     *mmdata;
1936   Mat_SeqAIJCUSPARSE *cusp;
1937 
1938   PetscFunctionBegin;
1939   MatCheckProduct(C,1);
1940   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
1941   A    = product->A;
1942   B    = product->B;
1943   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1944   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1945   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1946   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
1947   switch (product->type) {
1948   case MATPRODUCT_AB:
1949     m = A->rmap->n;
1950     n = B->cmap->n;
1951     break;
1952   case MATPRODUCT_AtB:
1953     m = A->cmap->n;
1954     n = B->cmap->n;
1955     break;
1956   case MATPRODUCT_ABt:
1957     m = A->rmap->n;
1958     n = B->rmap->n;
1959     break;
1960   case MATPRODUCT_PtAP:
1961     m = B->cmap->n;
1962     n = B->cmap->n;
1963     break;
1964   case MATPRODUCT_RARt:
1965     m = B->rmap->n;
1966     n = B->rmap->n;
1967     break;
1968   default:
1969     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1970   }
1971   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
1972   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
1973   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
1974   ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);
1975 
1976   /* product data */
1977   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
1978   mmdata->cisdense = cisdense;
1979  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
1980   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
1981   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1982     cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
1983   }
1984  #endif
1985   /* for these products we need intermediate storage */
1986   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1987     ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
1988     ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
1989     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
1990       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
1991     } else {
1992       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
1993     }
1994   }
1995   C->product->data    = mmdata;
1996   C->product->destroy = MatDestroy_MatMatCusparse;
1997 
1998   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
1999   PetscFunctionReturn(0);
2000 }
2001 
2002 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2003 
2004 /* handles dense B */
MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)2005 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
2006 {
2007   Mat_Product    *product = C->product;
2008   PetscErrorCode ierr;
2009 
2010   PetscFunctionBegin;
2011   MatCheckProduct(C,1);
2012   if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
2013   if (product->A->boundtocpu) {
2014     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
2015     PetscFunctionReturn(0);
2016   }
2017   switch (product->type) {
2018   case MATPRODUCT_AB:
2019   case MATPRODUCT_AtB:
2020   case MATPRODUCT_ABt:
2021   case MATPRODUCT_PtAP:
2022   case MATPRODUCT_RARt:
2023     C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2024   default:
2025     break;
2026   }
2027   PetscFunctionReturn(0);
2028 }
2029 
MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)2030 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2031 {
2032   PetscErrorCode ierr;
2033 
2034   PetscFunctionBegin;
2035   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2036   PetscFunctionReturn(0);
2037 }
2038 
MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)2039 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2040 {
2041   PetscErrorCode ierr;
2042 
2043   PetscFunctionBegin;
2044   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2045   PetscFunctionReturn(0);
2046 }
2047 
MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)2048 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2049 {
2050   PetscErrorCode ierr;
2051 
2052   PetscFunctionBegin;
2053   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
2054   PetscFunctionReturn(0);
2055 }
2056 
MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)2057 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2058 {
2059   PetscErrorCode ierr;
2060 
2061   PetscFunctionBegin;
2062   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
2063   PetscFunctionReturn(0);
2064 }
2065 
MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)2066 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2067 {
2068   PetscErrorCode ierr;
2069 
2070   PetscFunctionBegin;
2071   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2072   PetscFunctionReturn(0);
2073 }
2074 
2075 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)2076 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2077 {
2078   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2079   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2080   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2081   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2082   PetscErrorCode               ierr;
2083   cudaError_t                  cerr;
2084   cusparseStatus_t             stat;
2085   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2086   PetscBool                    compressed;
2087 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2088   PetscInt                     nx,ny;
2089 #endif
2090 
2091   PetscFunctionBegin;
2092   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
2093   if (!a->nonzerorowcnt) {
2094     if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2095     PetscFunctionReturn(0);
2096   }
2097   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2098   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2099   if (!trans) {
2100     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2101   } else {
2102     if (herm || !cusparsestruct->transgen) {
2103       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2104       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2105     } else {
2106       if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
2107       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2108     }
2109   }
2110   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2111   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2112 
2113   try {
2114     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2115     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2116     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2117 
2118     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2119     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2120       /* z = A x + beta y.
2121          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2122          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2123       */
2124       xptr = xarray;
2125       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2126       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2127      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2128       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2129           allocated to accommodate different uses. So we get the length info directly from mat.
2130        */
2131       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2132         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2133         nx = mat->num_cols;
2134         ny = mat->num_rows;
2135       }
2136      #endif
2137     } else {
2138       /* z = A^T x + beta y
2139          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2140          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2141        */
2142       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2143       dptr = zarray;
2144       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2145       if (compressed) { /* Scatter x to work vector */
2146         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2147         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2148                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2149                          VecCUDAEqualsReverse());
2150       }
2151      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2152       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2153         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2154         nx = mat->num_rows;
2155         ny = mat->num_cols;
2156       }
2157      #endif
2158     }
2159 
2160     /* csr_spmv does y = alpha op(A) x + beta y */
2161     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2162      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2163       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2164       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2165         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2166         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2167         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2168                                 matstruct->matDescr,
2169                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2170                                 matstruct->cuSpMV[opA].vecYDescr,
2171                                 cusparse_scalartype,
2172                                 cusparsestruct->spmvAlg,
2173                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2174         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2175 
2176         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2177       } else {
2178         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2179         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2180         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2181       }
2182 
2183       stat = cusparseSpMV(cusparsestruct->handle, opA,
2184                                matstruct->alpha_one,
2185                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
2186                                matstruct->cuSpMV[opA].vecXDescr,
2187                                beta,
2188                                matstruct->cuSpMV[opA].vecYDescr,
2189                                cusparse_scalartype,
2190                                cusparsestruct->spmvAlg,
2191                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2192      #else
2193       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2194       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2195                                mat->num_rows, mat->num_cols,
2196                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2197                                mat->values->data().get(), mat->row_offsets->data().get(),
2198                                mat->column_indices->data().get(), xptr, beta,
2199                                dptr);CHKERRCUSPARSE(stat);
2200      #endif
2201     } else {
2202       if (cusparsestruct->nrows) {
2203        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2204         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2205        #else
2206         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2207         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2208                                  matstruct->alpha_one, matstruct->descr, hybMat,
2209                                  xptr, beta,
2210                                  dptr);CHKERRCUSPARSE(stat);
2211        #endif
2212       }
2213     }
2214     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2215     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2216 
2217     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2218       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2219         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2220           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2221         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2222           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2223         }
2224       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2225         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
2226       }
2227 
2228       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2229       if (compressed) {
2230         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2231 
2232         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2233         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2234                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2235                          VecCUDAPlusEquals());
2236         cerr = WaitForCUDA();CHKERRCUDA(cerr);
2237         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2238       }
2239     } else {
2240       if (yy && yy != zz) {
2241         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2242       }
2243     }
2244     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2245     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
2246     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
2247   } catch(char *ex) {
2248     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
2249   }
2250   if (yy) {
2251     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
2252   } else {
2253     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
2254   }
2255   PetscFunctionReturn(0);
2256 }
2257 
MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)2258 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2259 {
2260   PetscErrorCode ierr;
2261 
2262   PetscFunctionBegin;
2263   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2264   PetscFunctionReturn(0);
2265 }
2266 
MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)2267 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
2268 {
2269   PetscErrorCode              ierr;
2270   PetscSplitCSRDataStructure  *d_mat = NULL, h_mat;
2271   PetscBool                   is_seq = PETSC_TRUE;
2272   PetscInt                    nnz_state = A->nonzerostate;
2273   PetscFunctionBegin;
2274   if (A->factortype == MAT_FACTOR_NONE) {
2275     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
2276   }
2277   if (d_mat) {
2278     cudaError_t err;
2279     ierr = PetscInfo(A,"Assemble device matrix\n");CHKERRQ(ierr);
2280     err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
2281     nnz_state = h_mat.nonzerostate;
2282     is_seq = h_mat.seq;
2283   }
2284   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
2285   if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
2286   if (A->factortype == MAT_FACTOR_NONE && A->nonzerostate >= nnz_state && is_seq) { // assembled on CPU eventhough equiped for GPU
2287     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2288   } else if (nnz_state > A->nonzerostate) {
2289     A->offloadmask = PETSC_OFFLOAD_GPU;
2290   }
2291 
2292   PetscFunctionReturn(0);
2293 }
2294 
2295 /* --------------------------------------------------------------------------------*/
2296 /*@
2297    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
2298    (the default parallel PETSc format). This matrix will ultimately pushed down
2299    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2300    assembly performance the user should preallocate the matrix storage by setting
2301    the parameter nz (or the array nnz).  By setting these parameters accurately,
2302    performance during matrix assembly can be increased by more than a factor of 50.
2303 
2304    Collective
2305 
2306    Input Parameters:
2307 +  comm - MPI communicator, set to PETSC_COMM_SELF
2308 .  m - number of rows
2309 .  n - number of columns
2310 .  nz - number of nonzeros per row (same for all rows)
2311 -  nnz - array containing the number of nonzeros in the various rows
2312          (possibly different for each row) or NULL
2313 
2314    Output Parameter:
2315 .  A - the matrix
2316 
2317    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2318    MatXXXXSetPreallocation() paradgm instead of this routine directly.
2319    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
2320 
2321    Notes:
2322    If nnz is given then nz is ignored
2323 
2324    The AIJ format (also called the Yale sparse matrix format or
2325    compressed row storage), is fully compatible with standard Fortran 77
2326    storage.  That is, the stored row and column indices can begin at
2327    either one (as in Fortran) or zero.  See the users' manual for details.
2328 
2329    Specify the preallocated storage with either nz or nnz (not both).
2330    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
2331    allocation.  For large problems you MUST preallocate memory or you
2332    will get TERRIBLE performance, see the users' manual chapter on matrices.
2333 
2334    By default, this format uses inodes (identical nodes) when possible, to
2335    improve numerical efficiency of matrix-vector products and solves. We
2336    search for consecutive rows with the same nonzero structure, thereby
2337    reusing matrix information to achieve increased efficiency.
2338 
2339    Level: intermediate
2340 
2341 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
2342 @*/
MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat * A)2343 PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
2344 {
2345   PetscErrorCode ierr;
2346 
2347   PetscFunctionBegin;
2348   ierr = MatCreate(comm,A);CHKERRQ(ierr);
2349   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
2350   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2351   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
2352   PetscFunctionReturn(0);
2353 }
2354 
MatDestroy_SeqAIJCUSPARSE(Mat A)2355 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
2356 {
2357   PetscErrorCode              ierr;
2358   PetscSplitCSRDataStructure  *d_mat = NULL;
2359 
2360   PetscFunctionBegin;
2361   if (A->factortype == MAT_FACTOR_NONE) {
2362     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
2363     ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
2364     ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
2365   } else {
2366     ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
2367   }
2368   if (d_mat) {
2369     Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
2370     cudaError_t                err;
2371     PetscSplitCSRDataStructure h_mat;
2372     ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
2373     err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
2374     if (h_mat.seq) {
2375       if (a->compressedrow.use) {
2376  	err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
2377       }
2378       err = cudaFree(d_mat);CHKERRCUDA(err);
2379     }
2380   }
2381   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
2382   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
2383   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
2384   ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
2385   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
2386   PetscFunctionReturn(0);
2387 }
2388 
2389 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
2390 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat * B)2391 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
2392 {
2393   PetscErrorCode ierr;
2394 
2395   PetscFunctionBegin;
2396   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
2397   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
2398   PetscFunctionReturn(0);
2399 }
2400 
MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)2401 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
2402 {
2403   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
2404   PetscErrorCode ierr;
2405 
2406   PetscFunctionBegin;
2407   if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
2408   /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETS_OFFLOAD_GPU.
2409      If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
2410      Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
2411      TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matries;
2412            can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
2413   if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
2414   /* TODO: add support for this? */
2415   if (flg) {
2416     A->ops->mult                      = MatMult_SeqAIJ;
2417     A->ops->multadd                   = MatMultAdd_SeqAIJ;
2418     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
2419     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
2420     A->ops->multhermitiantranspose    = NULL;
2421     A->ops->multhermitiantransposeadd = NULL;
2422     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
2423     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
2424   } else {
2425     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
2426     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
2427     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
2428     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
2429     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
2430     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
2431     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
2432     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
2433   }
2434   A->boundtocpu = flg;
2435   a->inode.use = flg;
2436   PetscFunctionReturn(0);
2437 }
2438 
MatZeroEntries_SeqAIJCUSPARSE(Mat A)2439 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
2440 {
2441   PetscSplitCSRDataStructure  *d_mat = NULL;
2442   PetscErrorCode               ierr;
2443   PetscFunctionBegin;
2444   if (A->factortype == MAT_FACTOR_NONE) {
2445     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
2446     d_mat = spptr->deviceMat;
2447   }
2448   if (d_mat) {
2449     Mat_SeqAIJ   *a = (Mat_SeqAIJ*)A->data;
2450     PetscInt     n = A->rmap->n, nnz = a->i[n];
2451     cudaError_t  err;
2452     PetscScalar  *vals;
2453     ierr = PetscInfo(A,"Zero device matrix\n");CHKERRQ(ierr);
2454     err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
2455     err = cudaMemset( vals, 0, (nnz)*sizeof(PetscScalar));CHKERRCUDA(err);
2456   }
2457   ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
2458 
2459   PetscFunctionReturn(0);
2460 }
2461 
MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A,MatType mtype,MatReuse reuse,Mat * newmat)2462 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
2463 {
2464   PetscErrorCode   ierr;
2465   cusparseStatus_t stat;
2466   Mat              B;
2467 
2468   PetscFunctionBegin;
2469   if (reuse == MAT_INITIAL_MATRIX) {
2470     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
2471   } else if (reuse == MAT_REUSE_MATRIX) {
2472     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
2473   }
2474   B = *newmat;
2475 
2476   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
2477   ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);
2478 
2479   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
2480     if (B->factortype == MAT_FACTOR_NONE) {
2481       Mat_SeqAIJCUSPARSE *spptr;
2482 
2483       ierr = PetscNew(&spptr);CHKERRQ(ierr);
2484       spptr->format = MAT_CUSPARSE_CSR;
2485       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
2486       B->spptr = spptr;
2487       spptr->deviceMat = NULL;
2488     } else {
2489       Mat_SeqAIJCUSPARSETriFactors *spptr;
2490 
2491       ierr = PetscNew(&spptr);CHKERRQ(ierr);
2492       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
2493       B->spptr = spptr;
2494     }
2495     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2496   }
2497   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
2498   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
2499   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
2500   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
2501   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
2502   B->ops->zeroentries    = MatZeroEntries_SeqAIJCUSPARSE;
2503 
2504   ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
2505   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2506   ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
2507   PetscFunctionReturn(0);
2508 }
2509 
MatCreate_SeqAIJCUSPARSE(Mat B)2510 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
2511 {
2512   PetscErrorCode ierr;
2513 
2514   PetscFunctionBegin;
2515   ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
2516   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
2517   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
2518   ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
2519   ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
2520   ierr = PetscOptionsEnd();CHKERRQ(ierr);
2521   PetscFunctionReturn(0);
2522 }
2523 
2524 /*MC
2525    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2526 
2527    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
2528    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
2529    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
2530 
2531    Options Database Keys:
2532 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2533 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2534 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2535 
2536   Level: beginner
2537 
2538 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2539 M*/
2540 
2541 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
2542 
2543 
MatSolverTypeRegister_CUSPARSE(void)2544 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
2545 {
2546   PetscErrorCode ierr;
2547 
2548   PetscFunctionBegin;
2549   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
2550   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
2551   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
2552   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
2553   PetscFunctionReturn(0);
2554 }
2555 
MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE ** cusparsestruct)2556 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
2557 {
2558   PetscErrorCode   ierr;
2559   cusparseStatus_t stat;
2560   cusparseHandle_t handle;
2561 
2562   PetscFunctionBegin;
2563   if (*cusparsestruct) {
2564     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
2565     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
2566     delete (*cusparsestruct)->workVector;
2567     delete (*cusparsestruct)->rowoffsets_gpu;
2568     if (handle = (*cusparsestruct)->handle) {stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);}
2569    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2570     cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
2571    #endif
2572     ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
2573   }
2574   PetscFunctionReturn(0);
2575 }
2576 
CsrMatrix_Destroy(CsrMatrix ** mat)2577 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
2578 {
2579   PetscFunctionBegin;
2580   if (*mat) {
2581     delete (*mat)->values;
2582     delete (*mat)->column_indices;
2583     delete (*mat)->row_offsets;
2584     delete *mat;
2585     *mat = 0;
2586   }
2587   PetscFunctionReturn(0);
2588 }
2589 
MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct ** trifactor)2590 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
2591 {
2592   cusparseStatus_t stat;
2593   PetscErrorCode   ierr;
2594 
2595   PetscFunctionBegin;
2596   if (*trifactor) {
2597     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
2598     if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
2599     ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
2600    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2601     cudaError_t cerr;
2602     if ((*trifactor)->solveBuffer)   {cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
2603     if ((*trifactor)->csr2cscBuffer) {cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
2604    #endif
2605     delete *trifactor;
2606     *trifactor = 0;
2607   }
2608   PetscFunctionReturn(0);
2609 }
2610 
MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct ** matstruct,MatCUSPARSEStorageFormat format)2611 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
2612 {
2613   CsrMatrix        *mat;
2614   cusparseStatus_t stat;
2615   cudaError_t      err;
2616 
2617   PetscFunctionBegin;
2618   if (*matstruct) {
2619     if ((*matstruct)->mat) {
2620       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
2621        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2622         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2623        #else
2624         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
2625         stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
2626        #endif
2627       } else {
2628         mat = (CsrMatrix*)(*matstruct)->mat;
2629         CsrMatrix_Destroy(&mat);
2630       }
2631     }
2632     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
2633     delete (*matstruct)->cprowIndices;
2634     if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
2635     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
2636     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
2637 
2638    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2639     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
2640     if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
2641     for (int i=0; i<3; i++) {
2642       if (mdata->cuSpMV[i].initialized) {
2643         err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
2644         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
2645         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
2646       }
2647     }
2648    #endif
2649     delete *matstruct;
2650     *matstruct = 0;
2651   }
2652   PetscFunctionReturn(0);
2653 }
2654 
MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors ** trifactors)2655 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
2656 {
2657   PetscErrorCode ierr;
2658 
2659   PetscFunctionBegin;
2660   if (*trifactors) {
2661     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
2662     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
2663     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
2664     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
2665     delete (*trifactors)->rpermIndices;
2666     delete (*trifactors)->cpermIndices;
2667     delete (*trifactors)->workVector;
2668     (*trifactors)->rpermIndices = 0;
2669     (*trifactors)->cpermIndices = 0;
2670     (*trifactors)->workVector = 0;
2671   }
2672   PetscFunctionReturn(0);
2673 }
2674 
MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors ** trifactors)2675 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
2676 {
2677   PetscErrorCode   ierr;
2678   cusparseHandle_t handle;
2679   cusparseStatus_t stat;
2680 
2681   PetscFunctionBegin;
2682   if (*trifactors) {
2683     ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
2684     if (handle = (*trifactors)->handle) {
2685       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
2686     }
2687     ierr = PetscFree(*trifactors);CHKERRQ(ierr);
2688   }
2689   PetscFunctionReturn(0);
2690 }
2691