/*
    Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format, implemented using the ViennaCL library.
*/

#include <petscconf.h>
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <petscbt.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>

#include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h>

#include <algorithm>
#include <vector>
#include <string>

#include "viennacl/linalg/prod.hpp"

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat);
PETSC_INTERN PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

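/* Copies the host CSR data of a SeqAIJ matrix to the GPU, converting the PetscInt row/column
   indices to the unsigned int indices ViennaCL expects. Does nothing if the GPU copy is already
   up to date or the matrix is empty. */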
PetscErrorCode MatViennaCLCopyToGPU(Mat A)
{
  Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
  Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) { // some OpenCL SDKs have issues with buffers of size 0
    if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
      ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);

      try {
        if (a->compressedrow.use) {
          if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix();

          // Since PetscInt is in general different from cl_uint, we have to convert:
          viennacl::backend::mem_handle dummy;

          viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, a->compressedrow.nrows+1);
          for (PetscInt i=0; i<=a->compressedrow.nrows; ++i)
            row_buffer.set(i, (a->compressedrow.i)[i]);

          viennacl::backend::typesafe_host_array<unsigned int> row_indices; row_indices.raw_resize(dummy, a->compressedrow.nrows);
          for (PetscInt i=0; i<a->compressedrow.nrows; ++i)
            row_indices.set(i, (a->compressedrow.rindex)[i]);

          viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
          for (PetscInt i=0; i<a->nz; ++i)
            col_buffer.set(i, (a->j)[i]);

          viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz);
          ierr = PetscLogCpuToGpu(((2*a->compressedrow.nrows)+1+a->nz)*sizeof(PetscInt) + (a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
        } else {
          if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix();

          // Since PetscInt is in general different from cl_uint, we have to convert:
          viennacl::backend::mem_handle dummy;

          viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1);
          for (PetscInt i=0; i<=A->rmap->n; ++i)
            row_buffer.set(i, (a->i)[i]);

          viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
          for (PetscInt i=0; i<a->nz; ++i)
            col_buffer.set(i, (a->j)[i]);

          viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
          ierr = PetscLogCpuToGpu(((A->rmap->n+1)+a->nz)*sizeof(PetscInt)+(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
        }
        ViennaCLWaitForGPU();
      } catch(std::exception const & ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
      }

      // Create temporary vector used to stage A*x in z = A*x + y products:
      if (viennaclstruct->tempvec) {
        if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(A->rmap->n)) {
          delete (ViennaCLVector*)viennaclstruct->tempvec;
          viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
        } else {
          viennaclstruct->tempvec->clear();
        }
      } else {
        viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
      }

      A->offloadmask = PETSC_OFFLOAD_BOTH;

      ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

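/* Copies matrix data from the GPU back to the host. If Agpu is supplied for an unallocated host
   matrix, the full CSR structure is rebuilt on the CPU; otherwise only the numerical values are
   read back. */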
PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu)
{
  Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           m  = A->rmap->n;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_BOTH) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED && Agpu) {
    try {
      if (a->compressedrow.use) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
      else {
        if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m);
        a->nz           = Agpu->nnz();
        a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
        A->preallocated = PETSC_TRUE;
        if (a->singlemalloc) {
          if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
        } else {
          if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
          if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
          if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
        }
        ierr = PetscMalloc3(a->nz,&a->a,a->nz,&a->j,m+1,&a->i);CHKERRQ(ierr);
        ierr = PetscLogObjectMemory((PetscObject)A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);

        a->singlemalloc = PETSC_TRUE;

        /* Setup row lengths */
        ierr = PetscFree(a->imax);CHKERRQ(ierr);
        ierr = PetscFree(a->ilen);CHKERRQ(ierr);
        ierr = PetscMalloc1(m,&a->imax);CHKERRQ(ierr);
        ierr = PetscMalloc1(m,&a->ilen);CHKERRQ(ierr);
        ierr = PetscLogObjectMemory((PetscObject)A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);

        /* Copy data back from GPU */
        viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);

        // copy row array
        viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
        (a->i)[0] = row_buffer[0];
        for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
          (a->i)[i+1] = row_buffer[i+1];
          a->imax[i]  = a->ilen[i] = a->i[i+1] - a->i[i];  // Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
        }

        // copy column indices
        viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
        viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
        for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i)
          (a->j)[i] = col_buffer[i];

        // copy nonzero entries directly to destination (no conversion required)
        viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);

        ierr = PetscLogGpuToCpu(row_buffer.raw_size()+col_buffer.raw_size()+(Agpu->nnz()*sizeof(PetscScalar)));CHKERRQ(ierr);
        ViennaCLWaitForGPU();
        /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */
      }
    } catch(std::exception const & ex) {
      SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
    }
  } else if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
    PetscFunctionReturn(0);
  } else {
    if (!Agpu && A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);

    if (a->compressedrow.use) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
    if (!Agpu) {
      viennacl::backend::memory_read(viennaclstruct->mat->handle(), 0, sizeof(PetscScalar)*viennaclstruct->mat->nnz(), a->a);
    } else {
      viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);
    }
  }
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* This assembly prevents resetting the flag to PETSC_OFFLOAD_CPU and recopying */
  ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

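/* Computes yy = A*xx on the GPU via viennacl::linalg::prod(); for an empty matrix, yy is simply zeroed. */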
PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode       ierr;
  Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
  const ViennaCLVector *xgpu=NULL;
  ViennaCLVector       *ygpu=NULL;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
  if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
    ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
    ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    try {
      if (a->compressedrow.use) {
        *ygpu = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
      } else {
        *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu);
      }
      ViennaCLWaitForGPU();
    } catch (std::exception const & ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
    ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
    ierr = PetscLogGpuFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
  } else {
    ierr = VecSet_SeqViennaCL(yy,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

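/* Computes zz = A*xx + yy on the GPU, staging the product A*xx in the cached tempvec. */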
PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode       ierr;
  Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
  const ViennaCLVector *xgpu=NULL,*ygpu=NULL;
  ViennaCLVector       *zgpu=NULL;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
  if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
    try {
      ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
      ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr);
      ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      if (a->compressedrow.use) *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
      else *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
      if (zz != yy) *zgpu = *ygpu + *viennaclstruct->tempvec;
      else *zgpu += *viennaclstruct->tempvec;
      ViennaCLWaitForGPU();
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
      ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr);
      ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr);
    } catch(std::exception const & ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
    }
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = VecCopy_SeqViennaCL(yy,zz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

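/* Completes SeqAIJ assembly; after a final assembly, pushes the data to the GPU unless the matrix is bound to the CPU. */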
PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  if (!A->boundtocpu) {
    ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@C
   MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to GPUs and use the ViennaCL library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased substantially.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If nnz is given then nz is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

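   Example usage, as a minimal sketch (error checking omitted; m, n, and the inserted value are placeholders):
.vb
   Mat A;
   MatCreateSeqAIJViennaCL(PETSC_COMM_SELF,m,n,5,NULL,&A);
   MatSetValue(A,0,0,1.0,INSERT_VALUES);    /* ... insert further entries ... */
   MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);  /* data is pushed to the GPU at the end of final assembly */
   MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
.ve
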
   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSPARSE(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays()

@*/
PetscErrorCode MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
{
  PetscErrorCode ierr;
  Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr;

  PetscFunctionBegin;
  try {
    if (viennaclcontainer) {
      delete viennaclcontainer->tempvec;
      delete viennaclcontainer->mat;
      delete viennaclcontainer->compressed_mat;
      delete viennaclcontainer;
    }
    A->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  } catch(std::exception const & ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
  }

  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaij_seqaijviennacl_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijviennacl_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijviennacl_seqaij_C",NULL);CHKERRQ(ierr);

  /* This next line is needed because MatDestroy() tries to PetscFree() spptr if it is not zero, and PetscFree()
     only works on memory allocated with PetscNew() or PetscMalloc(), which do not call the constructor */
  A->spptr = NULL;
  ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJViennaCL(B,MATSEQAIJVIENNACL,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatBindToCPU_SeqAIJViennaCL(Mat,PetscBool);
static PetscErrorCode MatDuplicate_SeqAIJViennaCL(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;
  Mat            C;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C = *B;

  ierr = MatBindToCPU_SeqAIJViennaCL(A,PETSC_FALSE);CHKERRQ(ierr);
  C->ops->bindtocpu = MatBindToCPU_SeqAIJViennaCL;

  C->spptr = new Mat_SeqAIJViennaCL();
  ((Mat_SeqAIJViennaCL*)C->spptr)->tempvec        = NULL;
  ((Mat_SeqAIJViennaCL*)C->spptr)->mat            = NULL;
  ((Mat_SeqAIJViennaCL*)C->spptr)->compressed_mat = NULL;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJVIENNACL);CHKERRQ(ierr);

  C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  /* If the source matrix is already assembled, copy the destination matrix to the GPU */
  if (C->assembled) {
    ierr = MatViennaCLCopyToGPU(C);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJGetArray_SeqAIJViennaCL(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatViennaCLCopyFromGPU(A,(const ViennaCLAIJMatrix *)NULL);CHKERRQ(ierr);
  *array = a->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJViennaCL(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}

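/* Switches the matrix between the CPU (SeqAIJ) and GPU (ViennaCL) implementations by swapping the
   relevant function pointers; binding to the CPU first makes sure the host copy is up to date. */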
static PetscErrorCode MatBindToCPU_SeqAIJViennaCL(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  A->boundtocpu  = flg;
  aij->inode.use = flg;
  if (flg) {
    /* make sure we have an up-to-date copy on the CPU */
    ierr = MatViennaCLCopyFromGPU(A,(const ViennaCLAIJMatrix *)NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJRestoreArray_C",MatSeqAIJRestoreArray_SeqAIJ);CHKERRQ(ierr);

    A->ops->mult        = MatMult_SeqAIJ;
    A->ops->multadd     = MatMultAdd_SeqAIJ;
    A->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
    A->ops->duplicate   = MatDuplicate_SeqAIJ;
  } else {
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJViennaCL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJRestoreArray_C",MatSeqAIJRestoreArray_SeqAIJViennaCL);CHKERRQ(ierr);

    A->ops->mult        = MatMult_SeqAIJViennaCL;
    A->ops->multadd     = MatMultAdd_SeqAIJViennaCL;
    A->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL;
    A->ops->destroy     = MatDestroy_SeqAIJViennaCL;
    A->ops->duplicate   = MatDuplicate_SeqAIJViennaCL;
  }
  PetscFunctionReturn(0);
}

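/* Converts a SeqAIJ matrix to the SeqAIJViennaCL type in place (or into a fresh duplicate for
   MAT_INITIAL_MATRIX), attaching the ViennaCL data structures and the GPU-enabled operations. */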
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
{
  PetscErrorCode ierr;
  Mat            B;

  PetscFunctionBegin;
  if (reuse == MAT_REUSE_MATRIX) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MAT_REUSE_MATRIX is not supported. Consider using MAT_INPLACE_MATRIX instead");

  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  }

  B = *newmat;

  B->spptr = new Mat_SeqAIJViennaCL();

  ((Mat_SeqAIJViennaCL*)B->spptr)->tempvec        = NULL;
  ((Mat_SeqAIJViennaCL*)B->spptr)->mat            = NULL;
  ((Mat_SeqAIJViennaCL*)B->spptr)->compressed_mat = NULL;

  ierr = MatBindToCPU_SeqAIJViennaCL(A,PETSC_FALSE);CHKERRQ(ierr);
  A->ops->bindtocpu = MatBindToCPU_SeqAIJViennaCL;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr);
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECVIENNACL,&B->defaultvectype);CHKERRQ(ierr);

  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaij_seqaijviennacl_C",MatConvert_SeqAIJ_SeqAIJViennaCL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijviennacl_seqdense_C",MatProductSetFromOptions_SeqAIJ_SeqDense);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijviennacl_seqaij_C",MatProductSetFromOptions_SeqAIJ);CHKERRQ(ierr);

  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  /* If the source matrix is already assembled, copy the destination matrix to the GPU */
  if (B->assembled) {
    ierr = MatViennaCLCopyToGPU(B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on GPUs. These matrices are in CSR format by
   default. All matrix calculations are performed using the ViennaCL library.

   Options Database Keys:
+  -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions()
.  -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
-  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().

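   Example (a sketch; "./app" stands for any PETSc-based program, and pairing with ViennaCL vectors is assumed):
.vb
   ./app -mat_type aijviennacl -vec_type viennacl
.ve
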
  Level: beginner

.seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL()
M*/

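/* Registers the LU, Cholesky, ILU, and ICC factorizations for MATSEQAIJVIENNACL; these fall back to
   the native PETSc (CPU) SeqAIJ factorization routines. */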
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_ViennaCL(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_LU,MatGetFactor_seqaij_petsc);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_CHOLESKY,MatGetFactor_seqaij_petsc);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ILU,MatGetFactor_seqaij_petsc);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ICC,MatGetFactor_seqaij_petsc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}