#include "gpu.h"

void SCS(accum_by_atrans_gpu)(const ScsGpuMatrix *Ag,
                              const cusparseDnVecDescr_t x,
                              cusparseDnVecDescr_t y,
                              cusparseHandle_t cusparse_handle,
                              size_t *buffer_size, void **buffer) {
  /* y += A'*x
     x and y MUST be on GPU already.
     A is stored in CSC form, so its arrays read as CSR describe A';
     NON_TRANSPOSE on the descriptor therefore computes A'*x.
  */
  const scs_float onef = 1.0;
  size_t new_buffer_size = 0;

  CUSPARSE_GEN(SpMV_bufferSize)
  (cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &onef, Ag->descr, x,
   &onef, y, SCS_CUDA_FLOAT, SCS_CSRMV_ALG, &new_buffer_size);

  /* grow the shared scratch buffer if this SpMV needs more space */
  if (new_buffer_size > *buffer_size) {
    if (*buffer != SCS_NULL) {
      cudaFree(*buffer);
    }
    cudaMalloc(buffer, new_buffer_size);
    *buffer_size = new_buffer_size;
  }

  CUSPARSE_GEN(SpMV)
  (cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &onef, Ag->descr, x,
   &onef, y, SCS_CUDA_FLOAT, SCS_CSRMV_ALG, *buffer);
}

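/* Usage sketch (illustrative only, not part of SCS): how a caller might wrap
   raw device arrays in cuSPARSE dense-vector descriptors before invoking the
   routine above. The helper name, `d_x`/`d_y`, and the m/n dimensions are
   hypothetical; buffer/buffer_size should persist across calls so the
   scratch space is reused. */
static void example_accum_atrans(const ScsGpuMatrix *Ag,
                                 cusparseHandle_t handle, scs_float *d_x,
                                 scs_float *d_y, scs_int m, scs_int n,
                                 size_t *buffer_size, void **buffer) {
  cusparseDnVecDescr_t dn_x, dn_y;
  /* wrap existing device memory; no copies are made */
  cusparseCreateDnVec(&dn_x, m, d_x, SCS_CUDA_FLOAT);
  cusparseCreateDnVec(&dn_y, n, d_y, SCS_CUDA_FLOAT);
  /* d_y += A' * d_x */
  SCS(accum_by_atrans_gpu)(Ag, dn_x, dn_y, handle, buffer_size, buffer);
  /* destroying a descriptor does not free the underlying device memory */
  cusparseDestroyDnVec(dn_x);
  cusparseDestroyDnVec(dn_y);
}
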
/* This is slow: cuSPARSE's transposed SpMV is much slower than the
 * non-transposed path, so prefer the A' routine above when possible. */
void SCS(accum_by_a_gpu)(const ScsGpuMatrix *Ag, const cusparseDnVecDescr_t x,
                         cusparseDnVecDescr_t y,
                         cusparseHandle_t cusparse_handle, size_t *buffer_size,
                         void **buffer) {
  /* y += A*x
     x and y MUST be on GPU already.
     Since the descriptor holds A' in CSR form, computing A*x requires
     CUSPARSE_OPERATION_TRANSPOSE.
   */
  const scs_float onef = 1.0;
  size_t new_buffer_size = 0;

  /* The A matrix idx pointers must be ORDERED */
  CUSPARSE_GEN(SpMV_bufferSize)
  (cusparse_handle, CUSPARSE_OPERATION_TRANSPOSE, &onef, Ag->descr, x, &onef, y,
   SCS_CUDA_FLOAT, SCS_CSRMV_ALG, &new_buffer_size);

  /* grow the shared scratch buffer if this SpMV needs more space */
  if (new_buffer_size > *buffer_size) {
    if (*buffer != SCS_NULL) {
      cudaFree(*buffer);
    }
    cudaMalloc(buffer, new_buffer_size);
    *buffer_size = new_buffer_size;
  }

  CUSPARSE_GEN(SpMV)
  (cusparse_handle, CUSPARSE_OPERATION_TRANSPOSE, &onef, Ag->descr, x, &onef, y,
   SCS_CUDA_FLOAT, SCS_CSRMV_ALG, *buffer);
}

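/* The grow-only scratch-buffer logic above is duplicated in both accumulate
   routines; a minimal sketch of how it could be factored out (hypothetical
   helper, not part of SCS): */
static void example_ensure_buffer(size_t new_size, size_t *buffer_size,
                                  void **buffer) {
  if (new_size > *buffer_size) {
    if (*buffer != SCS_NULL) {
      cudaFree(*buffer);
    }
    cudaMalloc(buffer, new_size);
    *buffer_size = new_size;
  }
}
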
/* This assumes that P has been made full (i.e. both triangles are stored,
 * not just one) and exploits the fact that general SpMV is faster on the
 * GPU than symmetric SpMV.
 */
/* y += P*x
   x and y MUST be on GPU already.
   P is symmetric, so P'*x == P*x and the fast A' routine can be reused.
 */
void SCS(accum_by_p_gpu)(const ScsGpuMatrix *Pg, const cusparseDnVecDescr_t x,
                         cusparseDnVecDescr_t y,
                         cusparseHandle_t cusparse_handle, size_t *buffer_size,
                         void **buffer) {
  SCS(accum_by_atrans_gpu)(Pg, x, y, cusparse_handle, buffer_size, buffer);
}

void SCS(free_gpu_matrix)(ScsGpuMatrix *A) {
  cudaFree(A->x);
  cudaFree(A->i);
  cudaFree(A->p);
  cusparseDestroySpMat(A->descr);
}
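
/* Construction sketch (an assumption for illustration; the real upload code
   lives elsewhere in SCS): building the ScsGpuMatrix freed above from host
   CSC arrays. Because the CSC arrays of an m x n matrix are exactly the CSR
   arrays of its n x m transpose, the descriptor is created as an n x m CSR
   matrix, which is what makes NON_TRANSPOSE compute A'*x above. The helper
   name is hypothetical and 32-bit scs_int indices are assumed. */
static void example_gpu_matrix_init(ScsGpuMatrix *A, const scs_float *h_x,
                                    const scs_int *h_i, const scs_int *h_p,
                                    scs_int m, scs_int n, scs_int nnz) {
  cudaMalloc((void **)&A->x, nnz * sizeof(scs_float));
  cudaMalloc((void **)&A->i, nnz * sizeof(scs_int));
  cudaMalloc((void **)&A->p, (n + 1) * sizeof(scs_int));
  cudaMemcpy(A->x, h_x, nnz * sizeof(scs_float), cudaMemcpyHostToDevice);
  cudaMemcpy(A->i, h_i, nnz * sizeof(scs_int), cudaMemcpyHostToDevice);
  cudaMemcpy(A->p, h_p, (n + 1) * sizeof(scs_int), cudaMemcpyHostToDevice);
  /* CSC arrays of A, registered as the CSR representation of A' (n x m) */
  cusparseCreateCsr(&A->descr, n, m, nnz, A->p, A->i, A->x,
                    CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
}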