/*
 * -----------------------------------------------------------------
 * Programmer(s): Cody J. Balos @ LLNL
 * -----------------------------------------------------------------
 * SUNDIALS Copyright Start
 * Copyright (c) 2002-2021, Lawrence Livermore National Security
 * and Southern Methodist University.
 * All rights reserved.
 *
 * See the top-level LICENSE and NOTICE files for details.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 * SUNDIALS Copyright End
 * -----------------------------------------------------------------
 * This is the header file for the CUDA kernels of the cuSPARSE
 * implementation of the SUNMATRIX module.
 * -----------------------------------------------------------------
 */


#ifndef _SUNCUSPARSE_MATRIX_KERNELS_CUH_
#define _SUNCUSPARSE_MATRIX_KERNELS_CUH_

#include <cuda_runtime.h>
#include <sunmatrix/sunmatrix_cusparse.h>

namespace sundials
{
namespace sunmatrix_cusparse
{

template <typename T, typename I>
__global__ void
scaleAddIKernelCSR(I m, T c, T* A, const I* rowptr, const I* colind)
{
    // REQUIRES THE DIAGONAL TO BE PRESENT!

    // Each thread loops over one row of the matrix so memory accesses by a thread are stride-1.
    // If there aren't enough threads to cover all rows, then some threads will be reused for
    // more than one row.
    for (I row = blockIdx.x*blockDim.x + threadIdx.x;
         row < m;
         row += blockDim.x * gridDim.x)
    {
        I tmp = rowptr[row];
        I rownnz = rowptr[row+1] - tmp;
        I idx = tmp;
        for (I j = 0; j < rownnz; j++)
        {
            if (colind[idx+j] == row) A[idx+j] = c*A[idx+j] + 1.0;
            else                      A[idx+j] = c*A[idx+j];
        }
    }
}
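
// A minimal launch sketch for scaleAddIKernelCSR. This helper and its launch
// parameters are illustrative assumptions, not part of the SUNDIALS API: one
// thread is assigned per row, and the grid-stride loop in the kernel absorbs
// any shortfall if the grid is capped smaller than m.
template <typename T, typename I>
inline void exampleLaunchScaleAddICSR(I m, T c, T* d_data, const I* d_rowptr,
                                      const I* d_colind, cudaStream_t stream = 0)
{
    const unsigned threads = 256;                                  // assumed block size
    const unsigned blocks  = (unsigned) ((m + threads - 1) / threads);
    scaleAddIKernelCSR<T,I><<<blocks, threads, 0, stream>>>(m, c, d_data,
                                                            d_rowptr, d_colind);
}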

template <typename T, typename I>
__global__ void
scaleAddIKernelBCSR(I m, I nblocks, I blocknnz, T c, T* A, const I* rowptr, const I* colind)
{
    // REQUIRES THE DIAGONAL TO BE PRESENT!

    // Ideally each thread block will be in charge of one block of the matrix.
    for (I block = blockIdx.x;
         block < nblocks;
         block += gridDim.x)
    {
        // Each thread loops over one row of the matrix so memory accesses by a thread are stride-1.
        // If there aren't enough threads to cover all rows, then some threads will be reused for
        // more than one row.
        for (I row = threadIdx.x;
             row < m;
             row += blockDim.x)
        {
            I tmp = rowptr[row];
            I rownnz = rowptr[row+1] - tmp;
            I idxl = tmp;
            I idxg = block*blocknnz + tmp;
            for (I j = 0; j < rownnz; j++)
            {
                if (colind[idxl+j] == row) A[idxg+j] = c*A[idxg+j] + 1.0;
                else                       A[idxg+j] = c*A[idxg+j];
            }
        }
    }
}
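
// A companion launch sketch for scaleAddIKernelBCSR, with the same caveats:
// the helper name and launch parameters are assumptions. One CUDA thread
// block is assigned per matrix block to match the gridDim.x stride in the
// kernel, with the thread count capped at the number of rows per block.
template <typename T, typename I>
inline void exampleLaunchScaleAddIBCSR(I m, I nblocks, I blocknnz, T c, T* d_data,
                                       const I* d_rowptr, const I* d_colind,
                                       cudaStream_t stream = 0)
{
    const unsigned threads = (unsigned) (m < 256 ? m : 256);
    const unsigned blocks  = (unsigned) nblocks;
    scaleAddIKernelBCSR<T,I><<<blocks, threads, 0, stream>>>(m, nblocks, blocknnz, c,
                                                             d_data, d_rowptr, d_colind);
}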

template <typename T, typename I>
__global__ void
scaleAddKernelCSR(I nnz, T c, T* A, const T* B)
{
    // REQUIRES A AND B TO HAVE THE SAME SPARSITY PATTERN
    for (I i = blockIdx.x * blockDim.x + threadIdx.x;
         i < nnz;
         i += blockDim.x * gridDim.x)
    {
        A[i] = c*A[i] + B[i];
    }
}
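
// A launch sketch for scaleAddKernelCSR (the helper name and launch
// parameters are assumptions). Because A and B must share one sparsity
// pattern, the update is a flat grid-stride pass over the nnz stored values.
template <typename T, typename I>
inline void exampleLaunchScaleAddCSR(I nnz, T c, T* d_A, const T* d_B,
                                     cudaStream_t stream = 0)
{
    const unsigned threads = 256;
    const unsigned blocks  = (unsigned) ((nnz + threads - 1) / threads);
    scaleAddKernelCSR<T,I><<<blocks, threads, 0, stream>>>(nnz, c, d_A, d_B);
}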

template <typename T, typename I>
__global__ void
matvecBCSR(I m, I nblocks, I blocknnz, const T* A, const I* rowptr, const I* colind, const T* x, T* y)
{
    // Ideally each thread block will be in charge of one block of the matrix.
    for (I block = blockIdx.x;
         block < nblocks;
         block += gridDim.x)
    {
        // Each thread loops over one row of the matrix so memory accesses by a thread are stride-1.
        // If there aren't enough threads to cover all rows, then some threads will be reused for
        // more than one row.
        for (I row = threadIdx.x;
             row < m;
             row += blockDim.x)
        {
            I tmp = rowptr[row];
            I rownnz = rowptr[row+1] - tmp; // number of nnz in this row
            I idxl = tmp;                   // local (to this block) starting nonzero index
            I idxg = block*blocknnz + tmp;  // global (overall matrix) starting nonzero index
            I rowg = block*m + row;         // global (overall matrix) row
            I colg = block*m;               // global (overall matrix) starting column

            // Accumulate the dot product for this row in a register and store it
            // once. Zeroing y in a separate grid-wide pass is not safe here:
            // __syncthreads() only synchronizes threads within a single thread
            // block, so one thread block could zero entries of y while another
            // was still accumulating into them. Register accumulation also avoids
            // sizing the zeroing pass by nnz when y only has nblocks*m entries.
            T sum = 0.0;
            for (I j = 0; j < rownnz; j++)
            {
                sum += A[idxg+j] * x[ colg + colind[idxl+j] ];
            }
            y[rowg] = sum;
        }
    }
}
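
// A launch sketch for matvecBCSR (the helper name and launch parameters are
// assumptions). Block k of the matrix owns rows [k*m, (k+1)*m) of y and
// values [k*blocknnz, (k+1)*blocknnz) of A, which is exactly the rowg/idxg
// arithmetic above; e.g. with m = 2 and blocknnz = 3, local nonzero index
// idxl = 1 in block 1 lives at global value index idxg = 1*3 + 1 = 4.
template <typename T, typename I>
inline void exampleLaunchMatvecBCSR(I m, I nblocks, I blocknnz, const T* d_data,
                                    const I* d_rowptr, const I* d_colind,
                                    const T* d_x, T* d_y, cudaStream_t stream = 0)
{
    const unsigned threads = (unsigned) (m < 256 ? m : 256);
    const unsigned blocks  = (unsigned) nblocks;
    matvecBCSR<T,I><<<blocks, threads, 0, stream>>>(m, nblocks, blocknnz, d_data,
                                                    d_rowptr, d_colind, d_x, d_y);
}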

// kernels for debugging
#ifdef SUNDIALS_DEBUG

template <typename T, typename I>
__global__ void
print_kernel(I m, I nnz, I blocknnz, T* A, const I* rowptr, const I* colind)
{
    // Casts keep the printf format specifiers valid for any instantiation of
    // the template parameters T and I.
    for (I i = blockIdx.x * blockDim.x + threadIdx.x;
         i < nnz;
         i += blockDim.x * gridDim.x)
    {
        printf("A[%ld] = %f\n", (long) i, (double) A[i]);
    }
    for (I i = blockIdx.x * blockDim.x + threadIdx.x;
         i < m+1;
         i += blockDim.x * gridDim.x)
    {
        printf("rowptr[%ld] = %ld\n", (long) i, (long) rowptr[i]);
    }
    for (I i = blockIdx.x * blockDim.x + threadIdx.x;
         i < blocknnz;
         i += blockDim.x * gridDim.x)
    {
        printf("colind[%ld] = %ld\n", (long) i, (long) colind[i]);
    }
}
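
// A debugging sketch (the helper name is an assumption): launching
// print_kernel with a single thread keeps the printf output ordered, and the
// synchronize flushes the device printf buffer before the host continues.
template <typename T, typename I>
inline void examplePrintMatrix(I m, I nnz, I blocknnz, T* d_data,
                               const I* d_rowptr, const I* d_colind)
{
    print_kernel<T,I><<<1, 1>>>(m, nnz, blocknnz, d_data, d_rowptr, d_colind);
    cudaDeviceSynchronize();
}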

#endif /* SUNDIALS_DEBUG */

} // namespace sunmatrix_cusparse
} // namespace sundials

#endif /* _SUNCUSPARSE_MATRIX_KERNELS_CUH_ */