/*
 * -----------------------------------------------------------------
 * Programmer(s): Cody J. Balos @ LLNL
 * -----------------------------------------------------------------
 * SUNDIALS Copyright Start
 * Copyright (c) 2002-2021, Lawrence Livermore National Security
 * and Southern Methodist University.
 * All rights reserved.
 *
 * See the top-level LICENSE and NOTICE files for details.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 * SUNDIALS Copyright End
 * -----------------------------------------------------------------
 * This is the header file for the cuSPARSE implementation of the
 * SUNMATRIX module.
 * -----------------------------------------------------------------
 */


#ifndef _SUNCUSPARSE_MATRIX_KERNELS_CUH_
#define _SUNCUSPARSE_MATRIX_KERNELS_CUH_

#include <cuda_runtime.h>
#include <sunmatrix/sunmatrix_cusparse.h>

namespace sundials
{
namespace sunmatrix_cusparse
{

template <typename T, typename I>
__global__ void
scaleAddIKernelCSR(I m, T c, T* A, const I* rowptr, const I* colind)
{
  // REQUIRES THE DIAGONAL TO BE PRESENT!

  // Each thread loops over one row of the matrix so memory accesses by a thread are stride-1.
  // If there aren't enough threads to cover all rows, then some threads will be reused for
  // more than one row.
  for (I row = blockIdx.x*blockDim.x + threadIdx.x;
       row < m;
       row += blockDim.x * gridDim.x)
  {
    I idx    = rowptr[row];         // index of the first nonzero in this row
    I rownnz = rowptr[row+1] - idx; // number of nonzeros in this row
    for (I j = 0; j < rownnz; j++)
    {
      if (colind[idx+j] == row) A[idx+j] = c*A[idx+j] + 1.0;
      else                      A[idx+j] = c*A[idx+j];
    }
  }
}

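/*
 * Illustrative usage sketch (a hypothetical host-side launch, not part of the
 * SUNDIALS API): computes A <- c*A + I on a CSR matrix stored in device
 * memory. The variables d_data, d_rowptr, d_colind, nrows, and c are
 * placeholders supplied by the caller; realtype is the SUNDIALS
 * floating-point type.
 *
 *   unsigned threadsPerBlock = 256;
 *   unsigned numBlocks = (nrows + threadsPerBlock - 1) / threadsPerBlock;
 *   sundials::sunmatrix_cusparse::scaleAddIKernelCSR<realtype, int>
 *     <<<numBlocks, threadsPerBlock>>>(nrows, c, d_data, d_rowptr, d_colind);
 *
 * If the grid does not cover every row, the grid-stride loop in the kernel
 * reuses threads until all rows are processed, so any positive launch size
 * is safe.
 */
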
template <typename T, typename I>
__global__ void
scaleAddIKernelBCSR(I m, I nblocks, I blocknnz, T c, T* A, const I* rowptr, const I* colind)
{
  // REQUIRES THE DIAGONAL TO BE PRESENT!

  // Ideally each thread block will be in charge of one block of the matrix.
  for (I block = blockIdx.x;
       block < nblocks;
       block += gridDim.x)
  {
    // Each thread loops over one row of the block so memory accesses by a thread are stride-1.
    // If there aren't enough threads to cover all rows, then some threads will be reused for
    // more than one row.
    for (I row = threadIdx.x;
         row < m;
         row += blockDim.x)
    {
      I idxl   = rowptr[row];           // local (to this block) starting nonzero index
      I idxg   = block*blocknnz + idxl; // global (overall matrix) starting nonzero index
      I rownnz = rowptr[row+1] - idxl;  // number of nonzeros in this row
      for (I j = 0; j < rownnz; j++)
      {
        if (colind[idxl+j] == row) A[idxg+j] = c*A[idxg+j] + 1.0;
        else                       A[idxg+j] = c*A[idxg+j];
      }
    }
  }
}

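/*
 * Illustrative usage sketch (a hypothetical host-side launch, not part of the
 * SUNDIALS API): for the BCSR variant the natural mapping is one CUDA thread
 * block per matrix block, with the threads of that block striding over the
 * block's rows. The variables d_data, d_rowptr, d_colind, nblocks, blockrows,
 * blocknnz, and c are placeholders supplied by the caller.
 *
 *   unsigned threadsPerBlock = 256; // threads stride over a block's rows
 *   unsigned numBlocks = nblocks;   // ideally one CUDA block per matrix block
 *   sundials::sunmatrix_cusparse::scaleAddIKernelBCSR<realtype, int>
 *     <<<numBlocks, threadsPerBlock>>>(blockrows, nblocks, blocknnz, c,
 *                                      d_data, d_rowptr, d_colind);
 */
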
template <typename T, typename I>
__global__ void
scaleAddKernelCSR(I nnz, T c, T* A, const T* B)
{
  // REQUIRES A AND B TO HAVE THE SAME SPARSITY PATTERN
  for (I i = blockIdx.x * blockDim.x + threadIdx.x;
       i < nnz;
       i += blockDim.x * gridDim.x)
  {
    A[i] = c*A[i] + B[i];
  }
}

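/*
 * Illustrative usage sketch (a hypothetical host-side launch, not part of the
 * SUNDIALS API): since A and B share one sparsity pattern, A <- c*A + B is a
 * flat elementwise update of the nnz stored values. The variables d_Adata,
 * d_Bdata, nnz, and c are placeholders supplied by the caller.
 *
 *   unsigned threadsPerBlock = 256;
 *   unsigned numBlocks = (nnz + threadsPerBlock - 1) / threadsPerBlock;
 *   sundials::sunmatrix_cusparse::scaleAddKernelCSR<realtype, int>
 *     <<<numBlocks, threadsPerBlock>>>(nnz, c, d_Adata, d_Bdata);
 */
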
template <typename T, typename I>
__global__ void
matvecBCSR(I m, I nblocks, I blocknnz, const T* A, const I* rowptr, const I* colind, const T* x, T* y)
{
  // Ideally each thread block will be in charge of one block of the matrix.
  for (I block = blockIdx.x;
       block < nblocks;
       block += gridDim.x)
  {
    // Each thread loops over one row of the block so memory accesses by a thread are stride-1.
    // If there aren't enough threads to cover all rows, then some threads will be reused for
    // more than one row.
    for (I row = threadIdx.x;
         row < m;
         row += blockDim.x)
    {
      I idxl   = rowptr[row];           // local (to this block) starting nonzero index
      I idxg   = block*blocknnz + idxl; // global (overall matrix) starting nonzero index
      I rowg   = block*m + row;         // global (overall matrix) row
      I colg   = block*m;               // global (overall matrix) starting column
      I rownnz = rowptr[row+1] - idxl;  // number of nonzeros in this row

      // Accumulate the row's dot product in a register, then write it out once.
      // Every entry of y (length nblocks*m) is owned by exactly one thread, so
      // no prior zeroing of y and no synchronization across thread blocks is
      // needed; __syncthreads() could not provide the latter anyway, since it
      // only synchronizes threads within a single block.
      T sum = 0.0;
      for (I j = 0; j < rownnz; j++)
      {
        sum += A[idxg+j] * x[ colg+colind[idxl+j] ];
      }
      y[rowg] = sum;
    }
  }
}

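/*
 * Illustrative usage sketch (a hypothetical host-side launch, not part of the
 * SUNDIALS API): computes y = A*x for a block-diagonal matrix with nblocks
 * identically structured blocks of size blockrows x blockrows, where x and y
 * are device vectors of length nblocks*blockrows. All variable names are
 * placeholders supplied by the caller.
 *
 *   unsigned threadsPerBlock = 256;
 *   unsigned numBlocks = nblocks;  // ideally one CUDA block per matrix block
 *   sundials::sunmatrix_cusparse::matvecBCSR<realtype, int>
 *     <<<numBlocks, threadsPerBlock>>>(blockrows, nblocks, blocknnz,
 *                                      d_data, d_rowptr, d_colind, d_x, d_y);
 */
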
// kernels for debugging
#ifdef SUNDIALS_DEBUG

template <typename T, typename I>
__global__ void
print_kernel(I m, I nnz, I blocknnz, T* A, const I* rowptr, const I* colind)
{
  // Casts in the printf calls below keep the format specifiers valid for any
  // integer type I and floating-point type T the kernel is instantiated with.
  for (I i = blockIdx.x * blockDim.x + threadIdx.x;
       i < nnz;
       i += blockDim.x * gridDim.x)
  {
    printf("A[%lld] = %f\n", (long long) i, (double) A[i]);
  }
  for (I i = blockIdx.x * blockDim.x + threadIdx.x;
       i < m+1;
       i += blockDim.x * gridDim.x)
  {
    printf("rowptr[%lld] = %lld\n", (long long) i, (long long) rowptr[i]);
  }
  for (I i = blockIdx.x * blockDim.x + threadIdx.x;
       i < blocknnz;
       i += blockDim.x * gridDim.x)
  {
    printf("colind[%lld] = %lld\n", (long long) i, (long long) colind[i]);
  }
}

#endif // SUNDIALS_DEBUG

} // namespace sunmatrix_cusparse
} // namespace sundials

#endif // _SUNCUSPARSE_MATRIX_KERNELS_CUH_