1 /*
2 * -----------------------------------------------------------------
3 * Programmer(s): David Gardner, Cody J. Balos @ LLNL
4 * -----------------------------------------------------------------
5 * SUNDIALS Copyright Start
6 * Copyright (c) 2002-2021, Lawrence Livermore National Security
7 * and Southern Methodist University.
8 * All rights reserved.
9 *
10 * See the top-level LICENSE and NOTICE files for details.
11 *
12 * SPDX-License-Identifier: BSD-3-Clause
13 * SUNDIALS Copyright End
14 * -----------------------------------------------------------------
15 */
16
17
18 #ifndef _NVECTOR_HIP_ARRAY_KERNELS_HIP_HPP_
19 #define _NVECTOR_HIP_ARRAY_KERNELS_HIP_HPP_
20
21 #include <limits>
22 #include <hip/hip_runtime.h>
23
24 #include "sundials_hip_kernels.hip.hpp"
25
26 using namespace sundials::hip;
27
28 namespace sundials
29 {
30 namespace nvector_hip
31 {
32
33 /* -----------------------------------------------------------------
34 * The namespace for HIP kernels
35 *
36 * Reduction HIP kernels in nvector are based in part on "reduction"
37 * example in NVIDIA Corporation CUDA Samples, and parallel reduction
 * examples in textbook by J. Cheng et al. "CUDA C Programming".
39 * -----------------------------------------------------------------
40 */
41
42 /*
43 * -----------------------------------------------------------------------------
44 * fused vector operation kernels
45 * -----------------------------------------------------------------------------
46 */
47
48 /*
49 * Computes the linear combination of nv vectors
50 */
template <typename T, typename I>
__global__ void
linearCombinationKernel(int nv, T* c, T** xd, T* zd, I n)
{
  GRID_STRIDE_XLOOP(I, i, n)
  {
    // Seed the output with the first term, then fold in the remaining nv-1
    // terms in index order so the accumulation (and rounding) matches a
    // serial left-to-right sum.
    zd[i] = c[0] * xd[0][i];
    for (int v = 1; v < nv; ++v)
    {
      zd[i] += c[v] * xd[v][i];
    }
  }
}
62
63 /*
64 * Computes the scaled sum of one vector with nv other vectors
65 */
template <typename T, typename I>
__global__ void
scaleAddMultiKernel(int nv, T* c, T* xd, T** yd, T** zd, I n)
{
  GRID_STRIDE_XLOOP(I, i, n)
  {
    // For each output vector v: z_v[i] = c[v] * x[i] + y_v[i].
    for (int v = 0; v < nv; ++v)
    {
      zd[v][i] = c[v] * xd[i] + yd[v][i];
    }
  }
}
76
77
78 /*
79 * Dot product of one vector with nv other vectors.
80 *
81 */
template <typename T, typename I>
__global__ void
dotProdMultiKernel(int nv, const T* xd, T** yd, T* out, I n)
{
  // One block per vector: block k reduces the dot product of xd with yd[k].
  // REQUIRES nv blocks (i.e. gridDim.x == nv)
  const I k = blockIdx.x;

  // Strided per-thread partial sum over the vector entries.
  T partial = 0.0;
  for (I idx = threadIdx.x; idx < n; idx += blockDim.x)
  {
    partial += xd[idx] * yd[k][idx];
  }

  // Combine the per-thread partials within this block.
  partial = blockReduce<T, RSUM>(partial, 0.0);

  // Thread 0 folds this block's result into the output slot for vector k;
  // out[k] is assumed to be zeroed by the caller before launch.
  if (threadIdx.x == 0) atomicAdd(&out[k], partial);
}
100
101
102 /*
103 * -----------------------------------------------------------------------------
104 * vector array operation kernels
105 * -----------------------------------------------------------------------------
106 */
107
108
109 /*
110 * Computes the linear sum of multiple vectors
111 */
template <typename T, typename I>
__global__ void
linearSumVectorArrayKernel(int nv, T a, T** xd, T b, T** yd, T** zd, I n)
{
  GRID_STRIDE_XLOOP(I, i, n)
  {
    // For each vector v in the array: z_v[i] = a * x_v[i] + b * y_v[i].
    for (int v = 0; v < nv; ++v)
    {
      zd[v][i] = a * xd[v][i] + b * yd[v][i];
    }
  }
}
122
123
124 /*
125 * Scales multiple vectors
126 */
template <typename T, typename I>
__global__ void
scaleVectorArrayKernel(int nv, T* c, T** xd, T** zd, I n)
{
  GRID_STRIDE_XLOOP(I, i, n)
  {
    // Scale each vector by its own coefficient: z_v[i] = c[v] * x_v[i].
    for (int v = 0; v < nv; ++v)
    {
      zd[v][i] = c[v] * xd[v][i];
    }
  }
}
137
138
139 /*
140 * Sets multiple vectors equal to a constant
141 */
template <typename T, typename I>
__global__ void
constVectorArrayKernel(int nv, T c, T** zd, I n)
{
  GRID_STRIDE_XLOOP(I, i, n)
  {
    // Assign the constant to entry i of every vector in the array.
    for (int v = 0; v < nv; ++v)
    {
      zd[v][i] = c;
    }
  }
}
152
153
/*
 * Weighted squared L2 norm of nv vectors (building block for WRMS norms).
 */
template <typename T, typename I>
__global__ void
wL2NormSquareVectorArrayKernel(int nv, T** xd, T** wd, T* out, I n)
{
  // One block per vector: block k reduces vector k's weighted square sum.
  // REQUIRES nv blocks (i.e. gridDim.x == nv)
  const I k = blockIdx.x;

  // Strided per-thread partial sum of (x*w)^2 terms; the multiply order is
  // kept exactly as written to preserve rounding behavior.
  T partial = 0.0;
  for (I idx = threadIdx.x; idx < n; idx += blockDim.x)
  {
    partial += xd[k][idx] * wd[k][idx] * xd[k][idx] * wd[k][idx];
  }

  // Combine the per-thread partials within this block.
  partial = blockReduce<T, RSUM>(partial, 0.0);

  // Thread 0 folds this block's result into the output slot for vector k;
  // out[k] is assumed to be zeroed by the caller before launch.
  if (threadIdx.x == 0) atomicAdd(&out[k], partial);
}
176
177
/*
 * Masked weighted squared L2 norm of nv vectors: entries where the mask
 * id[i] <= 0 are excluded from the sums.
 */
template <typename T, typename I>
__global__ void
wL2NormSquareMaskVectorArrayKernel(int nv, T** xd, T** wd, T* id, T* out, I n)
{
  // One block per vector: block k reduces vector k's masked weighted
  // square sum.
  // REQUIRES nv blocks (i.e. gridDim.x == nv)
  const I k = blockIdx.x;

  // Strided per-thread partial sum of (x*w)^2 terms, skipping entries
  // whose mask value is not positive; the multiply order is kept exactly
  // as written to preserve rounding behavior.
  T partial = 0.0;
  for (I idx = threadIdx.x; idx < n; idx += blockDim.x)
  {
    if (id[idx] > 0.0) partial += xd[k][idx] * wd[k][idx] * xd[k][idx] * wd[k][idx];
  }

  // Combine the per-thread partials within this block.
  partial = blockReduce<T, RSUM>(partial, 0.0);

  // Thread 0 folds this block's result into the output slot for vector k;
  // out[k] is assumed to be zeroed by the caller before launch.
  if (threadIdx.x == 0) atomicAdd(&out[k], partial);
}
200
201
202 /*
203 * Computes the scaled sum of a vector array with multiple other vector arrays
204 */
template <typename T, typename I>
__global__ void
scaleAddMultiVectorArrayKernel(int nv, int ns, T* c, T** xd, T** yd, T** zd, I n)
{
  GRID_STRIDE_XLOOP(I, i, n)
  {
    // yd/zd are stored flat: the s-th sum for vector v lives at v*ns + s.
    for (int v = 0; v < nv; ++v)
    {
      for (int s = 0; s < ns; ++s)
      {
        const int flat = v * ns + s;
        zd[flat][i] = c[s] * xd[v][i] + yd[flat][i];
      }
    }
  }
}
216
217
/*
 * Computes the linear combination of multiple vector arrays
 */
template <typename T, typename I>
__global__ void
linearCombinationVectorArrayKernel(int nv, int ns, T* c, T** xd, T** zd, I n)
{
  GRID_STRIDE_XLOOP(I, i, n)
  {
    // xd is stored flat: summand s for output vector v lives at v*ns + s.
    for (int v = 0; v < nv; ++v)
    {
      // Seed with the first summand, then fold in the remaining ns-1 terms
      // in index order so the accumulation matches a serial sum.
      zd[v][i] = c[0] * xd[v*ns][i];
      for (int s = 1; s < ns; ++s)
      {
        zd[v][i] += c[s] * xd[v*ns + s][i];
      }
    }
  }
}
235
236 } // namespace nvector_hip
237 } // namespace sundials
238
239 #endif // _NVECTOR_HIP_ARRAY_KERNELS_HIP_HPP_
240