/*
 * -----------------------------------------------------------------
 * Programmer(s): David Gardner, Cody J. Balos @ LLNL
 * -----------------------------------------------------------------
 * SUNDIALS Copyright Start
 * Copyright (c) 2002-2021, Lawrence Livermore National Security
 * and Southern Methodist University.
 * All rights reserved.
 *
 * See the top-level LICENSE and NOTICE files for details.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 * SUNDIALS Copyright End
 * -----------------------------------------------------------------
 */


#ifndef _NVECTOR_HIP_ARRAY_KERNELS_HIP_HPP_
#define _NVECTOR_HIP_ARRAY_KERNELS_HIP_HPP_

#include <limits>
#include <hip/hip_runtime.h>

#include "sundials_hip_kernels.hip.hpp"

using namespace sundials::hip;

namespace sundials
{
namespace nvector_hip
{

/* -----------------------------------------------------------------
 * The namespace for HIP kernels
 *
 * The reduction HIP kernels in nvector are based in part on the
 * "reduction" example in the NVIDIA Corporation CUDA Samples and on
 * the parallel reduction examples in the textbook "CUDA C Programming"
 * by J. Cheng et al.
 * -----------------------------------------------------------------
 */

/*
 * -----------------------------------------------------------------------------
 * fused vector operation kernels
 * -----------------------------------------------------------------------------
 */

/*
 * Computes the linear combination of nv vectors
 */
template <typename T, typename I>
__global__ void
linearCombinationKernel(int nv, T* c, T** xd, T* zd, I n)
{
  GRID_STRIDE_XLOOP(I, i, n)
  {
    zd[i] = c[0]*xd[0][i];
    for (int j=1; j<nv; j++)
      zd[i] += c[j]*xd[j][i];
  }
}
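
/*
 * Illustrative host-side launch sketch (not part of the original file):
 * one plausible way to launch the element-wise fused kernels in this
 * header. The helper name and the 256-thread block size are assumptions
 * made for illustration; c, xd, and zd must already point to device
 * memory.
 */
template <typename T, typename I>
inline void launchLinearCombinationSketch(int nv, T* c, T** xd, T* zd, I n,
                                          hipStream_t stream = nullptr)
{
  const unsigned block = 256; // assumed block size
  const unsigned grid  = static_cast<unsigned>((n + block - 1) / block);
  hipLaunchKernelGGL((linearCombinationKernel<T, I>), dim3(grid), dim3(block),
                     0, stream, nv, c, xd, zd, n);
}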

/*
 * Computes the scaled sum of one vector with nv other vectors
 */
template <typename T, typename I>
__global__ void
scaleAddMultiKernel(int nv, T* c, T* xd, T** yd, T** zd, I n)
{
  GRID_STRIDE_XLOOP(I, i, n)
  {
    for (int j=0; j<nv; j++)
      zd[j][i] = c[j] * xd[i] + yd[j][i];
  }
}


/*
 * Computes the dot product of one vector with nv other vectors
 */
template <typename T, typename I>
__global__ void
dotProdMultiKernel(int nv, const T* xd, T** yd, T* out, I n)
{
  // REQUIRES nv blocks (i.e. gridDim.x == nv)
  const I k = blockIdx.x;

  // Each thread accumulates a partial sum over a strided slice of the data
  T sum = 0.0;
  for (I i = threadIdx.x; i < n; i += blockDim.x)
  {
    sum += xd[i] * yd[k][i];
  }
  sum = blockReduce<T, RSUM>(sum, 0.0);

  // Thread 0 accumulates each block's result into global memory
  if (threadIdx.x == 0) atomicAdd(&out[k], sum);
}
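
/*
 * Illustrative launch sketch for the block-per-vector reduction kernels
 * (an assumption made for illustration, not original API):
 * dotProdMultiKernel must be launched with exactly nv blocks, and
 * because each block accumulates into out[k] with atomicAdd, out must
 * be zeroed before the kernel runs.
 */
template <typename T, typename I>
inline void launchDotProdMultiSketch(int nv, const T* xd, T** yd, T* out, I n,
                                     hipStream_t stream = nullptr)
{
  const unsigned block = 256; // assumed block size
  hipMemsetAsync(out, 0, nv * sizeof(T), stream); // zero the accumulators
  hipLaunchKernelGGL((dotProdMultiKernel<T, I>), dim3(nv), dim3(block),
                     0, stream, nv, xd, yd, out, n);
}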


/*
 * -----------------------------------------------------------------------------
 * vector array operation kernels
 * -----------------------------------------------------------------------------
 */


/*
 * Computes the linear sum of the vectors in two vector arrays
 */
template <typename T, typename I>
__global__ void
linearSumVectorArrayKernel(int nv, T a, T** xd, T b, T** yd, T** zd, I n)
{
  GRID_STRIDE_XLOOP(I, i, n)
  {
    for (int j=0; j<nv; j++)
      zd[j][i] = a * xd[j][i] + b * yd[j][i];
  }
}


/*
 * Scales multiple vectors
 */
template <typename T, typename I>
__global__ void
scaleVectorArrayKernel(int nv, T* c, T** xd, T** zd, I n)
{
  GRID_STRIDE_XLOOP(I, i, n)
  {
    for (int j=0; j<nv; j++)
      zd[j][i] = c[j] * xd[j][i];
  }
}


/*
 * Sets multiple vectors equal to a constant
 */
template <typename T, typename I>
__global__ void
constVectorArrayKernel(int nv, T c, T** zd, I n)
{
  GRID_STRIDE_XLOOP(I, i, n)
  {
    for (int j=0; j<nv; j++)
      zd[j][i] = c;
  }
}


/*
 * Computes the squared weighted l2 norm of nv vectors
 * (the per-vector building block of the WRMS norm)
 */
template <typename T, typename I>
__global__ void
wL2NormSquareVectorArrayKernel(int nv, T** xd, T** wd, T* out, I n)
{
  // REQUIRES nv blocks (i.e. gridDim.x == nv)
  const I k = blockIdx.x;

  // Each thread accumulates a partial sum over a strided slice of the data
  T sum = 0.0;
  for (I i = threadIdx.x; i < n; i += blockDim.x)
  {
    sum += xd[k][i] * wd[k][i] * xd[k][i] * wd[k][i];
  }
  sum = blockReduce<T, RSUM>(sum, 0.0);

  // Thread 0 accumulates each block's result into global memory
  if (threadIdx.x == 0) atomicAdd(&out[k], sum);
}


/*
 * Computes the masked squared weighted l2 norm of nv vectors
 * (the per-vector building block of the masked WRMS norm)
 */
template <typename T, typename I>
__global__ void
wL2NormSquareMaskVectorArrayKernel(int nv, T** xd, T** wd, T* id, T* out, I n)
{
  // REQUIRES nv blocks (i.e. gridDim.x == nv)
  const I k = blockIdx.x;

  // Each thread accumulates a partial sum over a strided slice of the
  // data, skipping entries where the mask is not positive
  T sum = 0.0;
  for (I i = threadIdx.x; i < n; i += blockDim.x)
  {
    if (id[i] > 0.0) sum += xd[k][i] * wd[k][i] * xd[k][i] * wd[k][i];
  }
  sum = blockReduce<T, RSUM>(sum, 0.0);

  // Thread 0 accumulates each block's result into global memory
  if (threadIdx.x == 0) atomicAdd(&out[k], sum);
}
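
/*
 * Usage note (an inference, not stated in the original file): after
 * either norm kernel above completes, out[k] holds the squared weighted
 * l2 norm of vector k, so a host-side WRMS norm would finish as
 * sqrt(out[k] / n) once the results are copied back. Like
 * dotProdMultiKernel, these kernels require exactly nv blocks and a
 * zero-initialized out array.
 */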


/*
 * Computes the scaled sum of a vector array with multiple other vector arrays
 */
template <typename T, typename I>
__global__ void
scaleAddMultiVectorArrayKernel(int nv, int ns, T* c, T** xd, T** yd, T** zd, I n)
{
  GRID_STRIDE_XLOOP(I, i, n)
  {
    for (int k=0; k<nv; k++)
      for (int j=0; j<ns; j++)
        zd[k*ns+j][i] = c[j] * xd[k][i] + yd[k*ns+j][i];
  }
}


/*
 * Computes the linear combination of multiple vector arrays
 */
template <typename T, typename I>
__global__ void
linearCombinationVectorArrayKernel(int nv, int ns, T* c, T** xd, T** zd, I n)
{
  GRID_STRIDE_XLOOP(I, i, n)
  {
    for (int k=0; k<nv; k++)
    {
      zd[k][i] = c[0]*xd[k*ns][i];
      for (int j=1; j<ns; j++)
        zd[k][i] += c[j]*xd[k*ns+j][i];
    }
  }
}
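
/*
 * Layout note (inferred from the indexing above, not stated in the
 * original file): the pointer arrays passed to the two kernels above are
 * flattened row-major, so vector j of group k lives at index k*ns + j.
 * For example, with nv = 2 and ns = 3, the xd array for the linear
 * combination kernel would be ordered
 *   { X[0][0], X[0][1], X[0][2], X[1][0], X[1][1], X[1][2] }.
 */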

} // namespace nvector_hip
} // namespace sundials

#endif // _NVECTOR_HIP_ARRAY_KERNELS_HIP_HPP_