/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013-2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/*! \internal \file
 * \brief Implements PME GPU spline calculation and charge spreading in CUDA.
 * TODO: consider always pre-sorting particles (as in DD case).
 *
 * \author Aleksei Iupinov <a.yupinov@gmail.com>
 */

#include "gmxpre.h"

#include <cassert>

#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
#include "gromacs/gpu_utils/typecasts.cuh"

#include "pme.cuh"
#include "pme_gpu_calculate_splines.cuh"
#include "pme_grid.h"

/*
 * This define affects the spline calculation behaviour in the kernel.
 * 0: a single GPU thread handles a single dimension of a single particle (calculating and storing
 *    (order) spline values and derivatives).
 * 1: (order) threads do redundant work on this same task, each one storing only a single theta and
 *    a single dtheta into the global arrays.
 * The only efficiency difference is fewer global store operations, countered by more redundant
 * spline computation.
 *
 * TODO: estimate if this should be a boolean parameter (and add it to the unit test if so).
 */
#define PME_GPU_PARALLEL_SPLINE 0
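/* Worked example for order 4: with PME_GPU_PARALLEL_SPLINE == 0, a single thread computes and
 * stores all 4 spline values and derivatives for one dimension of one atom; with
 * PME_GPU_PARALLEL_SPLINE == 1, 4 threads each recompute those values but each stores only a
 * single theta/dtheta pair.
 */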

/*! \brief
 * Charge spreading onto the grid.
 * This corresponds to the CPU function spread_coefficients_bsplines_thread().
 * Optional second stage of the spline_and_spread_kernel.
 *
 * \tparam[in] order              PME interpolation order.
 * \tparam[in] wrapX              Whether the grid overlap in dimension X should be wrapped.
 * \tparam[in] wrapY              Whether the grid overlap in dimension Y should be wrapped.
 * \tparam[in] gridIndex          The index of the grid to use in the kernel.
 * \tparam[in] threadsPerAtom     How many threads work on each atom.
 *
 * \param[in]  kernelParams       Input PME CUDA data in constant memory.
 * \param[in]  atomCharge         Atom charge/coefficient of the atom processed by the thread.
 * \param[in]  sm_gridlineIndices Atom gridline indices in shared memory.
 * \param[in]  sm_theta           Atom spline values in shared memory.
 */
template<int order, bool wrapX, bool wrapY, int gridIndex, ThreadsPerAtom threadsPerAtom>
__device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams kernelParams,
                                               const float*                 atomCharge,
                                               const int* __restrict__ sm_gridlineIndices,
                                               const float* __restrict__ sm_theta)
{
    /* Global memory pointer to the output grid */
    float* __restrict__ gm_grid = kernelParams.grid.d_realGrid[gridIndex];

    // Number of atoms processed by a single warp in spread and gather
    const int threadsPerAtomValue = (threadsPerAtom == ThreadsPerAtom::Order) ? order : order * order;
    const int atomsPerWarp        = warp_size / threadsPerAtomValue;
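    // E.g. with warp_size 32 and order 4: ThreadsPerAtom::Order gives 32/4 = 8 atoms per warp,
    // ThreadsPerAtom::OrderSquared gives 32/16 = 2 atoms per warp.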

    const int nx  = kernelParams.grid.realGridSize[XX];
    const int ny  = kernelParams.grid.realGridSize[YY];
    const int nz  = kernelParams.grid.realGridSize[ZZ];
    const int pny = kernelParams.grid.realGridSizePadded[YY];
    const int pnz = kernelParams.grid.realGridSizePadded[ZZ];

    const int offx = 0, offy = 0, offz = 0; // unused for now

    const int atomIndexLocal = threadIdx.z;

    const int chargeCheck = pme_gpu_check_atom_charge(*atomCharge);
    if (chargeCheck)
    {
        // Spline Z coordinates
        const int ithz = threadIdx.x;

        const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX] - offx;
        const int iyBase = sm_gridlineIndices[atomIndexLocal * DIM + YY] - offy;
        int       iz     = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] - offz + ithz;
        if (iz >= nz)
        {
            iz -= nz;
        }
        /* Atom index w.r.t. warp - alternating 0 1 0 1 .. */
        const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
        /* Warp index w.r.t. block - could probably be obtained more easily? */
        const int warpIndex = atomIndexLocal / atomsPerWarp;

        const int splineIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
        const int   splineIndexZ = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, ZZ, ithz);
        const float thetaZ       = sm_theta[splineIndexZ];

        /* loop not used if order*order threads per atom */
        const int ithyMin = (threadsPerAtom == ThreadsPerAtom::Order) ? 0 : threadIdx.y;
        const int ithyMax = (threadsPerAtom == ThreadsPerAtom::Order) ? order : threadIdx.y + 1;
        for (int ithy = ithyMin; ithy < ithyMax; ithy++)
        {
            int iy = iyBase + ithy;
            if (wrapY & (iy >= ny))
            {
                iy -= ny;
            }

            const int splineIndexY = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
            float       thetaY = sm_theta[splineIndexY];
            const float Val    = thetaZ * thetaY * (*atomCharge);
            assert(isfinite(Val));
            const int offset = iy * pnz + iz;
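            // The padded real grid is laid out with z fastest: cell (ix, iy, iz) lives at flat
            // index ix * pny * pnz + iy * pnz + iz, so `offset` fixes the (y, z) part and the
            // x loop below only adds the ix * pny * pnz term.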

#pragma unroll
            for (int ithx = 0; (ithx < order); ithx++)
            {
                int ix = ixBase + ithx;
                if (wrapX & (ix >= nx))
                {
                    ix -= nx;
                }
                const int gridIndexGlobal = ix * pny * pnz + offset;
                const int splineIndexX =
                        getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
                const float thetaX = sm_theta[splineIndexX];
                assert(isfinite(thetaX));
                assert(isfinite(gm_grid[gridIndexGlobal]));
                atomicAdd(gm_grid + gridIndexGlobal, thetaX * Val);
            }
        }
    }
}

/*! \brief
 * A spline computation and charge spreading kernel function.
 *
 * Two tuning parameters can be used for additional performance. For small systems and for
 * debugging, writeGlobal should be enabled, removing the need to recalculate the theta values
 * in the gather kernel. Similarly, for large systems, using order threads per atom
 * (ThreadsPerAtom::Order) gives higher performance than order*order threads.
 *
 * \tparam[in] order          PME interpolation order.
 * \tparam[in] computeSplines A boolean which tells if the spline parameter and
 *                            gridline indices' computation should be performed.
 * \tparam[in] spreadCharges  A boolean which tells if the charge spreading should be performed.
 * \tparam[in] wrapX          A boolean which tells if the grid overlap in dimension X should be wrapped.
 * \tparam[in] wrapY          A boolean which tells if the grid overlap in dimension Y should be wrapped.
 * \tparam[in] numGrids       The number of grids to use in the kernel. Can be 1 or 2.
 * \tparam[in] writeGlobal    A boolean which tells if the theta values and gridlines should be written to global memory.
 * \tparam[in] threadsPerAtom How many threads work on each atom.
 * \param[in]  kernelParams   Input PME CUDA data in constant memory.
 */
template<int order, bool computeSplines, bool spreadCharges, bool wrapX, bool wrapY, int numGrids, bool writeGlobal, ThreadsPerAtom threadsPerAtom>
__launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBUTE __global__
        void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams)
{
    const int threadsPerAtomValue = (threadsPerAtom == ThreadsPerAtom::Order) ? order : order * order;
    const int atomsPerBlock       = c_spreadMaxThreadsPerBlock / threadsPerAtomValue;
    // Number of atoms processed by a single warp in spread and gather
    const int atomsPerWarp = warp_size / threadsPerAtomValue;
    // Gridline indices, ivec
    __shared__ int sm_gridlineIndices[atomsPerBlock * DIM];
    // Charges
    __shared__ float sm_coefficients[atomsPerBlock];
    // Spline values
    __shared__ float sm_theta[atomsPerBlock * DIM * order];
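    // Sizing sketch (assuming c_spreadMaxThreadsPerBlock == 256 and order == 4):
    // ThreadsPerAtom::Order gives atomsPerBlock == 64, i.e. 192 ints of gridline indices,
    // 64 floats of charges and 768 floats of thetas per block.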
    float dtheta;

    float3 atomX;
    float  atomCharge;

    const int blockIndex      = blockIdx.y * gridDim.x + blockIdx.x;
    const int atomIndexOffset = blockIndex * atomsPerBlock;

    /* Thread index w.r.t. block */
    const int threadLocalId =
            (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;
    /* Warp index w.r.t. block - could probably be obtained more easily? */
    const int warpIndex = threadLocalId / warp_size;

    /* Atom index w.r.t. warp */
    const int atomWarpIndex = threadIdx.z % atomsPerWarp;
    /* Atom index w.r.t. block/shared memory */
    const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex;
    /* Atom index w.r.t. global memory */
    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
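    // E.g. with order 4, ThreadsPerAtom::OrderSquared and warp_size 32: atomsPerWarp == 2, so
    // atomWarpIndex alternates 0, 1 along threadIdx.z and atomIndexLocal runs over the block's atoms.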

    /* Early return for fully empty blocks at the end
     * (should only happen for billions of input atoms)
     */
    if (atomIndexOffset >= kernelParams.atoms.nAtoms)
    {
        return;
    }
    /* Charges, required for both spline and spread */
    if (c_useAtomDataPrefetch)
    {
        pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(sm_coefficients,
                                                         kernelParams.atoms.d_coefficients[0]);
        __syncthreads();
        atomCharge = sm_coefficients[atomIndexLocal];
    }
    else
    {
        atomCharge = kernelParams.atoms.d_coefficients[0][atomIndexGlobal];
    }

    if (computeSplines)
    {
        const float3* __restrict__ gm_coordinates = asFloat3(kernelParams.atoms.d_coordinates);
        if (c_useAtomDataPrefetch)
        {
            // Coordinates
            __shared__ float3 sm_coordinates[atomsPerBlock];

            /* Staging coordinates */
            pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(sm_coordinates, gm_coordinates);
            __syncthreads();
            atomX = sm_coordinates[atomIndexLocal];
        }
        else
        {
            atomX = gm_coordinates[atomIndexGlobal];
        }
        calculate_splines<order, atomsPerBlock, atomsPerWarp, false, writeGlobal, numGrids>(
                kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, &dtheta, sm_gridlineIndices);
        __syncwarp();
    }
    else
    {
        /* Staging the data for spread
         * (the data is assumed to be in GPU global memory with proper layout already,
         * as in after running the spline kernel)
         */
        /* Spline data - only thetas (dthetas will only be needed in gather) */
        pme_gpu_stage_atom_data<float, atomsPerBlock, DIM * order>(sm_theta, kernelParams.atoms.d_theta);
        /* Gridline indices */
        pme_gpu_stage_atom_data<int, atomsPerBlock, DIM>(sm_gridlineIndices,
                                                         kernelParams.atoms.d_gridlineIndices);

        __syncthreads();
    }

    /* Spreading */
    if (spreadCharges)
    {
        spread_charges<order, wrapX, wrapY, 0, threadsPerAtom>(kernelParams, &atomCharge,
                                                               sm_gridlineIndices, sm_theta);
    }
    if (numGrids == 2)
    {
        __syncthreads();
        if (c_useAtomDataPrefetch)
        {
            pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(sm_coefficients,
                                                             kernelParams.atoms.d_coefficients[1]);
            __syncthreads();
            atomCharge = sm_coefficients[atomIndexLocal];
        }
        else
        {
            atomCharge = kernelParams.atoms.d_coefficients[1][atomIndexGlobal];
        }
        if (spreadCharges)
        {
            spread_charges<order, wrapX, wrapY, 1, threadsPerAtom>(kernelParams, &atomCharge,
                                                                   sm_gridlineIndices, sm_theta);
        }
    }
}

//! Kernel instantiations
// clang-format off
template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 1, true, ThreadsPerAtom::Order>        (const PmeGpuCudaKernelParams);
template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, 1, true, ThreadsPerAtom::Order>       (const PmeGpuCudaKernelParams);
template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, 1, true, ThreadsPerAtom::Order>       (const PmeGpuCudaKernelParams);
template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 1, false, ThreadsPerAtom::Order>       (const PmeGpuCudaKernelParams);
template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 1, true, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, 1, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, 1, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 1, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 2, true, ThreadsPerAtom::Order>        (const PmeGpuCudaKernelParams);
template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, 2, true, ThreadsPerAtom::Order>       (const PmeGpuCudaKernelParams);
template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, 2, true, ThreadsPerAtom::Order>       (const PmeGpuCudaKernelParams);
template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 2, false, ThreadsPerAtom::Order>       (const PmeGpuCudaKernelParams);
template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 2, true, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, 2, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, 2, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 2, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
// clang-format on
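
/* A minimal host-side launch sketch, for illustration only (the real launch configuration is
 * chosen by the PME GPU host code, which also picks among the instantiations above). It assumes
 * order 4 with ThreadsPerAtom::OrderSquared, so the block shape is (order, order, atomsPerBlock)
 * with threadIdx.x -> z spline index, threadIdx.y -> y spline index and threadIdx.z -> atom,
 * matching the indexing used by the kernels in this file. The helper name is hypothetical.
 */
#if 0
static void launchSplineAndSpreadSketch(const PmeGpuCudaKernelParams& kernelParams, cudaStream_t stream)
{
    constexpr int order         = 4;
    constexpr int atomsPerBlock = c_spreadMaxThreadsPerBlock / (order * order);
    const dim3    blockSize(order, order, atomsPerBlock);
    // A 1D grid suffices for this sketch; the kernel also supports a 2D grid
    // (blockIndex = blockIdx.y * gridDim.x + blockIdx.x) for very large atom counts.
    const int numBlocks = (kernelParams.atoms.nAtoms + atomsPerBlock - 1) / atomsPerBlock;
    // Splines + spread, a single grid, thetas written to global memory for reuse in gather.
    pme_spline_and_spread_kernel<order, true, true, true, true, 1, true, ThreadsPerAtom::OrderSquared>
            <<<numBlocks, blockSize, 0, stream>>>(kernelParams);
}
#endif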