1/* 2 * This file is part of the GROMACS molecular simulation package. 3 * 4 * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by 5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, 6 * and including many others, as listed in the AUTHORS file in the 7 * top-level source directory and at http://www.gromacs.org. 8 * 9 * GROMACS is free software; you can redistribute it and/or 10 * modify it under the terms of the GNU Lesser General Public License 11 * as published by the Free Software Foundation; either version 2.1 12 * of the License, or (at your option) any later version. 13 * 14 * GROMACS is distributed in the hope that it will be useful, 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 * Lesser General Public License for more details. 18 * 19 * You should have received a copy of the GNU Lesser General Public 20 * License along with GROMACS; if not, see 21 * http://www.gnu.org/licenses, or write to the Free Software Foundation, 22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 23 * 24 * If you want to redistribute modifications to GROMACS, please 25 * consider that scientific software is very special. Version 26 * control is crucial - bugs must be traceable. We will be happy to 27 * consider code for inclusion in the official distribution, but 28 * derived work must not be called official GROMACS. Details are found 29 * in the README & COPYING files - if they are missing, get the 30 * official version at http://www.gromacs.org. 31 * 32 * To help us fund GROMACS development, we humbly ask that you cite 33 * the research papers on the package. Check out http://www.gromacs.org. 34 */ 35 36/*! \internal \file 37 * \brief Implements PME OpenCL spline parameter computation and charge spread kernels. 38 * When including this and other PME OpenCL kernel files, plenty of common 39 * constants/macros are expected to be defined (such as "order" which is PME interpolation order). 40 * For details, please see how pme_program.cl is compiled in pme_gpu_program_impl_ocl.cpp. 41 * 42 * This file's kernels specifically expect the following definitions: 43 * 44 * - atomsPerBlock which expresses how many atoms are processed by a single work group 45 * - order which is a PME interpolation order 46 * - computeSplines and spreadCharges must evaluate to either true or false to specify which 47 * kernel falvor is being compiled 48 * - wrapX and wrapY must evaluate to either true or false to specify whether the grid overlap 49 * in dimension X/Y is to be used 50 * 51 * \author Aleksei Iupinov <a.yupinov@gmail.com> 52 */ 53 54#include "gromacs/gpu_utils/vectype_ops.clh" 55 56#include "pme_gpu_calculate_splines.clh" 57#include "pme_gpu_types.h" 58 59/* 60 * This define affects the spline calculation behaviour in the kernel. 61 * 0: a single GPU thread handles a single dimension of a single particle (calculating and storing 62 * (order) spline values and derivatives). 1: (order) threads do redundant work on this same task, 63 * each one stores only a single theta and single dtheta into global arrays. The only efficiency 64 * difference is less global store operations, countered by more redundant spline computation. 65 * 66 * TODO: estimate if this should be a boolean parameter (and add it to the unit test if so). 67 */ 68#define PME_GPU_PARALLEL_SPLINE 0 69 70#ifndef COMPILE_SPREAD_HELPERS_ONCE 71# define COMPILE_SPREAD_HELPERS_ONCE 72 73/*! \brief 74 * General purpose function for loading atom-related data from global to shared memory. 75 * 76 * \param[out] sm_destination Local memory array for output. 77 * \param[in] gm_source Global memory array for input. 78 * \param[in] dataCountPerAtom Number of data elements per single 79 * atom (e.g. DIM for an rvec coordinates array). 80 * 81 */ 82inline void pme_gpu_stage_atom_data(__local float* __restrict__ sm_destination, 83 __global const float* __restrict__ gm_source, 84 const int dataCountPerAtom) 85{ 86 assert((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0) 87 < MAX_INT); 88 const int threadLocalIndex = 89 (int)((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) 90 + get_local_id(0)); 91 const int localIndex = threadLocalIndex; 92 const int globalIndexBase = (int)get_group_id(XX) * atomsPerBlock * dataCountPerAtom; 93 const int globalIndex = globalIndexBase + localIndex; 94 if (localIndex < atomsPerBlock * dataCountPerAtom) 95 { 96 assert(isfinite(float(gm_source[globalIndex]))); 97 sm_destination[localIndex] = gm_source[globalIndex]; 98 } 99} 100 101/*! \brief 102 * PME GPU spline parameter and gridline indices calculation. 103 * This corresponds to the CPU functions calc_interpolation_idx() and make_bsplines(). 104 * First stage of the whole kernel. 105 * 106 * \param[in] kernelParams Input PME GPU data in constant memory. 107 * \param[in] atomIndexOffset Starting atom index for the execution block w.r.t. global memory. 108 * \param[in] sm_coordinates Atom coordinates in the shared memory (rvec) 109 * \param[in] sm_coefficients Atom charges/coefficients in the shared memory. 110 * \param[out] sm_theta Atom spline values in the shared memory. 111 * \param[out] sm_gridlineIndices Atom gridline indices in the shared memory. 112 * \param[out] sm_fractCoords Atom fractional coordinates (rvec) 113 * \param[out] gm_theta Atom spline parameter values 114 * \param[out] gm_dtheta Atom spline parameter derivatives 115 * \param[out] gm_gridlineIndices Same as \p sm_gridlineIndices but in global memory. 116 * \param[in] gm_fractShiftsTable Atom fractional coordinates correction table 117 * \param[in] gm_gridlineIndicesTable Table with atom gridline indices 118 */ 119gmx_opencl_inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams, 120 const int atomIndexOffset, 121 __local const float* __restrict__ sm_coordinates, 122 __local const float* __restrict__ sm_coefficients, 123 __local float* __restrict__ sm_theta, 124 __local int* __restrict__ sm_gridlineIndices, 125 __local float* __restrict__ sm_fractCoords, 126 __global float* __restrict__ gm_theta, 127 __global float* __restrict__ gm_dtheta, 128 __global int* __restrict__ gm_gridlineIndices, 129 __global const float* __restrict__ gm_fractShiftsTable, 130 __global const int* __restrict__ gm_gridlineIndicesTable) 131{ 132 /* Thread index w.r.t. block */ 133 assert((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0) 134 < MAX_INT); 135 assert(numGrids == 1 || numGrids == 2); 136 assert(numGrids == 1 || c_skipNeutralAtoms == false); 137 const int threadLocalIndex = 138 (int)((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) 139 + get_local_id(0)); 140 /* Warp index w.r.t. block - could probably be obtained easier? */ 141 const int warpIndex = threadLocalIndex / warp_size; 142 /* Thread index w.r.t. warp */ 143 const int threadWarpIndex = threadLocalIndex % warp_size; 144 /* Atom index w.r.t. warp - alternating 0 1 0 1 .. */ 145 const int atomWarpIndex = threadWarpIndex % atomsPerWarp; 146 /* Atom index w.r.t. block/shared memory */ 147 const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex; 148 149 /* Spline contribution index in one dimension */ 150 const int orderIndex = threadWarpIndex / (atomsPerWarp * DIM); 151 /* Dimension index */ 152 const int dimIndex = (threadWarpIndex - orderIndex * (atomsPerWarp * DIM)) / atomsPerWarp; 153 154 /* Multi-purpose index of rvec/ivec atom data */ 155 const int sharedMemoryIndex = atomIndexLocal * DIM + dimIndex; 156 157 /* Spline parameters need a working buffer. 158 * With PME_GPU_PARALLEL_SPLINE == 0 it is just a local array of (order) values for some of the threads, which is fine; 159 * With PME_GPU_PARALLEL_SPLINE == 1 (order) times more threads are involved, so the shared memory is used to avoid overhead. 160 * The buffer's size, striding and indexing are adjusted accordingly. 161 * The buffer is accessed with SPLINE_DATA_PTR and SPLINE_DATA macros. 162 */ 163# if PME_GPU_PARALLEL_SPLINE 164# define splineDataStride (atomsPerBlock * DIM) 165 const int splineDataIndex = sharedMemoryIndex; 166 __local float sm_splineData[splineDataStride * order]; 167 __local float* splineDataPtr = sm_splineData; 168# else 169# define splineDataStride 1 170 const int splineDataIndex = 0; 171 float splineData[splineDataStride * order]; 172 float* splineDataPtr = splineData; 173# endif 174 175# define SPLINE_DATA_PTR(i) (splineDataPtr + ((i)*splineDataStride + splineDataIndex)) 176# define SPLINE_DATA(i) (*SPLINE_DATA_PTR(i)) 177 178 const int localCheck = (dimIndex < DIM) && (orderIndex < (PME_GPU_PARALLEL_SPLINE ? order : 1)); 179 180 if (localCheck) 181 { 182 /* Indices interpolation */ 183 if (orderIndex == 0) 184 { 185 int tableIndex = 0; 186 float n = 0.0F; 187 float t = 0.0F; 188 const float3 x = vload3(atomIndexLocal, sm_coordinates); 189 190 /* Accessing fields in fshOffset/nXYZ/recipbox/... with dimIndex offset 191 * puts them into local memory(!) instead of accessing the constant memory directly. 192 * That's the reason for the switch, to unroll explicitly. 193 * The commented parts correspond to the 0 components of the recipbox. 194 */ 195 switch (dimIndex) 196 { 197 case XX: 198 tableIndex = kernelParams.grid.tablesOffsets[XX]; 199 n = kernelParams.grid.realGridSizeFP[XX]; 200 t = x.x * kernelParams.current.recipBox[dimIndex][XX] 201 + x.y * kernelParams.current.recipBox[dimIndex][YY] 202 + x.z * kernelParams.current.recipBox[dimIndex][ZZ]; 203 break; 204 205 case YY: 206 tableIndex = kernelParams.grid.tablesOffsets[YY]; 207 n = kernelParams.grid.realGridSizeFP[YY]; 208 t = /*x.x * kernelParams.current.recipbox[dimIndex][XX] + */ x.y 209 * kernelParams.current.recipBox[dimIndex][YY] 210 + x.z * kernelParams.current.recipBox[dimIndex][ZZ]; 211 break; 212 213 case ZZ: 214 tableIndex = kernelParams.grid.tablesOffsets[ZZ]; 215 n = kernelParams.grid.realGridSizeFP[ZZ]; 216 t = /*x.x * kernelParams.current.recipbox[dimIndex][XX] + x.y * kernelParams.current.recipbox[dimIndex][YY] + */ x 217 .z 218 * kernelParams.current.recipBox[dimIndex][ZZ]; 219 break; 220 default: 221 assert(false); 222 return; 223 break; 224 } 225 const float shift = c_pmeMaxUnitcellShift; 226 227 /* Fractional coordinates along box vectors, adding a positive shift to ensure t is positive for triclinic boxes */ 228 t = (t + shift) * n; 229 const int tInt = (int)t; 230 sm_fractCoords[sharedMemoryIndex] = t - (float)tInt; 231 tableIndex += tInt; 232 assert(tInt >= 0); 233 assert(tInt < c_pmeNeighborUnitcellCount * n); 234 sm_fractCoords[sharedMemoryIndex] += gm_fractShiftsTable[tableIndex]; 235 sm_gridlineIndices[sharedMemoryIndex] = gm_gridlineIndicesTable[tableIndex]; 236 gm_gridlineIndices[atomIndexOffset * DIM + sharedMemoryIndex] = 237 sm_gridlineIndices[sharedMemoryIndex]; 238 } 239 240 /* B-spline calculation */ 241 242 const int chargeCheck = pme_gpu_check_atom_charge(sm_coefficients[atomIndexLocal]); 243 /* With FEP (numGrids == 2), we might have 0 charge in state A, but !=0 in state B, so we always calculate splines */ 244 if (numGrids == 2 || chargeCheck) 245 { 246 int o = orderIndex; // This is an index that is set once for PME_GPU_PARALLEL_SPLINE == 1 247 248 const float dr = sm_fractCoords[sharedMemoryIndex]; 249 assert(isfinite(dr)); 250 251 /* dr is relative offset from lower cell limit */ 252 *SPLINE_DATA_PTR(order - 1) = 0.0F; 253 *SPLINE_DATA_PTR(1) = dr; 254 *SPLINE_DATA_PTR(0) = 1.0F - dr; 255 256# pragma unroll order 257 for (int k = 3; k < order; k++) 258 { 259 const float div = 1.0F / ((float)k - 1.0F); 260 *SPLINE_DATA_PTR(k - 1) = div * dr * SPLINE_DATA(k - 2); 261# pragma unroll 262 for (int l = 1; l < (k - 1); l++) 263 { 264 *SPLINE_DATA_PTR(k - l - 1) = 265 div 266 * ((dr + (float)l) * SPLINE_DATA(k - l - 2) 267 + ((float)k - (float)l - dr) * SPLINE_DATA(k - l - 1)); 268 } 269 *SPLINE_DATA_PTR(0) = div * (1.0F - dr) * SPLINE_DATA(0); 270 } 271 272 const int thetaIndexBase = getSplineParamIndexBase(warpIndex, atomWarpIndex); 273 const int thetaGlobalOffsetBase = atomIndexOffset * DIM * order; 274 275 /* Differentiation and storing the spline derivatives (dtheta) */ 276# if !PME_GPU_PARALLEL_SPLINE 277 // With PME_GPU_PARALLEL_SPLINE == 1, o is already set to orderIndex; 278 // With PME_GPU_PARALLEL_SPLINE == 0, we loop o over range(order). 279# pragma unroll order 280 for (o = 0; o < order; o++) 281# endif 282 { 283 const int thetaIndex = getSplineParamIndex(thetaIndexBase, dimIndex, o); 284 const int thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex; 285 286 const float dtheta = ((o > 0) ? SPLINE_DATA(o - 1) : 0.0F) - SPLINE_DATA(o); 287 assert(isfinite(dtheta)); 288 gm_dtheta[thetaGlobalIndex] = dtheta; 289 } 290 291 const float div = 1.0F / (order - 1.0F); 292 *SPLINE_DATA_PTR(order - 1) = div * dr * SPLINE_DATA(order - 2); 293# pragma unroll 294 for (int k = 1; k < (order - 1); k++) 295 { 296 *SPLINE_DATA_PTR(order - k - 1) = div 297 * ((dr + (float)k) * SPLINE_DATA(order - k - 2) 298 + (order - k - dr) * SPLINE_DATA(order - k - 1)); 299 } 300 *SPLINE_DATA_PTR(0) = div * (1.0F - dr) * SPLINE_DATA(0); 301 302 /* Storing the spline values (theta) */ 303# if !PME_GPU_PARALLEL_SPLINE 304 // See comment for the loop above 305# pragma unroll order 306 for (o = 0; o < order; o++) 307# endif 308 { 309 const int thetaIndex = getSplineParamIndex(thetaIndexBase, dimIndex, o); 310 const int thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex; 311 312 sm_theta[thetaIndex] = SPLINE_DATA(o); 313 assert(isfinite(sm_theta[thetaIndex])); 314 gm_theta[thetaGlobalIndex] = SPLINE_DATA(o); 315 } 316 } 317 } 318} 319 320/*! \brief 321 * Charge spreading onto the grid. 322 * This corresponds to the CPU function spread_coefficients_bsplines_thread(). 323 * Second stage of the whole kernel. 324 * 325 * \param[in] kernelParams Input PME GPU data in constant memory. 326 * \param[in] sm_coefficients Atom coefficents/charges in the shared memory. 327 * \param[in] sm_gridlineIndices Atom gridline indices in the shared memory. 328 * \param[in] sm_theta Atom spline values in the shared memory. 329 * \param[out] gm_grid Global 3D grid for spreading. 330 */ 331gmx_opencl_inline void spread_charges(const struct PmeOpenCLKernelParams kernelParams, 332 __local const float* __restrict__ sm_coefficients, 333 __local const int* __restrict__ sm_gridlineIndices, 334 __local const float* __restrict__ sm_theta, 335 __global float* __restrict__ gm_grid) 336{ 337 const int nx = kernelParams.grid.realGridSize[XX]; 338 const int ny = kernelParams.grid.realGridSize[YY]; 339 const int nz = kernelParams.grid.realGridSize[ZZ]; 340 const int pny = kernelParams.grid.realGridSizePadded[YY]; 341 const int pnz = kernelParams.grid.realGridSizePadded[ZZ]; 342 343 const int offx = 0; 344 const int offy = 0; 345 const int offz = 0; 346 347 const int atomIndexLocal = get_local_id(ZZ); 348 349 const int chargeCheck = pme_gpu_check_atom_charge(sm_coefficients[atomIndexLocal]); 350 if (chargeCheck) 351 { 352 // Spline Y/Z coordinates 353 const int ithy = get_local_id(YY); 354 const int ithz = get_local_id(XX); 355 const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX] - offx; 356 int iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] - offy + ithy; 357 if (wrapY & (iy >= ny)) 358 { 359 iy -= ny; 360 } 361 int iz = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] - offz + ithz; 362 if (iz >= nz) 363 { 364 iz -= nz; 365 } 366 367 /* Atom index w.r.t. warp - alternating 0 1 0 1 .. */ 368 const int atomWarpIndex = atomIndexLocal % atomsPerWarp; 369 /* Warp index w.r.t. block - could probably be obtained easier? */ 370 const int warpIndex = atomIndexLocal / atomsPerWarp; 371 372 const int splineIndexBase = getSplineParamIndexBase(warpIndex, atomWarpIndex); 373 const int splineIndexZ = getSplineParamIndex(splineIndexBase, ZZ, ithz); 374 const float thetaZ = sm_theta[splineIndexZ]; 375 const int splineIndexY = getSplineParamIndex(splineIndexBase, YY, ithy); 376 const float thetaY = sm_theta[splineIndexY]; 377 const float constVal = thetaZ * thetaY * sm_coefficients[atomIndexLocal]; 378 assert(isfinite(constVal)); 379 const int constOffset = iy * pnz + iz; 380 381# pragma unroll order 382 for (int ithx = 0; (ithx < order); ithx++) 383 { 384 int ix = ixBase + ithx; 385 if (wrapX & (ix >= nx)) 386 { 387 ix -= nx; 388 } 389 const int gridIndexGlobal = ix * pny * pnz + constOffset; 390 const int splineIndexX = getSplineParamIndex(splineIndexBase, XX, ithx); 391 const float thetaX = sm_theta[splineIndexX]; 392 assert(isfinite(thetaX)); 393 assert(isfinite(gm_grid[gridIndexGlobal])); 394 atomicAdd_g_f(gm_grid + gridIndexGlobal, thetaX * constVal); 395 } 396 } 397} 398 399#endif // COMPILE_SPREAD_HELPERS_ONCE 400 401/*! \brief 402 * A spline computation and/or charge spreading kernel function. 403 * Please see the file description for additional defines which this kernel expects. 404 * 405 * \param[in] kernelParams Input PME GPU data in constant memory. 406 * \param[in,out] gm_theta Atom spline parameter values 407 * \param[out] gm_dtheta Atom spline parameter derivatives 408 * \param[in,out] gm_gridlineIndices Atom gridline indices (ivec) 409 * \param[out] gm_gridA Global 3D grid for charge spreading. 410 * Grid for the unperturbed state, FEP state A or the single grid used for 411 * interpolated coefficients on one grid in FEP A/B. 412 * \param[out] gm_gridB Global 3D grid for charge spreading. 413 * FEP state B when using dual grids (when calculating energy and virials). 414 * \param[in] gm_fractShiftsTable Atom fractional coordinates correction table 415 * \param[in] gm_gridlineIndicesTable Table with atom gridline indices 416 * \param[in] gm_coefficientsA Atom charges/coefficients of the unperturbed state 417 * or FEP state A. 418 * \param[in] gm_coefficientsB Atom charges/coefficients in FEP state B. Only 419 * used when spreading interpolated coefficients on one grid. 420 * \param[in] gm_coordinates Atom coordinates (rvec) 421 */ 422__attribute__((reqd_work_group_size(order, order, atomsPerBlock))) __kernel void CUSTOMIZED_KERNEL_NAME( 423 pme_spline_and_spread_kernel)(const struct PmeOpenCLKernelParams kernelParams, 424 __global float* __restrict__ gm_theta, 425 __global float* __restrict__ gm_dtheta, 426 __global int* __restrict__ gm_gridlineIndices, 427 __global float* __restrict__ gm_gridA, 428 __global float* __restrict__ gm_gridB, 429 __global const float* __restrict__ gm_fractShiftsTable, 430 __global const int* __restrict__ gm_gridlineIndicesTable, 431 __global const float* __restrict__ gm_coefficientsA, 432 __global const float* __restrict__ gm_coefficientsB, 433 __global const float* __restrict__ gm_coordinates) 434{ 435 // Gridline indices, ivec 436 __local int sm_gridlineIndices[atomsPerBlock * DIM]; 437 // Charges 438 __local float sm_coefficients[atomsPerBlock]; 439 // Spline values 440 __local float sm_theta[atomsPerBlock * DIM * order]; 441 // Fractional coordinates - only for spline computation 442 __local float sm_fractCoords[atomsPerBlock * DIM]; 443 // Staging coordinates - only for spline computation 444 __local float sm_coordinates[DIM * atomsPerBlock]; 445 446 const int atomIndexOffset = (int)get_group_id(XX) * atomsPerBlock; 447 int gridIndex = 0; 448 449 /* Staging coefficients/charges for both spline and spread */ 450 pme_gpu_stage_atom_data(sm_coefficients, gm_coefficientsA, 1); 451 452 if (computeSplines) 453 { 454 /* Staging coordinates */ 455 pme_gpu_stage_atom_data(sm_coordinates, gm_coordinates, DIM); 456 457 barrier(CLK_LOCAL_MEM_FENCE); 458 calculate_splines(kernelParams, atomIndexOffset, sm_coordinates, sm_coefficients, sm_theta, 459 sm_gridlineIndices, sm_fractCoords, gm_theta, gm_dtheta, 460 gm_gridlineIndices, gm_fractShiftsTable, gm_gridlineIndicesTable); 461#if !defined(_AMD_SOURCE_) && !defined(_NVIDIA_SOURCE_) 462 /* This is only here for execution of e.g. 32-sized warps on 16-wide hardware; this was 463 * __syncwarp() in CUDA. #2519 464 */ 465 barrier(CLK_LOCAL_MEM_FENCE); 466#endif 467 } 468 else 469 { 470 /* Staging the data for spread 471 * (the data is assumed to be in GPU global memory with proper layout already, 472 * as in after running the spline kernel) 473 */ 474 /* Spline data - only thetas (dthetas will only be needed in gather) */ 475 pme_gpu_stage_atom_data(sm_theta, gm_theta, DIM * order); 476 /* Gridline indices - they're actually int and not float, but C99 is angry about overloads */ 477 pme_gpu_stage_atom_data((__local float*)sm_gridlineIndices, 478 (__global const float*)gm_gridlineIndices, DIM); 479 480 barrier(CLK_LOCAL_MEM_FENCE); 481 } 482 /* Spreading */ 483 if (spreadCharges) 484 { 485 spread_charges(kernelParams, sm_coefficients, sm_gridlineIndices, sm_theta, gm_gridA); 486 } 487 if (numGrids == 2) 488 { 489 barrier(CLK_LOCAL_MEM_FENCE); 490 gridIndex = 1; 491 pme_gpu_stage_atom_data(sm_coefficients, gm_coefficientsB, 1); 492 barrier(CLK_LOCAL_MEM_FENCE); 493 if (spreadCharges) 494 { 495 spread_charges(kernelParams, sm_coefficients, sm_gridlineIndices, sm_theta, gm_gridB); 496 } 497 } 498} 499