1 /* 2 * This file is part of the GROMACS molecular simulation package. 3 * 4 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by 5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, 6 * and including many others, as listed in the AUTHORS file in the 7 * top-level source directory and at http://www.gromacs.org. 8 * 9 * GROMACS is free software; you can redistribute it and/or 10 * modify it under the terms of the GNU Lesser General Public License 11 * as published by the Free Software Foundation; either version 2.1 12 * of the License, or (at your option) any later version. 13 * 14 * GROMACS is distributed in the hope that it will be useful, 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 * Lesser General Public License for more details. 18 * 19 * You should have received a copy of the GNU Lesser General Public 20 * License along with GROMACS; if not, see 21 * http://www.gnu.org/licenses, or write to the Free Software Foundation, 22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 23 * 24 * If you want to redistribute modifications to GROMACS, please 25 * consider that scientific software is very special. Version 26 * control is crucial - bugs must be traceable. We will be happy to 27 * consider code for inclusion in the official distribution, but 28 * derived work must not be called official GROMACS. Details are found 29 * in the README & COPYING files - if they are missing, get the 30 * official version at http://www.gromacs.org. 31 * 32 * To help us fund GROMACS development, we humbly ask that you cite 33 * the research papers on the package. Check out http://www.gromacs.org. 34 */ 35 /*! \internal \file 36 * \brief 37 * Declares PmeGpuProgramImpl, which stores PME GPU (compiled) kernel handles. 38 * 39 * \author Aleksei Iupinov <a.yupinov@gmail.com> 40 * \ingroup module_ewald 41 */ 42 #ifndef GMX_EWALD_PME_PME_GPU_PROGRAM_IMPL_H 43 #define GMX_EWALD_PME_PME_GPU_PROGRAM_IMPL_H 44 45 #include "config.h" 46 47 #include "gromacs/gpu_utils/device_context.h" 48 #include "gromacs/utility/classhelpers.h" 49 50 class DeviceContext; 51 struct DeviceInformation; 52 53 /*! \internal 54 * \brief 55 * PME GPU persistent host program/kernel data, which should be initialized once for the whole execution. 56 * 57 * Primary purpose of this is to not recompile GPU kernels for each OpenCL unit test, 58 * while the relevant GPU context (e.g. cl_context) instance persists. 59 * In CUDA, this just assigns the kernel function pointers. 60 * This also implicitly relies on the fact that reasonable share of the kernels are always used. 61 * If there were more template parameters, even smaller share of all possible kernels would be used. 62 * 63 * \todo In future if we would need to react to either user input or 64 * auto-tuning to compile different kernels, then we might wish to 65 * revisit the number of kernels we pre-compile, and/or the management 66 * of their lifetime. 67 * 68 * This also doesn't manage cuFFT/clFFT kernels, which depend on the PME grid dimensions. 69 * 70 * TODO: pass cl_context to the constructor and not create it inside. 71 * See also Issue #2522. 72 */ 73 struct PmeGpuProgramImpl 74 { 75 /*! \brief 76 * This is a handle to the GPU context, which is just a dummy in CUDA, 77 * but is created/destroyed by this class in OpenCL. 78 */ 79 const DeviceContext& deviceContext_; 80 81 //! Conveniently all the PME kernels use the same single argument type 82 #if GMX_GPU_CUDA 83 using PmeKernelHandle = void (*)(const struct PmeGpuCudaKernelParams); 84 #elif GMX_GPU_OPENCL 85 using PmeKernelHandle = cl_kernel; 86 #else 87 using PmeKernelHandle = void*; 88 #endif 89 90 /*! \brief 91 * Maximum synchronous GPU thread group execution width. 92 * "Warp" is a CUDA term which we end up reusing in OpenCL kernels as well. 93 * For CUDA, this is a static value that comes from gromacs/gpu_utils/cuda_arch_utils.cuh; 94 * for OpenCL, we have to query it dynamically. 95 */ 96 size_t warpSize_; 97 98 //@{ 99 /** 100 * Spread/spline kernels are compiled only for order of 4. 101 * There are multiple versions of each kernel, paramaretized according to 102 * Number of threads per atom. Using either order(4) or order*order (16) threads per atom is 103 * supported If the spline data is written in the spline/spread kernel and loaded in the gather 104 * or recalculated in the gather. 105 * Spreading kernels also have hardcoded X/Y indices wrapping parameters, 106 * as a placeholder for implementing 1/2D decomposition. 107 * The kernels are templated separately for spreading on one grid (one or 108 * two sets of coefficients) or on two grids (required for energy and virial 109 * calculations). 110 */ 111 size_t spreadWorkGroupSize; 112 113 PmeKernelHandle splineKernelSingle; 114 PmeKernelHandle splineKernelThPerAtom4Single; 115 PmeKernelHandle spreadKernelSingle; 116 PmeKernelHandle spreadKernelThPerAtom4Single; 117 PmeKernelHandle splineAndSpreadKernelSingle; 118 PmeKernelHandle splineAndSpreadKernelThPerAtom4Single; 119 PmeKernelHandle splineAndSpreadKernelWriteSplinesSingle; 120 PmeKernelHandle splineAndSpreadKernelWriteSplinesThPerAtom4Single; 121 PmeKernelHandle splineKernelDual; 122 PmeKernelHandle splineKernelThPerAtom4Dual; 123 PmeKernelHandle spreadKernelDual; 124 PmeKernelHandle spreadKernelThPerAtom4Dual; 125 PmeKernelHandle splineAndSpreadKernelDual; 126 PmeKernelHandle splineAndSpreadKernelThPerAtom4Dual; 127 PmeKernelHandle splineAndSpreadKernelWriteSplinesDual; 128 PmeKernelHandle splineAndSpreadKernelWriteSplinesThPerAtom4Dual; 129 //@} 130 131 //@{ 132 /** Same for gather: hardcoded X/Y unwrap parameters, order of 4, plus 133 * it can either reduce with previous forces in the host buffer, or ignore them. 134 * Also similarly to the gather we can use either order(4) or order*order (16) threads per atom 135 * and either recalculate the splines or read the ones written by the spread 136 * The kernels are templated separately for using one or two grids (required for 137 * calculating energies and virial). 138 */ 139 size_t gatherWorkGroupSize; 140 141 PmeKernelHandle gatherKernelSingle; 142 PmeKernelHandle gatherKernelThPerAtom4Single; 143 PmeKernelHandle gatherKernelReadSplinesSingle; 144 PmeKernelHandle gatherKernelReadSplinesThPerAtom4Single; 145 PmeKernelHandle gatherKernelDual; 146 PmeKernelHandle gatherKernelThPerAtom4Dual; 147 PmeKernelHandle gatherKernelReadSplinesDual; 148 PmeKernelHandle gatherKernelReadSplinesThPerAtom4Dual; 149 //@} 150 151 //@{ 152 /** Solve kernel doesn't care about the interpolation order, but can optionally 153 * compute energy and virial, and supports XYZ and YZX grid orderings. 154 * The kernels are templated separately for grids in state A and B. 155 */ 156 size_t solveMaxWorkGroupSize; 157 158 PmeKernelHandle solveYZXKernelA; 159 PmeKernelHandle solveXYZKernelA; 160 PmeKernelHandle solveYZXEnergyKernelA; 161 PmeKernelHandle solveXYZEnergyKernelA; 162 PmeKernelHandle solveYZXKernelB; 163 PmeKernelHandle solveXYZKernelB; 164 PmeKernelHandle solveYZXEnergyKernelB; 165 PmeKernelHandle solveXYZEnergyKernelB; 166 //@} 167 168 PmeGpuProgramImpl() = delete; 169 //! Constructor for the given device 170 explicit PmeGpuProgramImpl(const DeviceContext& deviceContext); 171 ~PmeGpuProgramImpl(); 172 GMX_DISALLOW_COPY_AND_ASSIGN(PmeGpuProgramImpl); 173 174 //! Return the warp size for which the kernels were compiled warpSizePmeGpuProgramImpl175 int warpSize() const { return warpSize_; } 176 177 private: 178 // Compiles kernels, if supported. Called by the constructor. 179 void compileKernels(const DeviceInformation& deviceInfo); 180 }; 181 182 #endif 183