/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *
 * \brief This file contains function declarations necessary for
 * computing energies and forces for the PME long-ranged part (Coulomb
 * and LJ).
 *
 * \author Berk Hess <hess@kth.se>
 * \author Mark Abraham <mark.j.abraham@gmail.com>
 * \ingroup module_ewald
 */

/* TODO This file is a temporary holding area for stuff local to the
 * PME code, before it acquires some more normal ewald/file.c and
 * ewald/file.h structure. In a future clean-up, get rid of this file
 * so that the module builds with a more normal layout. */

#ifndef GMX_EWALD_PME_INTERNAL_H
#define GMX_EWALD_PME_INTERNAL_H

#include "config.h"

#include <memory>
#include <vector>

#include "gromacs/math/gmxcomplex.h"
#include "gromacs/utility/alignedallocator.h"
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/defaultinitializationallocator.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/gmxmpi.h"

#include "spline_vectors.h"

//! A repeat of typedef from parallel_3dfft.h
typedef struct gmx_parallel_3dfft* gmx_parallel_3dfft_t;

struct t_commrec;
struct t_inputrec;
struct PmeGpu;

enum class PmeRunMode;

//@{
//! Grid indices for A state for charge and Lennard-Jones C6
#define PME_GRID_QA 0
#define PME_GRID_C6A 2
//@}
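
/* Illustrative sketch only, not part of the GROMACS API: the grid indices
 * above and the DO_Q* counts defined just below follow one convention, also
 * spelled out later at gmx_pme_t::pmegrid: q = 0, 1 are the Coulomb grids for
 * states A and B, and q >= 2 are LJ-PME grids (2 of them with geometric
 * combination rules, 7 with LB rules, so 2 + 7 = 9 grids in total). The
 * constants and helper in this standalone snippet are hypothetical and exist
 * only to make that mapping explicit:
 *
 * \code
 * #include <cstdio>
 *
 * constexpr int c_numGridsCoulomb  = 2; // cf. DO_Q
 * constexpr int c_numGridsWithLJ   = 4; // cf. DO_Q_AND_LJ
 * constexpr int c_numGridsWithLJLB = 9; // cf. DO_Q_AND_LJ_LB
 *
 * // Hypothetical helper classifying a grid index q
 * const char* gridPurpose(int q)
 * {
 *     if (q < c_numGridsCoulomb)
 *     {
 *         return (q == 0) ? "Coulomb, state A" : "Coulomb, state B";
 *     }
 *     return "Lennard-Jones C6";
 * }
 *
 * int main()
 * {
 *     for (int q = 0; q < c_numGridsWithLJLB; q++)
 *     {
 *         std::printf("grid %d: %s\n", q, gridPurpose(q));
 *     }
 *     return 0;
 * }
 * \endcode
 */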

//@{
/*! \brief Flags that indicate the number of PME grids in use */
#define DO_Q 2           /* Electrostatic grids have index q<2 */
#define DO_Q_AND_LJ 4    /* non-LB LJ grids have index 2 <= q < 4 */
#define DO_Q_AND_LJ_LB 9 /* With LB rules we need a total of 2+7 grids */
//@}

/*! \brief Pascal triangle coefficients scaled with (1/2)^6 for LJ-PME with LB-rules */
static const real lb_scale_factor[] = { 1.0 / 64, 6.0 / 64, 15.0 / 64, 20.0 / 64,
                                        15.0 / 64, 6.0 / 64, 1.0 / 64 };

/*! \brief Pascal triangle coefficients used in solve_pme_lj_yzx, only need to do 4 calculations due to symmetry */
static const real lb_scale_factor_symm[] = { 2.0 / 64, 12.0 / 64, 30.0 / 64, 20.0 / 64 };

/*! \brief We only define a maximum to be able to use local arrays without allocation.
 * An order larger than 12 should never be needed, even for test cases.
 * If needed it can be changed here.
 */
#define PME_ORDER_MAX 12


/* Temporary suppression until these structs become opaque and don't live in
 * a header that is included by other headers. Also, until then I have no
 * idea what some of the names mean. */

//! @cond Doxygen_Suppress

/*! \brief Data structure for grid communication */
struct pme_grid_comm_t
{
    int send_id; //!< Source rank id
    int send_index0;
    int send_nindex;
    int recv_id; //!< Destination rank id
    int recv_index0;
    int recv_nindex;
    int recv_size = 0; //!< Receive buffer width, used with OpenMP
};

/*! \brief Data structure for grid overlap communication in a single dimension */
struct pme_overlap_t
{
    MPI_Comm         mpi_comm;  //!< MPI communicator
    int              nnodes;    //!< Number of ranks
    int              nodeid;    //!< Unique rank identifier
    std::vector<int> s2g0;      //!< The local interpolation grid start
    std::vector<int> s2g1;      //!< The local interpolation grid end
    int              send_size; //!< Send buffer width, used with OpenMP
    std::vector<pme_grid_comm_t> comm_data; //!< All the individual communication data for each rank
    std::vector<real>            sendbuf;   //!< Shared buffer for sending
    std::vector<real>            recvbuf;   //!< Shared buffer for receiving
};

template<typename T>
using AlignedVector = std::vector<T, gmx::AlignedAllocator<T>>;

template<typename T>
using FastVector = std::vector<T, gmx::DefaultInitializationAllocator<T>>;

/*! \brief Data structure for organizing particle allocation to threads */
struct AtomToThreadMap
{
    //! Cumulative counts of the number of particles per thread
    int* n = nullptr;
    //! Storage buffer for n
    std::vector<int> nBuffer;
    //! Particle indices ordered on thread index (n)
    FastVector<int> i;
};

/*! \internal
 * \brief Coefficients for theta or dtheta
 */
class SplineCoefficients
{
public:
    //! Reallocate for use with up to nalloc coefficients
    void realloc(int nalloc);

    //! Pointers to the coefficient buffer for x, y, z
    splinevec coefficients = { nullptr };

private:
    //! Storage for x coefficients
    std::vector<real> bufferX_;
    //! Storage for y coefficients
    std::vector<real> bufferY_;
    //! Storage for z coefficients, aligned for SIMD load
    AlignedVector<real> bufferZ_;
};

/*! \brief Data structure for beta-spline interpolation */
struct splinedata_t
{
    int                n = 0;
    FastVector<int>    ind;
    SplineCoefficients theta;
    SplineCoefficients dtheta;
    int                nalloc = 0;
};
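
/* Illustrative sketch only, not part of the GROMACS API: lb_scale_factor
 * above is row 6 of Pascal's triangle, C(6,k) for k = 0..6, scaled by
 * (1/2)^6 = 1/64, and lb_scale_factor_symm folds the symmetric pairs k and
 * 6-k into doubled entries while keeping the middle term, which is why
 * solve_pme_lj_yzx only needs 4 terms. This standalone snippet, with
 * hypothetical names, reproduces both tables:
 *
 * \code
 * #include <array>
 * #include <cassert>
 *
 * int main()
 * {
 *     // Row 6 of Pascal's triangle: binomial coefficients C(6,k)
 *     const std::array<double, 7> binom = { 1, 6, 15, 20, 15, 6, 1 };
 *
 *     std::array<double, 7> scale; // should match lb_scale_factor
 *     for (int k = 0; k <= 6; k++)
 *     {
 *         scale[k] = binom[k] / 64.0;
 *     }
 *
 *     std::array<double, 4> symm; // should match lb_scale_factor_symm
 *     for (int k = 0; k < 3; k++)
 *     {
 *         symm[k] = scale[k] + scale[6 - k]; // fold the symmetric pairs
 *     }
 *     symm[3] = scale[3]; // the middle term has no partner
 *
 *     assert(symm[0] == 2.0 / 64 && symm[1] == 12.0 / 64
 *            && symm[2] == 30.0 / 64 && symm[3] == 20.0 / 64);
 *     return 0;
 * }
 * \endcode
 */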

/*! \brief PME slab MPI communication setup */
struct SlabCommSetup
{
    //! The node to send x and q to with DD
    int node_dest;
    //! The node to receive x and q from with DD
    int node_src;
    //! Index for commnode into the buffers
    int buf_index;
    //! The number of atoms to receive
    int rcount;
};

/*! \internal
 * \brief Data structure for coordinating transfers between PME ranks along one dimension
 *
 * Also used for passing coordinates, coefficients and forces to and from PME routines.
 */
class PmeAtomComm
{
public:
    //! Constructor, \p PmeMpiCommunicator is the communicator for this dimension
    PmeAtomComm(MPI_Comm PmeMpiCommunicator, int numThreads, int pmeOrder, int dimIndex, bool doSpread);

    //! Set the atom count and, when necessary, resize the atom buffers
    void setNumAtoms(int numAtoms);

    //! Returns the atom count
    int numAtoms() const { return numAtoms_; }

    //! Returns the number of atoms to send to each rank
    gmx::ArrayRef<int> sendCount()
    {
        GMX_ASSERT(!count_thread.empty(), "Need at least one thread_count");
        return count_thread[0];
    }

    //! The index of the dimension, 0=x, 1=y
    int dimind = 0;
    //! The number of slabs and ranks this dimension is decomposed over
    int nslab = 1;
    //! Our MPI rank index
    int nodeid = 0;
    //! Communicator for this dimension
    MPI_Comm mpi_comm;

    //! Communication setup for each slab, only present with nslab > 1
    std::vector<SlabCommSetup> slabCommSetup;
    //! The maximum communication distance counted in MPI ranks
    int maxshift = 0;

    //! The target slab index for each particle
    FastVector<int> pd;
    //! Target particle counts for each slab, for each thread
    std::vector<std::vector<int>> count_thread;

private:
    //! The number of atoms
    int numAtoms_ = 0;

public:
    //! The coordinates
    gmx::ArrayRef<const gmx::RVec> x;
    //! The coefficient, charges or LJ C6
    gmx::ArrayRef<const real> coefficient;
    //! The forces
    gmx::ArrayRef<gmx::RVec> f;
    //! Coordinate buffer, used only with nslab > 1
    FastVector<gmx::RVec> xBuffer;
    //! Coefficient buffer, used only with nslab > 1
    FastVector<real> coefficientBuffer;
    //! Force buffer, used only with nslab > 1
    FastVector<gmx::RVec> fBuffer;
    //! Tells whether these coordinates are used for spreading
    bool bSpread;
    //! The PME order
    int pme_order;
    //! The grid index per atom
    FastVector<gmx::IVec> idx;
    //! Fractional atom coordinates relative to the lower cell boundary
    FastVector<gmx::RVec> fractx;

    //! The number of threads to use in PME
    int nthread;
    //! Thread index for each atom
    FastVector<int> thread_idx;
    std::vector<AtomToThreadMap> threadMap;
    std::vector<splinedata_t>    spline;
};

/*! \brief Data structure for a single PME grid */
struct pmegrid_t
{
    ivec  ci;     /* The spatial location of this grid                */
    ivec  n;      /* The used size of *grid, including order-1        */
    ivec  offset; /* The grid offset from the full node grid          */
    int   order;  /* PME spreading order                              */
    ivec  s;      /* The allocated size of *grid, s >= n              */
    real* grid;   /* The grid for the local thread, size n            */
};
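
/* Illustrative sketch only, not part of the GROMACS API: pmegrid_t keeps both
 * a used size (n) and an allocated size (s >= n), so the allocated size acts
 * as the stride when addressing the grid while loops run only over the used
 * region. The layout assumed here (x slowest, z fastest) and all names in the
 * standalone snippet below are only for illustration:
 *
 * \code
 * #include <vector>
 *
 * // Hypothetical helper: linear index into a padded grid, with 'alloc'
 * // (cf. pmegrid_t::s) as the stride and z varying fastest.
 * static int gridIndex(const int alloc[3], int ix, int iy, int iz)
 * {
 *     return (ix * alloc[1] + iy) * alloc[2] + iz;
 * }
 *
 * int main()
 * {
 *     const int used[3]  = { 4, 4, 5 }; // cf. pmegrid_t::n, includes order-1
 *     const int alloc[3] = { 4, 4, 8 }; // cf. pmegrid_t::s, z padded, s >= n
 *
 *     std::vector<float> grid(alloc[0] * alloc[1] * alloc[2], 0.0F);
 *
 *     // Only the used region is visited; the padding beyond used[2] stays untouched.
 *     for (int ix = 0; ix < used[0]; ix++)
 *     {
 *         for (int iy = 0; iy < used[1]; iy++)
 *         {
 *             for (int iz = 0; iz < used[2]; iz++)
 *             {
 *                 grid[gridIndex(alloc, ix, iy, iz)] += 1.0F;
 *             }
 *         }
 *     }
 *     return 0;
 * }
 * \endcode
 */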

/*! \brief Data structures for PME grids */
struct pmegrids_t
{
    pmegrid_t  grid;         /* The full node grid (non thread-local)            */
    int        nthread;      /* The number of threads operating on this grid     */
    ivec       nc;           /* The local spatial decomposition over the threads */
    pmegrid_t* grid_th;      /* Array of grids for each thread                   */
    real*      grid_all;     /* Allocated array for the grids in *grid_th        */
    int*       g2t[DIM];     /* The grid to thread index                         */
    ivec       nthread_comm; /* The number of threads to communicate with        */
};

/*! \brief Data structure for spline-interpolation working buffers */
struct pme_spline_work;

/*! \brief Data structure for working buffers */
struct pme_solve_work_t;

/*! \brief Master PME data structure */
struct gmx_pme_t
{ //NOLINT(clang-analyzer-optin.performance.Padding)
    int ndecompdim; /* The number of decomposition dimensions */
    int nodeid;     /* Our nodeid in mpi->mpi_comm */
    int nodeid_major;
    int nodeid_minor;
    int nnodes; /* The number of nodes doing PME */
    int nnodes_major;
    int nnodes_minor;

    MPI_Comm mpi_comm;
    MPI_Comm mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
#if GMX_MPI
    MPI_Datatype rvec_mpi; /* the pme vector's MPI type */
#endif

    gmx_bool bUseThreads; /* Does any of the PME ranks have nthread>1 ? */
    int      nthread;     /* The number of threads doing PME on our rank */

    gmx_bool bPPnode;   /* Node also does particle-particle forces */
    bool     doCoulomb; /* Apply PME to electrostatics */
    bool     doLJ;      /* Apply PME to Lennard-Jones r^-6 interactions */
    gmx_bool bFEP;      /* Compute Free energy contribution */
    gmx_bool bFEP_q;
    gmx_bool bFEP_lj;
    int      nkx, nky, nkz; /* Grid dimensions */
    gmx_bool bP3M;          /* Do P3M: optimize the influence function */
    int      pme_order;
    real     ewaldcoeff_q;  /* Ewald splitting coefficient for Coulomb */
    real     ewaldcoeff_lj; /* Ewald splitting coefficient for r^-6 */
    real     epsilon_r;


    enum PmeRunMode runMode; /* Which codepath is the PME runner taking - CPU, GPU, mixed;
                              * TODO: this is the information that should be owned by the task
                              * scheduler, and ideally not be duplicated here.
                              */

    PmeGpu* gpu; /* A pointer to the GPU data.
                  * TODO: this should be unique or a shared pointer.
                  * Currently in practice there is a single gmx_pme_t instance while the code
                  * is partially set up for many of them. The PME tuning calls gmx_pme_reinit()
                  * which fully reinitializes the one and only PME structure anew while maybe
                  * keeping the old grid buffers if they were already large enough.
                  * This small choice should be made clear in the later refactoring -
                  * do we store many PME objects for different grid sizes,
                  * or a single PME object that handles different grid sizes gracefully.
                  */


    class EwaldBoxZScaler* boxScaler; /**< The scaling data Ewald uses with walls (set at pme_init, constant for the entire run) */


    int ljpme_combination_rule; /* Type of combination rule in LJ-PME */

    int ngrids; /* number of grids we maintain for pmegrid, (c)fftgrid and pfft_setups */

    pmegrids_t pmegrid[DO_Q_AND_LJ_LB]; /* Grids on which we do spreading/interpolation,
                                         * includes overlap. Grid indices are ordered as
                                         * follows:
                                         * 0: Coulomb PME, state A
                                         * 1: Coulomb PME, state B
                                         * 2-8: LJ-PME
                                         * This can probably be done in a better way
                                         * but this simple hack works for now
                                         */

    /* The PME coefficient spreading grid sizes/strides, includes pme_order-1 */
    int pmegrid_nx, pmegrid_ny, pmegrid_nz;
    /* pmegrid_nz might be larger than strictly necessary to ensure
     * memory alignment, pmegrid_nz_base gives the real base size.
     */
    int pmegrid_nz_base;
    /* The local PME grid starting indices */
    int pmegrid_start_ix, pmegrid_start_iy, pmegrid_start_iz;

    /* Work data for spreading and gathering */
    pme_spline_work* spline_work;

    real** fftgrid; /* Grids for FFT. With 1D FFT decomposition this can be a pointer
                     * inside the interpolation grid, but separate for 2D PME decomp. */
    int fftgrid_nx, fftgrid_ny, fftgrid_nz;

    t_complex** cfftgrid; /* Grids for complex FFT data */

    int cfftgrid_nx, cfftgrid_ny, cfftgrid_nz;

    gmx_parallel_3dfft_t* pfft_setup;

    int * nnx, *nny, *nnz;
    real *fshx, *fshy, *fshz;

    std::vector<PmeAtomComm> atc; /* Indexed on decomposition index */
    matrix                   recipbox;
    real                     boxVolume;
    splinevec                bsp_mod;
    /* Buffers to store data for local atoms for L-B combination rule
     * calculations in LJ-PME. lb_buf1 stores either the coefficients
     * for spreading/gathering (in serial), or the C6 coefficient for
     * local atoms (in parallel). lb_buf2 is only used in parallel,
     * and stores the sigma values for local atoms. */
    FastVector<real> lb_buf1;
    FastVector<real> lb_buf2;

    pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */

    /* Atom step for energy only calculation in gmx_pme_calc_energy() */
    std::unique_ptr<PmeAtomComm> atc_energy;

    /* Communication buffers */
    rvec* bufv;       /* Communication buffer */
    real* bufr;       /* Communication buffer */
    int   buf_nalloc; /* The communication buffer size */

    /* thread local work data for solve_pme */
    struct pme_solve_work_t* solve_work;

    /* Work data for sum_qgrid */
    real* sum_qgrid_tmp;
    real* sum_qgrid_dd_tmp;
};

//! @endcond

#endif