/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
 * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/*! \internal \file
 *
 * \brief
 * Implements the Grid class.
 *
 * \author Berk Hess <hess@kth.se>
 * \ingroup module_nbnxm
 */

#include "gmxpre.h"

#include "grid.h"

#include <cmath>
#include <cstring>

#include <algorithm>

#include "gromacs/math/utilities.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdlib/updategroupscog.h"
#include "gromacs/mdtypes/forcerec.h" // only for GET_CGINFO_*
#include "gromacs/nbnxm/atomdata.h"
#include "gromacs/simd/simd.h"
#include "gromacs/simd/vector_operations.h"

#include "boundingboxes.h"
#include "gridsetdata.h"
#include "nbnxm_geometry.h"
#include "pairlistparams.h"

namespace Nbnxm
{

Grid::Geometry::Geometry(const PairlistType pairlistType) :
    isSimple(pairlistType != PairlistType::HierarchicalNxN),
    numAtomsICluster(IClusterSizePerListType[pairlistType]),
    numAtomsJCluster(JClusterSizePerListType[pairlistType]),
    numAtomsPerCell((isSimple ? 1 : c_gpuNumClusterPerCell) * numAtomsICluster),
    numAtomsICluster2Log(get_2log(numAtomsICluster))
{
}
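
/* Illustrative sketch with hypothetical values (not part of the implementation):
 * for a simple CPU pairlist with 4-atom clusters this evaluates to
 *   isSimple = true, numAtomsICluster = 4, numAtomsJCluster = 4,
 *   numAtomsPerCell = 4, numAtomsICluster2Log = 2,
 * while for PairlistType::HierarchicalNxN each cell holds
 * c_gpuNumClusterPerCell clusters, so numAtomsPerCell becomes
 * c_gpuNumClusterPerCell * numAtomsICluster.
 */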

Grid::Grid(const PairlistType pairlistType, const bool& haveFep) :
    geometry_(pairlistType),
    haveFep_(haveFep)
{
}

/*! \brief Returns the atom density (> 0) of a rectangular grid */
static real gridAtomDensity(int numAtoms, const rvec lowerCorner, const rvec upperCorner)
{
    rvec size;

    if (numAtoms == 0)
    {
        /* To avoid zero density we use a minimum of 1 atom */
        numAtoms = 1;
    }

    rvec_sub(upperCorner, lowerCorner, size);

    return static_cast<real>(numAtoms) / (size[XX] * size[YY] * size[ZZ]);
}
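
/* Usage sketch with hypothetical numbers: a 3 x 3 x 3 nm box containing
 * 2700 atoms has roughly the atom density of liquid water:
 *   rvec lower = { 0.0, 0.0, 0.0 };
 *   rvec upper = { 3.0, 3.0, 3.0 };
 *   real rho   = gridAtomDensity(2700, lower, upper); // = 100 atoms/nm^3
 */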

void Grid::setDimensions(const int          ddZone,
                         const int          numAtoms,
                         gmx::RVec          lowerCorner,
                         gmx::RVec          upperCorner,
                         real               atomDensity,
                         const real         maxAtomGroupRadius,
                         const bool         haveFep,
                         gmx::PinningPolicy pinningPolicy)
{
    /* We allow passing lowerCorner=upperCorner, in which case we need to
     * create a finite sized bounding box to avoid division by zero.
     * We use a minimum size such that the volume fits in float with some
     * margin for computing and using the atom number density.
     */
    constexpr real c_minimumGridSize = 1e-10;
    for (int d = 0; d < DIM; d++)
    {
        GMX_ASSERT(upperCorner[d] >= lowerCorner[d],
                   "The upper corner should be at least as large as the lower corner");
        if (upperCorner[d] - lowerCorner[d] < c_minimumGridSize)
        {
            /* Ensure we apply a correction to the bounding box */
            real correction =
                    std::max(std::abs(lowerCorner[d]) * GMX_REAL_EPS, 0.5_real * c_minimumGridSize);
            lowerCorner[d] -= correction;
            upperCorner[d] += correction;
        }
    }

    /* For the home zone we compute the density when not set (=-1) or when =0 */
    if (ddZone == 0 && atomDensity <= 0)
    {
        atomDensity = gridAtomDensity(numAtoms, lowerCorner, upperCorner);
    }

    dimensions_.atomDensity        = atomDensity;
    dimensions_.maxAtomGroupRadius = maxAtomGroupRadius;

    rvec size;
    rvec_sub(upperCorner, lowerCorner, size);

    if (numAtoms > geometry_.numAtomsPerCell)
    {
        GMX_ASSERT(atomDensity > 0, "With one or more atoms, the density should be positive");

        /* target cell length */
        real tlen_x;
        real tlen_y;
        if (geometry_.isSimple)
        {
            /* To minimize the number of zero interactions, we should make
             * the largest of the i/j cells cubic.
             */
            int numAtomsInCell = std::max(geometry_.numAtomsICluster, geometry_.numAtomsJCluster);

            /* Approximately cubic cells */
            real tlen = std::cbrt(numAtomsInCell / atomDensity);
            tlen_x    = tlen;
            tlen_y    = tlen;
        }
        else
        {
            /* Approximately cubic sub cells */
            real tlen = std::cbrt(geometry_.numAtomsICluster / atomDensity);
            tlen_x    = tlen * c_gpuNumClusterPerCellX;
            tlen_y    = tlen * c_gpuNumClusterPerCellY;
        }
        /* We round ncx and ncy down, because we get fewer cell pairs
         * in the pairlist when the fixed cell dimensions (x,y) are
         * larger than the variable one (z) than the other way around.
         */
        dimensions_.numCells[XX] = std::max(1, static_cast<int>(size[XX] / tlen_x));
        dimensions_.numCells[YY] = std::max(1, static_cast<int>(size[YY] / tlen_y));
    }
    else
    {
        dimensions_.numCells[XX] = 1;
        dimensions_.numCells[YY] = 1;
    }

    for (int d = 0; d < DIM - 1; d++)
    {
        dimensions_.cellSize[d]    = size[d] / dimensions_.numCells[d];
        dimensions_.invCellSize[d] = 1 / dimensions_.cellSize[d];
    }

    if (ddZone > 0)
    {
        /* This is a non-home zone, add an extra row of cells
         * for particles communicated for bonded interactions.
         * These can be beyond the cut-off. It doesn't matter where
         * they end up on the grid, but for performance it's better
         * if they don't end up in cells that can be within cut-off range.
         */
        dimensions_.numCells[XX]++;
        dimensions_.numCells[YY]++;
    }

    /* We need one additional cell entry for particles moved by DD */
    cxy_na_.resize(numColumns() + 1);
    cxy_ind_.resize(numColumns() + 2);
    changePinningPolicy(&cxy_na_, pinningPolicy);
    changePinningPolicy(&cxy_ind_, pinningPolicy);

    /* Worst case scenario of 1 atom in each last cell */
    int maxNumCells;
    if (geometry_.numAtomsJCluster <= geometry_.numAtomsICluster)
    {
        maxNumCells = numAtoms / geometry_.numAtomsPerCell + numColumns();
    }
    else
    {
        maxNumCells = numAtoms / geometry_.numAtomsPerCell
                      + numColumns() * geometry_.numAtomsJCluster / geometry_.numAtomsICluster;
    }

    if (!geometry_.isSimple)
    {
        numClusters_.resize(maxNumCells);
    }
    bbcz_.resize(maxNumCells);

    /* This resize also zeroes the contents, which avoids possible
     * floating-point exceptions in SIMD with the unused bb elements.
     */
    if (geometry_.isSimple)
    {
        bb_.resize(maxNumCells);
    }
    else
    {
#if NBNXN_BBXXXX
        pbb_.resize(packedBoundingBoxesIndex(maxNumCells * c_gpuNumClusterPerCell));
#else
        bb_.resize(maxNumCells * c_gpuNumClusterPerCell);
#endif
    }

    if (geometry_.numAtomsJCluster == geometry_.numAtomsICluster)
    {
        bbj_ = bb_;
    }
    else
    {
        GMX_ASSERT(geometry_.isSimple, "Only CPU lists should have different i/j cluster sizes");

        bbjStorage_.resize(maxNumCells * geometry_.numAtomsICluster / geometry_.numAtomsJCluster);
        bbj_ = bbjStorage_;
    }

    flags_.resize(maxNumCells);
    if (haveFep)
    {
        fep_.resize(maxNumCells * geometry_.numAtomsPerCell / geometry_.numAtomsICluster);
    }

    copy_rvec(lowerCorner, dimensions_.lowerCorner);
    copy_rvec(upperCorner, dimensions_.upperCorner);
    copy_rvec(size, dimensions_.gridSize);
}
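
/* Worked example with hypothetical numbers: for a simple grid with
 * numAtomsInCell = 4 and atomDensity = 100 atoms/nm^3, the target cell
 * length is cbrt(4/100) ~= 0.34 nm, so a 3 nm box edge gives
 * (int)(3.0/0.34) = 8 cells along x and y; the cell size in z is left
 * variable and follows from the number of atoms that land in each column.
 */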

/* We need to sort particles in grid columns on z-coordinate.
 * As particles are very often distributed homogeneously, we use a sorting
 * algorithm similar to pigeonhole sort. We multiply the z-coordinate
 * by a factor, cast to an int and try to store in that hole. If the hole
 * is full, we move this or another particle. A second pass is needed to make
 * contiguous elements. c_sortGridRatio is the ratio of holes to particles.
 * 4 is the optimal value for a homogeneous particle distribution and allows
 * for an O(#particles) sort up to distributions where all particles are
 * concentrated in 1/4 of the space. No NlogN fallback is implemented,
 * as it can be expensive to detect inhomogeneous particle distributions.
 */
/*! \brief Ratio of grid cells to atoms */
static constexpr int c_sortGridRatio = 4;
/*! \brief Maximum ratio of holes used; in the worst case all particles
 * end up in the last hole and we need num. atoms extra holes at the end.
 */
static constexpr int c_sortGridMaxSizeFactor = c_sortGridRatio + 1;
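
/* Worked example with hypothetical numbers: with n = n_per_h = 8 particles
 * in a range of height 1/invh, there are n_per_h * c_sortGridRatio = 32
 * holes plus n = 8 overflow slots, i.e. nsort = 40. A particle sitting 30%
 * up the range maps to hole zi = (int)(0.30 * 32) = 9; if hole 9 is taken,
 * the insertion below shifts entries up until a free hole is found.
 */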

/*! \brief Sorts particle index a on coordinates x along dim.
 *
 * Backwards tells if we want decreasing instead of increasing coordinates.
 * h0 is the minimum of the coordinate range.
 * invh is the 1/length of the sorting range.
 * n_per_h (>=n) is the expected average number of particles per 1/invh.
 * sort is the sorting work array.
 * sort should have a size of at least n_per_h*c_sortGridRatio + n,
 * or, easier, allocate at least n*c_sortGridMaxSizeFactor elements.
 */
static void sort_atoms(int      dim,
                       gmx_bool Backwards,
                       int gmx_unused dd_zone,
                       bool gmx_unused relevantAtomsAreWithinGridBounds,
                       int*                           a,
                       int                            n,
                       gmx::ArrayRef<const gmx::RVec> x,
                       real                           h0,
                       real                           invh,
                       int                            n_per_h,
                       gmx::ArrayRef<int>             sort)
{
    if (n <= 1)
    {
        /* Nothing to do */
        return;
    }

    GMX_ASSERT(n <= n_per_h, "We require n <= n_per_h");

    /* Transform the inverse range height into the inverse hole height */
    invh *= n_per_h * c_sortGridRatio;

    /* Set nsort to the maximum possible number of holes used.
     * In the worst case all n elements end up in the last bin.
     */
    int nsort = n_per_h * c_sortGridRatio + n;

    /* Determine the index range used, so we can limit it for the second pass */
    int zi_min = INT_MAX;
    int zi_max = -1;

    /* Sort the particles using a simple index sort */
    for (int i = 0; i < n; i++)
    {
        /* The cast takes care of floating-point rounding effects below zero.
         * This code assumes particles are less than 1/c_sortGridRatio
         * times the box height out of the box.
         */
        int zi = static_cast<int>((x[a[i]][dim] - h0) * invh);

#ifndef NDEBUG
        /* As we can have rounding effects, we use > instead of >= here */
        if (relevantAtomsAreWithinGridBounds && (zi < 0 || (dd_zone == 0 && zi > n_per_h * c_sortGridRatio)))
        {
            gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n", a[i],
                      'x' + dim, x[a[i]][dim], h0, invh, zi, n_per_h, c_sortGridRatio);
        }
#endif
        if (zi < 0)
        {
            zi = 0;
        }

        /* In a non-local domain, particles communicated for bonded interactions
         * can be far beyond the grid size, which is set by the non-bonded
         * cut-off distance. We sort such particles into the last cell.
         */
        if (zi > n_per_h * c_sortGridRatio)
        {
            zi = n_per_h * c_sortGridRatio;
        }

        /* Ideally this particle should go in sort cell zi,
         * but that might already be in use;
         * in that case find the first empty cell higher up.
         */
        if (sort[zi] < 0)
        {
            sort[zi] = a[i];
            zi_min   = std::min(zi_min, zi);
            zi_max   = std::max(zi_max, zi);
        }
        else
        {
            /* We have multiple atoms in the same sorting slot.
             * Sort on real z for minimal bounding box size.
             * There is an extra check for identical z to ensure
             * a well-defined output order, independent of input order,
             * which gives binary reproducibility after restarts.
             */
            while (sort[zi] >= 0
                   && (x[a[i]][dim] > x[sort[zi]][dim]
                       || (x[a[i]][dim] == x[sort[zi]][dim] && a[i] > sort[zi])))
            {
                zi++;
            }

            if (sort[zi] >= 0)
            {
                /* Shift all elements by one slot until we find an empty slot */
                int cp  = sort[zi];
                int zim = zi + 1;
                while (sort[zim] >= 0)
                {
                    int tmp   = sort[zim];
                    sort[zim] = cp;
                    cp        = tmp;
                    zim++;
                }
                sort[zim] = cp;
                zi_max    = std::max(zi_max, zim);
            }
            sort[zi] = a[i];
            zi_max   = std::max(zi_max, zi);
        }
    }

    int c = 0;
    if (!Backwards)
    {
        for (int zi = 0; zi < nsort; zi++)
        {
            if (sort[zi] >= 0)
            {
                a[c++]   = sort[zi];
                sort[zi] = -1;
            }
        }
    }
    else
    {
        for (int zi = zi_max; zi >= zi_min; zi--)
        {
            if (sort[zi] >= 0)
            {
                a[c++]   = sort[zi];
                sort[zi] = -1;
            }
        }
    }
    if (c < n)
    {
        gmx_incons("Lost particles while sorting");
    }
}

#if GMX_DOUBLE
//! Returns double up to one least significant float bit smaller than x
static double R2F_D(const float x)
{
    return static_cast<float>(x >= 0 ? (1 - GMX_FLOAT_EPS) * x : (1 + GMX_FLOAT_EPS) * x);
}
//! Returns double up to one least significant float bit larger than x
static double R2F_U(const float x)
{
    return static_cast<float>(x >= 0 ? (1 + GMX_FLOAT_EPS) * x : (1 - GMX_FLOAT_EPS) * x);
}
#else
//! Returns x
static float R2F_D(const float x)
{
    return x;
}
//! Returns x
static float R2F_U(const float x)
{
    return x;
}
#endif
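
/* Numeric sketch of why the scaling is needed (a reading of the code above,
 * not an authoritative statement): in a double-precision build the real
 * coordinate is first rounded to float by the argument conversion, which can
 * round it up past the true value; multiplying by (1 - GMX_FLOAT_EPS) then
 * moves the lower bound down by roughly one unit in the last place, so the
 * float bounds satisfy R2F_D(x) <= x <= R2F_U(x).
 */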

//! Computes the bounding box for na coordinates in order x,y,z, bb order xyz0
static void calc_bounding_box(int na, int stride, const real* x, BoundingBox* bb)
{
    int  i;
    real xl, xh, yl, yh, zl, zh;

    i  = 0;
    xl = x[i + XX];
    xh = x[i + XX];
    yl = x[i + YY];
    yh = x[i + YY];
    zl = x[i + ZZ];
    zh = x[i + ZZ];
    i += stride;
    for (int j = 1; j < na; j++)
    {
        xl = std::min(xl, x[i + XX]);
        xh = std::max(xh, x[i + XX]);
        yl = std::min(yl, x[i + YY]);
        yh = std::max(yh, x[i + YY]);
        zl = std::min(zl, x[i + ZZ]);
        zh = std::max(zh, x[i + ZZ]);
        i += stride;
    }
    /* Note: possible double to float conversion here */
    bb->lower.x = R2F_D(xl);
    bb->lower.y = R2F_D(yl);
    bb->lower.z = R2F_D(zl);
    bb->upper.x = R2F_U(xh);
    bb->upper.y = R2F_U(yh);
    bb->upper.z = R2F_U(zh);
}

/*! \brief Computes the bounding box for na packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x4(int na, const real* x, BoundingBox* bb)
{
    real xl, xh, yl, yh, zl, zh;

    xl = x[XX * c_packX4];
    xh = x[XX * c_packX4];
    yl = x[YY * c_packX4];
    yh = x[YY * c_packX4];
    zl = x[ZZ * c_packX4];
    zh = x[ZZ * c_packX4];
    for (int j = 1; j < na; j++)
    {
        xl = std::min(xl, x[j + XX * c_packX4]);
        xh = std::max(xh, x[j + XX * c_packX4]);
        yl = std::min(yl, x[j + YY * c_packX4]);
        yh = std::max(yh, x[j + YY * c_packX4]);
        zl = std::min(zl, x[j + ZZ * c_packX4]);
        zh = std::max(zh, x[j + ZZ * c_packX4]);
    }
    /* Note: possible double to float conversion here */
    bb->lower.x = R2F_D(xl);
    bb->lower.y = R2F_D(yl);
    bb->lower.z = R2F_D(zl);
    bb->upper.x = R2F_U(xh);
    bb->upper.y = R2F_U(yh);
    bb->upper.z = R2F_U(zh);
}

/*! \brief Computes the bounding box for na coordinates, bb order xyz0 */
static void calc_bounding_box_x_x8(int na, const real* x, BoundingBox* bb)
{
    real xl, xh, yl, yh, zl, zh;

    xl = x[XX * c_packX8];
    xh = x[XX * c_packX8];
    yl = x[YY * c_packX8];
    yh = x[YY * c_packX8];
    zl = x[ZZ * c_packX8];
    zh = x[ZZ * c_packX8];
    for (int j = 1; j < na; j++)
    {
        xl = std::min(xl, x[j + XX * c_packX8]);
        xh = std::max(xh, x[j + XX * c_packX8]);
        yl = std::min(yl, x[j + YY * c_packX8]);
        yh = std::max(yh, x[j + YY * c_packX8]);
        zl = std::min(zl, x[j + ZZ * c_packX8]);
        zh = std::max(zh, x[j + ZZ * c_packX8]);
    }
    /* Note: possible double to float conversion here */
    bb->lower.x = R2F_D(xl);
    bb->lower.y = R2F_D(yl);
    bb->lower.z = R2F_D(zl);
    bb->upper.x = R2F_U(xh);
    bb->upper.y = R2F_U(yh);
    bb->upper.z = R2F_U(zh);
}

/*! \brief Computes the bounding box for na packed coordinates, bb order xyz0 */
gmx_unused static void calc_bounding_box_x_x4_halves(int na, const real* x, BoundingBox* bb, BoundingBox* bbj)
{
    // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
    using namespace gmx;

    calc_bounding_box_x_x4(std::min(na, 2), x, bbj);

    if (na > 2)
    {
        calc_bounding_box_x_x4(std::min(na - 2, 2), x + (c_packX4 >> 1), bbj + 1);
    }
    else
    {
        /* Set the "empty" bounding box to the same as the first one,
         * so we don't need to treat special cases in the rest of the code.
         */
#if NBNXN_SEARCH_BB_SIMD4
        store4(bbj[1].lower.ptr(), load4(bbj[0].lower.ptr()));
        store4(bbj[1].upper.ptr(), load4(bbj[0].upper.ptr()));
#else
        bbj[1] = bbj[0];
#endif
    }

#if NBNXN_SEARCH_BB_SIMD4
    store4(bb->lower.ptr(), min(load4(bbj[0].lower.ptr()), load4(bbj[1].lower.ptr())));
    store4(bb->upper.ptr(), max(load4(bbj[0].upper.ptr()), load4(bbj[1].upper.ptr())));
#else
    {
        bb->lower = BoundingBox::Corner::min(bbj[0].lower, bbj[1].lower);
        bb->upper = BoundingBox::Corner::max(bbj[0].upper, bbj[1].upper);
    }
#endif
}

#if NBNXN_BBXXXX

/*! \brief Computes the bounding box for na coordinates in order xyz, bb order xxxxyyyyzzzz */
static void calc_bounding_box_xxxx(int na, int stride, const real* x, float* bb)
{
    int  i;
    real xl, xh, yl, yh, zl, zh;

    i  = 0;
    xl = x[i + XX];
    xh = x[i + XX];
    yl = x[i + YY];
    yh = x[i + YY];
    zl = x[i + ZZ];
    zh = x[i + ZZ];
    i += stride;
    for (int j = 1; j < na; j++)
    {
        xl = std::min(xl, x[i + XX]);
        xh = std::max(xh, x[i + XX]);
        yl = std::min(yl, x[i + YY]);
        yh = std::max(yh, x[i + YY]);
        zl = std::min(zl, x[i + ZZ]);
        zh = std::max(zh, x[i + ZZ]);
        i += stride;
    }
    /* Note: possible double to float conversion here */
    bb[0 * c_packedBoundingBoxesDimSize] = R2F_D(xl);
    bb[1 * c_packedBoundingBoxesDimSize] = R2F_D(yl);
    bb[2 * c_packedBoundingBoxesDimSize] = R2F_D(zl);
    bb[3 * c_packedBoundingBoxesDimSize] = R2F_U(xh);
    bb[4 * c_packedBoundingBoxesDimSize] = R2F_U(yh);
    bb[5 * c_packedBoundingBoxesDimSize] = R2F_U(zh);
}

#endif /* NBNXN_BBXXXX */
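
/* Layout sketch for the packed format (assuming c_packedBoundingBoxesDimSize
 * SIMD lanes, typically 4): the six coordinates of consecutive bounding
 * boxes are interleaved as
 *   xl xl xl xl | yl yl yl yl | zl zl zl zl | xh xh xh xh | yh ... | zh ...
 * so one SIMD load fetches the same coordinate for four clusters at once.
 */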

#if NBNXN_SEARCH_SIMD4_FLOAT_X_BB

/*! \brief Computes the bounding box for na coordinates in order xyz?, bb order xyz0 */
static void calc_bounding_box_simd4(int na, const float* x, BoundingBox* bb)
{
    // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
    using namespace gmx;

    static_assert(sizeof(BoundingBox::Corner) == GMX_SIMD4_WIDTH * sizeof(float),
                  "The Corner struct should hold exactly 4 floats");

    Simd4Float bb_0_S = load4(x);
    Simd4Float bb_1_S = bb_0_S;

    for (int i = 1; i < na; i++)
    {
        Simd4Float x_S = load4(x + i * GMX_SIMD4_WIDTH);
        bb_0_S         = min(bb_0_S, x_S);
        bb_1_S         = max(bb_1_S, x_S);
    }

    store4(bb->lower.ptr(), bb_0_S);
    store4(bb->upper.ptr(), bb_1_S);
}

#    if NBNXN_BBXXXX

/*! \brief Computes the bounding box for na coordinates in order xyz?, bb order xxxxyyyyzzzz */
static void calc_bounding_box_xxxx_simd4(int na, const float* x, BoundingBox* bb_work_aligned, real* bb)
{
    calc_bounding_box_simd4(na, x, bb_work_aligned);

    bb[0 * c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.x;
    bb[1 * c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.y;
    bb[2 * c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.z;
    bb[3 * c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.x;
    bb[4 * c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.y;
    bb[5 * c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.z;
}

#    endif /* NBNXN_BBXXXX */

#endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */


/*! \brief Combines pairs of consecutive bounding boxes */
static void combine_bounding_box_pairs(const Grid&                      grid,
                                       gmx::ArrayRef<const BoundingBox> bb,
                                       gmx::ArrayRef<BoundingBox>       bbj)
{
    // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
    using namespace gmx;

    for (int i = 0; i < grid.numColumns(); i++)
    {
        /* Starting bb in a column is expected to be 2-aligned */
        const int sc2 = grid.firstCellInColumn(i) >> 1;
        /* For odd numbers skip the last bb here */
        const int nc2 = (grid.numAtomsInColumn(i) + 3) >> (2 + 1);
        int       c2;
        for (c2 = sc2; c2 < sc2 + nc2; c2++)
        {
#if NBNXN_SEARCH_BB_SIMD4
            Simd4Float min_S, max_S;

            min_S = min(load4(bb[c2 * 2 + 0].lower.ptr()), load4(bb[c2 * 2 + 1].lower.ptr()));
            max_S = max(load4(bb[c2 * 2 + 0].upper.ptr()), load4(bb[c2 * 2 + 1].upper.ptr()));
            store4(bbj[c2].lower.ptr(), min_S);
            store4(bbj[c2].upper.ptr(), max_S);
#else
            bbj[c2].lower = BoundingBox::Corner::min(bb[c2 * 2 + 0].lower, bb[c2 * 2 + 1].lower);
            bbj[c2].upper = BoundingBox::Corner::max(bb[c2 * 2 + 0].upper, bb[c2 * 2 + 1].upper);
#endif
        }
        if (((grid.numAtomsInColumn(i) + 3) >> 2) & 1)
        {
            /* The bb count in this column is odd: duplicate the last bb */
            bbj[c2].lower = bb[c2 * 2].lower;
            bbj[c2].upper = bb[c2 * 2].upper;
        }
    }
}


/*! \brief Prints the average bb size, used for debug output */
static void print_bbsizes_simple(FILE* fp, const Grid& grid)
{
    dvec ba = { 0 };
    for (int c = 0; c < grid.numCells(); c++)
    {
        const BoundingBox& bb = grid.iBoundingBoxes()[c];
        ba[XX] += bb.upper.x - bb.lower.x;
        ba[YY] += bb.upper.y - bb.lower.y;
        ba[ZZ] += bb.upper.z - bb.lower.z;
    }
    dsvmul(1.0 / grid.numCells(), ba, ba);

    const Grid::Dimensions& dims = grid.dimensions();
    real                    avgCellSizeZ =
            (dims.atomDensity > 0 ? grid.geometry().numAtomsICluster
                                            / (dims.atomDensity * dims.cellSize[XX] * dims.cellSize[YY])
                                  : 0.0);

    fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
            dims.cellSize[XX], dims.cellSize[YY], avgCellSizeZ, ba[XX], ba[YY], ba[ZZ],
            ba[XX] * dims.invCellSize[XX], ba[YY] * dims.invCellSize[YY],
            dims.atomDensity > 0 ? ba[ZZ] / avgCellSizeZ : 0.0);
}

/*! \brief Prints the average bb size, used for debug output */
static void print_bbsizes_supersub(FILE* fp, const Grid& grid)
{
    int  ns;
    dvec ba;

    clear_dvec(ba);
    ns = 0;
    for (int c = 0; c < grid.numCells(); c++)
    {
#if NBNXN_BBXXXX
        for (int s = 0; s < grid.numClustersPerCell()[c]; s += c_packedBoundingBoxesDimSize)
        {
            int  cs_w          = (c * c_gpuNumClusterPerCell + s) / c_packedBoundingBoxesDimSize;
            auto boundingBoxes = grid.packedBoundingBoxes().subArray(
                    cs_w * c_packedBoundingBoxesSize, c_packedBoundingBoxesSize);
            for (int i = 0; i < c_packedBoundingBoxesDimSize; i++)
            {
                for (int d = 0; d < DIM; d++)
                {
                    ba[d] += boundingBoxes[(DIM + d) * c_packedBoundingBoxesDimSize + i]
                             - boundingBoxes[(0 + d) * c_packedBoundingBoxesDimSize + i];
                }
            }
        }
#else
        for (int s = 0; s < grid.numClustersPerCell()[c]; s++)
        {
            const BoundingBox& bb = grid.iBoundingBoxes()[c * c_gpuNumClusterPerCell + s];
            ba[XX] += bb.upper.x - bb.lower.x;
            ba[YY] += bb.upper.y - bb.lower.y;
            ba[ZZ] += bb.upper.z - bb.lower.z;
        }
#endif
        ns += grid.numClustersPerCell()[c];
    }
    dsvmul(1.0 / ns, ba, ba);

    const Grid::Dimensions& dims = grid.dimensions();
    const real              avgClusterSizeZ =
            (dims.atomDensity > 0 ? grid.geometry().numAtomsPerCell
                                            / (dims.atomDensity * dims.cellSize[XX]
                                               * dims.cellSize[YY] * c_gpuNumClusterPerCellZ)
                                  : 0.0);

    fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
            dims.cellSize[XX] / c_gpuNumClusterPerCellX,
            dims.cellSize[YY] / c_gpuNumClusterPerCellY, avgClusterSizeZ, ba[XX], ba[YY], ba[ZZ],
            ba[XX] * c_gpuNumClusterPerCellX * dims.invCellSize[XX],
            ba[YY] * c_gpuNumClusterPerCellY * dims.invCellSize[YY],
            dims.atomDensity > 0 ? ba[ZZ] / avgClusterSizeZ : 0.0);
}

/*! \brief Set non-bonded interaction flags for the current cluster.
 *
 * Sorts atoms on LJ coefficients: !=0 first, ==0 at the end.
 */
static void sort_cluster_on_flag(int                numAtomsInCluster,
                                 int                atomStart,
                                 int                atomEnd,
                                 const int*         atinfo,
                                 gmx::ArrayRef<int> order,
                                 int*               flags)
{
    constexpr int c_maxNumAtomsInCluster = 8;
    int           sort1[c_maxNumAtomsInCluster];
    int           sort2[c_maxNumAtomsInCluster];

    GMX_ASSERT(numAtomsInCluster <= c_maxNumAtomsInCluster,
               "Need to increase c_maxNumAtomsInCluster to support larger clusters");

    *flags = 0;

    int subc = 0;
    for (int s = atomStart; s < atomEnd; s += numAtomsInCluster)
    {
        /* Make lists for this (sub-)cell on atoms with and without LJ */
        int      n1       = 0;
        int      n2       = 0;
        gmx_bool haveQ    = FALSE;
        int      a_lj_max = -1;
        for (int a = s; a < std::min(s + numAtomsInCluster, atomEnd); a++)
        {
            haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);

            if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
            {
                sort1[n1++] = order[a];
                a_lj_max    = a;
            }
            else
            {
                sort2[n2++] = order[a];
            }
        }

        /* If we don't have atoms with LJ, there's nothing to sort */
        if (n1 > 0)
        {
            *flags |= NBNXN_CI_DO_LJ(subc);

            if (2 * n1 <= numAtomsInCluster)
            {
                /* Only sort when strictly necessary. Ordering particles
                 * can lead to less accurate summation due to rounding,
                 * both for LJ and Coulomb interactions.
                 */
                if (2 * (a_lj_max - s) >= numAtomsInCluster)
                {
                    for (int i = 0; i < n1; i++)
                    {
                        order[atomStart + i] = sort1[i];
                    }
                    for (int j = 0; j < n2; j++)
                    {
                        order[atomStart + n1 + j] = sort2[j];
                    }
                }

                *flags |= NBNXN_CI_HALF_LJ(subc);
            }
        }
        if (haveQ)
        {
            *flags |= NBNXN_CI_DO_COUL(subc);
        }
        subc++;
    }
}
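
/* Example with hypothetical numbers: if 3 of the 8 atoms in a cluster have
 * non-zero LJ parameters, then 2*3 <= 8, so NBNXN_CI_HALF_LJ(subc) is set
 * and, when the last LJ atom sits in the upper half of the cluster, the
 * atoms are reordered so that all LJ atoms come first; a kernel can then
 * skip the LJ work for the second half of the cluster.
 */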

/*! \brief Fill a pair search cell with atoms.
 *
 * Potentially sorts atoms and sets the interaction flags.
 */
void Grid::fillCell(GridSetData*                   gridSetData,
                    nbnxn_atomdata_t*              nbat,
                    int                            atomStart,
                    int                            atomEnd,
                    const int*                     atinfo,
                    gmx::ArrayRef<const gmx::RVec> x,
                    BoundingBox gmx_unused* bb_work_aligned)
{
    const int numAtoms = atomEnd - atomStart;

    const gmx::ArrayRef<int>& cells       = gridSetData->cells;
    const gmx::ArrayRef<int>& atomIndices = gridSetData->atomIndices;

    if (geometry_.isSimple)
    {
        /* Note that non-local grids are already sorted.
         * Then sort_cluster_on_flag will only set the flags and the sorting
         * will not affect the atom order.
         */
        sort_cluster_on_flag(geometry_.numAtomsICluster, atomStart, atomEnd, atinfo, atomIndices,
                             flags_.data() + atomToCluster(atomStart) - cellOffset_);
    }

    if (haveFep_)
    {
        /* Set the fep flag for perturbed atoms in this (sub-)cell */

        /* The grid-local cluster/(sub-)cell index */
        int cell = atomToCluster(atomStart)
                   - cellOffset_ * (geometry_.isSimple ? 1 : c_gpuNumClusterPerCell);
        fep_[cell] = 0;
        for (int at = atomStart; at < atomEnd; at++)
        {
            if (atomIndices[at] >= 0 && GET_CGINFO_FEP(atinfo[atomIndices[at]]))
            {
                fep_[cell] |= (1 << (at - atomStart));
            }
        }
    }

    /* Now we have sorted the atoms, set the cell indices */
    for (int at = atomStart; at < atomEnd; at++)
    {
        cells[atomIndices[at]] = at;
    }

    copy_rvec_to_nbat_real(atomIndices.data() + atomStart, numAtoms, geometry_.numAtomsICluster,
                           as_rvec_array(x.data()), nbat->XFormat, nbat->x().data(), atomStart);

    if (nbat->XFormat == nbatX4)
    {
        /* Store the bounding boxes as xyz.xyz. */
        size_t       offset = atomToCluster(atomStart - cellOffset_ * geometry_.numAtomsICluster);
        BoundingBox* bb_ptr = bb_.data() + offset;

#if GMX_SIMD && GMX_SIMD_REAL_WIDTH == 2
        if (2 * geometry_.numAtomsJCluster == geometry_.numAtomsICluster)
        {
            calc_bounding_box_x_x4_halves(numAtoms,
                                          nbat->x().data() + atom_to_x_index<c_packX4>(atomStart),
                                          bb_ptr, bbj_.data() + offset * 2);
        }
        else
#endif
        {
            calc_bounding_box_x_x4(numAtoms, nbat->x().data() + atom_to_x_index<c_packX4>(atomStart), bb_ptr);
        }
    }
    else if (nbat->XFormat == nbatX8)
    {
        /* Store the bounding boxes as xyz.xyz. */
        size_t       offset = atomToCluster(atomStart - cellOffset_ * geometry_.numAtomsICluster);
        BoundingBox* bb_ptr = bb_.data() + offset;

        calc_bounding_box_x_x8(numAtoms, nbat->x().data() + atom_to_x_index<c_packX8>(atomStart), bb_ptr);
    }
#if NBNXN_BBXXXX
    else if (!geometry_.isSimple)
    {
        /* Store the bounding boxes in a format convenient
         * for SIMD4 calculations: xxxxyyyyzzzz...
         */
        const int clusterIndex = ((atomStart - cellOffset_ * geometry_.numAtomsPerCell)
                                  >> geometry_.numAtomsICluster2Log);
        float*    pbb_ptr      = pbb_.data() + packedBoundingBoxesIndex(clusterIndex)
                         + (clusterIndex & (c_packedBoundingBoxesDimSize - 1));

#    if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
        if (nbat->XFormat == nbatXYZQ)
        {
            GMX_ASSERT(bb_work_aligned != nullptr, "Must have valid aligned work structure");
            calc_bounding_box_xxxx_simd4(numAtoms, nbat->x().data() + atomStart * nbat->xstride,
                                         bb_work_aligned, pbb_ptr);
        }
        else
#    endif
        {
            calc_bounding_box_xxxx(numAtoms, nbat->xstride,
                                   nbat->x().data() + atomStart * nbat->xstride, pbb_ptr);
        }
        if (gmx_debug_at)
        {
            fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
                    atomToCluster(atomStart), pbb_ptr[0 * c_packedBoundingBoxesDimSize],
                    pbb_ptr[3 * c_packedBoundingBoxesDimSize], pbb_ptr[1 * c_packedBoundingBoxesDimSize],
                    pbb_ptr[4 * c_packedBoundingBoxesDimSize], pbb_ptr[2 * c_packedBoundingBoxesDimSize],
                    pbb_ptr[5 * c_packedBoundingBoxesDimSize]);
        }
    }
#endif
    else
    {
        /* Store the bounding boxes as xyz.xyz. */
        BoundingBox* bb_ptr =
                bb_.data() + atomToCluster(atomStart - cellOffset_ * geometry_.numAtomsPerCell);

        calc_bounding_box(numAtoms, nbat->xstride, nbat->x().data() + atomStart * nbat->xstride, bb_ptr);

        if (gmx_debug_at)
        {
            int bbo = atomToCluster(atomStart - cellOffset_ * geometry_.numAtomsPerCell);
            fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
                    atomToCluster(atomStart), bb_[bbo].lower.x, bb_[bbo].lower.y, bb_[bbo].lower.z,
                    bb_[bbo].upper.x, bb_[bbo].upper.y, bb_[bbo].upper.z);
        }
    }
}

void Grid::sortColumnsCpuGeometry(GridSetData*                   gridSetData,
                                  int                            dd_zone,
                                  const int*                     atinfo,
                                  gmx::ArrayRef<const gmx::RVec> x,
                                  nbnxn_atomdata_t*              nbat,
                                  const gmx::Range<int>          columnRange,
                                  gmx::ArrayRef<int>             sort_work)
{
    if (debug)
    {
        fprintf(debug, "cell_offset %d sorting columns %d - %d\n", cellOffset_,
                *columnRange.begin(), *columnRange.end());
    }

    const bool relevantAtomsAreWithinGridBounds = (dimensions_.maxAtomGroupRadius == 0);

    const int numAtomsPerCell = geometry_.numAtomsPerCell;

    /* Sort the atoms within each x,y column in 3 dimensions */
    for (int cxy : columnRange)
    {
        const int numAtoms   = numAtomsInColumn(cxy);
        const int numCellsZ  = cxy_ind_[cxy + 1] - cxy_ind_[cxy];
        const int atomOffset = firstAtomInColumn(cxy);

        /* Sort the atoms within each x,y column on z coordinate */
        sort_atoms(ZZ, FALSE, dd_zone, relevantAtomsAreWithinGridBounds,
                   gridSetData->atomIndices.data() + atomOffset, numAtoms, x, dimensions_.lowerCorner[ZZ],
                   1.0 / dimensions_.gridSize[ZZ], numCellsZ * numAtomsPerCell, sort_work);

        /* Fill the ncz cells in this column */
        const int firstCell  = firstCellInColumn(cxy);
        int       cellFilled = firstCell;
        for (int cellZ = 0; cellZ < numCellsZ; cellZ++)
        {
            const int cell = firstCell + cellZ;

            const int atomOffsetCell = atomOffset + cellZ * numAtomsPerCell;
            const int numAtomsCell = std::min(numAtomsPerCell, numAtoms - (atomOffsetCell - atomOffset));

            fillCell(gridSetData, nbat, atomOffsetCell, atomOffsetCell + numAtomsCell, atinfo, x, nullptr);

            /* This copy to bbcz is not really necessary.
             * But it allows us to use the same grid search code
             * for the simple and supersub cell setups.
             */
            if (numAtomsCell > 0)
            {
                cellFilled = cell;
            }
            bbcz_[cell].lower = bb_[cellFilled].lower.z;
            bbcz_[cell].upper = bb_[cellFilled].upper.z;
        }

        /* Set the unused atom indices to -1 */
        for (int ind = numAtoms; ind < numCellsZ * numAtomsPerCell; ind++)
        {
            gridSetData->atomIndices[atomOffset + ind] = -1;
        }
    }
}

/* Spatially sort the atoms within one grid column */
void Grid::sortColumnsGpuGeometry(GridSetData*                   gridSetData,
                                  int                            dd_zone,
                                  const int*                     atinfo,
                                  gmx::ArrayRef<const gmx::RVec> x,
                                  nbnxn_atomdata_t*              nbat,
                                  const gmx::Range<int>          columnRange,
                                  gmx::ArrayRef<int>             sort_work)
{
    BoundingBox  bb_work_array[2];
    BoundingBox* bb_work_aligned = reinterpret_cast<BoundingBox*>(
            (reinterpret_cast<std::size_t>(bb_work_array + 1)) & (~(static_cast<std::size_t>(15))));

    if (debug)
    {
        fprintf(debug, "cell_offset %d sorting columns %d - %d\n", cellOffset_,
                *columnRange.begin(), *columnRange.end());
    }

    const bool relevantAtomsAreWithinGridBounds = (dimensions_.maxAtomGroupRadius == 0);

    const int numAtomsPerCell = geometry_.numAtomsPerCell;

    const int subdiv_x = geometry_.numAtomsICluster;
    const int subdiv_y = c_gpuNumClusterPerCellX * subdiv_x;
    const int subdiv_z = c_gpuNumClusterPerCellY * subdiv_y;

    /* Extract the atom index array that will be filled here */
    const gmx::ArrayRef<int>& atomIndices = gridSetData->atomIndices;

    /* Sort the atoms within each x,y column in 3 dimensions.
     * Loop over all columns on the x/y grid.
     */
    for (int cxy : columnRange)
    {
        const int gridX = cxy / dimensions_.numCells[YY];
        const int gridY = cxy - gridX * dimensions_.numCells[YY];

        const int numAtomsInColumn = cxy_na_[cxy];
        const int numCellsInColumn = cxy_ind_[cxy + 1] - cxy_ind_[cxy];
        const int atomOffset       = firstAtomInColumn(cxy);

        /* Sort the atoms within each x,y column on z coordinate */
        sort_atoms(ZZ, FALSE, dd_zone, relevantAtomsAreWithinGridBounds,
                   atomIndices.data() + atomOffset, numAtomsInColumn, x, dimensions_.lowerCorner[ZZ],
                   1.0 / dimensions_.gridSize[ZZ], numCellsInColumn * numAtomsPerCell, sort_work);

        /* This loop goes over the cells and clusters along z at once */
        for (int sub_z = 0; sub_z < numCellsInColumn * c_gpuNumClusterPerCellZ; sub_z++)
        {
            const int atomOffsetZ = atomOffset + sub_z * subdiv_z;
            const int numAtomsZ = std::min(subdiv_z, numAtomsInColumn - (atomOffsetZ - atomOffset));
            int       cz        = -1;
            /* We have already sorted on z */

            if (sub_z % c_gpuNumClusterPerCellZ == 0)
            {
                cz             = sub_z / c_gpuNumClusterPerCellZ;
                const int cell = cxy_ind_[cxy] + cz;

                /* The number of atoms in this cell/super-cluster */
                const int numAtoms =
                        std::min(numAtomsPerCell, numAtomsInColumn - (atomOffsetZ - atomOffset));

                numClusters_[cell] =
                        std::min(c_gpuNumClusterPerCell, (numAtoms + geometry_.numAtomsICluster - 1)
                                                                 / geometry_.numAtomsICluster);

                /* Store the z-boundaries of the bounding box of the cell */
                bbcz_[cell].lower = x[atomIndices[atomOffsetZ]][ZZ];
                bbcz_[cell].upper = x[atomIndices[atomOffsetZ + numAtoms - 1]][ZZ];
            }

            if (c_gpuNumClusterPerCellY > 1)
            {
                /* Sort the atoms along y */
                sort_atoms(YY, (sub_z & 1) != 0, dd_zone, relevantAtomsAreWithinGridBounds,
                           atomIndices.data() + atomOffsetZ, numAtomsZ, x,
                           dimensions_.lowerCorner[YY] + gridY * dimensions_.cellSize[YY],
                           dimensions_.invCellSize[YY], subdiv_z, sort_work);
            }

            for (int sub_y = 0; sub_y < c_gpuNumClusterPerCellY; sub_y++)
            {
                const int atomOffsetY = atomOffsetZ + sub_y * subdiv_y;
                const int numAtomsY = std::min(subdiv_y, numAtomsInColumn - (atomOffsetY - atomOffset));

                if (c_gpuNumClusterPerCellX > 1)
                {
                    /* Sort the atoms along x */
                    sort_atoms(XX, ((cz * c_gpuNumClusterPerCellY + sub_y) & 1) != 0, dd_zone,
                               relevantAtomsAreWithinGridBounds, atomIndices.data() + atomOffsetY, numAtomsY,
                               x, dimensions_.lowerCorner[XX] + gridX * dimensions_.cellSize[XX],
                               dimensions_.invCellSize[XX], subdiv_y, sort_work);
                }

                for (int sub_x = 0; sub_x < c_gpuNumClusterPerCellX; sub_x++)
                {
                    const int atomOffsetX = atomOffsetY + sub_x * subdiv_x;
                    const int numAtomsX =
                            std::min(subdiv_x, numAtomsInColumn - (atomOffsetX - atomOffset));

                    fillCell(gridSetData, nbat, atomOffsetX, atomOffsetX + numAtomsX, atinfo, x,
                             bb_work_aligned);
                }
            }
        }

        /* Set the unused atom indices to -1 */
        for (int ind = numAtomsInColumn; ind < numCellsInColumn * numAtomsPerCell; ind++)
        {
            atomIndices[atomOffset + ind] = -1;
        }
    }
}

/*! \brief Sets the cell index in the cell array for atom \p atomIndex and increments the atom count for the grid column */
static void setCellAndAtomCount(gmx::ArrayRef<int> cell, int cellIndex, gmx::ArrayRef<int> cxy_na, int atomIndex)
{
    cell[atomIndex] = cellIndex;
    cxy_na[cellIndex] += 1;
}
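
/* Column indexing sketch with hypothetical numbers: for numCells = {3, 2}
 * the column of an atom with cell coordinates cx = 2, cy = 1 is
 * cx * numCells[YY] + cy = 5, and index numColumns = 6 is the extra
 * "column" that collects home-zone particles that moved to another domain.
 */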

void Grid::calcColumnIndices(const Grid::Dimensions&        gridDims,
                             const gmx::UpdateGroupsCog*    updateGroupsCog,
                             const gmx::Range<int>          atomRange,
                             gmx::ArrayRef<const gmx::RVec> x,
                             const int                      dd_zone,
                             const int*                     move,
                             const int                      thread,
                             const int                      nthread,
                             gmx::ArrayRef<int>             cell,
                             gmx::ArrayRef<int>             cxy_na)
{
    const int numColumns = gridDims.numCells[XX] * gridDims.numCells[YY];

    /* We add one extra cell for particles which moved during DD */
    for (int i = 0; i < numColumns + 1; i++)
    {
        cxy_na[i] = 0;
    }

    int taskAtomStart = *atomRange.begin() + static_cast<int>((thread + 0) * atomRange.size()) / nthread;
    int taskAtomEnd   = *atomRange.begin() + static_cast<int>((thread + 1) * atomRange.size()) / nthread;

    if (dd_zone == 0)
    {
        /* Home zone */
        for (int i = taskAtomStart; i < taskAtomEnd; i++)
        {
            if (move == nullptr || move[i] >= 0)
            {

                const gmx::RVec& coord = (updateGroupsCog ? updateGroupsCog->cogForAtom(i) : x[i]);

                /* We need to be careful with rounding,
                 * particles might be a few bits outside the local zone.
                 * The int cast takes care of the lower bound,
                 * we will explicitly take care of the upper bound.
                 */
                int cx = static_cast<int>((coord[XX] - gridDims.lowerCorner[XX])
                                          * gridDims.invCellSize[XX]);
                int cy = static_cast<int>((coord[YY] - gridDims.lowerCorner[YY])
                                          * gridDims.invCellSize[YY]);

#ifndef NDEBUG
                if (cx < 0 || cx > gridDims.numCells[XX] || cy < 0 || cy > gridDims.numCells[YY])
                {
                    gmx_fatal(FARGS,
                              "grid cell cx %d cy %d out of range (max %d %d)\n"
                              "atom %f %f %f, grid->c0 %f %f",
                              cx, cy, gridDims.numCells[XX], gridDims.numCells[YY], x[i][XX],
                              x[i][YY], x[i][ZZ], gridDims.lowerCorner[XX], gridDims.lowerCorner[YY]);
                }
#endif
                /* Take care of potential rounding issues */
                cx = std::min(cx, gridDims.numCells[XX] - 1);
                cy = std::min(cy, gridDims.numCells[YY] - 1);

                /* For the moment, cell will contain only the grid-local
                 * x and y indices, not z.
                 */
                setCellAndAtomCount(cell, cx * gridDims.numCells[YY] + cy, cxy_na, i);
            }
            else
            {
                /* Put this moved particle after the end of the grid,
                 * so we can process it later without using conditionals.
                 */
                setCellAndAtomCount(cell, numColumns, cxy_na, i);
            }
        }
    }
    else
    {
        /* Non-home zone */
        for (int i = taskAtomStart; i < taskAtomEnd; i++)
        {
            int cx = static_cast<int>((x[i][XX] - gridDims.lowerCorner[XX]) * gridDims.invCellSize[XX]);
            int cy = static_cast<int>((x[i][YY] - gridDims.lowerCorner[YY]) * gridDims.invCellSize[YY]);

            /* For non-home zones there could be particles outside
             * the non-bonded cut-off range, which have been communicated
             * for bonded interactions only. For the result it doesn't
             * matter where these end up on the grid. For performance
             * we put them in an extra row at the border.
             */
            cx = std::max(cx, 0);
            cx = std::min(cx, gridDims.numCells[XX] - 1);
            cy = std::max(cy, 0);
            cy = std::min(cy, gridDims.numCells[YY] - 1);

            /* For the moment, cell will contain only the grid-local
             * x and y indices, not z.
             */
            setCellAndAtomCount(cell, cx * gridDims.numCells[YY] + cy, cxy_na, i);
        }
    }
}

/*! \brief Resizes grid and atom data which depend on the number of cells */
static void resizeForNumberOfCells(const int         numNbnxnAtoms,
                                   const int         numAtomsMoved,
                                   GridSetData*      gridSetData,
                                   nbnxn_atomdata_t* nbat)
{
    /* Note: gridSetData->cellIndices was already resized before */

    /* To avoid conditionals we store the moved particles at the end
     * of the atom-index array; make sure we have enough space.
     */
    gridSetData->atomIndices.resize(numNbnxnAtoms + numAtomsMoved);

    /* Make space in nbat for storing the atom coordinates */
    nbat->resizeCoordinateBuffer(numNbnxnAtoms);
}

void Grid::setCellIndices(int                            ddZone,
                          int                            cellOffset,
                          GridSetData*                   gridSetData,
                          gmx::ArrayRef<GridWork>        gridWork,
                          const gmx::Range<int>          atomRange,
                          const int*                     atinfo,
                          gmx::ArrayRef<const gmx::RVec> x,
                          const int                      numAtomsMoved,
                          nbnxn_atomdata_t*              nbat)
{
    cellOffset_ = cellOffset;

    srcAtomBegin_ = *atomRange.begin();
    srcAtomEnd_   = *atomRange.end();

    const int nthread = gmx_omp_nthreads_get(emntPairsearch);

    const int numAtomsPerCell = geometry_.numAtomsPerCell;

    /* Make the cell index as a function of x and y */
    int ncz_max = 0;
    int ncz     = 0;
    cxy_ind_[0] = 0;
    for (int i = 0; i < numColumns() + 1; i++)
    {
        /* We set ncz_max at the beginning of the loop instead of at the end
         * to skip i=grid->ncx*grid->numCells[YY], which are moved particles
         * that do not need to be ordered on the grid.
         */
        if (ncz > ncz_max)
        {
            ncz_max = ncz;
        }
        int cxy_na_i = gridWork[0].numAtomsPerColumn[i];
        for (int thread = 1; thread < nthread; thread++)
        {
            cxy_na_i += gridWork[thread].numAtomsPerColumn[i];
        }
        ncz = (cxy_na_i + numAtomsPerCell - 1) / numAtomsPerCell;
        if (nbat->XFormat == nbatX8)
        {
            /* Make the number of cells a multiple of 2 */
            ncz = (ncz + 1) & ~1;
        }
        cxy_ind_[i + 1] = cxy_ind_[i] + ncz;
        /* Clear cxy_na_, so we can reuse the array below */
        cxy_na_[i] = 0;
    }
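    /* Prefix-sum sketch with hypothetical numbers: with numAtomsPerCell = 4
     * and column atom counts {6, 2, 5}, the per-column cell counts ncz are
     * {2, 1, 2} and cxy_ind_ becomes {0, 2, 3, 5}, so column c owns cells
     * [cxy_ind_[c], cxy_ind_[c+1]) and numCellsTotal_ = 5.
     */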
    numCellsTotal_     = cxy_ind_[numColumns()] - cxy_ind_[0];
    numCellsColumnMax_ = ncz_max;

    /* Resize grid and atom data which depend on the number of cells */
    resizeForNumberOfCells(atomIndexEnd(), numAtomsMoved, gridSetData, nbat);

    if (debug)
    {
        fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
                numAtomsPerCell, geometry_.numAtomsICluster, numCellsTotal_, dimensions_.numCells[XX],
                dimensions_.numCells[YY], numCellsTotal_ / (static_cast<double>(numColumns())), ncz_max);
        if (gmx_debug_at)
        {
            int i = 0;
            for (int cy = 0; cy < dimensions_.numCells[YY]; cy++)
            {
                for (int cx = 0; cx < dimensions_.numCells[XX]; cx++)
                {
                    fprintf(debug, " %2d", cxy_ind_[i + 1] - cxy_ind_[i]);
                    i++;
                }
                fprintf(debug, "\n");
            }
        }
    }

    /* Make sure the work array for sorting is large enough */
    const int worstCaseSortBufferSize = ncz_max * numAtomsPerCell * c_sortGridMaxSizeFactor;
    if (worstCaseSortBufferSize > gmx::index(gridWork[0].sortBuffer.size()))
    {
        for (GridWork& work : gridWork)
        {
            /* Elements not in use should be -1 */
            work.sortBuffer.resize(worstCaseSortBufferSize, -1);
        }
    }

    /* Now we know the dimensions we can fill the grid.
     * This is the first, unsorted fill. We sort the columns after this.
     */
    gmx::ArrayRef<int> cells       = gridSetData->cells;
    gmx::ArrayRef<int> atomIndices = gridSetData->atomIndices;
    for (int i : atomRange)
    {
        /* At this point nbs->cell contains the local grid x,y indices */
        const int cxy = cells[i];
        atomIndices[firstAtomInColumn(cxy) + cxy_na_[cxy]++] = i;
    }

    if (ddZone == 0)
    {
        /* Set the cell indices for the moved particles */
        int n0 = numCellsTotal_ * numAtomsPerCell;
        int n1 = numCellsTotal_ * numAtomsPerCell + cxy_na_[numColumns()];
        for (int i = n0; i < n1; i++)
        {
            cells[atomIndices[i]] = i;
        }
    }

    /* Sort the super-cell columns along z into the sub-cells. */
#pragma omp parallel for num_threads(nthread) schedule(static)
    for (int thread = 0; thread < nthread; thread++)
    {
        try
        {
            gmx::Range<int> columnRange(((thread + 0) * numColumns()) / nthread,
                                        ((thread + 1) * numColumns()) / nthread);
            if (geometry_.isSimple)
            {
                sortColumnsCpuGeometry(gridSetData, ddZone, atinfo, x, nbat, columnRange,
                                       gridWork[thread].sortBuffer);
            }
            else
            {
                sortColumnsGpuGeometry(gridSetData, ddZone, atinfo, x, nbat, columnRange,
                                       gridWork[thread].sortBuffer);
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
    }

    if (geometry_.isSimple && nbat->XFormat == nbatX8)
    {
        combine_bounding_box_pairs(*this, bb_, bbj_);
    }

    if (!geometry_.isSimple)
    {
        numClustersTotal_ = 0;
        for (int i = 0; i < numCellsTotal_; i++)
        {
            numClustersTotal_ += numClusters_[i];
        }
    }

    if (debug)
    {
        if (geometry_.isSimple)
        {
            print_bbsizes_simple(debug, *this);
        }
        else
        {
            fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n", numClustersTotal_,
                    atomRange.size() / static_cast<double>(numClustersTotal_));

            print_bbsizes_supersub(debug, *this);
        }
    }
}

} // namespace Nbnxm