1 /*
2 * This file is part of the GROMACS molecular simulation package.
3 *
4 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
5 * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
9 *
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
14 *
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 *
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
32 *
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
35 */
36 /*! \internal \file
37 * \brief Define OpenCL implementation of nbnxm_gpu.h
38 *
39 * \author Anca Hamuraru <anca@streamcomputing.eu>
40 * \author Teemu Virolainen <teemu@streamcomputing.eu>
41 * \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
42 * \author Szilárd Páll <pall.szilard@gmail.com>
43 * \ingroup module_nbnxm
44 *
45 * TODO (psz):
46 * - Add a static const cl_uint c_pruneKernelWorkDim / c_nbnxnKernelWorkDim = 3;
47 * - Rework the copying of OCL data structures done before every invocation of both
48 * nb and prune kernels (using fillin_ocl_structures); also consider at the same
49 * time calling clSetKernelArg only on the updated parameters (if tracking changed
50 * parameters is feasible);
51 * - Consider using the event_wait_list argument to clEnqueueNDRangeKernel to mark
52 * dependencies on the kernel launched: e.g. the non-local nb kernel's dependency
53 * on the misc_ops_and_local_H2D_done event could be better expressed this way.
54 *
55 * - Consider extracting common sections of the OpenCL and CUDA nbnxn logic, e.g:
56 * - in nbnxn_gpu_launch_kernel_pruneonly() the pre- and post-kernel launch logic
57 * is identical in the two implementations, so a 3-way split might allow sharing
58 * code;
59 * -
60 *
61 */
62 #include "gmxpre.h"
63
64 #include <assert.h>
65 #include <stdlib.h>
66
67 #if defined(_MSVC)
68 # include <limits>
69 #endif
70
71 #include "thread_mpi/atomic.h"
72
73 #include "gromacs/gpu_utils/device_context.h"
74 #include "gromacs/gpu_utils/gputraits_ocl.h"
75 #include "gromacs/gpu_utils/oclutils.h"
76 #include "gromacs/hardware/device_information.h"
77 #include "gromacs/hardware/hw_info.h"
78 #include "gromacs/mdtypes/simulation_workload.h"
79 #include "gromacs/nbnxm/atomdata.h"
80 #include "gromacs/nbnxm/gpu_common.h"
81 #include "gromacs/nbnxm/gpu_common_utils.h"
82 #include "gromacs/nbnxm/gpu_data_mgmt.h"
83 #include "gromacs/nbnxm/nbnxm.h"
84 #include "gromacs/nbnxm/nbnxm_gpu.h"
85 #include "gromacs/nbnxm/pairlist.h"
86 #include "gromacs/pbcutil/ishift.h"
87 #include "gromacs/timing/gpu_timing.h"
88 #include "gromacs/utility/cstringutil.h"
89 #include "gromacs/utility/fatalerror.h"
90 #include "gromacs/utility/gmxassert.h"
91
92 #include "nbnxm_ocl_types.h"
93
94 namespace Nbnxm
95 {
96
97 /*! \brief Convenience constants */
98 //@{
99 static constexpr int c_clSize = c_nbnxnGpuClusterSize;
100 //@}
101
102
103 /*! \brief Validates the input global work size parameter.
104 */
validate_global_work_size(const KernelLaunchConfig & config,int work_dim,const DeviceInformation * dinfo)105 static inline void validate_global_work_size(const KernelLaunchConfig& config,
106 int work_dim,
107 const DeviceInformation* dinfo)
108 {
109 cl_uint device_size_t_size_bits;
110 cl_uint host_size_t_size_bits;
111
112 GMX_ASSERT(dinfo, "Need a valid device info object");
113
114 size_t global_work_size[3];
115 GMX_ASSERT(work_dim <= 3, "Not supporting hyper-grids just yet");
116 for (int i = 0; i < work_dim; i++)
117 {
118 global_work_size[i] = config.blockSize[i] * config.gridSize[i];
119 }
120
121 /* Each component of a global_work_size must not exceed the range given by the
122 sizeof(device size_t) for the device on which the kernel execution will
123 be enqueued. See:
124 https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
125 */
126 device_size_t_size_bits = dinfo->adress_bits;
127 host_size_t_size_bits = static_cast<cl_uint>(sizeof(size_t) * 8);
128
129 /* If sizeof(host size_t) <= sizeof(device size_t)
130 => global_work_size components will always be valid
131 else
132 => get device limit for global work size and
133 compare it against each component of global_work_size.
134 */
135 if (host_size_t_size_bits > device_size_t_size_bits)
136 {
137 size_t device_limit;
138
139 device_limit = (1ULL << device_size_t_size_bits) - 1;
140
141 for (int i = 0; i < work_dim; i++)
142 {
143 if (global_work_size[i] > device_limit)
144 {
145 gmx_fatal(
146 FARGS,
147 "Watch out, the input system is too large to simulate!\n"
148 "The number of nonbonded work units (=number of super-clusters) exceeds the"
149 "device capabilities. Global work size limit exceeded (%zu > %zu)!",
150 global_work_size[i], device_limit);
151 }
152 }
153 }
154 }
155
156 /* Constant arrays listing non-bonded kernel function names. The arrays are
157 * organized in 2-dim arrays by: electrostatics and VDW type.
158 *
159 * Note that the row- and column-order of function pointers has to match the
160 * order of corresponding enumerated electrostatics and vdw types, resp.,
161 * defined in nbnxm_ocl_types.h.
162 */
163
164 /*! \brief Force-only kernel function names. */
165 static const char* nb_kfunc_noener_noprune_ptr[eelTypeNR][evdwTypeNR] = {
166 { "nbnxn_kernel_ElecCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_opencl",
167 "nbnxn_kernel_ElecCut_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_F_opencl",
168 "nbnxn_kernel_ElecCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_opencl",
169 "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_opencl" },
170 { "nbnxn_kernel_ElecRF_VdwLJ_F_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_opencl",
171 "nbnxn_kernel_ElecRF_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_F_opencl",
172 "nbnxn_kernel_ElecRF_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_opencl",
173 "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_opencl" },
174 { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_opencl",
175 "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_opencl",
176 "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_opencl",
177 "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_opencl",
178 "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_opencl" },
179 { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_opencl",
180 "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_opencl",
181 "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_opencl",
182 "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_opencl",
183 "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_opencl",
184 "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_opencl",
185 "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_opencl" },
186 { "nbnxn_kernel_ElecEw_VdwLJ_F_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_opencl",
187 "nbnxn_kernel_ElecEw_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_F_opencl",
188 "nbnxn_kernel_ElecEw_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_opencl",
189 "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_opencl" },
190 { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_opencl",
191 "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_opencl",
192 "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_opencl",
193 "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_opencl",
194 "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_opencl",
195 "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_opencl" }
196 };
197
198 /*! \brief Force + energy kernel function pointers. */
199 static const char* nb_kfunc_ener_noprune_ptr[eelTypeNR][evdwTypeNR] = {
200 { "nbnxn_kernel_ElecCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_opencl",
201 "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_VF_opencl",
202 "nbnxn_kernel_ElecCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_opencl",
203 "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_opencl" },
204 { "nbnxn_kernel_ElecRF_VdwLJ_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_opencl",
205 "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_VF_opencl",
206 "nbnxn_kernel_ElecRF_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_opencl",
207 "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_opencl" },
208 { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_opencl",
209 "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_opencl",
210 "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_opencl",
211 "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_opencl",
212 "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_opencl" },
213 { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_opencl",
214 "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_opencl",
215 "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_opencl",
216 "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_opencl",
217 "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_opencl",
218 "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_opencl",
219 "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_opencl" },
220 { "nbnxn_kernel_ElecEw_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_opencl",
221 "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_VF_opencl",
222 "nbnxn_kernel_ElecEw_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_opencl",
223 "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_opencl" },
224 { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_opencl",
225 "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_opencl",
226 "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_opencl",
227 "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_opencl",
228 "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_opencl",
229 "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_opencl",
230 "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_opencl" }
231 };
232
233 /*! \brief Force + pruning kernel function pointers. */
234 static const char* nb_kfunc_noener_prune_ptr[eelTypeNR][evdwTypeNR] = {
235 { "nbnxn_kernel_ElecCut_VdwLJ_F_prune_opencl",
236 "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_prune_opencl",
237 "nbnxn_kernel_ElecCut_VdwLJCombLB_F_prune_opencl",
238 "nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_opencl",
239 "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_opencl",
240 "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_opencl" },
241 { "nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_prune_opencl",
242 "nbnxn_kernel_ElecRF_VdwLJCombLB_F_prune_opencl",
243 "nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_opencl",
244 "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_opencl",
245 "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_opencl" },
246 { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_opencl",
247 "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_prune_opencl",
248 "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_prune_opencl",
249 "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_opencl",
250 "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_opencl",
251 "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_opencl",
252 "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_opencl" },
253 { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_opencl",
254 "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_prune_opencl",
255 "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_prune_opencl",
256 "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_opencl",
257 "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_opencl",
258 "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_opencl",
259 "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_opencl" },
260 { "nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_prune_opencl",
261 "nbnxn_kernel_ElecEw_VdwLJCombLB_F_prune_opencl",
262 "nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_opencl",
263 "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_opencl",
264 "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_opencl" },
265 { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_opencl",
266 "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_prune_opencl",
267 "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_prune_opencl",
268 "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_opencl",
269 "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_opencl",
270 "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_opencl",
271 "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_opencl" }
272 };
273
274 /*! \brief Force + energy + pruning kernel function pointers. */
275 static const char* nb_kfunc_ener_prune_ptr[eelTypeNR][evdwTypeNR] = {
276 { "nbnxn_kernel_ElecCut_VdwLJ_VF_prune_opencl",
277 "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_prune_opencl",
278 "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_prune_opencl",
279 "nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_opencl",
280 "nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_opencl",
281 "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_opencl",
282 "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_opencl" },
283 { "nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl",
284 "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_prune_opencl",
285 "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_prune_opencl",
286 "nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_opencl",
287 "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_opencl",
288 "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_opencl" },
289 { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_opencl",
290 "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_prune_opencl",
291 "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_prune_opencl",
292 "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_opencl",
293 "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_opencl",
294 "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_opencl",
295 "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_opencl" },
296 { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_opencl",
297 "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_prune_opencl",
298 "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_prune_opencl",
299 "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_opencl",
300 "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_opencl",
301 "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_opencl",
302 "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_opencl" },
303 { "nbnxn_kernel_ElecEw_VdwLJ_VF_prune_opencl",
304 "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_prune_opencl",
305 "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_prune_opencl",
306 "nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_opencl",
307 "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_opencl",
308 "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_opencl" },
309 { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_opencl",
310 "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_prune_opencl",
311 "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_prune_opencl",
312 "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_opencl",
313 "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_opencl",
314 "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_opencl",
315 "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_opencl" }
316 };
317
318 /*! \brief Return a pointer to the prune kernel version to be executed at the current invocation.
319 *
320 * \param[in] kernel_pruneonly array of prune kernel objects
321 * \param[in] firstPrunePass true if the first pruning pass is being executed
322 */
selectPruneKernel(cl_kernel kernel_pruneonly[],bool firstPrunePass)323 static inline cl_kernel selectPruneKernel(cl_kernel kernel_pruneonly[], bool firstPrunePass)
324 {
325 cl_kernel* kernelPtr;
326
327 if (firstPrunePass)
328 {
329 kernelPtr = &(kernel_pruneonly[epruneFirst]);
330 }
331 else
332 {
333 kernelPtr = &(kernel_pruneonly[epruneRolling]);
334 }
335 // TODO: consider creating the prune kernel object here to avoid a
336 // clCreateKernel for the rolling prune kernel if this is not needed.
337 return *kernelPtr;
338 }
339
340 /*! \brief Return a pointer to the kernel version to be executed at the current step.
341 * OpenCL kernel objects are cached in nb. If the requested kernel is not
342 * found in the cache, it will be created and the cache will be updated.
343 */
select_nbnxn_kernel(NbnxmGpu * nb,int eeltype,int evdwtype,bool bDoEne,bool bDoPrune)344 static inline cl_kernel select_nbnxn_kernel(NbnxmGpu* nb, int eeltype, int evdwtype, bool bDoEne, bool bDoPrune)
345 {
346 const char* kernel_name_to_run;
347 cl_kernel* kernel_ptr;
348 cl_int cl_error;
349
350 GMX_ASSERT(eeltype < eelTypeNR,
351 "The electrostatics type requested is not implemented in the OpenCL kernels.");
352 GMX_ASSERT(evdwtype < evdwTypeNR,
353 "The VdW type requested is not implemented in the OpenCL kernels.");
354
355 if (bDoEne)
356 {
357 if (bDoPrune)
358 {
359 kernel_name_to_run = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
360 kernel_ptr = &(nb->kernel_ener_prune_ptr[eeltype][evdwtype]);
361 }
362 else
363 {
364 kernel_name_to_run = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
365 kernel_ptr = &(nb->kernel_ener_noprune_ptr[eeltype][evdwtype]);
366 }
367 }
368 else
369 {
370 if (bDoPrune)
371 {
372 kernel_name_to_run = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
373 kernel_ptr = &(nb->kernel_noener_prune_ptr[eeltype][evdwtype]);
374 }
375 else
376 {
377 kernel_name_to_run = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
378 kernel_ptr = &(nb->kernel_noener_noprune_ptr[eeltype][evdwtype]);
379 }
380 }
381
382 if (nullptr == kernel_ptr[0])
383 {
384 *kernel_ptr = clCreateKernel(nb->dev_rundata->program, kernel_name_to_run, &cl_error);
385 GMX_ASSERT(cl_error == CL_SUCCESS, ("clCreateKernel failed: " + ocl_get_error_string(cl_error)
386 + " for kernel named " + kernel_name_to_run)
387 .c_str());
388 }
389
390 return *kernel_ptr;
391 }
392
393 /*! \brief Calculates the amount of shared memory required by the nonbonded kernel in use.
394 */
calc_shmem_required_nonbonded(int vdwType,bool bPrefetchLjParam)395 static inline int calc_shmem_required_nonbonded(int vdwType, bool bPrefetchLjParam)
396 {
397 int shmem;
398
399 /* size of shmem (force-buffers/xq/atom type preloading) */
400 /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
401 /* i-atom x+q in shared memory */
402 shmem = c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(float) * 4; /* xqib */
403 /* cj in shared memory, for both warps separately
404 * TODO: in the "nowarp kernels we load cj only once so the factor 2 is not needed.
405 */
406 shmem += 2 * c_nbnxnGpuJgroupSize * sizeof(int); /* cjs */
407 if (bPrefetchLjParam)
408 {
409 if (useLjCombRule(vdwType))
410 {
411 /* i-atom LJ combination parameters in shared memory */
412 shmem += c_nbnxnGpuNumClusterPerSupercluster * c_clSize * 2
413 * sizeof(float); /* atib abused for ljcp, float2 */
414 }
415 else
416 {
417 /* i-atom types in shared memory */
418 shmem += c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(int); /* atib */
419 }
420 }
421 /* force reduction buffers in shared memory */
422 shmem += c_clSize * c_clSize * 3 * sizeof(float); /* f_buf */
423 /* Warp vote. In fact it must be * number of warps in block.. */
424 shmem += sizeof(cl_uint) * 2; /* warp_any */
425 return shmem;
426 }
427
428 /*! \brief Initializes data structures that are going to be sent to the OpenCL device.
429 *
430 * The device can't use the same data structures as the host for two main reasons:
431 * - OpenCL restrictions (pointers are not accepted inside data structures)
432 * - some host side fields are not needed for the OpenCL kernels.
433 *
434 * This function is called before the launch of both nbnxn and prune kernels.
435 */
fillin_ocl_structures(NBParamGpu * nbp,cl_nbparam_params_t * nbparams_params)436 static void fillin_ocl_structures(NBParamGpu* nbp, cl_nbparam_params_t* nbparams_params)
437 {
438 nbparams_params->coulomb_tab_scale = nbp->coulomb_tab_scale;
439 nbparams_params->c_rf = nbp->c_rf;
440 nbparams_params->dispersion_shift = nbp->dispersion_shift;
441 nbparams_params->eeltype = nbp->eeltype;
442 nbparams_params->epsfac = nbp->epsfac;
443 nbparams_params->ewaldcoeff_lj = nbp->ewaldcoeff_lj;
444 nbparams_params->ewald_beta = nbp->ewald_beta;
445 nbparams_params->rcoulomb_sq = nbp->rcoulomb_sq;
446 nbparams_params->repulsion_shift = nbp->repulsion_shift;
447 nbparams_params->rlistOuter_sq = nbp->rlistOuter_sq;
448 nbparams_params->rvdw_sq = nbp->rvdw_sq;
449 nbparams_params->rlistInner_sq = nbp->rlistInner_sq;
450 nbparams_params->rvdw_switch = nbp->rvdw_switch;
451 nbparams_params->sh_ewald = nbp->sh_ewald;
452 nbparams_params->sh_lj_ewald = nbp->sh_lj_ewald;
453 nbparams_params->two_k_rf = nbp->two_k_rf;
454 nbparams_params->vdwtype = nbp->vdwtype;
455 nbparams_params->vdw_switch = nbp->vdw_switch;
456 }
457
458 /*! \brief Enqueues a wait for event completion.
459 *
460 * Then it releases the event and sets it to 0.
461 * Don't use this function when more than one wait will be issued for the event.
462 * Equivalent to Cuda Stream Sync. */
sync_ocl_event(cl_command_queue stream,cl_event * ocl_event)463 static void sync_ocl_event(cl_command_queue stream, cl_event* ocl_event)
464 {
465 cl_int gmx_unused cl_error;
466
467 /* Enqueue wait */
468 cl_error = clEnqueueBarrierWithWaitList(stream, 1, ocl_event, nullptr);
469 GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str());
470
471 /* Release event and reset it to 0. It is ok to release it as enqueuewaitforevents performs implicit retain for events. */
472 cl_error = clReleaseEvent(*ocl_event);
473 GMX_ASSERT(cl_error == CL_SUCCESS,
474 ("clReleaseEvent failed: " + ocl_get_error_string(cl_error)).c_str());
475 *ocl_event = nullptr;
476 }
477
478 /*! \brief Launch asynchronously the xq buffer host to device copy. */
gpu_copy_xq_to_gpu(NbnxmGpu * nb,const nbnxn_atomdata_t * nbatom,const AtomLocality atomLocality)479 void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
480 {
481 GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
482
483 const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
484
485 /* local/nonlocal offset and length used for xq and f */
486 int adat_begin, adat_len;
487
488 cl_atomdata_t* adat = nb->atdat;
489 gpu_plist* plist = nb->plist[iloc];
490 cl_timers_t* t = nb->timers;
491 const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
492
493 bool bDoTime = nb->bDoTime;
494
495 /* Don't launch the non-local H2D copy if there is no dependent
496 work to do: neither non-local nor other (e.g. bonded) work
497 to do that has as input the nbnxn coordinates.
498 Doing the same for the local kernel is more complicated, since the
499 local part of the force array also depends on the non-local kernel.
500 So to avoid complicating the code and to reduce the risk of bugs,
501 we always call the local local x+q copy (and the rest of the local
502 work in nbnxn_gpu_launch_kernel().
503 */
504 if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
505 {
506 plist->haveFreshList = false;
507
508 return;
509 }
510
511 /* calculate the atom data index range based on locality */
512 if (atomLocality == AtomLocality::Local)
513 {
514 adat_begin = 0;
515 adat_len = adat->natoms_local;
516 }
517 else
518 {
519 adat_begin = adat->natoms_local;
520 adat_len = adat->natoms - adat->natoms_local;
521 }
522
523 /* beginning of timed HtoD section */
524 if (bDoTime)
525 {
526 t->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
527 }
528
529 /* HtoD x, q */
530 GMX_ASSERT(sizeof(float) == sizeof(*nbatom->x().data()),
531 "The size of the xyzq buffer element should be equal to the size of float4.");
532 copyToDeviceBuffer(&adat->xq, nbatom->x().data() + adat_begin * 4, adat_begin * 4, adat_len * 4,
533 deviceStream, GpuApiCallBehavior::Async,
534 bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);
535
536 if (bDoTime)
537 {
538 t->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
539 }
540
541 /* When we get here all misc operations issues in the local stream as well as
542 the local xq H2D are done,
543 so we record that in the local stream and wait for it in the nonlocal one. */
544 if (nb->bUseTwoStreams)
545 {
546 if (iloc == InteractionLocality::Local)
547 {
548 cl_int gmx_used_in_debug cl_error = clEnqueueMarkerWithWaitList(
549 deviceStream.stream(), 0, nullptr, &(nb->misc_ops_and_local_H2D_done));
550 GMX_ASSERT(cl_error == CL_SUCCESS,
551 ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str());
552
553 /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
554 * in the local stream in order to be able to sync with the above event
555 * from the non-local stream.
556 */
557 cl_error = clFlush(deviceStream.stream());
558 GMX_ASSERT(cl_error == CL_SUCCESS,
559 ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
560 }
561 else
562 {
563 sync_ocl_event(deviceStream.stream(), &(nb->misc_ops_and_local_H2D_done));
564 }
565 }
566 }
567
568
569 /*! \brief Launch GPU kernel
570
571 As we execute nonbonded workload in separate queues, before launching
572 the kernel we need to make sure that he following operations have completed:
573 - atomdata allocation and related H2D transfers (every nstlist step);
574 - pair list H2D transfer (every nstlist step);
575 - shift vector H2D transfer (every nstlist step);
576 - force (+shift force and energy) output clearing (every step).
577
578 These operations are issued in the local queue at the beginning of the step
579 and therefore always complete before the local kernel launch. The non-local
580 kernel is launched after the local on the same device/context, so this is
581 inherently scheduled after the operations in the local stream (including the
582 above "misc_ops").
583 However, for the sake of having a future-proof implementation, we use the
584 misc_ops_done event to record the point in time when the above operations
585 are finished and synchronize with this event in the non-local stream.
586 */
gpu_launch_kernel(NbnxmGpu * nb,const gmx::StepWorkload & stepWork,const Nbnxm::InteractionLocality iloc)587 void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nbnxm::InteractionLocality iloc)
588 {
589 cl_atomdata_t* adat = nb->atdat;
590 NBParamGpu* nbp = nb->nbparam;
591 gpu_plist* plist = nb->plist[iloc];
592 cl_timers_t* t = nb->timers;
593 const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
594
595 bool bDoTime = nb->bDoTime;
596
597 cl_nbparam_params_t nbparams_params;
598
599 /* Don't launch the non-local kernel if there is no work to do.
600 Doing the same for the local kernel is more complicated, since the
601 local part of the force array also depends on the non-local kernel.
602 So to avoid complicating the code and to reduce the risk of bugs,
603 we always call the local kernel and later (not in
604 this function) the stream wait, local f copyback and the f buffer
605 clearing. All these operations, except for the local interaction kernel,
606 are needed for the non-local interactions. The skip of the local kernel
607 call is taken care of later in this function. */
608 if (canSkipNonbondedWork(*nb, iloc))
609 {
610 plist->haveFreshList = false;
611
612 return;
613 }
614
615 if (nbp->useDynamicPruning && plist->haveFreshList)
616 {
617 /* Prunes for rlistOuter and rlistInner, sets plist->haveFreshList=false
618 (that's the way the timing accounting can distinguish between
619 separate prune kernel and combined force+prune).
620 */
621 Nbnxm::gpu_launch_kernel_pruneonly(nb, iloc, 1);
622 }
623
624 if (plist->nsci == 0)
625 {
626 /* Don't launch an empty local kernel (is not allowed with OpenCL).
627 */
628 return;
629 }
630
631 /* beginning of timed nonbonded calculation section */
632 if (bDoTime)
633 {
634 t->interaction[iloc].nb_k.openTimingRegion(deviceStream);
635 }
636
637 /* kernel launch config */
638
639 KernelLaunchConfig config;
640 config.sharedMemorySize = calc_shmem_required_nonbonded(nbp->vdwtype, nb->bPrefetchLjParam);
641 config.blockSize[0] = c_clSize;
642 config.blockSize[1] = c_clSize;
643 config.gridSize[0] = plist->nsci;
644
645 validate_global_work_size(config, 3, &nb->deviceContext_->deviceInfo());
646
647 if (debug)
648 {
649 fprintf(debug,
650 "Non-bonded GPU launch configuration:\n\tLocal work size: %zux%zux%zu\n\t"
651 "Global work size : %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n",
652 config.blockSize[0], config.blockSize[1], config.blockSize[2],
653 config.blockSize[0] * config.gridSize[0], config.blockSize[1] * config.gridSize[1],
654 plist->nsci * c_nbnxnGpuNumClusterPerSupercluster,
655 c_nbnxnGpuNumClusterPerSupercluster, plist->na_c);
656 }
657
658 fillin_ocl_structures(nbp, &nbparams_params);
659
660 auto* timingEvent = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
661 constexpr char kernelName[] = "k_calc_nb";
662 const auto kernel =
663 select_nbnxn_kernel(nb, nbp->eeltype, nbp->vdwtype, stepWork.computeEnergy,
664 (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune));
665
666
667 // The OpenCL kernel takes int as second to last argument because bool is
668 // not supported as a kernel argument type (sizeof(bool) is implementation defined).
669 const int computeFshift = static_cast<int>(stepWork.computeVirial);
670 if (useLjCombRule(nb->nbparam->vdwtype))
671 {
672 const auto kernelArgs = prepareGpuKernelArguments(
673 kernel, config, &nbparams_params, &adat->xq, &adat->f, &adat->e_lj, &adat->e_el,
674 &adat->fshift, &adat->lj_comb, &adat->shift_vec, &nbp->nbfp, &nbp->nbfp_comb,
675 &nbp->coulomb_tab, &plist->sci, &plist->cj4, &plist->excl, &computeFshift);
676
677 launchGpuKernel(kernel, config, deviceStream, timingEvent, kernelName, kernelArgs);
678 }
679 else
680 {
681 const auto kernelArgs = prepareGpuKernelArguments(
682 kernel, config, &adat->ntypes, &nbparams_params, &adat->xq, &adat->f, &adat->e_lj,
683 &adat->e_el, &adat->fshift, &adat->atom_types, &adat->shift_vec, &nbp->nbfp, &nbp->nbfp_comb,
684 &nbp->coulomb_tab, &plist->sci, &plist->cj4, &plist->excl, &computeFshift);
685 launchGpuKernel(kernel, config, deviceStream, timingEvent, kernelName, kernelArgs);
686 }
687
688 if (bDoTime)
689 {
690 t->interaction[iloc].nb_k.closeTimingRegion(deviceStream);
691 }
692 }
693
694
695 /*! \brief Calculates the amount of shared memory required by the prune kernel.
696 *
697 * Note that for the sake of simplicity we use the CUDA terminology "shared memory"
698 * for OpenCL local memory.
699 *
700 * \param[in] num_threads_z cj4 concurrency equal to the number of threads/work items in the 3-rd
701 * dimension. \returns the amount of local memory in bytes required by the pruning kernel
702 */
calc_shmem_required_prune(const int num_threads_z)703 static inline int calc_shmem_required_prune(const int num_threads_z)
704 {
705 int shmem;
706
707 /* i-atom x in shared memory (for convenience we load all 4 components including q) */
708 shmem = c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(float) * 4;
709 /* cj in shared memory, for each warp separately
710 * Note: only need to load once per wavefront, but to keep the code simple,
711 * for now we load twice on AMD.
712 */
713 shmem += num_threads_z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(int);
714 /* Warp vote, requires one uint per warp/32 threads per block. */
715 shmem += sizeof(cl_uint) * 2 * num_threads_z;
716
717 return shmem;
718 }
719
720 /*! \brief
721 * Launch the pairlist prune only kernel for the given locality.
722 * \p numParts tells in how many parts, i.e. calls the list will be pruned.
723 */
gpu_launch_kernel_pruneonly(NbnxmGpu * nb,const InteractionLocality iloc,const int numParts)724 void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts)
725 {
726 cl_atomdata_t* adat = nb->atdat;
727 NBParamGpu* nbp = nb->nbparam;
728 gpu_plist* plist = nb->plist[iloc];
729 cl_timers_t* t = nb->timers;
730 const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
731 bool bDoTime = nb->bDoTime;
732
733 if (plist->haveFreshList)
734 {
735 GMX_ASSERT(numParts == 1, "With first pruning we expect 1 part");
736
737 /* Set rollingPruningNumParts to signal that it is not set */
738 plist->rollingPruningNumParts = 0;
739 plist->rollingPruningPart = 0;
740 }
741 else
742 {
743 if (plist->rollingPruningNumParts == 0)
744 {
745 plist->rollingPruningNumParts = numParts;
746 }
747 else
748 {
749 GMX_ASSERT(numParts == plist->rollingPruningNumParts,
750 "It is not allowed to change numParts in between list generation steps");
751 }
752 }
753
754 /* Use a local variable for part and update in plist, so we can return here
755 * without duplicating the part increment code.
756 */
757 int part = plist->rollingPruningPart;
758
759 plist->rollingPruningPart++;
760 if (plist->rollingPruningPart >= plist->rollingPruningNumParts)
761 {
762 plist->rollingPruningPart = 0;
763 }
764
765 /* Compute the number of list entries to prune in this pass */
766 int numSciInPart = (plist->nsci - part) / numParts;
767
768 /* Don't launch the kernel if there is no work to do. */
769 if (numSciInPart <= 0)
770 {
771 plist->haveFreshList = false;
772
773 return;
774 }
775
776 GpuRegionTimer* timer = nullptr;
777 if (bDoTime)
778 {
779 timer = &(plist->haveFreshList ? t->interaction[iloc].prune_k : t->interaction[iloc].rollingPrune_k);
780 }
781
782 /* beginning of timed prune calculation section */
783 if (bDoTime)
784 {
785 timer->openTimingRegion(deviceStream);
786 }
787
788 /* Kernel launch config:
789 * - The thread block dimensions match the size of i-clusters, j-clusters,
790 * and j-cluster concurrency, in x, y, and z, respectively.
791 * - The 1D block-grid contains as many blocks as super-clusters.
792 */
793 int num_threads_z = c_oclPruneKernelJ4ConcurrencyDEFAULT;
794
795
796 /* kernel launch config */
797 KernelLaunchConfig config;
798 config.sharedMemorySize = calc_shmem_required_prune(num_threads_z);
799 config.blockSize[0] = c_clSize;
800 config.blockSize[1] = c_clSize;
801 config.blockSize[2] = num_threads_z;
802 config.gridSize[0] = numSciInPart;
803
804 validate_global_work_size(config, 3, &nb->deviceContext_->deviceInfo());
805
806 if (debug)
807 {
808 fprintf(debug,
809 "Pruning GPU kernel launch configuration:\n\tLocal work size: %zux%zux%zu\n\t"
810 "\tGlobal work size: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n"
811 "\tShMem: %zu\n",
812 config.blockSize[0], config.blockSize[1], config.blockSize[2],
813 config.blockSize[0] * config.gridSize[0], config.blockSize[1] * config.gridSize[1],
814 plist->nsci * c_nbnxnGpuNumClusterPerSupercluster,
815 c_nbnxnGpuNumClusterPerSupercluster, plist->na_c, config.sharedMemorySize);
816 }
817
818 cl_nbparam_params_t nbparams_params;
819 fillin_ocl_structures(nbp, &nbparams_params);
820
821 auto* timingEvent = bDoTime ? timer->fetchNextEvent() : nullptr;
822 constexpr char kernelName[] = "k_pruneonly";
823 const auto pruneKernel = selectPruneKernel(nb->kernel_pruneonly, plist->haveFreshList);
824 const auto kernelArgs = prepareGpuKernelArguments(pruneKernel, config, &nbparams_params,
825 &adat->xq, &adat->shift_vec, &plist->sci,
826 &plist->cj4, &plist->imask, &numParts, &part);
827 launchGpuKernel(pruneKernel, config, deviceStream, timingEvent, kernelName, kernelArgs);
828
829 if (plist->haveFreshList)
830 {
831 plist->haveFreshList = false;
832 /* Mark that pruning has been done */
833 nb->timers->interaction[iloc].didPrune = true;
834 }
835 else
836 {
837 /* Mark that rolling pruning has been done */
838 nb->timers->interaction[iloc].didRollingPrune = true;
839 }
840
841 if (bDoTime)
842 {
843 timer->closeTimingRegion(deviceStream);
844 }
845 }
846
847 /*! \brief
848 * Launch asynchronously the download of nonbonded forces from the GPU
849 * (and energies/shift forces if required).
850 */
gpu_launch_cpyback(NbnxmGpu * nb,struct nbnxn_atomdata_t * nbatom,const gmx::StepWorkload & stepWork,const AtomLocality aloc)851 void gpu_launch_cpyback(NbnxmGpu* nb,
852 struct nbnxn_atomdata_t* nbatom,
853 const gmx::StepWorkload& stepWork,
854 const AtomLocality aloc)
855 {
856 GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
857
858 cl_int gmx_unused cl_error;
859 int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
860
861 /* determine interaction locality from atom locality */
862 const InteractionLocality iloc = gpuAtomToInteractionLocality(aloc);
863
864 cl_atomdata_t* adat = nb->atdat;
865 cl_timers_t* t = nb->timers;
866 bool bDoTime = nb->bDoTime;
867 const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
868
869 /* don't launch non-local copy-back if there was no non-local work to do */
870 if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
871 {
872 /* TODO An alternative way to signal that non-local work is
873 complete is to use a clEnqueueMarker+clEnqueueBarrier
874 pair. However, the use of bNonLocalStreamActive has the
875 advantage of being local to the host, so probably minimizes
876 overhead. Curiously, for NVIDIA OpenCL with an empty-domain
877 test case, overall simulation performance was higher with
878 the API calls, but this has not been tested on AMD OpenCL,
879 so could be worth considering in future. */
880 nb->bNonLocalStreamActive = CL_FALSE;
881 return;
882 }
883
884 getGpuAtomRange(adat, aloc, &adat_begin, &adat_len);
885
886 /* beginning of timed D2H section */
887 if (bDoTime)
888 {
889 t->xf[aloc].nb_d2h.openTimingRegion(deviceStream);
890 }
891
892 /* With DD the local D2H transfer can only start after the non-local
893 has been launched. */
894 if (iloc == InteractionLocality::Local && nb->bNonLocalStreamActive)
895 {
896 sync_ocl_event(deviceStream.stream(), &(nb->nonlocal_done));
897 }
898
899 /* DtoH f */
900 GMX_ASSERT(sizeof(*nbatom->out[0].f.data()) == sizeof(float),
901 "The host force buffer should be in single precision to match device data size.");
902 copyFromDeviceBuffer(&nbatom->out[0].f[adat_begin * DIM], &adat->f, adat_begin * DIM,
903 adat_len * DIM, deviceStream, GpuApiCallBehavior::Async,
904 bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
905
906 /* kick off work */
907 cl_error = clFlush(deviceStream.stream());
908 GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
909
910 /* After the non-local D2H is launched the nonlocal_done event can be
911 recorded which signals that the local D2H can proceed. This event is not
912 placed after the non-local kernel because we first need the non-local
913 data back first. */
914 if (iloc == InteractionLocality::NonLocal)
915 {
916 cl_error = clEnqueueMarkerWithWaitList(deviceStream.stream(), 0, nullptr, &(nb->nonlocal_done));
917 GMX_ASSERT(cl_error == CL_SUCCESS,
918 ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str());
919 nb->bNonLocalStreamActive = CL_TRUE;
920 }
921
922 /* only transfer energies in the local stream */
923 if (iloc == InteractionLocality::Local)
924 {
925 /* DtoH fshift when virial is needed */
926 if (stepWork.computeVirial)
927 {
928 GMX_ASSERT(sizeof(*nb->nbst.fshift) == DIM * sizeof(float),
929 "Sizes of host- and device-side shift vector elements should be the same.");
930 copyFromDeviceBuffer(reinterpret_cast<float*>(nb->nbst.fshift), &adat->fshift, 0,
931 SHIFTS * DIM, deviceStream, GpuApiCallBehavior::Async,
932 bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
933 }
934
935 /* DtoH energies */
936 if (stepWork.computeEnergy)
937 {
938 GMX_ASSERT(sizeof(*nb->nbst.e_lj) == sizeof(float),
939 "Sizes of host- and device-side LJ energy terms should be the same.");
940 copyFromDeviceBuffer(nb->nbst.e_lj, &adat->e_lj, 0, 1, deviceStream, GpuApiCallBehavior::Async,
941 bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
942 GMX_ASSERT(sizeof(*nb->nbst.e_el) == sizeof(float),
943 "Sizes of host- and device-side electrostatic energy terms should be the "
944 "same.");
945 copyFromDeviceBuffer(nb->nbst.e_el, &adat->e_el, 0, 1, deviceStream, GpuApiCallBehavior::Async,
946 bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
947 }
948 }
949
950 if (bDoTime)
951 {
952 t->xf[aloc].nb_d2h.closeTimingRegion(deviceStream);
953 }
954 }
955
956 } // namespace Nbnxm
957