1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2011-2019,2020,2021, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 /*! \internal \file
38  *
39  * \brief Implements the integrator for normal molecular dynamics simulations
40  *
41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
42  * \ingroup module_mdrun
43  */
44 #include "gmxpre.h"
45 
46 #include <cinttypes>
47 #include <cmath>
48 #include <cstdio>
49 #include <cstdlib>
50 
51 #include <algorithm>
52 #include <memory>
53 #include <numeric>
54 
55 #include "gromacs/applied_forces/awh/awh.h"
56 #include "gromacs/commandline/filenm.h"
57 #include "gromacs/domdec/collect.h"
58 #include "gromacs/domdec/dlbtiming.h"
59 #include "gromacs/domdec/domdec.h"
60 #include "gromacs/domdec/domdec_network.h"
61 #include "gromacs/domdec/domdec_struct.h"
62 #include "gromacs/domdec/gpuhaloexchange.h"
63 #include "gromacs/domdec/mdsetup.h"
64 #include "gromacs/domdec/partition.h"
65 #include "gromacs/essentialdynamics/edsam.h"
66 #include "gromacs/ewald/pme_load_balancing.h"
67 #include "gromacs/ewald/pme_pp.h"
68 #include "gromacs/fileio/trxio.h"
69 #include "gromacs/gmxlib/network.h"
70 #include "gromacs/gmxlib/nrnb.h"
71 #include "gromacs/gpu_utils/device_stream_manager.h"
72 #include "gromacs/gpu_utils/gpu_utils.h"
73 #include "gromacs/math/units.h"
74 #include "gromacs/imd/imd.h"
75 #include "gromacs/listed_forces/listed_forces.h"
76 #include "gromacs/math/functions.h"
77 #include "gromacs/math/invertmatrix.h"
78 #include "gromacs/math/vec.h"
79 #include "gromacs/math/vectypes.h"
80 #include "gromacs/mdlib/checkpointhandler.h"
81 #include "gromacs/mdlib/compute_io.h"
82 #include "gromacs/mdlib/constr.h"
83 #include "gromacs/mdlib/coupling.h"
84 #include "gromacs/mdlib/ebin.h"
85 #include "gromacs/mdlib/enerdata_utils.h"
86 #include "gromacs/mdlib/energyoutput.h"
87 #include "gromacs/mdlib/expanded.h"
88 #include "gromacs/mdlib/force.h"
89 #include "gromacs/mdlib/force_flags.h"
90 #include "gromacs/mdlib/forcerec.h"
91 #include "gromacs/mdlib/freeenergyparameters.h"
92 #include "gromacs/mdlib/md_support.h"
93 #include "gromacs/mdlib/mdatoms.h"
94 #include "gromacs/mdlib/mdoutf.h"
95 #include "gromacs/mdlib/membed.h"
96 #include "gromacs/mdlib/resethandler.h"
97 #include "gromacs/mdlib/sighandler.h"
98 #include "gromacs/mdlib/simulationsignal.h"
99 #include "gromacs/mdlib/stat.h"
100 #include "gromacs/mdlib/stophandler.h"
101 #include "gromacs/mdlib/tgroup.h"
102 #include "gromacs/mdlib/trajectory_writing.h"
103 #include "gromacs/mdlib/update.h"
104 #include "gromacs/mdlib/update_constrain_gpu.h"
105 #include "gromacs/mdlib/vcm.h"
106 #include "gromacs/mdlib/vsite.h"
107 #include "gromacs/mdrunutility/handlerestart.h"
108 #include "gromacs/mdrunutility/multisim.h"
109 #include "gromacs/mdrunutility/printtime.h"
110 #include "gromacs/mdtypes/awh_history.h"
111 #include "gromacs/mdtypes/awh_params.h"
112 #include "gromacs/mdtypes/commrec.h"
113 #include "gromacs/mdtypes/df_history.h"
114 #include "gromacs/mdtypes/energyhistory.h"
115 #include "gromacs/mdtypes/fcdata.h"
116 #include "gromacs/mdtypes/forcebuffers.h"
117 #include "gromacs/mdtypes/forcerec.h"
118 #include "gromacs/mdtypes/group.h"
119 #include "gromacs/mdtypes/inputrec.h"
120 #include "gromacs/mdtypes/interaction_const.h"
121 #include "gromacs/mdtypes/md_enums.h"
122 #include "gromacs/mdtypes/mdatom.h"
123 #include "gromacs/mdtypes/mdrunoptions.h"
124 #include "gromacs/mdtypes/multipletimestepping.h"
125 #include "gromacs/mdtypes/observableshistory.h"
126 #include "gromacs/mdtypes/pullhistory.h"
127 #include "gromacs/mdtypes/simulation_workload.h"
128 #include "gromacs/mdtypes/state.h"
129 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
130 #include "gromacs/modularsimulator/energydata.h"
131 #include "gromacs/nbnxm/gpu_data_mgmt.h"
132 #include "gromacs/nbnxm/nbnxm.h"
133 #include "gromacs/pbcutil/pbc.h"
134 #include "gromacs/pulling/output.h"
135 #include "gromacs/pulling/pull.h"
136 #include "gromacs/swap/swapcoords.h"
137 #include "gromacs/timing/wallcycle.h"
138 #include "gromacs/timing/walltime_accounting.h"
139 #include "gromacs/topology/atoms.h"
140 #include "gromacs/topology/idef.h"
141 #include "gromacs/topology/mtop_util.h"
142 #include "gromacs/topology/topology.h"
143 #include "gromacs/trajectory/trajectoryframe.h"
144 #include "gromacs/utility/basedefinitions.h"
145 #include "gromacs/utility/cstringutil.h"
146 #include "gromacs/utility/fatalerror.h"
147 #include "gromacs/utility/logger.h"
148 #include "gromacs/utility/real.h"
149 #include "gromacs/utility/smalloc.h"
150 
151 #include "legacysimulator.h"
152 #include "replicaexchange.h"
153 #include "shellfc.h"
154 
155 /* PLUMED */
156 #include "../../../Plumed.h"
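/* plumedswitch is non-zero when PLUMED was requested for this run, and plumedmain is the
   global PLUMED handle used for all plumed_cmd() calls; both are defined elsewhere in the
   patched mdrun setup code. */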
157 extern int    plumedswitch;
158 extern plumed plumedmain;
159 /* END PLUMED */
160 
161 /* PLUMED HREX */
162 extern int plumed_hrex;
163 /* END PLUMED HREX */
164 
165 using gmx::SimulationSignaller;
166 
167 void gmx::LegacySimulator::do_md()
168 {
169     // TODO Historically, the EM and MD "integrators" used different
170     // names for the t_inputrec *parameter, but these must have the
171     // same name, now that it's a member of a struct. We use this ir
172     // alias to avoid a large ripple of nearly useless changes.
173     // t_inputrec is being replaced by IMdpOptionsProvider, so this
174     // will go away eventually.
175     t_inputrec*  ir = inputrec;
176     int64_t      step, step_rel;
177     double       t, t0 = ir->init_t;
178     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
179     gmx_bool     bNS = FALSE, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
180     gmx_bool     bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
181     gmx_bool     do_ene, do_log, do_verbose;
182     gmx_bool     bMasterState;
183     unsigned int force_flags;
184     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
185     int    i, m;
186     rvec   mu_tot;
187     matrix pressureCouplingMu, M;
188     gmx_repl_ex_t     repl_ex = nullptr;
189     gmx_global_stat_t gstat;
190     gmx_shellfc_t*    shellfc;
191     gmx_bool          bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
192     gmx_bool          bTemp, bPres, bTrotter;
193     real              dvdl_constr;
194     std::vector<RVec> cbuf;
195     matrix            lastbox;
196     int               lamnew = 0;
197     /* for FEP */
198     int       nstfep = 0;
199     double    cycles;
200     real      saved_conserved_quantity = 0;
201     real      last_ekin                = 0;
202     t_extmass MassQ;
203     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
204 
205     /* PME load balancing data for GPU kernels */
206     gmx_bool bPMETune         = FALSE;
207     gmx_bool bPMETunePrinting = FALSE;
208 
209     bool bInteractiveMDstep = false;
210 
211     /* PLUMED */
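    /* plumedNeedsEnergy : set per step when the active PLUMED actions need the potential energy
       plumedWantsToStop : set by PLUMED through "setStopFlag" to request an early stop
       plumed_vir        : virial contribution accumulated by PLUMED
       lambdaForce       : bias force PLUMED returns on the "lambda" extra CV
       realFepState      : real-valued copy of the FEP state index exposed to PLUMED */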
212     int plumedNeedsEnergy=0;
213     int plumedWantsToStop=0;
214     matrix plumed_vir;
215     real lambdaForce=0;
216     real realFepState=0;
217     /* END PLUMED */
218 
219     /* Domain decomposition could incorrectly miss a bonded
220        interaction, but checking for that requires a global
221        communication stage, which does not otherwise happen in DD
222        code. So we do that alongside the first global energy reduction
223        after a new DD is made. These variables handle whether the
224        check happens, and the result it returns. */
225     bool shouldCheckNumberOfBondedInteractions = false;
226     int  totalNumberOfBondedInteractions       = -1;
227 
228     SimulationSignals signals;
229     // Most global communication stages don't propagate mdrun
230     // signals, and will use this object to achieve that.
231     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
232 
233     if (!mdrunOptions.writeConfout)
234     {
235         // This is on by default, and the main known use case for
236         // turning it off is for convenience in benchmarking, which is
237         // something that should not show up in the general user
238         // interface.
239         GMX_LOG(mdlog.info)
240                 .asParagraph()
241                 .appendText(
242                         "The -noconfout functionality is deprecated, and may be removed in a "
243                         "future version.");
244     }
245 
246     /* md-vv uses averaged full step velocities for T-control
247        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
248        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
249     bTrotter = (EI_VV(ir->eI)
250                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
251 
252     const bool bRerunMD = false;
253 
254     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
255     bGStatEveryStep   = (nstglobalcomm == 1);
256 
257     const SimulationGroups* groups = &top_global->groups;
258 
259     std::unique_ptr<EssentialDynamics> ed = nullptr;
260     if (opt2bSet("-ei", nfile, fnm))
261     {
262         /* Initialize essential dynamics sampling */
263         ed = init_edsam(mdlog, opt2fn_null("-ei", nfile, fnm), opt2fn("-eo", nfile, fnm), top_global,
264                         ir, cr, constr, state_global, observablesHistory, oenv, startingBehavior);
265     }
266     else if (observablesHistory->edsamHistory)
267     {
268         gmx_fatal(FARGS,
269                   "The checkpoint is from a run with essential dynamics sampling, "
270                   "but the current run did not specify the -ei option. "
271                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
272     }
273 
274     int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
275     gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
276     initialize_lambdas(fplog, *ir, MASTER(cr), fep_state, lambda);
277     Update     upd(*ir, deform);
278     const bool doSimulatedAnnealing = initSimulatedAnnealing(ir, &upd);
279     const bool useReplicaExchange   = (replExParams.exchangeInterval > 0);
280 
281     const t_fcdata& fcdata = *fr->fcdata;
282 
283     bool simulationsShareState = false;
284     int  nstSignalComm         = nstglobalcomm;
285     {
286         // TODO This implementation of ensemble orientation restraints is nasty because
287         // a user can't just do multi-sim with single-sim orientation restraints.
288         bool usingEnsembleRestraints =
289                 (fcdata.disres->nsystems > 1) || ((ms != nullptr) && (fcdata.orires->nr != 0));
290         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim && (ms != nullptr));
291 
292         // Replica exchange, ensemble restraints and AWH need all
293         // simulations to remain synchronized, so they need
294         // checkpoints and stop conditions to act on the same step, so
295         // the propagation of such signals must take place between
296         // simulations, not just within simulations.
297         // TODO: Make algorithm initializers set these flags.
298         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim || (plumedswitch && ms); // PLUMED hack: with multiple simulations and PLUMED we usually want them to stay in sync
299 
300         if (simulationsShareState)
301         {
302             // Inter-simulation signal communication does not need to happen
303             // often, so we use a minimum of 200 steps to reduce overhead.
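            // The interval is rounded up to a multiple of nstglobalcomm so that
            // inter-simulation signalling always coincides with a global communication step.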
304             const int c_minimumInterSimulationSignallingInterval = 200;
305             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
306                             * nstglobalcomm;
307         }
308     }
309 
310     if (startingBehavior != StartingBehavior::RestartWithAppending)
311     {
312         pleaseCiteCouplingAlgorithms(fplog, *ir);
313     }
314     gmx_mdoutf* outf =
315             init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier, ir,
316                         top_global, oenv, wcycle, startingBehavior, simulationsShareState, ms);
317     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, ir, pull_work,
318                                    mdoutf_get_fp_dhdl(outf), false, startingBehavior,
319                                    simulationsShareState, mdModulesNotifier);
320 
321     gstat = global_stat_init(ir);
322 
323     const auto& simulationWork     = runScheduleWork->simulationWork;
324     const bool  useGpuForPme       = simulationWork.useGpuPme;
325     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
326     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
327     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
328 
329     /* Check for polarizable models and flexible constraints */
330     shellfc = init_shell_flexcon(fplog, top_global, constr ? constr->numFlexibleConstraints() : 0,
331                                  ir->nstcalcenergy, DOMAINDECOMP(cr), useGpuForPme);
332 
333     {
334         double io = compute_io(ir, top_global->natoms, *groups, energyOutput.numEnergyTerms(), 1);
335         if ((io > 2000) && MASTER(cr))
336         {
337             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
338         }
339     }
340 
341     // Local state only becomes valid now.
342     std::unique_ptr<t_state> stateInstance;
343     t_state*                 state;
344 
345     gmx_localtop_t top(top_global->ffparams);
346 
347     auto mdatoms = mdAtoms->mdatoms();
348 
349     ForceBuffers f(fr->useMts, ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
350                                        ? PinningPolicy::PinnedIfSupported
351                                        : PinningPolicy::CannotBePinned);
352     if (DOMAINDECOMP(cr))
353     {
354         stateInstance = std::make_unique<t_state>();
355         state         = stateInstance.get();
356         dd_init_local_state(cr->dd, state_global, state);
357 
358         /* Distribute the charge groups over the nodes from the master node */
359         dd_partition_system(fplog, mdlog, ir->init_step, cr, TRUE, 1, state_global, *top_global, ir,
360                             imdSession, pull_work, state, &f, mdAtoms, &top, fr, vsite, constr,
361                             nrnb, nullptr, FALSE);
362         shouldCheckNumberOfBondedInteractions = true;
363         upd.setNumAtoms(state->natoms);
364     }
365     else
366     {
367         state_change_natoms(state_global, state_global->natoms);
368         /* Copy the pointer to the global state */
369         state = state_global;
370 
371         /* Generate and initialize new topology */
372         mdAlgorithmsSetupAtomData(cr, ir, *top_global, &top, fr, &f, mdAtoms, constr, vsite, shellfc);
373 
374         upd.setNumAtoms(state->natoms);
375     }
376 
377     std::unique_ptr<UpdateConstrainGpu> integrator;
378 
379     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
380 
381     // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
382     if (useGpuForUpdate)
383     {
384         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
385                                    || constr->numConstraintsTotal() == 0,
386                            "Constraints in domain decomposition are only supported with update "
387                            "groups if using GPU update.\n");
388         GMX_RELEASE_ASSERT(ir->eConstrAlg != econtSHAKE || constr == nullptr
389                                    || constr->numConstraintsTotal() == 0,
390                            "SHAKE is not supported with GPU update.");
391         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
392                            "Either PME or short-ranged non-bonded interaction tasks must run on "
393                            "the GPU to use GPU update.\n");
394         GMX_RELEASE_ASSERT(ir->eI == eiMD,
395                            "Only the md integrator is supported with the GPU update.\n");
396         GMX_RELEASE_ASSERT(
397                 ir->etc != etcNOSEHOOVER,
398                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
399         GMX_RELEASE_ASSERT(
400                 ir->epc == epcNO || ir->epc == epcPARRINELLORAHMAN || ir->epc == epcBERENDSEN
401                         || ir->epc == epcCRESCALE,
402                 "Only Parrinello-Rahman, Berendsen, and C-rescale pressure coupling are supported "
403                 "with the GPU update.\n");
404         GMX_RELEASE_ASSERT(!mdatoms->haveVsites,
405                            "Virtual sites are not supported with the GPU update.\n");
406         GMX_RELEASE_ASSERT(ed == nullptr,
407                            "Essential dynamics is not supported with the GPU update.\n");
408         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(*ir->pull),
409                            "Constraints pulling is not supported with the GPU update.\n");
410         GMX_RELEASE_ASSERT(fcdata.orires->nr == 0,
411                            "Orientation restraints are not supported with the GPU update.\n");
412         GMX_RELEASE_ASSERT(
413                 ir->efep == efepNO
414                         || (!haveFepPerturbedMasses(*top_global) && !havePerturbedConstraints(*top_global)),
415                 "Free energy perturbation of masses and constraints are not supported with the GPU "
416                 "update.");
417 
418         if (constr != nullptr && constr->numConstraintsTotal() > 0)
419         {
420             GMX_LOG(mdlog.info)
421                     .asParagraph()
422                     .appendText("Updating coordinates and applying constraints on the GPU.");
423         }
424         else
425         {
426             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
427         }
428         GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
429                            "Device stream manager should be initialized in order to use GPU "
430                            "update-constraints.");
431         GMX_RELEASE_ASSERT(
432                 fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
433                 "Update stream should be initialized in order to use GPU "
434                 "update-constraints.");
435         integrator = std::make_unique<UpdateConstrainGpu>(
436                 *ir, *top_global, fr->deviceStreamManager->context(),
437                 fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
438                 stateGpu->xUpdatedOnDevice(), wcycle);
439 
440         integrator->setPbc(PbcType::Xyz, state->box);
441     }
442 
443     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
444     {
445         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
446     }
447     if (useGpuForUpdate)
448     {
449         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
450     }
451 
452     // NOTE: The global state is no longer used at this point.
453     // But state_global is still used as temporary storage space for writing
454     // the global state to file and potentially for replica exchange.
455     // (Global topology should persist.)
456 
457     update_mdatoms(mdatoms, state->lambda[efptMASS]);
458 
459     if (ir->bExpanded)
460     {
461         /* Check nstexpanded here, because the grompp check was broken */
462         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
463         {
464             gmx_fatal(FARGS,
465                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
466         }
467         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
468     }
469 
470     if (MASTER(cr))
471     {
472         EnergyData::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
473     }
474 
475     preparePrevStepPullCom(ir, pull_work, mdatoms->massT, state, state_global, cr,
476                            startingBehavior != StartingBehavior::NewSimulation);
477 
478     // TODO: Remove this by converting AWH into a ForceProvider
479     auto awh = prepareAwhModule(fplog, *ir, state_global, cr, ms,
480                                 startingBehavior != StartingBehavior::NewSimulation,
481                                 shellfc != nullptr, opt2fn("-awh", nfile, fnm), pull_work);
482 
483     if (useReplicaExchange && MASTER(cr))
484     {
485         repl_ex = init_replica_exchange(fplog, ms, top_global->natoms, ir, replExParams);
486     }
487     /* PME tuning is only supported in the Verlet scheme, with PME for
488      * Coulomb. It is not supported with only LJ PME. */
489     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
490                 && ir->cutoff_scheme != ecutsGROUP);
491 
492     pme_load_balancing_t* pme_loadbal = nullptr;
493     if (bPMETune)
494     {
495         pme_loadbal_init(&pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata,
496                          fr->nbv->useGpu());
497     }
498 
499     if (!ir->bContinuation)
500     {
501         if (state->flags & (1U << estV))
502         {
503             auto v = makeArrayRef(state->v);
504             /* Set the velocities of vsites, shells and frozen atoms to zero */
505             for (i = 0; i < mdatoms->homenr; i++)
506             {
507                 if (mdatoms->ptype[i] == eptVSite || mdatoms->ptype[i] == eptShell)
508                 {
509                     clear_rvec(v[i]);
510                 }
511                 else if (mdatoms->cFREEZE)
512                 {
513                     for (m = 0; m < DIM; m++)
514                     {
515                         if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
516                         {
517                             v[i][m] = 0;
518                         }
519                     }
520                 }
521             }
522         }
523 
524         if (constr)
525         {
526             /* Constrain the initial coordinates and velocities */
527             do_constrain_first(fplog, constr, ir, mdatoms->nr, mdatoms->homenr,
528                                state->x.arrayRefWithPadding(), state->v.arrayRefWithPadding(),
529                                state->box, state->lambda[efptBONDED]);
530         }
531         if (vsite)
532         {
533             /* Construct the virtual sites for the initial configuration */
534             vsite->construct(state->x, ir->delta_t, {}, state->box);
535         }
536     }
537 
538     if (ir->efep != efepNO)
539     {
540         /* Set free energy calculation frequency as the greatest common
541          * divisor of nstdhdl, nstexpanded, repl_ex_nst and the AWH sampling interval. */
542         nstfep = ir->fepvals->nstdhdl;
543         if (ir->bExpanded)
544         {
545             nstfep = std::gcd(ir->expandedvals->nstexpanded, nstfep);
546         }
547         if (useReplicaExchange)
548         {
549             nstfep = std::gcd(replExParams.exchangeInterval, nstfep);
550         }
551         if (ir->bDoAwh)
552         {
553             nstfep = std::gcd(ir->awhParams->nstSampleCoord, nstfep);
554         }
555     }
556 
557     /* Be REALLY careful about what flags you set here. You CANNOT assume
558      * this is the first step, since we might be restarting from a checkpoint,
559      * and in that case we should not do any modifications to the state.
560      */
561     bStopCM = (ir->comm_mode != ecmNO && !ir->bContinuation);
562 
563     // When restarting from a checkpoint, it can be appropriate to
564     // initialize ekind from quantities in the checkpoint. Otherwise,
565     // compute_globals must initialize ekind before the simulation
566     // starts/restarts. However, only the master rank knows what was
567     // found in the checkpoint file, so we have to communicate in
568     // order to coordinate the restart.
569     //
570     // TODO Consider removing this communication if/when checkpoint
571     // reading directly follows .tpr reading, because all ranks can
572     // agree on hasReadEkinState at that time.
573     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
574     if (PAR(cr))
575     {
576         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
577     }
578     if (hasReadEkinState)
579     {
580         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
581     }
582 
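    // Flags for the initial compute_globals call(s): always compute the temperature;
    // with velocity Verlet also the pressure and constraint virial; reuse the kinetic
    // energy read from the checkpoint when one was found.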
583     unsigned int cglo_flags =
584             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
585              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
586 
587     bSumEkinhOld = FALSE;
588 
589     t_vcm vcm(top_global->groups, *ir);
590     reportComRemovalInfo(fplog, vcm);
591 
592     /* To minimize communication, compute_globals computes the COM velocity
593      * and the kinetic energy for the velocities without COM motion removed.
594      * Thus to get the kinetic energy without the COM contribution, we need
595      * to call compute_globals twice.
596      */
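    // With COM motion removal, the first pass stops the COM motion and skips the temperature;
    // the second pass then computes the kinetic energy from the COM-free velocities.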
597     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
598     {
599         unsigned int cglo_flags_iteration = cglo_flags;
600         if (bStopCM && cgloIteration == 0)
601         {
602             cglo_flags_iteration |= CGLO_STOPCM;
603             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
604         }
605         compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
606                         makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, nullptr,
607                         enerd, force_vir, shake_vir, total_vir, pres, constr, &nullSignaller,
608                         state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
609                         cglo_flags_iteration
610                                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
611                                                                          : 0));
612         if (cglo_flags_iteration & CGLO_STOPCM)
613         {
614             /* At initialization, do not pass x with acceleration-correction mode
615              * to avoid (incorrect) correction of the initial coordinates.
616              */
617             auto x = (vcm.mode == ecmLINEAR_ACCELERATION_CORRECTION) ? ArrayRef<RVec>()
618                                                                      : makeArrayRef(state->x);
619             process_and_stopcm_grp(fplog, &vcm, *mdatoms, x, makeArrayRef(state->v));
620             inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
621         }
622     }
623     checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, top_global, &top,
624                                     makeConstArrayRef(state->x), state->box,
625                                     &shouldCheckNumberOfBondedInteractions);
626     if (ir->eI == eiVVAK)
627     {
628         /* a second call to get the half step temperature initialized as well */
629         /* we do the same call as above, but turn the pressure off -- internally to
630            compute_globals, this is recognized as a velocity verlet half-step
631            kinetic energy calculation.  This minimizes excess variables, but
632            perhaps loses some logic?*/
633 
634         compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
635                         makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, nullptr,
636                         enerd, force_vir, shake_vir, total_vir, pres, constr, &nullSignaller,
637                         state->box, nullptr, &bSumEkinhOld, cglo_flags & ~CGLO_PRESSURE);
638     }
639 
640     /* Calculate the initial half step temperature, and save the ekinh_old */
641     if (startingBehavior == StartingBehavior::NewSimulation)
642     {
643         for (i = 0; (i < ir->opts.ngtc); i++)
644         {
645             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
646         }
647     }
648 
649     /* Need to make an initialization call to get the Trotter variables set, as well as other constants
650        for non-Trotter temperature control */
651     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
652 
653     if (MASTER(cr))
654     {
655         if (!ir->bContinuation)
656         {
657             if (constr && ir->eConstrAlg == econtLINCS)
658             {
659                 fprintf(fplog, "RMS relative constraint deviation after constraining: %.2e\n",
660                         constr->rmsd());
661             }
662             if (EI_STATE_VELOCITY(ir->eI))
663             {
664                 real temp = enerd->term[F_TEMP];
665                 if (ir->eI != eiVV)
666                 {
667                     /* Result of Ekin averaged over velocities of -half
668                      * and +half step, while we only have -half step here.
669                      */
670                     temp *= 2;
671                 }
672                 fprintf(fplog, "Initial temperature: %g K\n", temp);
673             }
674         }
675 
676         char tbuf[20];
677         fprintf(stderr, "starting mdrun '%s'\n", *(top_global->name));
678         if (ir->nsteps >= 0)
679         {
680             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
681         }
682         else
683         {
684             sprintf(tbuf, "%s", "infinite");
685         }
686         if (ir->init_step > 0)
687         {
688             fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
689                     gmx_step_str(ir->init_step + ir->nsteps, sbuf), tbuf,
690                     gmx_step_str(ir->init_step, sbuf2), ir->init_step * ir->delta_t);
691         }
692         else
693         {
694             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
695         }
696         fprintf(fplog, "\n");
697     }
698 
699     /* PLUMED */
700     if(plumedswitch){
701       /* detect plumed API version */
702       int pversion=0;
703       plumed_cmd(plumedmain,"getApiVersion",&pversion);
704       /* setting kbT is only implemented with api>1 */
705       real kbT=ir->opts.ref_t[0]*BOLTZ;
706       if(pversion>1) plumed_cmd(plumedmain,"setKbT",&kbT);
707       if(pversion>2){
708         int res=1;
709         if( (startingBehavior != StartingBehavior::NewSimulation) ) plumed_cmd(plumedmain,"setRestart",&res);
710       }
711 
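      /* With more than one simulation, hand PLUMED the inter-replica (masters) communicator
         and an intra-replica communicator so that its GREX (replica exchange) module can
         coordinate the replicas. */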
712       if(ms && ms->numSimulations_>1) {
713         if(MASTER(cr)) plumed_cmd(plumedmain,"GREX setMPIIntercomm",&ms->mastersComm_);
714         if(PAR(cr)){
715           if(DOMAINDECOMP(cr)) {
716             plumed_cmd(plumedmain,"GREX setMPIIntracomm",&cr->dd->mpi_comm_all);
717           }else{
718             plumed_cmd(plumedmain,"GREX setMPIIntracomm",&cr->mpi_comm_mysim);
719           }
720         }
721         plumed_cmd(plumedmain,"GREX init",nullptr);
722       }
723       if(PAR(cr)){
724         if(DOMAINDECOMP(cr)) {
725           plumed_cmd(plumedmain,"setMPIComm",&cr->dd->mpi_comm_all);
726         }
727       }
728       plumed_cmd(plumedmain,"setNatoms",&top_global->natoms);
729       plumed_cmd(plumedmain,"setMDEngine","gromacs");
730       plumed_cmd(plumedmain,"setLog",fplog);
731       real real_delta_t=ir->delta_t;
732       plumed_cmd(plumedmain,"setTimestep",&real_delta_t);
733       plumed_cmd(plumedmain,"init",nullptr);
734 
735       if(PAR(cr)){
736         if(DOMAINDECOMP(cr)) {
737           int nat_home = dd_numHomeAtoms(*cr->dd);
738           plumed_cmd(plumedmain,"setAtomsNlocal",&nat_home);
739           plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->globalAtomIndices.data());
740         }
741       }
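      /* Expose the current FEP state to PLUMED as an extra collective variable named "lambda",
         and register lambdaForce as the buffer through which PLUMED returns a bias force on it. */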
742       realFepState = state->fep_state;
743       plumed_cmd(plumedmain, "setExtraCV lambda", &realFepState);
744       plumed_cmd(plumedmain, "setExtraCVForce lambda", &lambdaForce);
745     }
746     /* END PLUMED */
747 
748     walltime_accounting_start_time(walltime_accounting);
749     wallcycle_start(wcycle, ewcRUN);
750     print_start(fplog, cr, walltime_accounting, "mdrun");
751 
752     /***********************************************************
753      *
754      *             Loop over MD steps
755      *
756      ************************************************************/
757 
758     bFirstStep = TRUE;
759     /* Skip the first Nose-Hoover integration when we get the state from tpx */
760     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
761     bSumEkinhOld     = FALSE;
762     bExchanged       = FALSE;
763     bNeedRepartition = FALSE;
764 
765     step     = ir->init_step;
766     step_rel = 0;
767 
768     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
769             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]), simulationsShareState,
770             MASTER(cr), ir->nstlist, mdrunOptions.reproducible, nstSignalComm,
771             mdrunOptions.maximumHoursToRun, ir->nstlist == 0, fplog, step, bNS, walltime_accounting);
772 
773     auto checkpointHandler = std::make_unique<CheckpointHandler>(
774             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]), simulationsShareState,
775             ir->nstlist == 0, MASTER(cr), mdrunOptions.writeConfout,
776             mdrunOptions.checkpointOptions.period);
777 
778     const bool resetCountersIsLocal = true;
779     auto       resetHandler         = std::make_unique<ResetHandler>(
780             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
781             !resetCountersIsLocal, ir->nsteps, MASTER(cr), mdrunOptions.timingOptions.resetHalfway,
782             mdrunOptions.maximumHoursToRun, mdlog, wcycle, walltime_accounting);
783 
784     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
785 
786     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
787     {
788         logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
789     }
790 
791     /* and stop now if we should */
792     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
793     while (!bLastStep)
794     {
795 
796         /* Determine if this is a neighbor search step */
797         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
798 
799         if (bPMETune && bNStList)
800         {
801             // This has to be here because PME load balancing is called so early.
802             // TODO: Move to after all booleans are defined.
803             if (useGpuForUpdate && !bFirstStep)
804             {
805                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
806                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
807             }
808             /* PME grid + cut-off optimization with GPUs or PME nodes */
809             pme_loadbal_do(pme_loadbal, cr, (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
810                            fplog, mdlog, *ir, fr, state->box, state->x, wcycle, step, step_rel,
811                            &bPMETunePrinting, simulationWork.useGpuPmePpCommunication);
812         }
813 
814         wallcycle_start(wcycle, ewcSTEP);
815 
816         bLastStep = (step_rel == ir->nsteps);
817         t         = t0 + step * ir->delta_t;
818 
819         // TODO Refactor this, so that nstfep does not need a default value of zero
820         if (ir->efep != efepNO || ir->bSimTemp)
821         {
822             /* find and set the current lambdas */
823             state->lambda = currentLambdas(step, *(ir->fepvals), state->fep_state);
824 
825             bDoDHDL     = do_per_step(step, ir->fepvals->nstdhdl);
826             bDoFEP      = ((ir->efep != efepNO) && do_per_step(step, nstfep));
827             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
828                            && (!bFirstStep));
829         }
830 
831         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
832                      && do_per_step(step, replExParams.exchangeInterval));
833 
834         if (doSimulatedAnnealing)
835         {
836             update_annealing_target_temp(ir, t, &upd);
837         }
838 
839         /* Stop Center of Mass motion */
840         bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm));
841 
842         /* Determine whether or not to do Neighbour Searching */
843         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
844 
845         /* Note that the stopHandler will cause termination at nstglobalcomm
846          * steps. Since this coincides with nstcalcenergy, nsttcouple and/or
847          * nstpcouple steps, we have computed the half-step kinetic energy
848          * of the previous step and can always output energies at the last step.
849          */
850         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
851 
852         /* do_log triggers energy and virial calculation. Because this leads
853          * to different code paths, forces can be different. Thus for exact
854          * continuation we should avoid extra log output.
855          * Note that the || bLastStep can result in non-exact continuation
856          * beyond the last step. But we don't consider that to be an issue.
857          */
858         do_log     = (do_per_step(step, ir->nstlog)
859                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
860         do_verbose = mdrunOptions.verbose
861                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
862 
863         if (useGpuForUpdate && !bFirstStep && bNS)
864         {
865             // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
866             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
867             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
868             // Copy coordinate from the GPU when needed at the search step.
869             // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
870             // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
871             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
872             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
873         }
874 
875         if (bNS && !(bFirstStep && ir->bContinuation))
876         {
877             bMasterState = FALSE;
878             /* Correct the new box if it is too skewed */
879             if (inputrecDynamicBox(ir))
880             {
881                 if (correct_box(fplog, step, state->box))
882                 {
883                     bMasterState = TRUE;
884                     // If update is offloaded, it should be informed about the box size change
885                     if (useGpuForUpdate)
886                     {
887                         integrator->setPbc(PbcType::Xyz, state->box);
888                     }
889                 }
890             }
891             if (DOMAINDECOMP(cr) && bMasterState)
892             {
893                 dd_collect_state(cr->dd, state, state_global);
894             }
895 
896             if (DOMAINDECOMP(cr))
897             {
898                 /* Repartition the domain decomposition */
899                 dd_partition_system(fplog, mdlog, step, cr, bMasterState, nstglobalcomm, state_global,
900                                     *top_global, ir, imdSession, pull_work, state, &f, mdAtoms, &top,
901                                     fr, vsite, constr, nrnb, wcycle, do_verbose && !bPMETunePrinting);
902                 shouldCheckNumberOfBondedInteractions = true;
903                 upd.setNumAtoms(state->natoms);
904 
905                 /* PLUMED */
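                /* After repartitioning, update PLUMED's number of home atoms and the
                   local-to-global atom index map. */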
906                 if(plumedswitch){
907                   int nat_home = dd_numHomeAtoms(*cr->dd);
908                   plumed_cmd(plumedmain,"setAtomsNlocal",&nat_home);
909                   plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->globalAtomIndices.data());
910                 }
911                 /* END PLUMED */
912             }
913         }
914 
915         // Allocate or re-size GPU halo exchange object, if necessary
916         if (bNS && havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange)
917         {
918             GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
919                                "GPU device manager has to be initialized to use GPU "
920                                "version of halo exchange.");
921             constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
922         }
923 
924         if (MASTER(cr) && do_log)
925         {
926             gmx::EnergyOutput::printHeader(fplog, step,
927                                            t); /* can we improve the information printed here? */
928         }
929 
930         if (ir->efep != efepNO)
931         {
932             update_mdatoms(mdatoms, state->lambda[efptMASS]);
933         }
934 
935         if (bExchanged)
936         {
937 
938             /* We need the kinetic energy at minus the half step for determining
939              * the full step kinetic energy and possibly for T-coupling.*/
940             /* This may not be quite working correctly yet . . . . */
941             compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
942                             makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, wcycle,
943                             enerd, nullptr, nullptr, nullptr, nullptr, constr, &nullSignaller,
944                             state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
945                             CGLO_GSTAT | CGLO_TEMPERATURE | CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS);
946             checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, top_global,
947                                             &top, makeConstArrayRef(state->x), state->box,
948                                             &shouldCheckNumberOfBondedInteractions);
949         }
950         clear_mat(force_vir);
951 
952         /* PLUMED HREX */
953         gmx_bool bHREX = bDoReplEx && plumed_hrex;
954 
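        /* With PLUMED Hamiltonian replica exchange, on exchange-attempt steps each replica
           temporarily swaps its configuration with that of its exchange partner, recomputes
           the potential energy of the swapped configuration with do_force, caches it in
           PLUMED ("GREX cacheLocalUSwap"), and then swaps back before the normal force
           calculation of this step. */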
955         if (plumedswitch && bHREX) {
956           // gmx_enerdata_t *hrex_enerd;
957           int nlambda = enerd->foreignLambdaTerms.numLambdas();
958           gmx_enerdata_t hrex_enerd(enerd->grpp.nener, nlambda == 0 ? 0 : nlambda - 1);
959           int repl  = -1;
960           int nrepl = -1;
961           if (MASTER(cr)){
962             repl  = replica_exchange_get_repl(repl_ex);
963             nrepl = replica_exchange_get_nrepl(repl_ex);
964           }
965 
966           if (DOMAINDECOMP(cr)) {
967             dd_collect_state(cr->dd,state,state_global);
968           } else {
969             copy_state_serial(state, state_global);
970           }
971 
972           if(MASTER(cr)){
973             if(repl%2==step/replExParams.exchangeInterval%2){
974               if(repl-1>=0) exchange_state(ms,repl-1,state_global);
975             }else{
976               if(repl+1<nrepl) exchange_state(ms,repl+1,state_global);
977             }
978           }
979           if (!DOMAINDECOMP(cr)) {
980             copy_state_serial(state_global, state);
981           }
982           if(PAR(cr)){
983             if (DOMAINDECOMP(cr)) {
984               dd_partition_system(fplog,mdlog,step,cr,TRUE,1,
985                                   state_global,*top_global,ir,
986                                   imdSession, pull_work,
987                                   state,&f,mdAtoms,&top,fr,vsite,constr,
988                                   nrnb,wcycle,FALSE);
989             }
990           }
991           do_force(fplog, cr, ms, ir, awh.get(), enforcedRotation, imdSession, pull_work, step,
992                    nrnb, wcycle, &top, state->box, state->x.arrayRefWithPadding(), &state->hist,
993                    &f.view(), force_vir, mdatoms, &hrex_enerd, state->lambda,
994                    fr, runScheduleWork, vsite, mu_tot, t, ed ? ed->getLegacyED() : nullptr,
995                    GMX_FORCE_STATECHANGED |
996                    GMX_FORCE_DYNAMICBOX |
997                    GMX_FORCE_ALLFORCES |
998                    GMX_FORCE_VIRIAL |
999                    GMX_FORCE_ENERGY |
1000                    GMX_FORCE_DHDL |
1001                    GMX_FORCE_NS,
1002                    ddBalanceRegionHandler);
1003 
1004           plumed_cmd(plumedmain,"GREX cacheLocalUSwap",&(&hrex_enerd)->term[F_EPOT]);
1005 
1006           /* exchange back */
1007           if (DOMAINDECOMP(cr)) {
1008             dd_collect_state(cr->dd,state,state_global);
1009           } else {
1010             copy_state_serial(state, state_global);
1011           }
1012 
1013           if(MASTER(cr)){
1014             if(repl%2==step/replExParams.exchangeInterval%2){
1015               if(repl-1>=0) exchange_state(ms,repl-1,state_global);
1016             }else{
1017               if(repl+1<nrepl) exchange_state(ms,repl+1,state_global);
1018             }
1019           }
1020 
1021           if (!DOMAINDECOMP(cr)) {
1022             copy_state_serial(state_global, state);
1023           }
1024           if(PAR(cr)){
1025             if (DOMAINDECOMP(cr)) {
1026               dd_partition_system(fplog,mdlog,step,cr,TRUE,1,
1027                                   state_global,*top_global,ir,
1028                                   imdSession, pull_work,
1029                                   state,&f,mdAtoms,&top,fr,vsite,constr,
1030                                   nrnb,wcycle,FALSE);
1031               int nat_home = dd_numHomeAtoms(*cr->dd);
1032               plumed_cmd(plumedmain,"setAtomsNlocal",&nat_home);
1033               plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->globalAtomIndices.data());
1034             }
1035           }
1036           bNS=true;
1037         }
1038         /* END PLUMED HREX */
1039 
1040         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
1041 
1042         /* Determine the energy and pressure:
1043          * at nstcalcenergy steps and at energy output steps (set below).
1044          */
1045         if (EI_VV(ir->eI) && (!bInitStep))
1046         {
1047             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1048             bCalcVir      = bCalcEnerStep
1049                        || (ir->epc != epcNO
1050                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
1051         }
1052         else
1053         {
1054             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1055             bCalcVir = bCalcEnerStep || (ir->epc != epcNO && do_per_step(step, ir->nstpcouple));
1056         }
1057         bCalcEner = bCalcEnerStep;
1058 
1059         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
1060 
1061         if (do_ene || do_log || bDoReplEx)
1062         {
1063             bCalcVir  = TRUE;
1064             bCalcEner = TRUE;
1065         }
1066 
1067         /* Do we need global communication ? */
1068         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
1069                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
1070 
1071         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
1072                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
1073                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (bDoFEP ? GMX_FORCE_DHDL : 0));
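        // With multiple time stepping, the separate normal-force buffer is only needed on
        // steps where forces are written to the output.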
1074         if (fr->useMts && !do_per_step(step, ir->nstfout))
1075         {
1076             force_flags |= GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE;
1077         }
1078 
1079         if (shellfc)
1080         {
1081             /* Now is the time to relax the shells */
1082             relax_shell_flexcon(fplog, cr, ms, mdrunOptions.verbose, enforcedRotation, step, ir,
1083                                 imdSession, pull_work, bNS, force_flags, &top, constr, enerd,
1084                                 state->natoms, state->x.arrayRefWithPadding(),
1085                                 state->v.arrayRefWithPadding(), state->box, state->lambda,
1086                                 &state->hist, &f.view(), force_vir, mdatoms, nrnb, wcycle, shellfc,
1087                                 fr, runScheduleWork, t, mu_tot, vsite, ddBalanceRegionHandler);
1088         }
1089         else
1090         {
1091             /* The AWH history needs to be saved _before_ doing force calculations where the AWH bias
1092                is updated (or the AWH update will be performed twice for one step when continuing).
1093                It would be best to call this update function from do_md_trajectory_writing but that
1094                would occur after do_force. One would have to divide the update_awh function into one
1095                function applying the AWH force and one doing the AWH bias update. The update AWH
1096                bias function could then be called after do_md_trajectory_writing (then containing
1097                update_awh_history). The checkpointing will in the future probably be moved to the start
1098                of the MD loop, which will get rid of this issue. */
1099             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
1100             {
1101                 awh->updateHistory(state_global->awhHistory.get());
1102             }
1103 
1104             /* The coordinates (x) are shifted (to get whole molecules)
1105              * in do_force.
1106              * This is parallelized as well, and does communication too.
1107              * Check comments in sim_util.c
1108              */
1109 
1110              /* PLUMED */
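            /* Hand PLUMED this step's positions, masses, charges and box, let it prepare the
               calculation (prepareCalc), and ask whether the active actions need the potential
               energy; if so, also request energy and virial from do_force below. */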
1111             plumedNeedsEnergy=0;
1112             if(plumedswitch){
1113               int pversion=0;
1114               plumed_cmd(plumedmain,"getApiVersion",&pversion);
1115               long int lstep=step; plumed_cmd(plumedmain,"setStepLong",&lstep);
1116               plumed_cmd(plumedmain,"setPositions",&state->x[0][0]);
1117               plumed_cmd(plumedmain,"setMasses",&mdatoms->massT[0]);
1118               plumed_cmd(plumedmain,"setCharges",&mdatoms->chargeA[0]);
1119               plumed_cmd(plumedmain,"setBox",&state->box[0][0]);
1120               plumed_cmd(plumedmain,"prepareCalc",nullptr);
1121               plumed_cmd(plumedmain,"setStopFlag",&plumedWantsToStop);
1122               int checkp=0; if(checkpointHandler->isCheckpointingStep()) checkp=1;
1123               if(pversion>3) plumed_cmd(plumedmain,"doCheckPoint",&checkp);
1124               plumed_cmd(plumedmain,"setForces",&f.view().force()[0][0]);
1125               plumed_cmd(plumedmain,"isEnergyNeeded",&plumedNeedsEnergy);
1126               if(plumedNeedsEnergy) force_flags |= GMX_FORCE_ENERGY | GMX_FORCE_VIRIAL;
1127               clear_mat(plumed_vir);
1128               plumed_cmd(plumedmain,"setVirial",&plumed_vir[0][0]);
1129             }
1130             /* END PLUMED */
1131             do_force(fplog, cr, ms, ir, awh.get(), enforcedRotation, imdSession, pull_work, step,
1132                      nrnb, wcycle, &top, state->box, state->x.arrayRefWithPadding(), &state->hist,
1133                      &f.view(), force_vir, mdatoms, enerd, state->lambda, fr, runScheduleWork,
1134                      vsite, mu_tot, t, ed ? ed->getLegacyED() : nullptr,
1135                      (bNS ? GMX_FORCE_NS : 0) | force_flags, ddBalanceRegionHandler);
1136             /* PLUMED */
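            /* If PLUMED needed the energy, run performCalc now: PLUMED accumulates its virial
               into plumed_vir (registered via setVirial above), and the 2.0/0.5 scaling folds
               half of that accumulated virial into force_vir. Otherwise PLUMED's forces were
               already applied during do_force, and its virial is likewise halved and added to
               force_vir here. */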
1137             if(plumedswitch){
1138               if(plumedNeedsEnergy){
1139                 msmul(force_vir,2.0,plumed_vir);
1140                 plumed_cmd(plumedmain,"setEnergy",&enerd->term[F_EPOT]);
1141                 plumed_cmd(plumedmain,"performCalc",nullptr);
1142                 msmul(plumed_vir,0.5,force_vir);
1143               } else {
1144                 msmul(plumed_vir,0.5,plumed_vir);
1145                 m_add(force_vir,plumed_vir,force_vir);
1146               }
1147               if(bDoReplEx) plumed_cmd(plumedmain,"GREX savePositions",nullptr);
1148               if(plumedWantsToStop) ir->nsteps=step_rel+1;
1149               if(bHREX) plumed_cmd(plumedmain,"GREX cacheLocalUNow",&enerd->term[F_EPOT]);
1150             }
1151             /* END PLUMED */
1152         }
1153 
1154         // VV integrators do not need the following velocity half step
1155         // if it is the first step after starting from a checkpoint.
1156         // That is, the half step is needed on all other steps, and
1157         // also the first step when starting from a .tpr file.
1158         if (EI_VV(ir->eI) && (!bFirstStep || startingBehavior == StartingBehavior::NewSimulation))
1159         /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
1160         {
1161             rvec* vbuf = nullptr;
1162 
1163             wallcycle_start(wcycle, ewcUPDATE);
1164             if (ir->eI == eiVV && bInitStep)
1165             {
1166                 /* if using velocity verlet with full time step Ekin,
1167                  * take the first half step only to compute the
1168                  * virial for the first step. From there,
1169                  * revert back to the initial coordinates
1170                  * so that the input is actually the initial step.
1171                  */
1172                 snew(vbuf, state->natoms);
1173                 copy_rvecn(state->v.rvec_array(), vbuf, 0,
1174                            state->natoms); /* should make this better for parallelizing? */
1175             }
1176             else
1177             {
1178                 /* this is for NHC in the Ekin(t+dt/2) version of vv */
1179                 trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ,
1180                                trotter_seq, ettTSEQ1);
1181             }
1182 
1183             upd.update_coords(*ir, step, mdatoms, state, f.view().forceWithPadding(), fcdata, ekind,
1184                               M, etrtVELOCITY1, cr, constr != nullptr);
1185 
1186             wallcycle_stop(wcycle, ewcUPDATE);
1187             constrain_velocities(constr, do_log, do_ene, step, state, nullptr, bCalcVir, shake_vir);
1188             wallcycle_start(wcycle, ewcUPDATE);
1189             /* if VV, compute the pressure and constraints */
1190             /* For VV2, we strictly only need this if using pressure
1191              * control, but we really would like to have accurate pressures
1192              * printed out.
1193              * Think about ways around this in the future?
1194              * For now, keep this choice in comments.
1195              */
1196             /*bPres = (ir->eI==eiVV || inputrecNptTrotter(ir)); */
1197             /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && inputrecNptTrotter(ir)));*/
1198             bPres = TRUE;
1199             bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
1200             if (bCalcEner && ir->eI == eiVVAK)
1201             {
1202                 bSumEkinhOld = TRUE;
1203             }
1204             /* for vv, the first half of the integration actually corresponds to the previous step.
1205                So we need information from the last step in the first half of the integration */
1206             if (bGStat || do_per_step(step - 1, nstglobalcomm))
1207             {
1208                 wallcycle_stop(wcycle, ewcUPDATE);
1209                 compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1210                                 makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, wcycle,
1211                                 enerd, force_vir, shake_vir, total_vir, pres, constr, &nullSignaller,
1212                                 state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
1213                                 (bGStat ? CGLO_GSTAT : 0) | (bCalcEner ? CGLO_ENERGY : 0)
1214                                         | (bTemp ? CGLO_TEMPERATURE : 0) | (bPres ? CGLO_PRESSURE : 0)
1215                                         | (bPres ? CGLO_CONSTRAINT : 0) | (bStopCM ? CGLO_STOPCM : 0)
1216                                         | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1217                                                                                  : 0)
1218                                         | CGLO_SCALEEKIN);
1219                 /* explanation of above:
1220                    a) We compute Ekin at the full time step
1221                    if 1) we are using the AveVel Ekin, and it's not the
1222                    initial step, or 2) if we are using AveEkin, but need the full
1223                    time step kinetic energy for the pressure (always true now, since we want accurate statistics).
1224                    b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
1225                    EkinAveVel because it's needed for the pressure */
1226                 checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
1227                                                 top_global, &top, makeConstArrayRef(state->x),
1228                                                 state->box, &shouldCheckNumberOfBondedInteractions);
1229                 if (bStopCM)
1230                 {
1231                     process_and_stopcm_grp(fplog, &vcm, *mdatoms, makeArrayRef(state->x),
1232                                            makeArrayRef(state->v));
1233                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1234                 }
1235                 wallcycle_start(wcycle, ewcUPDATE);
1236             }
1237             /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
1238             if (!bInitStep)
1239             {
1240                 if (bTrotter)
1241                 {
1242                     m_add(force_vir, shake_vir,
1243                           total_vir); /* we need the un-dispersion corrected total vir here */
1244                     trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ,
1245                                    trotter_seq, ettTSEQ2);
1246 
1247                     /* TODO This is only needed when we're about to write
1248                      * a checkpoint, because we use it after the restart
1249                      * (in a kludge?). But what should we be doing if
                     * the startingBehavior is NewSimulation or bInitStep is true? */
1251                     if (inputrecNptTrotter(ir) || inputrecNphTrotter(ir))
1252                     {
1253                         copy_mat(shake_vir, state->svir_prev);
1254                         copy_mat(force_vir, state->fvir_prev);
1255                     }
1256                     if ((inputrecNptTrotter(ir) || inputrecNvtTrotter(ir)) && ir->eI == eiVV)
1257                     {
1258                         /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
1259                         enerd->term[F_TEMP] =
1260                                 sum_ekin(&(ir->opts), ekind, nullptr, (ir->eI == eiVV), FALSE);
1261                         enerd->term[F_EKIN] = trace(ekind->ekin);
1262                     }
1263                 }
1264                 else if (bExchanged)
1265                 {
1266                     wallcycle_stop(wcycle, ewcUPDATE);
1267                     /* We need the kinetic energy at minus the half step for determining
1268                      * the full step kinetic energy and possibly for T-coupling.*/
1269                     /* This may not be quite working correctly yet . . . . */
1270                     compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1271                                     makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, wcycle,
1272                                     enerd, nullptr, nullptr, nullptr, nullptr, constr, &nullSignaller,
1273                                     state->box, nullptr, &bSumEkinhOld, CGLO_GSTAT | CGLO_TEMPERATURE);
1274                     wallcycle_start(wcycle, ewcUPDATE);
1275                 }
1276             }
1277             /* if it's the initial step, we performed this first step just to get the constraint virial */
1278             if (ir->eI == eiVV && bInitStep)
1279             {
1280                 copy_rvecn(vbuf, state->v.rvec_array(), 0, state->natoms);
1281                 sfree(vbuf);
1282             }
1283             wallcycle_stop(wcycle, ewcUPDATE);
1284         }
1285 
1286         /* compute the conserved quantity */
1287         if (EI_VV(ir->eI))
1288         {
1289             saved_conserved_quantity = NPT_energy(ir, state, &MassQ);
1290             if (ir->eI == eiVV)
1291             {
1292                 last_ekin = enerd->term[F_EKIN];
1293             }
1294             if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres))
1295             {
1296                 saved_conserved_quantity -= enerd->term[F_DISPCORR];
1297             }
            /* Sum up the foreign kinetic energy and dK/dl terms for VV. Currently done every step so that dH/dl is correct in the .edr */
1299             if (ir->efep != efepNO)
1300             {
1301                 accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1302             }
1303         }
1304 
1305         /* ########  END FIRST UPDATE STEP  ############## */
1306         /* ########  If doing VV, we now have v(dt) ###### */
1307         if (bDoExpanded)
1308         {
1309             /* perform extended ensemble sampling in lambda - we don't
1310                actually move to the new state before outputting
1311                statistics, but if performing simulated tempering, we
1312                do update the velocities and the tau_t. */
1313 
1314             lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state,
1315                                               state->dfhist, step, state->v.rvec_array(), mdatoms, &realFepState);
1316             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1317             if (MASTER(cr))
1318             {
1319                 copy_df_history(state_global->dfhist, state->dfhist);
1320             }
1321         }
1322 
        // Copy coordinates from the GPU for the output/checkpointing if the update is offloaded and
1324         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1325         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1326             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1327                 || checkpointHandler->isCheckpointingStep()))
1328         {
1329             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1330             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1331         }
1332         // Copy velocities if needed for the output/checkpointing.
1333         // NOTE: Copy on the search steps is done at the beginning of the step.
1334         if (useGpuForUpdate && !bNS
1335             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1336         {
1337             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1338             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1339         }
1340         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
        // and the update is offloaded, hence the forces are kept on the GPU for the update and have
        // not already been transferred in do_force().
1343         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1344         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1345         //       prior to GPU update.
        // TODO: When the output flags are included in the step workload, this copy can be combined with the
1347         //       copy call in do_force(...).
1348         // NOTE: The forces should not be copied here if the vsites are present, since they were modified
1349         //       on host after the D2H copy in do_force(...).
1350         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1351             && do_per_step(step, ir->nstfout))
1352         {
1353             stateGpu->copyForcesFromGpu(f.view().force(), AtomLocality::Local);
1354             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1355         }
1356         /* Now we have the energies and forces corresponding to the
1357          * coordinates at time t. We must output all of this before
1358          * the update.
1359          */
1360         do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t, ir, state, state_global,
1361                                  observablesHistory, top_global, fr, outf, energyOutput, ekind,
1362                                  f.view().force(), checkpointHandler->isCheckpointingStep(),
1363                                  bRerunMD, bLastStep, mdrunOptions.writeConfout, bSumEkinhOld);
1364         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1365         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x.rvec_array(), t);
1366 
1367         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1368         if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
1369             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1370         {
1371             copy_mat(state->svir_prev, shake_vir);
1372             copy_mat(state->fvir_prev, force_vir);
1373         }
1374 
1375         stopHandler->setSignal();
1376         resetHandler->setSignal(walltime_accounting);
1377 
1378         if (bGStat || !PAR(cr))
1379         {
1380             /* In parallel we only have to check for checkpointing in steps
1381              * where we do global communication,
1382              *  otherwise the other nodes don't know.
1383              */
1384             checkpointHandler->setSignal(walltime_accounting);
1385         }
1386 
1387         /* #########   START SECOND UPDATE STEP ################# */
1388 
        /* At the start of the step, randomize or scale the velocities (if doing VV);
           the restrictions on Andersen coupling are checked during preprocessing */
1391 
1392         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1393         {
1394             gmx_bool bIfRandomize;
1395             bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state->v, &upd, constr);
1396             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1397             if (constr && bIfRandomize)
1398             {
1399                 constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
1400             }
1401         }
1402         /* Box is changed in update() when we do pressure coupling,
1403          * but we should still use the old box for energy corrections and when
1404          * writing it to the energy file, so it matches the trajectory files for
1405          * the same timestep above. Make a copy in a separate array.
1406          */
1407         copy_mat(state->box, lastbox);
1408 
1409         dvdl_constr = 0;
1410 
1411         if (!useGpuForUpdate)
1412         {
1413             wallcycle_start(wcycle, ewcUPDATE);
1414         }
1415         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1416         if (bTrotter)
1417         {
1418             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
1419             /* We can only do Berendsen coupling after we have summed
             * the kinetic energy or virial. Since this happens
1421              * in global_state after update, we should only do it at
1422              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1423              */
1424         }
1425         else
1426         {
1427             update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
1428             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1429         }
1430 
1431         if (EI_VV(ir->eI))
1432         {
1433             /* velocity half-step update */
1434             upd.update_coords(*ir, step, mdatoms, state, f.view().forceWithPadding(), fcdata, ekind,
1435                               M, etrtVELOCITY2, cr, constr != nullptr);
1436         }
1437 
        /* Above, the initialization just copies ekinh into ekin;
         * it doesn't copy the positions (for VV),
         * nor the entire integrator state (for MD).
         */
1442 
1443         if (ir->eI == eiVVAK)
1444         {
1445             cbuf.resize(state->x.size());
1446             std::copy(state->x.begin(), state->x.end(), cbuf.begin());
1447         }
1448 
1449         /* With leap-frog type integrators we compute the kinetic energy
1450          * at a whole time step as the average of the half-time step kinetic
1451          * energies of two subsequent steps. Therefore we need to compute the
1452          * half step kinetic energy also if we need energies at the next step.
1453          */
1454         const bool needHalfStepKineticEnergy =
1455                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1456 
        // Parrinello-Rahman requires the pressure to be available before the update to compute
1458         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
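        // E.g. with nstpcouple = 10 this condition holds on steps 1, 11, 21, ...,
        // one step after the nstpcouple steps (0, 10, 20, ...).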
1459         const bool doParrinelloRahman = (ir->epc == epcPARRINELLORAHMAN
1460                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1461 
1462         if (useGpuForUpdate)
1463         {
1464             if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
1465             {
1466                 integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(),
1467                                 stateGpu->getForces(), top.idef, *mdatoms, ekind->ngtc);
1468 
                // Copy data to the GPU after buffers might have been reinitialized
1470                 stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1471                 stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1472             }
1473 
1474             if (simulationWork.useGpuPme && !runScheduleWork->simulationWork.useGpuPmePpCommunication
1475                 && !thisRankHasDuty(cr, DUTY_PME))
1476             {
                // The PME forces were received on the host, so they have to be copied
1478                 stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::All);
1479             }
1480             else if (!runScheduleWork->stepWork.useGpuFBufferOps)
1481             {
1482                 // The buffer ops were not offloaded this step, so the forces are on the
1483                 // host and have to be copied
1484                 stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
1485             }
1486 
1487             const bool doTemperatureScaling =
1488                     (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1489 
1490             // This applies Leap-Frog, LINCS and SETTLE in succession
1491             integrator->integrate(stateGpu->getForcesReadyOnDeviceEvent(
1492                                           AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
1493                                   ir->delta_t, true, bCalcVir, shake_vir, doTemperatureScaling,
1494                                   ekind->tcstat, doParrinelloRahman, ir->nstpcouple * ir->delta_t, M);
1495 
1496             // Copy velocities D2H after update if:
1497             // - Globals are computed this step (includes the energy output steps).
1498             // - Temperature is needed for the next step.
1499             if (bGStat || needHalfStepKineticEnergy)
1500             {
1501                 stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1502                 stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1503             }
1504         }
1505         else
1506         {
            /* With multiple time stepping we need an additional, plain
             * update step to obtain the virial: the actual MTS integration
             * uses an acceleration in which the slow forces are multiplied
             * by mtsFactor, and using that acceleration would give a virial
             * whose slow-force contribution is a factor mtsFactor too large.
             */
1513             if (fr->useMts && bCalcVir && constr != nullptr)
1514             {
1515                 upd.update_for_constraint_virial(*ir, *mdatoms, *state, f.view().forceWithPadding(), *ekind);
1516 
1517                 constrain_coordinates(constr, do_log, do_ene, step, state,
1518                                       upd.xp()->arrayRefWithPadding(), &dvdl_constr, bCalcVir, shake_vir);
1519             }
1520 
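            /* On steps that coincide with the MTS level-1 stepFactor, the position update uses the
             * combined buffer (fast forces plus the slow forces scaled by the MTS factor); on all
             * other steps only the fast forces are applied. */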
1521             ArrayRefWithPadding<const RVec> forceCombined =
1522                     (fr->useMts && step % ir->mtsLevels[1].stepFactor == 0)
1523                             ? f.view().forceMtsCombinedWithPadding()
1524                             : f.view().forceWithPadding();
1525             upd.update_coords(*ir, step, mdatoms, state, forceCombined, fcdata, ekind, M,
1526                               etrtPOSITION, cr, constr != nullptr);
1527 
1528             wallcycle_stop(wcycle, ewcUPDATE);
1529 
1530             constrain_coordinates(constr, do_log, do_ene, step, state, upd.xp()->arrayRefWithPadding(),
1531                                   &dvdl_constr, bCalcVir && !fr->useMts, shake_vir);
1532 
1533             upd.update_sd_second_half(*ir, step, &dvdl_constr, mdatoms, state, cr, nrnb, wcycle,
1534                                       constr, do_log, do_ene);
1535             upd.finish_update(*ir, mdatoms, state, wcycle, constr != nullptr);
1536         }
1537 
1538         if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1539         {
1540             updatePrevStepPullCom(pull_work, state);
1541         }
1542 
1543         if (ir->eI == eiVVAK)
1544         {
1545             /* erase F_EKIN and F_TEMP here? */
1546             /* just compute the kinetic energy at the half step to perform a trotter step */
1547             compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1548                             makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, wcycle, enerd,
1549                             force_vir, shake_vir, total_vir, pres, constr, &nullSignaller, lastbox,
1550                             nullptr, &bSumEkinhOld, (bGStat ? CGLO_GSTAT : 0) | CGLO_TEMPERATURE);
1551             wallcycle_start(wcycle, ewcUPDATE);
1552             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
1553             /* now we know the scaling, we can compute the positions again */
1554             std::copy(cbuf.begin(), cbuf.end(), state->x.begin());
1555 
1556             upd.update_coords(*ir, step, mdatoms, state, f.view().forceWithPadding(), fcdata, ekind,
1557                               M, etrtPOSITION, cr, constr != nullptr);
1558             wallcycle_stop(wcycle, ewcUPDATE);
1559 
1560             /* do we need an extra constraint here? just need to copy out of as_rvec_array(state->v.data()) to upd->xp? */
1561             /* are the small terms in the shake_vir here due
1562              * to numerical errors, or are they important
1563              * physically? I'm thinking they are just errors, but not completely sure.
1564              * For now, will call without actually constraining, constr=nullptr*/
1565             upd.finish_update(*ir, mdatoms, state, wcycle, false);
1566         }
1567         if (EI_VV(ir->eI))
1568         {
            /* This factor of 2 correction is necessary
               because half of the constraint force is removed
               in the vv step, so we have to double it.  See
               Issue #1255.  It is not yet clear
1573                if the factor of 2 is exact, or just a very
1574                good approximation, and this will be
1575                investigated.  The next step is to see if this
1576                can be done adding a dhdl contribution from the
1577                rattle step, but this is somewhat more
1578                complicated with the current code. Will be
1579                investigated, hopefully for 4.6.3. However,
1580                this current solution is much better than
1581                having it completely wrong.
1582              */
1583             enerd->term[F_DVDL_CONSTR] += 2 * dvdl_constr;
1584         }
1585         else
1586         {
1587             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1588         }
1589 
1590         if (vsite != nullptr)
1591         {
1592             wallcycle_start(wcycle, ewcVSITECONSTR);
1593             vsite->construct(state->x, ir->delta_t, state->v, state->box);
1594             wallcycle_stop(wcycle, ewcVSITECONSTR);
1595         }
1596 
1597         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1598         /* With Leap-Frog we can skip compute_globals at
1599          * non-communication steps, but we need to calculate
1600          * the kinetic energy one step before communication.
1601          */
1602         {
1603             // Organize to do inter-simulation signalling on steps if
1604             // and when algorithms require it.
1605             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1606 
1607             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1608             {
1609                 // Copy coordinates when needed to stop the CM motion.
1610                 if (useGpuForUpdate && !EI_VV(ir->eI) && bStopCM)
1611                 {
1612                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1613                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1614                 }
1615                 // Since we're already communicating at this step, we
1616                 // can propagate intra-simulation signals. Note that
                // check_nstglobalcomm is responsible for choosing the
                // value of nstglobalcomm, which is one way that bGStat
                // becomes true, so we can't get into a situation where
                // e.g. checkpointing can't be signalled.
1622                 bool                doIntraSimSignal = true;
1623                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1624 
1625                 compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1626                                 makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm,
1627                                 wcycle, enerd, force_vir, shake_vir, total_vir, pres, constr,
1628                                 &signaller, lastbox, &totalNumberOfBondedInteractions, &bSumEkinhOld,
1629                                 (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1630                                         | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1631                                         | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1632                                         | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT
1633                                         | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1634                                                                                  : 0));
1635                 checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
1636                                                 top_global, &top, makeConstArrayRef(state->x),
1637                                                 state->box, &shouldCheckNumberOfBondedInteractions);
1638                 if (!EI_VV(ir->eI) && bStopCM)
1639                 {
1640                     process_and_stopcm_grp(fplog, &vcm, *mdatoms, makeArrayRef(state->x),
1641                                            makeArrayRef(state->v));
1642                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1643 
                    // TODO: The special case of removing CM motion should be dealt with more gracefully
1645                     if (useGpuForUpdate)
1646                     {
1647                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1648                         // Here we block until the H2D copy completes because event sync with the
1649                         // force kernels that use the coordinates on the next steps is not implemented
1650                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1651                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1652                         // If the COM removal changed the velocities on the CPU, this has to be accounted for.
1653                         if (vcm.mode != ecmNO)
1654                         {
1655                             stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1656                         }
1657                     }
1658                 }
1659             }
1660         }
1661 
1662         /* #############  END CALC EKIN AND PRESSURE ################# */
1663 
1664         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
           the virial that should probably be addressed eventually. state->veta has better properties,
1666            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1667            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1668 
1669         if (ir->efep != efepNO && !EI_VV(ir->eI))
1670         {
1671             /* Sum up the foreign energy and dK/dl terms for md and sd.
1672                Currently done every step so that dH/dl is correct in the .edr */
1673             accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1674         }
1675 
1676         update_pcouple_after_coordinates(fplog, step, ir, mdatoms, pres, force_vir, shake_vir,
1677                                          pressureCouplingMu, state, nrnb, upd.deform(), !useGpuForUpdate);
1678 
1679         const bool doBerendsenPressureCoupling =
1680                 (inputrec->epc == epcBERENDSEN && do_per_step(step, inputrec->nstpcouple));
1681         const bool doCRescalePressureCoupling =
1682                 (inputrec->epc == epcCRESCALE && do_per_step(step, inputrec->nstpcouple));
1683         if (useGpuForUpdate
1684             && (doBerendsenPressureCoupling || doCRescalePressureCoupling || doParrinelloRahman))
1685         {
1686             integrator->scaleCoordinates(pressureCouplingMu);
1687             if (doCRescalePressureCoupling)
1688             {
1689                 matrix pressureCouplingInvMu;
1690                 gmx::invertBoxMatrix(pressureCouplingMu, pressureCouplingInvMu);
1691                 integrator->scaleVelocities(pressureCouplingInvMu);
1692             }
1693             integrator->setPbc(PbcType::Xyz, state->box);
1694         }
1695 
1696         /* ################# END UPDATE STEP 2 ################# */
1697         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1698 
1699         /* The coordinates (x) were unshifted in update */
1700         if (!bGStat)
1701         {
1702             /* We will not sum ekinh_old,
1703              * so signal that we still have to do it.
1704              */
1705             bSumEkinhOld = TRUE;
1706         }
1707 
1708         if (bCalcEner)
1709         {
1710             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1711 
1712             /* use the directly determined last velocity, not actually the averaged half steps */
1713             if (bTrotter && ir->eI == eiVV)
1714             {
1715                 enerd->term[F_EKIN] = last_ekin;
1716             }
1717             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1718 
1719             if (integratorHasConservedEnergyQuantity(ir))
1720             {
1721                 if (EI_VV(ir->eI))
1722                 {
1723                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1724                 }
1725                 else
1726                 {
1727                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1728                 }
1729             }
1730             /* #########  END PREPARING EDR OUTPUT  ###########  */
1731         }
1732 
1733         /* Output stuff */
1734         if (MASTER(cr))
1735         {
1736             if (fplog && do_log && bDoExpanded)
1737             {
1738                 /* only needed if doing expanded ensemble */
1739                 PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals,
1740                                           ir->bSimTemp ? ir->simtempvals : nullptr,
1741                                           state_global->dfhist, state->fep_state, ir->nstlog, step);
1742             }
1743             if (bCalcEner)
1744             {
1745                 energyOutput.addDataAtEnergyStep(
1746                         bDoDHDL, bCalcEnerStep, t, mdatoms->tmass, enerd, ir->fepvals,
1747                         ir->expandedvals, lastbox,
1748                         PTCouplingArrays{ state->boxv, state->nosehoover_xi, state->nosehoover_vxi,
1749                                           state->nhpres_xi, state->nhpres_vxi },
1750                         state->fep_state, shake_vir, force_vir, total_vir, pres, ekind, mu_tot, constr);
1751             }
1752             else
1753             {
1754                 energyOutput.recordNonEnergyStep();
1755             }
1756 
1757             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1758             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1759 
1760             if (doSimulatedAnnealing)
1761             {
1762                 gmx::EnergyOutput::printAnnealingTemperatures(do_log ? fplog : nullptr, groups,
1763                                                               &(ir->opts));
1764             }
1765             if (do_log || do_ene || do_dr || do_or)
1766             {
1767                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or,
1768                                                    do_log ? fplog : nullptr, step, t,
1769                                                    fr->fcdata.get(), awh.get());
1770             }
1771             if (do_log && ir->bDoAwh && awh->hasFepLambdaDimension())
1772             {
1773                 const bool isInitialOutput = false;
1774                 printLambdaStateToLog(fplog, state->lambda, isInitialOutput);
1775             }
1776 
1777             if (ir->bPull)
1778             {
1779                 pull_print_output(pull_work, step, t);
1780             }
1781 
1782             if (do_per_step(step, ir->nstlog))
1783             {
1784                 if (fflush(fplog) != 0)
1785                 {
1786                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1787                 }
1788             }
1789         }
1790         if (bDoExpanded)
1791         {
1792             /* Have to do this part _after_ outputting the logfile and the edr file */
            /* Gets written into the state at the beginning of the next loop */
1794             state->fep_state = lamnew;
1795             if(plumedswitch)
1796             {
1797                 realFepState = state->fep_state;
1798             }
1799         }
1800         else if (ir->bDoAwh && awh->needForeignEnergyDifferences(step))
1801         {
1802             state->fep_state = awh->fepLambdaState();
1803         }
1804         /* Print the remaining wall clock time for the run */
1805         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1806         {
1807             if (shellfc)
1808             {
1809                 fprintf(stderr, "\n");
1810             }
1811             print_time(stderr, walltime_accounting, step, ir, cr);
1812         }
1813 
1814         /* Ion/water position swapping.
1815          * Not done in last step since trajectory writing happens before this call
1816          * in the MD loop and exchanges would be lost anyway. */
1817         bNeedRepartition = FALSE;
1818         if ((ir->eSwapCoords != eswapNO) && (step > 0) && !bLastStep && do_per_step(step, ir->swap->nstswap))
1819         {
1820             bNeedRepartition =
1821                     do_swapcoords(cr, step, t, ir, swap, wcycle, as_rvec_array(state->x.data()),
1822                                   state->box, MASTER(cr) && mdrunOptions.verbose, bRerunMD);
1823 
1824             if (bNeedRepartition && DOMAINDECOMP(cr))
1825             {
1826                 dd_collect_state(cr->dd, state, state_global);
1827             }
1828         }
1829 
1830         /* Replica exchange */
1831         bExchanged = FALSE;
1832         if (bDoReplEx)
1833         {
1834             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1835         }
1836 
1837         if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
1838         {
1839             dd_partition_system(fplog, mdlog, step, cr, TRUE, 1, state_global, *top_global, ir,
1840                                 imdSession, pull_work, state, &f, mdAtoms, &top, fr, vsite, constr,
1841                                 nrnb, wcycle, FALSE);
1842             shouldCheckNumberOfBondedInteractions = true;
1843             upd.setNumAtoms(state->natoms);
1844         }
1845 
1846         bFirstStep = FALSE;
1847         bInitStep  = FALSE;
1848 
1849         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1850         /* With all integrators, except VV, we need to retain the pressure
1851          * at the current step for coupling at the next step.
1852          */
1853         if ((state->flags & (1U << estPRES_PREV))
1854             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1855         {
1856             /* Store the pressure in t_state for pressure coupling
1857              * at the next MD step.
1858              */
1859             copy_mat(pres, state->pres_prev);
1860         }
1861 
1862         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
1863 
1864         if ((membed != nullptr) && (!bLastStep))
1865         {
1866             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
1867         }
1868 
1869         cycles = wallcycle_stop(wcycle, ewcSTEP);
1870         if (DOMAINDECOMP(cr) && wcycle)
1871         {
1872             dd_cycles_add(cr->dd, cycles, ddCyclStep);
1873         }
1874 
1875         /* increase the MD step number */
1876         step++;
1877         step_rel++;
1878 
1879 #if GMX_FAHCORE
1880         if (MASTER(cr))
1881         {
1882             fcReportProgress(ir->nsteps + ir->init_step, step);
1883         }
1884 #endif
1885 
1886         resetHandler->resetCounters(step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb,
1887                                     fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
1888 
1889         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
1890         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
1891     }
1892     /* End of main MD loop */
1893 
1894     /* Closing TNG files can include compressing data. Therefore it is good to do that
1895      * before stopping the time measurements. */
1896     mdoutf_tng_close(outf);
1897 
1898     /* Stop measuring walltime */
1899     walltime_accounting_end_time(walltime_accounting);
1900 
1901     if (!thisRankHasDuty(cr, DUTY_PME))
1902     {
        /* Tell the PME-only ranks to finish */
1904         gmx_pme_send_finish(cr);
1905     }
1906 
1907     if (MASTER(cr))
1908     {
1909         if (ir->nstcalcenergy > 0)
1910         {
1911             energyOutput.printEnergyConservation(fplog, ir->simulation_part, EI_MD(ir->eI));
1912 
1913             gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
1914             energyOutput.printAverages(fplog, groups);
1915         }
1916     }
1917     done_mdoutf(outf);
1918 
1919     if (bPMETune)
1920     {
1921         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
1922     }
1923 
1924     done_shellfc(fplog, shellfc, step_rel);
1925 
1926     if (useReplicaExchange && MASTER(cr))
1927     {
1928         print_replica_exchange_statistics(fplog, repl_ex);
1929     }
1930 
1931     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
1932 
1933     global_stat_destroy(gstat);
1934 }
1935