1 //////////////////////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source License.
3 // See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2020 QMCPACK developers.
6 //
7 // File developed by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Laboratory
8 //
9 // File refactored from: VMC.cpp
10 //////////////////////////////////////////////////////////////////////////////////////
11 
12 #include "VMCBatched.h"
13 #include "Concurrency/ParallelExecutor.hpp"
14 #include "Concurrency/Info.hpp"
15 #include "Message/UniformCommunicateError.h"
16 #include "Message/CommOperators.h"
17 #include "Utilities/RunTimeManager.h"
18 #include "ParticleBase/RandomSeqGenerator.h"
19 #include "Particle/MCSample.h"
20 #include "MemoryUsage.h"
21 
22 namespace qmcplusplus
23 {
24 /** Constructor maintains proper ownership of input parameters
25    */
VMCBatched(const ProjectData & project_data,QMCDriverInput && qmcdriver_input,VMCDriverInput && input,MCPopulation && pop,SampleStack & samples,Communicate * comm)26 VMCBatched::VMCBatched(const ProjectData& project_data,
27                        QMCDriverInput&& qmcdriver_input,
28                        VMCDriverInput&& input,
29                        MCPopulation&& pop,
30                        SampleStack& samples,
31                        Communicate* comm)
32     : QMCDriverNew(project_data, std::move(qmcdriver_input), std::move(pop), "VMCBatched::", comm, "VMCBatched"),
33       vmcdriver_input_(input),
34       samples_(samples),
35       collect_samples_(false)
36 {}
37 
advanceWalkers(const StateForThread & sft,Crowd & crowd,QMCDriverNew::DriverTimers & timers,ContextForSteps & step_context,bool recompute)38 void VMCBatched::advanceWalkers(const StateForThread& sft,
39                                 Crowd& crowd,
40                                 QMCDriverNew::DriverTimers& timers,
41                                 ContextForSteps& step_context,
42                                 bool recompute)
43 {
44   if (crowd.size() == 0)
45     return;
46   auto& ps_dispatcher  = crowd.dispatchers_.ps_dispatcher_;
47   auto& twf_dispatcher = crowd.dispatchers_.twf_dispatcher_;
48   auto& ham_dispatcher = crowd.dispatchers_.ham_dispatcher_;
49   auto& walkers        = crowd.get_walkers();
50   const RefVectorWithLeader<ParticleSet> walker_elecs(crowd.get_walker_elecs()[0], crowd.get_walker_elecs());
51   const RefVectorWithLeader<TrialWaveFunction> walker_twfs(crowd.get_walker_twfs()[0], crowd.get_walker_twfs());
52 
53   ResourceCollectionTeamLock<ParticleSet> pset_res_lock(crowd.getSharedResource().pset_res, walker_elecs);
54   DriverWalkerResourceCollectionLock pbyp_lock(crowd.getSharedResource(), crowd.get_walker_twfs()[0],
55                                                crowd.get_walker_hamiltonians()[0]);
56 
57   assert(QMCDriverNew::checkLogAndGL(crowd));
58 
59   timers.movepbyp_timer.start();
60   const int num_walkers = crowd.size();
61   // Note std::vector<bool> is not like the rest of stl.
62   std::vector<bool> moved(num_walkers, false);
63   constexpr RealType mhalf(-0.5);
64   const bool use_drift = sft.vmcdrv_input.get_use_drift();
65   std::vector<TrialWaveFunction::GradType> grads_now(num_walkers);
66   std::vector<TrialWaveFunction::GradType> grads_new(num_walkers);
67   std::vector<TrialWaveFunction::PsiValueType> ratios(num_walkers);
68 
69   std::vector<PosType> drifts(num_walkers);
70   std::vector<RealType> log_gf(num_walkers);
71   std::vector<RealType> log_gb(num_walkers);
72   std::vector<RealType> prob(num_walkers);
73 
74   // local list to handle accept/reject
75   std::vector<bool> isAccepted;
76   std::vector<std::reference_wrapper<TrialWaveFunction>> twf_accept_list, twf_reject_list;
77   isAccepted.reserve(num_walkers);
78 
79   for (int sub_step = 0; sub_step < sft.qmcdrv_input.get_sub_steps(); sub_step++)
80   {
81     //This generates an entire steps worth of deltas.
82     step_context.nextDeltaRs(num_walkers * sft.population.get_num_particles());
83 
84     // up and down electrons are "species" within qmpack
85     for (int ig = 0; ig < step_context.get_num_groups(); ++ig) //loop over species
86     {
87       RealType tauovermass = sft.qmcdrv_input.get_tau() * sft.population.get_ptclgrp_inv_mass()[ig];
88       RealType oneover2tau = 0.5 / (tauovermass);
89       RealType sqrttau     = std::sqrt(tauovermass);
90 
91       twf_dispatcher.flex_prepareGroup(walker_twfs, walker_elecs, ig);
92 
93       int start_index = step_context.getPtclGroupStart(ig);
94       int end_index   = step_context.getPtclGroupEnd(ig);
95       for (int iat = start_index; iat < end_index; ++iat)
96       {
97         // step_context.deltaRsBegin returns an iterator to a flat series of PosTypes
98         // fastest in walkers then particles
99         auto delta_r_start = step_context.deltaRsBegin() + iat * num_walkers;
100         auto delta_r_end   = delta_r_start + num_walkers;
101 
102         if (use_drift)
103         {
104           twf_dispatcher.flex_evalGrad(walker_twfs, walker_elecs, iat, grads_now);
105           sft.drift_modifier.getDrifts(tauovermass, grads_now, drifts);
106 
107           std::transform(drifts.begin(), drifts.end(), delta_r_start, drifts.begin(),
108                          [sqrttau](const PosType& drift, const PosType& delta_r) {
109                            return drift + (sqrttau * delta_r);
110                          });
111         }
112         else
113         {
114           std::transform(delta_r_start, delta_r_end, drifts.begin(),
115                          [sqrttau](const PosType& delta_r) { return sqrttau * delta_r; });
116         }
117 
118         ps_dispatcher.flex_makeMove(walker_elecs, iat, drifts);
119 
120         // This is inelegant
121         if (use_drift)
122         {
123           twf_dispatcher.flex_calcRatioGrad(walker_twfs, walker_elecs, iat, ratios, grads_new);
124           std::transform(delta_r_start, delta_r_end, log_gf.begin(),
125                          [](const PosType& delta_r) { return mhalf * dot(delta_r, delta_r); });
126 
127           sft.drift_modifier.getDrifts(tauovermass, grads_new, drifts);
128 
129           std::transform(crowd.beginElectrons(), crowd.endElectrons(), drifts.begin(), drifts.begin(),
130                          [iat](const ParticleSet& elecs, const PosType& drift) {
131                            return elecs.R[iat] - elecs.activePos - drift;
132                          });
133 
134           std::transform(drifts.begin(), drifts.end(), log_gb.begin(),
135                          [oneover2tau](const PosType& drift) { return -oneover2tau * dot(drift, drift); });
136         }
137         else
138         {
139           twf_dispatcher.flex_calcRatio(walker_twfs, walker_elecs, iat, ratios);
140         }
141 
142         std::transform(ratios.begin(), ratios.end(), prob.begin(), [](auto ratio) { return std::norm(ratio); });
143 
144         isAccepted.clear();
145 
146         for (int i_accept = 0; i_accept < num_walkers; ++i_accept)
147           if (prob[i_accept] >= std::numeric_limits<RealType>::epsilon() &&
148               step_context.get_random_gen()() < prob[i_accept] * std::exp(log_gb[i_accept] - log_gf[i_accept]))
149           {
150             crowd.incAccept();
151             isAccepted.push_back(true);
152           }
153           else
154           {
155             crowd.incReject();
156             isAccepted.push_back(false);
157           }
158 
159         twf_dispatcher.flex_accept_rejectMove(walker_twfs, walker_elecs, iat, isAccepted, true);
160 
161         ps_dispatcher.flex_accept_rejectMove(walker_elecs, iat, isAccepted);
162       }
163     }
164     twf_dispatcher.flex_completeUpdates(walker_twfs);
165   }
166 
167   ps_dispatcher.flex_donePbyP(walker_elecs);
168   timers.movepbyp_timer.stop();
169 
170   timers.buffer_timer.start();
171   twf_dispatcher.flex_evaluateGL(walker_twfs, walker_elecs, recompute);
172 
173   assert(QMCDriverNew::checkLogAndGL(crowd));
174   timers.buffer_timer.stop();
175 
176   timers.hamiltonian_timer.start();
177   const RefVectorWithLeader<QMCHamiltonian> walker_hamiltonians(crowd.get_walker_hamiltonians()[0],
178                                                                 crowd.get_walker_hamiltonians());
179   std::vector<QMCHamiltonian::FullPrecRealType> local_energies(
180       ham_dispatcher.flex_evaluate(walker_hamiltonians, walker_elecs));
181   timers.hamiltonian_timer.stop();
182 
183   auto resetSigNLocalEnergy = [](MCPWalker& walker, TrialWaveFunction& twf, auto& local_energy) {
184     walker.resetProperty(twf.getLogPsi(), twf.getPhase(), local_energy);
185   };
186   for (int iw = 0; iw < crowd.size(); ++iw)
187     resetSigNLocalEnergy(walkers[iw], walker_twfs[iw], local_energies[iw]);
188 
189   // moved to be consistent with DMC
190   timers.collectables_timer.start();
191   auto evaluateNonPhysicalHamiltonianElements = [](QMCHamiltonian& ham, ParticleSet& pset, MCPWalker& walker) {
192     ham.auxHevaluate(pset, walker);
193   };
194   for (int iw = 0; iw < crowd.size(); ++iw)
195     evaluateNonPhysicalHamiltonianElements(walker_hamiltonians[iw], walker_elecs[iw], walkers[iw]);
196 
197   auto savePropertiesIntoWalker = [](QMCHamiltonian& ham, MCPWalker& walker) {
198     ham.saveProperty(walker.getPropertyBase());
199   };
200   for (int iw = 0; iw < crowd.size(); ++iw)
201     savePropertiesIntoWalker(walker_hamiltonians[iw], walkers[iw]);
202   timers.collectables_timer.stop();
203   // TODO:
204   //  check if all moves failed
205 }
206 
207 
208 /** Thread body for VMC step
209  *
210  */
runVMCStep(int crowd_id,const StateForThread & sft,DriverTimers & timers,std::vector<std::unique_ptr<ContextForSteps>> & context_for_steps,std::vector<std::unique_ptr<Crowd>> & crowds)211 void VMCBatched::runVMCStep(int crowd_id,
212                             const StateForThread& sft,
213                             DriverTimers& timers,
214                             std::vector<std::unique_ptr<ContextForSteps>>& context_for_steps,
215                             std::vector<std::unique_ptr<Crowd>>& crowds)
216 {
217   Crowd& crowd = *(crowds[crowd_id]);
218 
219   crowd.setRNGForHamiltonian(context_for_steps[crowd_id]->get_random_gen());
220 
221   int max_steps = sft.qmcdrv_input.get_max_steps();
222   // \todo delete
223   RealType cnorm = 1.0 / static_cast<RealType>(crowd.size());
224   IndexType step = sft.step;
225   // Are we entering the the last step of a block to recompute at?
226   bool recompute_this_step = (sft.is_recomputing_block && (step + 1) == max_steps);
227   advanceWalkers(sft, crowd, timers, *context_for_steps[crowd_id], recompute_this_step);
228   crowd.accumulate();
229 }
230 
process(xmlNodePtr node)231 void VMCBatched::process(xmlNodePtr node)
232 {
233   print_mem("VMCBatched before initialization", app_log());
234   // \todo get total walkers should be coming from VMCDriverInpu
235   try
236   {
237     QMCDriverNew::AdjustedWalkerCounts awc =
238         adjustGlobalWalkerCount(myComm->size(), myComm->rank(), qmcdriver_input_.get_total_walkers(),
239                                 qmcdriver_input_.get_walkers_per_rank(), 1.0, qmcdriver_input_.get_num_crowds());
240 
241     Base::startup(node, awc);
242   }
243   catch (const UniformCommunicateError& ue)
244   {
245     myComm->barrier_and_abort(ue.what());
246   }
247 }
248 
compute_samples_per_rank(const QMCDriverInput & qmcdriver_input,const IndexType local_walkers)249 int VMCBatched::compute_samples_per_rank(const QMCDriverInput& qmcdriver_input, const IndexType local_walkers)
250 {
251   int nblocks = qmcdriver_input.get_max_blocks();
252   int nsteps  = qmcdriver_input.get_max_steps();
253   return nblocks * nsteps * local_walkers;
254 }
255 
256 
257 /** Runs the actual VMC section
258  *
259  *  Dependent on base class state machine
260  *  Assumes state already updated from the following calls:
261  *  1. QMCDriverNew::setStatus
262  *  2. QMCDriverNew::putWalkers
263  *  3. QMCDriverNew::process
264  *
265  *  At the moment I don't care about 1st touch, prove it matters
266  *  If does consider giving more to the thread by value that should
267  *  end up thread local. (I think)
268  */
run()269 bool VMCBatched::run()
270 {
271   IndexType num_blocks = qmcdriver_input_.get_max_blocks();
272   //start the main estimator
273   estimator_manager_->startDriverRun();
274 
275   StateForThread vmc_state(qmcdriver_input_, vmcdriver_input_, *drift_modifier_, population_);
276 
277   LoopTimer<> vmc_loop;
278   RunTimeControl<> runtimeControl(run_time_manager, project_data_.getMaxCPUSeconds(), project_data_.getTitle(),
279                                   myComm->rank() == 0);
280 
281   { // walker initialization
282     ScopedTimer local_timer(timers_.init_walkers_timer);
283     ParallelExecutor<> section_start_task;
284     section_start_task(crowds_.size(), initialLogEvaluation, std::ref(crowds_), std::ref(step_contexts_));
285   }
286 
287   print_mem("VMCBatched after initialLogEvaluation", app_summary());
288 
289   ParallelExecutor<> crowd_task;
290 
291   if (qmcdriver_input_.get_warmup_steps() > 0)
292   {
293     // Run warm-up steps
294     auto runWarmupStep = [](int crowd_id, StateForThread& sft, DriverTimers& timers,
295                             UPtrVector<ContextForSteps>& context_for_steps, UPtrVector<Crowd>& crowds) {
296       Crowd& crowd = *(crowds[crowd_id]);
297       advanceWalkers(sft, crowd, timers, *context_for_steps[crowd_id], false);
298     };
299 
300     for (int step = 0; step < qmcdriver_input_.get_warmup_steps(); ++step)
301     {
302       ScopedTimer local_timer(timers_.run_steps_timer);
303       crowd_task(crowds_.size(), runWarmupStep, vmc_state, std::ref(timers_), std::ref(step_contexts_),
304                  std::ref(crowds_));
305     }
306 
307     app_log() << "Warm-up is completed!" << std::endl;
308     print_mem("VMCBatched after Warmup", app_log());
309   }
310 
311   for (int block = 0; block < num_blocks; ++block)
312   {
313     vmc_loop.start();
314     vmc_state.recalculate_properties_period =
315         (qmc_driver_mode_[QMC_UPDATE_MODE]) ? qmcdriver_input_.get_recalculate_properties_period() : 0;
316     vmc_state.is_recomputing_block = qmcdriver_input_.get_blocks_between_recompute()
317         ? (1 + block) % qmcdriver_input_.get_blocks_between_recompute() == 0
318         : false;
319 
320     estimator_manager_->startBlock(qmcdriver_input_.get_max_steps());
321 
322     for (auto& crowd : crowds_)
323       crowd->startBlock(qmcdriver_input_.get_max_steps());
324     for (int step = 0; step < qmcdriver_input_.get_max_steps(); ++step)
325     {
326       ScopedTimer local_timer(timers_.run_steps_timer);
327       vmc_state.step = step;
328       crowd_task(crowds_.size(), runVMCStep, vmc_state, timers_, std::ref(step_contexts_), std::ref(crowds_));
329 
330       if (collect_samples_)
331       {
332         const auto& elec_psets = population_.get_elec_particle_sets();
333         for (const auto& walker : elec_psets)
334         {
335           samples_.appendSample(MCSample(*walker));
336         }
337       }
338     }
339     print_mem("VMCBatched after a block", app_debug_stream());
340     endBlock();
341     vmc_loop.stop();
342 
343     bool stop_requested = false;
344     // Rank 0 decides whether the time limit was reached
345     if (!myComm->rank())
346       stop_requested = runtimeControl.checkStop(vmc_loop);
347     myComm->bcast(stop_requested);
348 
349     if (stop_requested)
350     {
351       if (!myComm->rank())
352         app_log() << runtimeControl.generateStopMessage("VMCBatched", block);
353       run_time_manager.markStop();
354       break;
355     }
356   }
357   // This is confusing logic from VMC.cpp want this functionality write documentation of this
358   // and clean it up
359   // bool wrotesamples = qmcdriver_input_.get_dump_config();
360   // if (qmcdriver_input_.get_dump_config())
361   // {
362   //wrotesamples = W.dumpEnsemble(wClones, wOut, myComm->size(), nBlocks);
363   //if (wrotesamples)
364   //  app_log() << "  samples are written to the config.h5" << std::endl;
365   // }
366 
367   // second argument was !wrotesample so if W.dumpEnsemble returns false or
368   // dump_config is false from input then dump_walkers
369   {
370     std::ostringstream o;
371     FullPrecRealType ene, var;
372     estimator_manager_->getApproximateEnergyVariance(ene, var);
373     o << "====================================================";
374     o << "\n  End of a VMC block";
375     o << "\n    QMC counter        = " << project_data_.getSeriesIndex();
376     o << "\n    time step          = " << qmcdriver_input_.get_tau();
377     o << "\n    reference energy   = " << ene;
378     o << "\n    reference variance = " << var;
379     o << "\n====================================================";
380     app_log() << o.str() << std::endl;
381   }
382 
383   print_mem("VMCBatched ends", app_log());
384 
385   estimator_manager_->stopDriverRun();
386 
387   return finalize(num_blocks, true);
388 }
389 
enable_sample_collection()390 void VMCBatched::enable_sample_collection()
391 {
392   int samples = compute_samples_per_rank(qmcdriver_input_, population_.get_num_local_walkers());
393   samples_.setMaxSamples(samples);
394   collect_samples_ = true;
395 
396   int total_samples = samples * population_.get_num_ranks();
397   app_log() << "VMCBatched Driver collecting samples, samples per rank = " << samples << '\n';
398   app_log() << "                                      total samples    = " << total_samples << '\n';
399 }
400 
401 } // namespace qmcplusplus
402