1 //////////////////////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source License.
3 // See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2020 QMCPACK developers.
6 //
7 // File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
8 //
9 // File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
10 //////////////////////////////////////////////////////////////////////////////////////
11 
12 
13 /** @file SplineC2COMPTarget.h
14  *
15  * class to handle complex splines to complex orbitals with splines of arbitrary precision
16  * splines storage and computation is offloaded to accelerators using OpenMP target
17  */
18 #ifndef QMCPLUSPLUS_SPLINE_C2C_OMPTARGET_H
19 #define QMCPLUSPLUS_SPLINE_C2C_OMPTARGET_H
20 
21 #include <memory>
22 #include "QMCWaveFunctions/BsplineFactory/BsplineSet.h"
23 #include "OhmmsSoA/VectorSoaContainer.h"
24 #include "spline2/MultiBspline.hpp"
25 #include "OMPTarget/OMPallocator.hpp"
26 #include "Platforms/PinnedAllocator.h"
27 #include "Utilities/FairDivide.h"
28 #include "Utilities/TimerManager.h"
29 #include "SplineOMPTargetMultiWalkerMem.h"
30 
31 namespace qmcplusplus
32 {
33 /** class to match std::complex<ST> spline with BsplineSet::ValueType (complex) SPOs
34  * @tparam ST precision of spline
35  *
36  * Requires temporage storage and multiplication of phase vectors
37  * Internal storage use double sized arrays of ST type, aligned and padded.
38  */
39 template<typename ST>
40 class SplineC2COMPTarget : public BsplineSet
41 {
42 public:
43   template<typename DT>
44   using OffloadAllocator = OMPallocator<DT, aligned_allocator<DT>>;
45   template<typename DT>
46   using OffloadPinnedAllocator = OMPallocator<DT, PinnedAlignedAllocator<DT>>;
47 
48   using SplineType       = typename bspline_traits<ST, 3>::SplineType;
49   using BCType           = typename bspline_traits<ST, 3>::BCType;
50   using DataType         = ST;
51   using PointType        = TinyVector<ST, 3>;
52   using SingleSplineType = UBspline_3d_d;
53   // types for evaluation results
54   using ComplexT = typename BsplineSet::ValueType;
55   using BsplineSet::GGGVector_t;
56   using BsplineSet::GradVector_t;
57   using BsplineSet::HessVector_t;
58   using BsplineSet::ValueVector_t;
59 
60   using vContainer_type  = Vector<ST, aligned_allocator<ST>>;
61   using gContainer_type  = VectorSoaContainer<ST, 3>;
62   using hContainer_type  = VectorSoaContainer<ST, 6>;
63   using ghContainer_type = VectorSoaContainer<ST, 10>;
64 
65   template<typename DT>
66   using OffloadVector = Vector<DT, OffloadAllocator<DT>>;
67   template<typename DT>
68   using OffloadPosVector = VectorSoaContainer<DT, 3, OffloadAllocator<DT>>;
69 
70 private:
71   /// timer for offload portion
72   NewTimer& offload_timer_;
73   ///primitive cell
74   CrystalLattice<ST, 3> PrimLattice;
75   ///\f$GGt=G^t G \f$, transformation for tensor in LatticeUnit to CartesianUnit, e.g. Hessian
76   Tensor<ST, 3> GGt;
77   ///multi bspline set
78   std::shared_ptr<MultiBspline<ST, OffloadAllocator<ST>, OffloadAllocator<SplineType>>> SplineInst;
79 
80   std::shared_ptr<OffloadVector<ST>> mKK;
81   std::shared_ptr<OffloadPosVector<ST>> myKcart;
82   std::shared_ptr<OffloadVector<ST>> GGt_offload;
83   std::shared_ptr<OffloadVector<ST>> PrimLattice_G_offload;
84 
85   std::unique_ptr<SplineOMPTargetMultiWalkerMem<ST, ComplexT>> mw_mem_;
86 
87   ///team private ratios for reduction, numVP x numTeams
88   Matrix<ComplexT, OffloadPinnedAllocator<ComplexT>> ratios_private;
89   ///offload scratch space, dynamically resized to the maximal need
90   Vector<ST, OffloadPinnedAllocator<ST>> offload_scratch;
91   ///result scratch space, dynamically resized to the maximal need
92   Vector<ComplexT, OffloadPinnedAllocator<ComplexT>> results_scratch;
93   ///psiinv and position scratch space, used to avoid allocation on the fly and faster transfer
94   Vector<ComplexT, OffloadPinnedAllocator<ComplexT>> psiinv_pos_copy;
95   ///position scratch space, used to avoid allocation on the fly and faster transfer
96   Vector<ST, OffloadPinnedAllocator<ST>> multi_pos_copy;
97 
98   void evaluateVGLMultiPos(const Vector<ST, OffloadPinnedAllocator<ST>>& multi_pos_copy,
99                            Vector<ST, OffloadPinnedAllocator<ST>>& offload_scratch,
100                            Vector<ComplexT, OffloadPinnedAllocator<ComplexT>>& results_scratch,
101                            const RefVector<ValueVector_t>& psi_v_list,
102                            const RefVector<GradVector_t>& dpsi_v_list,
103                            const RefVector<ValueVector_t>& d2psi_v_list) const;
104 
105 protected:
106   /// intermediate result vectors
107   vContainer_type myV;
108   vContainer_type myL;
109   gContainer_type myG;
110   hContainer_type myH;
111   ghContainer_type mygH;
112 
113 public:
SplineC2COMPTarget()114   SplineC2COMPTarget()
115       : BsplineSet(true),
116         offload_timer_(*timer_manager.createTimer("SplineC2COMPTarget::offload", timer_level_fine)),
117         GGt_offload(std::make_shared<OffloadVector<ST>>(9)),
118         PrimLattice_G_offload(std::make_shared<OffloadVector<ST>>(9))
119   {
120     is_complex = true;
121     className  = "SplineC2COMPTarget";
122     KeyWord    = "SplineC2C";
123   }
124 
SplineC2COMPTarget(const SplineC2COMPTarget & in)125   SplineC2COMPTarget(const SplineC2COMPTarget& in)
126       : BsplineSet(in),
127         offload_timer_(in.offload_timer_),
128         PrimLattice(in.PrimLattice),
129         GGt(in.GGt),
130         SplineInst(in.SplineInst),
131         mKK(in.mKK),
132         myKcart(in.myKcart),
133         GGt_offload(in.GGt_offload),
134         PrimLattice_G_offload(in.PrimLattice_G_offload),
135         myV(in.myV),
136         myL(in.myL),
137         myG(in.myG),
138         myH(in.myH),
139         mygH(in.mygH)
140   {}
141 
createResource(ResourceCollection & collection)142   void createResource(ResourceCollection& collection) const override
143   {
144     auto resource_index = collection.addResource(std::make_unique<SplineOMPTargetMultiWalkerMem<ST, ComplexT>>());
145   }
146 
acquireResource(ResourceCollection & collection)147   void acquireResource(ResourceCollection& collection) override
148   {
149     auto res_ptr = dynamic_cast<SplineOMPTargetMultiWalkerMem<ST, ComplexT>*>(collection.lendResource().release());
150     if (!res_ptr)
151       throw std::runtime_error("SplineC2COMPTarget::acquireResource dynamic_cast failed");
152     mw_mem_.reset(res_ptr);
153   }
154 
releaseResource(ResourceCollection & collection)155   void releaseResource(ResourceCollection& collection) override { collection.takebackResource(std::move(mw_mem_)); }
156 
makeClone()157   virtual SPOSet* makeClone() const override { return new SplineC2COMPTarget(*this); }
158 
resizeStorage(size_t n,size_t nvals)159   inline void resizeStorage(size_t n, size_t nvals)
160   {
161     init_base(n);
162     size_t npad = getAlignedSize<ST>(2 * n);
163     myV.resize(npad);
164     myG.resize(npad);
165     myL.resize(npad);
166     myH.resize(npad);
167     mygH.resize(npad);
168   }
169 
bcast_tables(Communicate * comm)170   void bcast_tables(Communicate* comm) { chunked_bcast(comm, SplineInst->getSplinePtr()); }
171 
gather_tables(Communicate * comm)172   void gather_tables(Communicate* comm)
173   {
174     if (comm->size() == 1)
175       return;
176     const int Nbands      = kPoints.size();
177     const int Nbandgroups = comm->size();
178     offset.resize(Nbandgroups + 1, 0);
179     FairDivideLow(Nbands, Nbandgroups, offset);
180 
181     for (size_t ib = 0; ib < offset.size(); ib++)
182       offset[ib] *= 2;
183     gatherv(comm, SplineInst->getSplinePtr(), SplineInst->getSplinePtr()->z_stride, offset);
184   }
185 
186   template<typename GT, typename BCT>
create_spline(GT & xyz_g,BCT & xyz_bc)187   void create_spline(GT& xyz_g, BCT& xyz_bc)
188   {
189     resize_kpoints();
190     SplineInst = std::make_shared<MultiBspline<ST, OffloadAllocator<ST>, OffloadAllocator<SplineType>>>();
191     SplineInst->create(xyz_g, xyz_bc, myV.size());
192 
193     app_log() << "MEMORY " << SplineInst->sizeInByte() / (1 << 20) << " MB allocated "
194               << "for the coefficients in 3D spline orbital representation" << std::endl;
195   }
196 
197   /// this routine can not be called from threaded region
finalizeConstruction()198   void finalizeConstruction() override
199   {
200     // map the SplineInst->getSplinePtr() structure to GPU
201     auto* MultiSpline    = SplineInst->getSplinePtr();
202     auto* restrict coefs = MultiSpline->coefs;
203     // attach pointers on the device to achieve deep copy
204     PRAGMA_OFFLOAD("omp target map(always, to: MultiSpline[0:1], coefs[0:MultiSpline->coefs_size])")
205     {
206       MultiSpline->coefs = coefs;
207     }
208 
209     // transfer static data to GPU
210     auto* mKK_ptr = mKK->data();
211     PRAGMA_OFFLOAD("omp target update to(mKK_ptr[0:mKK->size()])")
212     auto* myKcart_ptr = myKcart->data();
213     PRAGMA_OFFLOAD("omp target update to(myKcart_ptr[0:myKcart->capacity()*3])")
214     for (size_t i = 0; i < 9; i++)
215     {
216       (*GGt_offload)[i]           = GGt[i];
217       (*PrimLattice_G_offload)[i] = PrimLattice.G[i];
218     }
219     auto* PrimLattice_G_ptr = PrimLattice_G_offload->data();
220     PRAGMA_OFFLOAD("omp target update to(PrimLattice_G_ptr[0:9])")
221     auto* GGt_ptr = GGt_offload->data();
222     PRAGMA_OFFLOAD("omp target update to(GGt_ptr[0:9])")
223   }
224 
flush_zero()225   inline void flush_zero() { SplineInst->flush_zero(); }
226 
227   /** remap kPoints to pack the double copy */
resize_kpoints()228   inline void resize_kpoints()
229   {
230     const size_t nk = kPoints.size();
231     mKK             = std::make_shared<OffloadVector<ST>>(nk);
232     myKcart         = std::make_shared<OffloadPosVector<ST>>(nk);
233     for (size_t i = 0; i < nk; ++i)
234     {
235       (*mKK)[i]     = -dot(kPoints[i], kPoints[i]);
236       (*myKcart)(i) = kPoints[i];
237     }
238   }
239 
240   void set_spline(SingleSplineType* spline_r, SingleSplineType* spline_i, int twist, int ispline, int level);
241 
242   bool read_splines(hdf_archive& h5f);
243 
244   bool write_splines(hdf_archive& h5f);
245 
246   void assign_v(const PointType& r, const vContainer_type& myV, ValueVector_t& psi, int first, int last) const;
247 
248   virtual void evaluateValue(const ParticleSet& P, const int iat, ValueVector_t& psi) override;
249 
250   virtual void evaluateDetRatios(const VirtualParticleSet& VP,
251                                  ValueVector_t& psi,
252                                  const ValueVector_t& psiinv,
253                                  std::vector<ValueType>& ratios) override;
254 
255   virtual void mw_evaluateDetRatios(const RefVectorWithLeader<SPOSet>& spo_list,
256                                     const RefVector<const VirtualParticleSet>& vp_list,
257                                     const RefVector<ValueVector_t>& psi_list,
258                                     const std::vector<const ValueType*>& invRow_ptr_list,
259                                     std::vector<std::vector<ValueType>>& ratios_list) const override;
260 
261   /** assign_vgl_from_l can be used when myL is precomputed and myV,myG,myL in cartesian
262    */
263   void assign_vgl_from_l(const PointType& r, ValueVector_t& psi, GradVector_t& dpsi, ValueVector_t& d2psi);
264 
265   virtual void evaluateVGL(const ParticleSet& P,
266                            const int iat,
267                            ValueVector_t& psi,
268                            GradVector_t& dpsi,
269                            ValueVector_t& d2psi) override;
270 
271   virtual void mw_evaluateVGL(const RefVectorWithLeader<SPOSet>& sa_list,
272                               const RefVectorWithLeader<ParticleSet>& P_list,
273                               int iat,
274                               const RefVector<ValueVector_t>& psi_v_list,
275                               const RefVector<GradVector_t>& dpsi_v_list,
276                               const RefVector<ValueVector_t>& d2psi_v_list) const override;
277 
278   virtual void mw_evaluateVGLandDetRatioGrads(const RefVectorWithLeader<SPOSet>& spo_list,
279                                               const RefVectorWithLeader<ParticleSet>& P_list,
280                                               int iat,
281                                               const std::vector<const ValueType*>& invRow_ptr_list,
282                                               VGLVector_t& phi_vgl_v,
283                                               std::vector<ValueType>& ratios,
284                                               std::vector<GradType>& grads) const override;
285 
286   void assign_vgh(const PointType& r,
287                   ValueVector_t& psi,
288                   GradVector_t& dpsi,
289                   HessVector_t& grad_grad_psi,
290                   int first,
291                   int last) const;
292 
293   virtual void evaluateVGH(const ParticleSet& P,
294                            const int iat,
295                            ValueVector_t& psi,
296                            GradVector_t& dpsi,
297                            HessVector_t& grad_grad_psi) override;
298 
299   void assign_vghgh(const PointType& r,
300                     ValueVector_t& psi,
301                     GradVector_t& dpsi,
302                     HessVector_t& grad_grad_psi,
303                     GGGVector_t& grad_grad_grad_psi,
304                     int first = 0,
305                     int last  = -1) const;
306 
307   virtual void evaluateVGHGH(const ParticleSet& P,
308                              const int iat,
309                              ValueVector_t& psi,
310                              GradVector_t& dpsi,
311                              HessVector_t& grad_grad_psi,
312                              GGGVector_t& grad_grad_grad_psi) override;
313 
314   virtual void evaluate_notranspose(const ParticleSet& P,
315                                     int first,
316                                     int last,
317                                     ValueMatrix_t& logdet,
318                                     GradMatrix_t& dlogdet,
319                                     ValueMatrix_t& d2logdet) override;
320 
321   template<class BSPLINESPO>
322   friend struct SplineSetReader;
323   friend struct BsplineReaderBase;
324 };
325 
326 extern template class SplineC2COMPTarget<float>;
327 extern template class SplineC2COMPTarget<double>;
328 
329 } // namespace qmcplusplus
330 #endif
331