1 //////////////////////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source License.
3 // See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2020 QMCPACK developers.
6 //
7 // File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
8 //
9 // File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
10 //////////////////////////////////////////////////////////////////////////////////////
11 
12 
13 #include "SplineC2COMPTarget.h"
14 #include "spline2/MultiBsplineEval.hpp"
15 #include "spline2/MultiBsplineEval_OMPoffload.hpp"
16 #include "QMCWaveFunctions/BsplineFactory/contraction_helper.hpp"
17 #include "Platforms/OMPTarget/ompReduction.hpp"
18 #include "config/stdlib/math.hpp"
19 
20 namespace qmcplusplus
21 {
22 namespace C2C
23 {
24 template<typename ST, typename TT>
assign_v(ST x,ST y,ST z,TT * restrict results_scratch_ptr,size_t orb_size,const ST * restrict offload_scratch_ptr,const ST * restrict myKcart_ptr,size_t myKcart_padded_size,size_t first_spo,int index)25 inline void assign_v(ST x,
26                      ST y,
27                      ST z,
28                      TT* restrict results_scratch_ptr,
29                      size_t orb_size,
30                      const ST* restrict offload_scratch_ptr,
31                      const ST* restrict myKcart_ptr,
32                      size_t myKcart_padded_size,
33                      size_t first_spo,
34                      int index)
35 {
36   const ST* restrict kx = myKcart_ptr;
37   const ST* restrict ky = myKcart_ptr + myKcart_padded_size;
38   const ST* restrict kz = myKcart_ptr + myKcart_padded_size * 2;
39 
40   const ST* restrict val = offload_scratch_ptr;
41   TT* restrict psi       = results_scratch_ptr;
42 
43   //phase
44   ST s, c, p = -(x * kx[index] + y * ky[index] + z * kz[index]);
45   qmcplusplus::sincos(p, &s, &c);
46 
47   const ST val_r         = val[index * 2];
48   const ST val_i         = val[index * 2 + 1];
49   psi[first_spo + index] = TT(val_r * c - val_i * s, val_i * c + val_r * s);
50 }
51 
52 /** assign_vgl
53    */
54 template<typename ST, typename TT>
assign_vgl(ST x,ST y,ST z,TT * restrict results_scratch_ptr,const ST * mKK_ptr,size_t orb_size,const ST * restrict offload_scratch_ptr,size_t spline_padded_size,const ST symGGt[6],const ST G[9],const ST * myKcart_ptr,size_t myKcart_padded_size,size_t first_spo,int index)55 inline void assign_vgl(ST x,
56                        ST y,
57                        ST z,
58                        TT* restrict results_scratch_ptr,
59                        const ST* mKK_ptr,
60                        size_t orb_size,
61                        const ST* restrict offload_scratch_ptr,
62                        size_t spline_padded_size,
63                        const ST symGGt[6],
64                        const ST G[9],
65                        const ST* myKcart_ptr,
66                        size_t myKcart_padded_size,
67                        size_t first_spo,
68                        int index)
69 {
70   constexpr ST two(2);
71   const ST &g00 = G[0], &g01 = G[1], &g02 = G[2], &g10 = G[3], &g11 = G[4], &g12 = G[5], &g20 = G[6], &g21 = G[7],
72            &g22 = G[8];
73 
74   const ST* restrict k0 = myKcart_ptr;
75   const ST* restrict k1 = myKcart_ptr + myKcart_padded_size;
76   const ST* restrict k2 = myKcart_ptr + myKcart_padded_size * 2;
77 
78   const ST* restrict val = offload_scratch_ptr;
79   const ST* restrict g0  = offload_scratch_ptr + spline_padded_size;
80   const ST* restrict g1  = offload_scratch_ptr + spline_padded_size * 2;
81   const ST* restrict g2  = offload_scratch_ptr + spline_padded_size * 3;
82   const ST* restrict h00 = offload_scratch_ptr + spline_padded_size * 4;
83   const ST* restrict h01 = offload_scratch_ptr + spline_padded_size * 5;
84   const ST* restrict h02 = offload_scratch_ptr + spline_padded_size * 6;
85   const ST* restrict h11 = offload_scratch_ptr + spline_padded_size * 7;
86   const ST* restrict h12 = offload_scratch_ptr + spline_padded_size * 8;
87   const ST* restrict h22 = offload_scratch_ptr + spline_padded_size * 9;
88 
89   TT* restrict psi   = results_scratch_ptr;
90   TT* restrict dpsi  = results_scratch_ptr + orb_size;
91   TT* restrict d2psi = results_scratch_ptr + orb_size * 4;
92 
93   const size_t jr = index << 1;
94   const size_t ji = jr + 1;
95 
96   const ST kX    = k0[index];
97   const ST kY    = k1[index];
98   const ST kZ    = k2[index];
99   const ST val_r = val[jr];
100   const ST val_i = val[ji];
101 
102   //phase
103   ST s, c, p = -(x * kX + y * kY + z * kZ);
104   qmcplusplus::sincos(p, &s, &c);
105 
106   //dot(PrimLattice.G,myG[j])
107   const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
108   const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
109   const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
110 
111   const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
112   const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
113   const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
114 
115   // \f$\nabla \psi_r + {\bf k}\psi_i\f$
116   const ST gX_r = dX_r + val_i * kX;
117   const ST gY_r = dY_r + val_i * kY;
118   const ST gZ_r = dZ_r + val_i * kZ;
119   const ST gX_i = dX_i - val_r * kX;
120   const ST gY_i = dY_i - val_r * kY;
121   const ST gZ_i = dZ_i - val_r * kZ;
122 
123   const ST lcart_r = SymTrace(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], symGGt);
124   const ST lcart_i = SymTrace(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], symGGt);
125   const ST lap_r   = lcart_r + mKK_ptr[index] * val_r + two * (kX * dX_i + kY * dY_i + kZ * dZ_i);
126   const ST lap_i   = lcart_i + mKK_ptr[index] * val_i - two * (kX * dX_r + kY * dY_r + kZ * dZ_r);
127 
128   const size_t psiIndex  = first_spo + index;
129   psi[psiIndex]          = TT(c * val_r - s * val_i, c * val_i + s * val_r);
130   d2psi[psiIndex]        = TT(c * lap_r - s * lap_i, c * lap_i + s * lap_r);
131   dpsi[psiIndex * 3]     = TT(c * gX_r - s * gX_i, c * gX_i + s * gX_r);
132   dpsi[psiIndex * 3 + 1] = TT(c * gY_r - s * gY_i, c * gY_i + s * gY_r);
133   dpsi[psiIndex * 3 + 2] = TT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r);
134 }
135 } // namespace C2C
136 
137 template<typename ST>
set_spline(SingleSplineType * spline_r,SingleSplineType * spline_i,int twist,int ispline,int level)138 inline void SplineC2COMPTarget<ST>::set_spline(SingleSplineType* spline_r,
139                                                SingleSplineType* spline_i,
140                                                int twist,
141                                                int ispline,
142                                                int level)
143 {
144   SplineInst->copy_spline(spline_r, 2 * ispline);
145   SplineInst->copy_spline(spline_i, 2 * ispline + 1);
146 }
147 
148 template<typename ST>
read_splines(hdf_archive & h5f)149 bool SplineC2COMPTarget<ST>::read_splines(hdf_archive& h5f)
150 {
151   std::ostringstream o;
152   o << "spline_" << MyIndex;
153   einspline_engine<SplineType> bigtable(SplineInst->getSplinePtr());
154   return h5f.readEntry(bigtable, o.str().c_str()); //"spline_0");
155 }
156 
157 template<typename ST>
write_splines(hdf_archive & h5f)158 bool SplineC2COMPTarget<ST>::write_splines(hdf_archive& h5f)
159 {
160   std::ostringstream o;
161   o << "spline_" << MyIndex;
162   einspline_engine<SplineType> bigtable(SplineInst->getSplinePtr());
163   return h5f.writeEntry(bigtable, o.str().c_str()); //"spline_0");
164 }
165 
166 template<typename ST>
assign_v(const PointType & r,const vContainer_type & myV,ValueVector_t & psi,int first,int last) const167 inline void SplineC2COMPTarget<ST>::assign_v(const PointType& r,
168                                              const vContainer_type& myV,
169                                              ValueVector_t& psi,
170                                              int first,
171                                              int last) const
172 {
173   // protect last
174   last = last > kPoints.size() ? kPoints.size() : last;
175 
176   const ST x = r[0], y = r[1], z = r[2];
177   const ST* restrict kx = myKcart->data(0);
178   const ST* restrict ky = myKcart->data(1);
179   const ST* restrict kz = myKcart->data(2);
180 #pragma omp simd
181   for (size_t j = first; j < last; ++j)
182   {
183     ST s, c;
184     const ST val_r = myV[2 * j];
185     const ST val_i = myV[2 * j + 1];
186     qmcplusplus::sincos(-(x * kx[j] + y * ky[j] + z * kz[j]), &s, &c);
187     psi[j + first_spo] = ComplexT(val_r * c - val_i * s, val_i * c + val_r * s);
188   }
189 }
190 
191 template<typename ST>
evaluateValue(const ParticleSet & P,const int iat,ValueVector_t & psi)192 void SplineC2COMPTarget<ST>::evaluateValue(const ParticleSet& P, const int iat, ValueVector_t& psi)
193 {
194   const PointType& r = P.activeR(iat);
195   PointType ru(PrimLattice.toUnit_floor(r));
196 
197 #pragma omp parallel
198   {
199     int first, last;
200     FairDivideAligned(myV.size(), getAlignment<ST>(), omp_get_num_threads(), omp_get_thread_num(), first, last);
201 
202     spline2::evaluate3d(SplineInst->getSplinePtr(), ru, myV, first, last);
203     assign_v(r, myV, psi, first / 2, last / 2);
204   }
205 }
206 
207 template<typename ST>
evaluateDetRatios(const VirtualParticleSet & VP,ValueVector_t & psi,const ValueVector_t & psiinv,std::vector<ValueType> & ratios)208 void SplineC2COMPTarget<ST>::evaluateDetRatios(const VirtualParticleSet& VP,
209                                                ValueVector_t& psi,
210                                                const ValueVector_t& psiinv,
211                                                std::vector<ValueType>& ratios)
212 {
213   const int nVP = VP.getTotalNum();
214   psiinv_pos_copy.resize(psiinv.size() + nVP * 3);
215 
216   // stage psiinv to psiinv_pos_copy
217   std::copy_n(psiinv.data(), psiinv.size(), psiinv_pos_copy.data());
218 
219   // pack particle positions
220   auto* restrict pos_scratch = reinterpret_cast<RealType*>(psiinv_pos_copy.data() + psiinv.size());
221   for (int iat = 0; iat < nVP; ++iat)
222   {
223     const PointType& r = VP.activeR(iat);
224     PointType ru(PrimLattice.toUnit_floor(r));
225     pos_scratch[iat * 6]     = r[0];
226     pos_scratch[iat * 6 + 1] = r[1];
227     pos_scratch[iat * 6 + 2] = r[2];
228     pos_scratch[iat * 6 + 3] = ru[0];
229     pos_scratch[iat * 6 + 4] = ru[1];
230     pos_scratch[iat * 6 + 5] = ru[2];
231   }
232 
233   const int ChunkSizePerTeam = 128;
234   const int NumTeams         = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
235   ratios_private.resize(nVP, NumTeams);
236   const auto padded_size = myV.size();
237   offload_scratch.resize(padded_size * nVP);
238   const auto orb_size = psiinv.size();
239   results_scratch.resize(orb_size * nVP);
240 
241   // Ye: need to extract sizes and pointers before entering target region
242   const auto* spline_ptr         = SplineInst->getSplinePtr();
243   auto* offload_scratch_ptr      = offload_scratch.data();
244   auto* results_scratch_ptr      = results_scratch.data();
245   const auto myKcart_padded_size = myKcart->capacity();
246   auto* myKcart_ptr              = myKcart->data();
247   auto* psiinv_ptr               = psiinv_pos_copy.data();
248   auto* ratios_private_ptr       = ratios_private.data();
249   const size_t first_spo_local   = first_spo;
250 
251   {
252     ScopedTimer offload(offload_timer_);
253     PRAGMA_OFFLOAD("omp target teams distribute collapse(2) num_teams(NumTeams*nVP) \
254                 map(always, to: psiinv_ptr[0:psiinv_pos_copy.size()]) \
255                 map(always, from: ratios_private_ptr[0:NumTeams*nVP])")
256     for (int iat = 0; iat < nVP; iat++)
257       for (int team_id = 0; team_id < NumTeams; team_id++)
258       {
259         const int first = ChunkSizePerTeam * team_id;
260         const int last  = (first + ChunkSizePerTeam) > padded_size ? padded_size : first + ChunkSizePerTeam;
261         auto* restrict offload_scratch_iat_ptr = offload_scratch_ptr + padded_size * iat;
262         auto* restrict psi_iat_ptr             = results_scratch_ptr + orb_size * iat;
263         auto* restrict pos_scratch             = reinterpret_cast<RealType*>(psiinv_ptr + orb_size);
264 
265         int ix, iy, iz;
266         ST a[4], b[4], c[4];
267         spline2::computeLocationAndFractional(spline_ptr, ST(pos_scratch[iat * 6 + 3]), ST(pos_scratch[iat * 6 + 4]),
268                                               ST(pos_scratch[iat * 6 + 5]), ix, iy, iz, a, b, c);
269 
270         PRAGMA_OFFLOAD("omp parallel for")
271         for (int index = 0; index < last - first; index++)
272           spline2offload::evaluate_v_impl_v2(spline_ptr, ix, iy, iz, a, b, c, offload_scratch_iat_ptr + first, first,
273                                              index);
274         const int last_index = last / 2 < orb_size ? last / 2 : orb_size;
275         PRAGMA_OFFLOAD("omp parallel for")
276         for (int index = first / 2; index < last_index; index++)
277           C2C::assign_v(ST(pos_scratch[iat * 6]), ST(pos_scratch[iat * 6 + 1]), ST(pos_scratch[iat * 6 + 2]),
278                         psi_iat_ptr, orb_size, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size,
279                         first_spo_local, index);
280 
281         const int first_cplx = first / 2;
282         const int last_cplx  = orb_size < last / 2 ? orb_size : last / 2;
283         ComplexT sum(0);
284         PRAGMA_OFFLOAD("omp parallel for simd reduction(+:sum)")
285         for (int i = first_cplx; i < last_cplx; i++)
286           sum += psi_iat_ptr[i] * psiinv_ptr[i];
287         ratios_private_ptr[iat * NumTeams + team_id] = sum;
288       }
289   }
290 
291   // do the reduction manually
292   for (int iat = 0; iat < nVP; ++iat)
293   {
294     ratios[iat] = ComplexT(0);
295     for (int tid = 0; tid < NumTeams; tid++)
296       ratios[iat] += ratios_private[iat][tid];
297   }
298 }
299 
300 template<typename ST>
mw_evaluateDetRatios(const RefVectorWithLeader<SPOSet> & spo_list,const RefVector<const VirtualParticleSet> & vp_list,const RefVector<ValueVector_t> & psi_list,const std::vector<const ValueType * > & invRow_ptr_list,std::vector<std::vector<ValueType>> & ratios_list) const301 void SplineC2COMPTarget<ST>::mw_evaluateDetRatios(const RefVectorWithLeader<SPOSet>& spo_list,
302                                                   const RefVector<const VirtualParticleSet>& vp_list,
303                                                   const RefVector<ValueVector_t>& psi_list,
304                                                   const std::vector<const ValueType*>& invRow_ptr_list,
305                                                   std::vector<std::vector<ValueType>>& ratios_list) const
306 {
307   assert(this == &spo_list.getLeader());
308   auto& phi_leader            = spo_list.getCastedLeader<SplineC2COMPTarget<ST>>();
309   auto& mw_mem                = *phi_leader.mw_mem_;
310   auto& det_ratios_buffer_H2D = mw_mem.det_ratios_buffer_H2D;
311   auto& mw_ratios_private     = mw_mem.mw_ratios_private;
312   auto& mw_offload_scratch    = mw_mem.mw_offload_scratch;
313   auto& mw_results_scratch    = mw_mem.mw_results_scratch;
314   const size_t nw             = spo_list.size();
315   const size_t orb_size       = phi_leader.size();
316 
317   size_t mw_nVP = 0;
318   for (const VirtualParticleSet& VP : vp_list)
319     mw_nVP += VP.getTotalNum();
320 
321   const size_t packed_size = nw * sizeof(ValueType*) + mw_nVP * (6 * sizeof(ST) + sizeof(int));
322   det_ratios_buffer_H2D.resize(packed_size);
323 
324   // pack invRow_ptr_list to det_ratios_buffer_H2D
325   Vector<const ValueType*> ptr_buffer(reinterpret_cast<const ValueType**>(det_ratios_buffer_H2D.data()), nw);
326   for (size_t iw = 0; iw < nw; iw++)
327     ptr_buffer[iw] = invRow_ptr_list[iw];
328 
329   // pack particle positions
330   auto* pos_ptr = reinterpret_cast<ST*>(det_ratios_buffer_H2D.data() + nw * sizeof(ValueType*));
331   auto* ref_id_ptr =
332       reinterpret_cast<int*>(det_ratios_buffer_H2D.data() + nw * sizeof(ValueType*) + mw_nVP * 6 * sizeof(ST));
333   size_t iVP = 0;
334   for (size_t iw = 0; iw < nw; iw++)
335   {
336     const VirtualParticleSet& VP = vp_list[iw];
337     assert(ratios_list[iw].size() == VP.getTotalNum());
338     for (size_t iat = 0; iat < VP.getTotalNum(); ++iat, ++iVP)
339     {
340       ref_id_ptr[iVP]    = iw;
341       const PointType& r = VP.activeR(iat);
342       PointType ru(PrimLattice.toUnit_floor(r));
343       pos_ptr[0] = r[0];
344       pos_ptr[1] = r[1];
345       pos_ptr[2] = r[2];
346       pos_ptr[3] = ru[0];
347       pos_ptr[4] = ru[1];
348       pos_ptr[5] = ru[2];
349       pos_ptr += 6;
350     }
351   }
352 
353   const int ChunkSizePerTeam = 128;
354   const int NumTeams         = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
355   mw_ratios_private.resize(mw_nVP, NumTeams);
356   const auto padded_size = myV.size();
357   mw_offload_scratch.resize(padded_size * mw_nVP);
358   mw_results_scratch.resize(orb_size * mw_nVP);
359 
360   // Ye: need to extract sizes and pointers before entering target region
361   const auto* spline_ptr         = SplineInst->getSplinePtr();
362   auto* offload_scratch_ptr      = mw_offload_scratch.data();
363   auto* results_scratch_ptr      = mw_results_scratch.data();
364   const auto myKcart_padded_size = myKcart->capacity();
365   auto* myKcart_ptr              = myKcart->data();
366   auto* buffer_H2D_ptr           = det_ratios_buffer_H2D.data();
367   auto* ratios_private_ptr       = mw_ratios_private.data();
368   const size_t first_spo_local   = first_spo;
369 
370   {
371     ScopedTimer offload(offload_timer_);
372     PRAGMA_OFFLOAD("omp target teams distribute collapse(2) num_teams(NumTeams*mw_nVP) \
373                 map(always, to: buffer_H2D_ptr[0:det_ratios_buffer_H2D.size()]) \
374                 map(always, from: ratios_private_ptr[0:NumTeams*mw_nVP])")
375     for (int iat = 0; iat < mw_nVP; iat++)
376       for (int team_id = 0; team_id < NumTeams; team_id++)
377       {
378         const int first = ChunkSizePerTeam * team_id;
379         const int last  = (first + ChunkSizePerTeam) > padded_size ? padded_size : first + ChunkSizePerTeam;
380         auto* restrict offload_scratch_iat_ptr = offload_scratch_ptr + padded_size * iat;
381         auto* restrict psi_iat_ptr             = results_scratch_ptr + orb_size * iat;
382         auto* ref_id_ptr = reinterpret_cast<int*>(buffer_H2D_ptr + nw * sizeof(ValueType*) + mw_nVP * 6 * sizeof(ST));
383         auto* restrict psiinv_ptr  = reinterpret_cast<const ValueType**>(buffer_H2D_ptr)[ref_id_ptr[iat]];
384         auto* restrict pos_scratch = reinterpret_cast<ST*>(buffer_H2D_ptr + nw * sizeof(ValueType*));
385 
386         int ix, iy, iz;
387         ST a[4], b[4], c[4];
388         spline2::computeLocationAndFractional(spline_ptr, pos_scratch[iat * 6 + 3], pos_scratch[iat * 6 + 4],
389                                               pos_scratch[iat * 6 + 5], ix, iy, iz, a, b, c);
390 
391         PRAGMA_OFFLOAD("omp parallel for")
392         for (int index = 0; index < last - first; index++)
393           spline2offload::evaluate_v_impl_v2(spline_ptr, ix, iy, iz, a, b, c, offload_scratch_iat_ptr + first, first,
394                                              index);
395         const int last_index = last / 2 < orb_size ? last / 2 : orb_size;
396         PRAGMA_OFFLOAD("omp parallel for")
397         for (int index = first / 2; index < last_index; index++)
398           C2C::assign_v(pos_scratch[iat * 6], pos_scratch[iat * 6 + 1], pos_scratch[iat * 6 + 2], psi_iat_ptr, orb_size,
399                         offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size, first_spo_local, index);
400 
401         const int first_cplx = first / 2;
402         const int last_cplx  = orb_size < last / 2 ? orb_size : last / 2;
403         ComplexT sum(0);
404         PRAGMA_OFFLOAD("omp parallel for simd reduction(+:sum)")
405         for (int i = first_cplx; i < last_cplx; i++)
406           sum += psi_iat_ptr[i] * psiinv_ptr[i];
407         ratios_private_ptr[iat * NumTeams + team_id] = sum;
408       }
409   }
410 
411   // do the reduction manually
412   iVP = 0;
413   for (size_t iw = 0; iw < nw; iw++)
414   {
415     auto& ratios = ratios_list[iw];
416     for (size_t iat = 0; iat < ratios.size(); iat++, iVP++)
417     {
418       ratios[iat] = ComplexT(0);
419       for (int tid = 0; tid < NumTeams; ++tid)
420         ratios[iat] += mw_ratios_private[iVP][tid];
421     }
422   }
423 }
424 
425 /** assign_vgl_from_l can be used when myL is precomputed and myV,myG,myL in cartesian
426    */
427 template<typename ST>
assign_vgl_from_l(const PointType & r,ValueVector_t & psi,GradVector_t & dpsi,ValueVector_t & d2psi)428 inline void SplineC2COMPTarget<ST>::assign_vgl_from_l(const PointType& r,
429                                                       ValueVector_t& psi,
430                                                       GradVector_t& dpsi,
431                                                       ValueVector_t& d2psi)
432 {
433   constexpr ST two(2);
434   const ST x = r[0], y = r[1], z = r[2];
435 
436   const ST* restrict k0 = myKcart->data(0);
437   const ST* restrict k1 = myKcart->data(1);
438   const ST* restrict k2 = myKcart->data(2);
439 
440   const ST* restrict g0 = myG.data(0);
441   const ST* restrict g1 = myG.data(1);
442   const ST* restrict g2 = myG.data(2);
443 
444   const size_t N = last_spo - first_spo;
445 #pragma omp simd
446   for (size_t j = 0; j < N; ++j)
447   {
448     const size_t jr = j << 1;
449     const size_t ji = jr + 1;
450 
451     const ST kX    = k0[j];
452     const ST kY    = k1[j];
453     const ST kZ    = k2[j];
454     const ST val_r = myV[jr];
455     const ST val_i = myV[ji];
456 
457     //phase
458     ST s, c;
459     qmcplusplus::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
460 
461     //dot(PrimLattice.G,myG[j])
462     const ST dX_r = g0[jr];
463     const ST dY_r = g1[jr];
464     const ST dZ_r = g2[jr];
465 
466     const ST dX_i = g0[ji];
467     const ST dY_i = g1[ji];
468     const ST dZ_i = g2[ji];
469 
470     // \f$\nabla \psi_r + {\bf k}\psi_i\f$
471     const ST gX_r = dX_r + val_i * kX;
472     const ST gY_r = dY_r + val_i * kY;
473     const ST gZ_r = dZ_r + val_i * kZ;
474     const ST gX_i = dX_i - val_r * kX;
475     const ST gY_i = dY_i - val_r * kY;
476     const ST gZ_i = dZ_i - val_r * kZ;
477 
478     const ST lap_r = myL[jr] + (*mKK)[j] * val_r + two * (kX * dX_i + kY * dY_i + kZ * dZ_i);
479     const ST lap_i = myL[ji] + (*mKK)[j] * val_i - two * (kX * dX_r + kY * dY_r + kZ * dZ_r);
480 
481     const size_t psiIndex = j + first_spo;
482     psi[psiIndex]         = ComplexT(c * val_r - s * val_i, c * val_i + s * val_r);
483     dpsi[psiIndex][0]     = ComplexT(c * gX_r - s * gX_i, c * gX_i + s * gX_r);
484     dpsi[psiIndex][1]     = ComplexT(c * gY_r - s * gY_i, c * gY_i + s * gY_r);
485     dpsi[psiIndex][2]     = ComplexT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r);
486     d2psi[psiIndex]       = ComplexT(c * lap_r - s * lap_i, c * lap_i + s * lap_r);
487   }
488 }
489 
490 template<typename ST>
evaluateVGL(const ParticleSet & P,const int iat,ValueVector_t & psi,GradVector_t & dpsi,ValueVector_t & d2psi)491 void SplineC2COMPTarget<ST>::evaluateVGL(const ParticleSet& P,
492                                          const int iat,
493                                          ValueVector_t& psi,
494                                          GradVector_t& dpsi,
495                                          ValueVector_t& d2psi)
496 {
497   const PointType& r = P.activeR(iat);
498   PointType ru(PrimLattice.toUnit_floor(r));
499 
500   const int ChunkSizePerTeam = 128;
501   const int NumTeams         = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
502 
503   const auto padded_size = myV.size();
504   // for V(1)G(3)H(6) intermediate result
505   offload_scratch.resize(padded_size * 10);
506   const auto orb_size = psi.size();
507   // for V(1)G(3)L(1) final result
508   results_scratch.resize(orb_size * 5);
509 
510   // Ye: need to extract sizes and pointers before entering target region
511   const auto* spline_ptr    = SplineInst->getSplinePtr();
512   auto* offload_scratch_ptr = offload_scratch.data();
513   auto* results_scratch_ptr = results_scratch.data();
514   const auto x = r[0], y = r[1], z = r[2];
515   const auto rux = ru[0], ruy = ru[1], ruz = ru[2];
516   const auto myKcart_padded_size = myKcart->capacity();
517   auto* mKK_ptr                  = mKK->data();
518   auto* GGt_ptr                  = GGt_offload->data();
519   auto* PrimLattice_G_ptr        = PrimLattice_G_offload->data();
520   auto* myKcart_ptr              = myKcart->data();
521   const size_t first_spo_local   = first_spo;
522 
523   {
524     ScopedTimer offload(offload_timer_);
525     PRAGMA_OFFLOAD("omp target teams distribute num_teams(NumTeams) \
526                 map(always, from: results_scratch_ptr[0:orb_size*5])")
527     for (int team_id = 0; team_id < NumTeams; team_id++)
528     {
529       const int first = ChunkSizePerTeam * team_id;
530       const int last  = (first + ChunkSizePerTeam) > padded_size ? padded_size : first + ChunkSizePerTeam;
531 
532       int ix, iy, iz;
533       ST a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
534       spline2::computeLocationAndFractional(spline_ptr, rux, ruy, ruz, ix, iy, iz, a, b, c, da, db, dc, d2a, d2b, d2c);
535 
536       const ST G[9]      = {PrimLattice_G_ptr[0], PrimLattice_G_ptr[1], PrimLattice_G_ptr[2],
537                        PrimLattice_G_ptr[3], PrimLattice_G_ptr[4], PrimLattice_G_ptr[5],
538                        PrimLattice_G_ptr[6], PrimLattice_G_ptr[7], PrimLattice_G_ptr[8]};
539       const ST symGGt[6] = {GGt_ptr[0], GGt_ptr[1] + GGt_ptr[3], GGt_ptr[2] + GGt_ptr[6],
540                             GGt_ptr[4], GGt_ptr[5] + GGt_ptr[7], GGt_ptr[8]};
541 
542       PRAGMA_OFFLOAD("omp parallel for")
543       for (int index = 0; index < last - first; index++)
544         spline2offload::evaluate_vgh_impl_v2(spline_ptr, ix, iy, iz, a, b, c, da, db, dc, d2a, d2b, d2c,
545                                              offload_scratch_ptr + first, offload_scratch_ptr + padded_size + first,
546                                              offload_scratch_ptr + padded_size * 4 + first, padded_size, first, index);
547       const int last_index = last / 2 < orb_size ? last / 2 : orb_size;
548       PRAGMA_OFFLOAD("omp parallel for")
549       for (int index = first / 2; index < last_index; index++)
550         C2C::assign_vgl(x, y, z, results_scratch_ptr, mKK_ptr, orb_size, offload_scratch_ptr, padded_size, symGGt, G,
551                         myKcart_ptr, myKcart_padded_size, first_spo_local, index);
552     }
553   }
554 
555   for (size_t i = 0; i < orb_size; i++)
556   {
557     psi[i]     = results_scratch[i];
558     dpsi[i][0] = results_scratch[orb_size + i * 3];
559     dpsi[i][1] = results_scratch[orb_size + i * 3 + 1];
560     dpsi[i][2] = results_scratch[orb_size + i * 3 + 2];
561     d2psi[i]   = results_scratch[orb_size * 4 + i];
562   }
563 }
564 
565 template<typename ST>
evaluateVGLMultiPos(const Vector<ST,OffloadPinnedAllocator<ST>> & multi_pos,Vector<ST,OffloadPinnedAllocator<ST>> & offload_scratch,Vector<ComplexT,OffloadPinnedAllocator<ComplexT>> & results_scratch,const RefVector<ValueVector_t> & psi_v_list,const RefVector<GradVector_t> & dpsi_v_list,const RefVector<ValueVector_t> & d2psi_v_list) const566 void SplineC2COMPTarget<ST>::evaluateVGLMultiPos(const Vector<ST, OffloadPinnedAllocator<ST>>& multi_pos,
567                                                  Vector<ST, OffloadPinnedAllocator<ST>>& offload_scratch,
568                                                  Vector<ComplexT, OffloadPinnedAllocator<ComplexT>>& results_scratch,
569                                                  const RefVector<ValueVector_t>& psi_v_list,
570                                                  const RefVector<GradVector_t>& dpsi_v_list,
571                                                  const RefVector<ValueVector_t>& d2psi_v_list) const
572 {
573   const size_t num_pos       = psi_v_list.size();
574   const int ChunkSizePerTeam = 128;
575   const int NumTeams         = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
576   const auto padded_size     = myV.size();
577   // for V(1)G(3)H(6) intermediate result
578   offload_scratch.resize(padded_size * num_pos * 10);
579   const auto orb_size = psi_v_list[0].get().size();
580   // for V(1)G(3)L(1) final result
581   results_scratch.resize(orb_size * num_pos * 5);
582 
583   // Ye: need to extract sizes and pointers before entering target region
584   const auto* spline_ptr         = SplineInst->getSplinePtr();
585   auto* pos_copy_ptr             = multi_pos.data();
586   auto* offload_scratch_ptr      = offload_scratch.data();
587   auto* results_scratch_ptr      = results_scratch.data();
588   const auto myKcart_padded_size = myKcart->capacity();
589   auto* mKK_ptr                  = mKK->data();
590   auto* GGt_ptr                  = GGt_offload->data();
591   auto* PrimLattice_G_ptr        = PrimLattice_G_offload->data();
592   auto* myKcart_ptr              = myKcart->data();
593   const size_t first_spo_local   = first_spo;
594 
595   {
596     ScopedTimer offload(offload_timer_);
597     PRAGMA_OFFLOAD("omp target teams distribute collapse(2) num_teams(NumTeams*num_pos) \
598                     map(always, to: pos_copy_ptr[0:num_pos*6]) \
599                     map(always, from: results_scratch_ptr[0:orb_size*num_pos*5])")
600     for (int iw = 0; iw < num_pos; iw++)
601       for (int team_id = 0; team_id < NumTeams; team_id++)
602       {
603         const int first = ChunkSizePerTeam * team_id;
604         const int last  = (first + ChunkSizePerTeam) > padded_size ? padded_size : first + ChunkSizePerTeam;
605         auto* restrict offload_scratch_iw_ptr = offload_scratch_ptr + padded_size * iw * 10;
606         auto* restrict psi_iw_ptr             = results_scratch_ptr + orb_size * iw * 5;
607 
608         int ix, iy, iz;
609         ST a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
610         spline2::computeLocationAndFractional(spline_ptr, pos_copy_ptr[iw * 6 + 3], pos_copy_ptr[iw * 6 + 4],
611                                               pos_copy_ptr[iw * 6 + 5], ix, iy, iz, a, b, c, da, db, dc, d2a, d2b, d2c);
612 
613         const ST G[9]      = {PrimLattice_G_ptr[0], PrimLattice_G_ptr[1], PrimLattice_G_ptr[2],
614                          PrimLattice_G_ptr[3], PrimLattice_G_ptr[4], PrimLattice_G_ptr[5],
615                          PrimLattice_G_ptr[6], PrimLattice_G_ptr[7], PrimLattice_G_ptr[8]};
616         const ST symGGt[6] = {GGt_ptr[0], GGt_ptr[1] + GGt_ptr[3], GGt_ptr[2] + GGt_ptr[6],
617                               GGt_ptr[4], GGt_ptr[5] + GGt_ptr[7], GGt_ptr[8]};
618 
619         PRAGMA_OFFLOAD("omp parallel for")
620         for (int index = 0; index < last - first; index++)
621           spline2offload::evaluate_vgh_impl_v2(spline_ptr, ix, iy, iz, a, b, c, da, db, dc, d2a, d2b, d2c,
622                                                offload_scratch_iw_ptr + first,
623                                                offload_scratch_iw_ptr + padded_size + first,
624                                                offload_scratch_iw_ptr + padded_size * 4 + first, padded_size, first,
625                                                index);
626         const int last_index = last / 2 < orb_size ? last / 2 : orb_size;
627         PRAGMA_OFFLOAD("omp parallel for")
628         for (int index = first / 2; index < last_index; index++)
629           C2C::assign_vgl(pos_copy_ptr[iw * 6], pos_copy_ptr[iw * 6 + 1], pos_copy_ptr[iw * 6 + 2], psi_iw_ptr, mKK_ptr,
630                           orb_size, offload_scratch_iw_ptr, padded_size, symGGt, G, myKcart_ptr, myKcart_padded_size,
631                           first_spo_local, index);
632       }
633   }
634 
635   for (int iw = 0; iw < num_pos; ++iw)
636   {
637     auto* restrict results_iw_ptr = results_scratch_ptr + orb_size * iw * 5;
638     ValueVector_t& psi_v(psi_v_list[iw]);
639     GradVector_t& dpsi_v(dpsi_v_list[iw]);
640     ValueVector_t& d2psi_v(d2psi_v_list[iw]);
641     for (size_t i = 0; i < orb_size; i++)
642     {
643       psi_v[i]     = results_iw_ptr[i];
644       dpsi_v[i][0] = results_iw_ptr[orb_size + i * 3];
645       dpsi_v[i][1] = results_iw_ptr[orb_size + i * 3 + 1];
646       dpsi_v[i][2] = results_iw_ptr[orb_size + i * 3 + 2];
647       d2psi_v[i]   = results_iw_ptr[orb_size * 4 + i];
648     }
649   }
650 }
651 
652 template<typename ST>
mw_evaluateVGL(const RefVectorWithLeader<SPOSet> & sa_list,const RefVectorWithLeader<ParticleSet> & P_list,int iat,const RefVector<ValueVector_t> & psi_v_list,const RefVector<GradVector_t> & dpsi_v_list,const RefVector<ValueVector_t> & d2psi_v_list) const653 void SplineC2COMPTarget<ST>::mw_evaluateVGL(const RefVectorWithLeader<SPOSet>& sa_list,
654                                             const RefVectorWithLeader<ParticleSet>& P_list,
655                                             int iat,
656                                             const RefVector<ValueVector_t>& psi_v_list,
657                                             const RefVector<GradVector_t>& dpsi_v_list,
658                                             const RefVector<ValueVector_t>& d2psi_v_list) const
659 {
660   assert(this == &sa_list.getLeader());
661   auto& phi_leader = sa_list.getCastedLeader<SplineC2COMPTarget<ST>>();
662   // make this class unit tests friendly without the need of setup resources.
663   if (!phi_leader.mw_mem_)
664   {
665     app_warning() << "SplineC2COMPTarget : This message should not be seen in production (performance bug) runs but "
666                      "only unit tests (expected)."
667                   << std::endl;
668     phi_leader.mw_mem_ = std::make_unique<SplineOMPTargetMultiWalkerMem<ST, ComplexT>>();
669   }
670   auto& mw_mem             = *phi_leader.mw_mem_;
671   auto& mw_pos_copy        = mw_mem.mw_pos_copy;
672   auto& mw_offload_scratch = mw_mem.mw_offload_scratch;
673   auto& mw_results_scratch = mw_mem.mw_results_scratch;
674   const int nwalkers       = sa_list.size();
675   mw_pos_copy.resize(nwalkers * 6);
676 
677   // pack particle positions
678   for (int iw = 0; iw < nwalkers; ++iw)
679   {
680     const PointType& r = P_list[iw].activeR(iat);
681     PointType ru(PrimLattice.toUnit_floor(r));
682     mw_pos_copy[iw * 6]     = r[0];
683     mw_pos_copy[iw * 6 + 1] = r[1];
684     mw_pos_copy[iw * 6 + 2] = r[2];
685     mw_pos_copy[iw * 6 + 3] = ru[0];
686     mw_pos_copy[iw * 6 + 4] = ru[1];
687     mw_pos_copy[iw * 6 + 5] = ru[2];
688   }
689 
690   phi_leader.evaluateVGLMultiPos(mw_pos_copy, mw_offload_scratch, mw_results_scratch, psi_v_list, dpsi_v_list,
691                                  d2psi_v_list);
692 }
693 
694 template<typename ST>
mw_evaluateVGLandDetRatioGrads(const RefVectorWithLeader<SPOSet> & spo_list,const RefVectorWithLeader<ParticleSet> & P_list,int iat,const std::vector<const ValueType * > & invRow_ptr_list,VGLVector_t & phi_vgl_v,std::vector<ValueType> & ratios,std::vector<GradType> & grads) const695 void SplineC2COMPTarget<ST>::mw_evaluateVGLandDetRatioGrads(const RefVectorWithLeader<SPOSet>& spo_list,
696                                                             const RefVectorWithLeader<ParticleSet>& P_list,
697                                                             int iat,
698                                                             const std::vector<const ValueType*>& invRow_ptr_list,
699                                                             VGLVector_t& phi_vgl_v,
700                                                             std::vector<ValueType>& ratios,
701                                                             std::vector<GradType>& grads) const
702 {
703   assert(this == &spo_list.getLeader());
704   auto& phi_leader         = spo_list.getCastedLeader<SplineC2COMPTarget<ST>>();
705   auto& mw_mem             = *phi_leader.mw_mem_;
706   auto& buffer_H2D         = mw_mem.buffer_H2D;
707   auto& rg_private         = mw_mem.rg_private;
708   auto& mw_offload_scratch = mw_mem.mw_offload_scratch;
709   auto& mw_results_scratch = mw_mem.mw_results_scratch;
710   const int nwalkers       = spo_list.size();
711   buffer_H2D.resize(nwalkers, sizeof(ST) * 6 + sizeof(ValueType*));
712 
713   // pack particle positions and invRow pointers.
714   for (int iw = 0; iw < nwalkers; ++iw)
715   {
716     const PointType& r = P_list[iw].activeR(iat);
717     PointType ru(PrimLattice.toUnit_floor(r));
718     Vector<ST> pos_copy(reinterpret_cast<ST*>(buffer_H2D[iw]), 6);
719 
720     pos_copy[0] = r[0];
721     pos_copy[1] = r[1];
722     pos_copy[2] = r[2];
723     pos_copy[3] = ru[0];
724     pos_copy[4] = ru[1];
725     pos_copy[5] = ru[2];
726 
727     auto& invRow_ptr = *reinterpret_cast<const ValueType**>(buffer_H2D[iw] + sizeof(ST) * 6);
728     invRow_ptr       = invRow_ptr_list[iw];
729   }
730 
731   const size_t num_pos       = nwalkers;
732   const int ChunkSizePerTeam = 128;
733   const int NumTeams         = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
734   const auto padded_size     = myV.size();
735   // for V(1)G(3)H(6) intermediate result
736   mw_offload_scratch.resize(padded_size * num_pos * 10);
737   const auto orb_size = phi_vgl_v.size() / num_pos;
738   // for V(1)G(3)L(1) final result
739   mw_results_scratch.resize(orb_size * num_pos * 5);
740   // per team ratio and grads
741   rg_private.resize(num_pos, NumTeams * 4);
742 
743   // Ye: need to extract sizes and pointers before entering target region
744   const auto* spline_ptr         = SplineInst->getSplinePtr();
745   auto* buffer_H2D_ptr           = buffer_H2D.data();
746   auto* offload_scratch_ptr      = mw_offload_scratch.data();
747   auto* results_scratch_ptr      = mw_results_scratch.data();
748   const auto myKcart_padded_size = myKcart->capacity();
749   auto* mKK_ptr                  = mKK->data();
750   auto* GGt_ptr                  = GGt_offload->data();
751   auto* PrimLattice_G_ptr        = PrimLattice_G_offload->data();
752   auto* myKcart_ptr              = myKcart->data();
753   auto* phi_vgl_ptr              = phi_vgl_v.data();
754   auto* rg_private_ptr           = rg_private.data();
755   const size_t buffer_H2D_stride = buffer_H2D.cols();
756   const size_t first_spo_local   = first_spo;
757   const size_t phi_vgl_stride    = phi_vgl_v.capacity();
758 
759   {
760     ScopedTimer offload(offload_timer_);
761     PRAGMA_OFFLOAD("omp target teams distribute collapse(2) num_teams(NumTeams*num_pos) \
762                     map(always, to: buffer_H2D_ptr[:buffer_H2D.size()]) \
763                     map(always, from: rg_private_ptr[0:rg_private.size()])")
764     for (int iw = 0; iw < num_pos; iw++)
765       for (int team_id = 0; team_id < NumTeams; team_id++)
766       {
767         const int first = ChunkSizePerTeam * team_id;
768         const int last  = (first + ChunkSizePerTeam) > padded_size ? padded_size : first + ChunkSizePerTeam;
769         auto* restrict offload_scratch_iw_ptr = offload_scratch_ptr + padded_size * iw * 10;
770         auto* restrict psi_iw_ptr             = results_scratch_ptr + orb_size * iw * 5;
771         const auto* restrict pos_iw_ptr       = reinterpret_cast<ST*>(buffer_H2D_ptr + buffer_H2D_stride * iw);
772         const auto* restrict invRow_iw_ptr =
773             *reinterpret_cast<ValueType**>(buffer_H2D_ptr + buffer_H2D_stride * iw + sizeof(ST) * 6);
774 
775         int ix, iy, iz;
776         ST a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
777         spline2::computeLocationAndFractional(spline_ptr, pos_iw_ptr[3], pos_iw_ptr[4], pos_iw_ptr[5], ix, iy, iz, a, b,
778                                               c, da, db, dc, d2a, d2b, d2c);
779 
780         const ST G[9]      = {PrimLattice_G_ptr[0], PrimLattice_G_ptr[1], PrimLattice_G_ptr[2],
781                          PrimLattice_G_ptr[3], PrimLattice_G_ptr[4], PrimLattice_G_ptr[5],
782                          PrimLattice_G_ptr[6], PrimLattice_G_ptr[7], PrimLattice_G_ptr[8]};
783         const ST symGGt[6] = {GGt_ptr[0], GGt_ptr[1] + GGt_ptr[3], GGt_ptr[2] + GGt_ptr[6],
784                               GGt_ptr[4], GGt_ptr[5] + GGt_ptr[7], GGt_ptr[8]};
785 
786         PRAGMA_OFFLOAD("omp parallel for")
787         for (int index = 0; index < last - first; index++)
788           spline2offload::evaluate_vgh_impl_v2(spline_ptr, ix, iy, iz, a, b, c, da, db, dc, d2a, d2b, d2c,
789                                                offload_scratch_iw_ptr + first,
790                                                offload_scratch_iw_ptr + padded_size + first,
791                                                offload_scratch_iw_ptr + padded_size * 4 + first, padded_size, first,
792                                                index);
793         const int last_index = last / 2 < orb_size ? last / 2 : orb_size;
794         PRAGMA_OFFLOAD("omp parallel for")
795         for (int index = first / 2; index < last_index; index++)
796           C2C::assign_vgl(pos_iw_ptr[0], pos_iw_ptr[1], pos_iw_ptr[2], psi_iw_ptr, mKK_ptr, orb_size,
797                           offload_scratch_iw_ptr, padded_size, symGGt, G, myKcart_ptr, myKcart_padded_size,
798                           first_spo_local, index);
799 
800         ValueType* restrict psi   = psi_iw_ptr;
801         ValueType* restrict dpsi  = psi_iw_ptr + orb_size;
802         ValueType* restrict d2psi = psi_iw_ptr + orb_size * 4;
803 
804         ValueType* restrict out_phi_v = phi_vgl_ptr + iw * orb_size;
805         ValueType* restrict out_phi_g = phi_vgl_ptr + phi_vgl_stride + iw * orb_size * 3;
806         ValueType* restrict out_phi_l = phi_vgl_ptr + phi_vgl_stride * 4 + iw * orb_size;
807 
808         ValueType ratio(0), grad_x(0), grad_y(0), grad_z(0);
809         PRAGMA_OFFLOAD("omp parallel for reduction(+: ratio, grad_x, grad_y, grad_z)")
810         for (size_t j = first / 2; j < (last / 2 > orb_size ? orb_size : last / 2); j++)
811         {
812           const size_t psiIndex = first_spo_local + j;
813 
814           out_phi_v[psiIndex]         = psi[psiIndex];
815           out_phi_l[psiIndex]         = d2psi[psiIndex];
816           out_phi_g[psiIndex * 3]     = dpsi[psiIndex * 3];
817           out_phi_g[psiIndex * 3 + 1] = dpsi[psiIndex * 3 + 1];
818           out_phi_g[psiIndex * 3 + 2] = dpsi[psiIndex * 3 + 2];
819 
820           ratio += psi[psiIndex] * invRow_iw_ptr[psiIndex];
821           grad_x += dpsi[psiIndex * 3] * invRow_iw_ptr[psiIndex];
822           grad_y += dpsi[psiIndex * 3 + 1] * invRow_iw_ptr[psiIndex];
823           grad_z += dpsi[psiIndex * 3 + 2] * invRow_iw_ptr[psiIndex];
824         }
825 
826         rg_private_ptr[(iw * NumTeams + team_id) * 4]     = ratio;
827         rg_private_ptr[(iw * NumTeams + team_id) * 4 + 1] = grad_x;
828         rg_private_ptr[(iw * NumTeams + team_id) * 4 + 2] = grad_y;
829         rg_private_ptr[(iw * NumTeams + team_id) * 4 + 3] = grad_z;
830       }
831   }
832 
833   for (int iw = 0; iw < num_pos; iw++)
834   {
835     ValueType ratio(0);
836     for (int team_id = 0; team_id < NumTeams; team_id++)
837       ratio += rg_private[iw][team_id * 4];
838     ratios[iw] = ratio;
839 
840     ValueType grad_x(0), grad_y(0), grad_z(0);
841     for (int team_id = 0; team_id < NumTeams; team_id++)
842     {
843       grad_x += rg_private[iw][team_id * 4 + 1];
844       grad_y += rg_private[iw][team_id * 4 + 2];
845       grad_z += rg_private[iw][team_id * 4 + 3];
846     }
847     grads[iw] = GradType{grad_x / ratio, grad_y / ratio, grad_z / ratio};
848   }
849 }
850 template<typename ST>
assign_vgh(const PointType & r,ValueVector_t & psi,GradVector_t & dpsi,HessVector_t & grad_grad_psi,int first,int last) const851 void SplineC2COMPTarget<ST>::assign_vgh(const PointType& r,
852                                         ValueVector_t& psi,
853                                         GradVector_t& dpsi,
854                                         HessVector_t& grad_grad_psi,
855                                         int first,
856                                         int last) const
857 {
858   // protect last
859   last = last > kPoints.size() ? kPoints.size() : last;
860 
861   const ST g00 = PrimLattice.G(0), g01 = PrimLattice.G(1), g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
862            g11 = PrimLattice.G(4), g12 = PrimLattice.G(5), g20 = PrimLattice.G(6), g21 = PrimLattice.G(7),
863            g22 = PrimLattice.G(8);
864   const ST x = r[0], y = r[1], z = r[2];
865 
866   const ST* restrict k0 = myKcart->data(0);
867   const ST* restrict k1 = myKcart->data(1);
868   const ST* restrict k2 = myKcart->data(2);
869 
870   const ST* restrict g0  = myG.data(0);
871   const ST* restrict g1  = myG.data(1);
872   const ST* restrict g2  = myG.data(2);
873   const ST* restrict h00 = myH.data(0);
874   const ST* restrict h01 = myH.data(1);
875   const ST* restrict h02 = myH.data(2);
876   const ST* restrict h11 = myH.data(3);
877   const ST* restrict h12 = myH.data(4);
878   const ST* restrict h22 = myH.data(5);
879 
880 #pragma omp simd
881   for (size_t j = first; j < last; ++j)
882   {
883     int jr = j << 1;
884     int ji = jr + 1;
885 
886     const ST kX    = k0[j];
887     const ST kY    = k1[j];
888     const ST kZ    = k2[j];
889     const ST val_r = myV[jr];
890     const ST val_i = myV[ji];
891 
892     //phase
893     ST s, c;
894     qmcplusplus::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
895 
896     //dot(PrimLattice.G,myG[j])
897     const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
898     const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
899     const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
900 
901     const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
902     const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
903     const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
904 
905     // \f$\nabla \psi_r + {\bf k}\psi_i\f$
906     const ST gX_r = dX_r + val_i * kX;
907     const ST gY_r = dY_r + val_i * kY;
908     const ST gZ_r = dZ_r + val_i * kZ;
909     const ST gX_i = dX_i - val_r * kX;
910     const ST gY_i = dY_i - val_r * kY;
911     const ST gZ_i = dZ_i - val_r * kZ;
912 
913     const size_t psiIndex = j + first_spo;
914     psi[psiIndex]         = ComplexT(c * val_r - s * val_i, c * val_i + s * val_r);
915     dpsi[psiIndex][0]     = ComplexT(c * gX_r - s * gX_i, c * gX_i + s * gX_r);
916     dpsi[psiIndex][1]     = ComplexT(c * gY_r - s * gY_i, c * gY_i + s * gY_r);
917     dpsi[psiIndex][2]     = ComplexT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r);
918 
919     const ST h_xx_r =
920         v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g00, g01, g02) + kX * (gX_i + dX_i);
921     const ST h_xy_r =
922         v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g10, g11, g12) + kX * (gY_i + dY_i);
923     const ST h_xz_r =
924         v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g20, g21, g22) + kX * (gZ_i + dZ_i);
925     const ST h_yx_r =
926         v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g00, g01, g02) + kY * (gX_i + dX_i);
927     const ST h_yy_r =
928         v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g10, g11, g12) + kY * (gY_i + dY_i);
929     const ST h_yz_r =
930         v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g20, g21, g22) + kY * (gZ_i + dZ_i);
931     const ST h_zx_r =
932         v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g00, g01, g02) + kZ * (gX_i + dX_i);
933     const ST h_zy_r =
934         v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g10, g11, g12) + kZ * (gY_i + dY_i);
935     const ST h_zz_r =
936         v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g20, g21, g22) + kZ * (gZ_i + dZ_i);
937 
938     const ST h_xx_i =
939         v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g00, g01, g02) - kX * (gX_r + dX_r);
940     const ST h_xy_i =
941         v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g10, g11, g12) - kX * (gY_r + dY_r);
942     const ST h_xz_i =
943         v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g20, g21, g22) - kX * (gZ_r + dZ_r);
944     const ST h_yx_i =
945         v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g00, g01, g02) - kY * (gX_r + dX_r);
946     const ST h_yy_i =
947         v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g10, g11, g12) - kY * (gY_r + dY_r);
948     const ST h_yz_i =
949         v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g20, g21, g22) - kY * (gZ_r + dZ_r);
950     const ST h_zx_i =
951         v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g00, g01, g02) - kZ * (gX_r + dX_r);
952     const ST h_zy_i =
953         v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g10, g11, g12) - kZ * (gY_r + dY_r);
954     const ST h_zz_i =
955         v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g20, g21, g22) - kZ * (gZ_r + dZ_r);
956 
957     grad_grad_psi[psiIndex][0] = ComplexT(c * h_xx_r - s * h_xx_i, c * h_xx_i + s * h_xx_r);
958     grad_grad_psi[psiIndex][1] = ComplexT(c * h_xy_r - s * h_xy_i, c * h_xy_i + s * h_xy_r);
959     grad_grad_psi[psiIndex][2] = ComplexT(c * h_xz_r - s * h_xz_i, c * h_xz_i + s * h_xz_r);
960     grad_grad_psi[psiIndex][3] = ComplexT(c * h_yx_r - s * h_yx_i, c * h_yx_i + s * h_yx_r);
961     grad_grad_psi[psiIndex][4] = ComplexT(c * h_yy_r - s * h_yy_i, c * h_yy_i + s * h_yy_r);
962     grad_grad_psi[psiIndex][5] = ComplexT(c * h_yz_r - s * h_yz_i, c * h_yz_i + s * h_yz_r);
963     grad_grad_psi[psiIndex][6] = ComplexT(c * h_zx_r - s * h_zx_i, c * h_zx_i + s * h_zx_r);
964     grad_grad_psi[psiIndex][7] = ComplexT(c * h_zy_r - s * h_zy_i, c * h_zy_i + s * h_zy_r);
965     grad_grad_psi[psiIndex][8] = ComplexT(c * h_zz_r - s * h_zz_i, c * h_zz_i + s * h_zz_r);
966   }
967 }
968 
969 template<typename ST>
evaluateVGH(const ParticleSet & P,const int iat,ValueVector_t & psi,GradVector_t & dpsi,HessVector_t & grad_grad_psi)970 void SplineC2COMPTarget<ST>::evaluateVGH(const ParticleSet& P,
971                                          const int iat,
972                                          ValueVector_t& psi,
973                                          GradVector_t& dpsi,
974                                          HessVector_t& grad_grad_psi)
975 {
976   const PointType& r = P.activeR(iat);
977   PointType ru(PrimLattice.toUnit_floor(r));
978 
979 #pragma omp parallel
980   {
981     int first, last;
982     FairDivideAligned(myV.size(), getAlignment<ST>(), omp_get_num_threads(), omp_get_thread_num(), first, last);
983 
984     spline2::evaluate3d_vgh(SplineInst->getSplinePtr(), ru, myV, myG, myH, first, last);
985     assign_vgh(r, psi, dpsi, grad_grad_psi, first / 2, last / 2);
986   }
987 }
988 
989 template<typename ST>
assign_vghgh(const PointType & r,ValueVector_t & psi,GradVector_t & dpsi,HessVector_t & grad_grad_psi,GGGVector_t & grad_grad_grad_psi,int first,int last) const990 void SplineC2COMPTarget<ST>::assign_vghgh(const PointType& r,
991                                           ValueVector_t& psi,
992                                           GradVector_t& dpsi,
993                                           HessVector_t& grad_grad_psi,
994                                           GGGVector_t& grad_grad_grad_psi,
995                                           int first,
996                                           int last) const
997 {
998   // protect last
999   last = last < 0 ? kPoints.size() : (last > kPoints.size() ? kPoints.size() : last);
1000 
1001   const ST g00 = PrimLattice.G(0), g01 = PrimLattice.G(1), g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
1002            g11 = PrimLattice.G(4), g12 = PrimLattice.G(5), g20 = PrimLattice.G(6), g21 = PrimLattice.G(7),
1003            g22 = PrimLattice.G(8);
1004   const ST x = r[0], y = r[1], z = r[2];
1005 
1006   const ST* restrict k0 = myKcart->data(0);
1007   const ST* restrict k1 = myKcart->data(1);
1008   const ST* restrict k2 = myKcart->data(2);
1009 
1010   const ST* restrict g0  = myG.data(0);
1011   const ST* restrict g1  = myG.data(1);
1012   const ST* restrict g2  = myG.data(2);
1013   const ST* restrict h00 = myH.data(0);
1014   const ST* restrict h01 = myH.data(1);
1015   const ST* restrict h02 = myH.data(2);
1016   const ST* restrict h11 = myH.data(3);
1017   const ST* restrict h12 = myH.data(4);
1018   const ST* restrict h22 = myH.data(5);
1019 
1020   const ST* restrict gh000 = mygH.data(0);
1021   const ST* restrict gh001 = mygH.data(1);
1022   const ST* restrict gh002 = mygH.data(2);
1023   const ST* restrict gh011 = mygH.data(3);
1024   const ST* restrict gh012 = mygH.data(4);
1025   const ST* restrict gh022 = mygH.data(5);
1026   const ST* restrict gh111 = mygH.data(6);
1027   const ST* restrict gh112 = mygH.data(7);
1028   const ST* restrict gh122 = mygH.data(8);
1029   const ST* restrict gh222 = mygH.data(9);
1030 
1031 //SIMD doesn't work quite right yet.  Comment out until further debugging.
1032 #pragma omp simd
1033   for (size_t j = first; j < last; ++j)
1034   {
1035     int jr = j << 1;
1036     int ji = jr + 1;
1037 
1038     const ST kX    = k0[j];
1039     const ST kY    = k1[j];
1040     const ST kZ    = k2[j];
1041     const ST val_r = myV[jr];
1042     const ST val_i = myV[ji];
1043 
1044     //phase
1045     ST s, c;
1046     qmcplusplus::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
1047 
1048     //dot(PrimLattice.G,myG[j])
1049     const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
1050     const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
1051     const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
1052 
1053     const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
1054     const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
1055     const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
1056 
1057     // \f$\nabla \psi_r + {\bf k}\psi_i\f$
1058     const ST gX_r = dX_r + val_i * kX;
1059     const ST gY_r = dY_r + val_i * kY;
1060     const ST gZ_r = dZ_r + val_i * kZ;
1061     const ST gX_i = dX_i - val_r * kX;
1062     const ST gY_i = dY_i - val_r * kY;
1063     const ST gZ_i = dZ_i - val_r * kZ;
1064 
1065     const size_t psiIndex = j + first_spo;
1066     psi[psiIndex]         = ComplexT(c * val_r - s * val_i, c * val_i + s * val_r);
1067     dpsi[psiIndex][0]     = ComplexT(c * gX_r - s * gX_i, c * gX_i + s * gX_r);
1068     dpsi[psiIndex][1]     = ComplexT(c * gY_r - s * gY_i, c * gY_i + s * gY_r);
1069     dpsi[psiIndex][2]     = ComplexT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r);
1070 
1071     //intermediates for computation of hessian. \partial_i \partial_j phi in cartesian coordinates.
1072     const ST f_xx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g00, g01, g02);
1073     const ST f_xy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g10, g11, g12);
1074     const ST f_xz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g20, g21, g22);
1075     const ST f_yy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g10, g11, g12);
1076     const ST f_yz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g20, g21, g22);
1077     const ST f_zz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g20, g21, g22);
1078 
1079     const ST f_xx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g00, g01, g02);
1080     const ST f_xy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g10, g11, g12);
1081     const ST f_xz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g20, g21, g22);
1082     const ST f_yy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g10, g11, g12);
1083     const ST f_yz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g20, g21, g22);
1084     const ST f_zz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g20, g21, g22);
1085 
1086     const ST h_xx_r = f_xx_r + 2 * kX * dX_i - kX * kX * val_r;
1087     const ST h_xy_r = f_xy_r + (kX * dY_i + kY * dX_i) - kX * kY * val_r;
1088     const ST h_xz_r = f_xz_r + (kX * dZ_i + kZ * dX_i) - kX * kZ * val_r;
1089     const ST h_yy_r = f_yy_r + 2 * kY * dY_i - kY * kY * val_r;
1090     const ST h_yz_r = f_yz_r + (kY * dZ_i + kZ * dY_i) - kY * kZ * val_r;
1091     const ST h_zz_r = f_zz_r + 2 * kZ * dZ_i - kZ * kZ * val_r;
1092 
1093     const ST h_xx_i = f_xx_i - 2 * kX * dX_r - kX * kX * val_i;
1094     const ST h_xy_i = f_xy_i - (kX * dY_r + kY * dX_r) - kX * kY * val_i;
1095     const ST h_xz_i = f_xz_i - (kX * dZ_r + kZ * dX_r) - kX * kZ * val_i;
1096     const ST h_yy_i = f_yy_i - 2 * kY * dY_r - kY * kY * val_i;
1097     const ST h_yz_i = f_yz_i - (kZ * dY_r + kY * dZ_r) - kZ * kY * val_i;
1098     const ST h_zz_i = f_zz_i - 2 * kZ * dZ_r - kZ * kZ * val_i;
1099 
1100     grad_grad_psi[psiIndex][0] = ComplexT(c * h_xx_r - s * h_xx_i, c * h_xx_i + s * h_xx_r);
1101     grad_grad_psi[psiIndex][1] = ComplexT(c * h_xy_r - s * h_xy_i, c * h_xy_i + s * h_xy_r);
1102     grad_grad_psi[psiIndex][2] = ComplexT(c * h_xz_r - s * h_xz_i, c * h_xz_i + s * h_xz_r);
1103     grad_grad_psi[psiIndex][3] = ComplexT(c * h_xy_r - s * h_xy_i, c * h_xy_i + s * h_xy_r);
1104     grad_grad_psi[psiIndex][4] = ComplexT(c * h_yy_r - s * h_yy_i, c * h_yy_i + s * h_yy_r);
1105     grad_grad_psi[psiIndex][5] = ComplexT(c * h_yz_r - s * h_yz_i, c * h_yz_i + s * h_yz_r);
1106     grad_grad_psi[psiIndex][6] = ComplexT(c * h_xz_r - s * h_xz_i, c * h_xz_i + s * h_xz_r);
1107     grad_grad_psi[psiIndex][7] = ComplexT(c * h_yz_r - s * h_yz_i, c * h_yz_i + s * h_yz_r);
1108     grad_grad_psi[psiIndex][8] = ComplexT(c * h_zz_r - s * h_zz_i, c * h_zz_i + s * h_zz_r);
1109 
1110     //These are the real and imaginary components of the third SPO derivative.  _xxx denotes
1111     // third derivative w.r.t. x, _xyz, a derivative with resepect to x,y, and z, and so on.
1112 
1113     const ST f3_xxx_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1114                                     gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g00, g01, g02);
1115     const ST f3_xxy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1116                                     gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g10, g11, g12);
1117     const ST f3_xxz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1118                                     gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g20, g21, g22);
1119     const ST f3_xyy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1120                                     gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g10, g11, g12, g10, g11, g12);
1121     const ST f3_xyz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1122                                     gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g10, g11, g12, g20, g21, g22);
1123     const ST f3_xzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1124                                     gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g20, g21, g22, g20, g21, g22);
1125     const ST f3_yyy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1126                                     gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g10, g11, g12, g10, g11, g12);
1127     const ST f3_yyz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1128                                     gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g10, g11, g12, g20, g21, g22);
1129     const ST f3_yzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1130                                     gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g20, g21, g22, g20, g21, g22);
1131     const ST f3_zzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
1132                                     gh112[jr], gh122[jr], gh222[jr], g20, g21, g22, g20, g21, g22, g20, g21, g22);
1133 
1134     const ST f3_xxx_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1135                                     gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g00, g01, g02);
1136     const ST f3_xxy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1137                                     gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g10, g11, g12);
1138     const ST f3_xxz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1139                                     gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g20, g21, g22);
1140     const ST f3_xyy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1141                                     gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g10, g11, g12, g10, g11, g12);
1142     const ST f3_xyz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1143                                     gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g10, g11, g12, g20, g21, g22);
1144     const ST f3_xzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1145                                     gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g20, g21, g22, g20, g21, g22);
1146     const ST f3_yyy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1147                                     gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g10, g11, g12, g10, g11, g12);
1148     const ST f3_yyz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1149                                     gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g10, g11, g12, g20, g21, g22);
1150     const ST f3_yzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1151                                     gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g20, g21, g22, g20, g21, g22);
1152     const ST f3_zzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
1153                                     gh112[ji], gh122[ji], gh222[ji], g20, g21, g22, g20, g21, g22, g20, g21, g22);
1154 
1155     //Here is where we build up the components of the physical hessian gradient, namely, d^3/dx^3(e^{-ik*r}\phi(r)
1156     const ST gh_xxx_r = f3_xxx_r + 3 * kX * f_xx_i - 3 * kX * kX * dX_r - kX * kX * kX * val_i;
1157     const ST gh_xxx_i = f3_xxx_i - 3 * kX * f_xx_r - 3 * kX * kX * dX_i + kX * kX * kX * val_r;
1158     const ST gh_xxy_r =
1159         f3_xxy_r + (kY * f_xx_i + 2 * kX * f_xy_i) - (kX * kX * dY_r + 2 * kX * kY * dX_r) - kX * kX * kY * val_i;
1160     const ST gh_xxy_i =
1161         f3_xxy_i - (kY * f_xx_r + 2 * kX * f_xy_r) - (kX * kX * dY_i + 2 * kX * kY * dX_i) + kX * kX * kY * val_r;
1162     const ST gh_xxz_r =
1163         f3_xxz_r + (kZ * f_xx_i + 2 * kX * f_xz_i) - (kX * kX * dZ_r + 2 * kX * kZ * dX_r) - kX * kX * kZ * val_i;
1164     const ST gh_xxz_i =
1165         f3_xxz_i - (kZ * f_xx_r + 2 * kX * f_xz_r) - (kX * kX * dZ_i + 2 * kX * kZ * dX_i) + kX * kX * kZ * val_r;
1166     const ST gh_xyy_r =
1167         f3_xyy_r + (2 * kY * f_xy_i + kX * f_yy_i) - (2 * kX * kY * dY_r + kY * kY * dX_r) - kX * kY * kY * val_i;
1168     const ST gh_xyy_i =
1169         f3_xyy_i - (2 * kY * f_xy_r + kX * f_yy_r) - (2 * kX * kY * dY_i + kY * kY * dX_i) + kX * kY * kY * val_r;
1170     const ST gh_xyz_r = f3_xyz_r + (kX * f_yz_i + kY * f_xz_i + kZ * f_xy_i) -
1171         (kX * kY * dZ_r + kY * kZ * dX_r + kZ * kX * dY_r) - kX * kY * kZ * val_i;
1172     const ST gh_xyz_i = f3_xyz_i - (kX * f_yz_r + kY * f_xz_r + kZ * f_xy_r) -
1173         (kX * kY * dZ_i + kY * kZ * dX_i + kZ * kX * dY_i) + kX * kY * kZ * val_r;
1174     const ST gh_xzz_r =
1175         f3_xzz_r + (2 * kZ * f_xz_i + kX * f_zz_i) - (2 * kX * kZ * dZ_r + kZ * kZ * dX_r) - kX * kZ * kZ * val_i;
1176     const ST gh_xzz_i =
1177         f3_xzz_i - (2 * kZ * f_xz_r + kX * f_zz_r) - (2 * kX * kZ * dZ_i + kZ * kZ * dX_i) + kX * kZ * kZ * val_r;
1178     const ST gh_yyy_r = f3_yyy_r + 3 * kY * f_yy_i - 3 * kY * kY * dY_r - kY * kY * kY * val_i;
1179     const ST gh_yyy_i = f3_yyy_i - 3 * kY * f_yy_r - 3 * kY * kY * dY_i + kY * kY * kY * val_r;
1180     const ST gh_yyz_r =
1181         f3_yyz_r + (kZ * f_yy_i + 2 * kY * f_yz_i) - (kY * kY * dZ_r + 2 * kY * kZ * dY_r) - kY * kY * kZ * val_i;
1182     const ST gh_yyz_i =
1183         f3_yyz_i - (kZ * f_yy_r + 2 * kY * f_yz_r) - (kY * kY * dZ_i + 2 * kY * kZ * dY_i) + kY * kY * kZ * val_r;
1184     const ST gh_yzz_r =
1185         f3_yzz_r + (2 * kZ * f_yz_i + kY * f_zz_i) - (2 * kY * kZ * dZ_r + kZ * kZ * dY_r) - kY * kZ * kZ * val_i;
1186     const ST gh_yzz_i =
1187         f3_yzz_i - (2 * kZ * f_yz_r + kY * f_zz_r) - (2 * kY * kZ * dZ_i + kZ * kZ * dY_i) + kY * kZ * kZ * val_r;
1188     const ST gh_zzz_r = f3_zzz_r + 3 * kZ * f_zz_i - 3 * kZ * kZ * dZ_r - kZ * kZ * kZ * val_i;
1189     const ST gh_zzz_i = f3_zzz_i - 3 * kZ * f_zz_r - 3 * kZ * kZ * dZ_i + kZ * kZ * kZ * val_r;
1190 
1191     grad_grad_grad_psi[psiIndex][0][0] = ComplexT(c * gh_xxx_r - s * gh_xxx_i, c * gh_xxx_i + s * gh_xxx_r);
1192     grad_grad_grad_psi[psiIndex][0][1] = ComplexT(c * gh_xxy_r - s * gh_xxy_i, c * gh_xxy_i + s * gh_xxy_r);
1193     grad_grad_grad_psi[psiIndex][0][2] = ComplexT(c * gh_xxz_r - s * gh_xxz_i, c * gh_xxz_i + s * gh_xxz_r);
1194     grad_grad_grad_psi[psiIndex][0][3] = ComplexT(c * gh_xxy_r - s * gh_xxy_i, c * gh_xxy_i + s * gh_xxy_r);
1195     grad_grad_grad_psi[psiIndex][0][4] = ComplexT(c * gh_xyy_r - s * gh_xyy_i, c * gh_xyy_i + s * gh_xyy_r);
1196     grad_grad_grad_psi[psiIndex][0][5] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
1197     grad_grad_grad_psi[psiIndex][0][6] = ComplexT(c * gh_xxz_r - s * gh_xxz_i, c * gh_xxz_i + s * gh_xxz_r);
1198     grad_grad_grad_psi[psiIndex][0][7] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
1199     grad_grad_grad_psi[psiIndex][0][8] = ComplexT(c * gh_xzz_r - s * gh_xzz_i, c * gh_xzz_i + s * gh_xzz_r);
1200 
1201     grad_grad_grad_psi[psiIndex][1][0] = ComplexT(c * gh_xxy_r - s * gh_xxy_i, c * gh_xxy_i + s * gh_xxy_r);
1202     grad_grad_grad_psi[psiIndex][1][1] = ComplexT(c * gh_xyy_r - s * gh_xyy_i, c * gh_xyy_i + s * gh_xyy_r);
1203     grad_grad_grad_psi[psiIndex][1][2] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
1204     grad_grad_grad_psi[psiIndex][1][3] = ComplexT(c * gh_xyy_r - s * gh_xyy_i, c * gh_xyy_i + s * gh_xyy_r);
1205     grad_grad_grad_psi[psiIndex][1][4] = ComplexT(c * gh_yyy_r - s * gh_yyy_i, c * gh_yyy_i + s * gh_yyy_r);
1206     grad_grad_grad_psi[psiIndex][1][5] = ComplexT(c * gh_yyz_r - s * gh_yyz_i, c * gh_yyz_i + s * gh_yyz_r);
1207     grad_grad_grad_psi[psiIndex][1][6] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
1208     grad_grad_grad_psi[psiIndex][1][7] = ComplexT(c * gh_yyz_r - s * gh_yyz_i, c * gh_yyz_i + s * gh_yyz_r);
1209     grad_grad_grad_psi[psiIndex][1][8] = ComplexT(c * gh_yzz_r - s * gh_yzz_i, c * gh_yzz_i + s * gh_yzz_r);
1210 
1211 
1212     grad_grad_grad_psi[psiIndex][2][0] = ComplexT(c * gh_xxz_r - s * gh_xxz_i, c * gh_xxz_i + s * gh_xxz_r);
1213     grad_grad_grad_psi[psiIndex][2][1] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
1214     grad_grad_grad_psi[psiIndex][2][2] = ComplexT(c * gh_xzz_r - s * gh_xzz_i, c * gh_xzz_i + s * gh_xzz_r);
1215     grad_grad_grad_psi[psiIndex][2][3] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
1216     grad_grad_grad_psi[psiIndex][2][4] = ComplexT(c * gh_yyz_r - s * gh_yyz_i, c * gh_yyz_i + s * gh_yyz_r);
1217     grad_grad_grad_psi[psiIndex][2][5] = ComplexT(c * gh_yzz_r - s * gh_yzz_i, c * gh_yzz_i + s * gh_yzz_r);
1218     grad_grad_grad_psi[psiIndex][2][6] = ComplexT(c * gh_xzz_r - s * gh_xzz_i, c * gh_xzz_i + s * gh_xzz_r);
1219     grad_grad_grad_psi[psiIndex][2][7] = ComplexT(c * gh_yzz_r - s * gh_yzz_i, c * gh_yzz_i + s * gh_yzz_r);
1220     grad_grad_grad_psi[psiIndex][2][8] = ComplexT(c * gh_zzz_r - s * gh_zzz_i, c * gh_zzz_i + s * gh_zzz_r);
1221   }
1222 }
1223 
1224 template<typename ST>
evaluateVGHGH(const ParticleSet & P,const int iat,ValueVector_t & psi,GradVector_t & dpsi,HessVector_t & grad_grad_psi,GGGVector_t & grad_grad_grad_psi)1225 void SplineC2COMPTarget<ST>::evaluateVGHGH(const ParticleSet& P,
1226                                            const int iat,
1227                                            ValueVector_t& psi,
1228                                            GradVector_t& dpsi,
1229                                            HessVector_t& grad_grad_psi,
1230                                            GGGVector_t& grad_grad_grad_psi)
1231 {
1232   const PointType& r = P.activeR(iat);
1233   PointType ru(PrimLattice.toUnit_floor(r));
1234 #pragma omp parallel
1235   {
1236     int first, last;
1237     FairDivideAligned(myV.size(), getAlignment<ST>(), omp_get_num_threads(), omp_get_thread_num(), first, last);
1238 
1239     spline2::evaluate3d_vghgh(SplineInst->getSplinePtr(), ru, myV, myG, myH, mygH, first, last);
1240     assign_vghgh(r, psi, dpsi, grad_grad_psi, grad_grad_grad_psi, first / 2, last / 2);
1241   }
1242 }
1243 
1244 template<typename ST>
evaluate_notranspose(const ParticleSet & P,int first,int last,ValueMatrix_t & logdet,GradMatrix_t & dlogdet,ValueMatrix_t & d2logdet)1245 void SplineC2COMPTarget<ST>::evaluate_notranspose(const ParticleSet& P,
1246                                                   int first,
1247                                                   int last,
1248                                                   ValueMatrix_t& logdet,
1249                                                   GradMatrix_t& dlogdet,
1250                                                   ValueMatrix_t& d2logdet)
1251 {
1252   // chunk the [first, last) loop into blocks to save temporary memory usage
1253   const int block_size = 16;
1254 
1255   // reference vectors refer to the rows of matrices
1256   std::vector<ValueVector_t> multi_psi_v;
1257   std::vector<GradVector_t> multi_dpsi_v;
1258   std::vector<ValueVector_t> multi_d2psi_v;
1259   RefVector<ValueVector_t> psi_v_list;
1260   RefVector<GradVector_t> dpsi_v_list;
1261   RefVector<ValueVector_t> d2psi_v_list;
1262 
1263   multi_psi_v.reserve(block_size);
1264   multi_dpsi_v.reserve(block_size);
1265   multi_d2psi_v.reserve(block_size);
1266   psi_v_list.reserve(block_size);
1267   dpsi_v_list.reserve(block_size);
1268   d2psi_v_list.reserve(block_size);
1269 
1270   for (int iat = first, i = 0; iat < last; iat += block_size, i += block_size)
1271   {
1272     const int actual_block_size = std::min(last - iat, block_size);
1273     multi_pos_copy.resize(actual_block_size * 6);
1274     multi_psi_v.clear();
1275     multi_dpsi_v.clear();
1276     multi_d2psi_v.clear();
1277     psi_v_list.clear();
1278     dpsi_v_list.clear();
1279     d2psi_v_list.clear();
1280 
1281     for (int ipos = 0; ipos < actual_block_size; ++ipos)
1282     {
1283       // pack particle positions
1284       const PointType& r = P.activeR(iat + ipos);
1285       PointType ru(PrimLattice.toUnit_floor(r));
1286       multi_pos_copy[ipos * 6]     = r[0];
1287       multi_pos_copy[ipos * 6 + 1] = r[1];
1288       multi_pos_copy[ipos * 6 + 2] = r[2];
1289       multi_pos_copy[ipos * 6 + 3] = ru[0];
1290       multi_pos_copy[ipos * 6 + 4] = ru[1];
1291       multi_pos_copy[ipos * 6 + 5] = ru[2];
1292 
1293       multi_psi_v.emplace_back(logdet[i + ipos], OrbitalSetSize);
1294       multi_dpsi_v.emplace_back(dlogdet[i + ipos], OrbitalSetSize);
1295       multi_d2psi_v.emplace_back(d2logdet[i + ipos], OrbitalSetSize);
1296 
1297       psi_v_list.push_back(multi_psi_v[ipos]);
1298       dpsi_v_list.push_back(multi_dpsi_v[ipos]);
1299       d2psi_v_list.push_back(multi_d2psi_v[ipos]);
1300     }
1301 
1302     evaluateVGLMultiPos(multi_pos_copy, offload_scratch, results_scratch, psi_v_list, dpsi_v_list, d2psi_v_list);
1303   }
1304 }
1305 
1306 template class SplineC2COMPTarget<float>;
1307 template class SplineC2COMPTarget<double>;
1308 
1309 } // namespace qmcplusplus
1310