1 //////////////////////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source License.
3 // See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
6 //
7 // File developed by: Jeongnim Kim, jeongnim.kim@intel.com, Intel Corp.
8 //                    Amrita Mathuriya, amrita.mathuriya@intel.com, Intel Corp.
9 //                    Ye Luo, yeluo@anl.gov, Argonne National Laboratory
10 //
11 // File created by: Jeongnim Kim, jeongnim.kim@intel.com, Intel Corp.
12 //////////////////////////////////////////////////////////////////////////////////////
13 // -*- C++ -*-
14 /**@file MultiBsplineStd.hpp
15  *
16  * Literal port of einspline/multi_bspline_eval_d_std3.cpp by Ye at ANL
17  * Modified to handle both float/double and blocked operations
18  * - template parameter T for the precision
19  * - MUB spline object created by einspline allocators.
20  * Function signatures modified anticipating its use by a class that can perform data parallel execution
21  * - evaluate(...., int first, int last)
22  */
23 #ifndef SPLINE2_OFFLOAD_MULTIEINSPLINE_VGLH_HPP
24 #define SPLINE2_OFFLOAD_MULTIEINSPLINE_VGLH_HPP
25 
26 #include "OMPTarget/OMPstd.hpp"
27 #include "spline2/MultiBsplineData.hpp"
28 #include "spline2/MultiBsplineEval_helper.hpp"
29 
30 namespace spline2offload
31 {
32 template<typename T>
evaluate_vgl_impl(const typename qmcplusplus::bspline_traits<T,3>::SplineType * restrict spline_m,T x,T y,T z,T * restrict vals,T * restrict grads,T * restrict lapl,size_t out_offset,int first,int last)33 inline void evaluate_vgl_impl(const typename qmcplusplus::bspline_traits<T, 3>::SplineType* restrict spline_m,
34                               T x,
35                               T y,
36                               T z,
37                               T* restrict vals,
38                               T* restrict grads,
39                               T* restrict lapl,
40                               size_t out_offset,
41                               int first,
42                               int last)
43 {
44   x -= spline_m->x_grid.start;
45   y -= spline_m->y_grid.start;
46   z -= spline_m->z_grid.start;
47   T tx, ty, tz;
48   int ix, iy, iz;
49   spline2::getSplineBound(x * spline_m->x_grid.delta_inv, tx, ix, spline_m->x_grid.num - 1);
50   spline2::getSplineBound(y * spline_m->y_grid.delta_inv, ty, iy, spline_m->y_grid.num - 1);
51   spline2::getSplineBound(z * spline_m->z_grid.delta_inv, tz, iz, spline_m->z_grid.num - 1);
52 
53   T a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
54 
55   spline2::MultiBsplineData<T>::compute_prefactors(a, da, d2a, tx);
56   spline2::MultiBsplineData<T>::compute_prefactors(b, db, d2b, ty);
57   spline2::MultiBsplineData<T>::compute_prefactors(c, dc, d2c, tz);
58 
59   const intptr_t xs = spline_m->x_stride;
60   const intptr_t ys = spline_m->y_stride;
61   const intptr_t zs = spline_m->z_stride;
62 
63   const int num_splines = last - first;
64 
65   T* restrict gx = grads;
66   T* restrict gy = grads + out_offset;
67   T* restrict gz = grads + 2 * out_offset;
68   T* restrict lx = lapl;
69   T* restrict ly = lapl + out_offset;
70   T* restrict lz = lapl + 2 * out_offset;
71 
72   OMPstd::fill_n(vals, num_splines, T());
73   OMPstd::fill_n(gx, num_splines, T());
74   OMPstd::fill_n(gy, num_splines, T());
75   OMPstd::fill_n(gz, num_splines, T());
76   OMPstd::fill_n(lx, num_splines, T());
77   OMPstd::fill_n(ly, num_splines, T());
78   OMPstd::fill_n(lz, num_splines, T());
79 
80   for (int i = 0; i < 4; i++)
81     for (int j = 0; j < 4; j++)
82     {
83       const T pre20 = d2a[i] * b[j];
84       const T pre10 = da[i] * b[j];
85       const T pre00 = a[i] * b[j];
86       const T pre11 = da[i] * db[j];
87       const T pre01 = a[i] * db[j];
88       const T pre02 = a[i] * d2b[j];
89 
90       const T* restrict coefs    = spline_m->coefs + ((ix + i) * xs + (iy + j) * ys + iz * zs) + first;
91       const T* restrict coefszs  = coefs + zs;
92       const T* restrict coefs2zs = coefs + 2 * zs;
93       const T* restrict coefs3zs = coefs + 3 * zs;
94 
95 #ifdef ENABLE_OFFLOAD
96 #pragma omp for
97 #else
98 #pragma omp simd aligned(coefs, coefszs, coefs2zs, coefs3zs, gx, gy, gz, lx, ly, lz, vals: QMC_SIMD_ALIGNMENT)
99 #endif
100       for (int n = 0; n < num_splines; n++)
101       {
102         const T coefsv    = coefs[n];
103         const T coefsvzs  = coefszs[n];
104         const T coefsv2zs = coefs2zs[n];
105         const T coefsv3zs = coefs3zs[n];
106 
107         T sum0 = c[0] * coefsv + c[1] * coefsvzs + c[2] * coefsv2zs + c[3] * coefsv3zs;
108         T sum1 = dc[0] * coefsv + dc[1] * coefsvzs + dc[2] * coefsv2zs + dc[3] * coefsv3zs;
109         T sum2 = d2c[0] * coefsv + d2c[1] * coefsvzs + d2c[2] * coefsv2zs + d2c[3] * coefsv3zs;
110         gx[n] += pre10 * sum0;
111         gy[n] += pre01 * sum0;
112         gz[n] += pre00 * sum1;
113         lx[n] += pre20 * sum0;
114         ly[n] += pre02 * sum0;
115         lz[n] += pre00 * sum2;
116         vals[n] += pre00 * sum0;
117       }
118     }
119 
120   const T dxInv = spline_m->x_grid.delta_inv;
121   const T dyInv = spline_m->y_grid.delta_inv;
122   const T dzInv = spline_m->z_grid.delta_inv;
123 
124   const T dxInv2 = dxInv * dxInv;
125   const T dyInv2 = dyInv * dyInv;
126   const T dzInv2 = dzInv * dzInv;
127 
128 #ifdef ENABLE_OFFLOAD
129 #pragma omp for
130 #else
131 #pragma omp simd aligned(gx, gy, gz, lx: QMC_SIMD_ALIGNMENT)
132 #endif
133   for (int n = 0; n < num_splines; n++)
134   {
135     gx[n] *= dxInv;
136     gy[n] *= dyInv;
137     gz[n] *= dzInv;
138     lx[n] = lx[n] * dxInv2 + ly[n] * dyInv2 + lz[n] * dzInv2;
139   }
140 }
141 
142 template<typename T>
evaluate_vgh_impl(const typename qmcplusplus::bspline_traits<T,3>::SplineType * restrict spline_m,T x,T y,T z,T * restrict vals,T * restrict grads,T * restrict hess,size_t out_offset,int first,int last)143 inline void evaluate_vgh_impl(const typename qmcplusplus::bspline_traits<T, 3>::SplineType* restrict spline_m,
144                               T x,
145                               T y,
146                               T z,
147                               T* restrict vals,
148                               T* restrict grads,
149                               T* restrict hess,
150                               size_t out_offset,
151                               int first,
152                               int last)
153 {
154   int ix, iy, iz;
155   T tx, ty, tz;
156   T a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
157 
158   x -= spline_m->x_grid.start;
159   y -= spline_m->y_grid.start;
160   z -= spline_m->z_grid.start;
161   spline2::getSplineBound(x * spline_m->x_grid.delta_inv, tx, ix, spline_m->x_grid.num - 1);
162   spline2::getSplineBound(y * spline_m->y_grid.delta_inv, ty, iy, spline_m->y_grid.num - 1);
163   spline2::getSplineBound(z * spline_m->z_grid.delta_inv, tz, iz, spline_m->z_grid.num - 1);
164 
165   spline2::MultiBsplineData<T>::compute_prefactors(a, da, d2a, tx);
166   spline2::MultiBsplineData<T>::compute_prefactors(b, db, d2b, ty);
167   spline2::MultiBsplineData<T>::compute_prefactors(c, dc, d2c, tz);
168 
169   const intptr_t xs = spline_m->x_stride;
170   const intptr_t ys = spline_m->y_stride;
171   const intptr_t zs = spline_m->z_stride;
172 
173   const int num_splines = last - first;
174 
175   T* restrict gx = grads;
176   T* restrict gy = grads + out_offset;
177   T* restrict gz = grads + 2 * out_offset;
178 
179   T* restrict hxx = hess;
180   T* restrict hxy = hess + out_offset;
181   T* restrict hxz = hess + 2 * out_offset;
182   T* restrict hyy = hess + 3 * out_offset;
183   T* restrict hyz = hess + 4 * out_offset;
184   T* restrict hzz = hess + 5 * out_offset;
185 
186   OMPstd::fill_n(vals, num_splines, T());
187   OMPstd::fill_n(gx, num_splines, T());
188   OMPstd::fill_n(gy, num_splines, T());
189   OMPstd::fill_n(gz, num_splines, T());
190   OMPstd::fill_n(hxx, num_splines, T());
191   OMPstd::fill_n(hxy, num_splines, T());
192   OMPstd::fill_n(hxz, num_splines, T());
193   OMPstd::fill_n(hyy, num_splines, T());
194   OMPstd::fill_n(hyz, num_splines, T());
195   OMPstd::fill_n(hzz, num_splines, T());
196 
197   for (int i = 0; i < 4; i++)
198     for (int j = 0; j < 4; j++)
199     {
200       const T* restrict coefs    = spline_m->coefs + ((ix + i) * xs + (iy + j) * ys + iz * zs) + first;
201       const T* restrict coefszs  = coefs + zs;
202       const T* restrict coefs2zs = coefs + 2 * zs;
203       const T* restrict coefs3zs = coefs + 3 * zs;
204 
205       const T pre20 = d2a[i] * b[j];
206       const T pre10 = da[i] * b[j];
207       const T pre00 = a[i] * b[j];
208       const T pre11 = da[i] * db[j];
209       const T pre01 = a[i] * db[j];
210       const T pre02 = a[i] * d2b[j];
211 
212 #ifdef ENABLE_OFFLOAD
213 #pragma omp for
214 #else
215 #pragma omp simd aligned(coefs, coefszs, coefs2zs, coefs3zs, vals, gx, gy, gz, hxx, hyy, hzz, hxy, hxz, hyz: QMC_SIMD_ALIGNMENT)
216 #endif
217       for (int n = 0; n < num_splines; n++)
218       {
219         T coefsv    = coefs[n];
220         T coefsvzs  = coefszs[n];
221         T coefsv2zs = coefs2zs[n];
222         T coefsv3zs = coefs3zs[n];
223 
224         T sum0 = c[0] * coefsv + c[1] * coefsvzs + c[2] * coefsv2zs + c[3] * coefsv3zs;
225         T sum1 = dc[0] * coefsv + dc[1] * coefsvzs + dc[2] * coefsv2zs + dc[3] * coefsv3zs;
226         T sum2 = d2c[0] * coefsv + d2c[1] * coefsvzs + d2c[2] * coefsv2zs + d2c[3] * coefsv3zs;
227 
228         hxx[n] += pre20 * sum0;
229         hxy[n] += pre11 * sum0;
230         hxz[n] += pre10 * sum1;
231         hyy[n] += pre02 * sum0;
232         hyz[n] += pre01 * sum1;
233         hzz[n] += pre00 * sum2;
234         gx[n] += pre10 * sum0;
235         gy[n] += pre01 * sum0;
236         gz[n] += pre00 * sum1;
237         vals[n] += pre00 * sum0;
238       }
239     }
240 
241   const T dxInv = spline_m->x_grid.delta_inv;
242   const T dyInv = spline_m->y_grid.delta_inv;
243   const T dzInv = spline_m->z_grid.delta_inv;
244   const T dxx   = dxInv * dxInv;
245   const T dyy   = dyInv * dyInv;
246   const T dzz   = dzInv * dzInv;
247   const T dxy   = dxInv * dyInv;
248   const T dxz   = dxInv * dzInv;
249   const T dyz   = dyInv * dzInv;
250 
251 #ifdef ENABLE_OFFLOAD
252 #pragma omp for
253 #else
254 #pragma omp simd aligned(gx, gy, gz, hxx, hyy, hzz, hxy, hxz, hyz: QMC_SIMD_ALIGNMENT)
255 #endif
256   for (int n = 0; n < num_splines; n++)
257   {
258     gx[n] *= dxInv;
259     gy[n] *= dyInv;
260     gz[n] *= dzInv;
261     hxx[n] *= dxx;
262     hyy[n] *= dyy;
263     hzz[n] *= dzz;
264     hxy[n] *= dxy;
265     hxz[n] *= dxz;
266     hyz[n] *= dyz;
267   }
268 }
269 
270 template<typename T>
evaluate_vgh_impl_v2(const typename qmcplusplus::bspline_traits<T,3>::SplineType * restrict spline_m,int ix,int iy,int iz,const T a[4],const T b[4],const T c[4],const T da[4],const T db[4],const T dc[4],const T d2a[4],const T d2b[4],const T d2c[4],T * restrict vals,T * restrict grads,T * restrict hess,const size_t out_offset,const int first,const int index)271 inline void evaluate_vgh_impl_v2(const typename qmcplusplus::bspline_traits<T, 3>::SplineType* restrict spline_m,
272                                  int ix,
273                                  int iy,
274                                  int iz,
275                                  const T a[4],
276                                  const T b[4],
277                                  const T c[4],
278                                  const T da[4],
279                                  const T db[4],
280                                  const T dc[4],
281                                  const T d2a[4],
282                                  const T d2b[4],
283                                  const T d2c[4],
284                                  T* restrict vals,
285                                  T* restrict grads,
286                                  T* restrict hess,
287                                  const size_t out_offset,
288                                  const int first,
289                                  const int index)
290 {
291   const intptr_t xs = spline_m->x_stride;
292   const intptr_t ys = spline_m->y_stride;
293   const intptr_t zs = spline_m->z_stride;
294 
295   T val = T();
296   T gx  = T();
297   T gy  = T();
298   T gz  = T();
299   T hxx = T();
300   T hxy = T();
301   T hxz = T();
302   T hyy = T();
303   T hyz = T();
304   T hzz = T();
305 
306   for (int i = 0; i < 4; i++)
307     for (int j = 0; j < 4; j++)
308     {
309       const T* restrict coefs    = spline_m->coefs + ((ix + i) * xs + (iy + j) * ys + iz * zs) + first;
310       const T* restrict coefszs  = coefs + zs;
311       const T* restrict coefs2zs = coefs + 2 * zs;
312       const T* restrict coefs3zs = coefs + 3 * zs;
313 
314       const T pre20 = d2a[i] * b[j];
315       const T pre10 = da[i] * b[j];
316       const T pre00 = a[i] * b[j];
317       const T pre11 = da[i] * db[j];
318       const T pre01 = a[i] * db[j];
319       const T pre02 = a[i] * d2b[j];
320 
321       T coefsv    = coefs[index];
322       T coefsvzs  = coefszs[index];
323       T coefsv2zs = coefs2zs[index];
324       T coefsv3zs = coefs3zs[index];
325 
326       T sum0 = c[0] * coefsv + c[1] * coefsvzs + c[2] * coefsv2zs + c[3] * coefsv3zs;
327       T sum1 = dc[0] * coefsv + dc[1] * coefsvzs + dc[2] * coefsv2zs + dc[3] * coefsv3zs;
328       T sum2 = d2c[0] * coefsv + d2c[1] * coefsvzs + d2c[2] * coefsv2zs + d2c[3] * coefsv3zs;
329 
330       hxx += pre20 * sum0;
331       hxy += pre11 * sum0;
332       hxz += pre10 * sum1;
333       hyy += pre02 * sum0;
334       hyz += pre01 * sum1;
335       hzz += pre00 * sum2;
336       gx += pre10 * sum0;
337       gy += pre01 * sum0;
338       gz += pre00 * sum1;
339       val += pre00 * sum0;
340     }
341 
342   // put data back to the result vector
343   vals[index] = val;
344 
345   const T dxInv                 = spline_m->x_grid.delta_inv;
346   const T dyInv                 = spline_m->y_grid.delta_inv;
347   const T dzInv                 = spline_m->z_grid.delta_inv;
348   grads[index]                  = gx * dxInv;
349   grads[index + out_offset]     = gy * dyInv;
350   grads[index + 2 * out_offset] = gz * dzInv;
351   hess[index]                   = hxx * dxInv * dxInv;
352   hess[index + out_offset]      = hxy * dxInv * dyInv;
353   hess[index + 2 * out_offset]  = hxz * dxInv * dzInv;
354   hess[index + 3 * out_offset]  = hyy * dyInv * dyInv;
355   hess[index + 4 * out_offset]  = hyz * dyInv * dzInv;
356   hess[index + 5 * out_offset]  = hzz * dzInv * dzInv;
357 }
358 
359 } // namespace spline2offload
360 #endif
361