1 //////////////////////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source License.
3 // See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
6 //
7 // File developed by: Jeongnim Kim, jeongnim.kim@intel.com, Intel Corp.
8 // Amrita Mathuriya, amrita.mathuriya@intel.com, Intel Corp.
9 // Ye Luo, yeluo@anl.gov, Argonne National Laboratory
10 //
11 // File created by: Jeongnim Kim, jeongnim.kim@intel.com, Intel Corp.
12 //////////////////////////////////////////////////////////////////////////////////////
13 // -*- C++ -*-
14 /**@file MultiBsplineStd.hpp
15 *
16 * Literal port of einspline/multi_bspline_eval_d_std3.cpp by Ye at ANL
17 * Modified to handle both float/double and blocked operations
18 * - template parameter T for the precision
19 * - MUB spline object created by einspline allocators.
20 * Function signatures modified anticipating its use by a class that can perform data parallel execution
21 * - evaluate(...., int first, int last)
22 */
23 #ifndef SPLINE2_OFFLOAD_MULTIEINSPLINE_VGLH_HPP
24 #define SPLINE2_OFFLOAD_MULTIEINSPLINE_VGLH_HPP
25
26 #include "OMPTarget/OMPstd.hpp"
27 #include "spline2/MultiBsplineData.hpp"
28 #include "spline2/MultiBsplineEval_helper.hpp"
29
30 namespace spline2offload
31 {
32 template<typename T>
evaluate_vgl_impl(const typename qmcplusplus::bspline_traits<T,3>::SplineType * restrict spline_m,T x,T y,T z,T * restrict vals,T * restrict grads,T * restrict lapl,size_t out_offset,int first,int last)33 inline void evaluate_vgl_impl(const typename qmcplusplus::bspline_traits<T, 3>::SplineType* restrict spline_m,
34 T x,
35 T y,
36 T z,
37 T* restrict vals,
38 T* restrict grads,
39 T* restrict lapl,
40 size_t out_offset,
41 int first,
42 int last)
43 {
44 x -= spline_m->x_grid.start;
45 y -= spline_m->y_grid.start;
46 z -= spline_m->z_grid.start;
47 T tx, ty, tz;
48 int ix, iy, iz;
49 spline2::getSplineBound(x * spline_m->x_grid.delta_inv, tx, ix, spline_m->x_grid.num - 1);
50 spline2::getSplineBound(y * spline_m->y_grid.delta_inv, ty, iy, spline_m->y_grid.num - 1);
51 spline2::getSplineBound(z * spline_m->z_grid.delta_inv, tz, iz, spline_m->z_grid.num - 1);
52
53 T a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
54
55 spline2::MultiBsplineData<T>::compute_prefactors(a, da, d2a, tx);
56 spline2::MultiBsplineData<T>::compute_prefactors(b, db, d2b, ty);
57 spline2::MultiBsplineData<T>::compute_prefactors(c, dc, d2c, tz);
58
59 const intptr_t xs = spline_m->x_stride;
60 const intptr_t ys = spline_m->y_stride;
61 const intptr_t zs = spline_m->z_stride;
62
63 const int num_splines = last - first;
64
65 T* restrict gx = grads;
66 T* restrict gy = grads + out_offset;
67 T* restrict gz = grads + 2 * out_offset;
68 T* restrict lx = lapl;
69 T* restrict ly = lapl + out_offset;
70 T* restrict lz = lapl + 2 * out_offset;
71
72 OMPstd::fill_n(vals, num_splines, T());
73 OMPstd::fill_n(gx, num_splines, T());
74 OMPstd::fill_n(gy, num_splines, T());
75 OMPstd::fill_n(gz, num_splines, T());
76 OMPstd::fill_n(lx, num_splines, T());
77 OMPstd::fill_n(ly, num_splines, T());
78 OMPstd::fill_n(lz, num_splines, T());
79
80 for (int i = 0; i < 4; i++)
81 for (int j = 0; j < 4; j++)
82 {
83 const T pre20 = d2a[i] * b[j];
84 const T pre10 = da[i] * b[j];
85 const T pre00 = a[i] * b[j];
86 const T pre11 = da[i] * db[j];
87 const T pre01 = a[i] * db[j];
88 const T pre02 = a[i] * d2b[j];
89
90 const T* restrict coefs = spline_m->coefs + ((ix + i) * xs + (iy + j) * ys + iz * zs) + first;
91 const T* restrict coefszs = coefs + zs;
92 const T* restrict coefs2zs = coefs + 2 * zs;
93 const T* restrict coefs3zs = coefs + 3 * zs;
94
95 #ifdef ENABLE_OFFLOAD
96 #pragma omp for
97 #else
98 #pragma omp simd aligned(coefs, coefszs, coefs2zs, coefs3zs, gx, gy, gz, lx, ly, lz, vals: QMC_SIMD_ALIGNMENT)
99 #endif
100 for (int n = 0; n < num_splines; n++)
101 {
102 const T coefsv = coefs[n];
103 const T coefsvzs = coefszs[n];
104 const T coefsv2zs = coefs2zs[n];
105 const T coefsv3zs = coefs3zs[n];
106
107 T sum0 = c[0] * coefsv + c[1] * coefsvzs + c[2] * coefsv2zs + c[3] * coefsv3zs;
108 T sum1 = dc[0] * coefsv + dc[1] * coefsvzs + dc[2] * coefsv2zs + dc[3] * coefsv3zs;
109 T sum2 = d2c[0] * coefsv + d2c[1] * coefsvzs + d2c[2] * coefsv2zs + d2c[3] * coefsv3zs;
110 gx[n] += pre10 * sum0;
111 gy[n] += pre01 * sum0;
112 gz[n] += pre00 * sum1;
113 lx[n] += pre20 * sum0;
114 ly[n] += pre02 * sum0;
115 lz[n] += pre00 * sum2;
116 vals[n] += pre00 * sum0;
117 }
118 }
119
120 const T dxInv = spline_m->x_grid.delta_inv;
121 const T dyInv = spline_m->y_grid.delta_inv;
122 const T dzInv = spline_m->z_grid.delta_inv;
123
124 const T dxInv2 = dxInv * dxInv;
125 const T dyInv2 = dyInv * dyInv;
126 const T dzInv2 = dzInv * dzInv;
127
128 #ifdef ENABLE_OFFLOAD
129 #pragma omp for
130 #else
131 #pragma omp simd aligned(gx, gy, gz, lx: QMC_SIMD_ALIGNMENT)
132 #endif
133 for (int n = 0; n < num_splines; n++)
134 {
135 gx[n] *= dxInv;
136 gy[n] *= dyInv;
137 gz[n] *= dzInv;
138 lx[n] = lx[n] * dxInv2 + ly[n] * dyInv2 + lz[n] * dzInv2;
139 }
140 }
141
142 template<typename T>
evaluate_vgh_impl(const typename qmcplusplus::bspline_traits<T,3>::SplineType * restrict spline_m,T x,T y,T z,T * restrict vals,T * restrict grads,T * restrict hess,size_t out_offset,int first,int last)143 inline void evaluate_vgh_impl(const typename qmcplusplus::bspline_traits<T, 3>::SplineType* restrict spline_m,
144 T x,
145 T y,
146 T z,
147 T* restrict vals,
148 T* restrict grads,
149 T* restrict hess,
150 size_t out_offset,
151 int first,
152 int last)
153 {
154 int ix, iy, iz;
155 T tx, ty, tz;
156 T a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
157
158 x -= spline_m->x_grid.start;
159 y -= spline_m->y_grid.start;
160 z -= spline_m->z_grid.start;
161 spline2::getSplineBound(x * spline_m->x_grid.delta_inv, tx, ix, spline_m->x_grid.num - 1);
162 spline2::getSplineBound(y * spline_m->y_grid.delta_inv, ty, iy, spline_m->y_grid.num - 1);
163 spline2::getSplineBound(z * spline_m->z_grid.delta_inv, tz, iz, spline_m->z_grid.num - 1);
164
165 spline2::MultiBsplineData<T>::compute_prefactors(a, da, d2a, tx);
166 spline2::MultiBsplineData<T>::compute_prefactors(b, db, d2b, ty);
167 spline2::MultiBsplineData<T>::compute_prefactors(c, dc, d2c, tz);
168
169 const intptr_t xs = spline_m->x_stride;
170 const intptr_t ys = spline_m->y_stride;
171 const intptr_t zs = spline_m->z_stride;
172
173 const int num_splines = last - first;
174
175 T* restrict gx = grads;
176 T* restrict gy = grads + out_offset;
177 T* restrict gz = grads + 2 * out_offset;
178
179 T* restrict hxx = hess;
180 T* restrict hxy = hess + out_offset;
181 T* restrict hxz = hess + 2 * out_offset;
182 T* restrict hyy = hess + 3 * out_offset;
183 T* restrict hyz = hess + 4 * out_offset;
184 T* restrict hzz = hess + 5 * out_offset;
185
186 OMPstd::fill_n(vals, num_splines, T());
187 OMPstd::fill_n(gx, num_splines, T());
188 OMPstd::fill_n(gy, num_splines, T());
189 OMPstd::fill_n(gz, num_splines, T());
190 OMPstd::fill_n(hxx, num_splines, T());
191 OMPstd::fill_n(hxy, num_splines, T());
192 OMPstd::fill_n(hxz, num_splines, T());
193 OMPstd::fill_n(hyy, num_splines, T());
194 OMPstd::fill_n(hyz, num_splines, T());
195 OMPstd::fill_n(hzz, num_splines, T());
196
197 for (int i = 0; i < 4; i++)
198 for (int j = 0; j < 4; j++)
199 {
200 const T* restrict coefs = spline_m->coefs + ((ix + i) * xs + (iy + j) * ys + iz * zs) + first;
201 const T* restrict coefszs = coefs + zs;
202 const T* restrict coefs2zs = coefs + 2 * zs;
203 const T* restrict coefs3zs = coefs + 3 * zs;
204
205 const T pre20 = d2a[i] * b[j];
206 const T pre10 = da[i] * b[j];
207 const T pre00 = a[i] * b[j];
208 const T pre11 = da[i] * db[j];
209 const T pre01 = a[i] * db[j];
210 const T pre02 = a[i] * d2b[j];
211
212 #ifdef ENABLE_OFFLOAD
213 #pragma omp for
214 #else
215 #pragma omp simd aligned(coefs, coefszs, coefs2zs, coefs3zs, vals, gx, gy, gz, hxx, hyy, hzz, hxy, hxz, hyz: QMC_SIMD_ALIGNMENT)
216 #endif
217 for (int n = 0; n < num_splines; n++)
218 {
219 T coefsv = coefs[n];
220 T coefsvzs = coefszs[n];
221 T coefsv2zs = coefs2zs[n];
222 T coefsv3zs = coefs3zs[n];
223
224 T sum0 = c[0] * coefsv + c[1] * coefsvzs + c[2] * coefsv2zs + c[3] * coefsv3zs;
225 T sum1 = dc[0] * coefsv + dc[1] * coefsvzs + dc[2] * coefsv2zs + dc[3] * coefsv3zs;
226 T sum2 = d2c[0] * coefsv + d2c[1] * coefsvzs + d2c[2] * coefsv2zs + d2c[3] * coefsv3zs;
227
228 hxx[n] += pre20 * sum0;
229 hxy[n] += pre11 * sum0;
230 hxz[n] += pre10 * sum1;
231 hyy[n] += pre02 * sum0;
232 hyz[n] += pre01 * sum1;
233 hzz[n] += pre00 * sum2;
234 gx[n] += pre10 * sum0;
235 gy[n] += pre01 * sum0;
236 gz[n] += pre00 * sum1;
237 vals[n] += pre00 * sum0;
238 }
239 }
240
241 const T dxInv = spline_m->x_grid.delta_inv;
242 const T dyInv = spline_m->y_grid.delta_inv;
243 const T dzInv = spline_m->z_grid.delta_inv;
244 const T dxx = dxInv * dxInv;
245 const T dyy = dyInv * dyInv;
246 const T dzz = dzInv * dzInv;
247 const T dxy = dxInv * dyInv;
248 const T dxz = dxInv * dzInv;
249 const T dyz = dyInv * dzInv;
250
251 #ifdef ENABLE_OFFLOAD
252 #pragma omp for
253 #else
254 #pragma omp simd aligned(gx, gy, gz, hxx, hyy, hzz, hxy, hxz, hyz: QMC_SIMD_ALIGNMENT)
255 #endif
256 for (int n = 0; n < num_splines; n++)
257 {
258 gx[n] *= dxInv;
259 gy[n] *= dyInv;
260 gz[n] *= dzInv;
261 hxx[n] *= dxx;
262 hyy[n] *= dyy;
263 hzz[n] *= dzz;
264 hxy[n] *= dxy;
265 hxz[n] *= dxz;
266 hyz[n] *= dyz;
267 }
268 }
269
270 template<typename T>
evaluate_vgh_impl_v2(const typename qmcplusplus::bspline_traits<T,3>::SplineType * restrict spline_m,int ix,int iy,int iz,const T a[4],const T b[4],const T c[4],const T da[4],const T db[4],const T dc[4],const T d2a[4],const T d2b[4],const T d2c[4],T * restrict vals,T * restrict grads,T * restrict hess,const size_t out_offset,const int first,const int index)271 inline void evaluate_vgh_impl_v2(const typename qmcplusplus::bspline_traits<T, 3>::SplineType* restrict spline_m,
272 int ix,
273 int iy,
274 int iz,
275 const T a[4],
276 const T b[4],
277 const T c[4],
278 const T da[4],
279 const T db[4],
280 const T dc[4],
281 const T d2a[4],
282 const T d2b[4],
283 const T d2c[4],
284 T* restrict vals,
285 T* restrict grads,
286 T* restrict hess,
287 const size_t out_offset,
288 const int first,
289 const int index)
290 {
291 const intptr_t xs = spline_m->x_stride;
292 const intptr_t ys = spline_m->y_stride;
293 const intptr_t zs = spline_m->z_stride;
294
295 T val = T();
296 T gx = T();
297 T gy = T();
298 T gz = T();
299 T hxx = T();
300 T hxy = T();
301 T hxz = T();
302 T hyy = T();
303 T hyz = T();
304 T hzz = T();
305
306 for (int i = 0; i < 4; i++)
307 for (int j = 0; j < 4; j++)
308 {
309 const T* restrict coefs = spline_m->coefs + ((ix + i) * xs + (iy + j) * ys + iz * zs) + first;
310 const T* restrict coefszs = coefs + zs;
311 const T* restrict coefs2zs = coefs + 2 * zs;
312 const T* restrict coefs3zs = coefs + 3 * zs;
313
314 const T pre20 = d2a[i] * b[j];
315 const T pre10 = da[i] * b[j];
316 const T pre00 = a[i] * b[j];
317 const T pre11 = da[i] * db[j];
318 const T pre01 = a[i] * db[j];
319 const T pre02 = a[i] * d2b[j];
320
321 T coefsv = coefs[index];
322 T coefsvzs = coefszs[index];
323 T coefsv2zs = coefs2zs[index];
324 T coefsv3zs = coefs3zs[index];
325
326 T sum0 = c[0] * coefsv + c[1] * coefsvzs + c[2] * coefsv2zs + c[3] * coefsv3zs;
327 T sum1 = dc[0] * coefsv + dc[1] * coefsvzs + dc[2] * coefsv2zs + dc[3] * coefsv3zs;
328 T sum2 = d2c[0] * coefsv + d2c[1] * coefsvzs + d2c[2] * coefsv2zs + d2c[3] * coefsv3zs;
329
330 hxx += pre20 * sum0;
331 hxy += pre11 * sum0;
332 hxz += pre10 * sum1;
333 hyy += pre02 * sum0;
334 hyz += pre01 * sum1;
335 hzz += pre00 * sum2;
336 gx += pre10 * sum0;
337 gy += pre01 * sum0;
338 gz += pre00 * sum1;
339 val += pre00 * sum0;
340 }
341
342 // put data back to the result vector
343 vals[index] = val;
344
345 const T dxInv = spline_m->x_grid.delta_inv;
346 const T dyInv = spline_m->y_grid.delta_inv;
347 const T dzInv = spline_m->z_grid.delta_inv;
348 grads[index] = gx * dxInv;
349 grads[index + out_offset] = gy * dyInv;
350 grads[index + 2 * out_offset] = gz * dzInv;
351 hess[index] = hxx * dxInv * dxInv;
352 hess[index + out_offset] = hxy * dxInv * dyInv;
353 hess[index + 2 * out_offset] = hxz * dxInv * dzInv;
354 hess[index + 3 * out_offset] = hyy * dyInv * dyInv;
355 hess[index + 4 * out_offset] = hyz * dyInv * dzInv;
356 hess[index + 5 * out_offset] = hzz * dzInv * dzInv;
357 }
358
359 } // namespace spline2offload
360 #endif
361