1 //////////////////////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source License.
3 // See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2018 QMCPACK developers.
6 //
7 // File developed by: Raymond Clay III, rclay@sandia.gov, Sandia National Laboratories
8 //
9 // File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
10 //////////////////////////////////////////////////////////////////////////////////////
11 // -*- C++ -*-
12 /**@file MultiBsplineStd.hpp
13  *
14  * Literal port of einspline/multi_bspline_eval_d_std3.cpp by Ye at ANL
15  * Modified to handle both float/double and blocked operations
16  * - template parameter T for the precision
17  * - MUB spline object created by einspline allocators.
18  * Function signatures modified anticipating its use by a class that can perform data parallel execution
19  * - evaluate(...., int first, int last)
20  */
21 #ifndef SPLINE2_MULTIEINSPLINE_VGHGH_HPP
22 #define SPLINE2_MULTIEINSPLINE_VGHGH_HPP
23 
24 namespace spline2
25 {
26 template<typename T>
evaluate_vghgh_impl(const typename qmcplusplus::bspline_traits<T,3>::SplineType * restrict spline_m,T x,T y,T z,T * restrict vals,T * restrict grads,T * restrict hess,T * restrict ghess,size_t out_offset,int first,int last)27 inline void evaluate_vghgh_impl(const typename qmcplusplus::bspline_traits<T, 3>::SplineType* restrict spline_m,
28                                 T x,
29                                 T y,
30                                 T z,
31                                 T* restrict vals,
32                                 T* restrict grads,
33                                 T* restrict hess,
34                                 T* restrict ghess,
35                                 size_t out_offset,
36                                 int first,
37                                 int last)
38 {
39   int ix, iy, iz;
40   T tx, ty, tz;
41   T a[4], b[4], c[4];
42   T da[4], db[4], dc[4];
43   T d2a[4], d2b[4], d2c[4];
44   T d3a[4], d3b[4], d3c[4];
45 
46   x -= spline_m->x_grid.start;
47   y -= spline_m->y_grid.start;
48   z -= spline_m->z_grid.start;
49   getSplineBound(x * spline_m->x_grid.delta_inv, tx, ix, spline_m->x_grid.num - 1);
50   getSplineBound(y * spline_m->y_grid.delta_inv, ty, iy, spline_m->y_grid.num - 1);
51   getSplineBound(z * spline_m->z_grid.delta_inv, tz, iz, spline_m->z_grid.num - 1);
52 
53   MultiBsplineData<T>::compute_prefactors(a, da, d2a, d3a, tx);
54   MultiBsplineData<T>::compute_prefactors(b, db, d2b, d3b, ty);
55   MultiBsplineData<T>::compute_prefactors(c, dc, d2c, d3c, tz);
56 
57   const intptr_t xs = spline_m->x_stride;
58   const intptr_t ys = spline_m->y_stride;
59   const intptr_t zs = spline_m->z_stride;
60 
61   const int num_splines = last - first;
62 
63   T* restrict gx = grads;
64   T* restrict gy = grads + out_offset;
65   T* restrict gz = grads + 2 * out_offset;
66 
67   T* restrict hxx = hess;
68   T* restrict hxy = hess + out_offset;
69   T* restrict hxz = hess + 2 * out_offset;
70   T* restrict hyy = hess + 3 * out_offset;
71   T* restrict hyz = hess + 4 * out_offset;
72   T* restrict hzz = hess + 5 * out_offset;
73 
74   T* restrict gh_xxx = ghess;
75   T* restrict gh_xxy = ghess + out_offset;
76   T* restrict gh_xxz = ghess + 2 * out_offset;
77   T* restrict gh_xyy = ghess + 3 * out_offset;
78   T* restrict gh_xyz = ghess + 4 * out_offset;
79   T* restrict gh_xzz = ghess + 5 * out_offset;
80   T* restrict gh_yyy = ghess + 6 * out_offset;
81   T* restrict gh_yyz = ghess + 7 * out_offset;
82   T* restrict gh_yzz = ghess + 8 * out_offset;
83   T* restrict gh_zzz = ghess + 9 * out_offset;
84 
85   std::fill(vals, vals + num_splines, T());
86   std::fill(gx, gx + num_splines, T());
87   std::fill(gy, gy + num_splines, T());
88   std::fill(gz, gz + num_splines, T());
89   std::fill(hxx, hxx + num_splines, T());
90   std::fill(hxy, hxy + num_splines, T());
91   std::fill(hxz, hxz + num_splines, T());
92   std::fill(hyy, hyy + num_splines, T());
93   std::fill(hyz, hyz + num_splines, T());
94   std::fill(hzz, hzz + num_splines, T());
95 
96   std::fill(gh_xxx, gh_xxx + num_splines, T());
97   std::fill(gh_xxy, gh_xxy + num_splines, T());
98   std::fill(gh_xxz, gh_xxz + num_splines, T());
99   std::fill(gh_xyy, gh_xyy + num_splines, T());
100   std::fill(gh_xyz, gh_xyz + num_splines, T());
101   std::fill(gh_xzz, gh_xzz + num_splines, T());
102   std::fill(gh_yyy, gh_yyy + num_splines, T());
103   std::fill(gh_yyz, gh_yyz + num_splines, T());
104   std::fill(gh_yzz, gh_yzz + num_splines, T());
105   std::fill(gh_zzz, gh_zzz + num_splines, T());
106 
107   for (int i = 0; i < 4; i++)
108     for (int j = 0; j < 4; j++)
109     {
110       const T* restrict coefs    = spline_m->coefs + ((ix + i) * xs + (iy + j) * ys + iz * zs) + first;
111       const T* restrict coefszs  = coefs + zs;
112       const T* restrict coefs2zs = coefs + 2 * zs;
113       const T* restrict coefs3zs = coefs + 3 * zs;
114 
115       const T pre20 = d2a[i] * b[j];
116       const T pre10 = da[i] * b[j];
117       const T pre00 = a[i] * b[j];
118       const T pre11 = da[i] * db[j];
119       const T pre01 = a[i] * db[j];
120       const T pre02 = a[i] * d2b[j];
121 
122       const T pre30 = d3a[i] * b[j];
123       const T pre21 = d2a[i] * db[j];
124       const T pre12 = da[i] * d2b[j];
125       const T pre03 = a[i] * d3b[j];
126 
127 
128 #pragma omp simd aligned(coefs, coefszs, coefs2zs, coefs3zs, gx, gy, gz, hxx, hxy, hxz, hyy, hyz, hzz, gh_xxx, gh_xxy, \
129                          gh_xxz, gh_xyy, gh_xyz, gh_xzz, gh_yyy, gh_yyz, gh_yzz, gh_zzz, vals: QMC_SIMD_ALIGNMENT)
130       for (int n = 0; n < num_splines; n++)
131       {
132         T coefsv    = coefs[n];
133         T coefsvzs  = coefszs[n];
134         T coefsv2zs = coefs2zs[n];
135         T coefsv3zs = coefs3zs[n];
136 
137         T sum0 = c[0] * coefsv + c[1] * coefsvzs + c[2] * coefsv2zs + c[3] * coefsv3zs;
138         T sum1 = dc[0] * coefsv + dc[1] * coefsvzs + dc[2] * coefsv2zs + dc[3] * coefsv3zs;
139         T sum2 = d2c[0] * coefsv + d2c[1] * coefsvzs + d2c[2] * coefsv2zs + d2c[3] * coefsv3zs;
140         T sum3 = d3c[0] * coefsv + d3c[1] * coefsvzs + d3c[2] * coefsv2zs + d3c[3] * coefsv3zs;
141 
142         gh_xxx[n] += pre30 * sum0;
143         gh_xxy[n] += pre21 * sum0;
144         gh_xxz[n] += pre20 * sum1;
145         gh_xyy[n] += pre12 * sum0;
146         gh_xyz[n] += pre11 * sum1;
147         gh_xzz[n] += pre10 * sum2;
148         gh_yyy[n] += pre03 * sum0;
149         gh_yyz[n] += pre02 * sum1;
150         gh_yzz[n] += pre01 * sum2;
151         gh_zzz[n] += pre00 * sum3;
152 
153         hxx[n]  += pre20 * sum0;
154         hxy[n]  += pre11 * sum0;
155         hxz[n]  += pre10 * sum1;
156         hyy[n]  += pre02 * sum0;
157         hyz[n]  += pre01 * sum1;
158         hzz[n]  += pre00 * sum2;
159         gx[n]   += pre10 * sum0;
160         gy[n]   += pre01 * sum0;
161         gz[n]   += pre00 * sum1;
162         vals[n] += pre00 * sum0;
163       }
164     }
165 
166   const T dxInv = spline_m->x_grid.delta_inv;
167   const T dyInv = spline_m->y_grid.delta_inv;
168   const T dzInv = spline_m->z_grid.delta_inv;
169   const T dxx   = dxInv * dxInv;
170   const T dyy   = dyInv * dyInv;
171   const T dzz   = dzInv * dzInv;
172   const T dxy   = dxInv * dyInv;
173   const T dxz   = dxInv * dzInv;
174   const T dyz   = dyInv * dzInv;
175 
176   const T dxxx = dxInv * dxInv * dxInv;
177   const T dxxy = dxInv * dxInv * dyInv;
178   const T dxxz = dxInv * dxInv * dzInv;
179   const T dxyy = dxInv * dyInv * dyInv;
180   const T dxyz = dxInv * dyInv * dzInv;
181   const T dxzz = dxInv * dzInv * dzInv;
182   const T dyyy = dyInv * dyInv * dyInv;
183   const T dyyz = dyInv * dyInv * dzInv;
184   const T dyzz = dyInv * dzInv * dzInv;
185   const T dzzz = dzInv * dzInv * dzInv;
186 
187 #pragma omp simd aligned(gx, gy, gz, hxx, hxy, hxz, hyy, hyz, hzz, gh_xxx, gh_xxy, gh_xxz, gh_xyy, gh_xyz, gh_xzz, \
188                          gh_yyy, gh_yyz, gh_yzz, gh_zzz: QMC_SIMD_ALIGNMENT)
189   for (int n = 0; n < num_splines; n++)
190   {
191     gx[n]  *= dxInv;
192     gy[n]  *= dyInv;
193     gz[n]  *= dzInv;
194     hxx[n] *= dxx;
195     hyy[n] *= dyy;
196     hzz[n] *= dzz;
197     hxy[n] *= dxy;
198     hxz[n] *= dxz;
199     hyz[n] *= dyz;
200 
201     gh_xxx[n] *= dxxx;
202     gh_xxy[n] *= dxxy;
203     gh_xxz[n] *= dxxz;
204     gh_xyy[n] *= dxyy;
205     gh_xyz[n] *= dxyz;
206     gh_xzz[n] *= dxzz;
207     gh_yyy[n] *= dyyy;
208     gh_yyz[n] *= dyyz;
209     gh_yzz[n] *= dyzz;
210     gh_zzz[n] *= dzzz;
211   }
212 }
213 
214 } // namespace spline2
215 #endif
216