1 //////////////////////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source License.
3 // See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
6 //
7 // File developed by: Ken Esler, kpesler@gmail.com, University of Illinois at Urbana-Champaign
8 //                    Luke Shulenburger, lshulen@sandia.gov, Sandia National Laboratories
9 //                    Jeremy McMinnis, jmcminis@gmail.com, University of Illinois at Urbana-Champaign
10 //                    Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois at Urbana-Champaign
11 //                    Paul R. C. Kent, kentpr@ornl.gov, Oak Ridge National Laboratory
12 //                    Ye Luo, yeluo@anl.gov, Argonne National Laboratory
13 //
14 // File created by: Ken Esler, kpesler@gmail.com, University of Illinois at Urbana-Champaign
15 //////////////////////////////////////////////////////////////////////////////////////
16 
17 
18 #define MAX_SPLINES 100
19 #include <stdio.h>
20 #include "BsplineJastrowCuda.h"
21 #include "CUDA_legacy/gpu_misc.h"
22 
23 bool AisInitialized = false;
24 
25 __constant__ float AcudaSpline[48];
26 __constant__ double AcudaSpline_double[48];
27 
recipSqrt(float x)28 inline __device__ float recipSqrt(float x) { return rsqrtf(x); }
recipSqrt(double x)29 inline __device__ double recipSqrt(double x) { return rsqrt(x); }
30 
dist(float dx,float dy,float dz)31 inline __device__ float dist(float dx, float dy, float dz) { return sqrtf(dx * dx + dy * dy + dz * dz); }
32 
dist(double dx,double dy,double dz)33 inline __device__ double dist(double dx, double dy, double dz) { return sqrt(dx * dx + dy * dy + dz * dz); }
34 
35 
cuda_spline_init()36 void cuda_spline_init()
37 {
38   // clang-format off
39   float A_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
40                     3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
41                     -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
42                     1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
43                     0.0,     -0.5,      1.0,    -0.5,
44                     0.0,      1.5,     -2.0,     0.0,
45                     0.0,     -1.5,      1.0,     0.5,
46                     0.0,      0.5,      0.0,     0.0,
47                     0.0,      0.0,     -1.0,     1.0,
48                     0.0,      0.0,      3.0,    -2.0,
49                     0.0,      0.0,     -3.0,     1.0,
50                     0.0,      0.0,      1.0,     0.0
51                   };
52   cudaMemcpyToSymbol(AcudaSpline, A_h, 48*sizeof(float), 0,
53                      cudaMemcpyHostToDevice);
54   double A_d[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
55                      3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
56                      -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
57                      1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
58                      0.0,     -0.5,      1.0,    -0.5,
59                      0.0,      1.5,     -2.0,     0.0,
60                      0.0,     -1.5,      1.0,     0.5,
61                      0.0,      0.5,      0.0,     0.0,
62                      0.0,      0.0,     -1.0,     1.0,
63                      0.0,      0.0,      3.0,    -2.0,
64                      0.0,      0.0,     -3.0,     1.0,
65                      0.0,      0.0,      1.0,     0.0
66                    };
67   cudaMemcpyToSymbol(AcudaSpline_double, A_d, 48*sizeof(double), 0,
68                      cudaMemcpyHostToDevice);
69   AisInitialized = true;
70   // clang-format on
71 }
72 
73 
74 template<typename T>
eval_1d_spline(const T dist,const T rmax,const T drInv,const T A[4][4],T coefs[])75 __device__ __forceinline__ T eval_1d_spline(const T dist, const T rmax, const T drInv, const T A[4][4], T coefs[])
76 {
77   if (dist >= rmax)
78     return (T)0.0;
79   T s = dist * drInv;
80   T sf;
81   T t       = modff(s, (float*)&sf);
82   int index = (int)sf;
83   T t2      = t * t;
84   T t3      = t * t2;
85   T coefs0  = coefs[index];
86   T coefs1  = coefs[index + 1];
87   T coefs2  = coefs[index + 2];
88   T coefs3  = coefs[index + 3];
89   T val0    = A[0][0] * t3 + A[0][1] * t2 + A[0][2] * t + A[0][3];
90   T val1    = A[1][0] * t3 + A[1][1] * t2 + A[1][2] * t + A[1][3];
91   T val2    = A[2][0] * t3 + A[2][1] * t2 + A[2][2] * t + A[2][3];
92   T val3    = A[3][0] * t3 + A[3][1] * t2 + A[3][2] * t + A[3][3];
93   return (coefs0 * val0 + coefs1 * val1 + coefs2 * val2 + coefs3 * val3);
94 }
95 
96 
97 template<typename T>
eval_1d_spline_vgl(T dist,T rmax,T drInv,T A[12][4],T coefs[],T & u,T & du,T & d2u)98 __device__ inline void eval_1d_spline_vgl(T dist, T rmax, T drInv, T A[12][4], T coefs[], T& u, T& du, T& d2u)
99 {
100   if (dist >= rmax)
101   {
102     u = du = d2u = (T)0.0;
103     return;
104   }
105   T s       = dist * drInv;
106   T sf      = floorf(s);
107   int index = (int)sf;
108   T t       = s - sf;
109   T t2      = t * t;
110   T t3      = t * t2;
111   u         = (coefs[index + 0] * (A[0][0] * t3 + A[0][1] * t2 + A[0][2] * t + A[0][3]) +
112        coefs[index + 1] * (A[1][0] * t3 + A[1][1] * t2 + A[1][2] * t + A[1][3]) +
113        coefs[index + 2] * (A[2][0] * t3 + A[2][1] * t2 + A[2][2] * t + A[2][3]) +
114        coefs[index + 3] * (A[3][0] * t3 + A[3][1] * t2 + A[3][2] * t + A[3][3]));
115   du        = drInv *
116       (coefs[index + 0] * (A[4][0] * t3 + A[4][1] * t2 + A[4][2] * t + A[4][3]) +
117        coefs[index + 1] * (A[5][0] * t3 + A[5][1] * t2 + A[5][2] * t + A[5][3]) +
118        coefs[index + 2] * (A[6][0] * t3 + A[6][1] * t2 + A[6][2] * t + A[6][3]) +
119        coefs[index + 3] * (A[7][0] * t3 + A[7][1] * t2 + A[7][2] * t + A[7][3]));
120   d2u = drInv * drInv *
121       (coefs[index + 0] * (A[8][0] * t3 + A[8][1] * t2 + A[8][2] * t + A[8][3]) +
122        coefs[index + 1] * (A[9][0] * t3 + A[9][1] * t2 + A[9][2] * t + A[9][3]) +
123        coefs[index + 2] * (A[10][0] * t3 + A[10][1] * t2 + A[10][2] * t + A[10][3]) +
124        coefs[index + 3] * (A[11][0] * t3 + A[11][1] * t2 + A[11][2] * t + A[11][3]));
125 }
126 
127 
128 #define MAX_COEFS 32
129 template<typename T, int BS>
two_body_sum_kernel(T ** R,int e1_first,int e1_last,int e2_first,int e2_last,T * spline_coefs,int numCoefs,T rMax,T * sum)130 __global__ void two_body_sum_kernel(T** R,
131                                     int e1_first,
132                                     int e1_last,
133                                     int e2_first,
134                                     int e2_last,
135                                     T* spline_coefs,
136                                     int numCoefs,
137                                     T rMax,
138                                     T* sum)
139 {
140   T dr    = rMax / (T)(numCoefs - 3);
141   T drInv = 1.0 / dr;
142   __syncthreads();
143   // Safety for rounding error
144   rMax *= 0.999999f;
145   int tid = threadIdx.x;
146   __shared__ T* myR;
147   if (tid == 0)
148     myR = R[blockIdx.x];
149   __shared__ T coefs[MAX_COEFS];
150   if (tid < numCoefs)
151     coefs[tid] = spline_coefs[tid];
152   __shared__ T r1[BS][3], r2[BS][3];
153   __shared__ T A[4][4];
154   if (tid < 16)
155     A[tid >> 2][tid & 3] = AcudaSpline[tid];
156   __syncthreads();
157   int N1  = e1_last - e1_first + 1;
158   int N2  = e2_last - e2_first + 1;
159   int NB1 = N1 / BS + ((N1 % BS) ? 1 : 0);
160   int NB2 = N2 / BS + ((N2 % BS) ? 1 : 0);
161   T mysum = (T)0.0;
162   for (int b1 = 0; b1 < NB1; b1++)
163   {
164     // Load block of positions from global memory
165     for (int i = 0; i < 3; i++)
166       if ((3 * b1 + i) * BS + tid < 3 * N1)
167         r1[0][i * BS + tid] = myR[3 * e1_first + (3 * b1 + i) * BS + tid];
168     __syncthreads();
169     int ptcl1 = e1_first + b1 * BS + tid;
170     for (int b2 = 0; b2 < NB2; b2++)
171     {
172       // Load block of positions from global memory
173       for (int i = 0; i < 3; i++)
174         if ((3 * b2 + i) * BS + tid < 3 * N2)
175           r2[0][i * BS + tid] = myR[3 * e2_first + (3 * b2 + i) * BS + tid];
176       __syncthreads();
177       // Now, loop over particles
178       int end = (b2 + 1) * BS < N2 ? BS : N2 - b2 * BS;
179       for (int j = 0; j < end; j++)
180       {
181         int ptcl2 = e2_first + b2 * BS + j;
182         T dx, dy, dz;
183         dx  = r2[j][0] - r1[tid][0];
184         dy  = r2[j][1] - r1[tid][1];
185         dz  = r2[j][2] - r1[tid][2];
186         T d = dist(dx, dy, dz);
187         if (ptcl1 != ptcl2 && (ptcl1 < (N1 + e1_first)) && (ptcl2 < (N2 + e2_first)))
188           mysum += eval_1d_spline(d, rMax, drInv, A, coefs);
189       }
190       __syncthreads();
191     }
192   }
193   __shared__ T shared_sum[BS];
194   shared_sum[tid] = mysum;
195   __syncthreads();
196   for (int s = BS >> 1; s > 0; s >>= 1)
197   {
198     if (tid < s)
199       shared_sum[tid] += shared_sum[tid + s];
200     __syncthreads();
201   }
202   T factor = (e1_first == e2_first) ? 0.5 : 1.0;
203   if (tid == 0)
204     sum[blockIdx.x] += factor * shared_sum[0];
205 }
206 
two_body_sum(float * R[],int e1_first,int e1_last,int e2_first,int e2_last,float spline_coefs[],int numCoefs,float rMax,float sum[],int numWalkers)207 void two_body_sum(float* R[],
208                   int e1_first,
209                   int e1_last,
210                   int e2_first,
211                   int e2_last,
212                   float spline_coefs[],
213                   int numCoefs,
214                   float rMax,
215                   float sum[],
216                   int numWalkers)
217 {
218   if (!AisInitialized)
219     cuda_spline_init();
220   const int BS = 128;
221   dim3 dimBlock(BS);
222   dim3 dimGrid(numWalkers);
223   two_body_sum_kernel<float, BS>
224       <<<dimGrid, dimBlock>>>(R, e1_first, e1_last, e2_first, e2_last, spline_coefs, numCoefs, rMax, sum);
225 }
226 
227 
two_body_sum(double * R[],int e1_first,int e1_last,int e2_first,int e2_last,double spline_coefs[],int numCoefs,double rMax,double sum[],int numWalkers)228 void two_body_sum(double* R[],
229                   int e1_first,
230                   int e1_last,
231                   int e2_first,
232                   int e2_last,
233                   double spline_coefs[],
234                   int numCoefs,
235                   double rMax,
236                   double sum[],
237                   int numWalkers)
238 {
239   if (!AisInitialized)
240     cuda_spline_init();
241   const int BS = 128;
242   dim3 dimBlock(BS);
243   dim3 dimGrid(numWalkers);
244   two_body_sum_kernel<double, BS>
245       <<<dimGrid, dimBlock>>>(R, e1_first, e1_last, e2_first, e2_last, spline_coefs, numCoefs, rMax, sum);
246 }
247 
248 
249 template<typename T, int BS>
two_body_ratio_kernel(T ** R,int first,int last,T * Rnew,int inew,int offset,T * spline_coefs,int numCoefs,T rMax,T * sum)250 __global__ void two_body_ratio_kernel(T** R,
251                                       int first,
252                                       int last,
253                                       T* Rnew,
254                                       int inew,
255                                       int offset,
256                                       T* spline_coefs,
257                                       int numCoefs,
258                                       T rMax,
259                                       T* sum)
260 {
261   T dr    = rMax / (T)(numCoefs - 3);
262   T drInv = 1.0 / dr;
263   __syncthreads();
264   // Safety for rounding error
265   rMax *= 0.999999f;
266   int tid = threadIdx.x;
267   __shared__ T* myR;
268   __shared__ T myRnew[3], myRold[3];
269   if (tid == 0)
270     myR = R[blockIdx.x];
271   __syncthreads();
272   if (tid < 3)
273   {
274     myRnew[tid] = Rnew[3 * (blockIdx.x + offset) + tid];
275     myRold[tid] = myR[3 * inew + tid];
276   }
277   __syncthreads();
278   __shared__ T coefs[MAX_COEFS];
279   __shared__ T r1[BS][3];
280   if (tid < numCoefs)
281     coefs[tid] = spline_coefs[tid];
282   __shared__ T A[4][4];
283   if (tid < 16)
284     A[(tid >> 2)][tid & 3] = AcudaSpline[tid];
285   __syncthreads();
286   int N  = last - first + 1;
287   int NB = N / BS + ((N % BS) ? 1 : 0);
288   __shared__ T shared_sum[BS];
289   shared_sum[tid] = (T)0.0;
290   for (int b = 0; b < NB; b++)
291   {
292     // Load block of positions from global memory
293     for (int i = 0; i < 3; i++)
294     {
295       int n = i * BS + tid;
296       if ((3 * b + i) * BS + tid < 3 * N)
297         r1[0][n] = myR[3 * first + (3 * b + i) * BS + tid];
298     }
299     __syncthreads();
300     int ptcl1 = first + b * BS + tid;
301     T dx, dy, dz;
302     dx      = myRnew[0] - r1[tid][0];
303     dy      = myRnew[1] - r1[tid][1];
304     dz      = myRnew[2] - r1[tid][2];
305     T d     = dist(dx, dy, dz);
306     T delta = eval_1d_spline(d, rMax, drInv, A, coefs);
307     dx      = myRold[0] - r1[tid][0];
308     dy      = myRold[1] - r1[tid][1];
309     dz      = myRold[2] - r1[tid][2];
310     d       = dist(dx, dy, dz);
311     delta -= eval_1d_spline(d, rMax, drInv, A, coefs);
312     if (ptcl1 != inew && (ptcl1 < (N + first)))
313       shared_sum[tid] += delta;
314     __syncthreads();
315   }
316   __syncthreads();
317   for (int s = (BS >> 1); s > 0; s >>= 1)
318   {
319     if (tid < s)
320       shared_sum[tid] += shared_sum[tid + s];
321     __syncthreads();
322   }
323   if (tid == 0)
324     sum[blockIdx.x] += shared_sum[0];
325 }
326 
327 
two_body_ratio(float * R[],int first,int last,float Rnew[],int inew,int offset,float spline_coefs[],int numCoefs,float rMax,float sum[],int numWalkers)328 void two_body_ratio(float* R[],
329                     int first,
330                     int last,
331                     float Rnew[],
332                     int inew,
333                     int offset,
334                     float spline_coefs[],
335                     int numCoefs,
336                     float rMax,
337                     float sum[],
338                     int numWalkers)
339 {
340   if (!AisInitialized)
341     cuda_spline_init();
342   const int BS = 128;
343   dim3 dimBlock(BS);
344   dim3 dimGrid(numWalkers);
345   two_body_ratio_kernel<float, BS>
346       <<<dimGrid, dimBlock>>>(R, first, last, Rnew, inew, offset, spline_coefs, numCoefs, rMax, sum);
347 }
348 
349 
two_body_ratio(double * R[],int first,int last,double Rnew[],int inew,int offset,double spline_coefs[],int numCoefs,double rMax,double sum[],int numWalkers)350 void two_body_ratio(double* R[],
351                     int first,
352                     int last,
353                     double Rnew[],
354                     int inew,
355                     int offset,
356                     double spline_coefs[],
357                     int numCoefs,
358                     double rMax,
359                     double sum[],
360                     int numWalkers)
361 {
362   if (!AisInitialized)
363     cuda_spline_init();
364   dim3 dimBlock(128);
365   dim3 dimGrid(numWalkers);
366   two_body_ratio_kernel<double, 128>
367       <<<dimGrid, dimBlock>>>(R, first, last, Rnew, inew, offset, spline_coefs, numCoefs, rMax, sum);
368 }
369 
370 
371 template<typename T, int BS>
two_body_ratio_grad_kernel(T ** R,int first,int last,T * Rnew,int inew,int offset,T * spline_coefs,int numCoefs,T rMax,bool zero,T * ratio_grad)372 __global__ void two_body_ratio_grad_kernel(T** R,
373                                            int first,
374                                            int last,
375                                            T* Rnew,
376                                            int inew,
377                                            int offset,
378                                            T* spline_coefs,
379                                            int numCoefs,
380                                            T rMax,
381                                            bool zero,
382                                            T* ratio_grad)
383 {
384   int tid = threadIdx.x;
385   T dr    = rMax / (T)(numCoefs - 3);
386   T drInv = 1.0 / dr;
387   __syncthreads();
388   // Safety for rounding error
389   rMax *= 0.999999f;
390   __shared__ T* myR;
391   __shared__ T myRnew[3], myRold[3];
392   if (tid == 0)
393     myR = R[blockIdx.x];
394   __syncthreads();
395   if (tid < 3)
396   {
397     myRnew[tid] = Rnew[3 * (blockIdx.x + offset) + tid];
398     myRold[tid] = myR[3 * inew + tid];
399   }
400   __syncthreads();
401   __shared__ T coefs[MAX_COEFS];
402   __shared__ T r1[BS][3];
403   if (tid < numCoefs)
404     coefs[tid] = spline_coefs[tid];
405   __syncthreads();
406   __shared__ T A[12][4];
407   if (tid < 16)
408   {
409     A[0 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 0];
410     A[4 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 16];
411     A[8 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 32];
412   }
413   __syncthreads();
414   int N  = last - first + 1;
415   int NB = N / BS + ((N % BS) ? 1 : 0);
416   __shared__ T shared_sum[BS];
417   __shared__ T shared_grad[BS][3];
418   shared_sum[tid]     = (T)0.0;
419   shared_grad[tid][0] = shared_grad[tid][1] = shared_grad[tid][2] = 0.0f;
420   for (int b = 0; b < NB; b++)
421   {
422     // Load block of positions from global memory
423     for (int i = 0; i < 3; i++)
424     {
425       int n = i * BS + tid;
426       if ((3 * b + i) * BS + tid < 3 * N)
427         r1[0][n] = myR[3 * first + (3 * b + i) * BS + tid];
428     }
429     __syncthreads();
430     int ptcl1 = first + b * BS + tid;
431     T dx, dy, dz, u, du, d2u, delta, d;
432     dx    = myRold[0] - r1[tid][0];
433     dy    = myRold[1] - r1[tid][1];
434     dz    = myRold[2] - r1[tid][2];
435     d     = dist(dx, dy, dz);
436     delta = -eval_1d_spline(d, rMax, drInv, A, coefs);
437     dx    = myRnew[0] - r1[tid][0];
438     dy    = myRnew[1] - r1[tid][1];
439     dz    = myRnew[2] - r1[tid][2];
440     d     = dist(dx, dy, dz);
441     eval_1d_spline_vgl(d, rMax, drInv, A, coefs, u, du, d2u);
442     delta += u;
443     if (ptcl1 != inew && (ptcl1 < (N + first)))
444     {
445       du /= d;
446       shared_sum[tid]     += delta;
447       shared_grad[tid][0] += du * dx;
448       shared_grad[tid][1] += du * dy;
449       shared_grad[tid][2] += du * dz;
450     }
451     __syncthreads();
452   }
453   __syncthreads();
454   for (int s = (BS >> 1); s > 0; s >>= 1)
455   {
456     if (tid < s)
457     {
458       shared_sum[tid]     += shared_sum[tid + s];
459       shared_grad[tid][0] += shared_grad[tid + s][0];
460       shared_grad[tid][1] += shared_grad[tid + s][1];
461       shared_grad[tid][2] += shared_grad[tid + s][2];
462     }
463     __syncthreads();
464   }
465   if (tid == 0)
466   {
467     if (zero)
468     {
469       ratio_grad[4 * blockIdx.x + 0] = shared_sum[0];
470       ratio_grad[4 * blockIdx.x + 1] = shared_grad[0][0];
471       ratio_grad[4 * blockIdx.x + 2] = shared_grad[0][1];
472       ratio_grad[4 * blockIdx.x + 3] = shared_grad[0][2];
473     }
474     else
475     {
476       ratio_grad[4 * blockIdx.x + 0] += shared_sum[0];
477       ratio_grad[4 * blockIdx.x + 1] += shared_grad[0][0];
478       ratio_grad[4 * blockIdx.x + 2] += shared_grad[0][1];
479       ratio_grad[4 * blockIdx.x + 3] += shared_grad[0][2];
480     }
481   }
482 }
483 
484 
two_body_ratio_grad(float * R[],int first,int last,float Rnew[],int inew,int offset,float spline_coefs[],int numCoefs,float rMax,bool zero,float ratio_grad[],int numWalkers)485 void two_body_ratio_grad(float* R[],
486                          int first,
487                          int last,
488                          float Rnew[],
489                          int inew,
490                          int offset,
491                          float spline_coefs[],
492                          int numCoefs,
493                          float rMax,
494                          bool zero,
495                          float ratio_grad[],
496                          int numWalkers)
497 {
498   if (!AisInitialized)
499     cuda_spline_init();
500   const int BS = 32;
501   dim3 dimBlock(BS);
502   dim3 dimGrid(numWalkers);
503   two_body_ratio_grad_kernel<float, BS>
504       <<<dimGrid, dimBlock>>>(R, first, last, Rnew, inew, offset, spline_coefs, numCoefs, rMax, zero, ratio_grad);
505 }
506 
507 
two_body_ratio_grad(double * R[],int first,int last,double Rnew[],int inew,int offset,double spline_coefs[],int numCoefs,double rMax,bool zero,double ratio_grad[],int numWalkers)508 void two_body_ratio_grad(double* R[],
509                          int first,
510                          int last,
511                          double Rnew[],
512                          int inew,
513                          int offset,
514                          double spline_coefs[],
515                          int numCoefs,
516                          double rMax,
517                          bool zero,
518                          double ratio_grad[],
519                          int numWalkers)
520 {
521   if (!AisInitialized)
522     cuda_spline_init();
523   const int BS = 32;
524   dim3 dimBlock(BS);
525   dim3 dimGrid(numWalkers);
526   two_body_ratio_grad_kernel<double, BS>
527       <<<dimGrid, dimBlock>>>(R, first, last, Rnew, inew, offset, spline_coefs, numCoefs, rMax, zero, ratio_grad);
528 }
529 
530 
531 template<int BS>
two_body_NLratio_kernel(NLjobGPU<float> * jobs,int first,int last,float ** spline_coefs,int * numCoefs,float * rMaxList)532 __global__ void two_body_NLratio_kernel(NLjobGPU<float>* jobs,
533                                         int first,
534                                         int last,
535                                         float** spline_coefs,
536                                         int* numCoefs,
537                                         float* rMaxList)
538 {
539   const int MAX_RATIOS = 18;
540   int tid              = threadIdx.x;
541   __shared__ NLjobGPU<float> myJob;
542   __shared__ float myRnew[MAX_RATIOS][3], myRold[3];
543   __shared__ float* myCoefs;
544   __shared__ int myNumCoefs;
545   __shared__ float rMax;
546   if (tid == 0)
547   {
548     myJob      = jobs[blockIdx.x];
549     myCoefs    = spline_coefs[blockIdx.x];
550     myNumCoefs = numCoefs[blockIdx.x];
551     rMax       = rMaxList[blockIdx.x];
552   }
553   __syncthreads();
554   if (tid < 3)
555     myRold[tid] = myJob.R[3 * myJob.Elec + tid];
556   for (int i = 0; i < 3; i++)
557     if (i * BS + tid < 3 * myJob.NumQuadPoints)
558       myRnew[0][i * BS + tid] = myJob.QuadPoints[i * BS + tid];
559   float dr    = rMax / (float)(myNumCoefs - 3);
560   float drInv = 1.0 / dr;
561   __shared__ float coefs[MAX_COEFS];
562   //  __shared__ float r1[BS][3];
563   if (tid < myNumCoefs)
564     coefs[tid] = myCoefs[tid];
565   __shared__ float A[4][4];
566   if (tid < 16)
567     A[(tid >> 2)][tid & 3] = AcudaSpline[tid];
568   __syncthreads();
569   int N  = last - first + 1;
570   int NB = N / BS + ((N % BS) ? 1 : 0);
571   __shared__ float shared_sum[MAX_RATIOS][BS + 1];
572   for (int iq = 0; iq < myJob.NumQuadPoints; iq++)
573     shared_sum[iq][tid] = (float)0.0;
574   for (int b = 0; b < NB * BS; b += BS)
575   {
576     float3 r1_r = ((float3*)(myJob.R))[first + b + tid];
577     int ptcl1   = first + b + tid;
578     if (ptcl1 != myJob.Elec && (ptcl1 < (N + first)))
579     {
580       float dx, dy, dz;
581       dx         = myRold[0] - r1_r.x;
582       dy         = myRold[1] - r1_r.y;
583       dz         = myRold[2] - r1_r.z;
584       float d    = dist(dx, dy, dz);
585       float uOld = eval_1d_spline(d, rMax, drInv, A, coefs);
586       for (int iq = 0; iq < myJob.NumQuadPoints; iq++)
587       {
588         dx = myRnew[iq][0] - r1_r.x;
589         dy = myRnew[iq][1] - r1_r.y;
590         dz = myRnew[iq][2] - r1_r.z;
591         d  = dist(dx, dy, dz);
592         shared_sum[iq][tid] += eval_1d_spline(d, rMax, drInv, A, coefs) - uOld;
593       }
594     }
595   }
596   __syncthreads();
597   for (int s = (BS >> 1); s > 0; s >>= 1)
598   {
599     if (tid < s)
600       for (int iq = 0; iq < myJob.NumQuadPoints; iq++)
601         shared_sum[iq][tid] += shared_sum[iq][tid + s];
602     __syncthreads();
603   }
604   if (tid < myJob.NumQuadPoints)
605     myJob.Ratios[tid] *= expf(-shared_sum[tid][0]); // note this is single-precision!
606 }
607 
608 
609 template<int BS>
two_body_NLratio_kernel(NLjobGPU<double> * jobs,int first,int last,double ** spline_coefs,int * numCoefs,double * rMaxList)610 __global__ void two_body_NLratio_kernel(NLjobGPU<double>* jobs,
611                                         int first,
612                                         int last,
613                                         double** spline_coefs,
614                                         int* numCoefs,
615                                         double* rMaxList)
616 {
617   const int MAX_RATIOS = 18;
618   int tid              = threadIdx.x;
619   __shared__ NLjobGPU<double> myJob;
620   __shared__ double myRnew[MAX_RATIOS][3], myRold[3];
621   __shared__ double* myCoefs;
622   __shared__ int myNumCoefs;
623   __shared__ double rMax;
624   if (tid == 0)
625   {
626     myJob      = jobs[blockIdx.x];
627     myCoefs    = spline_coefs[blockIdx.x];
628     myNumCoefs = numCoefs[blockIdx.x];
629     rMax       = rMaxList[blockIdx.x];
630   }
631   __syncthreads();
632   if (tid < 3)
633     myRold[tid] = myJob.R[3 * myJob.Elec + tid];
634   for (int i = 0; i < 3; i++)
635     if (i * BS + tid < 3 * myJob.NumQuadPoints)
636       myRnew[0][i * BS + tid] = myJob.QuadPoints[i * BS + tid];
637   __syncthreads();
638   double dr    = rMax / (double)(myNumCoefs - 3);
639   double drInv = 1.0 / dr;
640   __shared__ double coefs[MAX_COEFS];
641   __shared__ double r1[BS][3];
642   if (tid < myNumCoefs)
643     coefs[tid] = myCoefs[tid];
644   __syncthreads();
645   __shared__ double A[4][4];
646   if (tid < 16)
647     A[(tid >> 2)][tid & 3] = AcudaSpline[tid];
648   __syncthreads();
649   int N  = last - first + 1;
650   int NB = N / BS + ((N % BS) ? 1 : 0);
651   __shared__ double shared_sum[MAX_RATIOS][BS + 1];
652   for (int iq = 0; iq < myJob.NumQuadPoints; iq++)
653     shared_sum[iq][tid] = (double)0.0;
654   for (int b = 0; b < NB; b++)
655   {
656     // Load block of positions from global memory
657     for (int i = 0; i < 3; i++)
658     {
659       int n = i * BS + tid;
660       if ((3 * b + i) * BS + tid < 3 * N)
661         r1[0][n] = myJob.R[3 * first + (3 * b + i) * BS + tid];
662     }
663     __syncthreads();
664     int ptcl1 = first + b * BS + tid;
665     double dx, dy, dz;
666     dx          = myRold[0] - r1[tid][0];
667     dy          = myRold[1] - r1[tid][1];
668     dz          = myRold[2] - r1[tid][2];
669     double d    = dist(dx, dy, dz);
670     double uOld = eval_1d_spline(d, rMax, drInv, A, coefs);
671     for (int iq = 0; iq < myJob.NumQuadPoints; iq++)
672     {
673       dx = myRnew[iq][0] - r1[tid][0];
674       dy = myRnew[iq][1] - r1[tid][1];
675       dz = myRnew[iq][2] - r1[tid][2];
676       d  = dist(dx, dy, dz);
677       if (ptcl1 != myJob.Elec && (ptcl1 < (N + first)))
678         shared_sum[iq][tid] += eval_1d_spline(d, rMax, drInv, A, coefs) - uOld;
679     }
680     __syncthreads();
681   }
682   for (int s = (BS >> 1); s > 0; s >>= 1)
683   {
684     if (tid < s)
685       for (int iq = 0; iq < myJob.NumQuadPoints; iq++)
686         shared_sum[iq][tid] += shared_sum[iq][tid + s];
687     __syncthreads();
688   }
689   if (tid < myJob.NumQuadPoints)
690     myJob.Ratios[tid] *= exp(-shared_sum[tid][0]);
691 }
692 
693 
two_body_NLratios(NLjobGPU<float> jobs[],int first,int last,float * spline_coefs[],int numCoefs[],float rMax[],int numjobs)694 void two_body_NLratios(NLjobGPU<float> jobs[],
695                        int first,
696                        int last,
697                        float* spline_coefs[],
698                        int numCoefs[],
699                        float rMax[],
700                        int numjobs)
701 {
702   if (!AisInitialized)
703     cuda_spline_init();
704   const int BS = 32;
705   dim3 dimBlock(BS);
706   while (numjobs > 65535)
707   {
708     dim3 dimGrid(65535);
709     two_body_NLratio_kernel<BS><<<dimGrid, dimBlock>>>(jobs, first, last, spline_coefs, numCoefs, rMax);
710     jobs += 65535;
711     numjobs -= 65535;
712   }
713   dim3 dimGrid(numjobs);
714   two_body_NLratio_kernel<BS><<<dimGrid, dimBlock>>>(jobs, first, last, spline_coefs, numCoefs, rMax);
715 }
716 
717 
two_body_NLratios(NLjobGPU<double> jobs[],int first,int last,double * spline_coefs[],int numCoefs[],double rMax[],int numjobs)718 void two_body_NLratios(NLjobGPU<double> jobs[],
719                        int first,
720                        int last,
721                        double* spline_coefs[],
722                        int numCoefs[],
723                        double rMax[],
724                        int numjobs)
725 {
726   if (!AisInitialized)
727     cuda_spline_init();
728   const int BS = 32;
729   dim3 dimBlock(BS);
730   while (numjobs > 65535)
731   {
732     dim3 dimGrid(65535);
733     two_body_NLratio_kernel<BS><<<dimGrid, dimBlock>>>(jobs, first, last, spline_coefs, numCoefs, rMax);
734     jobs += 65535;
735     numjobs -= 65535;
736   }
737   dim3 dimGrid(numjobs);
738   two_body_NLratio_kernel<BS><<<dimGrid, dimBlock>>>(jobs, first, last, spline_coefs, numCoefs, rMax);
739 }
740 
741 
742 #define MAX_COEFS 32
743 
744 template<typename T, int BS>
two_body_grad_lapl_kernel(T ** R,int e1_first,int e1_last,int e2_first,int e2_last,T * spline_coefs,int numCoefs,T rMax,T * gradLapl,int row_stride)745 __global__ void two_body_grad_lapl_kernel(T** R,
746                                           int e1_first,
747                                           int e1_last,
748                                           int e2_first,
749                                           int e2_last,
750                                           T* spline_coefs,
751                                           int numCoefs,
752                                           T rMax,
753                                           T* gradLapl,
754                                           int row_stride)
755 {
756   T dr    = rMax / (T)(numCoefs - 3);
757   T drInv = 1.0 / dr;
758   __syncthreads();
759   // Safety for rounding error
760   rMax *= 0.999999f;
761   int tid = threadIdx.x;
762   __shared__ T* myR;
763   if (tid == 0)
764     myR = R[blockIdx.x];
765   __shared__ T coefs[MAX_COEFS];
766   if (tid < numCoefs)
767     coefs[tid] = spline_coefs[tid];
768   __shared__ T r1[BS][3], r2[BS][3];
769   __shared__ T A[12][4];
770   if (tid < 16)
771   {
772     A[0 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 0];
773     A[4 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 16];
774     A[8 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 32];
775   }
776   __syncthreads();
777   int N1  = e1_last - e1_first + 1;
778   int N2  = e2_last - e2_first + 1;
779   int NB1 = N1 / BS + ((N1 % BS) ? 1 : 0);
780   int NB2 = N2 / BS + ((N2 % BS) ? 1 : 0);
781   __shared__ T sGradLapl[BS][4];
782   for (int b1 = 0; b1 < NB1; b1++)
783   {
784     // Load block of positions from global memory
785     for (int i = 0; i < 3; i++)
786       if ((3 * b1 + i) * BS + tid < 3 * N1)
787         r1[0][i * BS + tid] = myR[3 * e1_first + (3 * b1 + i) * BS + tid];
788     __syncthreads();
789     int ptcl1         = e1_first + b1 * BS + tid;
790     int offset        = blockIdx.x * row_stride + 4 * b1 * BS + 4 * e1_first;
791     sGradLapl[tid][0] = sGradLapl[tid][1] = sGradLapl[tid][2] = sGradLapl[tid][3] = (T)0.0;
792     for (int b2 = 0; b2 < NB2; b2++)
793     {
794       // Load block of positions from global memory
795       for (int i = 0; i < 3; i++)
796         if ((3 * b2 + i) * BS + tid < 3 * N2)
797           r2[0][i * BS + tid] = myR[3 * e2_first + (3 * b2 + i) * BS + tid];
798       __syncthreads();
799       // Now, loop over particles
800       int end = (b2 + 1) * BS < N2 ? BS : N2 - b2 * BS;
801       for (int j = 0; j < end; j++)
802       {
803         int ptcl2 = e2_first + b2 * BS + j;
804         T dx, dy, dz, u, du, d2u;
805         dx  = r2[j][0] - r1[tid][0];
806         dy  = r2[j][1] - r1[tid][1];
807         dz  = r2[j][2] - r1[tid][2];
808         T d = dist(dx, dy, dz);
809         eval_1d_spline_vgl(d, rMax, drInv, A, coefs, u, du, d2u);
810         if (ptcl1 != ptcl2 && (ptcl1 < (N1 + e1_first)) && (ptcl2 < (N2 + e2_first)))
811         {
812           du /= d;
813           sGradLapl[tid][0] += du * dx;
814           sGradLapl[tid][1] += du * dy;
815           sGradLapl[tid][2] += du * dz;
816           sGradLapl[tid][3] -= d2u + 2.0 * du;
817         }
818       }
819       __syncthreads();
820     }
821     for (int i = 0; i < 4; i++)
822       if ((4 * b1 + i) * BS + tid < 4 * N1)
823         gradLapl[offset + i * BS + tid] += sGradLapl[0][i * BS + tid];
824     __syncthreads();
825   }
826 }
827 
828 
two_body_grad_lapl(float * R[],int e1_first,int e1_last,int e2_first,int e2_last,float spline_coefs[],int numCoefs,float rMax,float gradLapl[],int row_stride,int numWalkers)829 void two_body_grad_lapl(float* R[],
830                         int e1_first,
831                         int e1_last,
832                         int e2_first,
833                         int e2_last,
834                         float spline_coefs[],
835                         int numCoefs,
836                         float rMax,
837                         float gradLapl[],
838                         int row_stride,
839                         int numWalkers)
840 {
841   const int BS = 32;
842   dim3 dimBlock(BS);
843   dim3 dimGrid(numWalkers);
844   two_body_grad_lapl_kernel<float, BS><<<dimGrid, dimBlock>>>(R,
845                                                               e1_first,
846                                                               e1_last,
847                                                               e2_first,
848                                                               e2_last,
849                                                               spline_coefs,
850                                                               numCoefs,
851                                                               rMax,
852                                                               gradLapl,
853                                                               row_stride);
854 }
855 
856 
two_body_grad_lapl(double * R[],int e1_first,int e1_last,int e2_first,int e2_last,double spline_coefs[],int numCoefs,double rMax,double gradLapl[],int row_stride,int numWalkers)857 void two_body_grad_lapl(double* R[],
858                         int e1_first,
859                         int e1_last,
860                         int e2_first,
861                         int e2_last,
862                         double spline_coefs[],
863                         int numCoefs,
864                         double rMax,
865                         double gradLapl[],
866                         int row_stride,
867                         int numWalkers)
868 {
869   if (!AisInitialized)
870     cuda_spline_init();
871   const int BS = 32;
872   dim3 dimBlock(BS);
873   dim3 dimGrid(numWalkers);
874   two_body_grad_lapl_kernel<double, BS><<<dimGrid, dimBlock>>>(R,
875                                                                e1_first,
876                                                                e1_last,
877                                                                e2_first,
878                                                                e2_last,
879                                                                spline_coefs,
880                                                                numCoefs,
881                                                                rMax,
882                                                                gradLapl,
883                                                                row_stride);
884 }
885 
886 
887 template<typename T, int BS>
two_body_grad_kernel(T ** R,int first,int last,int iat,T * spline_coefs,int numCoefs,T rMax,bool zeroOut,T * grad)888 __global__ void two_body_grad_kernel(T** R,
889                                      int first,
890                                      int last,
891                                      int iat,
892                                      T* spline_coefs,
893                                      int numCoefs,
894                                      T rMax,
895                                      bool zeroOut,
896                                      T* grad)
897 {
898   T dr    = rMax / (T)(numCoefs - 3);
899   T drInv = 1.0 / dr;
900   __syncthreads();
901   // Safety for rounding error
902   rMax *= 0.999999f;
903   int tid = threadIdx.x;
904   __shared__ T *myR, r2[3];
905   if (tid == 0)
906     myR = R[blockIdx.x];
907   __syncthreads();
908   if (tid < 3)
909     r2[tid] = myR[3 * iat + tid];
910   __shared__ T coefs[MAX_COEFS];
911   if (tid < numCoefs)
912     coefs[tid] = spline_coefs[tid];
913   __shared__ T r1[BS][3];
914   __shared__ T A[12][4];
915   if (tid < 16)
916   {
917     A[0 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 0];
918     A[4 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 16];
919     A[8 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 32];
920   }
921   __syncthreads();
922   int N  = last - first + 1;
923   int NB = N / BS + ((N % BS) ? 1 : 0);
924   __shared__ T sGrad[BS][3];
925   sGrad[tid][0] = sGrad[tid][1] = sGrad[tid][2] = (T)0.0;
926   for (int b = 0; b < NB; b++)
927   {
928     // Load block of positions from global memory
929     for (int i = 0; i < 3; i++)
930       if ((3 * b + i) * BS + tid < 3 * N)
931         r1[0][i * BS + tid] = myR[3 * first + (3 * b + i) * BS + tid];
932     __syncthreads();
933     int ptcl1 = first + b * BS + tid;
934     T dx, dy, dz, u, du, d2u;
935     dx  = r2[0] - r1[tid][0];
936     dy  = r2[1] - r1[tid][1];
937     dz  = r2[2] - r1[tid][2];
938     T d = dist(dx, dy, dz);
939     eval_1d_spline_vgl(d, rMax, drInv, A, coefs, u, du, d2u);
940     if (ptcl1 != iat && ptcl1 < (N + first))
941     {
942       du /= d;
943       sGrad[tid][0] += du * dx;
944       sGrad[tid][1] += du * dy;
945       sGrad[tid][2] += du * dz;
946     }
947     __syncthreads();
948   }
949   // Do reduction across threads in block
950   for (int s = BS >> 1; s > 0; s >>= 1)
951   {
952     if (tid < s)
953     {
954       sGrad[tid][0] += sGrad[tid + s][0];
955       sGrad[tid][1] += sGrad[tid + s][1];
956       sGrad[tid][2] += sGrad[tid + s][2];
957     }
958     __syncthreads();
959   }
960   if (tid < 3)
961   {
962     if (zeroOut)
963       grad[3 * blockIdx.x + tid] = sGrad[0][tid];
964     else
965       grad[3 * blockIdx.x + tid] += sGrad[0][tid];
966   }
967 }
968 
969 
two_body_gradient(float * R[],int first,int last,int iat,float spline_coefs[],int numCoefs,float rMax,bool zeroOut,float grad[],int numWalkers)970 void two_body_gradient(float* R[],
971                        int first,
972                        int last,
973                        int iat,
974                        float spline_coefs[],
975                        int numCoefs,
976                        float rMax,
977                        bool zeroOut,
978                        float grad[],
979                        int numWalkers)
980 {
981   const int BS = 32;
982   dim3 dimBlock(BS);
983   dim3 dimGrid(numWalkers);
984   two_body_grad_kernel<float, BS>
985       <<<dimGrid, dimBlock, 0, gpu::kernelStream>>>(R, first, last, iat, spline_coefs, numCoefs, rMax, zeroOut, grad);
986 }
987 
988 
two_body_gradient(double * R[],int first,int last,int iat,double spline_coefs[],int numCoefs,double rMax,bool zeroOut,double grad[],int numWalkers)989 void two_body_gradient(double* R[],
990                        int first,
991                        int last,
992                        int iat,
993                        double spline_coefs[],
994                        int numCoefs,
995                        double rMax,
996                        bool zeroOut,
997                        double grad[],
998                        int numWalkers)
999 {
1000   if (!AisInitialized)
1001     cuda_spline_init();
1002   const int BS = 32;
1003   dim3 dimBlock(BS);
1004   dim3 dimGrid(numWalkers);
1005   two_body_grad_kernel<double, BS>
1006       <<<dimGrid, dimBlock, 0, gpu::kernelStream>>>(R, first, last, iat, spline_coefs, numCoefs, rMax, zeroOut, grad);
1007 }
1008 
1009 
1010 template<typename T, int BS, unsigned COMPLEX>
two_body_derivs_kernel(T ** R,T ** gradLogPsi,int e1_first,int e1_last,int e2_first,int e2_last,int numCoefs,T rMax,T ** derivs)1011 __global__ void two_body_derivs_kernel(T** R,
1012                                        T** gradLogPsi,
1013                                        int e1_first,
1014                                        int e1_last,
1015                                        int e2_first,
1016                                        int e2_last,
1017                                        int numCoefs,
1018                                        T rMax,
1019                                        T** derivs)
1020 {
1021   T dr    = rMax / (T)(numCoefs - 3);
1022   T drInv = 1.0f / dr;
1023   __syncthreads();
1024   // Safety for rounding error
1025   rMax *= 0.999999f;
1026   int tid = threadIdx.x;
1027   __shared__ T *myR, *myGrad, *myDerivs;
1028   if (tid == 0)
1029   {
1030     myR      = R[blockIdx.x];
1031     myGrad   = gradLogPsi[blockIdx.x];
1032     myDerivs = derivs[blockIdx.x];
1033   }
1034   __shared__ T sderivs[MAX_COEFS][2];
1035   // __shared__ T coefs[MAX_COEFS];
1036   // if (tid < numCoefs)
1037   //   coefs[tid] = spline_coefs[tid];
1038   __shared__ T r1[BS][3], r2[BS][3];
1039   __shared__ T A[12][4];
1040   if (tid < 16)
1041   {
1042     A[0 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 0];
1043     A[4 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 16];
1044     A[8 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 32];
1045   }
1046   __syncthreads();
1047   sderivs[tid][0] = T();
1048   sderivs[tid][1] = T();
1049   int N1          = e1_last - e1_first + 1;
1050   int N2          = e2_last - e2_first + 1;
1051   int NB1         = N1 / BS + ((N1 % BS) ? 1 : 0);
1052   int NB2         = N2 / BS + ((N2 % BS) ? 1 : 0);
1053   __shared__ T sGrad[BS][3];
1054   for (int b1 = 0; b1 < NB1; b1++)
1055   {
1056     // Load block of positions from global memory
1057     for (int i = 0; i < 3; i++)
1058       if ((3 * b1 + i) * BS + tid < 3 * N1)
1059       {
1060         int outoff       = i * BS + tid;
1061         int inoff        = outoff + 3 * e1_first + 3 * b1 * BS;
1062         r1[0][outoff]    = myR[inoff]; //[3*e1_first + (3*b1+i)*BS + tid];
1063         sGrad[0][outoff] = myGrad[inoff * COMPLEX];
1064       }
1065     __syncthreads();
1066     int ptcl1 = e1_first + b1 * BS + tid;
1067     for (int b2 = 0; b2 < NB2; b2++)
1068     {
1069       // Load block of positions from global memory
1070       for (int i = 0; i < 3; i++)
1071         if ((3 * b2 + i) * BS + tid < 3 * N2)
1072           r2[0][i * BS + tid] = myR[3 * e2_first + (3 * b2 + i) * BS + tid];
1073       __syncthreads();
1074       // Now, loop over particles
1075       int end = (b2 + 1) * BS < N2 ? BS : N2 - b2 * BS;
1076       for (int j = 0; j < end; j++)
1077       {
1078         int ptcl2 = e2_first + b2 * BS + j;
1079         T dx, dy, dz;
1080         dx        = r2[j][0] - r1[tid][0];
1081         dy        = r2[j][1] - r1[tid][1];
1082         dz        = r2[j][2] - r1[tid][2];
1083         T d       = dist(dx, dy, dz);
1084         T dInv    = 1.0f / d;
1085         T s       = d * drInv;
1086         T sf      = floorf(s);
1087         int index = (int)sf;
1088         T t       = s - sf;
1089         T t2      = t * t;
1090         T t3      = t * t2;
1091         T v0, v1, v2, v3;
1092         // sderivs[index+0][0] += (A[0][0]*t3 + A[0][1]*t2 + A[0][2]*t + A[0][3]);
1093         // sderivs[index+1][0] += (A[1][0]*t3 + A[1][1]*t2 + A[1][2]*t + A[1][3]);
1094         // sderivs[index+2][0] += (A[2][0]*t3 + A[2][1]*t2 + A[2][2]*t + A[2][3]);
1095         // sderivs[index+3][0] += (A[3][0]*t3 + A[3][1]*t2 + A[3][2]*t + A[3][3]);
1096         v0 = (A[0][0] * t3 + A[0][1] * t2 + A[0][2] * t + A[0][3]);
1097         v1 = (A[1][0] * t3 + A[1][1] * t2 + A[1][2] * t + A[1][3]);
1098         v2 = (A[2][0] * t3 + A[2][1] * t2 + A[2][2] * t + A[2][3]);
1099         v3 = (A[3][0] * t3 + A[3][1] * t2 + A[3][2] * t + A[3][3]);
1100         for (int id = 0; id < BS; id++)
1101           if (tid == id && ptcl1 != ptcl2 && ptcl1 <= e1_last && (d < rMax))
1102           {
1103             sderivs[index + 0][0] += v0;
1104             sderivs[index + 1][0] += v1;
1105             sderivs[index + 2][0] += v2;
1106             sderivs[index + 3][0] += v3;
1107           }
1108         T prefact = (dx * sGrad[tid][0] + dy * sGrad[tid][1] + dz * sGrad[tid][2]) * dInv;
1109         T du0     = drInv * (A[4][0] * t3 + A[4][1] * t2 + A[4][2] * t + A[4][3]);
1110         T du1     = drInv * (A[5][0] * t3 + A[5][1] * t2 + A[5][2] * t + A[5][3]);
1111         T du2     = drInv * (A[6][0] * t3 + A[6][1] * t2 + A[6][2] * t + A[6][3]);
1112         T du3     = drInv * (A[7][0] * t3 + A[7][1] * t2 + A[7][2] * t + A[7][3]);
1113         // This is the dot (gradu, grad_log_psi) term.
1114         v0 = 2.0f * prefact * du0;
1115         v1 = 2.0f * prefact * du1;
1116         v2 = 2.0f * prefact * du2;
1117         v3 = 2.0f * prefact * du3;
1118         // This is the lapl u term
1119         v0 -= drInv * drInv * (A[8][0] * t3 + A[8][1] * t2 + A[8][2] * t + A[8][3]) + 2.0f * du0 * dInv;
1120         v1 -= drInv * drInv * (A[9][0] * t3 + A[9][1] * t2 + A[9][2] * t + A[9][3]) + 2.0f * du1 * dInv;
1121         v2 -= drInv * drInv * (A[10][0] * t3 + A[10][1] * t2 + A[10][2] * t + A[10][3]) + 2.0f * du2 * dInv;
1122         v3 -= drInv * drInv * (A[11][0] * t3 + A[11][1] * t2 + A[11][2] * t + A[11][3]) + 2.0f * du3 * dInv;
1123         for (int id = 0; id < BS; id++)
1124           if (tid == id && ptcl1 != ptcl2 && ptcl1 <= e1_last && (d < rMax))
1125           {
1126             sderivs[index + 0][1] += v0;
1127             sderivs[index + 1][1] += v1;
1128             sderivs[index + 2][1] += v2;
1129             sderivs[index + 3][1] += v3;
1130           }
1131       }
1132       __syncthreads();
1133     }
1134   }
1135   //  if (e1_first == e2_first)
1136   sderivs[tid][0] *= 0.5f;
1137   sderivs[tid][1] *= 0.5f;
1138   if (tid < 2 * numCoefs)
1139     myDerivs[tid] = -sderivs[0][tid];
1140   if (tid + BS < 2 * numCoefs)
1141     myDerivs[tid + BS] = sderivs[0][tid + BS];
1142 }
1143 
1144 
two_body_derivs(float * R[],float * gradLogPsi[],int e1_first,int e1_last,int e2_first,int e2_last,int numCoefs,float rMax,float * derivs[],int numWalkers)1145 void two_body_derivs(float* R[],
1146                      float* gradLogPsi[],
1147                      int e1_first,
1148                      int e1_last,
1149                      int e2_first,
1150                      int e2_last,
1151                      int numCoefs,
1152                      float rMax,
1153                      float* derivs[],
1154                      int numWalkers)
1155 {
1156   const int BS = 32;
1157   dim3 dimBlock(BS);
1158   dim3 dimGrid(numWalkers);
1159   two_body_derivs_kernel<float, BS, 1>
1160       <<<dimGrid, dimBlock>>>(R, gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs, rMax, derivs);
1161 }
1162 
two_body_derivs(double * R[],double * gradLogPsi[],int e1_first,int e1_last,int e2_first,int e2_last,int numCoefs,double rMax,double * derivs[],int numWalkers)1163 void two_body_derivs(double* R[],
1164                      double* gradLogPsi[],
1165                      int e1_first,
1166                      int e1_last,
1167                      int e2_first,
1168                      int e2_last,
1169                      int numCoefs,
1170                      double rMax,
1171                      double* derivs[],
1172                      int numWalkers)
1173 {
1174   const int BS = 32;
1175   dim3 dimBlock(BS);
1176   dim3 dimGrid(numWalkers);
1177   two_body_derivs_kernel<double, BS, 1>
1178       <<<dimGrid, dimBlock>>>(R, gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs, rMax, derivs);
1179 }
1180 
1181 
1182 // Ye: use offset to recycle the old routines
1183 // block size can be further optimized.
1184 #ifdef QMC_COMPLEX
two_body_derivs(float * R[],std::complex<float> * gradLogPsi[],int e1_first,int e1_last,int e2_first,int e2_last,int numCoefs,float rMax,float * derivs[],int numWalkers)1185 void two_body_derivs(float* R[],
1186                      std::complex<float>* gradLogPsi[],
1187                      int e1_first,
1188                      int e1_last,
1189                      int e2_first,
1190                      int e2_last,
1191                      int numCoefs,
1192                      float rMax,
1193                      float* derivs[],
1194                      int numWalkers)
1195 {
1196   const int BS = 32;
1197   dim3 dimBlock(BS);
1198   dim3 dimGrid(numWalkers);
1199 
1200   two_body_derivs_kernel<float, BS, 2>
1201       <<<dimGrid, dimBlock>>>(R, (float**)gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs, rMax, derivs);
1202 }
1203 
two_body_derivs(double * R[],std::complex<double> * gradLogPsi[],int e1_first,int e1_last,int e2_first,int e2_last,int numCoefs,double rMax,double * derivs[],int numWalkers)1204 void two_body_derivs(double* R[],
1205                      std::complex<double>* gradLogPsi[],
1206                      int e1_first,
1207                      int e1_last,
1208                      int e2_first,
1209                      int e2_last,
1210                      int numCoefs,
1211                      double rMax,
1212                      double* derivs[],
1213                      int numWalkers)
1214 {
1215   const int BS = 32;
1216   dim3 dimBlock(BS);
1217   dim3 dimGrid(numWalkers);
1218 
1219   two_body_derivs_kernel<double, BS, 2>
1220       <<<dimGrid, dimBlock>>>(R, (double**)gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs, rMax, derivs);
1221 }
1222 #endif
1223 
1224 
1225 ////////////////////////////////////////////////////////////////
1226 //                      One-body routines                     //
1227 ////////////////////////////////////////////////////////////////
1228 
1229 template<typename T, int BS>
one_body_sum_kernel(T * C,T ** R,int cfirst,int clast,int efirst,int elast,T * spline_coefs,int numCoefs,T rMax,T * sum)1230 __global__ void one_body_sum_kernel(T* C,
1231                                     T** R,
1232                                     int cfirst,
1233                                     int clast,
1234                                     int efirst,
1235                                     int elast,
1236                                     T* spline_coefs,
1237                                     int numCoefs,
1238                                     T rMax,
1239                                     T* sum)
1240 {
1241   T dr    = rMax / (T)(numCoefs - 3);
1242   T drInv = 1.0 / dr;
1243   __syncthreads();
1244   // Safety for rounding error
1245   rMax *= 0.999999f;
1246   int tid = threadIdx.x;
1247   __shared__ T* myR;
1248   if (tid == 0)
1249     myR = R[blockIdx.x];
1250   __shared__ T coefs[MAX_COEFS];
1251   if (tid < numCoefs)
1252     coefs[tid] = spline_coefs[tid];
1253   __shared__ T rc[BS][3], re[BS][3];
1254   __shared__ T A[4][4];
1255   if (tid < 16)
1256     A[tid >> 2][tid & 3] = AcudaSpline[tid];
1257   __syncthreads();
1258   int Nc  = clast - cfirst + 1;
1259   int Ne  = elast - efirst + 1;
1260   int NBc = Nc / BS + ((Nc % BS) ? 1 : 0);
1261   int NBe = Ne / BS + ((Ne % BS) ? 1 : 0);
1262   T mysum = (T)0.0;
1263   for (int bc = 0; bc < NBc; bc++)
1264   {
1265     // Load block of positions from global memory
1266     for (int i = 0; i < 3; i++)
1267       if ((3 * bc + i) * BS + tid < 3 * Nc)
1268         rc[0][i * BS + tid] = C[3 * cfirst + (3 * bc + i) * BS + tid];
1269     __syncthreads();
1270     int ptcl1 = cfirst + bc * BS + tid;
1271     for (int be = 0; be < NBe; be++)
1272     {
1273       // Load block of positions from global memory
1274       for (int i = 0; i < 3; i++)
1275         if ((3 * be + i) * BS + tid < 3 * Ne)
1276           re[0][i * BS + tid] = myR[3 * efirst + (3 * be + i) * BS + tid];
1277       __syncthreads();
1278       // Now, loop over particles
1279       int end = (be + 1) * BS < Ne ? BS : Ne - be * BS;
1280       for (int j = 0; j < end; j++)
1281       {
1282         int ptcl2 = efirst + be * BS + j;
1283         T dx, dy, dz;
1284         dx  = re[j][0] - rc[tid][0];
1285         dy  = re[j][1] - rc[tid][1];
1286         dz  = re[j][2] - rc[tid][2];
1287         T d = dist(dx, dy, dz);
1288         if ((ptcl1 < (Nc + cfirst)) && (ptcl2 < (Ne + efirst)))
1289           mysum += eval_1d_spline(d, rMax, drInv, A, coefs);
1290       }
1291     }
1292     __syncthreads();
1293   }
1294   __shared__ T shared_sum[BS];
1295   shared_sum[tid] = mysum;
1296   __syncthreads();
1297   for (int s = BS >> 1; s > 0; s >>= 1)
1298   {
1299     if (tid < s)
1300       shared_sum[tid] += shared_sum[tid + s];
1301     __syncthreads();
1302   }
1303   if (tid == 0)
1304     sum[blockIdx.x] += shared_sum[0];
1305 }
1306 
one_body_sum(float C[],float * R[],int cfirst,int clast,int efirst,int elast,float spline_coefs[],int numCoefs,float rMax,float sum[],int numWalkers)1307 void one_body_sum(float C[],
1308                   float* R[],
1309                   int cfirst,
1310                   int clast,
1311                   int efirst,
1312                   int elast,
1313                   float spline_coefs[],
1314                   int numCoefs,
1315                   float rMax,
1316                   float sum[],
1317                   int numWalkers)
1318 {
1319   if (!AisInitialized)
1320     cuda_spline_init();
1321   const int BS = 32;
1322   dim3 dimBlock(BS);
1323   dim3 dimGrid(numWalkers);
1324   one_body_sum_kernel<float, BS>
1325       <<<dimGrid, dimBlock>>>(C, R, cfirst, clast, efirst, elast, spline_coefs, numCoefs, rMax, sum);
1326 }
1327 
1328 
one_body_sum(double C[],double * R[],int cfirst,int clast,int efirst,int elast,double spline_coefs[],int numCoefs,double rMax,double sum[],int numWalkers)1329 void one_body_sum(double C[],
1330                   double* R[],
1331                   int cfirst,
1332                   int clast,
1333                   int efirst,
1334                   int elast,
1335                   double spline_coefs[],
1336                   int numCoefs,
1337                   double rMax,
1338                   double sum[],
1339                   int numWalkers)
1340 {
1341   if (!AisInitialized)
1342     cuda_spline_init();
1343   const int BS = 128;
1344   dim3 dimBlock(BS);
1345   dim3 dimGrid(numWalkers);
1346   one_body_sum_kernel<double, BS>
1347       <<<dimGrid, dimBlock>>>(C, R, cfirst, clast, efirst, elast, spline_coefs, numCoefs, rMax, sum);
1348 }
1349 
1350 
1351 template<typename T, int BS>
one_body_ratio_kernel(T * C,T ** R,int cfirst,int clast,T * Rnew,int inew,T * spline_coefs,int numCoefs,int nw,T rMax,T * sum)1352 __global__ void one_body_ratio_kernel(T* C,
1353                                       T** R,
1354                                       int cfirst,
1355                                       int clast,
1356                                       T* Rnew,
1357                                       int inew,
1358                                       T* spline_coefs,
1359                                       int numCoefs,
1360                                       int nw,
1361                                       T rMax,
1362                                       T* sum)
1363 {
1364   T dr    = rMax / (T)(numCoefs - 3);
1365   T drInv = 1.0 / dr;
1366   __syncthreads();
1367   // Safety for rounding error
1368   rMax *= 0.999999f;
1369   int tid = threadIdx.x;
1370   __shared__ T* myR;
1371   __shared__ T myRnew[3], myRold[3];
1372   if (tid == 0)
1373     myR = R[blockIdx.x % nw];
1374   __syncthreads();
1375   if (tid < 3)
1376   {
1377     myRnew[tid] = Rnew[3 * blockIdx.x + tid];
1378     myRold[tid] = myR[3 * (inew + blockIdx.x / nw) + tid];
1379   }
1380   __syncthreads();
1381   __shared__ T coefs[MAX_COEFS];
1382   __shared__ T c[BS][3];
1383   if (tid < numCoefs)
1384     coefs[tid] = spline_coefs[tid];
1385   __shared__ T A[4][4];
1386   if (tid < 16)
1387     A[(tid >> 2)][tid & 3] = AcudaSpline[tid];
1388   __syncthreads();
1389   int Nc = clast - cfirst + 1;
1390   int NB = Nc / BS + ((Nc % BS) ? 1 : 0);
1391   __shared__ T shared_sum[BS];
1392   shared_sum[tid] = (T)0.0;
1393   for (int b = 0; b < NB; b++)
1394   {
1395     // Load block of positions from global memory
1396     for (int i = 0; i < 3; i++)
1397     {
1398       int n = i * BS + tid;
1399       if ((3 * b + i) * BS + tid < 3 * Nc)
1400         c[0][n] = C[3 * cfirst + (3 * b + i) * BS + tid];
1401     }
1402     __syncthreads();
1403     int ptcl1 = cfirst + b * BS + tid;
1404     T dx, dy, dz;
1405     dx      = myRnew[0] - c[tid][0];
1406     dy      = myRnew[1] - c[tid][1];
1407     dz      = myRnew[2] - c[tid][2];
1408     T d     = dist(dx, dy, dz);
1409     T delta = eval_1d_spline(d, rMax, drInv, A, coefs);
1410     dx      = myRold[0] - c[tid][0];
1411     dy      = myRold[1] - c[tid][1];
1412     dz      = myRold[2] - c[tid][2];
1413     d       = dist(dx, dy, dz);
1414     delta -= eval_1d_spline(d, rMax, drInv, A, coefs);
1415     if (ptcl1 < (Nc + cfirst))
1416       shared_sum[tid] += delta;
1417     __syncthreads();
1418   }
1419   for (int s = (BS >> 1); s > 0; s >>= 1)
1420   {
1421     if (tid < s)
1422       shared_sum[tid] += shared_sum[tid + s];
1423     __syncthreads();
1424   }
1425   if (tid == 0)
1426     sum[blockIdx.x] += shared_sum[0];
1427 }
1428 
1429 
one_body_ratio(float C[],float * R[],int first,int last,float Rnew[],int inew,float spline_coefs[],int numCoefs,int nw,float rMax,float sum[],int numWalkers)1430 void one_body_ratio(float C[],
1431                     float* R[],
1432                     int first,
1433                     int last,
1434                     float Rnew[],
1435                     int inew,
1436                     float spline_coefs[],
1437                     int numCoefs,
1438                     int nw,
1439                     float rMax,
1440                     float sum[],
1441                     int numWalkers)
1442 {
1443   if (!AisInitialized)
1444     cuda_spline_init();
1445   const int BS = 32;
1446   dim3 dimBlock(BS);
1447   dim3 dimGrid(numWalkers);
1448   one_body_ratio_kernel<float, BS>
1449       <<<dimGrid, dimBlock>>>(C, R, first, last, Rnew, inew, spline_coefs, numCoefs, nw, rMax, sum);
1450 }
1451 
1452 
one_body_ratio(double C[],double * R[],int first,int last,double Rnew[],int inew,double spline_coefs[],int numCoefs,int nw,double rMax,double sum[],int numWalkers)1453 void one_body_ratio(double C[],
1454                     double* R[],
1455                     int first,
1456                     int last,
1457                     double Rnew[],
1458                     int inew,
1459                     double spline_coefs[],
1460                     int numCoefs,
1461                     int nw,
1462                     double rMax,
1463                     double sum[],
1464                     int numWalkers)
1465 {
1466   if (!AisInitialized)
1467     cuda_spline_init();
1468   dim3 dimBlock(128);
1469   dim3 dimGrid(numWalkers);
1470   one_body_ratio_kernel<double, 128>
1471       <<<dimGrid, dimBlock>>>(C, R, first, last, Rnew, inew, spline_coefs, numCoefs, nw, rMax, sum);
1472 }
1473 
1474 
1475 template<typename T, int BS>
one_body_ratio_grad_kernel(T * C,T ** R,int cfirst,int clast,T * Rnew,int inew,T * spline_coefs,int numCoefs,int nw,T rMax,bool zero,T * ratio_grad)1476 __global__ void one_body_ratio_grad_kernel(T* C,
1477                                            T** R,
1478                                            int cfirst,
1479                                            int clast,
1480                                            T* Rnew,
1481                                            int inew,
1482                                            T* spline_coefs,
1483                                            int numCoefs,
1484                                            int nw,
1485                                            T rMax,
1486                                            bool zero,
1487                                            T* ratio_grad)
1488 {
1489   T dr    = rMax / (T)(numCoefs - 3);
1490   T drInv = 1.0 / dr;
1491   __syncthreads();
1492   // Safety for rounding error
1493   rMax *= 0.999999f;
1494   int tid = threadIdx.x;
1495   __shared__ T* myR;
1496   __shared__ T myRnew[3], myRold[3];
1497   if (tid == 0)
1498     myR = R[blockIdx.x % nw];
1499   __syncthreads();
1500   if (tid < 3)
1501   {
1502     myRnew[tid] = Rnew[3 * blockIdx.x + tid];
1503     myRold[tid] = myR[3 * (inew + blockIdx.x / nw) + tid];
1504   }
1505   __syncthreads();
1506   __shared__ T coefs[MAX_COEFS];
1507   __shared__ T c[BS][3];
1508   if (tid < numCoefs)
1509     coefs[tid] = spline_coefs[tid];
1510   __shared__ T A[12][4];
1511   if (tid < 16)
1512   {
1513     A[0 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 0];
1514     A[4 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 16];
1515     A[8 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 32];
1516   }
1517   __syncthreads();
1518   int Nc = clast - cfirst + 1;
1519   int NB = Nc / BS + ((Nc % BS) ? 1 : 0);
1520   __shared__ T shared_sum[BS];
1521   __shared__ T shared_grad[BS][3];
1522   shared_sum[tid]     = (T)0.0;
1523   shared_grad[tid][0] = shared_grad[tid][1] = shared_grad[tid][2] = 0.0f;
1524   for (int b = 0; b < NB; b++)
1525   {
1526     // Load block of positions from global memory
1527     for (int i = 0; i < 3; i++)
1528     {
1529       int n = i * BS + tid;
1530       if ((3 * b + i) * BS + tid < 3 * Nc)
1531         c[0][n] = C[3 * cfirst + (3 * b + i) * BS + tid];
1532     }
1533     __syncthreads();
1534     int ptcl1 = cfirst + b * BS + tid;
1535     T dx, dy, dz, d, delta, u, du, d2u;
1536     dx    = myRold[0] - c[tid][0];
1537     dy    = myRold[1] - c[tid][1];
1538     dz    = myRold[2] - c[tid][2];
1539     d     = dist(dx, dy, dz);
1540     delta = -eval_1d_spline(d, rMax, drInv, A, coefs);
1541     dx    = myRnew[0] - c[tid][0];
1542     dy    = myRnew[1] - c[tid][1];
1543     dz    = myRnew[2] - c[tid][2];
1544     d     = dist(dx, dy, dz);
1545     eval_1d_spline_vgl(d, rMax, drInv, A, coefs, u, du, d2u);
1546     delta += u;
1547     if (ptcl1 < (Nc + cfirst))
1548     {
1549       du /= d;
1550       shared_sum[tid]     += delta;
1551       shared_grad[tid][0] += du * dx;
1552       shared_grad[tid][1] += du * dy;
1553       shared_grad[tid][2] += du * dz;
1554     }
1555     __syncthreads();
1556   }
1557   for (int s = (BS >> 1); s > 0; s >>= 1)
1558   {
1559     if (tid < s)
1560     {
1561       shared_sum[tid]     += shared_sum[tid + s];
1562       shared_grad[tid][0] += shared_grad[tid + s][0];
1563       shared_grad[tid][1] += shared_grad[tid + s][1];
1564       shared_grad[tid][2] += shared_grad[tid + s][2];
1565     }
1566     __syncthreads();
1567   }
1568   if (tid == 0)
1569   {
1570     if (zero)
1571     {
1572       ratio_grad[4 * blockIdx.x + 0] = shared_sum[0];
1573       ratio_grad[4 * blockIdx.x + 1] = shared_grad[0][0];
1574       ratio_grad[4 * blockIdx.x + 2] = shared_grad[0][1];
1575       ratio_grad[4 * blockIdx.x + 3] = shared_grad[0][2];
1576     }
1577     else
1578     {
1579       ratio_grad[4 * blockIdx.x + 0] += shared_sum[0];
1580       ratio_grad[4 * blockIdx.x + 1] += shared_grad[0][0];
1581       ratio_grad[4 * blockIdx.x + 2] += shared_grad[0][1];
1582       ratio_grad[4 * blockIdx.x + 3] += shared_grad[0][2];
1583     }
1584   }
1585 }
1586 
1587 
one_body_ratio_grad(float C[],float * R[],int first,int last,float Rnew[],int inew,float spline_coefs[],int numCoefs,int nw,float rMax,bool zero,float ratio_grad[],int numWalkers)1588 void one_body_ratio_grad(float C[],
1589                          float* R[],
1590                          int first,
1591                          int last,
1592                          float Rnew[],
1593                          int inew,
1594                          float spline_coefs[],
1595                          int numCoefs,
1596                          int nw,
1597                          float rMax,
1598                          bool zero,
1599                          float ratio_grad[],
1600                          int numWalkers)
1601 {
1602   if (!AisInitialized)
1603     cuda_spline_init();
1604   const int BS = 32;
1605   dim3 dimBlock(BS);
1606   dim3 dimGrid(numWalkers);
1607   one_body_ratio_grad_kernel<float, BS>
1608       <<<dimGrid, dimBlock>>>(C, R, first, last, Rnew, inew, spline_coefs, numCoefs, nw, rMax, zero, ratio_grad);
1609 }
1610 
1611 
one_body_ratio_grad(double C[],double * R[],int first,int last,double Rnew[],int inew,double spline_coefs[],int numCoefs,int nw,double rMax,bool zero,double ratio_grad[],int numWalkers)1612 void one_body_ratio_grad(double C[],
1613                          double* R[],
1614                          int first,
1615                          int last,
1616                          double Rnew[],
1617                          int inew,
1618                          double spline_coefs[],
1619                          int numCoefs,
1620                          int nw,
1621                          double rMax,
1622                          bool zero,
1623                          double ratio_grad[],
1624                          int numWalkers)
1625 {
1626   if (!AisInitialized)
1627     cuda_spline_init();
1628   const int BS = 32;
1629   dim3 dimBlock(BS);
1630   dim3 dimGrid(numWalkers);
1631   one_body_ratio_grad_kernel<double, BS>
1632       <<<dimGrid, dimBlock>>>(C, R, first, last, Rnew, inew, spline_coefs, numCoefs, nw, rMax, zero, ratio_grad);
1633 }
1634 
1635 
1636 template<typename T, int BS>
one_body_grad_lapl_kernel(T * C,T ** R,int cfirst,int clast,int efirst,int elast,T * spline_coefs,int numCoefs,T rMax,T * gradLapl,int row_stride)1637 __global__ void one_body_grad_lapl_kernel(T* C,
1638                                           T** R,
1639                                           int cfirst,
1640                                           int clast,
1641                                           int efirst,
1642                                           int elast,
1643                                           T* spline_coefs,
1644                                           int numCoefs,
1645                                           T rMax,
1646                                           T* gradLapl,
1647                                           int row_stride)
1648 {
1649   T dr    = rMax / (T)(numCoefs - 3);
1650   T drInv = 1.0 / dr;
1651   __syncthreads();
1652   // Safety for rounding error
1653   rMax *= 0.999999f;
1654   //  rMax *= 0.99999f;
1655   int tid = threadIdx.x;
1656   __shared__ T* myR;
1657   if (tid == 0)
1658     myR = R[blockIdx.x];
1659   __shared__ T coefs[MAX_COEFS];
1660   if (tid < numCoefs)
1661     coefs[tid] = spline_coefs[tid];
1662   __shared__ T r[BS][3], c[BS][3];
1663   __syncthreads();
1664   __shared__ T A[12][4];
1665   if (tid < 16)
1666   {
1667     A[0 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 0];
1668     A[4 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 16];
1669     A[8 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 32];
1670   }
1671   __syncthreads();
1672   int Nc  = clast - cfirst + 1;
1673   int Ne  = elast - efirst + 1;
1674   int NBc = Nc / BS + ((Nc % BS) ? 1 : 0);
1675   int NBe = Ne / BS + ((Ne % BS) ? 1 : 0);
1676   __shared__ T sGradLapl[BS][4];
1677   for (int be = 0; be < NBe; be++)
1678   {
1679     // Load block of positions from global memory
1680     for (int i = 0; i < 3; i++)
1681       if ((3 * be + i) * BS + tid < 3 * Ne)
1682         r[0][i * BS + tid] = myR[3 * efirst + (3 * be + i) * BS + tid];
1683     __syncthreads();
1684     int eptcl         = efirst + be * BS + tid;
1685     int offset        = blockIdx.x * row_stride + 4 * be * BS + 4 * efirst;
1686     sGradLapl[tid][0] = sGradLapl[tid][1] = sGradLapl[tid][2] = sGradLapl[tid][3] = (T)0.0;
1687     for (int bc = 0; bc < NBc; bc++)
1688     {
1689       // Load block of positions from global memory
1690       for (int i = 0; i < 3; i++)
1691         if ((3 * bc + i) * BS + tid < 3 * Nc)
1692           c[0][i * BS + tid] = C[3 * cfirst + (3 * bc + i) * BS + tid];
1693       __syncthreads();
1694       // Now, loop over particles
1695       int end = ((bc + 1) * BS < Nc) ? BS : Nc - bc * BS;
1696       for (int j = 0; j < end; j++)
1697       {
1698         int cptcl = cfirst + bc * BS + j;
1699         T dx, dy, dz, u, du, d2u;
1700         dx  = r[tid][0] - c[j][0];
1701         dy  = r[tid][1] - c[j][1];
1702         dz  = r[tid][2] - c[j][2];
1703         T d = dist(dx, dy, dz);
1704         eval_1d_spline_vgl(d, rMax, drInv, A, coefs, u, du, d2u);
1705         //u = du = d2u = 0.0f;
1706         if (cptcl < (Nc + cfirst) && (eptcl < (Ne + efirst)))
1707         {
1708           du /= d;
1709           sGradLapl[tid][0] -= du * dx;
1710           sGradLapl[tid][1] -= du * dy;
1711           sGradLapl[tid][2] -= du * dz;
1712           sGradLapl[tid][3] -= d2u + 2.0 * du;
1713         }
1714       }
1715       __syncthreads();
1716     }
1717     __syncthreads();
1718     for (int i = 0; i < 4; i++)
1719       if ((4 * be + i) * BS + tid < 4 * Ne)
1720         gradLapl[offset + i * BS + tid] += sGradLapl[0][i * BS + tid];
1721     __syncthreads();
1722   }
1723 }
1724 
1725 
one_body_grad_lapl(float C[],float * R[],int e1_first,int e1_last,int e2_first,int e2_last,float spline_coefs[],int numCoefs,float rMax,float gradLapl[],int row_stride,int numWalkers)1726 void one_body_grad_lapl(float C[],
1727                         float* R[],
1728                         int e1_first,
1729                         int e1_last,
1730                         int e2_first,
1731                         int e2_last,
1732                         float spline_coefs[],
1733                         int numCoefs,
1734                         float rMax,
1735                         float gradLapl[],
1736                         int row_stride,
1737                         int numWalkers)
1738 {
1739   const int BS = 32;
1740   dim3 dimBlock(BS);
1741   dim3 dimGrid(numWalkers);
1742   one_body_grad_lapl_kernel<float, BS><<<dimGrid, dimBlock>>>(C,
1743                                                               R,
1744                                                               e1_first,
1745                                                               e1_last,
1746                                                               e2_first,
1747                                                               e2_last,
1748                                                               spline_coefs,
1749                                                               numCoefs,
1750                                                               rMax,
1751                                                               gradLapl,
1752                                                               row_stride);
1753 }
1754 
1755 
one_body_grad_lapl(double C[],double * R[],int e1_first,int e1_last,int e2_first,int e2_last,double spline_coefs[],int numCoefs,double rMax,double gradLapl[],int row_stride,int numWalkers)1756 void one_body_grad_lapl(double C[],
1757                         double* R[],
1758                         int e1_first,
1759                         int e1_last,
1760                         int e2_first,
1761                         int e2_last,
1762                         double spline_coefs[],
1763                         int numCoefs,
1764                         double rMax,
1765                         double gradLapl[],
1766                         int row_stride,
1767                         int numWalkers)
1768 {
1769   if (!AisInitialized)
1770     cuda_spline_init();
1771   const int BS = 32;
1772   dim3 dimBlock(BS);
1773   dim3 dimGrid(numWalkers);
1774   one_body_grad_lapl_kernel<double, BS><<<dimGrid, dimBlock>>>(C,
1775                                                                R,
1776                                                                e1_first,
1777                                                                e1_last,
1778                                                                e2_first,
1779                                                                e2_last,
1780                                                                spline_coefs,
1781                                                                numCoefs,
1782                                                                rMax,
1783                                                                gradLapl,
1784                                                                row_stride);
1785 }
1786 
1787 
1788 template<int BS>
one_body_NLratio_kernel(NLjobGPU<float> * jobs,float * C,int first,int last,float * spline_coefs,int numCoefs,float rMax)1789 __global__ void one_body_NLratio_kernel(NLjobGPU<float>* jobs,
1790                                         float* C,
1791                                         int first,
1792                                         int last,
1793                                         float* spline_coefs,
1794                                         int numCoefs,
1795                                         float rMax)
1796 {
1797   const int MAX_RATIOS = 18;
1798   int tid              = threadIdx.x;
1799   __shared__ NLjobGPU<float> myJob;
1800   __shared__ float myRnew[MAX_RATIOS][3], myRold[3];
1801   if (tid == 0)
1802     myJob = jobs[blockIdx.x];
1803   __syncthreads();
1804   if (tid < 3)
1805     myRold[tid] = myJob.R[3 * myJob.Elec + tid];
1806   for (int i = 0; i < 3; i++)
1807     if (i * BS + tid < 3 * myJob.NumQuadPoints)
1808       myRnew[0][i * BS + tid] = myJob.QuadPoints[i * BS + tid];
1809   __syncthreads();
1810   float dr    = rMax / (float)(numCoefs - 3);
1811   float drInv = 1.0 / dr;
1812   __syncthreads();
1813   // Safety for rounding error
1814   rMax *= 0.999999f;
1815   __shared__ float coefs[MAX_COEFS];
1816   __shared__ float c[BS][3];
1817   if (tid < numCoefs)
1818     coefs[tid] = spline_coefs[tid];
1819   __syncthreads();
1820   __shared__ float A[4][4];
1821   if (tid < 16)
1822     A[(tid >> 2)][tid & 3] = AcudaSpline[tid];
1823   __syncthreads();
1824   int N  = last - first + 1;
1825   int NB = N / BS + ((N % BS) ? 1 : 0);
1826   __shared__ float shared_sum[MAX_RATIOS][BS + 1];
1827   for (int iq = 0; iq < myJob.NumQuadPoints; iq++)
1828     shared_sum[iq][tid] = (float)0.0;
1829   for (int b = 0; b < NB; b++)
1830   {
1831     // Load block of positions from global memory
1832     for (int i = 0; i < 3; i++)
1833     {
1834       int n = i * BS + tid;
1835       if ((3 * b + i) * BS + tid < 3 * N)
1836         c[0][n] = C[3 * first + (3 * b + i) * BS + tid];
1837     }
1838     __syncthreads();
1839     int ptcl1 = first + b * BS + tid;
1840     float dx, dy, dz;
1841     dx         = myRold[0] - c[tid][0];
1842     dy         = myRold[1] - c[tid][1];
1843     dz         = myRold[2] - c[tid][2];
1844     float d    = dist(dx, dy, dz);
1845     float uOld = eval_1d_spline(d, rMax, drInv, A, coefs);
1846     for (int iq = 0; iq < myJob.NumQuadPoints; iq++)
1847     {
1848       dx = myRnew[iq][0] - c[tid][0];
1849       dy = myRnew[iq][1] - c[tid][1];
1850       dz = myRnew[iq][2] - c[tid][2];
1851       d  = dist(dx, dy, dz);
1852       if (ptcl1 < (N + first))
1853         shared_sum[iq][tid] += eval_1d_spline(d, rMax, drInv, A, coefs) - uOld;
1854     }
1855     __syncthreads();
1856   }
1857   for (int s = (BS >> 1); s > 0; s >>= 1)
1858   {
1859     if (tid < s)
1860       for (int iq = 0; iq < myJob.NumQuadPoints; iq++)
1861         shared_sum[iq][tid] += shared_sum[iq][tid + s];
1862     __syncthreads();
1863   }
1864   if (tid < myJob.NumQuadPoints)
1865     myJob.Ratios[tid] *= exp(-shared_sum[tid][0]);
1866 }
1867 
1868 
1869 template<int BS>
one_body_NLratio_kernel(NLjobGPU<double> * jobs,double * C,int first,int last,double * spline_coefs,int numCoefs,double rMax)1870 __global__ void one_body_NLratio_kernel(NLjobGPU<double>* jobs,
1871                                         double* C,
1872                                         int first,
1873                                         int last,
1874                                         double* spline_coefs,
1875                                         int numCoefs,
1876                                         double rMax)
1877 {
1878   const int MAX_RATIOS = 18;
1879   int tid              = threadIdx.x;
1880   __shared__ NLjobGPU<double> myJob;
1881   __shared__ double myRnew[MAX_RATIOS][3], myRold[3];
1882   if (tid == 0)
1883     myJob = jobs[blockIdx.x];
1884   __syncthreads();
1885   if (tid < 3)
1886     myRold[tid] = myJob.R[3 * myJob.Elec + tid];
1887   for (int i = 0; i < 3; i++)
1888     if (i * BS + tid < 3 * myJob.NumQuadPoints)
1889       myRnew[0][i * BS + tid] = myJob.QuadPoints[i * BS + tid];
1890   __syncthreads();
1891   double dr    = rMax / (double)(numCoefs - 3);
1892   double drInv = 1.0 / dr;
1893   __shared__ double coefs[MAX_COEFS];
1894   __shared__ double c[BS][3];
1895   if (tid < numCoefs)
1896     coefs[tid] = spline_coefs[tid];
1897   __syncthreads();
1898   __shared__ double A[4][4];
1899   if (tid < 16)
1900     A[(tid >> 2)][tid & 3] = AcudaSpline[tid];
1901   __syncthreads();
1902   int N  = last - first + 1;
1903   int NB = N / BS + ((N % BS) ? 1 : 0);
1904   __shared__ double shared_sum[MAX_RATIOS][BS + 1];
1905   for (int iq = 0; iq < myJob.NumQuadPoints; iq++)
1906     shared_sum[iq][tid] = (double)0.0;
1907   for (int b = 0; b < NB; b++)
1908   {
1909     // Load block of positions from global memory
1910     for (int i = 0; i < 3; i++)
1911     {
1912       int n = i * BS + tid;
1913       if ((3 * b + i) * BS + tid < 3 * N)
1914         c[0][n] = C[3 * first + (3 * b + i) * BS + tid];
1915     }
1916     __syncthreads();
1917     int ptcl1 = first + b * BS + tid;
1918     double dx, dy, dz;
1919     dx          = myRold[0] - c[tid][0];
1920     dy          = myRold[1] - c[tid][1];
1921     dz          = myRold[2] - c[tid][2];
1922     double d    = dist(dx, dy, dz);
1923     double uOld = eval_1d_spline(d, rMax, drInv, A, coefs);
1924     for (int iq = 0; iq < myJob.NumQuadPoints; iq++)
1925     {
1926       dx = myRnew[iq][0] - c[tid][0];
1927       dy = myRnew[iq][1] - c[tid][1];
1928       dz = myRnew[iq][2] - c[tid][2];
1929       d  = dist(dx, dy, dz);
1930       if (ptcl1 < (N + first))
1931         shared_sum[iq][tid] += eval_1d_spline(d, rMax, drInv, A, coefs) - uOld;
1932     }
1933     __syncthreads();
1934   }
1935   for (int s = (BS >> 1); s > 0; s >>= 1)
1936   {
1937     if (tid < s)
1938       for (int iq = 0; iq < myJob.NumQuadPoints; iq++)
1939         shared_sum[iq][tid] += shared_sum[iq][tid + s];
1940     __syncthreads();
1941   }
1942   if (tid < myJob.NumQuadPoints)
1943     myJob.Ratios[tid] *= exp(-shared_sum[tid][0]);
1944 }
1945 
1946 
one_body_NLratios(NLjobGPU<float> jobs[],float C[],int first,int last,float spline_coefs[],int numCoefs,float rMax,int numjobs)1947 void one_body_NLratios(NLjobGPU<float> jobs[],
1948                        float C[],
1949                        int first,
1950                        int last,
1951                        float spline_coefs[],
1952                        int numCoefs,
1953                        float rMax,
1954                        int numjobs)
1955 {
1956   if (!AisInitialized)
1957     cuda_spline_init();
1958   const int BS = 32;
1959   dim3 dimBlock(BS);
1960   while (numjobs > 65535)
1961   {
1962     dim3 dimGrid(65535);
1963     one_body_NLratio_kernel<BS><<<dimGrid, dimBlock>>>(jobs, C, first, last, spline_coefs, numCoefs, rMax);
1964     numjobs -= 65535;
1965     jobs += 65535;
1966   }
1967   dim3 dimGrid(numjobs);
1968   one_body_NLratio_kernel<BS><<<dimGrid, dimBlock>>>(jobs, C, first, last, spline_coefs, numCoefs, rMax);
1969 }
1970 
1971 
one_body_NLratios(NLjobGPU<double> jobs[],double C[],int first,int last,double spline_coefs[],int numCoefs,double rMax,int numjobs)1972 void one_body_NLratios(NLjobGPU<double> jobs[],
1973                        double C[],
1974                        int first,
1975                        int last,
1976                        double spline_coefs[],
1977                        int numCoefs,
1978                        double rMax,
1979                        int numjobs)
1980 {
1981   if (!AisInitialized)
1982     cuda_spline_init();
1983   const int BS = 32;
1984   dim3 dimBlock(BS);
1985   int blockx = numjobs % 65535;
1986   int blocky = numjobs / 65535 + 1;
1987   dim3 dimGrid(blockx, blocky);
1988   one_body_NLratio_kernel<BS><<<dimGrid, dimBlock>>>(jobs, C, first, last, spline_coefs, numCoefs, rMax);
1989 }
1990 
1991 
1992 template<typename T, int BS>
one_body_grad_kernel(T ** R,int iat,T * C,int first,int last,T * spline_coefs,int numCoefs,T rMax,bool zeroOut,T * grad)1993 __global__ void one_body_grad_kernel(T** R,
1994                                      int iat,
1995                                      T* C,
1996                                      int first,
1997                                      int last,
1998                                      T* spline_coefs,
1999                                      int numCoefs,
2000                                      T rMax,
2001                                      bool zeroOut,
2002                                      T* grad)
2003 {
2004   T dr    = rMax / (T)(numCoefs - 3);
2005   T drInv = 1.0 / dr;
2006   __syncthreads();
2007   // Safety for rounding error
2008   rMax *= 0.999999f;
2009   int tid = threadIdx.x;
2010   __shared__ T *myR, r[3];
2011   if (tid == 0)
2012     myR = R[blockIdx.x];
2013   __syncthreads();
2014   if (tid < 3)
2015     r[tid] = myR[3 * iat + tid];
2016   __shared__ T coefs[MAX_COEFS];
2017   if (tid < numCoefs)
2018     coefs[tid] = spline_coefs[tid];
2019   __shared__ T c[BS][3];
2020   __shared__ T A[12][4];
2021   if (tid < 16)
2022   {
2023     A[0 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 0];
2024     A[4 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 16];
2025     A[8 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 32];
2026   }
2027   __syncthreads();
2028   int N  = last - first + 1;
2029   int NB = N / BS + ((N % BS) ? 1 : 0);
2030   __shared__ T sGrad[BS][3];
2031   sGrad[tid][0] = sGrad[tid][1] = sGrad[tid][2] = (T)0.0;
2032   for (int b = 0; b < NB; b++)
2033   {
2034     // Load block of positions from global memory
2035     for (int i = 0; i < 3; i++)
2036       if ((3 * b + i) * BS + tid < 3 * N)
2037         c[0][i * BS + tid] = C[3 * first + (3 * b + i) * BS + tid];
2038     __syncthreads();
2039     int ptcl1 = first + b * BS + tid;
2040     T dx, dy, dz, u, du, d2u;
2041     dx  = r[0] - c[tid][0];
2042     dy  = r[1] - c[tid][1];
2043     dz  = r[2] - c[tid][2];
2044     T d = dist(dx, dy, dz);
2045     eval_1d_spline_vgl(d, rMax, drInv, A, coefs, u, du, d2u);
2046     if (ptcl1 < (N + first))
2047     {
2048       du /= d;
2049       sGrad[tid][0] += du * dx;
2050       sGrad[tid][1] += du * dy;
2051       sGrad[tid][2] += du * dz;
2052     }
2053     __syncthreads();
2054   }
2055   // Do reduction across threads in block
2056   for (int s = BS >> 1; s > 0; s >>= 1)
2057   {
2058     if (tid < s)
2059     {
2060       sGrad[tid][0] += sGrad[tid + s][0];
2061       sGrad[tid][1] += sGrad[tid + s][1];
2062       sGrad[tid][2] += sGrad[tid + s][2];
2063     }
2064     __syncthreads();
2065   }
2066   if (tid < 3)
2067   {
2068     if (zeroOut)
2069       grad[3 * blockIdx.x + tid] = sGrad[0][tid];
2070     else
2071       grad[3 * blockIdx.x + tid] += sGrad[0][tid];
2072   }
2073 }
2074 
2075 
one_body_gradient(float * Rlist[],int iat,float C[],int first,int last,float spline_coefs[],int num_coefs,float rMax,bool zeroSum,float grad[],int numWalkers)2076 void one_body_gradient(float* Rlist[],
2077                        int iat,
2078                        float C[],
2079                        int first,
2080                        int last,
2081                        float spline_coefs[],
2082                        int num_coefs,
2083                        float rMax,
2084                        bool zeroSum,
2085                        float grad[],
2086                        int numWalkers)
2087 {
2088   if (!AisInitialized)
2089     cuda_spline_init();
2090   const int BS = 32;
2091   dim3 dimBlock(BS);
2092   dim3 dimGrid(numWalkers);
2093   one_body_grad_kernel<float, BS><<<dimGrid, dimBlock, 0, gpu::kernelStream>>>(Rlist,
2094                                                                                iat,
2095                                                                                C,
2096                                                                                first,
2097                                                                                last,
2098                                                                                spline_coefs,
2099                                                                                num_coefs,
2100                                                                                rMax,
2101                                                                                zeroSum,
2102                                                                                grad);
2103 }
2104 
one_body_gradient(double * Rlist[],int iat,double C[],int first,int last,double spline_coefs[],int num_coefs,double rMax,bool zeroSum,double grad[],int numWalkers)2105 void one_body_gradient(double* Rlist[],
2106                        int iat,
2107                        double C[],
2108                        int first,
2109                        int last,
2110                        double spline_coefs[],
2111                        int num_coefs,
2112                        double rMax,
2113                        bool zeroSum,
2114                        double grad[],
2115                        int numWalkers)
2116 {
2117   if (!AisInitialized)
2118     cuda_spline_init();
2119   const int BS = 32;
2120   dim3 dimBlock(BS);
2121   dim3 dimGrid(numWalkers);
2122   one_body_grad_kernel<double, BS><<<dimGrid, dimBlock, 0, gpu::kernelStream>>>(Rlist,
2123                                                                                 iat,
2124                                                                                 C,
2125                                                                                 first,
2126                                                                                 last,
2127                                                                                 spline_coefs,
2128                                                                                 num_coefs,
2129                                                                                 rMax,
2130                                                                                 zeroSum,
2131                                                                                 grad);
2132 }
2133 
2134 
2135 template<typename T, int BS, unsigned COMPLEX>
one_body_derivs_kernel(T * C,T ** R,T ** gradLogPsi,int cfirst,int clast,int efirst,int elast,int numCoefs,T rMax,T ** derivs)2136 __global__ void one_body_derivs_kernel(T* C,
2137                                        T** R,
2138                                        T** gradLogPsi,
2139                                        int cfirst,
2140                                        int clast,
2141                                        int efirst,
2142                                        int elast,
2143                                        int numCoefs,
2144                                        T rMax,
2145                                        T** derivs)
2146 {
2147   T dr    = rMax / (T)(numCoefs - 3);
2148   T drInv = 1.0 / dr;
2149   __syncthreads();
2150   // Safety for rounding error
2151   rMax *= 0.999999f;
2152   int tid = threadIdx.x;
2153   __shared__ T *myR, *myGrad, *myDerivs;
2154   if (tid == 0)
2155   {
2156     myR      = R[blockIdx.x];
2157     myGrad   = gradLogPsi[blockIdx.x];
2158     myDerivs = derivs[blockIdx.x];
2159   }
2160   __shared__ T sderivs[MAX_COEFS][2];
2161   __shared__ T r[BS][3], c[BS][3];
2162   __shared__ T A[12][4];
2163   if (tid < 16)
2164   {
2165     A[0 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 0];
2166     A[4 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 16];
2167     A[8 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 32];
2168   }
2169   __syncthreads();
2170   sderivs[tid][0] = T();
2171   sderivs[tid][1] = T();
2172   int Nc          = clast - cfirst + 1;
2173   int Ne          = elast - efirst + 1;
2174   int NBc         = (Nc + BS - 1) / BS;
2175   int NBe         = (Ne + BS - 1) / BS;
2176   __shared__ T sGrad[BS][3];
2177   for (int be = 0; be < NBe; be++)
2178   {
2179     // Load block of positions from global memory
2180     for (int i = 0; i < 3; i++)
2181       if ((3 * be + i) * BS + tid < 3 * Ne)
2182       {
2183         int outoff       = i * BS + tid;
2184         int inoff        = outoff + 3 * efirst + 3 * be * BS;
2185         r[0][outoff]     = myR[inoff];
2186         sGrad[0][outoff] = myGrad[inoff * COMPLEX];
2187       }
2188     __syncthreads();
2189     int eptcl = efirst + be * BS + tid;
2190     for (int bc = 0; bc < NBc; bc++)
2191     {
2192       // Load block of positions from global memory
2193       for (int i = 0; i < 3; i++)
2194         if ((3 * bc + i) * BS + tid < 3 * Nc)
2195           c[0][i * BS + tid] = C[3 * cfirst + (3 * bc + i) * BS + tid];
2196       __syncthreads();
2197       // Now, loop over particles
2198       int end = min(BS, Nc - bc * BS);
2199       for (int j = 0; j < end; j++)
2200       {
2201         T dx, dy, dz;
2202         dx        = c[j][0] - r[tid][0];
2203         dy        = c[j][1] - r[tid][1];
2204         dz        = c[j][2] - r[tid][2];
2205         T d       = dist(dx, dy, dz);
2206         T dInv    = 1.0f / d;
2207         T s       = d * drInv;
2208         T sf      = floorf(s);
2209         int index = (int)sf;
2210         T t       = s - sf;
2211         T t2      = t * t;
2212         T t3      = t * t2;
2213         T v0      = (A[0][0] * t3 + A[0][1] * t2 + A[0][2] * t + A[0][3]);
2214         T v1      = (A[1][0] * t3 + A[1][1] * t2 + A[1][2] * t + A[1][3]);
2215         T v2      = (A[2][0] * t3 + A[2][1] * t2 + A[2][2] * t + A[2][3]);
2216         T v3      = (A[3][0] * t3 + A[3][1] * t2 + A[3][2] * t + A[3][3]);
2217         for (int id = 0; id < BS; id++)
2218           if (tid == id && eptcl <= elast && (d < rMax))
2219           {
2220             sderivs[index + 0][0] += v0;
2221             sderivs[index + 1][0] += v1;
2222             sderivs[index + 2][0] += v2;
2223             sderivs[index + 3][0] += v3;
2224           }
2225         T prefact = (dx * sGrad[tid][0] + dy * sGrad[tid][1] + dz * sGrad[tid][2]) * dInv;
2226         T du0     = drInv * (A[4][0] * t3 + A[4][1] * t2 + A[4][2] * t + A[4][3]);
2227         T du1     = drInv * (A[5][0] * t3 + A[5][1] * t2 + A[5][2] * t + A[5][3]);
2228         T du2     = drInv * (A[6][0] * t3 + A[6][1] * t2 + A[6][2] * t + A[6][3]);
2229         T du3     = drInv * (A[7][0] * t3 + A[7][1] * t2 + A[7][2] * t + A[7][3]);
2230         // This is the dot (gradu, grad_log_psi) term.
2231         v0 = 2.0f * prefact * du0;
2232         v1 = 2.0f * prefact * du1;
2233         v2 = 2.0f * prefact * du2;
2234         v3 = 2.0f * prefact * du3;
2235         // This is the lapl u term
2236         v0 -= drInv * drInv * (A[8][0] * t3 + A[8][1] * t2 + A[8][2] * t + A[8][3]) + 2.0f * du0 * dInv;
2237         v1 -= drInv * drInv * (A[9][0] * t3 + A[9][1] * t2 + A[9][2] * t + A[9][3]) + 2.0f * du1 * dInv;
2238         v2 -= drInv * drInv * (A[10][0] * t3 + A[10][1] * t2 + A[10][2] * t + A[10][3]) + 2.0f * du2 * dInv;
2239         v3 -= drInv * drInv * (A[11][0] * t3 + A[11][1] * t2 + A[11][2] * t + A[11][3]) + 2.0f * du3 * dInv;
2240         for (int id = 0; id < BS; id++)
2241           if (tid == id && eptcl <= elast && (d < rMax))
2242           {
2243             sderivs[index + 0][1] += v0;
2244             sderivs[index + 1][1] += v1;
2245             sderivs[index + 2][1] += v2;
2246             sderivs[index + 3][1] += v3;
2247           }
2248       }
2249       __syncthreads();
2250     }
2251   }
2252   sderivs[tid][1] *= 0.5f;
2253   if (tid < 2 * numCoefs)
2254     myDerivs[tid] = -sderivs[0][tid];
2255   if (tid + BS < 2 * numCoefs)
2256     myDerivs[tid + BS] = -sderivs[0][tid + BS];
2257 }
2258 
2259 
one_body_derivs(float C[],float * R[],float * gradLogPsi[],int cfirst,int clast,int efirst,int elast,int numCoefs,float rMax,float * derivs[],int numWalkers)2260 void one_body_derivs(float C[],
2261                      float* R[],
2262                      float* gradLogPsi[],
2263                      int cfirst,
2264                      int clast,
2265                      int efirst,
2266                      int elast,
2267                      int numCoefs,
2268                      float rMax,
2269                      float* derivs[],
2270                      int numWalkers)
2271 {
2272   const int BS = 32;
2273   dim3 dimBlock(BS);
2274   dim3 dimGrid(numWalkers);
2275   one_body_derivs_kernel<float, BS, 1>
2276       <<<dimGrid, dimBlock>>>(C, R, gradLogPsi, cfirst, clast, efirst, elast, numCoefs, rMax, derivs);
2277 }
2278 
2279 
one_body_derivs(double C[],double * R[],double * gradLogPsi[],int cfirst,int clast,int efirst,int elast,int numCoefs,double rMax,double * derivs[],int numWalkers)2280 void one_body_derivs(double C[],
2281                      double* R[],
2282                      double* gradLogPsi[],
2283                      int cfirst,
2284                      int clast,
2285                      int efirst,
2286                      int elast,
2287                      int numCoefs,
2288                      double rMax,
2289                      double* derivs[],
2290                      int numWalkers)
2291 {
2292   const int BS = 32;
2293   dim3 dimBlock(BS);
2294   dim3 dimGrid(numWalkers);
2295   one_body_derivs_kernel<double, BS, 1>
2296       <<<dimGrid, dimBlock>>>(C, R, gradLogPsi, cfirst, clast, efirst, elast, numCoefs, rMax, derivs);
2297 }
2298 
2299 // Ye: use offset to recycle the old routines
2300 // block size can be further optimized.
2301 #ifdef QMC_COMPLEX
one_body_derivs(float C[],float * R[],std::complex<float> * gradLogPsi[],int cfirst,int clast,int efirst,int elast,int numCoefs,float rMax,float * derivs[],int numWalkers)2302 void one_body_derivs(float C[],
2303                      float* R[],
2304                      std::complex<float>* gradLogPsi[],
2305                      int cfirst,
2306                      int clast,
2307                      int efirst,
2308                      int elast,
2309                      int numCoefs,
2310                      float rMax,
2311                      float* derivs[],
2312                      int numWalkers)
2313 {
2314   const int BS = 32;
2315   dim3 dimBlock(BS);
2316   dim3 dimGrid(numWalkers);
2317 
2318   one_body_derivs_kernel<float, BS, 2>
2319       <<<dimGrid, dimBlock>>>(C, R, (float**)gradLogPsi, cfirst, clast, efirst, elast, numCoefs, rMax, derivs);
2320 }
2321 
2322 
one_body_derivs(double C[],double * R[],std::complex<double> * gradLogPsi[],int cfirst,int clast,int efirst,int elast,int numCoefs,double rMax,double * derivs[],int numWalkers)2323 void one_body_derivs(double C[],
2324                      double* R[],
2325                      std::complex<double>* gradLogPsi[],
2326                      int cfirst,
2327                      int clast,
2328                      int efirst,
2329                      int elast,
2330                      int numCoefs,
2331                      double rMax,
2332                      double* derivs[],
2333                      int numWalkers)
2334 {
2335   const int BS = 32;
2336   dim3 dimBlock(BS);
2337   dim3 dimGrid(numWalkers);
2338 
2339   one_body_derivs_kernel<double, BS, 2>
2340       <<<dimGrid, dimBlock>>>(C, R, (double**)gradLogPsi, cfirst, clast, efirst, elast, numCoefs, rMax, derivs);
2341 }
2342 
2343 #endif
2344 
test()2345 void test()
2346 {
2347   dim3 dimBlock(32);
2348   dim3 dimGrid(1000);
2349   float* R[1000];
2350   float spline_coefs[10];
2351   float dr = 0.1;
2352   float sum[1000];
2353   two_body_sum_kernel<float, 32><<<dimGrid, dimBlock>>>(R, 0, 100, 0, 100, spline_coefs, 10, dr, sum);
2354 }
2355