1 //////////////////////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source License.
3 // See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
6 //
7 // File developed by: Ken Esler, kpesler@gmail.com, University of Illinois at Urbana-Champaign
8 // Luke Shulenburger, lshulen@sandia.gov, Sandia National Laboratories
9 // Jeremy McMinnis, jmcminis@gmail.com, University of Illinois at Urbana-Champaign
10 // Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois at Urbana-Champaign
11 // Paul R. C. Kent, kentpr@ornl.gov, Oak Ridge National Laboratory
12 // Ye Luo, yeluo@anl.gov, Argonne National Laboratory
13 //
14 // File created by: Ken Esler, kpesler@gmail.com, University of Illinois at Urbana-Champaign
15 //////////////////////////////////////////////////////////////////////////////////////
16
17
18 #define MAX_SPLINES 100
19 #include <stdio.h>
20 #include "BsplineJastrowCuda.h"
21 #include "CUDA_legacy/gpu_misc.h"
22
23 bool AisInitialized = false;
24
25 __constant__ float AcudaSpline[48];
26 __constant__ double AcudaSpline_double[48];
27
recipSqrt(float x)28 inline __device__ float recipSqrt(float x) { return rsqrtf(x); }
recipSqrt(double x)29 inline __device__ double recipSqrt(double x) { return rsqrt(x); }
30
dist(float dx,float dy,float dz)31 inline __device__ float dist(float dx, float dy, float dz) { return sqrtf(dx * dx + dy * dy + dz * dz); }
32
dist(double dx,double dy,double dz)33 inline __device__ double dist(double dx, double dy, double dz) { return sqrt(dx * dx + dy * dy + dz * dz); }
34
35
cuda_spline_init()36 void cuda_spline_init()
37 {
38 // clang-format off
39 float A_h[48] = { -1.0/6.0, 3.0/6.0, -3.0/6.0, 1.0/6.0,
40 3.0/6.0, -6.0/6.0, 0.0/6.0, 4.0/6.0,
41 -3.0/6.0, 3.0/6.0, 3.0/6.0, 1.0/6.0,
42 1.0/6.0, 0.0/6.0, 0.0/6.0, 0.0/6.0,
43 0.0, -0.5, 1.0, -0.5,
44 0.0, 1.5, -2.0, 0.0,
45 0.0, -1.5, 1.0, 0.5,
46 0.0, 0.5, 0.0, 0.0,
47 0.0, 0.0, -1.0, 1.0,
48 0.0, 0.0, 3.0, -2.0,
49 0.0, 0.0, -3.0, 1.0,
50 0.0, 0.0, 1.0, 0.0
51 };
52 cudaMemcpyToSymbol(AcudaSpline, A_h, 48*sizeof(float), 0,
53 cudaMemcpyHostToDevice);
54 double A_d[48] = { -1.0/6.0, 3.0/6.0, -3.0/6.0, 1.0/6.0,
55 3.0/6.0, -6.0/6.0, 0.0/6.0, 4.0/6.0,
56 -3.0/6.0, 3.0/6.0, 3.0/6.0, 1.0/6.0,
57 1.0/6.0, 0.0/6.0, 0.0/6.0, 0.0/6.0,
58 0.0, -0.5, 1.0, -0.5,
59 0.0, 1.5, -2.0, 0.0,
60 0.0, -1.5, 1.0, 0.5,
61 0.0, 0.5, 0.0, 0.0,
62 0.0, 0.0, -1.0, 1.0,
63 0.0, 0.0, 3.0, -2.0,
64 0.0, 0.0, -3.0, 1.0,
65 0.0, 0.0, 1.0, 0.0
66 };
67 cudaMemcpyToSymbol(AcudaSpline_double, A_d, 48*sizeof(double), 0,
68 cudaMemcpyHostToDevice);
69 AisInitialized = true;
70 // clang-format on
71 }
72
73
74 template<typename T>
eval_1d_spline(const T dist,const T rmax,const T drInv,const T A[4][4],T coefs[])75 __device__ __forceinline__ T eval_1d_spline(const T dist, const T rmax, const T drInv, const T A[4][4], T coefs[])
76 {
77 if (dist >= rmax)
78 return (T)0.0;
79 T s = dist * drInv;
80 T sf;
81 T t = modff(s, (float*)&sf);
82 int index = (int)sf;
83 T t2 = t * t;
84 T t3 = t * t2;
85 T coefs0 = coefs[index];
86 T coefs1 = coefs[index + 1];
87 T coefs2 = coefs[index + 2];
88 T coefs3 = coefs[index + 3];
89 T val0 = A[0][0] * t3 + A[0][1] * t2 + A[0][2] * t + A[0][3];
90 T val1 = A[1][0] * t3 + A[1][1] * t2 + A[1][2] * t + A[1][3];
91 T val2 = A[2][0] * t3 + A[2][1] * t2 + A[2][2] * t + A[2][3];
92 T val3 = A[3][0] * t3 + A[3][1] * t2 + A[3][2] * t + A[3][3];
93 return (coefs0 * val0 + coefs1 * val1 + coefs2 * val2 + coefs3 * val3);
94 }
95
96
97 template<typename T>
eval_1d_spline_vgl(T dist,T rmax,T drInv,T A[12][4],T coefs[],T & u,T & du,T & d2u)98 __device__ inline void eval_1d_spline_vgl(T dist, T rmax, T drInv, T A[12][4], T coefs[], T& u, T& du, T& d2u)
99 {
100 if (dist >= rmax)
101 {
102 u = du = d2u = (T)0.0;
103 return;
104 }
105 T s = dist * drInv;
106 T sf = floorf(s);
107 int index = (int)sf;
108 T t = s - sf;
109 T t2 = t * t;
110 T t3 = t * t2;
111 u = (coefs[index + 0] * (A[0][0] * t3 + A[0][1] * t2 + A[0][2] * t + A[0][3]) +
112 coefs[index + 1] * (A[1][0] * t3 + A[1][1] * t2 + A[1][2] * t + A[1][3]) +
113 coefs[index + 2] * (A[2][0] * t3 + A[2][1] * t2 + A[2][2] * t + A[2][3]) +
114 coefs[index + 3] * (A[3][0] * t3 + A[3][1] * t2 + A[3][2] * t + A[3][3]));
115 du = drInv *
116 (coefs[index + 0] * (A[4][0] * t3 + A[4][1] * t2 + A[4][2] * t + A[4][3]) +
117 coefs[index + 1] * (A[5][0] * t3 + A[5][1] * t2 + A[5][2] * t + A[5][3]) +
118 coefs[index + 2] * (A[6][0] * t3 + A[6][1] * t2 + A[6][2] * t + A[6][3]) +
119 coefs[index + 3] * (A[7][0] * t3 + A[7][1] * t2 + A[7][2] * t + A[7][3]));
120 d2u = drInv * drInv *
121 (coefs[index + 0] * (A[8][0] * t3 + A[8][1] * t2 + A[8][2] * t + A[8][3]) +
122 coefs[index + 1] * (A[9][0] * t3 + A[9][1] * t2 + A[9][2] * t + A[9][3]) +
123 coefs[index + 2] * (A[10][0] * t3 + A[10][1] * t2 + A[10][2] * t + A[10][3]) +
124 coefs[index + 3] * (A[11][0] * t3 + A[11][1] * t2 + A[11][2] * t + A[11][3]));
125 }
126
127
128 #define MAX_COEFS 32
129 template<typename T, int BS>
two_body_sum_kernel(T ** R,int e1_first,int e1_last,int e2_first,int e2_last,T * spline_coefs,int numCoefs,T rMax,T * sum)130 __global__ void two_body_sum_kernel(T** R,
131 int e1_first,
132 int e1_last,
133 int e2_first,
134 int e2_last,
135 T* spline_coefs,
136 int numCoefs,
137 T rMax,
138 T* sum)
139 {
140 T dr = rMax / (T)(numCoefs - 3);
141 T drInv = 1.0 / dr;
142 __syncthreads();
143 // Safety for rounding error
144 rMax *= 0.999999f;
145 int tid = threadIdx.x;
146 __shared__ T* myR;
147 if (tid == 0)
148 myR = R[blockIdx.x];
149 __shared__ T coefs[MAX_COEFS];
150 if (tid < numCoefs)
151 coefs[tid] = spline_coefs[tid];
152 __shared__ T r1[BS][3], r2[BS][3];
153 __shared__ T A[4][4];
154 if (tid < 16)
155 A[tid >> 2][tid & 3] = AcudaSpline[tid];
156 __syncthreads();
157 int N1 = e1_last - e1_first + 1;
158 int N2 = e2_last - e2_first + 1;
159 int NB1 = N1 / BS + ((N1 % BS) ? 1 : 0);
160 int NB2 = N2 / BS + ((N2 % BS) ? 1 : 0);
161 T mysum = (T)0.0;
162 for (int b1 = 0; b1 < NB1; b1++)
163 {
164 // Load block of positions from global memory
165 for (int i = 0; i < 3; i++)
166 if ((3 * b1 + i) * BS + tid < 3 * N1)
167 r1[0][i * BS + tid] = myR[3 * e1_first + (3 * b1 + i) * BS + tid];
168 __syncthreads();
169 int ptcl1 = e1_first + b1 * BS + tid;
170 for (int b2 = 0; b2 < NB2; b2++)
171 {
172 // Load block of positions from global memory
173 for (int i = 0; i < 3; i++)
174 if ((3 * b2 + i) * BS + tid < 3 * N2)
175 r2[0][i * BS + tid] = myR[3 * e2_first + (3 * b2 + i) * BS + tid];
176 __syncthreads();
177 // Now, loop over particles
178 int end = (b2 + 1) * BS < N2 ? BS : N2 - b2 * BS;
179 for (int j = 0; j < end; j++)
180 {
181 int ptcl2 = e2_first + b2 * BS + j;
182 T dx, dy, dz;
183 dx = r2[j][0] - r1[tid][0];
184 dy = r2[j][1] - r1[tid][1];
185 dz = r2[j][2] - r1[tid][2];
186 T d = dist(dx, dy, dz);
187 if (ptcl1 != ptcl2 && (ptcl1 < (N1 + e1_first)) && (ptcl2 < (N2 + e2_first)))
188 mysum += eval_1d_spline(d, rMax, drInv, A, coefs);
189 }
190 __syncthreads();
191 }
192 }
193 __shared__ T shared_sum[BS];
194 shared_sum[tid] = mysum;
195 __syncthreads();
196 for (int s = BS >> 1; s > 0; s >>= 1)
197 {
198 if (tid < s)
199 shared_sum[tid] += shared_sum[tid + s];
200 __syncthreads();
201 }
202 T factor = (e1_first == e2_first) ? 0.5 : 1.0;
203 if (tid == 0)
204 sum[blockIdx.x] += factor * shared_sum[0];
205 }
206
two_body_sum(float * R[],int e1_first,int e1_last,int e2_first,int e2_last,float spline_coefs[],int numCoefs,float rMax,float sum[],int numWalkers)207 void two_body_sum(float* R[],
208 int e1_first,
209 int e1_last,
210 int e2_first,
211 int e2_last,
212 float spline_coefs[],
213 int numCoefs,
214 float rMax,
215 float sum[],
216 int numWalkers)
217 {
218 if (!AisInitialized)
219 cuda_spline_init();
220 const int BS = 128;
221 dim3 dimBlock(BS);
222 dim3 dimGrid(numWalkers);
223 two_body_sum_kernel<float, BS>
224 <<<dimGrid, dimBlock>>>(R, e1_first, e1_last, e2_first, e2_last, spline_coefs, numCoefs, rMax, sum);
225 }
226
227
two_body_sum(double * R[],int e1_first,int e1_last,int e2_first,int e2_last,double spline_coefs[],int numCoefs,double rMax,double sum[],int numWalkers)228 void two_body_sum(double* R[],
229 int e1_first,
230 int e1_last,
231 int e2_first,
232 int e2_last,
233 double spline_coefs[],
234 int numCoefs,
235 double rMax,
236 double sum[],
237 int numWalkers)
238 {
239 if (!AisInitialized)
240 cuda_spline_init();
241 const int BS = 128;
242 dim3 dimBlock(BS);
243 dim3 dimGrid(numWalkers);
244 two_body_sum_kernel<double, BS>
245 <<<dimGrid, dimBlock>>>(R, e1_first, e1_last, e2_first, e2_last, spline_coefs, numCoefs, rMax, sum);
246 }
247
248
249 template<typename T, int BS>
two_body_ratio_kernel(T ** R,int first,int last,T * Rnew,int inew,int offset,T * spline_coefs,int numCoefs,T rMax,T * sum)250 __global__ void two_body_ratio_kernel(T** R,
251 int first,
252 int last,
253 T* Rnew,
254 int inew,
255 int offset,
256 T* spline_coefs,
257 int numCoefs,
258 T rMax,
259 T* sum)
260 {
261 T dr = rMax / (T)(numCoefs - 3);
262 T drInv = 1.0 / dr;
263 __syncthreads();
264 // Safety for rounding error
265 rMax *= 0.999999f;
266 int tid = threadIdx.x;
267 __shared__ T* myR;
268 __shared__ T myRnew[3], myRold[3];
269 if (tid == 0)
270 myR = R[blockIdx.x];
271 __syncthreads();
272 if (tid < 3)
273 {
274 myRnew[tid] = Rnew[3 * (blockIdx.x + offset) + tid];
275 myRold[tid] = myR[3 * inew + tid];
276 }
277 __syncthreads();
278 __shared__ T coefs[MAX_COEFS];
279 __shared__ T r1[BS][3];
280 if (tid < numCoefs)
281 coefs[tid] = spline_coefs[tid];
282 __shared__ T A[4][4];
283 if (tid < 16)
284 A[(tid >> 2)][tid & 3] = AcudaSpline[tid];
285 __syncthreads();
286 int N = last - first + 1;
287 int NB = N / BS + ((N % BS) ? 1 : 0);
288 __shared__ T shared_sum[BS];
289 shared_sum[tid] = (T)0.0;
290 for (int b = 0; b < NB; b++)
291 {
292 // Load block of positions from global memory
293 for (int i = 0; i < 3; i++)
294 {
295 int n = i * BS + tid;
296 if ((3 * b + i) * BS + tid < 3 * N)
297 r1[0][n] = myR[3 * first + (3 * b + i) * BS + tid];
298 }
299 __syncthreads();
300 int ptcl1 = first + b * BS + tid;
301 T dx, dy, dz;
302 dx = myRnew[0] - r1[tid][0];
303 dy = myRnew[1] - r1[tid][1];
304 dz = myRnew[2] - r1[tid][2];
305 T d = dist(dx, dy, dz);
306 T delta = eval_1d_spline(d, rMax, drInv, A, coefs);
307 dx = myRold[0] - r1[tid][0];
308 dy = myRold[1] - r1[tid][1];
309 dz = myRold[2] - r1[tid][2];
310 d = dist(dx, dy, dz);
311 delta -= eval_1d_spline(d, rMax, drInv, A, coefs);
312 if (ptcl1 != inew && (ptcl1 < (N + first)))
313 shared_sum[tid] += delta;
314 __syncthreads();
315 }
316 __syncthreads();
317 for (int s = (BS >> 1); s > 0; s >>= 1)
318 {
319 if (tid < s)
320 shared_sum[tid] += shared_sum[tid + s];
321 __syncthreads();
322 }
323 if (tid == 0)
324 sum[blockIdx.x] += shared_sum[0];
325 }
326
327
two_body_ratio(float * R[],int first,int last,float Rnew[],int inew,int offset,float spline_coefs[],int numCoefs,float rMax,float sum[],int numWalkers)328 void two_body_ratio(float* R[],
329 int first,
330 int last,
331 float Rnew[],
332 int inew,
333 int offset,
334 float spline_coefs[],
335 int numCoefs,
336 float rMax,
337 float sum[],
338 int numWalkers)
339 {
340 if (!AisInitialized)
341 cuda_spline_init();
342 const int BS = 128;
343 dim3 dimBlock(BS);
344 dim3 dimGrid(numWalkers);
345 two_body_ratio_kernel<float, BS>
346 <<<dimGrid, dimBlock>>>(R, first, last, Rnew, inew, offset, spline_coefs, numCoefs, rMax, sum);
347 }
348
349
two_body_ratio(double * R[],int first,int last,double Rnew[],int inew,int offset,double spline_coefs[],int numCoefs,double rMax,double sum[],int numWalkers)350 void two_body_ratio(double* R[],
351 int first,
352 int last,
353 double Rnew[],
354 int inew,
355 int offset,
356 double spline_coefs[],
357 int numCoefs,
358 double rMax,
359 double sum[],
360 int numWalkers)
361 {
362 if (!AisInitialized)
363 cuda_spline_init();
364 dim3 dimBlock(128);
365 dim3 dimGrid(numWalkers);
366 two_body_ratio_kernel<double, 128>
367 <<<dimGrid, dimBlock>>>(R, first, last, Rnew, inew, offset, spline_coefs, numCoefs, rMax, sum);
368 }
369
370
371 template<typename T, int BS>
two_body_ratio_grad_kernel(T ** R,int first,int last,T * Rnew,int inew,int offset,T * spline_coefs,int numCoefs,T rMax,bool zero,T * ratio_grad)372 __global__ void two_body_ratio_grad_kernel(T** R,
373 int first,
374 int last,
375 T* Rnew,
376 int inew,
377 int offset,
378 T* spline_coefs,
379 int numCoefs,
380 T rMax,
381 bool zero,
382 T* ratio_grad)
383 {
384 int tid = threadIdx.x;
385 T dr = rMax / (T)(numCoefs - 3);
386 T drInv = 1.0 / dr;
387 __syncthreads();
388 // Safety for rounding error
389 rMax *= 0.999999f;
390 __shared__ T* myR;
391 __shared__ T myRnew[3], myRold[3];
392 if (tid == 0)
393 myR = R[blockIdx.x];
394 __syncthreads();
395 if (tid < 3)
396 {
397 myRnew[tid] = Rnew[3 * (blockIdx.x + offset) + tid];
398 myRold[tid] = myR[3 * inew + tid];
399 }
400 __syncthreads();
401 __shared__ T coefs[MAX_COEFS];
402 __shared__ T r1[BS][3];
403 if (tid < numCoefs)
404 coefs[tid] = spline_coefs[tid];
405 __syncthreads();
406 __shared__ T A[12][4];
407 if (tid < 16)
408 {
409 A[0 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 0];
410 A[4 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 16];
411 A[8 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 32];
412 }
413 __syncthreads();
414 int N = last - first + 1;
415 int NB = N / BS + ((N % BS) ? 1 : 0);
416 __shared__ T shared_sum[BS];
417 __shared__ T shared_grad[BS][3];
418 shared_sum[tid] = (T)0.0;
419 shared_grad[tid][0] = shared_grad[tid][1] = shared_grad[tid][2] = 0.0f;
420 for (int b = 0; b < NB; b++)
421 {
422 // Load block of positions from global memory
423 for (int i = 0; i < 3; i++)
424 {
425 int n = i * BS + tid;
426 if ((3 * b + i) * BS + tid < 3 * N)
427 r1[0][n] = myR[3 * first + (3 * b + i) * BS + tid];
428 }
429 __syncthreads();
430 int ptcl1 = first + b * BS + tid;
431 T dx, dy, dz, u, du, d2u, delta, d;
432 dx = myRold[0] - r1[tid][0];
433 dy = myRold[1] - r1[tid][1];
434 dz = myRold[2] - r1[tid][2];
435 d = dist(dx, dy, dz);
436 delta = -eval_1d_spline(d, rMax, drInv, A, coefs);
437 dx = myRnew[0] - r1[tid][0];
438 dy = myRnew[1] - r1[tid][1];
439 dz = myRnew[2] - r1[tid][2];
440 d = dist(dx, dy, dz);
441 eval_1d_spline_vgl(d, rMax, drInv, A, coefs, u, du, d2u);
442 delta += u;
443 if (ptcl1 != inew && (ptcl1 < (N + first)))
444 {
445 du /= d;
446 shared_sum[tid] += delta;
447 shared_grad[tid][0] += du * dx;
448 shared_grad[tid][1] += du * dy;
449 shared_grad[tid][2] += du * dz;
450 }
451 __syncthreads();
452 }
453 __syncthreads();
454 for (int s = (BS >> 1); s > 0; s >>= 1)
455 {
456 if (tid < s)
457 {
458 shared_sum[tid] += shared_sum[tid + s];
459 shared_grad[tid][0] += shared_grad[tid + s][0];
460 shared_grad[tid][1] += shared_grad[tid + s][1];
461 shared_grad[tid][2] += shared_grad[tid + s][2];
462 }
463 __syncthreads();
464 }
465 if (tid == 0)
466 {
467 if (zero)
468 {
469 ratio_grad[4 * blockIdx.x + 0] = shared_sum[0];
470 ratio_grad[4 * blockIdx.x + 1] = shared_grad[0][0];
471 ratio_grad[4 * blockIdx.x + 2] = shared_grad[0][1];
472 ratio_grad[4 * blockIdx.x + 3] = shared_grad[0][2];
473 }
474 else
475 {
476 ratio_grad[4 * blockIdx.x + 0] += shared_sum[0];
477 ratio_grad[4 * blockIdx.x + 1] += shared_grad[0][0];
478 ratio_grad[4 * blockIdx.x + 2] += shared_grad[0][1];
479 ratio_grad[4 * blockIdx.x + 3] += shared_grad[0][2];
480 }
481 }
482 }
483
484
two_body_ratio_grad(float * R[],int first,int last,float Rnew[],int inew,int offset,float spline_coefs[],int numCoefs,float rMax,bool zero,float ratio_grad[],int numWalkers)485 void two_body_ratio_grad(float* R[],
486 int first,
487 int last,
488 float Rnew[],
489 int inew,
490 int offset,
491 float spline_coefs[],
492 int numCoefs,
493 float rMax,
494 bool zero,
495 float ratio_grad[],
496 int numWalkers)
497 {
498 if (!AisInitialized)
499 cuda_spline_init();
500 const int BS = 32;
501 dim3 dimBlock(BS);
502 dim3 dimGrid(numWalkers);
503 two_body_ratio_grad_kernel<float, BS>
504 <<<dimGrid, dimBlock>>>(R, first, last, Rnew, inew, offset, spline_coefs, numCoefs, rMax, zero, ratio_grad);
505 }
506
507
two_body_ratio_grad(double * R[],int first,int last,double Rnew[],int inew,int offset,double spline_coefs[],int numCoefs,double rMax,bool zero,double ratio_grad[],int numWalkers)508 void two_body_ratio_grad(double* R[],
509 int first,
510 int last,
511 double Rnew[],
512 int inew,
513 int offset,
514 double spline_coefs[],
515 int numCoefs,
516 double rMax,
517 bool zero,
518 double ratio_grad[],
519 int numWalkers)
520 {
521 if (!AisInitialized)
522 cuda_spline_init();
523 const int BS = 32;
524 dim3 dimBlock(BS);
525 dim3 dimGrid(numWalkers);
526 two_body_ratio_grad_kernel<double, BS>
527 <<<dimGrid, dimBlock>>>(R, first, last, Rnew, inew, offset, spline_coefs, numCoefs, rMax, zero, ratio_grad);
528 }
529
530
531 template<int BS>
two_body_NLratio_kernel(NLjobGPU<float> * jobs,int first,int last,float ** spline_coefs,int * numCoefs,float * rMaxList)532 __global__ void two_body_NLratio_kernel(NLjobGPU<float>* jobs,
533 int first,
534 int last,
535 float** spline_coefs,
536 int* numCoefs,
537 float* rMaxList)
538 {
539 const int MAX_RATIOS = 18;
540 int tid = threadIdx.x;
541 __shared__ NLjobGPU<float> myJob;
542 __shared__ float myRnew[MAX_RATIOS][3], myRold[3];
543 __shared__ float* myCoefs;
544 __shared__ int myNumCoefs;
545 __shared__ float rMax;
546 if (tid == 0)
547 {
548 myJob = jobs[blockIdx.x];
549 myCoefs = spline_coefs[blockIdx.x];
550 myNumCoefs = numCoefs[blockIdx.x];
551 rMax = rMaxList[blockIdx.x];
552 }
553 __syncthreads();
554 if (tid < 3)
555 myRold[tid] = myJob.R[3 * myJob.Elec + tid];
556 for (int i = 0; i < 3; i++)
557 if (i * BS + tid < 3 * myJob.NumQuadPoints)
558 myRnew[0][i * BS + tid] = myJob.QuadPoints[i * BS + tid];
559 float dr = rMax / (float)(myNumCoefs - 3);
560 float drInv = 1.0 / dr;
561 __shared__ float coefs[MAX_COEFS];
562 // __shared__ float r1[BS][3];
563 if (tid < myNumCoefs)
564 coefs[tid] = myCoefs[tid];
565 __shared__ float A[4][4];
566 if (tid < 16)
567 A[(tid >> 2)][tid & 3] = AcudaSpline[tid];
568 __syncthreads();
569 int N = last - first + 1;
570 int NB = N / BS + ((N % BS) ? 1 : 0);
571 __shared__ float shared_sum[MAX_RATIOS][BS + 1];
572 for (int iq = 0; iq < myJob.NumQuadPoints; iq++)
573 shared_sum[iq][tid] = (float)0.0;
574 for (int b = 0; b < NB * BS; b += BS)
575 {
576 float3 r1_r = ((float3*)(myJob.R))[first + b + tid];
577 int ptcl1 = first + b + tid;
578 if (ptcl1 != myJob.Elec && (ptcl1 < (N + first)))
579 {
580 float dx, dy, dz;
581 dx = myRold[0] - r1_r.x;
582 dy = myRold[1] - r1_r.y;
583 dz = myRold[2] - r1_r.z;
584 float d = dist(dx, dy, dz);
585 float uOld = eval_1d_spline(d, rMax, drInv, A, coefs);
586 for (int iq = 0; iq < myJob.NumQuadPoints; iq++)
587 {
588 dx = myRnew[iq][0] - r1_r.x;
589 dy = myRnew[iq][1] - r1_r.y;
590 dz = myRnew[iq][2] - r1_r.z;
591 d = dist(dx, dy, dz);
592 shared_sum[iq][tid] += eval_1d_spline(d, rMax, drInv, A, coefs) - uOld;
593 }
594 }
595 }
596 __syncthreads();
597 for (int s = (BS >> 1); s > 0; s >>= 1)
598 {
599 if (tid < s)
600 for (int iq = 0; iq < myJob.NumQuadPoints; iq++)
601 shared_sum[iq][tid] += shared_sum[iq][tid + s];
602 __syncthreads();
603 }
604 if (tid < myJob.NumQuadPoints)
605 myJob.Ratios[tid] *= expf(-shared_sum[tid][0]); // note this is single-precision!
606 }
607
608
609 template<int BS>
two_body_NLratio_kernel(NLjobGPU<double> * jobs,int first,int last,double ** spline_coefs,int * numCoefs,double * rMaxList)610 __global__ void two_body_NLratio_kernel(NLjobGPU<double>* jobs,
611 int first,
612 int last,
613 double** spline_coefs,
614 int* numCoefs,
615 double* rMaxList)
616 {
617 const int MAX_RATIOS = 18;
618 int tid = threadIdx.x;
619 __shared__ NLjobGPU<double> myJob;
620 __shared__ double myRnew[MAX_RATIOS][3], myRold[3];
621 __shared__ double* myCoefs;
622 __shared__ int myNumCoefs;
623 __shared__ double rMax;
624 if (tid == 0)
625 {
626 myJob = jobs[blockIdx.x];
627 myCoefs = spline_coefs[blockIdx.x];
628 myNumCoefs = numCoefs[blockIdx.x];
629 rMax = rMaxList[blockIdx.x];
630 }
631 __syncthreads();
632 if (tid < 3)
633 myRold[tid] = myJob.R[3 * myJob.Elec + tid];
634 for (int i = 0; i < 3; i++)
635 if (i * BS + tid < 3 * myJob.NumQuadPoints)
636 myRnew[0][i * BS + tid] = myJob.QuadPoints[i * BS + tid];
637 __syncthreads();
638 double dr = rMax / (double)(myNumCoefs - 3);
639 double drInv = 1.0 / dr;
640 __shared__ double coefs[MAX_COEFS];
641 __shared__ double r1[BS][3];
642 if (tid < myNumCoefs)
643 coefs[tid] = myCoefs[tid];
644 __syncthreads();
645 __shared__ double A[4][4];
646 if (tid < 16)
647 A[(tid >> 2)][tid & 3] = AcudaSpline[tid];
648 __syncthreads();
649 int N = last - first + 1;
650 int NB = N / BS + ((N % BS) ? 1 : 0);
651 __shared__ double shared_sum[MAX_RATIOS][BS + 1];
652 for (int iq = 0; iq < myJob.NumQuadPoints; iq++)
653 shared_sum[iq][tid] = (double)0.0;
654 for (int b = 0; b < NB; b++)
655 {
656 // Load block of positions from global memory
657 for (int i = 0; i < 3; i++)
658 {
659 int n = i * BS + tid;
660 if ((3 * b + i) * BS + tid < 3 * N)
661 r1[0][n] = myJob.R[3 * first + (3 * b + i) * BS + tid];
662 }
663 __syncthreads();
664 int ptcl1 = first + b * BS + tid;
665 double dx, dy, dz;
666 dx = myRold[0] - r1[tid][0];
667 dy = myRold[1] - r1[tid][1];
668 dz = myRold[2] - r1[tid][2];
669 double d = dist(dx, dy, dz);
670 double uOld = eval_1d_spline(d, rMax, drInv, A, coefs);
671 for (int iq = 0; iq < myJob.NumQuadPoints; iq++)
672 {
673 dx = myRnew[iq][0] - r1[tid][0];
674 dy = myRnew[iq][1] - r1[tid][1];
675 dz = myRnew[iq][2] - r1[tid][2];
676 d = dist(dx, dy, dz);
677 if (ptcl1 != myJob.Elec && (ptcl1 < (N + first)))
678 shared_sum[iq][tid] += eval_1d_spline(d, rMax, drInv, A, coefs) - uOld;
679 }
680 __syncthreads();
681 }
682 for (int s = (BS >> 1); s > 0; s >>= 1)
683 {
684 if (tid < s)
685 for (int iq = 0; iq < myJob.NumQuadPoints; iq++)
686 shared_sum[iq][tid] += shared_sum[iq][tid + s];
687 __syncthreads();
688 }
689 if (tid < myJob.NumQuadPoints)
690 myJob.Ratios[tid] *= exp(-shared_sum[tid][0]);
691 }
692
693
two_body_NLratios(NLjobGPU<float> jobs[],int first,int last,float * spline_coefs[],int numCoefs[],float rMax[],int numjobs)694 void two_body_NLratios(NLjobGPU<float> jobs[],
695 int first,
696 int last,
697 float* spline_coefs[],
698 int numCoefs[],
699 float rMax[],
700 int numjobs)
701 {
702 if (!AisInitialized)
703 cuda_spline_init();
704 const int BS = 32;
705 dim3 dimBlock(BS);
706 while (numjobs > 65535)
707 {
708 dim3 dimGrid(65535);
709 two_body_NLratio_kernel<BS><<<dimGrid, dimBlock>>>(jobs, first, last, spline_coefs, numCoefs, rMax);
710 jobs += 65535;
711 numjobs -= 65535;
712 }
713 dim3 dimGrid(numjobs);
714 two_body_NLratio_kernel<BS><<<dimGrid, dimBlock>>>(jobs, first, last, spline_coefs, numCoefs, rMax);
715 }
716
717
two_body_NLratios(NLjobGPU<double> jobs[],int first,int last,double * spline_coefs[],int numCoefs[],double rMax[],int numjobs)718 void two_body_NLratios(NLjobGPU<double> jobs[],
719 int first,
720 int last,
721 double* spline_coefs[],
722 int numCoefs[],
723 double rMax[],
724 int numjobs)
725 {
726 if (!AisInitialized)
727 cuda_spline_init();
728 const int BS = 32;
729 dim3 dimBlock(BS);
730 while (numjobs > 65535)
731 {
732 dim3 dimGrid(65535);
733 two_body_NLratio_kernel<BS><<<dimGrid, dimBlock>>>(jobs, first, last, spline_coefs, numCoefs, rMax);
734 jobs += 65535;
735 numjobs -= 65535;
736 }
737 dim3 dimGrid(numjobs);
738 two_body_NLratio_kernel<BS><<<dimGrid, dimBlock>>>(jobs, first, last, spline_coefs, numCoefs, rMax);
739 }
740
741
742 #define MAX_COEFS 32
743
744 template<typename T, int BS>
two_body_grad_lapl_kernel(T ** R,int e1_first,int e1_last,int e2_first,int e2_last,T * spline_coefs,int numCoefs,T rMax,T * gradLapl,int row_stride)745 __global__ void two_body_grad_lapl_kernel(T** R,
746 int e1_first,
747 int e1_last,
748 int e2_first,
749 int e2_last,
750 T* spline_coefs,
751 int numCoefs,
752 T rMax,
753 T* gradLapl,
754 int row_stride)
755 {
756 T dr = rMax / (T)(numCoefs - 3);
757 T drInv = 1.0 / dr;
758 __syncthreads();
759 // Safety for rounding error
760 rMax *= 0.999999f;
761 int tid = threadIdx.x;
762 __shared__ T* myR;
763 if (tid == 0)
764 myR = R[blockIdx.x];
765 __shared__ T coefs[MAX_COEFS];
766 if (tid < numCoefs)
767 coefs[tid] = spline_coefs[tid];
768 __shared__ T r1[BS][3], r2[BS][3];
769 __shared__ T A[12][4];
770 if (tid < 16)
771 {
772 A[0 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 0];
773 A[4 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 16];
774 A[8 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 32];
775 }
776 __syncthreads();
777 int N1 = e1_last - e1_first + 1;
778 int N2 = e2_last - e2_first + 1;
779 int NB1 = N1 / BS + ((N1 % BS) ? 1 : 0);
780 int NB2 = N2 / BS + ((N2 % BS) ? 1 : 0);
781 __shared__ T sGradLapl[BS][4];
782 for (int b1 = 0; b1 < NB1; b1++)
783 {
784 // Load block of positions from global memory
785 for (int i = 0; i < 3; i++)
786 if ((3 * b1 + i) * BS + tid < 3 * N1)
787 r1[0][i * BS + tid] = myR[3 * e1_first + (3 * b1 + i) * BS + tid];
788 __syncthreads();
789 int ptcl1 = e1_first + b1 * BS + tid;
790 int offset = blockIdx.x * row_stride + 4 * b1 * BS + 4 * e1_first;
791 sGradLapl[tid][0] = sGradLapl[tid][1] = sGradLapl[tid][2] = sGradLapl[tid][3] = (T)0.0;
792 for (int b2 = 0; b2 < NB2; b2++)
793 {
794 // Load block of positions from global memory
795 for (int i = 0; i < 3; i++)
796 if ((3 * b2 + i) * BS + tid < 3 * N2)
797 r2[0][i * BS + tid] = myR[3 * e2_first + (3 * b2 + i) * BS + tid];
798 __syncthreads();
799 // Now, loop over particles
800 int end = (b2 + 1) * BS < N2 ? BS : N2 - b2 * BS;
801 for (int j = 0; j < end; j++)
802 {
803 int ptcl2 = e2_first + b2 * BS + j;
804 T dx, dy, dz, u, du, d2u;
805 dx = r2[j][0] - r1[tid][0];
806 dy = r2[j][1] - r1[tid][1];
807 dz = r2[j][2] - r1[tid][2];
808 T d = dist(dx, dy, dz);
809 eval_1d_spline_vgl(d, rMax, drInv, A, coefs, u, du, d2u);
810 if (ptcl1 != ptcl2 && (ptcl1 < (N1 + e1_first)) && (ptcl2 < (N2 + e2_first)))
811 {
812 du /= d;
813 sGradLapl[tid][0] += du * dx;
814 sGradLapl[tid][1] += du * dy;
815 sGradLapl[tid][2] += du * dz;
816 sGradLapl[tid][3] -= d2u + 2.0 * du;
817 }
818 }
819 __syncthreads();
820 }
821 for (int i = 0; i < 4; i++)
822 if ((4 * b1 + i) * BS + tid < 4 * N1)
823 gradLapl[offset + i * BS + tid] += sGradLapl[0][i * BS + tid];
824 __syncthreads();
825 }
826 }
827
828
two_body_grad_lapl(float * R[],int e1_first,int e1_last,int e2_first,int e2_last,float spline_coefs[],int numCoefs,float rMax,float gradLapl[],int row_stride,int numWalkers)829 void two_body_grad_lapl(float* R[],
830 int e1_first,
831 int e1_last,
832 int e2_first,
833 int e2_last,
834 float spline_coefs[],
835 int numCoefs,
836 float rMax,
837 float gradLapl[],
838 int row_stride,
839 int numWalkers)
840 {
841 const int BS = 32;
842 dim3 dimBlock(BS);
843 dim3 dimGrid(numWalkers);
844 two_body_grad_lapl_kernel<float, BS><<<dimGrid, dimBlock>>>(R,
845 e1_first,
846 e1_last,
847 e2_first,
848 e2_last,
849 spline_coefs,
850 numCoefs,
851 rMax,
852 gradLapl,
853 row_stride);
854 }
855
856
two_body_grad_lapl(double * R[],int e1_first,int e1_last,int e2_first,int e2_last,double spline_coefs[],int numCoefs,double rMax,double gradLapl[],int row_stride,int numWalkers)857 void two_body_grad_lapl(double* R[],
858 int e1_first,
859 int e1_last,
860 int e2_first,
861 int e2_last,
862 double spline_coefs[],
863 int numCoefs,
864 double rMax,
865 double gradLapl[],
866 int row_stride,
867 int numWalkers)
868 {
869 if (!AisInitialized)
870 cuda_spline_init();
871 const int BS = 32;
872 dim3 dimBlock(BS);
873 dim3 dimGrid(numWalkers);
874 two_body_grad_lapl_kernel<double, BS><<<dimGrid, dimBlock>>>(R,
875 e1_first,
876 e1_last,
877 e2_first,
878 e2_last,
879 spline_coefs,
880 numCoefs,
881 rMax,
882 gradLapl,
883 row_stride);
884 }
885
886
887 template<typename T, int BS>
two_body_grad_kernel(T ** R,int first,int last,int iat,T * spline_coefs,int numCoefs,T rMax,bool zeroOut,T * grad)888 __global__ void two_body_grad_kernel(T** R,
889 int first,
890 int last,
891 int iat,
892 T* spline_coefs,
893 int numCoefs,
894 T rMax,
895 bool zeroOut,
896 T* grad)
897 {
898 T dr = rMax / (T)(numCoefs - 3);
899 T drInv = 1.0 / dr;
900 __syncthreads();
901 // Safety for rounding error
902 rMax *= 0.999999f;
903 int tid = threadIdx.x;
904 __shared__ T *myR, r2[3];
905 if (tid == 0)
906 myR = R[blockIdx.x];
907 __syncthreads();
908 if (tid < 3)
909 r2[tid] = myR[3 * iat + tid];
910 __shared__ T coefs[MAX_COEFS];
911 if (tid < numCoefs)
912 coefs[tid] = spline_coefs[tid];
913 __shared__ T r1[BS][3];
914 __shared__ T A[12][4];
915 if (tid < 16)
916 {
917 A[0 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 0];
918 A[4 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 16];
919 A[8 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 32];
920 }
921 __syncthreads();
922 int N = last - first + 1;
923 int NB = N / BS + ((N % BS) ? 1 : 0);
924 __shared__ T sGrad[BS][3];
925 sGrad[tid][0] = sGrad[tid][1] = sGrad[tid][2] = (T)0.0;
926 for (int b = 0; b < NB; b++)
927 {
928 // Load block of positions from global memory
929 for (int i = 0; i < 3; i++)
930 if ((3 * b + i) * BS + tid < 3 * N)
931 r1[0][i * BS + tid] = myR[3 * first + (3 * b + i) * BS + tid];
932 __syncthreads();
933 int ptcl1 = first + b * BS + tid;
934 T dx, dy, dz, u, du, d2u;
935 dx = r2[0] - r1[tid][0];
936 dy = r2[1] - r1[tid][1];
937 dz = r2[2] - r1[tid][2];
938 T d = dist(dx, dy, dz);
939 eval_1d_spline_vgl(d, rMax, drInv, A, coefs, u, du, d2u);
940 if (ptcl1 != iat && ptcl1 < (N + first))
941 {
942 du /= d;
943 sGrad[tid][0] += du * dx;
944 sGrad[tid][1] += du * dy;
945 sGrad[tid][2] += du * dz;
946 }
947 __syncthreads();
948 }
949 // Do reduction across threads in block
950 for (int s = BS >> 1; s > 0; s >>= 1)
951 {
952 if (tid < s)
953 {
954 sGrad[tid][0] += sGrad[tid + s][0];
955 sGrad[tid][1] += sGrad[tid + s][1];
956 sGrad[tid][2] += sGrad[tid + s][2];
957 }
958 __syncthreads();
959 }
960 if (tid < 3)
961 {
962 if (zeroOut)
963 grad[3 * blockIdx.x + tid] = sGrad[0][tid];
964 else
965 grad[3 * blockIdx.x + tid] += sGrad[0][tid];
966 }
967 }
968
969
two_body_gradient(float * R[],int first,int last,int iat,float spline_coefs[],int numCoefs,float rMax,bool zeroOut,float grad[],int numWalkers)970 void two_body_gradient(float* R[],
971 int first,
972 int last,
973 int iat,
974 float spline_coefs[],
975 int numCoefs,
976 float rMax,
977 bool zeroOut,
978 float grad[],
979 int numWalkers)
980 {
981 const int BS = 32;
982 dim3 dimBlock(BS);
983 dim3 dimGrid(numWalkers);
984 two_body_grad_kernel<float, BS>
985 <<<dimGrid, dimBlock, 0, gpu::kernelStream>>>(R, first, last, iat, spline_coefs, numCoefs, rMax, zeroOut, grad);
986 }
987
988
two_body_gradient(double * R[],int first,int last,int iat,double spline_coefs[],int numCoefs,double rMax,bool zeroOut,double grad[],int numWalkers)989 void two_body_gradient(double* R[],
990 int first,
991 int last,
992 int iat,
993 double spline_coefs[],
994 int numCoefs,
995 double rMax,
996 bool zeroOut,
997 double grad[],
998 int numWalkers)
999 {
1000 if (!AisInitialized)
1001 cuda_spline_init();
1002 const int BS = 32;
1003 dim3 dimBlock(BS);
1004 dim3 dimGrid(numWalkers);
1005 two_body_grad_kernel<double, BS>
1006 <<<dimGrid, dimBlock, 0, gpu::kernelStream>>>(R, first, last, iat, spline_coefs, numCoefs, rMax, zeroOut, grad);
1007 }
1008
1009
1010 template<typename T, int BS, unsigned COMPLEX>
two_body_derivs_kernel(T ** R,T ** gradLogPsi,int e1_first,int e1_last,int e2_first,int e2_last,int numCoefs,T rMax,T ** derivs)1011 __global__ void two_body_derivs_kernel(T** R,
1012 T** gradLogPsi,
1013 int e1_first,
1014 int e1_last,
1015 int e2_first,
1016 int e2_last,
1017 int numCoefs,
1018 T rMax,
1019 T** derivs)
1020 {
1021 T dr = rMax / (T)(numCoefs - 3);
1022 T drInv = 1.0f / dr;
1023 __syncthreads();
1024 // Safety for rounding error
1025 rMax *= 0.999999f;
1026 int tid = threadIdx.x;
1027 __shared__ T *myR, *myGrad, *myDerivs;
1028 if (tid == 0)
1029 {
1030 myR = R[blockIdx.x];
1031 myGrad = gradLogPsi[blockIdx.x];
1032 myDerivs = derivs[blockIdx.x];
1033 }
1034 __shared__ T sderivs[MAX_COEFS][2];
1035 // __shared__ T coefs[MAX_COEFS];
1036 // if (tid < numCoefs)
1037 // coefs[tid] = spline_coefs[tid];
1038 __shared__ T r1[BS][3], r2[BS][3];
1039 __shared__ T A[12][4];
1040 if (tid < 16)
1041 {
1042 A[0 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 0];
1043 A[4 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 16];
1044 A[8 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 32];
1045 }
1046 __syncthreads();
1047 sderivs[tid][0] = T();
1048 sderivs[tid][1] = T();
1049 int N1 = e1_last - e1_first + 1;
1050 int N2 = e2_last - e2_first + 1;
1051 int NB1 = N1 / BS + ((N1 % BS) ? 1 : 0);
1052 int NB2 = N2 / BS + ((N2 % BS) ? 1 : 0);
1053 __shared__ T sGrad[BS][3];
1054 for (int b1 = 0; b1 < NB1; b1++)
1055 {
1056 // Load block of positions from global memory
1057 for (int i = 0; i < 3; i++)
1058 if ((3 * b1 + i) * BS + tid < 3 * N1)
1059 {
1060 int outoff = i * BS + tid;
1061 int inoff = outoff + 3 * e1_first + 3 * b1 * BS;
1062 r1[0][outoff] = myR[inoff]; //[3*e1_first + (3*b1+i)*BS + tid];
1063 sGrad[0][outoff] = myGrad[inoff * COMPLEX];
1064 }
1065 __syncthreads();
1066 int ptcl1 = e1_first + b1 * BS + tid;
1067 for (int b2 = 0; b2 < NB2; b2++)
1068 {
1069 // Load block of positions from global memory
1070 for (int i = 0; i < 3; i++)
1071 if ((3 * b2 + i) * BS + tid < 3 * N2)
1072 r2[0][i * BS + tid] = myR[3 * e2_first + (3 * b2 + i) * BS + tid];
1073 __syncthreads();
1074 // Now, loop over particles
1075 int end = (b2 + 1) * BS < N2 ? BS : N2 - b2 * BS;
1076 for (int j = 0; j < end; j++)
1077 {
1078 int ptcl2 = e2_first + b2 * BS + j;
1079 T dx, dy, dz;
1080 dx = r2[j][0] - r1[tid][0];
1081 dy = r2[j][1] - r1[tid][1];
1082 dz = r2[j][2] - r1[tid][2];
1083 T d = dist(dx, dy, dz);
1084 T dInv = 1.0f / d;
1085 T s = d * drInv;
1086 T sf = floorf(s);
1087 int index = (int)sf;
1088 T t = s - sf;
1089 T t2 = t * t;
1090 T t3 = t * t2;
1091 T v0, v1, v2, v3;
1092 // sderivs[index+0][0] += (A[0][0]*t3 + A[0][1]*t2 + A[0][2]*t + A[0][3]);
1093 // sderivs[index+1][0] += (A[1][0]*t3 + A[1][1]*t2 + A[1][2]*t + A[1][3]);
1094 // sderivs[index+2][0] += (A[2][0]*t3 + A[2][1]*t2 + A[2][2]*t + A[2][3]);
1095 // sderivs[index+3][0] += (A[3][0]*t3 + A[3][1]*t2 + A[3][2]*t + A[3][3]);
1096 v0 = (A[0][0] * t3 + A[0][1] * t2 + A[0][2] * t + A[0][3]);
1097 v1 = (A[1][0] * t3 + A[1][1] * t2 + A[1][2] * t + A[1][3]);
1098 v2 = (A[2][0] * t3 + A[2][1] * t2 + A[2][2] * t + A[2][3]);
1099 v3 = (A[3][0] * t3 + A[3][1] * t2 + A[3][2] * t + A[3][3]);
1100 for (int id = 0; id < BS; id++)
1101 if (tid == id && ptcl1 != ptcl2 && ptcl1 <= e1_last && (d < rMax))
1102 {
1103 sderivs[index + 0][0] += v0;
1104 sderivs[index + 1][0] += v1;
1105 sderivs[index + 2][0] += v2;
1106 sderivs[index + 3][0] += v3;
1107 }
1108 T prefact = (dx * sGrad[tid][0] + dy * sGrad[tid][1] + dz * sGrad[tid][2]) * dInv;
1109 T du0 = drInv * (A[4][0] * t3 + A[4][1] * t2 + A[4][2] * t + A[4][3]);
1110 T du1 = drInv * (A[5][0] * t3 + A[5][1] * t2 + A[5][2] * t + A[5][3]);
1111 T du2 = drInv * (A[6][0] * t3 + A[6][1] * t2 + A[6][2] * t + A[6][3]);
1112 T du3 = drInv * (A[7][0] * t3 + A[7][1] * t2 + A[7][2] * t + A[7][3]);
1113 // This is the dot (gradu, grad_log_psi) term.
1114 v0 = 2.0f * prefact * du0;
1115 v1 = 2.0f * prefact * du1;
1116 v2 = 2.0f * prefact * du2;
1117 v3 = 2.0f * prefact * du3;
1118 // This is the lapl u term
1119 v0 -= drInv * drInv * (A[8][0] * t3 + A[8][1] * t2 + A[8][2] * t + A[8][3]) + 2.0f * du0 * dInv;
1120 v1 -= drInv * drInv * (A[9][0] * t3 + A[9][1] * t2 + A[9][2] * t + A[9][3]) + 2.0f * du1 * dInv;
1121 v2 -= drInv * drInv * (A[10][0] * t3 + A[10][1] * t2 + A[10][2] * t + A[10][3]) + 2.0f * du2 * dInv;
1122 v3 -= drInv * drInv * (A[11][0] * t3 + A[11][1] * t2 + A[11][2] * t + A[11][3]) + 2.0f * du3 * dInv;
1123 for (int id = 0; id < BS; id++)
1124 if (tid == id && ptcl1 != ptcl2 && ptcl1 <= e1_last && (d < rMax))
1125 {
1126 sderivs[index + 0][1] += v0;
1127 sderivs[index + 1][1] += v1;
1128 sderivs[index + 2][1] += v2;
1129 sderivs[index + 3][1] += v3;
1130 }
1131 }
1132 __syncthreads();
1133 }
1134 }
1135 // if (e1_first == e2_first)
1136 sderivs[tid][0] *= 0.5f;
1137 sderivs[tid][1] *= 0.5f;
1138 if (tid < 2 * numCoefs)
1139 myDerivs[tid] = -sderivs[0][tid];
1140 if (tid + BS < 2 * numCoefs)
1141 myDerivs[tid + BS] = sderivs[0][tid + BS];
1142 }
1143
1144
two_body_derivs(float * R[],float * gradLogPsi[],int e1_first,int e1_last,int e2_first,int e2_last,int numCoefs,float rMax,float * derivs[],int numWalkers)1145 void two_body_derivs(float* R[],
1146 float* gradLogPsi[],
1147 int e1_first,
1148 int e1_last,
1149 int e2_first,
1150 int e2_last,
1151 int numCoefs,
1152 float rMax,
1153 float* derivs[],
1154 int numWalkers)
1155 {
1156 const int BS = 32;
1157 dim3 dimBlock(BS);
1158 dim3 dimGrid(numWalkers);
1159 two_body_derivs_kernel<float, BS, 1>
1160 <<<dimGrid, dimBlock>>>(R, gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs, rMax, derivs);
1161 }
1162
two_body_derivs(double * R[],double * gradLogPsi[],int e1_first,int e1_last,int e2_first,int e2_last,int numCoefs,double rMax,double * derivs[],int numWalkers)1163 void two_body_derivs(double* R[],
1164 double* gradLogPsi[],
1165 int e1_first,
1166 int e1_last,
1167 int e2_first,
1168 int e2_last,
1169 int numCoefs,
1170 double rMax,
1171 double* derivs[],
1172 int numWalkers)
1173 {
1174 const int BS = 32;
1175 dim3 dimBlock(BS);
1176 dim3 dimGrid(numWalkers);
1177 two_body_derivs_kernel<double, BS, 1>
1178 <<<dimGrid, dimBlock>>>(R, gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs, rMax, derivs);
1179 }
1180
1181
1182 // Ye: use offset to recycle the old routines
1183 // block size can be further optimized.
1184 #ifdef QMC_COMPLEX
two_body_derivs(float * R[],std::complex<float> * gradLogPsi[],int e1_first,int e1_last,int e2_first,int e2_last,int numCoefs,float rMax,float * derivs[],int numWalkers)1185 void two_body_derivs(float* R[],
1186 std::complex<float>* gradLogPsi[],
1187 int e1_first,
1188 int e1_last,
1189 int e2_first,
1190 int e2_last,
1191 int numCoefs,
1192 float rMax,
1193 float* derivs[],
1194 int numWalkers)
1195 {
1196 const int BS = 32;
1197 dim3 dimBlock(BS);
1198 dim3 dimGrid(numWalkers);
1199
1200 two_body_derivs_kernel<float, BS, 2>
1201 <<<dimGrid, dimBlock>>>(R, (float**)gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs, rMax, derivs);
1202 }
1203
two_body_derivs(double * R[],std::complex<double> * gradLogPsi[],int e1_first,int e1_last,int e2_first,int e2_last,int numCoefs,double rMax,double * derivs[],int numWalkers)1204 void two_body_derivs(double* R[],
1205 std::complex<double>* gradLogPsi[],
1206 int e1_first,
1207 int e1_last,
1208 int e2_first,
1209 int e2_last,
1210 int numCoefs,
1211 double rMax,
1212 double* derivs[],
1213 int numWalkers)
1214 {
1215 const int BS = 32;
1216 dim3 dimBlock(BS);
1217 dim3 dimGrid(numWalkers);
1218
1219 two_body_derivs_kernel<double, BS, 2>
1220 <<<dimGrid, dimBlock>>>(R, (double**)gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs, rMax, derivs);
1221 }
1222 #endif
1223
1224
1225 ////////////////////////////////////////////////////////////////
1226 // One-body routines //
1227 ////////////////////////////////////////////////////////////////
1228
1229 template<typename T, int BS>
one_body_sum_kernel(T * C,T ** R,int cfirst,int clast,int efirst,int elast,T * spline_coefs,int numCoefs,T rMax,T * sum)1230 __global__ void one_body_sum_kernel(T* C,
1231 T** R,
1232 int cfirst,
1233 int clast,
1234 int efirst,
1235 int elast,
1236 T* spline_coefs,
1237 int numCoefs,
1238 T rMax,
1239 T* sum)
1240 {
1241 T dr = rMax / (T)(numCoefs - 3);
1242 T drInv = 1.0 / dr;
1243 __syncthreads();
1244 // Safety for rounding error
1245 rMax *= 0.999999f;
1246 int tid = threadIdx.x;
1247 __shared__ T* myR;
1248 if (tid == 0)
1249 myR = R[blockIdx.x];
1250 __shared__ T coefs[MAX_COEFS];
1251 if (tid < numCoefs)
1252 coefs[tid] = spline_coefs[tid];
1253 __shared__ T rc[BS][3], re[BS][3];
1254 __shared__ T A[4][4];
1255 if (tid < 16)
1256 A[tid >> 2][tid & 3] = AcudaSpline[tid];
1257 __syncthreads();
1258 int Nc = clast - cfirst + 1;
1259 int Ne = elast - efirst + 1;
1260 int NBc = Nc / BS + ((Nc % BS) ? 1 : 0);
1261 int NBe = Ne / BS + ((Ne % BS) ? 1 : 0);
1262 T mysum = (T)0.0;
1263 for (int bc = 0; bc < NBc; bc++)
1264 {
1265 // Load block of positions from global memory
1266 for (int i = 0; i < 3; i++)
1267 if ((3 * bc + i) * BS + tid < 3 * Nc)
1268 rc[0][i * BS + tid] = C[3 * cfirst + (3 * bc + i) * BS + tid];
1269 __syncthreads();
1270 int ptcl1 = cfirst + bc * BS + tid;
1271 for (int be = 0; be < NBe; be++)
1272 {
1273 // Load block of positions from global memory
1274 for (int i = 0; i < 3; i++)
1275 if ((3 * be + i) * BS + tid < 3 * Ne)
1276 re[0][i * BS + tid] = myR[3 * efirst + (3 * be + i) * BS + tid];
1277 __syncthreads();
1278 // Now, loop over particles
1279 int end = (be + 1) * BS < Ne ? BS : Ne - be * BS;
1280 for (int j = 0; j < end; j++)
1281 {
1282 int ptcl2 = efirst + be * BS + j;
1283 T dx, dy, dz;
1284 dx = re[j][0] - rc[tid][0];
1285 dy = re[j][1] - rc[tid][1];
1286 dz = re[j][2] - rc[tid][2];
1287 T d = dist(dx, dy, dz);
1288 if ((ptcl1 < (Nc + cfirst)) && (ptcl2 < (Ne + efirst)))
1289 mysum += eval_1d_spline(d, rMax, drInv, A, coefs);
1290 }
1291 }
1292 __syncthreads();
1293 }
1294 __shared__ T shared_sum[BS];
1295 shared_sum[tid] = mysum;
1296 __syncthreads();
1297 for (int s = BS >> 1; s > 0; s >>= 1)
1298 {
1299 if (tid < s)
1300 shared_sum[tid] += shared_sum[tid + s];
1301 __syncthreads();
1302 }
1303 if (tid == 0)
1304 sum[blockIdx.x] += shared_sum[0];
1305 }
1306
one_body_sum(float C[],float * R[],int cfirst,int clast,int efirst,int elast,float spline_coefs[],int numCoefs,float rMax,float sum[],int numWalkers)1307 void one_body_sum(float C[],
1308 float* R[],
1309 int cfirst,
1310 int clast,
1311 int efirst,
1312 int elast,
1313 float spline_coefs[],
1314 int numCoefs,
1315 float rMax,
1316 float sum[],
1317 int numWalkers)
1318 {
1319 if (!AisInitialized)
1320 cuda_spline_init();
1321 const int BS = 32;
1322 dim3 dimBlock(BS);
1323 dim3 dimGrid(numWalkers);
1324 one_body_sum_kernel<float, BS>
1325 <<<dimGrid, dimBlock>>>(C, R, cfirst, clast, efirst, elast, spline_coefs, numCoefs, rMax, sum);
1326 }
1327
1328
one_body_sum(double C[],double * R[],int cfirst,int clast,int efirst,int elast,double spline_coefs[],int numCoefs,double rMax,double sum[],int numWalkers)1329 void one_body_sum(double C[],
1330 double* R[],
1331 int cfirst,
1332 int clast,
1333 int efirst,
1334 int elast,
1335 double spline_coefs[],
1336 int numCoefs,
1337 double rMax,
1338 double sum[],
1339 int numWalkers)
1340 {
1341 if (!AisInitialized)
1342 cuda_spline_init();
1343 const int BS = 128;
1344 dim3 dimBlock(BS);
1345 dim3 dimGrid(numWalkers);
1346 one_body_sum_kernel<double, BS>
1347 <<<dimGrid, dimBlock>>>(C, R, cfirst, clast, efirst, elast, spline_coefs, numCoefs, rMax, sum);
1348 }
1349
1350
1351 template<typename T, int BS>
one_body_ratio_kernel(T * C,T ** R,int cfirst,int clast,T * Rnew,int inew,T * spline_coefs,int numCoefs,int nw,T rMax,T * sum)1352 __global__ void one_body_ratio_kernel(T* C,
1353 T** R,
1354 int cfirst,
1355 int clast,
1356 T* Rnew,
1357 int inew,
1358 T* spline_coefs,
1359 int numCoefs,
1360 int nw,
1361 T rMax,
1362 T* sum)
1363 {
1364 T dr = rMax / (T)(numCoefs - 3);
1365 T drInv = 1.0 / dr;
1366 __syncthreads();
1367 // Safety for rounding error
1368 rMax *= 0.999999f;
1369 int tid = threadIdx.x;
1370 __shared__ T* myR;
1371 __shared__ T myRnew[3], myRold[3];
1372 if (tid == 0)
1373 myR = R[blockIdx.x % nw];
1374 __syncthreads();
1375 if (tid < 3)
1376 {
1377 myRnew[tid] = Rnew[3 * blockIdx.x + tid];
1378 myRold[tid] = myR[3 * (inew + blockIdx.x / nw) + tid];
1379 }
1380 __syncthreads();
1381 __shared__ T coefs[MAX_COEFS];
1382 __shared__ T c[BS][3];
1383 if (tid < numCoefs)
1384 coefs[tid] = spline_coefs[tid];
1385 __shared__ T A[4][4];
1386 if (tid < 16)
1387 A[(tid >> 2)][tid & 3] = AcudaSpline[tid];
1388 __syncthreads();
1389 int Nc = clast - cfirst + 1;
1390 int NB = Nc / BS + ((Nc % BS) ? 1 : 0);
1391 __shared__ T shared_sum[BS];
1392 shared_sum[tid] = (T)0.0;
1393 for (int b = 0; b < NB; b++)
1394 {
1395 // Load block of positions from global memory
1396 for (int i = 0; i < 3; i++)
1397 {
1398 int n = i * BS + tid;
1399 if ((3 * b + i) * BS + tid < 3 * Nc)
1400 c[0][n] = C[3 * cfirst + (3 * b + i) * BS + tid];
1401 }
1402 __syncthreads();
1403 int ptcl1 = cfirst + b * BS + tid;
1404 T dx, dy, dz;
1405 dx = myRnew[0] - c[tid][0];
1406 dy = myRnew[1] - c[tid][1];
1407 dz = myRnew[2] - c[tid][2];
1408 T d = dist(dx, dy, dz);
1409 T delta = eval_1d_spline(d, rMax, drInv, A, coefs);
1410 dx = myRold[0] - c[tid][0];
1411 dy = myRold[1] - c[tid][1];
1412 dz = myRold[2] - c[tid][2];
1413 d = dist(dx, dy, dz);
1414 delta -= eval_1d_spline(d, rMax, drInv, A, coefs);
1415 if (ptcl1 < (Nc + cfirst))
1416 shared_sum[tid] += delta;
1417 __syncthreads();
1418 }
1419 for (int s = (BS >> 1); s > 0; s >>= 1)
1420 {
1421 if (tid < s)
1422 shared_sum[tid] += shared_sum[tid + s];
1423 __syncthreads();
1424 }
1425 if (tid == 0)
1426 sum[blockIdx.x] += shared_sum[0];
1427 }
1428
1429
one_body_ratio(float C[],float * R[],int first,int last,float Rnew[],int inew,float spline_coefs[],int numCoefs,int nw,float rMax,float sum[],int numWalkers)1430 void one_body_ratio(float C[],
1431 float* R[],
1432 int first,
1433 int last,
1434 float Rnew[],
1435 int inew,
1436 float spline_coefs[],
1437 int numCoefs,
1438 int nw,
1439 float rMax,
1440 float sum[],
1441 int numWalkers)
1442 {
1443 if (!AisInitialized)
1444 cuda_spline_init();
1445 const int BS = 32;
1446 dim3 dimBlock(BS);
1447 dim3 dimGrid(numWalkers);
1448 one_body_ratio_kernel<float, BS>
1449 <<<dimGrid, dimBlock>>>(C, R, first, last, Rnew, inew, spline_coefs, numCoefs, nw, rMax, sum);
1450 }
1451
1452
one_body_ratio(double C[],double * R[],int first,int last,double Rnew[],int inew,double spline_coefs[],int numCoefs,int nw,double rMax,double sum[],int numWalkers)1453 void one_body_ratio(double C[],
1454 double* R[],
1455 int first,
1456 int last,
1457 double Rnew[],
1458 int inew,
1459 double spline_coefs[],
1460 int numCoefs,
1461 int nw,
1462 double rMax,
1463 double sum[],
1464 int numWalkers)
1465 {
1466 if (!AisInitialized)
1467 cuda_spline_init();
1468 dim3 dimBlock(128);
1469 dim3 dimGrid(numWalkers);
1470 one_body_ratio_kernel<double, 128>
1471 <<<dimGrid, dimBlock>>>(C, R, first, last, Rnew, inew, spline_coefs, numCoefs, nw, rMax, sum);
1472 }
1473
1474
1475 template<typename T, int BS>
one_body_ratio_grad_kernel(T * C,T ** R,int cfirst,int clast,T * Rnew,int inew,T * spline_coefs,int numCoefs,int nw,T rMax,bool zero,T * ratio_grad)1476 __global__ void one_body_ratio_grad_kernel(T* C,
1477 T** R,
1478 int cfirst,
1479 int clast,
1480 T* Rnew,
1481 int inew,
1482 T* spline_coefs,
1483 int numCoefs,
1484 int nw,
1485 T rMax,
1486 bool zero,
1487 T* ratio_grad)
1488 {
1489 T dr = rMax / (T)(numCoefs - 3);
1490 T drInv = 1.0 / dr;
1491 __syncthreads();
1492 // Safety for rounding error
1493 rMax *= 0.999999f;
1494 int tid = threadIdx.x;
1495 __shared__ T* myR;
1496 __shared__ T myRnew[3], myRold[3];
1497 if (tid == 0)
1498 myR = R[blockIdx.x % nw];
1499 __syncthreads();
1500 if (tid < 3)
1501 {
1502 myRnew[tid] = Rnew[3 * blockIdx.x + tid];
1503 myRold[tid] = myR[3 * (inew + blockIdx.x / nw) + tid];
1504 }
1505 __syncthreads();
1506 __shared__ T coefs[MAX_COEFS];
1507 __shared__ T c[BS][3];
1508 if (tid < numCoefs)
1509 coefs[tid] = spline_coefs[tid];
1510 __shared__ T A[12][4];
1511 if (tid < 16)
1512 {
1513 A[0 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 0];
1514 A[4 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 16];
1515 A[8 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 32];
1516 }
1517 __syncthreads();
1518 int Nc = clast - cfirst + 1;
1519 int NB = Nc / BS + ((Nc % BS) ? 1 : 0);
1520 __shared__ T shared_sum[BS];
1521 __shared__ T shared_grad[BS][3];
1522 shared_sum[tid] = (T)0.0;
1523 shared_grad[tid][0] = shared_grad[tid][1] = shared_grad[tid][2] = 0.0f;
1524 for (int b = 0; b < NB; b++)
1525 {
1526 // Load block of positions from global memory
1527 for (int i = 0; i < 3; i++)
1528 {
1529 int n = i * BS + tid;
1530 if ((3 * b + i) * BS + tid < 3 * Nc)
1531 c[0][n] = C[3 * cfirst + (3 * b + i) * BS + tid];
1532 }
1533 __syncthreads();
1534 int ptcl1 = cfirst + b * BS + tid;
1535 T dx, dy, dz, d, delta, u, du, d2u;
1536 dx = myRold[0] - c[tid][0];
1537 dy = myRold[1] - c[tid][1];
1538 dz = myRold[2] - c[tid][2];
1539 d = dist(dx, dy, dz);
1540 delta = -eval_1d_spline(d, rMax, drInv, A, coefs);
1541 dx = myRnew[0] - c[tid][0];
1542 dy = myRnew[1] - c[tid][1];
1543 dz = myRnew[2] - c[tid][2];
1544 d = dist(dx, dy, dz);
1545 eval_1d_spline_vgl(d, rMax, drInv, A, coefs, u, du, d2u);
1546 delta += u;
1547 if (ptcl1 < (Nc + cfirst))
1548 {
1549 du /= d;
1550 shared_sum[tid] += delta;
1551 shared_grad[tid][0] += du * dx;
1552 shared_grad[tid][1] += du * dy;
1553 shared_grad[tid][2] += du * dz;
1554 }
1555 __syncthreads();
1556 }
1557 for (int s = (BS >> 1); s > 0; s >>= 1)
1558 {
1559 if (tid < s)
1560 {
1561 shared_sum[tid] += shared_sum[tid + s];
1562 shared_grad[tid][0] += shared_grad[tid + s][0];
1563 shared_grad[tid][1] += shared_grad[tid + s][1];
1564 shared_grad[tid][2] += shared_grad[tid + s][2];
1565 }
1566 __syncthreads();
1567 }
1568 if (tid == 0)
1569 {
1570 if (zero)
1571 {
1572 ratio_grad[4 * blockIdx.x + 0] = shared_sum[0];
1573 ratio_grad[4 * blockIdx.x + 1] = shared_grad[0][0];
1574 ratio_grad[4 * blockIdx.x + 2] = shared_grad[0][1];
1575 ratio_grad[4 * blockIdx.x + 3] = shared_grad[0][2];
1576 }
1577 else
1578 {
1579 ratio_grad[4 * blockIdx.x + 0] += shared_sum[0];
1580 ratio_grad[4 * blockIdx.x + 1] += shared_grad[0][0];
1581 ratio_grad[4 * blockIdx.x + 2] += shared_grad[0][1];
1582 ratio_grad[4 * blockIdx.x + 3] += shared_grad[0][2];
1583 }
1584 }
1585 }
1586
1587
one_body_ratio_grad(float C[],float * R[],int first,int last,float Rnew[],int inew,float spline_coefs[],int numCoefs,int nw,float rMax,bool zero,float ratio_grad[],int numWalkers)1588 void one_body_ratio_grad(float C[],
1589 float* R[],
1590 int first,
1591 int last,
1592 float Rnew[],
1593 int inew,
1594 float spline_coefs[],
1595 int numCoefs,
1596 int nw,
1597 float rMax,
1598 bool zero,
1599 float ratio_grad[],
1600 int numWalkers)
1601 {
1602 if (!AisInitialized)
1603 cuda_spline_init();
1604 const int BS = 32;
1605 dim3 dimBlock(BS);
1606 dim3 dimGrid(numWalkers);
1607 one_body_ratio_grad_kernel<float, BS>
1608 <<<dimGrid, dimBlock>>>(C, R, first, last, Rnew, inew, spline_coefs, numCoefs, nw, rMax, zero, ratio_grad);
1609 }
1610
1611
one_body_ratio_grad(double C[],double * R[],int first,int last,double Rnew[],int inew,double spline_coefs[],int numCoefs,int nw,double rMax,bool zero,double ratio_grad[],int numWalkers)1612 void one_body_ratio_grad(double C[],
1613 double* R[],
1614 int first,
1615 int last,
1616 double Rnew[],
1617 int inew,
1618 double spline_coefs[],
1619 int numCoefs,
1620 int nw,
1621 double rMax,
1622 bool zero,
1623 double ratio_grad[],
1624 int numWalkers)
1625 {
1626 if (!AisInitialized)
1627 cuda_spline_init();
1628 const int BS = 32;
1629 dim3 dimBlock(BS);
1630 dim3 dimGrid(numWalkers);
1631 one_body_ratio_grad_kernel<double, BS>
1632 <<<dimGrid, dimBlock>>>(C, R, first, last, Rnew, inew, spline_coefs, numCoefs, nw, rMax, zero, ratio_grad);
1633 }
1634
1635
1636 template<typename T, int BS>
one_body_grad_lapl_kernel(T * C,T ** R,int cfirst,int clast,int efirst,int elast,T * spline_coefs,int numCoefs,T rMax,T * gradLapl,int row_stride)1637 __global__ void one_body_grad_lapl_kernel(T* C,
1638 T** R,
1639 int cfirst,
1640 int clast,
1641 int efirst,
1642 int elast,
1643 T* spline_coefs,
1644 int numCoefs,
1645 T rMax,
1646 T* gradLapl,
1647 int row_stride)
1648 {
1649 T dr = rMax / (T)(numCoefs - 3);
1650 T drInv = 1.0 / dr;
1651 __syncthreads();
1652 // Safety for rounding error
1653 rMax *= 0.999999f;
1654 // rMax *= 0.99999f;
1655 int tid = threadIdx.x;
1656 __shared__ T* myR;
1657 if (tid == 0)
1658 myR = R[blockIdx.x];
1659 __shared__ T coefs[MAX_COEFS];
1660 if (tid < numCoefs)
1661 coefs[tid] = spline_coefs[tid];
1662 __shared__ T r[BS][3], c[BS][3];
1663 __syncthreads();
1664 __shared__ T A[12][4];
1665 if (tid < 16)
1666 {
1667 A[0 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 0];
1668 A[4 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 16];
1669 A[8 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 32];
1670 }
1671 __syncthreads();
1672 int Nc = clast - cfirst + 1;
1673 int Ne = elast - efirst + 1;
1674 int NBc = Nc / BS + ((Nc % BS) ? 1 : 0);
1675 int NBe = Ne / BS + ((Ne % BS) ? 1 : 0);
1676 __shared__ T sGradLapl[BS][4];
1677 for (int be = 0; be < NBe; be++)
1678 {
1679 // Load block of positions from global memory
1680 for (int i = 0; i < 3; i++)
1681 if ((3 * be + i) * BS + tid < 3 * Ne)
1682 r[0][i * BS + tid] = myR[3 * efirst + (3 * be + i) * BS + tid];
1683 __syncthreads();
1684 int eptcl = efirst + be * BS + tid;
1685 int offset = blockIdx.x * row_stride + 4 * be * BS + 4 * efirst;
1686 sGradLapl[tid][0] = sGradLapl[tid][1] = sGradLapl[tid][2] = sGradLapl[tid][3] = (T)0.0;
1687 for (int bc = 0; bc < NBc; bc++)
1688 {
1689 // Load block of positions from global memory
1690 for (int i = 0; i < 3; i++)
1691 if ((3 * bc + i) * BS + tid < 3 * Nc)
1692 c[0][i * BS + tid] = C[3 * cfirst + (3 * bc + i) * BS + tid];
1693 __syncthreads();
1694 // Now, loop over particles
1695 int end = ((bc + 1) * BS < Nc) ? BS : Nc - bc * BS;
1696 for (int j = 0; j < end; j++)
1697 {
1698 int cptcl = cfirst + bc * BS + j;
1699 T dx, dy, dz, u, du, d2u;
1700 dx = r[tid][0] - c[j][0];
1701 dy = r[tid][1] - c[j][1];
1702 dz = r[tid][2] - c[j][2];
1703 T d = dist(dx, dy, dz);
1704 eval_1d_spline_vgl(d, rMax, drInv, A, coefs, u, du, d2u);
1705 //u = du = d2u = 0.0f;
1706 if (cptcl < (Nc + cfirst) && (eptcl < (Ne + efirst)))
1707 {
1708 du /= d;
1709 sGradLapl[tid][0] -= du * dx;
1710 sGradLapl[tid][1] -= du * dy;
1711 sGradLapl[tid][2] -= du * dz;
1712 sGradLapl[tid][3] -= d2u + 2.0 * du;
1713 }
1714 }
1715 __syncthreads();
1716 }
1717 __syncthreads();
1718 for (int i = 0; i < 4; i++)
1719 if ((4 * be + i) * BS + tid < 4 * Ne)
1720 gradLapl[offset + i * BS + tid] += sGradLapl[0][i * BS + tid];
1721 __syncthreads();
1722 }
1723 }
1724
1725
one_body_grad_lapl(float C[],float * R[],int e1_first,int e1_last,int e2_first,int e2_last,float spline_coefs[],int numCoefs,float rMax,float gradLapl[],int row_stride,int numWalkers)1726 void one_body_grad_lapl(float C[],
1727 float* R[],
1728 int e1_first,
1729 int e1_last,
1730 int e2_first,
1731 int e2_last,
1732 float spline_coefs[],
1733 int numCoefs,
1734 float rMax,
1735 float gradLapl[],
1736 int row_stride,
1737 int numWalkers)
1738 {
1739 const int BS = 32;
1740 dim3 dimBlock(BS);
1741 dim3 dimGrid(numWalkers);
1742 one_body_grad_lapl_kernel<float, BS><<<dimGrid, dimBlock>>>(C,
1743 R,
1744 e1_first,
1745 e1_last,
1746 e2_first,
1747 e2_last,
1748 spline_coefs,
1749 numCoefs,
1750 rMax,
1751 gradLapl,
1752 row_stride);
1753 }
1754
1755
one_body_grad_lapl(double C[],double * R[],int e1_first,int e1_last,int e2_first,int e2_last,double spline_coefs[],int numCoefs,double rMax,double gradLapl[],int row_stride,int numWalkers)1756 void one_body_grad_lapl(double C[],
1757 double* R[],
1758 int e1_first,
1759 int e1_last,
1760 int e2_first,
1761 int e2_last,
1762 double spline_coefs[],
1763 int numCoefs,
1764 double rMax,
1765 double gradLapl[],
1766 int row_stride,
1767 int numWalkers)
1768 {
1769 if (!AisInitialized)
1770 cuda_spline_init();
1771 const int BS = 32;
1772 dim3 dimBlock(BS);
1773 dim3 dimGrid(numWalkers);
1774 one_body_grad_lapl_kernel<double, BS><<<dimGrid, dimBlock>>>(C,
1775 R,
1776 e1_first,
1777 e1_last,
1778 e2_first,
1779 e2_last,
1780 spline_coefs,
1781 numCoefs,
1782 rMax,
1783 gradLapl,
1784 row_stride);
1785 }
1786
1787
1788 template<int BS>
one_body_NLratio_kernel(NLjobGPU<float> * jobs,float * C,int first,int last,float * spline_coefs,int numCoefs,float rMax)1789 __global__ void one_body_NLratio_kernel(NLjobGPU<float>* jobs,
1790 float* C,
1791 int first,
1792 int last,
1793 float* spline_coefs,
1794 int numCoefs,
1795 float rMax)
1796 {
1797 const int MAX_RATIOS = 18;
1798 int tid = threadIdx.x;
1799 __shared__ NLjobGPU<float> myJob;
1800 __shared__ float myRnew[MAX_RATIOS][3], myRold[3];
1801 if (tid == 0)
1802 myJob = jobs[blockIdx.x];
1803 __syncthreads();
1804 if (tid < 3)
1805 myRold[tid] = myJob.R[3 * myJob.Elec + tid];
1806 for (int i = 0; i < 3; i++)
1807 if (i * BS + tid < 3 * myJob.NumQuadPoints)
1808 myRnew[0][i * BS + tid] = myJob.QuadPoints[i * BS + tid];
1809 __syncthreads();
1810 float dr = rMax / (float)(numCoefs - 3);
1811 float drInv = 1.0 / dr;
1812 __syncthreads();
1813 // Safety for rounding error
1814 rMax *= 0.999999f;
1815 __shared__ float coefs[MAX_COEFS];
1816 __shared__ float c[BS][3];
1817 if (tid < numCoefs)
1818 coefs[tid] = spline_coefs[tid];
1819 __syncthreads();
1820 __shared__ float A[4][4];
1821 if (tid < 16)
1822 A[(tid >> 2)][tid & 3] = AcudaSpline[tid];
1823 __syncthreads();
1824 int N = last - first + 1;
1825 int NB = N / BS + ((N % BS) ? 1 : 0);
1826 __shared__ float shared_sum[MAX_RATIOS][BS + 1];
1827 for (int iq = 0; iq < myJob.NumQuadPoints; iq++)
1828 shared_sum[iq][tid] = (float)0.0;
1829 for (int b = 0; b < NB; b++)
1830 {
1831 // Load block of positions from global memory
1832 for (int i = 0; i < 3; i++)
1833 {
1834 int n = i * BS + tid;
1835 if ((3 * b + i) * BS + tid < 3 * N)
1836 c[0][n] = C[3 * first + (3 * b + i) * BS + tid];
1837 }
1838 __syncthreads();
1839 int ptcl1 = first + b * BS + tid;
1840 float dx, dy, dz;
1841 dx = myRold[0] - c[tid][0];
1842 dy = myRold[1] - c[tid][1];
1843 dz = myRold[2] - c[tid][2];
1844 float d = dist(dx, dy, dz);
1845 float uOld = eval_1d_spline(d, rMax, drInv, A, coefs);
1846 for (int iq = 0; iq < myJob.NumQuadPoints; iq++)
1847 {
1848 dx = myRnew[iq][0] - c[tid][0];
1849 dy = myRnew[iq][1] - c[tid][1];
1850 dz = myRnew[iq][2] - c[tid][2];
1851 d = dist(dx, dy, dz);
1852 if (ptcl1 < (N + first))
1853 shared_sum[iq][tid] += eval_1d_spline(d, rMax, drInv, A, coefs) - uOld;
1854 }
1855 __syncthreads();
1856 }
1857 for (int s = (BS >> 1); s > 0; s >>= 1)
1858 {
1859 if (tid < s)
1860 for (int iq = 0; iq < myJob.NumQuadPoints; iq++)
1861 shared_sum[iq][tid] += shared_sum[iq][tid + s];
1862 __syncthreads();
1863 }
1864 if (tid < myJob.NumQuadPoints)
1865 myJob.Ratios[tid] *= exp(-shared_sum[tid][0]);
1866 }
1867
1868
1869 template<int BS>
one_body_NLratio_kernel(NLjobGPU<double> * jobs,double * C,int first,int last,double * spline_coefs,int numCoefs,double rMax)1870 __global__ void one_body_NLratio_kernel(NLjobGPU<double>* jobs,
1871 double* C,
1872 int first,
1873 int last,
1874 double* spline_coefs,
1875 int numCoefs,
1876 double rMax)
1877 {
1878 const int MAX_RATIOS = 18;
1879 int tid = threadIdx.x;
1880 __shared__ NLjobGPU<double> myJob;
1881 __shared__ double myRnew[MAX_RATIOS][3], myRold[3];
1882 if (tid == 0)
1883 myJob = jobs[blockIdx.x];
1884 __syncthreads();
1885 if (tid < 3)
1886 myRold[tid] = myJob.R[3 * myJob.Elec + tid];
1887 for (int i = 0; i < 3; i++)
1888 if (i * BS + tid < 3 * myJob.NumQuadPoints)
1889 myRnew[0][i * BS + tid] = myJob.QuadPoints[i * BS + tid];
1890 __syncthreads();
1891 double dr = rMax / (double)(numCoefs - 3);
1892 double drInv = 1.0 / dr;
1893 __shared__ double coefs[MAX_COEFS];
1894 __shared__ double c[BS][3];
1895 if (tid < numCoefs)
1896 coefs[tid] = spline_coefs[tid];
1897 __syncthreads();
1898 __shared__ double A[4][4];
1899 if (tid < 16)
1900 A[(tid >> 2)][tid & 3] = AcudaSpline[tid];
1901 __syncthreads();
1902 int N = last - first + 1;
1903 int NB = N / BS + ((N % BS) ? 1 : 0);
1904 __shared__ double shared_sum[MAX_RATIOS][BS + 1];
1905 for (int iq = 0; iq < myJob.NumQuadPoints; iq++)
1906 shared_sum[iq][tid] = (double)0.0;
1907 for (int b = 0; b < NB; b++)
1908 {
1909 // Load block of positions from global memory
1910 for (int i = 0; i < 3; i++)
1911 {
1912 int n = i * BS + tid;
1913 if ((3 * b + i) * BS + tid < 3 * N)
1914 c[0][n] = C[3 * first + (3 * b + i) * BS + tid];
1915 }
1916 __syncthreads();
1917 int ptcl1 = first + b * BS + tid;
1918 double dx, dy, dz;
1919 dx = myRold[0] - c[tid][0];
1920 dy = myRold[1] - c[tid][1];
1921 dz = myRold[2] - c[tid][2];
1922 double d = dist(dx, dy, dz);
1923 double uOld = eval_1d_spline(d, rMax, drInv, A, coefs);
1924 for (int iq = 0; iq < myJob.NumQuadPoints; iq++)
1925 {
1926 dx = myRnew[iq][0] - c[tid][0];
1927 dy = myRnew[iq][1] - c[tid][1];
1928 dz = myRnew[iq][2] - c[tid][2];
1929 d = dist(dx, dy, dz);
1930 if (ptcl1 < (N + first))
1931 shared_sum[iq][tid] += eval_1d_spline(d, rMax, drInv, A, coefs) - uOld;
1932 }
1933 __syncthreads();
1934 }
1935 for (int s = (BS >> 1); s > 0; s >>= 1)
1936 {
1937 if (tid < s)
1938 for (int iq = 0; iq < myJob.NumQuadPoints; iq++)
1939 shared_sum[iq][tid] += shared_sum[iq][tid + s];
1940 __syncthreads();
1941 }
1942 if (tid < myJob.NumQuadPoints)
1943 myJob.Ratios[tid] *= exp(-shared_sum[tid][0]);
1944 }
1945
1946
one_body_NLratios(NLjobGPU<float> jobs[],float C[],int first,int last,float spline_coefs[],int numCoefs,float rMax,int numjobs)1947 void one_body_NLratios(NLjobGPU<float> jobs[],
1948 float C[],
1949 int first,
1950 int last,
1951 float spline_coefs[],
1952 int numCoefs,
1953 float rMax,
1954 int numjobs)
1955 {
1956 if (!AisInitialized)
1957 cuda_spline_init();
1958 const int BS = 32;
1959 dim3 dimBlock(BS);
1960 while (numjobs > 65535)
1961 {
1962 dim3 dimGrid(65535);
1963 one_body_NLratio_kernel<BS><<<dimGrid, dimBlock>>>(jobs, C, first, last, spline_coefs, numCoefs, rMax);
1964 numjobs -= 65535;
1965 jobs += 65535;
1966 }
1967 dim3 dimGrid(numjobs);
1968 one_body_NLratio_kernel<BS><<<dimGrid, dimBlock>>>(jobs, C, first, last, spline_coefs, numCoefs, rMax);
1969 }
1970
1971
one_body_NLratios(NLjobGPU<double> jobs[],double C[],int first,int last,double spline_coefs[],int numCoefs,double rMax,int numjobs)1972 void one_body_NLratios(NLjobGPU<double> jobs[],
1973 double C[],
1974 int first,
1975 int last,
1976 double spline_coefs[],
1977 int numCoefs,
1978 double rMax,
1979 int numjobs)
1980 {
1981 if (!AisInitialized)
1982 cuda_spline_init();
1983 const int BS = 32;
1984 dim3 dimBlock(BS);
1985 int blockx = numjobs % 65535;
1986 int blocky = numjobs / 65535 + 1;
1987 dim3 dimGrid(blockx, blocky);
1988 one_body_NLratio_kernel<BS><<<dimGrid, dimBlock>>>(jobs, C, first, last, spline_coefs, numCoefs, rMax);
1989 }
1990
1991
1992 template<typename T, int BS>
one_body_grad_kernel(T ** R,int iat,T * C,int first,int last,T * spline_coefs,int numCoefs,T rMax,bool zeroOut,T * grad)1993 __global__ void one_body_grad_kernel(T** R,
1994 int iat,
1995 T* C,
1996 int first,
1997 int last,
1998 T* spline_coefs,
1999 int numCoefs,
2000 T rMax,
2001 bool zeroOut,
2002 T* grad)
2003 {
2004 T dr = rMax / (T)(numCoefs - 3);
2005 T drInv = 1.0 / dr;
2006 __syncthreads();
2007 // Safety for rounding error
2008 rMax *= 0.999999f;
2009 int tid = threadIdx.x;
2010 __shared__ T *myR, r[3];
2011 if (tid == 0)
2012 myR = R[blockIdx.x];
2013 __syncthreads();
2014 if (tid < 3)
2015 r[tid] = myR[3 * iat + tid];
2016 __shared__ T coefs[MAX_COEFS];
2017 if (tid < numCoefs)
2018 coefs[tid] = spline_coefs[tid];
2019 __shared__ T c[BS][3];
2020 __shared__ T A[12][4];
2021 if (tid < 16)
2022 {
2023 A[0 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 0];
2024 A[4 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 16];
2025 A[8 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 32];
2026 }
2027 __syncthreads();
2028 int N = last - first + 1;
2029 int NB = N / BS + ((N % BS) ? 1 : 0);
2030 __shared__ T sGrad[BS][3];
2031 sGrad[tid][0] = sGrad[tid][1] = sGrad[tid][2] = (T)0.0;
2032 for (int b = 0; b < NB; b++)
2033 {
2034 // Load block of positions from global memory
2035 for (int i = 0; i < 3; i++)
2036 if ((3 * b + i) * BS + tid < 3 * N)
2037 c[0][i * BS + tid] = C[3 * first + (3 * b + i) * BS + tid];
2038 __syncthreads();
2039 int ptcl1 = first + b * BS + tid;
2040 T dx, dy, dz, u, du, d2u;
2041 dx = r[0] - c[tid][0];
2042 dy = r[1] - c[tid][1];
2043 dz = r[2] - c[tid][2];
2044 T d = dist(dx, dy, dz);
2045 eval_1d_spline_vgl(d, rMax, drInv, A, coefs, u, du, d2u);
2046 if (ptcl1 < (N + first))
2047 {
2048 du /= d;
2049 sGrad[tid][0] += du * dx;
2050 sGrad[tid][1] += du * dy;
2051 sGrad[tid][2] += du * dz;
2052 }
2053 __syncthreads();
2054 }
2055 // Do reduction across threads in block
2056 for (int s = BS >> 1; s > 0; s >>= 1)
2057 {
2058 if (tid < s)
2059 {
2060 sGrad[tid][0] += sGrad[tid + s][0];
2061 sGrad[tid][1] += sGrad[tid + s][1];
2062 sGrad[tid][2] += sGrad[tid + s][2];
2063 }
2064 __syncthreads();
2065 }
2066 if (tid < 3)
2067 {
2068 if (zeroOut)
2069 grad[3 * blockIdx.x + tid] = sGrad[0][tid];
2070 else
2071 grad[3 * blockIdx.x + tid] += sGrad[0][tid];
2072 }
2073 }
2074
2075
one_body_gradient(float * Rlist[],int iat,float C[],int first,int last,float spline_coefs[],int num_coefs,float rMax,bool zeroSum,float grad[],int numWalkers)2076 void one_body_gradient(float* Rlist[],
2077 int iat,
2078 float C[],
2079 int first,
2080 int last,
2081 float spline_coefs[],
2082 int num_coefs,
2083 float rMax,
2084 bool zeroSum,
2085 float grad[],
2086 int numWalkers)
2087 {
2088 if (!AisInitialized)
2089 cuda_spline_init();
2090 const int BS = 32;
2091 dim3 dimBlock(BS);
2092 dim3 dimGrid(numWalkers);
2093 one_body_grad_kernel<float, BS><<<dimGrid, dimBlock, 0, gpu::kernelStream>>>(Rlist,
2094 iat,
2095 C,
2096 first,
2097 last,
2098 spline_coefs,
2099 num_coefs,
2100 rMax,
2101 zeroSum,
2102 grad);
2103 }
2104
one_body_gradient(double * Rlist[],int iat,double C[],int first,int last,double spline_coefs[],int num_coefs,double rMax,bool zeroSum,double grad[],int numWalkers)2105 void one_body_gradient(double* Rlist[],
2106 int iat,
2107 double C[],
2108 int first,
2109 int last,
2110 double spline_coefs[],
2111 int num_coefs,
2112 double rMax,
2113 bool zeroSum,
2114 double grad[],
2115 int numWalkers)
2116 {
2117 if (!AisInitialized)
2118 cuda_spline_init();
2119 const int BS = 32;
2120 dim3 dimBlock(BS);
2121 dim3 dimGrid(numWalkers);
2122 one_body_grad_kernel<double, BS><<<dimGrid, dimBlock, 0, gpu::kernelStream>>>(Rlist,
2123 iat,
2124 C,
2125 first,
2126 last,
2127 spline_coefs,
2128 num_coefs,
2129 rMax,
2130 zeroSum,
2131 grad);
2132 }
2133
2134
2135 template<typename T, int BS, unsigned COMPLEX>
one_body_derivs_kernel(T * C,T ** R,T ** gradLogPsi,int cfirst,int clast,int efirst,int elast,int numCoefs,T rMax,T ** derivs)2136 __global__ void one_body_derivs_kernel(T* C,
2137 T** R,
2138 T** gradLogPsi,
2139 int cfirst,
2140 int clast,
2141 int efirst,
2142 int elast,
2143 int numCoefs,
2144 T rMax,
2145 T** derivs)
2146 {
2147 T dr = rMax / (T)(numCoefs - 3);
2148 T drInv = 1.0 / dr;
2149 __syncthreads();
2150 // Safety for rounding error
2151 rMax *= 0.999999f;
2152 int tid = threadIdx.x;
2153 __shared__ T *myR, *myGrad, *myDerivs;
2154 if (tid == 0)
2155 {
2156 myR = R[blockIdx.x];
2157 myGrad = gradLogPsi[blockIdx.x];
2158 myDerivs = derivs[blockIdx.x];
2159 }
2160 __shared__ T sderivs[MAX_COEFS][2];
2161 __shared__ T r[BS][3], c[BS][3];
2162 __shared__ T A[12][4];
2163 if (tid < 16)
2164 {
2165 A[0 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 0];
2166 A[4 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 16];
2167 A[8 + (tid >> 2)][tid & 3] = AcudaSpline[tid + 32];
2168 }
2169 __syncthreads();
2170 sderivs[tid][0] = T();
2171 sderivs[tid][1] = T();
2172 int Nc = clast - cfirst + 1;
2173 int Ne = elast - efirst + 1;
2174 int NBc = (Nc + BS - 1) / BS;
2175 int NBe = (Ne + BS - 1) / BS;
2176 __shared__ T sGrad[BS][3];
2177 for (int be = 0; be < NBe; be++)
2178 {
2179 // Load block of positions from global memory
2180 for (int i = 0; i < 3; i++)
2181 if ((3 * be + i) * BS + tid < 3 * Ne)
2182 {
2183 int outoff = i * BS + tid;
2184 int inoff = outoff + 3 * efirst + 3 * be * BS;
2185 r[0][outoff] = myR[inoff];
2186 sGrad[0][outoff] = myGrad[inoff * COMPLEX];
2187 }
2188 __syncthreads();
2189 int eptcl = efirst + be * BS + tid;
2190 for (int bc = 0; bc < NBc; bc++)
2191 {
2192 // Load block of positions from global memory
2193 for (int i = 0; i < 3; i++)
2194 if ((3 * bc + i) * BS + tid < 3 * Nc)
2195 c[0][i * BS + tid] = C[3 * cfirst + (3 * bc + i) * BS + tid];
2196 __syncthreads();
2197 // Now, loop over particles
2198 int end = min(BS, Nc - bc * BS);
2199 for (int j = 0; j < end; j++)
2200 {
2201 T dx, dy, dz;
2202 dx = c[j][0] - r[tid][0];
2203 dy = c[j][1] - r[tid][1];
2204 dz = c[j][2] - r[tid][2];
2205 T d = dist(dx, dy, dz);
2206 T dInv = 1.0f / d;
2207 T s = d * drInv;
2208 T sf = floorf(s);
2209 int index = (int)sf;
2210 T t = s - sf;
2211 T t2 = t * t;
2212 T t3 = t * t2;
2213 T v0 = (A[0][0] * t3 + A[0][1] * t2 + A[0][2] * t + A[0][3]);
2214 T v1 = (A[1][0] * t3 + A[1][1] * t2 + A[1][2] * t + A[1][3]);
2215 T v2 = (A[2][0] * t3 + A[2][1] * t2 + A[2][2] * t + A[2][3]);
2216 T v3 = (A[3][0] * t3 + A[3][1] * t2 + A[3][2] * t + A[3][3]);
2217 for (int id = 0; id < BS; id++)
2218 if (tid == id && eptcl <= elast && (d < rMax))
2219 {
2220 sderivs[index + 0][0] += v0;
2221 sderivs[index + 1][0] += v1;
2222 sderivs[index + 2][0] += v2;
2223 sderivs[index + 3][0] += v3;
2224 }
2225 T prefact = (dx * sGrad[tid][0] + dy * sGrad[tid][1] + dz * sGrad[tid][2]) * dInv;
2226 T du0 = drInv * (A[4][0] * t3 + A[4][1] * t2 + A[4][2] * t + A[4][3]);
2227 T du1 = drInv * (A[5][0] * t3 + A[5][1] * t2 + A[5][2] * t + A[5][3]);
2228 T du2 = drInv * (A[6][0] * t3 + A[6][1] * t2 + A[6][2] * t + A[6][3]);
2229 T du3 = drInv * (A[7][0] * t3 + A[7][1] * t2 + A[7][2] * t + A[7][3]);
2230 // This is the dot (gradu, grad_log_psi) term.
2231 v0 = 2.0f * prefact * du0;
2232 v1 = 2.0f * prefact * du1;
2233 v2 = 2.0f * prefact * du2;
2234 v3 = 2.0f * prefact * du3;
2235 // This is the lapl u term
2236 v0 -= drInv * drInv * (A[8][0] * t3 + A[8][1] * t2 + A[8][2] * t + A[8][3]) + 2.0f * du0 * dInv;
2237 v1 -= drInv * drInv * (A[9][0] * t3 + A[9][1] * t2 + A[9][2] * t + A[9][3]) + 2.0f * du1 * dInv;
2238 v2 -= drInv * drInv * (A[10][0] * t3 + A[10][1] * t2 + A[10][2] * t + A[10][3]) + 2.0f * du2 * dInv;
2239 v3 -= drInv * drInv * (A[11][0] * t3 + A[11][1] * t2 + A[11][2] * t + A[11][3]) + 2.0f * du3 * dInv;
2240 for (int id = 0; id < BS; id++)
2241 if (tid == id && eptcl <= elast && (d < rMax))
2242 {
2243 sderivs[index + 0][1] += v0;
2244 sderivs[index + 1][1] += v1;
2245 sderivs[index + 2][1] += v2;
2246 sderivs[index + 3][1] += v3;
2247 }
2248 }
2249 __syncthreads();
2250 }
2251 }
2252 sderivs[tid][1] *= 0.5f;
2253 if (tid < 2 * numCoefs)
2254 myDerivs[tid] = -sderivs[0][tid];
2255 if (tid + BS < 2 * numCoefs)
2256 myDerivs[tid + BS] = -sderivs[0][tid + BS];
2257 }
2258
2259
one_body_derivs(float C[],float * R[],float * gradLogPsi[],int cfirst,int clast,int efirst,int elast,int numCoefs,float rMax,float * derivs[],int numWalkers)2260 void one_body_derivs(float C[],
2261 float* R[],
2262 float* gradLogPsi[],
2263 int cfirst,
2264 int clast,
2265 int efirst,
2266 int elast,
2267 int numCoefs,
2268 float rMax,
2269 float* derivs[],
2270 int numWalkers)
2271 {
2272 const int BS = 32;
2273 dim3 dimBlock(BS);
2274 dim3 dimGrid(numWalkers);
2275 one_body_derivs_kernel<float, BS, 1>
2276 <<<dimGrid, dimBlock>>>(C, R, gradLogPsi, cfirst, clast, efirst, elast, numCoefs, rMax, derivs);
2277 }
2278
2279
one_body_derivs(double C[],double * R[],double * gradLogPsi[],int cfirst,int clast,int efirst,int elast,int numCoefs,double rMax,double * derivs[],int numWalkers)2280 void one_body_derivs(double C[],
2281 double* R[],
2282 double* gradLogPsi[],
2283 int cfirst,
2284 int clast,
2285 int efirst,
2286 int elast,
2287 int numCoefs,
2288 double rMax,
2289 double* derivs[],
2290 int numWalkers)
2291 {
2292 const int BS = 32;
2293 dim3 dimBlock(BS);
2294 dim3 dimGrid(numWalkers);
2295 one_body_derivs_kernel<double, BS, 1>
2296 <<<dimGrid, dimBlock>>>(C, R, gradLogPsi, cfirst, clast, efirst, elast, numCoefs, rMax, derivs);
2297 }
2298
2299 // Ye: use offset to recycle the old routines
2300 // block size can be further optimized.
2301 #ifdef QMC_COMPLEX
one_body_derivs(float C[],float * R[],std::complex<float> * gradLogPsi[],int cfirst,int clast,int efirst,int elast,int numCoefs,float rMax,float * derivs[],int numWalkers)2302 void one_body_derivs(float C[],
2303 float* R[],
2304 std::complex<float>* gradLogPsi[],
2305 int cfirst,
2306 int clast,
2307 int efirst,
2308 int elast,
2309 int numCoefs,
2310 float rMax,
2311 float* derivs[],
2312 int numWalkers)
2313 {
2314 const int BS = 32;
2315 dim3 dimBlock(BS);
2316 dim3 dimGrid(numWalkers);
2317
2318 one_body_derivs_kernel<float, BS, 2>
2319 <<<dimGrid, dimBlock>>>(C, R, (float**)gradLogPsi, cfirst, clast, efirst, elast, numCoefs, rMax, derivs);
2320 }
2321
2322
one_body_derivs(double C[],double * R[],std::complex<double> * gradLogPsi[],int cfirst,int clast,int efirst,int elast,int numCoefs,double rMax,double * derivs[],int numWalkers)2323 void one_body_derivs(double C[],
2324 double* R[],
2325 std::complex<double>* gradLogPsi[],
2326 int cfirst,
2327 int clast,
2328 int efirst,
2329 int elast,
2330 int numCoefs,
2331 double rMax,
2332 double* derivs[],
2333 int numWalkers)
2334 {
2335 const int BS = 32;
2336 dim3 dimBlock(BS);
2337 dim3 dimGrid(numWalkers);
2338
2339 one_body_derivs_kernel<double, BS, 2>
2340 <<<dimGrid, dimBlock>>>(C, R, (double**)gradLogPsi, cfirst, clast, efirst, elast, numCoefs, rMax, derivs);
2341 }
2342
2343 #endif
2344
test()2345 void test()
2346 {
2347 dim3 dimBlock(32);
2348 dim3 dimGrid(1000);
2349 float* R[1000];
2350 float spline_coefs[10];
2351 float dr = 0.1;
2352 float sum[1000];
2353 two_body_sum_kernel<float, 32><<<dimGrid, dimBlock>>>(R, 0, 100, 0, 100, spline_coefs, 10, dr, sum);
2354 }
2355