1 //////////////////////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source License.
3 // See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
6 //
7 // File developed by: Ken Esler, kpesler@gmail.com, University of Illinois at Urbana-Champaign
8 // Jeremy McMinnis, jmcminis@gmail.com, University of Illinois at Urbana-Champaign
9 // Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois at Urbana-Champaign
10 // Paul R. C. Kent, kentpr@ornl.gov, Oak Ridge National Laboratory
11 // Ye Luo, yeluo@anl.gov, Argonne National Laboratory
12 //
13 // File created by: Ken Esler, kpesler@gmail.com, University of Illinois at Urbana-Champaign
14 //////////////////////////////////////////////////////////////////////////////////////
15
16
17 #include "determinant_update.h"
18 #include <stdio.h>
19 #include <unistd.h>
20 #include <stdlib.h>
21 #include <sys/time.h>
22 #ifdef QMC_COMPLEX
23 #include <thrust/complex.h>
24 // Prior to CUDA 9.0, bulk::uninitialized_array was included with the Thrust library
25 //#include <thrust/system/cuda/detail/bulk/uninitialized.hpp>
26 #include "uninitialized_array.hpp"
27 #endif
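// Note: uninitialized_array<T, BS> stands in for a plain __shared__ T array wherever
// T is thrust::complex, because __shared__ variables may not run a non-trivial
// constructor.  A rough sketch of such a wrapper (illustration only -- the real
// definition lives in uninitialized_array.hpp and may differ) is:
/*
template<typename T, int N>
struct uninitialized_array_sketch
{
  __device__ T& operator[](int i) { return reinterpret_cast<T*>(storage)[i]; }
  __device__ const T& operator[](int i) const { return reinterpret_cast<const T*>(storage)[i]; }
  alignas(T) unsigned char storage[N * sizeof(T)];
};
*/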
28 #include "CUDA_legacy/gpu_misc.h"
29
30 #define DET_BLOCK_SIZE 8
31
32 template<typename T, int BS>
33 __global__ void update_inverse_cuda1(updateJob* updateList, int N, int rowstride)
34 {
35 __shared__ T *A, *Ainv, *u, *Ainv_delta, *Ainv_colk;
36 __shared__ int k;
37 if (threadIdx.x == 0)
38 {
39 updateJob job = updateList[blockIdx.y];
40 A = (T*)job.A;
41 Ainv = (T*)job.Ainv;
42 u = (T*)job.newRow;
43 Ainv_delta = (T*)job.AinvDelta;
44 Ainv_colk = (T*)job.AinvColk;
45 k = job.iat;
46 }
47 __syncthreads();
48 // Store the product Ainv * u in shared memory
49 T Ainv_delta_tid;
50 __shared__ T Ainv_colk_shared[BS], delta[BS];
51 Ainv_delta_tid = 0.0f;
52 __syncthreads();
53 int col = blockIdx.x * BS + threadIdx.x;
54 int numblocks = (N + BS - 1) / BS;
55 int kBlock = k / BS;
56 // If the column I need to pull from Ainv is in this thread block
57 // domain, do the following
58 __syncthreads();
59 for (int block = 0; block < numblocks; block++)
60 {
61 delta[threadIdx.x] = u[block * BS + threadIdx.x] - A[k * rowstride + block * BS + threadIdx.x];
62 __syncthreads();
63 int istop = min(BS, N - block * BS);
64 for (int i = 0; i < istop; i++)
65 {
66 int row = block * BS + i;
67 T a = Ainv[row * rowstride + col];
68 if (col == k)
69 Ainv_colk_shared[i] = a;
70 Ainv_delta_tid += a * delta[i];
71 __syncthreads();
72 }
73 if (blockIdx.x == kBlock)
74 if (block * BS + threadIdx.x < N)
75 Ainv_colk[block * BS + threadIdx.x] = Ainv_colk_shared[threadIdx.x];
76 __syncthreads();
77 }
78 // Write the data back to global memory
79 if (col < N)
80 Ainv_delta[col] = Ainv_delta_tid;
81 __syncthreads();
82 }
83
84
85 template<typename T, int BS>
86 __global__ void update_inverse_cuda2(updateJob* updateList, int N, int rowstride)
87 {
88 __shared__ T *A, *u, *Ainv, *Ainv_delta, *Ainv_colk;
89 int tid = threadIdx.x;
90 __shared__ int k;
91 if (threadIdx.x == 0)
92 {
93 updateJob job = updateList[blockIdx.y];
94 A = (T*)job.A;
95 u = (T*)job.newRow;
96 Ainv = (T*)job.Ainv;
97 Ainv_delta = (T*)job.AinvDelta;
98 Ainv_colk = (T*)job.AinvColk;
99 k = job.iat;
100 }
101 __syncthreads();
102 T Ainv_delta_tid;
103 __shared__ T Ainv_colk_shared[BS];
104 int col = blockIdx.x * BS + threadIdx.x;
105 // Read the data back from global memory
106 Ainv_delta_tid = Ainv_delta[col];
107 Ainv_colk_shared[threadIdx.x] = Ainv_colk[col];
108 if (col < N)
109 A[k * rowstride + col] = u[col];
110 __syncthreads();
111 __shared__ T prefact;
112 if (threadIdx.x == 0)
113 prefact = -1.0f / (1.0f + Ainv_delta[k]);
114 int numblocks = N / BS + ((N % BS) ? 1 : 0);
115 __syncthreads();
116 for (int block = 0; block < numblocks; block++)
117 {
118 Ainv_colk_shared[tid] = prefact * Ainv_colk[block * BS + threadIdx.x];
119 __syncthreads();
120 T* Ainv_row = Ainv + block * BS * rowstride + col;
121 int istop = min(BS, N - block * BS);
122 if (col < N)
123 for (int i = 0; i < istop; i++, Ainv_row += rowstride)
124 *Ainv_row += Ainv_delta_tid * Ainv_colk_shared[i];
125 __syncthreads();
126 }
127 }
128
129
130 void update_inverse_cuda(updateJob updateList[], float dummy, int N, int rowstride, int numWalkers)
131 {
132 const int BS1 = 64;
133 const int BS2 = 64;
134 int NB1 = (N + BS1 - 1) / BS1;
135 int NB2 = (N + BS2 - 1) / BS2;
136 dim3 dimBlock1(BS1);
137 dim3 dimGrid1(NB1, numWalkers);
138 dim3 dimBlock2(BS2);
139 dim3 dimGrid2(NB2, numWalkers);
140 update_inverse_cuda1<float, BS1><<<dimGrid1, dimBlock1>>>(updateList, N, rowstride);
141 update_inverse_cuda2<float, BS2><<<dimGrid2, dimBlock2>>>(updateList, N, rowstride);
142 }
143
144
145 void update_inverse_cuda(updateJob updateList[], double dummy, int N, int rowstride, int numWalkers)
146 {
147 const int BS1 = 32;
148 const int BS2 = 32;
149 int NB1 = N / BS1 + ((N % BS1) ? 1 : 0);
150 int NB2 = N / BS2 + ((N % BS2) ? 1 : 0);
151 dim3 dimBlock1(BS1);
152 dim3 dimGrid1(NB1, numWalkers);
153 dim3 dimBlock2(BS2);
154 dim3 dimGrid2(NB2, numWalkers);
155 update_inverse_cuda1<double, BS1><<<dimGrid1, dimBlock1>>>(updateList, N, rowstride);
156 update_inverse_cuda2<double, BS2><<<dimGrid2, dimBlock2>>>(updateList, N, rowstride);
157 cudaError_t err = cudaGetLastError();
158 if (err != cudaSuccess)
159 {
160 fprintf(stderr, "CUDA error in update_inverse_cuda:\n %s\n", cudaGetErrorString(err));
161 abort();
162 }
163 }
164
165 /* Update of Ainv after rank-1 update to A, using Sherman-Morrison algorithm,
166 part 1. See also K. P. Esler, J. Kim, D. M. Ceperley, and L. Shulenburger:
167 Accelerating Quantum Monte Carlo Simulations of Real Materials on GPU
168 Clusters. Computing in Science & Engineering, Vol. 14, No. 1, Jan/Feb 2012,
169 pp. 40-51 (section "Inverses and Updates").
170
171    The Sherman-Morrison algorithm is replaced here by the Fahy variant used in the
172    CPU code: Eq. (26) of S. Fahy, X. W. Wang, and S. G. Louie, Phys. Rev. B 42, 3503.
173
174 u is the new row replacing the k-th row of A. Compute u * Ainv (gemv) and
175 return it via u_Ainv. Copy out the k-th column of Ainv as a contiguous
176 vector Ainv_colk for use by the second part of the update process. The
177 dimension of the square matrices A and Ainv is N, and the stride
178 (in elements) between consecutive rows of Ainv (and A) is rowstride.
179 All matrices are stored in row-major order.
180 */
181 template<typename T, int BS>
182 __device__ __forceinline__ void update_inverse_core1(T* __restrict__ A,
183 const T* __restrict__ Ainv,
184 const T* __restrict__ u,
185 T* __restrict__ u_Ainv,
186 T* __restrict__ Ainv_colk,
187 int k,
188 int N,
189 int rowstride)
190 {
191 #ifdef QMC_COMPLEX
192 __shared__ uninitialized_array<T, BS> delta;
193 #else
194 __shared__ T delta[BS];
195 #endif
196 T sum(0);
197 int tidx = threadIdx.x;
198 int col_Ainv = blockIdx.x * BS + tidx;
199 int numBlocks = (N + BS - 1) / BS;
200 for (int block = 0; block < numBlocks; block++)
201 {
202 int blockStart = block * BS;
203 int col_A = blockStart + tidx;
204 if (col_A < N)
205 {
206 delta[tidx] = u[col_A];
207 A[k * rowstride + col_A] = u[col_A];
208 }
209 __syncthreads();
210 for (int i = 0; i < min(BS, N - blockStart); i++)
211 {
212 const int row_Ainv = blockStart + i;
213 sum += delta[i] * Ainv[row_Ainv * rowstride + col_Ainv];
214 }
215 __syncthreads();
216 }
217 // Write segment of row vector delta*Ainv back to global memory
218 if (col_Ainv < N)
219 {
220 u_Ainv[col_Ainv] = sum;
221 // save the column k of Ainv
222 Ainv_colk[col_Ainv] = Ainv[col_Ainv * rowstride + k];
223 }
224 }
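/* Reference (illustration only, not compiled): a minimal single-threaded host
   sketch of the step performed by update_inverse_core1 above for one walker,
   using the same row-major / rowstride convention.  The helper name and its
   standalone form are assumptions for illustration.

template<typename T>
void update_inverse_part1_reference(T* A, const T* Ainv, const T* u,
                                    T* u_Ainv, T* Ainv_colk,
                                    int k, int N, int rowstride)
{
  for (int col = 0; col < N; col++)
  {
    T sum(0);
    for (int row = 0; row < N; row++)           // u * Ainv, one element per column
      sum += u[row] * Ainv[row * rowstride + col];
    u_Ainv[col] = sum;
    Ainv_colk[col] = Ainv[col * rowstride + k]; // save k-th column of old Ainv
  }
  for (int col = 0; col < N; col++)             // replace k-th row of A with u
    A[k * rowstride + col] = u[col];
}
*/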
225
226 /* Update of Ainv after rank-1 update to A, using Sherman-Morrison algorithm,
227 part 2. See also K. P. Esler, J. Kim, D. M. Ceperley, and L. Shulenburger:
228 Accelerating Quantum Monte Carlo Simulations of Real Materials on GPU
229 Clusters. Computing in Science & Engineering, Vol. 14, No. 1, Jan/Feb 2012,
230 pp. 40-51 (section "Inverses and Updates").
231
232    The Sherman-Morrison algorithm is replaced here by the Fahy variant used in the
233    CPU code: Eq. (26) of S. Fahy, X. W. Wang, and S. G. Louie, Phys. Rev. B 42, 3503.
234
235 Ainv * ek, the k-th column of Ainv, has been extracted in the first step,
236 and is passed in as column vector Ainv_colk. Step 1 also computed the row
237    vector u*Ainv, passed in via u_Ainv. We multiply (ger) Ainv_colk with the
238    row vector u*Ainv - ek (i.e. delta*Ainv), scale the result by -1/(u*Ainv*ek),
239    and add this to Ainv; u*Ainv*ek is simply the k-th element of u_Ainv. The dimension of
240 the square matrices A and Ainv is N, and the stride (in elements) between
241 consecutive rows of Ainv and A is rowstride. All matrices are stored in
242 row-major order.
243 */
244 template<typename T, int BS>
245 __device__ __forceinline__ void update_inverse_core2(T* __restrict__ Ainv,
246 const T* __restrict__ u_Ainv,
247 const T* __restrict__ Ainv_colk,
248 int k,
249 int N,
250 int rowstride)
251 {
252 T u_Ainv_shared;
253 #ifdef QMC_COMPLEX
254 __shared__ uninitialized_array<T, BS> Ainv_colk_shared;
255 #else
256 __shared__ T Ainv_colk_shared[BS];
257 #endif
258 T prefact;
259 int tidx = threadIdx.x;
260 int col_Ainv = blockIdx.x * BS + tidx;
261 int numBlocks = (N + BS - 1) / BS;
262 // Cache this thread's element of the row vector delta*Ainv = u*Ainv - ek
263 // (the k-th row of A has already been replaced with u in update_inverse_core1)
264 if (col_Ainv < N)
265 u_Ainv_shared = u_Ainv[col_Ainv];
266 if (col_Ainv == k)
267 u_Ainv_shared = u_Ainv[k] - T(1);
268 prefact = -T(1) / u_Ainv[k];
269 for (int block = 0; block < numBlocks; block++)
270 {
271 int blockStart = block * BS;
272 int row_Ainv;
273 row_Ainv = blockStart + tidx;
274 // cache and scale next segment of k-th column of Ainv
275 __syncthreads();
276 if (row_Ainv < N)
277 {
278 Ainv_colk_shared[tidx] = prefact * Ainv_colk[row_Ainv];
279 }
280 __syncthreads();
281 if (col_Ainv < N)
282 {
283 for (int i = 0; i < min(BS, N - blockStart); i++)
284 {
285 row_Ainv = blockStart + i;
286 // update one segment of current row of Ainv
287 Ainv[row_Ainv * rowstride + col_Ainv] += u_Ainv_shared * Ainv_colk_shared[i];
288 }
289 }
290 }
291 }
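/* Reference (illustration only, not compiled): the rank-1 (ger) update performed
   by update_inverse_core2 above, written out for one walker.  With R = (u*Ainv)[k]
   this is the Fahy form  Ainv <- Ainv - Ainv(:,k) * (u*Ainv - e_k) / R.  The helper
   name is an assumption for illustration.

template<typename T>
void update_inverse_part2_reference(T* Ainv, const T* u_Ainv, const T* Ainv_colk,
                                    int k, int N, int rowstride)
{
  const T prefact = -T(1) / u_Ainv[k];
  for (int row = 0; row < N; row++)
    for (int col = 0; col < N; col++)
    {
      T coef = u_Ainv[col];
      if (col == k)
        coef -= T(1);                      // (u*Ainv - e_k)
      Ainv[row * rowstride + col] += prefact * Ainv_colk[row] * coef;
    }
}
*/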
292
293 template<typename T, int BS>
294 __device__ __forceinline__ void update_inverse_core2_subblock(T* __restrict__ Ainv,
295 const T* __restrict__ u_Ainv,
296 const T* __restrict__ Ainv_colk,
297 int k,
298 int N,
299 int rowstride)
300 {
301 T u_Ainv_shared;
302 #ifdef QMC_COMPLEX
303 __shared__ uninitialized_array<T, BS> Ainv_colk_shared;
304 #else
305 __shared__ T Ainv_colk_shared[BS];
306 #endif
307 T prefact;
308 int tidx = threadIdx.x;
309 int col_Ainv = blockIdx.y * BS + tidx;
310 // int numBlocks = (N + BS - 1) / BS;
311 // Cache this thread's element of the row vector delta*Ainv = u*Ainv - ek
312 // (the k-th row of A has already been replaced with u in update_inverse_core1)
313 if (col_Ainv < N)
314 u_Ainv_shared = u_Ainv[col_Ainv];
315 if (col_Ainv == k)
316 u_Ainv_shared = u_Ainv[k] - T(1);
317 prefact = -T(1) / u_Ainv[k];
318 const int blockStart = blockIdx.z * BS;
319 int row_Ainv;
320 row_Ainv = blockStart + tidx;
321 // cache and scale next segment of k-th column of Ainv
322 __syncthreads();
323 if (row_Ainv < N)
324 {
325 Ainv_colk_shared[tidx] = prefact * Ainv_colk[row_Ainv];
326 }
327 __syncthreads();
328 if (col_Ainv < N)
329 {
330 for (int i = 0; i < min(BS, N - blockStart); i++)
331 {
332 row_Ainv = blockStart + i;
333 // update one segment of current row of Ainv
334 Ainv[row_Ainv * rowstride + col_Ainv] += u_Ainv_shared * Ainv_colk_shared[i];
335 }
336 }
337 }
338
339 /////////////////////////////////////////////////
340 // New version with fewer PCI transfers needed //
341 /////////////////////////////////////////////////
342
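// In this version every walker's matrices share one contiguous device buffer:
// data[w] is the walker's base pointer and the *_off arguments are element offsets
// into it, so only the pointer array and the electron index list cross the PCIe bus
// per update.  A hypothetical layout (the actual offsets are chosen by the caller)
// could look like:
/*
  int A_off         = 0;                           // A:         N * rowstride elements
  int Ainv_off      = A_off + N * rowstride;       // Ainv:      N * rowstride elements
  int newRow_off    = Ainv_off + N * rowstride;    // u:         rowstride elements
  int AinvDelta_off = newRow_off + rowstride;      // u * Ainv:  rowstride elements
  int AinvColk_off  = AinvDelta_off + rowstride;   // Ainv(:,k): rowstride elements
*/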
343 template<typename T, int BS>
344 __global__ void update_inverse_kernel1(T** data,
345 int* iat,
346 int A_off,
347 int Ainv_off,
348 int newRow_off,
349 int AinvDelta_off,
350 int AinvColk_off,
351 int N,
352 int rowstride)
353 {
354 T* const sdata = data[blockIdx.y];
355 T* A = sdata + A_off; // A
356 const T* Ainv = sdata + Ainv_off; // Ainv
357 const T* u = sdata + newRow_off; // new k-th row of A
358 T* u_Ainv = sdata + AinvDelta_off; // u * Ainv
359 T* Ainv_colk = sdata + AinvColk_off; // k-th column of orig. Ainv
360 int k = iat[blockIdx.y];
361 update_inverse_core1<T, BS>(A, Ainv, u, u_Ainv, Ainv_colk, k, N, rowstride);
362 }
363
364 template<typename T, int BS>
365 __global__ void update_inverse_kernel2(T** data,
366 int* iat,
367 int A_off,
368 int Ainv_off,
369 int newRow_off,
370 int AinvDelta_off,
371 int AinvColk_off,
372 int N,
373 int rowstride)
374
375 {
376 T* const sdata = data[blockIdx.y];
377 T* Ainv = sdata + Ainv_off; // Ainv
378 const T* u_Ainv = sdata + AinvDelta_off; // u * Ainv
379 const T* Ainv_colk = sdata + AinvColk_off; // k-th column of orig. Ainv
380 int k = iat[blockIdx.y];
381 update_inverse_core2<T, BS>(Ainv, u_Ainv, Ainv_colk, k, N, rowstride);
382 }
383
384 void update_inverse_cuda(float** data,
385 int iat[],
386 int A_off,
387 int Ainv_off,
388 int newRow_off,
389 int AinvDelta_off,
390 int AinvColk_off,
391 int N,
392 int rowstride,
393 int numWalkers)
394 {
395 const int BS1 = 128;
396 const int BS2 = 128;
397 int NB1 = (N + BS1 - 1) / BS1;
398 int NB2 = (N + BS2 - 1) / BS2;
399 dim3 dimBlock1(BS1);
400 dim3 dimGrid1(NB1, numWalkers);
401 dim3 dimBlock2(BS2);
402 dim3 dimGrid2(NB2, numWalkers);
403 update_inverse_kernel1<float, BS1>
404 <<<dimGrid1, dimBlock1>>>(data, iat, A_off, Ainv_off, newRow_off, AinvDelta_off, AinvColk_off, N, rowstride);
405 update_inverse_kernel2<float, BS2>
406 <<<dimGrid2, dimBlock2>>>(data, iat, A_off, Ainv_off, newRow_off, AinvDelta_off, AinvColk_off, N, rowstride);
407 cudaError_t err = cudaGetLastError();
408 if (err != cudaSuccess)
409 {
410 fprintf(stderr, "CUDA error in update_inverse_cuda:\n %s\n", cudaGetErrorString(err));
411 abort();
412 }
413 }
414
415 void update_inverse_cuda(double** data,
416 int iat[],
417 int A_off,
418 int Ainv_off,
419 int newRow_off,
420 int AinvDelta_off,
421 int AinvColk_off,
422 int N,
423 int rowstride,
424 int numWalkers)
425 {
426 const int BS1 = 32;
427 const int BS2 = 32;
428 int NB1 = (N + BS1 - 1) / BS1;
429 int NB2 = (N + BS2 - 1) / BS2;
430 dim3 dimBlock1(BS1);
431 dim3 dimGrid1(NB1, numWalkers);
432 dim3 dimBlock2(BS2);
433 dim3 dimGrid2(NB2, numWalkers);
434 update_inverse_kernel1<double, BS1>
435 <<<dimGrid1, dimBlock1>>>(data, iat, A_off, Ainv_off, newRow_off, AinvDelta_off, AinvColk_off, N, rowstride);
436 update_inverse_kernel2<double, BS2>
437 <<<dimGrid2, dimBlock2>>>(data, iat, A_off, Ainv_off, newRow_off, AinvDelta_off, AinvColk_off, N, rowstride);
438 cudaError_t err = cudaGetLastError();
439 if (err != cudaSuccess)
440 {
441 fprintf(stderr, "CUDA error in update_inverse_cuda:\n %s\n", cudaGetErrorString(err));
442 abort();
443 }
444 }
445
446 #ifdef QMC_COMPLEX
447
448 void update_inverse_cuda(std::complex<float>** data,
449 int iat[],
450 int A_off,
451 int Ainv_off,
452 int newRow_off,
453 int AinvDelta_off,
454 int AinvColk_off,
455 int N,
456 int rowstride,
457 int numWalkers)
458 {
459 const int BS1 = 128;
460 const int BS2 = 128;
461 int NB1 = (N + BS1 - 1) / BS1;
462 int NB2 = (N + BS2 - 1) / BS2;
463 dim3 dimBlock1(BS1);
464 dim3 dimGrid1(NB1, numWalkers);
465 dim3 dimBlock2(BS2);
466 dim3 dimGrid2(NB2, numWalkers);
467
468 update_inverse_kernel1<thrust::complex<float>, BS1><<<dimGrid1, dimBlock1>>>((thrust::complex<float>**)data,
469 iat,
470 A_off,
471 Ainv_off,
472 newRow_off,
473 AinvDelta_off,
474 AinvColk_off,
475 N,
476 rowstride);
477 update_inverse_kernel2<thrust::complex<float>, BS2><<<dimGrid2, dimBlock2>>>((thrust::complex<float>**)data,
478 iat,
479 A_off,
480 Ainv_off,
481 newRow_off,
482 AinvDelta_off,
483 AinvColk_off,
484 N,
485 rowstride);
486 cudaError_t err = cudaGetLastError();
487 if (err != cudaSuccess)
488 {
489 fprintf(stderr, "CUDA error in update_inverse_cuda:\n %s\n", cudaGetErrorString(err));
490 abort();
491 }
492 }
493
494 void update_inverse_cuda(std::complex<double>** data,
495 int iat[],
496 int A_off,
497 int Ainv_off,
498 int newRow_off,
499 int AinvDelta_off,
500 int AinvColk_off,
501 int N,
502 int rowstride,
503 int numWalkers)
504 {
505 const int BS1 = 32;
506 const int BS2 = 32;
507 int NB1 = (N + BS1 - 1) / BS1;
508 int NB2 = (N + BS2 - 1) / BS2;
509 dim3 dimBlock1(BS1);
510 dim3 dimGrid1(NB1, numWalkers);
511 dim3 dimBlock2(BS2);
512 dim3 dimGrid2(NB2, numWalkers);
513
514 update_inverse_kernel1<thrust::complex<double>, BS1><<<dimGrid1, dimBlock1>>>((thrust::complex<double>**)data,
515 iat,
516 A_off,
517 Ainv_off,
518 newRow_off,
519 AinvDelta_off,
520 AinvColk_off,
521 N,
522 rowstride);
523 update_inverse_kernel2<thrust::complex<double>, BS2><<<dimGrid2, dimBlock2>>>((thrust::complex<double>**)data,
524 iat,
525 A_off,
526 Ainv_off,
527 newRow_off,
528 AinvDelta_off,
529 AinvColk_off,
530 N,
531 rowstride);
532 cudaError_t err = cudaGetLastError();
533 if (err != cudaSuccess)
534 {
535 fprintf(stderr, "CUDA error in update_inverse_cuda:\n %s\n", cudaGetErrorString(err));
536 abort();
537 }
538 }
539
540
541 #endif
542
543
544 template<typename T, int BS>
545 __global__ void update_inverse_kernel1(T** data,
546 int k,
547 int A_off,
548 int Ainv_off,
549 int newRow_off,
550 int AinvDelta_off,
551 int AinvColk_off,
552 int N,
553 int rowstride)
554 {
555 T* const sdata = data[blockIdx.y];
556 T* A = sdata + A_off; // A
557 const T* Ainv = sdata + Ainv_off; // Ainv
558 const T* u = sdata + newRow_off; // new k-th row of A
559 T* u_Ainv = sdata + AinvDelta_off; // u * Ainv
560 T* Ainv_colk = sdata + AinvColk_off; // k-th column of orig. Ainv
561 update_inverse_core1<T, BS>(A, Ainv, u, u_Ainv, Ainv_colk, k, N, rowstride);
562 }
563
564 template<typename T, int BS>
565 __global__ void update_inverse_kernel2(T** data,
566 int k,
567 int A_off,
568 int Ainv_off,
569 int newRow_off,
570 int AinvDelta_off,
571 int AinvColk_off,
572 int N,
573 int rowstride)
574
575 {
576 T* const sdata = data[blockIdx.y];
577 T* Ainv = sdata + Ainv_off;                // Ainv
578 const T* u_Ainv = sdata + AinvDelta_off;   // u * Ainv
579 const T* Ainv_colk = sdata + AinvColk_off; // k-th column of orig. Ainv
580 // Neither A nor the new row u is needed here: the k-th row of A was already
581 // replaced inside update_inverse_core1 (called from update_inverse_kernel1).
582 update_inverse_core2<T, BS>(Ainv, u_Ainv, Ainv_colk, k, N, rowstride);
583 }
584
585 template<typename T, int BS>
586 __global__ void update_inverse_kernel2_subblock(T** data,
587 int k,
588 int A_off,
589 int Ainv_off,
590 int newRow_off,
591 int AinvDelta_off,
592 int AinvColk_off,
593 int N,
594 int rowstride)
595
596 {
597 T* const sdata = data[blockIdx.x];
598 T* Ainv = sdata + Ainv_off; // Ainv
599 const T* u_Ainv = sdata + AinvDelta_off; // u * Ainv
600 const T* Ainv_colk = sdata + AinvColk_off; // k-th column of orig. Ainv
601 update_inverse_core2_subblock<T, BS>(Ainv, u_Ainv, Ainv_colk, k, N, rowstride);
602 }
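// The _subblock variant spreads the rank-1 update over a 3-D grid instead of looping
// over row blocks inside each thread block: blockIdx.x selects the walker, blockIdx.y
// the BS-wide column tile and blockIdx.z the BS-high row tile, so each block updates
// one BS x BS tile of its walker's Ainv.  Illustrative launch shape (example numbers
// only; the real geometry is built in the wrappers below):
/*
  // N = 256, BS2 = 128  ->  NB2 = 2
  dim3 dimBlock2(128);
  dim3 dimGrid3(numWalkers, 2, 2);   // x: walker, y: column tile, z: row tile
*/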
603
604 void update_inverse_cuda(float** data,
605 int iat,
606 int A_off,
607 int Ainv_off,
608 int newRow_off,
609 int AinvDelta_off,
610 int AinvColk_off,
611 int N,
612 int rowstride,
613 int numWalkers)
614 {
615 const int BS1 = 128;
616 const int BS2 = 128;
617 int NB1 = (N + BS1 - 1) / BS1;
618 int NB2 = (N + BS2 - 1) / BS2;
619 dim3 dimBlock1(BS1);
620 dim3 dimGrid1(NB1, numWalkers);
621 dim3 dimBlock2(BS2);
622 dim3 dimGrid2(NB2, numWalkers);
623 update_inverse_kernel1<float, BS1>
624 <<<dimGrid1, dimBlock1>>>(data, iat, A_off, Ainv_off, newRow_off, AinvDelta_off, AinvColk_off, N, rowstride);
625 /* reference implementation, replaced by _subblock version. Ye Luo
626 update_inverse_kernel2<float,BS2><<<dimGrid2,dimBlock2>>>
627 (data, iat, A_off, Ainv_off, newRow_off, AinvDelta_off, AinvColk_off,
628 N, rowstride);
629 */
630 dim3 dimGrid3(numWalkers, NB2, NB2);
631 update_inverse_kernel2_subblock<float, BS2>
632 <<<dimGrid3, dimBlock2>>>(data, iat, A_off, Ainv_off, newRow_off, AinvDelta_off, AinvColk_off, N, rowstride);
633 cudaError_t err = cudaGetLastError();
634 if (err != cudaSuccess)
635 {
636 fprintf(stderr, "CUDA error in update_inverse_cuda:\n %s\n", cudaGetErrorString(err));
637 abort();
638 }
639 }
640
641 void update_inverse_cuda(double** data,
642 int iat,
643 int A_off,
644 int Ainv_off,
645 int newRow_off,
646 int AinvDelta_off,
647 int AinvColk_off,
648 int N,
649 int rowstride,
650 int numWalkers)
651 {
652 const int BS1 = 64;
653 const int BS2 = 64;
654 int NB1 = (N + BS1 - 1) / BS1;
655 int NB2 = (N + BS2 - 1) / BS2;
656 dim3 dimBlock1(BS1);
657 dim3 dimGrid1(NB1, numWalkers);
658 dim3 dimBlock2(BS2);
659 dim3 dimGrid2(NB2, numWalkers);
660 update_inverse_kernel1<double, BS1>
661 <<<dimGrid1, dimBlock1>>>(data, iat, A_off, Ainv_off, newRow_off, AinvDelta_off, AinvColk_off, N, rowstride);
662 /* reference implementation, replaced by _subblock version. Ye Luo
663 update_inverse_kernel2<double,BS2><<<dimGrid2,dimBlock2>>>
664 (data, iat, A_off, Ainv_off, newRow_off, AinvDelta_off, AinvColk_off,
665 N, rowstride);
666 */
667 dim3 dimGrid3(numWalkers, NB2, NB2);
668 update_inverse_kernel2_subblock<double, BS2>
669 <<<dimGrid3, dimBlock2>>>(data, iat, A_off, Ainv_off, newRow_off, AinvDelta_off, AinvColk_off, N, rowstride);
670 cudaError_t err = cudaGetLastError();
671 if (err != cudaSuccess)
672 {
673 fprintf(stderr, "CUDA error in update_inverse_cuda:\n %s\n", cudaGetErrorString(err));
674 abort();
675 }
676 }
677
678 #ifdef QMC_COMPLEX
679 void update_inverse_cuda(std::complex<float>** data,
680 int iat,
681 int A_off,
682 int Ainv_off,
683 int newRow_off,
684 int AinvDelta_off,
685 int AinvColk_off,
686 int N,
687 int rowstride,
688 int numWalkers)
689 {
690 const int BS1 = 128;
691 const int BS2 = 128;
692 int NB1 = (N + BS1 - 1) / BS1;
693 int NB2 = (N + BS2 - 1) / BS2;
694 dim3 dimBlock1(BS1);
695 dim3 dimGrid1(NB1, numWalkers);
696 dim3 dimBlock2(BS2);
697 dim3 dimGrid2(NB2, numWalkers);
698
699 update_inverse_kernel1<thrust::complex<float>, BS1><<<dimGrid1, dimBlock1>>>((thrust::complex<float>**)data,
700 iat,
701 A_off,
702 Ainv_off,
703 newRow_off,
704 AinvDelta_off,
705 AinvColk_off,
706 N,
707 rowstride);
708 /* reference implementation, replaced by _subblock version. Ye Luo
709 update_inverse_kernel2<thrust::complex<float>,BS2><<<dimGrid2,dimBlock2>>>
710 ((thrust::complex<float>**)data, iat, A_off, Ainv_off, newRow_off, AinvDelta_off, AinvColk_off,
711 N, rowstride);
712 */
713 dim3 dimGrid3(numWalkers, NB2, NB2);
714 update_inverse_kernel2_subblock<thrust::complex<float>, BS2><<<dimGrid3, dimBlock2>>>((thrust::complex<float>**)data,
715 iat,
716 A_off,
717 Ainv_off,
718 newRow_off,
719 AinvDelta_off,
720 AinvColk_off,
721 N,
722 rowstride);
723 cudaError_t err = cudaGetLastError();
724 if (err != cudaSuccess)
725 {
726 fprintf(stderr, "CUDA error in update_inverse_cuda:\n %s\n", cudaGetErrorString(err));
727 abort();
728 }
729 }
730
731 void update_inverse_cuda(std::complex<double>** data,
732 int iat,
733 int A_off,
734 int Ainv_off,
735 int newRow_off,
736 int AinvDelta_off,
737 int AinvColk_off,
738 int N,
739 int rowstride,
740 int numWalkers)
741 {
742 const int BS1 = 64;
743 const int BS2 = 64;
744 int NB1 = (N + BS1 - 1) / BS1;
745 int NB2 = (N + BS2 - 1) / BS2;
746 dim3 dimBlock1(BS1);
747 dim3 dimGrid1(NB1, numWalkers);
748 dim3 dimBlock2(BS2);
749 dim3 dimGrid2(NB2, numWalkers);
750
751 update_inverse_kernel1<thrust::complex<double>, BS1><<<dimGrid1, dimBlock1>>>((thrust::complex<double>**)data,
752 iat,
753 A_off,
754 Ainv_off,
755 newRow_off,
756 AinvDelta_off,
757 AinvColk_off,
758 N,
759 rowstride);
760 /* reference implementation, replaced by _subblock version. Ye Luo
761 update_inverse_kernel2<thrust::complex<double>,BS2><<<dimGrid2,dimBlock2>>>
762 ((thrust::complex<double>**)data, iat, A_off, Ainv_off, newRow_off, AinvDelta_off, AinvColk_off,
763 N, rowstride);
764 */
765 dim3 dimGrid3(numWalkers, NB2, NB2);
766 update_inverse_kernel2_subblock<thrust::complex<double>, BS2>
767 <<<dimGrid3, dimBlock2>>>((thrust::complex<double>**)data,
768 iat,
769 A_off,
770 Ainv_off,
771 newRow_off,
772 AinvDelta_off,
773 AinvColk_off,
774 N,
775 rowstride);
776 cudaError_t err = cudaGetLastError();
777 if (err != cudaSuccess)
778 {
779 fprintf(stderr, "CUDA error in update_inverse_cuda:\n %s\n", cudaGetErrorString(err));
780 abort();
781 }
782 }
783
784 #endif
785
786 // The first kernel computes the row vector delta * Ainv (with delta = u - A(k,:))
787 // and also stores the k-th column of Ainv in global memory
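// For reference, the identity implemented by this kernel pair: with the row vector
// delta = u - A(k,:), so that the updated matrix is A' = A + e_k * delta, the
// Sherman-Morrison formula gives
//   inv(A') = inv(A) - [inv(A) e_k] [delta inv(A)] / (1 + (delta inv(A))_k),
// where inv(A) e_k is the saved column Ainv_colk, delta inv(A) is Ainv_delta, and
// -1 / (1 + Ainv_delta[k]) appears as 'prefact' in update_inverse_cuda2.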
788 template<typename T, int BS>
789 __global__ void update_inverse_cuda1(T** A_g,
790 T** Ainv_g,
791 T** u_g,
792 T** Ainv_delta_g,
793 T** Ainv_colk_g,
794 int N,
795 int rowstride,
796 int k)
797 {
798 __shared__ T *A, *Ainv, *u, *Ainv_delta, *Ainv_colk;
799 if (threadIdx.x == 0)
800 {
801 A = A_g[blockIdx.y];
802 Ainv = Ainv_g[blockIdx.y];
803 u = u_g[blockIdx.y];
804 Ainv_delta = Ainv_delta_g[blockIdx.y];
805 Ainv_colk = Ainv_colk_g[blockIdx.y];
806 }
807 __syncthreads();
808 // Store the product Ainv * u in shared memory
809 T Ainv_delta_tid;
810 __shared__ T Ainv_colk_shared[BS], delta[BS];
811 Ainv_delta_tid = 0.0f;
812 __syncthreads();
813 int col = blockIdx.x * BS + threadIdx.x;
814 int numblocks = N / BS + ((N % BS) ? 1 : 0);
815 int kBlock = k / BS;
816 // If the column I need to pull from Ainv is in this thread block
817 // domain, do the following
818 __syncthreads();
819 for (int block = 0; block < numblocks; block++)
820 {
821 delta[threadIdx.x] = u[block * BS + threadIdx.x] - A[k * rowstride + block * BS + threadIdx.x];
822 __syncthreads();
823 for (int i = 0; i < BS; i++)
824 {
825 int row = block * BS + i;
826 T a = Ainv[row * rowstride + col];
827 if (col == k)
828 Ainv_colk_shared[i] = a;
829 Ainv_delta_tid += a * delta[i];
830 __syncthreads();
831 }
832 if (blockIdx.x == kBlock)
833 Ainv_colk[block * BS + threadIdx.x] = Ainv_colk_shared[threadIdx.x];
834 __syncthreads();
835 }
836 __syncthreads();
837 // Write the data back to global memory
838 Ainv_delta[col] = Ainv_delta_tid;
839 __syncthreads();
840 }
841
842
843 template<typename T, int BS>
844 __global__ void update_inverse_cuda2(T** A_g,
845 T** Ainv_g,
846 T** u_g,
847 T** Ainv_delta_g,
848 T** Ainv_colk_g,
849 int N,
850 int rowstride,
851 int k)
852 {
853 __shared__ T *A, *u, *Ainv, *Ainv_delta, *Ainv_colk;
854 int tid = threadIdx.x;
855 if (threadIdx.x == 0)
856 {
857 A = A_g[blockIdx.y];
858 u = u_g[blockIdx.y];
859 Ainv = Ainv_g[blockIdx.y];
860 Ainv_delta = Ainv_delta_g[blockIdx.y];
861 Ainv_colk = Ainv_colk_g[blockIdx.y];
862 }
863 __syncthreads();
864 T Ainv_delta_tid;
865 __shared__ T Ainv_colk_shared[BS];
866 int col = blockIdx.x * BS + threadIdx.x;
867 // Read the data back from global memory
868 Ainv_delta_tid = Ainv_delta[col];
869 Ainv_colk_shared[threadIdx.x] = Ainv_colk[col];
870 if (col < N)
871 A[k * rowstride + col] = u[col];
872 __syncthreads();
873 __shared__ T prefact;
874 if (threadIdx.x == 0)
875 prefact = -1.0f / (1.0f + Ainv_delta[k]);
876 int numblocks = N / BS + ((N % BS) ? 1 : 0);
877 __syncthreads();
878 for (int block = 0; block < numblocks; block++)
879 {
880 Ainv_colk_shared[tid] = prefact * Ainv_colk[block * BS + threadIdx.x];
881 __syncthreads();
882 T* Ainv_row = Ainv + block * BS * rowstride + col;
883 for (int i = 0; i < BS; i++, Ainv_row += rowstride)
884 *Ainv_row += Ainv_delta_tid * Ainv_colk_shared[i];
885 __syncthreads();
886 }
887 }
888
889
890 void update_inverse_cuda(float* A_g[],
891 float* Ainv_g[],
892 float* u_g[],
893 float* Ainv_delta_g[],
894 float* Ainv_colk_g[],
895 int N,
896 int rowstride,
897 int iat,
898 int numWalkers)
899 {
900 const int BS1 = 64;
901 const int BS2 = 64;
902 int NB1 = N / BS1 + ((N % BS1) ? 1 : 0);
903 int NB2 = N / BS2 + ((N % BS2) ? 1 : 0);
904 dim3 dimBlock1(BS1);
905 dim3 dimGrid1(NB1, numWalkers);
906 dim3 dimBlock2(BS2);
907 dim3 dimGrid2(NB2, numWalkers);
908 update_inverse_cuda1<float, BS1>
909 <<<dimGrid1, dimBlock1>>>(A_g, Ainv_g, u_g, Ainv_delta_g, Ainv_colk_g, N, rowstride, iat);
910 update_inverse_cuda2<float, BS2>
911 <<<dimGrid2, dimBlock2>>>(A_g, Ainv_g, u_g, Ainv_delta_g, Ainv_colk_g, N, rowstride, iat);
912 cudaError_t err = cudaGetLastError();
913 if (err != cudaSuccess)
914 {
915 fprintf(stderr, "CUDA error in update_inverse_cuda:\n %s\n", cudaGetErrorString(err));
916 abort();
917 }
918 }
919
920 void update_inverse_cuda(double* A_g[],
921 double* Ainv_g[],
922 double* u_g[],
923 double* Ainv_delta_g[],
924 double* Ainv_colk_g[],
925 int N,
926 int rowstride,
927 int iat,
928 int numWalkers)
929 {
930 const int BS1 = 32;
931 const int BS2 = 32;
932 int NB1 = N / BS1 + ((N % BS1) ? 1 : 0);
933 int NB2 = N / BS2 + ((N % BS2) ? 1 : 0);
934 dim3 dimBlock1(BS1);
935 dim3 dimGrid1(NB1, numWalkers);
936 dim3 dimBlock2(BS2);
937 dim3 dimGrid2(NB2, numWalkers);
938 update_inverse_cuda1<double, BS1>
939 <<<dimGrid1, dimBlock1>>>(A_g, Ainv_g, u_g, Ainv_delta_g, Ainv_colk_g, N, rowstride, iat);
940 update_inverse_cuda2<double, BS2>
941 <<<dimGrid2, dimBlock2>>>(A_g, Ainv_g, u_g, Ainv_delta_g, Ainv_colk_g, N, rowstride, iat);
942 }
943
944
945 template<typename T, int BS, int MAXN>
946 __global__ void update_inverse_transpose_cuda(T** A_g, T** AinvT_g, T** u_g, int N, int row_stride, int elec)
947 {
948 __shared__ T AinvT_row[MAXN], Ainv_colk[MAXN], delta[MAXN];
949 int numBlocks = N / blockDim.x + ((N % blockDim.x) ? 1 : 0);
950 //int numBlocks = 4;
951 __shared__ T *A, *AinvT, *u;
952 if (threadIdx.x == 0)
953 {
954 A = A_g[blockIdx.x];
955 AinvT = AinvT_g[blockIdx.x];
956 u = u_g[blockIdx.x];
957 }
958 __syncthreads();
959 T prefactor;
960 __shared__ T sum[BS];
961 sum[threadIdx.x] = 0.0f;
962 for (int block = 0; block < numBlocks; block++)
963 {
964 int off = block * BS + threadIdx.x;
965 T u1 = u[off];
966 delta[off] = u1 - A[elec * row_stride + off];
967 Ainv_colk[off] = AinvT[elec * row_stride + off];
968 A[elec * row_stride + off] = u1;
969 sum[threadIdx.x] += (Ainv_colk[off] * delta[off]);
970 }
971 __syncthreads();
972 for (int s = (BS >> 1); s > 0; s >>= 1)
973 {
974 __syncthreads();
975 if (threadIdx.x < s)
976 sum[threadIdx.x] += sum[threadIdx.x + s];
977 }
978 __syncthreads();
979 prefactor = -1.0f / (1.0f + sum[0]);
980 for (int row = 0; row < N; row++)
981 {
982 // First load row into shared memory
983 sum[threadIdx.x] = 0.0;
984 for (int block = 0; block < numBlocks; block++)
985 {
986 int off = block * BS + threadIdx.x;
987 AinvT_row[off] = AinvT[row * row_stride + off];
988 sum[threadIdx.x] += (AinvT_row[off] * delta[off]);
989 }
990 // Now sum across row to get Ainv_delta
991 for (int s = BS >> 1; s > 0; s >>= 1)
992 {
993 __syncthreads();
994 if (threadIdx.x < s)
995 sum[threadIdx.x] += sum[threadIdx.x + s];
996 }
997 __syncthreads();
998 // sum[0] now has the AinvT * delta
999 // Add on outer product
1000 for (int block = 0; block < numBlocks; block++)
1001 {
1002 int off = BS * block + threadIdx.x;
1003 AinvT[row * row_stride + off] = AinvT_row[off] + prefactor * sum[0] * Ainv_colk[off];
1004 }
1005 __syncthreads();
1006 }
1007 }
1008
1009
1010 template<typename T, int BS, int MAXN>
1011 __global__ void update_inverse_transpose_cuda_2pass(T** A_g, T** AinvT_g, T** u_g, int N, int row_stride, int elec)
1012 {
1013 __shared__ T Ainv_colk[MAXN], delta[MAXN];
1014 int numBlocks = N / blockDim.x + ((N % blockDim.x) ? 1 : 0);
1015 //int numBlocks = 4;
1016 __shared__ T *A, *AinvT, *u;
1017 if (threadIdx.x == 0)
1018 {
1019 A = A_g[blockIdx.x];
1020 AinvT = AinvT_g[blockIdx.x];
1021 u = u_g[blockIdx.x];
1022 }
1023 __syncthreads();
1024 T prefactor;
1025 __shared__ T sum[BS];
1026 sum[threadIdx.x] = 0.0f;
1027 for (int block = 0; block < numBlocks; block++)
1028 {
1029 int off = block * BS + threadIdx.x;
1030 T u1 = u[off];
1031 delta[off] = u1 - A[elec * row_stride + off];
1032 Ainv_colk[off] = AinvT[elec * row_stride + off];
1033 A[elec * row_stride + off] = u1;
1034 sum[threadIdx.x] += (Ainv_colk[off] * delta[off]);
1035 }
1036 __syncthreads();
1037 for (int s = (BS >> 1); s > 0; s >>= 1)
1038 {
1039 __syncthreads();
1040 if (threadIdx.x < s && threadIdx.y == 0)
1041 sum[threadIdx.x] += sum[threadIdx.x + s];
1042 }
1043 __syncthreads();
1044 prefactor = -1.0f / (1.0f + sum[0]);
1045 __shared__ T sum2[BS][BS + 1];
1046 for (int b1 = 0; b1 < numBlocks; b1++)
1047 {
1048 sum[threadIdx.x] = 0.0f;
1049 for (int i = 0; i < BS; i++)
1050 sum2[i][threadIdx.x] = 0.0f;
1051 // Compute Ainv * delta;
1052 for (int i = 0; i < BS; i++)
1053 {
1054 int row = b1 * BS + i;
1055 for (int b2 = 0; b2 < numBlocks; b2++)
1056 {
1057 int col = b2 * BS + threadIdx.x;
1058 sum2[i][threadIdx.x] += AinvT[row * row_stride + col] * delta[col];
1059 }
1060 }
1061 __syncthreads();
1062 for (int i = 0; i < BS; i++)
1063 sum[threadIdx.x] += prefactor * sum2[threadIdx.x][i];
1064 // Outer product
1065 for (int i = 0; i < BS; i++)
1066 {
1067 int row = b1 * BS + i;
1068 for (int b2 = 0; b2 < numBlocks; b2++)
1069 {
1070 int col = b2 * BS + threadIdx.x;
1071 AinvT[row * row_stride + col] += sum[i] * Ainv_colk[col];
1072 }
1073 }
1074 }
1075 }
1076
1077
1078 template<typename T, int BS>
1079 __global__ void calc_ratios_transpose(T** AinvT_list,
1080 T** new_row_list,
1081 T* ratio_out,
1082 int N,
1083 int row_stride,
1084 int elec,
1085 int numMats)
1086 {
1087 __shared__ T *AinvT[BS], *new_row[BS];
1088 int matNum = blockIdx.x * BS + threadIdx.x;
1089 if (matNum < numMats)
1090 {
1091 AinvT[threadIdx.x] = AinvT_list[matNum] + row_stride * BS;
1092 new_row[threadIdx.x] = new_row_list[matNum];
1093 }
1094 __shared__ T AinvT_phi[BS][BS + 1];
1095 __shared__ T ratio[BS];
1096 ratio[threadIdx.x] = 0.0;
1097 int numBlocks = N / BS;
1098 if (numBlocks * BS < N)
1099 numBlocks++;
1100 for (int block = 0; block < numBlocks; block++)
1101 {
1102 int col = block * BS + threadIdx.x;
1103 // First, read the data into shared memory
1104 for (int i = 0; i < BS; i++)
1105 AinvT_phi[i][threadIdx.x] = (AinvT[i])[col] * (new_row[i])[col];
1106 __syncthreads();
1107 // Now sum
1108 for (int i = 0; i < BS; i++)
1109 ratio[threadIdx.x] += AinvT_phi[threadIdx.x][i];
1110 }
1111 if (matNum < numMats)
1112 ratio_out[matNum] = ratio[threadIdx.x];
1113 }
1114
1115
1116 template<typename T, int BS>
1117 __global__ void calc_ratios(T** Ainv_list, T** new_row_list, T* ratio, int N, int row_stride, int elec)
1118 {
1119 int tid = threadIdx.x;
1120 int col = /*blockIdx.x*BS * */ tid;
1121 __shared__ T *Ainv, *new_row;
1122 if (tid == 0)
1123 {
1124 Ainv = Ainv_list[blockIdx.x];
1125 new_row = new_row_list[blockIdx.x];
1126 }
1127 __syncthreads();
1128 #ifdef QMC_COMPLEX
1129 __shared__ uninitialized_array<T, BS> new_row_shared;
1130 #else
1131 __shared__ T new_row_shared[BS];
1132 #endif
1133 if (col < N)
1134 new_row_shared[tid] = new_row[tid];
1135 #ifdef QMC_COMPLEX
1136 __shared__ uninitialized_array<T, BS> Ainv_colk_shared;
1137 #else
1138 __shared__ T Ainv_colk_shared[BS];
1139 #endif
1140 // This is *highly* uncoalesced, but we just have to eat it to allow
1141 // other kernels to operate quickly.
1142 if (col < N)
1143 Ainv_colk_shared[tid] = Ainv[col * row_stride + elec];
1144 __syncthreads();
1145 #ifdef QMC_COMPLEX
1146 __shared__ uninitialized_array<T, BS> Ainv_new_row;
1147 #else
1148 __shared__ T Ainv_new_row[BS];
1149 #endif
1150 if (col < N)
1151 Ainv_new_row[tid] = Ainv_colk_shared[tid] * new_row_shared[tid];
1152 __syncthreads();
1153 // Now, we have to dot
1154 for (unsigned int s = BS / 2; s > 0; s >>= 1)
1155 {
1156 if (tid < s && (tid + s) < N)
1157 Ainv_new_row[tid] += Ainv_new_row[tid + s];
1158 __syncthreads();
1159 }
1160 if (tid == 0)
1161 ratio[blockIdx.x] = Ainv_new_row[0];
1162 }
1163
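// Note on calc_ratios above: by the matrix determinant lemma, replacing row 'elec'
// of A with new_row gives
//   det(A_new) / det(A_old) = sum_j new_row[j] * Ainv[j * row_stride + elec],
// which is exactly the dot product reduced in shared memory, so only the 'elec'
// column of Ainv is needed for the acceptance ratio.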
1164
1165 void determinant_ratios_cuda(float* Ainv_list[],
1166 float* new_row_list[],
1167 float* ratios,
1168 int N,
1169 int row_stride,
1170 int iat,
1171 int numWalkers)
1172 {
1173 dim3 dimBlock(N);
1174 dim3 dimGrid(numWalkers);
1175 if (N <= 32)
1176 calc_ratios<float, 32><<<dimGrid, dimBlock>>>(Ainv_list, new_row_list, ratios, N, row_stride, iat);
1177 else if (N <= 64)
1178 calc_ratios<float, 64><<<dimGrid, dimBlock>>>(Ainv_list, new_row_list, ratios, N, row_stride, iat);
1179 else if (N <= 128)
1180 calc_ratios<float, 128><<<dimGrid, dimBlock>>>(Ainv_list, new_row_list, ratios, N, row_stride, iat);
1181 else if (N <= 256)
1182 calc_ratios<float, 256><<<dimGrid, dimBlock>>>(Ainv_list, new_row_list, ratios, N, row_stride, iat);
1183 else if (N <= 512)
1184 calc_ratios<float, 512><<<dimGrid, dimBlock>>>(Ainv_list, new_row_list, ratios, N, row_stride, iat);
1185 else if (N <= 1024)
1186 calc_ratios<float, 1024><<<dimGrid, dimBlock>>>(Ainv_list, new_row_list, ratios, N, row_stride, iat);
1187 else
1188 {
1189 fprintf(stdout, "Error: N too large for CUDA evaluation.\n");
1190 abort();
1191 }
1192 }
1193
1194 void determinant_ratios_cuda(double* Ainv_list[],
1195 double* new_row_list[],
1196 double* ratios,
1197 int N,
1198 int row_stride,
1199 int iat,
1200 int numWalkers)
1201 {
1202 dim3 dimBlock(N);
1203 dim3 dimGrid(numWalkers);
1204 if (N <= 32)
1205 calc_ratios<double, 32><<<dimGrid, dimBlock>>>(Ainv_list, new_row_list, ratios, N, row_stride, iat);
1206 else if (N <= 64)
1207 calc_ratios<double, 64><<<dimGrid, dimBlock>>>(Ainv_list, new_row_list, ratios, N, row_stride, iat);
1208 else if (N <= 128)
1209 calc_ratios<double, 128><<<dimGrid, dimBlock>>>(Ainv_list, new_row_list, ratios, N, row_stride, iat);
1210 else if (N <= 256)
1211 calc_ratios<double, 256><<<dimGrid, dimBlock>>>(Ainv_list, new_row_list, ratios, N, row_stride, iat);
1212 else if (N <= 512)
1213 calc_ratios<double, 512><<<dimGrid, dimBlock>>>(Ainv_list, new_row_list, ratios, N, row_stride, iat);
1214 else
1215 {
1216 fprintf(stdout, "Error: N too large for CUDA evaluation.\n");
1217 abort();
1218 }
1219 }
1220
1221 #ifdef QMC_COMPLEX
1222 void determinant_ratios_cuda(std::complex<float>* Ainv_list[],
1223 std::complex<float>* new_row_list[],
1224 std::complex<float>* ratios,
1225 int N,
1226 int row_stride,
1227 int iat,
1228 int numWalkers)
1229 {
1230 dim3 dimBlock(N);
1231 dim3 dimGrid(numWalkers);
1232
1233 if (N <= 32)
1234 calc_ratios<thrust::complex<float>, 32><<<dimGrid, dimBlock>>>((thrust::complex<float>**)Ainv_list,
1235 (thrust::complex<float>**)new_row_list,
1236 (thrust::complex<float>*)ratios,
1237 N,
1238 row_stride,
1239 iat);
1240 else if (N <= 64)
1241 calc_ratios<thrust::complex<float>, 64><<<dimGrid, dimBlock>>>((thrust::complex<float>**)Ainv_list,
1242 (thrust::complex<float>**)new_row_list,
1243 (thrust::complex<float>*)ratios,
1244 N,
1245 row_stride,
1246 iat);
1247 else if (N <= 128)
1248 calc_ratios<thrust::complex<float>, 128><<<dimGrid, dimBlock>>>((thrust::complex<float>**)Ainv_list,
1249 (thrust::complex<float>**)new_row_list,
1250 (thrust::complex<float>*)ratios,
1251 N,
1252 row_stride,
1253 iat);
1254 else if (N <= 256)
1255 calc_ratios<thrust::complex<float>, 256><<<dimGrid, dimBlock>>>((thrust::complex<float>**)Ainv_list,
1256 (thrust::complex<float>**)new_row_list,
1257 (thrust::complex<float>*)ratios,
1258 N,
1259 row_stride,
1260 iat);
1261 else if (N <= 512)
1262 calc_ratios<thrust::complex<float>, 512><<<dimGrid, dimBlock>>>((thrust::complex<float>**)Ainv_list,
1263 (thrust::complex<float>**)new_row_list,
1264 (thrust::complex<float>*)ratios,
1265 N,
1266 row_stride,
1267 iat);
1268 else if (N <= 1024)
1269 calc_ratios<thrust::complex<float>, 1024><<<dimGrid, dimBlock>>>((thrust::complex<float>**)Ainv_list,
1270 (thrust::complex<float>**)new_row_list,
1271 (thrust::complex<float>*)ratios,
1272 N,
1273 row_stride,
1274 iat);
1275 else
1276 {
1277 fprintf(stdout, "Error: N too large for CUDA evaluation.\n");
1278 abort();
1279 }
1280 }
1281
1282 void determinant_ratios_cuda(std::complex<double>* Ainv_list[],
1283 std::complex<double>* new_row_list[],
1284 std::complex<double>* ratios,
1285 int N,
1286 int row_stride,
1287 int iat,
1288 int numWalkers)
1289 {
1290 dim3 dimBlock(N);
1291 dim3 dimGrid(numWalkers);
1292
1293 if (N <= 32)
1294 calc_ratios<thrust::complex<double>, 32><<<dimGrid, dimBlock>>>((thrust::complex<double>**)Ainv_list,
1295 (thrust::complex<double>**)new_row_list,
1296 (thrust::complex<double>*)ratios,
1297 N,
1298 row_stride,
1299 iat);
1300 else if (N <= 64)
1301 calc_ratios<thrust::complex<double>, 64><<<dimGrid, dimBlock>>>((thrust::complex<double>**)Ainv_list,
1302 (thrust::complex<double>**)new_row_list,
1303 (thrust::complex<double>*)ratios,
1304 N,
1305 row_stride,
1306 iat);
1307 else if (N <= 128)
1308 calc_ratios<thrust::complex<double>, 128><<<dimGrid, dimBlock>>>((thrust::complex<double>**)Ainv_list,
1309 (thrust::complex<double>**)new_row_list,
1310 (thrust::complex<double>*)ratios,
1311 N,
1312 row_stride,
1313 iat);
1314 else if (N <= 256)
1315 calc_ratios<thrust::complex<double>, 256><<<dimGrid, dimBlock>>>((thrust::complex<double>**)Ainv_list,
1316 (thrust::complex<double>**)new_row_list,
1317 (thrust::complex<double>*)ratios,
1318 N,
1319 row_stride,
1320 iat);
1321 else if (N <= 512)
1322 calc_ratios<thrust::complex<double>, 512><<<dimGrid, dimBlock>>>((thrust::complex<double>**)Ainv_list,
1323 (thrust::complex<double>**)new_row_list,
1324 (thrust::complex<double>*)ratios,
1325 N,
1326 row_stride,
1327 iat);
1328 else
1329 {
1330 fprintf(stdout, "Error: N too large for CUDA evaluation.\n");
1331 abort();
1332 }
1333 }
1334 #endif
1335
1336
1337 template<typename T, int BS>
1338 __global__ void calc_ratio_grad_lapl(T** Ainv_list,
1339 T** new_row_list,
1340 T** grad_lapl_list,
1341 T* ratio_grad_lapl,
1342 int N,
1343 int row_stride,
1344 int elec)
1345 {
1346 int tid = threadIdx.x;
1347 int NB = N / BS + ((N % BS) ? 1 : 0);
1348 __shared__ T *Ainv, *new_row, *grad_lapl;
1349 if (tid == 0)
1350 {
1351 Ainv = Ainv_list[blockIdx.x];
1352 new_row = new_row_list[blockIdx.x];
1353 grad_lapl = grad_lapl_list[blockIdx.x];
1354 }
1355 const int BS1 = BS + 1;
1356 const int BS2 = 2 * BS1;
1357 const int BS3 = 3 * BS1;
1358 const int BS4 = 4 * BS1;
1359 __syncthreads();
1360 #ifdef QMC_COMPLEX
1361 __shared__ uninitialized_array<T, BS> Ainv_colk_shared;
1362 __shared__ uninitialized_array<T, 5 * BS1> ratio_prod;
1363 #else
1364 __shared__ T Ainv_colk_shared[BS];
1365 __shared__ T ratio_prod[5 * BS1];
1366 #endif
1367 ratio_prod[tid] = 0.0f;
1368 ratio_prod[BS1 + tid] = 0.0f;
1369 ratio_prod[BS2 + tid] = 0.0f;
1370 ratio_prod[BS3 + tid] = 0.0f;
1371 ratio_prod[BS4 + tid] = 0.0f;
1372 // This is *highly* uncoalesced, but we just have to eat it to allow
1373 // other kernels to operate quickly.
1374 __syncthreads();
1375 for (int block = 0; block < NB; block++)
1376 {
1377 int col = block * BS + tid;
1378 if (col < N)
1379 Ainv_colk_shared[tid] = Ainv[col * row_stride + elec];
1380 __syncthreads();
1381 if (col < N)
1382 {
1383 ratio_prod[tid] += Ainv_colk_shared[tid] * new_row[col];
1384 ratio_prod[BS1 + tid] += Ainv_colk_shared[tid] * grad_lapl[0 * row_stride + col];
1385 ratio_prod[BS2 + tid] += Ainv_colk_shared[tid] * grad_lapl[1 * row_stride + col];
1386 ratio_prod[BS3 + tid] += Ainv_colk_shared[tid] * grad_lapl[2 * row_stride + col];
1387 ratio_prod[BS4 + tid] += Ainv_colk_shared[tid] * grad_lapl[3 * row_stride + col];
1388 }
1389 __syncthreads();
1390 }
1391 // Now, we have to sum
1392 for (unsigned int s = BS / 2; s > 0; s >>= 1)
1393 {
1394 if (tid < s)
1395 {
1396 ratio_prod[tid] += ratio_prod[tid + s]; // Value
1397 ratio_prod[BS1 + tid] += ratio_prod[BS1 + tid + s]; // grad_x
1398 ratio_prod[BS2 + tid] += ratio_prod[BS2 + tid + s]; // grad_y
1399 ratio_prod[BS3 + tid] += ratio_prod[BS3 + tid + s]; // grad_z
1400 ratio_prod[BS4 + tid] += ratio_prod[BS4 + tid + s]; // lapl
1401 }
1402 __syncthreads();
1403 }
1404 // Subtract off gradient^2 from laplacian
1405 if (tid == 0)
1406 {
1407 ratio_prod[BS4] -=
1408 (ratio_prod[BS1] * ratio_prod[BS1] + ratio_prod[BS2] * ratio_prod[BS2] + ratio_prod[BS3] * ratio_prod[BS3]);
1409 }
1410 __syncthreads();
1411 // Present gradient and laplacian are w.r.t old position. Divide by
1412 // ratio to make it w.r.t. new position
1413 if (tid < 4)
1414 ratio_prod[(tid + 1) * BS1] /= ratio_prod[0];
1415 __syncthreads();
1416 if (tid < 5)
1417 ratio_grad_lapl[5 * blockIdx.x + tid] = ratio_prod[tid * BS1];
1418 }
1419
1420
1421 template<typename T, int BS>
1422 __global__ void calc_ratio_grad_lapl(T** Ainv_list,
1423 T** new_row_list,
1424 T** grad_lapl_list,
1425 T* ratio_grad_lapl,
1426 int N,
1427 int row_stride,
1428 int* elec_list)
1429 {
1430 int tid = threadIdx.x;
1431 int NB = N / BS + ((N % BS) ? 1 : 0);
1432 __shared__ T *Ainv, *new_row, *grad_lapl;
1433 __shared__ int elec;
1434 if (tid == 0)
1435 {
1436 Ainv = Ainv_list[blockIdx.x];
1437 new_row = new_row_list[blockIdx.x];
1438 grad_lapl = grad_lapl_list[blockIdx.x];
1439 elec = elec_list[blockIdx.x];
1440 }
1441 __syncthreads();
1442 const int BS1 = BS + 1;
1443 const int BS2 = 2 * BS1;
1444 const int BS3 = 3 * BS1;
1445 const int BS4 = 4 * BS1;
1446 #ifdef QMC_COMPLEX
1447 __shared__ uninitialized_array<T, BS> Ainv_colk_shared;
1448 __shared__ uninitialized_array<T, 5 * BS1> ratio_prod;
1449 #else
1450 __shared__ T Ainv_colk_shared[BS];
1451 __shared__ T ratio_prod[5 * BS1];
1452 #endif
1453 ratio_prod[tid] = 0.0f;
1454 ratio_prod[BS1 + tid] = 0.0f;
1455 ratio_prod[BS2 + tid] = 0.0f;
1456 ratio_prod[BS3 + tid] = 0.0f;
1457 ratio_prod[BS4 + tid] = 0.0f;
1458 // This is *highly* uncoalesced, but we just have to eat it to allow
1459 // other kernels to operate quickly.
1460 __syncthreads();
1461 for (int block = 0; block < NB; block++)
1462 {
1463 int col = block * BS + tid;
1464 if (col < N)
1465 Ainv_colk_shared[tid] = Ainv[col * row_stride + elec];
1466 __syncthreads();
1467 if (col < N)
1468 {
1469 ratio_prod[tid] += Ainv_colk_shared[tid] * new_row[col];
1470 ratio_prod[BS1 + tid] += Ainv_colk_shared[tid] * grad_lapl[0 * row_stride + col];
1471 ratio_prod[BS2 + tid] += Ainv_colk_shared[tid] * grad_lapl[1 * row_stride + col];
1472 ratio_prod[BS3 + tid] += Ainv_colk_shared[tid] * grad_lapl[2 * row_stride + col];
1473 ratio_prod[BS4 + tid] += Ainv_colk_shared[tid] * grad_lapl[3 * row_stride + col];
1474 }
1475 __syncthreads();
1476 }
1477 // Now, we have to sum
1478 for (unsigned int s = BS / 2; s > 0; s >>= 1)
1479 {
1480 if (tid < s)
1481 {
1482 ratio_prod[tid] += ratio_prod[tid + s]; // Value
1483 ratio_prod[BS1 + tid] += ratio_prod[BS1 + tid + s]; // grad_x
1484 ratio_prod[BS2 + tid] += ratio_prod[BS2 + tid + s]; // grad_y
1485 ratio_prod[BS3 + tid] += ratio_prod[BS3 + tid + s]; // grad_z
1486 ratio_prod[BS4 + tid] += ratio_prod[BS4 + tid + s]; // lapl
1487 }
1488 __syncthreads();
1489 }
1490 // Subtract off gradient^2 from laplacian
1491 if (tid == 0)
1492 {
1493 ratio_prod[BS4] -=
1494 (ratio_prod[BS1] * ratio_prod[BS1] + ratio_prod[BS2] * ratio_prod[BS2] + ratio_prod[BS3] * ratio_prod[BS3]);
1495 }
1496 __syncthreads();
1497 // Present gradient and laplacian are w.r.t old position. Divide by
1498 // ratio to make it w.r.t. new position
1499 if (tid < 4)
1500 ratio_prod[BS1 * (tid + 1)] /= ratio_prod[0];
1501 // Copy data back for every walker
1502 if (tid < 5)
1503 ratio_grad_lapl[5 * blockIdx.x + tid] = ratio_prod[tid * BS1];
1504 }
1505
1506
1507 void determinant_ratios_grad_lapl_cuda(float* Ainv_list[],
1508 float* new_row_list[],
1509 float* grad_lapl_list[],
1510 float ratios_grad_lapl[],
1511 int N,
1512 int row_stride,
1513 int iat,
1514 int numWalkers)
1515 {
1516 const int BS = 32;
1517 dim3 dimBlock(BS);
1518 dim3 dimGrid(numWalkers);
1519 calc_ratio_grad_lapl<float, BS><<<dimGrid, dimBlock, 0, gpu::kernelStream>>>(Ainv_list,
1520 new_row_list,
1521 grad_lapl_list,
1522 ratios_grad_lapl,
1523 N,
1524 row_stride,
1525 iat);
1526 }
1527
1528
1529 void determinant_ratios_grad_lapl_cuda(double* Ainv_list[],
1530 double* new_row_list[],
1531 double* grad_lapl_list[],
1532 double ratios_grad_lapl[],
1533 int N,
1534 int row_stride,
1535 int iat,
1536 int numWalkers)
1537 {
1538 const int BS = 32;
1539 dim3 dimBlock(BS);
1540 dim3 dimGrid(numWalkers);
1541 calc_ratio_grad_lapl<double, BS><<<dimGrid, dimBlock, 0, gpu::kernelStream>>>(Ainv_list,
1542 new_row_list,
1543 grad_lapl_list,
1544 ratios_grad_lapl,
1545 N,
1546 row_stride,
1547 iat);
1548 }
1549
1550
1551 #ifdef QMC_COMPLEX
1552 void determinant_ratios_grad_lapl_cuda(std::complex<float>* Ainv_list[],
1553 std::complex<float>* new_row_list[],
1554 std::complex<float>* grad_lapl_list[],
1555 std::complex<float> ratios_grad_lapl[],
1556 int N,
1557 int row_stride,
1558 int iat,
1559 int numWalkers)
1560 {
1561 const int BS = 32;
1562 dim3 dimBlock(BS);
1563 dim3 dimGrid(numWalkers);
1564
1565 calc_ratio_grad_lapl<thrust::complex<float>, BS>
1566 <<<dimGrid, dimBlock, 0, gpu::kernelStream>>>((thrust::complex<float>**)Ainv_list,
1567 (thrust::complex<float>**)new_row_list,
1568 (thrust::complex<float>**)grad_lapl_list,
1569 (thrust::complex<float>*)ratios_grad_lapl,
1570 N,
1571 row_stride,
1572 iat);
1573 }
1574
1575
1576 void determinant_ratios_grad_lapl_cuda(std::complex<double>* Ainv_list[],
1577 std::complex<double>* new_row_list[],
1578 std::complex<double>* grad_lapl_list[],
1579 std::complex<double> ratios_grad_lapl[],
1580 int N,
1581 int row_stride,
1582 int iat,
1583 int numWalkers)
1584 {
1585 const int BS = 32;
1586 dim3 dimBlock(BS);
1587 dim3 dimGrid(numWalkers);
1588
1589 calc_ratio_grad_lapl<thrust::complex<double>, BS>
1590 <<<dimGrid, dimBlock, 0, gpu::kernelStream>>>((thrust::complex<double>**)Ainv_list,
1591 (thrust::complex<double>**)new_row_list,
1592 (thrust::complex<double>**)grad_lapl_list,
1593 (thrust::complex<double>*)ratios_grad_lapl,
1594 N,
1595 row_stride,
1596 iat);
1597 }
1598 #endif
1599
1600
1601 void determinant_ratios_grad_lapl_cuda(float* Ainv_list[],
1602 float* new_row_list[],
1603 float* grad_lapl_list[],
1604 float ratios_grad_lapl[],
1605 int N,
1606 int row_stride,
1607 int iat_list[],
1608 int numWalkers)
1609 {
1610 const int BS = 32;
1611 dim3 dimBlock(BS);
1612 dim3 dimGrid(numWalkers);
1613 calc_ratio_grad_lapl<float, BS><<<dimGrid, dimBlock, 0, gpu::kernelStream>>>(Ainv_list,
1614 new_row_list,
1615 grad_lapl_list,
1616 ratios_grad_lapl,
1617 N,
1618 row_stride,
1619 iat_list);
1620 }
1621
1622
1623 void determinant_ratios_grad_lapl_cuda(double* Ainv_list[],
1624 double* new_row_list[],
1625 double* grad_lapl_list[],
1626 double ratios_grad_lapl[],
1627 int N,
1628 int row_stride,
1629 int iat_list[],
1630 int numWalkers)
1631 {
1632 const int BS = 32;
1633 dim3 dimBlock(BS);
1634 dim3 dimGrid(numWalkers);
1635 calc_ratio_grad_lapl<double, BS><<<dimGrid, dimBlock, 0, gpu::kernelStream>>>(Ainv_list,
1636 new_row_list,
1637 grad_lapl_list,
1638 ratios_grad_lapl,
1639 N,
1640 row_stride,
1641 iat_list);
1642 }
1643
1644
1645 #ifdef QMC_COMPLEX
1646 void determinant_ratios_grad_lapl_cuda(std::complex<float>* Ainv_list[],
1647 std::complex<float>* new_row_list[],
1648 std::complex<float>* grad_lapl_list[],
1649 std::complex<float> ratios_grad_lapl[],
1650 int N,
1651 int row_stride,
1652 int iat_list[],
1653 int numWalkers)
1654 {
1655 const int BS = 32;
1656 dim3 dimBlock(BS);
1657 dim3 dimGrid(numWalkers);
1658
1659 calc_ratio_grad_lapl<thrust::complex<float>, BS>
1660 <<<dimGrid, dimBlock, 0, gpu::kernelStream>>>((thrust::complex<float>**)Ainv_list,
1661 (thrust::complex<float>**)new_row_list,
1662 (thrust::complex<float>**)grad_lapl_list,
1663 (thrust::complex<float>*)ratios_grad_lapl,
1664 N,
1665 row_stride,
1666 iat_list);
1667 }
1668
1669
1670 void determinant_ratios_grad_lapl_cuda(std::complex<double>* Ainv_list[],
1671 std::complex<double>* new_row_list[],
1672 std::complex<double>* grad_lapl_list[],
1673 std::complex<double> ratios_grad_lapl[],
1674 int N,
1675 int row_stride,
1676 int iat_list[],
1677 int numWalkers)
1678 {
1679 const int BS = 32;
1680 dim3 dimBlock(BS);
1681 dim3 dimGrid(numWalkers);
1682
1683 calc_ratio_grad_lapl<thrust::complex<double>, BS>
1684 <<<dimGrid, dimBlock, 0, gpu::kernelStream>>>((thrust::complex<double>**)Ainv_list,
1685 (thrust::complex<double>**)new_row_list,
1686 (thrust::complex<double>**)grad_lapl_list,
1687 (thrust::complex<double>*)ratios_grad_lapl,
1688 N,
1689 row_stride,
1690 iat_list);
1691 }
1692 #endif
1693
1694
1695 template<typename T, int BS>
1696 __global__ void calc_grad_kernel(T** Ainv_list, T** grad_lapl_list, T* grad, int N, int row_stride, int elec)
1697 {
1698 int tid = threadIdx.x;
1699 int NB = N / BS + ((N % BS) ? 1 : 0);
1700 __shared__ T *Ainv, *grad_lapl;
1701 if (tid == 0)
1702 {
1703 Ainv = Ainv_list[blockIdx.x];
1704 grad_lapl = grad_lapl_list[blockIdx.x] + 4 * elec * row_stride;
1705 }
1706 __syncthreads();
1707 const int BS1 = BS + 1;
1708 const int BS2 = 2 * BS1;
1709 #ifdef QMC_COMPLEX
1710 __shared__ uninitialized_array<T, BS> Ainv_colk_shared;
1711 __shared__ uninitialized_array<T, 3 * BS1> ratio_prod;
1712 #else
1713 __shared__ T Ainv_colk_shared[BS];
1714 __shared__ T ratio_prod[3 * BS1];
1715 #endif
1716 ratio_prod[tid] = 0.0f;
1717 ratio_prod[BS1 + tid] = 0.0f;
1718 ratio_prod[BS2 + tid] = 0.0f;
1719   // This is *highly* uncoalesced, but we just have to eat it to allow
1720 // other kernels to operate quickly.
1721 __syncthreads();
1722 for (int block = 0; block < NB; block++)
1723 {
1724 int col = block * BS + tid;
1725 if (col < N)
1726 Ainv_colk_shared[tid] = Ainv[col * row_stride + elec];
1727 __syncthreads();
1728 if (col < N)
1729 {
1730 ratio_prod[tid] += Ainv_colk_shared[tid] * grad_lapl[0 * row_stride + col];
1731 ratio_prod[BS1 + tid] += Ainv_colk_shared[tid] * grad_lapl[1 * row_stride + col];
1732 ratio_prod[BS2 + tid] += Ainv_colk_shared[tid] * grad_lapl[2 * row_stride + col];
1733 }
1734 __syncthreads();
1735 }
1736 // Now, we have to sum
1737 for (unsigned int s = BS / 2; s > 0; s >>= 1)
1738 {
1739 if (tid < s)
1740 {
1741 ratio_prod[tid] += ratio_prod[tid + s]; // grad_x
1742 ratio_prod[BS1 + tid] += ratio_prod[BS1 + tid + s]; // grad_y
1743 ratio_prod[BS2 + tid] += ratio_prod[BS2 + tid + s]; // grad_z
1744 }
1745 __syncthreads();
1746 }
1747 if (tid < 3)
1748 grad[3 * blockIdx.x + tid] = ratio_prod[tid * BS1];
1749 }
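// Illustrative host-side reference for calc_grad_kernel above -- a sketch for
// reading the kernel, not called anywhere in the GPU path, and the helper name
// is hypothetical.  For one walker it accumulates the dot product of the
// elec-th column of Ainv with the x/y/z rows of that electron's grad_lapl
// block, i.e. grad[d] = sum_col Ainv[col][elec] * grad_lapl[4*elec + d][col].
template<typename T>
static inline void calc_gradient_cpu_reference(const T* Ainv,      // N x row_stride inverse matrix
                                               const T* grad_lapl, // 4 rows of length row_stride per electron
                                               T grad[3],
                                               int N,
                                               int row_stride,
                                               int elec)
{
  const T* gl = grad_lapl + 4 * elec * row_stride; // same per-electron offset the kernel applies
  for (int d = 0; d < 3; d++)
  {
    T sum = T();
    for (int col = 0; col < N; col++)
      sum += Ainv[col * row_stride + elec] * gl[d * row_stride + col];
    grad[d] = sum;
  }
}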
1750
1751 void calc_gradient(float* Ainv_list[],
1752 float* grad_lapl_list[],
1753 float grad[],
1754 int N,
1755 int row_stride,
1756 int elec,
1757 int numWalkers)
1758 {
1759 const int BS = 32;
1760 dim3 dimBlock(BS);
1761 dim3 dimGrid(numWalkers);
1762 calc_grad_kernel<float, BS>
1763 <<<dimGrid, dimBlock, 0, gpu::kernelStream>>>(Ainv_list, grad_lapl_list, grad, N, row_stride, elec);
1764 }
1765
1766 void calc_gradient(double* Ainv_list[],
1767 double* grad_lapl_list[],
1768 double grad[],
1769 int N,
1770 int row_stride,
1771 int elec,
1772 int numWalkers)
1773 {
1774 const int BS = 32;
1775 dim3 dimBlock(BS);
1776 dim3 dimGrid(numWalkers);
1777 calc_grad_kernel<double, BS>
1778 <<<dimGrid, dimBlock, 0, gpu::kernelStream>>>(Ainv_list, grad_lapl_list, grad, N, row_stride, elec);
1779 }
1780
1781 #ifdef QMC_COMPLEX
1782 void calc_gradient(std::complex<float>* Ainv_list[],
1783 std::complex<float>* grad_lapl_list[],
1784 std::complex<float> grad[],
1785 int N,
1786 int row_stride,
1787 int elec,
1788 int numWalkers)
1789 {
1790 const int BS = 32;
1791 dim3 dimBlock(BS);
1792 dim3 dimGrid(numWalkers);
1793
1794 calc_grad_kernel<thrust::complex<float>, BS>
1795 <<<dimGrid, dimBlock, 0, gpu::kernelStream>>>((thrust::complex<float>**)Ainv_list,
1796 (thrust::complex<float>**)grad_lapl_list,
1797 (thrust::complex<float>*)grad,
1798 N,
1799 row_stride,
1800 elec);
1801 }
1802
1803 void calc_gradient(std::complex<double>* Ainv_list[],
1804 std::complex<double>* grad_lapl_list[],
1805 std::complex<double> grad[],
1806 int N,
1807 int row_stride,
1808 int elec,
1809 int numWalkers)
1810 {
1811 const int BS = 32;
1812 dim3 dimBlock(BS);
1813 dim3 dimGrid(numWalkers);
1814
1815 calc_grad_kernel<thrust::complex<double>, BS>
1816 <<<dimGrid, dimBlock, 0, gpu::kernelStream>>>((thrust::complex<double>**)Ainv_list,
1817 (thrust::complex<double>**)grad_lapl_list,
1818 (thrust::complex<double>*)grad,
1819 N,
1820 row_stride,
1821 elec);
1822 }
1823 #endif
1824
1825
1826 #define RATIO_BS 16
1827
1828 template<typename T>
1829 __global__ void all_ratios_kernel(T** Ainv_list, T** new_mat_list, T** ratio_list, int N, int row_stride)
1830 {
1831 __shared__ T *Ainv, *new_mat, *ratio;
1832 if (threadIdx.x == 0 && threadIdx.y == 0)
1833 {
1834 Ainv = Ainv_list[blockIdx.x];
1835 new_mat = new_mat_list[blockIdx.x];
1836 ratio = ratio_list[blockIdx.x];
1837 }
1838 __shared__ T Ainv_block[RATIO_BS][RATIO_BS + 1];
1839 // __shared__ T new_block[RATIO_BS][RATIO_BS+1];
1840 __shared__ T ratio_block[RATIO_BS][RATIO_BS + 1];
1841 unsigned int numBlocks = N >> 4;
1842 if (N & 15)
1843 numBlocks++;
1844 for (unsigned int yBlock = 0; yBlock < numBlocks; yBlock++)
1845 {
1846 ratio_block[threadIdx.y][threadIdx.x] = 0.0f;
1847 __syncthreads();
1848 for (unsigned int xBlock = 0; xBlock < numBlocks; xBlock++)
1849 {
1850 unsigned int xIndex = yBlock * RATIO_BS + threadIdx.x;
1851 unsigned int yIndex = xBlock * RATIO_BS + threadIdx.y;
1852 unsigned int index = yIndex * row_stride + xIndex;
1853 if ((xIndex < N) && (yIndex < N))
1854 Ainv_block[threadIdx.x][threadIdx.y] = Ainv[index];
1855 __syncthreads();
1856 xIndex = xBlock * RATIO_BS + threadIdx.x;
1857 yIndex = yBlock * RATIO_BS + threadIdx.y;
1858 index = yIndex * row_stride + xIndex;
1859 if ((xIndex < N) && (yIndex < N))
1860 ratio_block[threadIdx.y][threadIdx.x] += new_mat[index] * Ainv_block[threadIdx.y][threadIdx.x];
1861 __syncthreads();
1862 }
1863 __syncthreads();
1864 // Now, we have to do the reduction across the ratio_blocks
1865 if (threadIdx.x < 8)
1866 ratio_block[threadIdx.y][threadIdx.x] += ratio_block[threadIdx.y][threadIdx.x + 8];
1867 if (threadIdx.x < 4)
1868 ratio_block[threadIdx.y][threadIdx.x] += ratio_block[threadIdx.y][threadIdx.x + 4];
1869 if (threadIdx.x < 2)
1870 ratio_block[threadIdx.y][threadIdx.x] += ratio_block[threadIdx.y][threadIdx.x + 2];
1871 if (threadIdx.x < 1)
1872 ratio_block[threadIdx.y][threadIdx.x] += ratio_block[threadIdx.y][threadIdx.x + 1];
1873 __syncthreads();
1874 if (threadIdx.y == 0 && (yBlock * RATIO_BS + threadIdx.x) < N)
1875 ratio[yBlock * RATIO_BS + threadIdx.x] = ratio_block[threadIdx.x][0];
1876 }
1877 }
1878
1879
1880 void calc_all_ratios(float* Ainv_list[],
1881 float* new_mat_list[],
1882 float* ratio_list[],
1883 int N,
1884 int row_stride,
1885 int num_mats)
1886 {
1887 dim3 dimBlock(RATIO_BS, RATIO_BS);
1888 dim3 dimGrid(num_mats);
1889 all_ratios_kernel<float><<<dimGrid, dimBlock>>>(Ainv_list, new_mat_list, ratio_list, N, row_stride);
1890 }
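// Host-side sketch of what all_ratios_kernel computes for a single matrix
// (illustrative only; the helper name is hypothetical and the function is not
// used by the GPU path):  ratio[i] = sum_j new_mat[i][j] * Ainv[j][i],
// i.e. the diagonal of new_mat * Ainv, one determinant ratio per row.
template<typename T>
static inline void calc_all_ratios_cpu_reference(const T* Ainv, const T* new_mat, T* ratio, int N, int row_stride)
{
  for (int i = 0; i < N; i++)
  {
    T sum = T();
    for (int j = 0; j < N; j++)
      sum += new_mat[i * row_stride + j] * Ainv[j * row_stride + i];
    ratio[i] = sum;
  }
}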
1891
1892
1893 const int MAX_RATIO_ROWS = 20;
1894
1895 template<typename T, int BS>
1896 __global__ void calc_many_ratios_kernel(T** Ainv_list,
1897 T** new_row_list,
1898 T** ratio_list,
1899 int* num_ratio_list,
1900 int N,
1901 int row_stride,
1902 int* elec_list)
1903 {
1904 int tid = threadIdx.x;
1905 __shared__ T *Ainv, *new_rows, *ratios;
1906 __shared__ int num_ratios, elec;
1907 if (tid == 0)
1908 {
1909 Ainv = Ainv_list[blockIdx.x];
1910 new_rows = new_row_list[blockIdx.x];
1911 num_ratios = num_ratio_list[blockIdx.x];
1912 ratios = ratio_list[blockIdx.x];
1913 elec = elec_list[blockIdx.x];
1914 }
1915 __syncthreads();
1916 int NB = N / BS + ((N % BS) ? 1 : 0);
1917 #ifdef QMC_COMPLEX
1918 __shared__ uninitialized_array<T, BS> Ainv_shared;
1919 #else
1920 __shared__ T Ainv_shared[BS];
1921 #endif
1922 const int BS1 = BS + 1;
1923 // __shared__ T row[BS];
1924 // We use BS+1 to avoid bank conflicts in the writing.
1925 #ifdef QMC_COMPLEX
1926 __shared__ uninitialized_array<T, MAX_RATIO_ROWS * BS1> ratio_sum;
1927 #else
1928 __shared__ T ratio_sum[MAX_RATIO_ROWS * BS1];
1929 #endif
1930 for (int iratio = 0; iratio < num_ratios; iratio++)
1931 ratio_sum[iratio * BS1 + tid] = (T)0.0;
1932 __syncthreads();
1933 for (int block = 0; block < NB; block++)
1934 {
1935 int off = block * BS + tid;
1936 bool mask = off < N;
1937 if (mask)
1938 Ainv_shared[tid] = Ainv[off * row_stride + elec];
1939 __syncthreads();
1940 for (int iratio = 0; iratio < num_ratios; iratio++)
1941 if (mask)
1942 ratio_sum[iratio * BS1 + tid] += Ainv_shared[tid] * new_rows[iratio * row_stride + off];
1943 __syncthreads();
1944 }
1945 // now, sum up ratios
1946 for (int iratio = 0; iratio < num_ratios; iratio++)
1947 {
1948 for (int s = BS >> 1; s > 0; s >>= 1)
1949 {
1950 if (tid < s)
1951 ratio_sum[iratio * BS1 + tid] += ratio_sum[iratio * BS1 + tid + s];
1952 __syncthreads();
1953 }
1954 }
1955 // Store sums in parallel
1956 if (tid < num_ratios)
1957 ratios[tid] = ratio_sum[tid * BS1];
1958 }
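// Host-side sketch of calc_many_ratios_kernel for one walker (illustrative
// only; the helper name is hypothetical).  Each ratio is the dot product of a
// proposed row with the elec-th column of Ainv, which is why the kernel keeps
// that column in shared memory and streams the candidate rows past it.
template<typename T>
static inline void calc_many_ratios_cpu_reference(const T* Ainv,
                                                  const T* new_rows, // num_ratios rows of length row_stride
                                                  T* ratios,
                                                  int num_ratios,
                                                  int N,
                                                  int row_stride,
                                                  int elec)
{
  for (int ir = 0; ir < num_ratios; ir++)
  {
    T sum = T();
    for (int j = 0; j < N; j++)
      sum += Ainv[j * row_stride + elec] * new_rows[ir * row_stride + j];
    ratios[ir] = sum;
  }
}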
1959
1960 void calc_many_ratios(float* Ainv_list[],
1961 float* new_row_list[],
1962 float* ratio_list[],
1963 int num_ratio_list[],
1964 int N,
1965 int row_stride,
1966 int elec_list[],
1967 int numWalkers)
1968 {
1969 const int BS = 32;
1970 dim3 dimBlock(BS);
1971 dim3 dimGrid(numWalkers);
1972 calc_many_ratios_kernel<float, BS>
1973 <<<dimGrid, dimBlock>>>(Ainv_list, new_row_list, ratio_list, num_ratio_list, N, row_stride, elec_list);
1974 }
1975
1976 void calc_many_ratios(double* Ainv_list[],
1977 double* new_row_list[],
1978 double* ratio_list[],
1979 int num_ratio_list[],
1980 int N,
1981 int row_stride,
1982 int elec_list[],
1983 int numWalkers)
1984 {
1985 const int BS = 32;
1986 dim3 dimBlock(BS);
1987 dim3 dimGrid(numWalkers);
1988 calc_many_ratios_kernel<double, BS>
1989 <<<dimGrid, dimBlock>>>(Ainv_list, new_row_list, ratio_list, num_ratio_list, N, row_stride, elec_list);
1990 }
1991
1992
1993 #ifdef QMC_COMPLEX
1994 void calc_many_ratios(std::complex<float>* Ainv_list[],
1995 std::complex<float>* new_row_list[],
1996 std::complex<float>* ratio_list[],
1997 int num_ratio_list[],
1998 int N,
1999 int row_stride,
2000 int elec_list[],
2001 int numWalkers)
2002 {
2003 const int BS = 32;
2004 dim3 dimBlock(BS);
2005 dim3 dimGrid(numWalkers);
2006
2007 calc_many_ratios_kernel<thrust::complex<float>, BS><<<dimGrid, dimBlock>>>((thrust::complex<float>**)(Ainv_list),
2008 (thrust::complex<float>**)new_row_list,
2009 (thrust::complex<float>**)ratio_list,
2010 num_ratio_list,
2011 N,
2012 row_stride,
2013 elec_list);
2014 }
2015
2016
2017 void calc_many_ratios(std::complex<double>* Ainv_list[],
2018 std::complex<double>* new_row_list[],
2019 std::complex<double>* ratio_list[],
2020 int num_ratio_list[],
2021 int N,
2022 int row_stride,
2023 int elec_list[],
2024 int numWalkers)
2025 {
2026 const int BS = 32;
2027 dim3 dimBlock(BS);
2028 dim3 dimGrid(numWalkers);
2029
2030 calc_many_ratios_kernel<thrust::complex<double>, BS><<<dimGrid, dimBlock>>>((thrust::complex<double>**)Ainv_list,
2031 (thrust::complex<double>**)new_row_list,
2032 (thrust::complex<double>**)ratio_list,
2033 num_ratio_list,
2034 N,
2035 row_stride,
2036 elec_list);
2037 }
2038 #endif
2039
2040
2041 #define SCALE_BS 64
2042
2043 //__constant__ float GGt[3][3];
2044
2045
2046 template<typename T>
2047 __global__ void scale_grad_lapl_kernel(T** grad_list, T** hess_list, T** grad_lapl_list, T* Linv, int N)
2048 {
2049 __shared__ T gradBlock[3][SCALE_BS];
2050 __shared__ T hessBlock[6][SCALE_BS];
2051 // __shared__ float outBlock [4][SCALE_BS];
2052 __shared__ T G[3][3], GGt[3][3];
2053 __shared__ T *grad, *hess, *out;
2054 if (threadIdx.x == 0)
2055 {
2056 grad = grad_list[blockIdx.y];
2057 hess = hess_list[blockIdx.y];
2058 out = grad_lapl_list[blockIdx.y];
2059 }
2060 int i = threadIdx.x / 3;
2061 int j = threadIdx.x % 3;
2062 if (threadIdx.x < 9)
2063 G[i][j] = Linv[threadIdx.x];
2064 __syncthreads();
2065 if (threadIdx.x < 9)
2066 {
2067 GGt[i][j] = (G[i][0] * G[0][j] + G[i][1] * G[1][j] + G[i][2] * G[2][j]);
2068 }
2069 // Load the gradients into shared memory
2070 for (int i = 0; i < 3; i++)
2071 {
2072 unsigned int gIndex = (3 * blockIdx.x + i) * SCALE_BS + threadIdx.x;
2073 if (gIndex < 3 * N)
2074 gradBlock[i][threadIdx.x] = grad[gIndex];
2075 }
2076 // Load the hessian into shared memory
2077 for (int i = 0; i < 6; i++)
2078 {
2079 unsigned int hIndex = (6 * blockIdx.x + i) * SCALE_BS + threadIdx.x;
2080 if (hIndex < 6 * N)
2081 hessBlock[i][threadIdx.x] = hess[hIndex];
2082 }
2083 // Now, loop through the rows that I own and compute the
2084 // dimensioned gradients and laplacians from the
2085 // dimensionless gradients and Hessians.
2086 int row = blockIdx.x * SCALE_BS;
2087 T val;
2088 // x component of gradient
2089 val =
2090 (G[0][0] * gradBlock[0][threadIdx.x] + G[0][1] * gradBlock[1][threadIdx.x] + G[0][2] * gradBlock[2][threadIdx.x]);
2091 out[row + 0 * N + threadIdx.x] = val;
2092 // y component of gradient
2093 val =
2094 (G[1][0] * gradBlock[0][threadIdx.x] + G[1][1] * gradBlock[1][threadIdx.x] + G[1][2] * gradBlock[2][threadIdx.x]);
2095 out[row + 1 * N + threadIdx.x] = val;
2096 // z component of gradient
2097 val =
2098 (G[2][0] * gradBlock[0][threadIdx.x] + G[2][1] * gradBlock[1][threadIdx.x] + G[2][2] * gradBlock[2][threadIdx.x]);
2099 out[row + 2 * N + threadIdx.x] = val;
2100 // Hessian = H00 H01 H02 H11 H12 H22
2101 // Matrix = [0 1 2]
2102 // [1 3 4]
2103 // [2 4 5]
2104 // laplacian = Trace(GGt*Hessian)
2105 val = (GGt[0][0] * hessBlock[0][threadIdx.x] + GGt[0][1] * hessBlock[1][threadIdx.x] +
2106 GGt[0][2] * hessBlock[2][threadIdx.x] + GGt[1][0] * hessBlock[1][threadIdx.x] +
2107 GGt[1][1] * hessBlock[3][threadIdx.x] + GGt[1][2] * hessBlock[4][threadIdx.x] +
2108 GGt[2][0] * hessBlock[2][threadIdx.x] + GGt[2][1] * hessBlock[4][threadIdx.x] +
2109 GGt[2][2] * hessBlock[5][threadIdx.x]);
2110 out[row + 3 * N + threadIdx.x] = val;
2111 }
2112
2113
2114 // This function reads the vectors pointed to by grad_list and
2115 // hess_list.  These are in memory as
2116 // [grad0_x grad0_y grad0_z grad1_x grad1_y ... ] and
2117 // [hess0_xx hess0_xy hess0_xz hess0_yy hess0_yz hess0_zz ...]
2118 // It then writes the data into memory as
2119 // [grad0_x grad1_x ... grad(N-1)_x grad0_y ... grad(N-1)_y
2120 //  grad0_z ... grad(N-1)_z lapl0 lapl1 ...]
2121
2122 void scale_grad_lapl(float* grad_list[],
2123 float* hess_list[],
2124 float* grad_lapl_list[],
2125 float Linv[],
2126 int N,
2127 int num_walkers)
2128 {
2129 dim3 dimBlock(SCALE_BS);
2130 dim3 dimGrid(N / SCALE_BS, num_walkers);
2131 if (N % SCALE_BS)
2132 dimGrid.x++;
2133 scale_grad_lapl_kernel<float><<<dimGrid, dimBlock>>>(grad_list, hess_list, grad_lapl_list, Linv, N);
2134 }
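// Per-particle host sketch of the arithmetic in scale_grad_lapl_kernel
// (illustrative only; the helper name is hypothetical and the shared-memory
// staging / output layout handled by the kernel is omitted).  Given the 3x3
// matrix G loaded row-major from Linv, the dimensionless gradient g, and the
// packed Hessian h = (xx xy xz yy yz zz), it forms grad_out = G*g and
// lapl_out = Trace(GGt*H), with GGt built exactly as in the kernel above.
static inline void scale_one_grad_lapl_reference(const float G[3][3],
                                                 const float g[3],
                                                 const float h[6],
                                                 float grad_out[3],
                                                 float& lapl_out)
{
  float GGt[3][3];
  for (int i = 0; i < 3; i++)
    for (int j = 0; j < 3; j++)
      GGt[i][j] = G[i][0] * G[0][j] + G[i][1] * G[1][j] + G[i][2] * G[2][j];
  for (int d = 0; d < 3; d++)
    grad_out[d] = G[d][0] * g[0] + G[d][1] * g[1] + G[d][2] * g[2];
  // Hessian packing: H = [h0 h1 h2; h1 h3 h4; h2 h4 h5]
  lapl_out = GGt[0][0] * h[0] + GGt[0][1] * h[1] + GGt[0][2] * h[2] +
             GGt[1][0] * h[1] + GGt[1][1] * h[3] + GGt[1][2] * h[4] +
             GGt[2][0] * h[2] + GGt[2][1] * h[4] + GGt[2][2] * h[5];
}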
2135
2136
2137 template<typename T, int BS>
2138 __global__ void all_ratios_grad_lapl_kernel(T** Ainv_list, T** grad_lapl_list, T** out_list, int N, int row_stride)
2139 {
2140 __shared__ T *Ainv, *gl_array, *out;
2141 if (threadIdx.x == 0 && threadIdx.y == 0)
2142 {
2143 Ainv = Ainv_list[blockIdx.x];
2144 gl_array = grad_lapl_list[blockIdx.x];
2145 out = out_list[blockIdx.x];
2146 }
2147 __syncthreads();
2148 #ifdef QMC_COMPLEX
2149 __shared__ uninitialized_array<T, BS*(BS + 1)> Ainv_block;
2150 __shared__ uninitialized_array<T, BS*(BS + 1)> grad_lapl_block[4];
2151 #else
2152 __shared__ T Ainv_block[BS * (BS + 1)];
2153 __shared__ T grad_lapl_block[4][BS * (BS + 1)];
2154 #endif
2155 const unsigned int numBlocks = (N + BS - 1) / BS;
2156 const unsigned int index_local = threadIdx.y * (BS + 1) + threadIdx.x;
2157 for (unsigned int yBlock = 0; yBlock < numBlocks; yBlock++)
2158 {
2159 grad_lapl_block[0][index_local] = 0.0f;
2160 grad_lapl_block[1][index_local] = 0.0f;
2161 grad_lapl_block[2][index_local] = 0.0f;
2162 grad_lapl_block[3][index_local] = 0.0f;
2163 for (unsigned int xBlock = 0; xBlock < numBlocks; xBlock++)
2164 {
2165 unsigned int xIndex = yBlock * BS + threadIdx.x;
2166 unsigned int yIndex = xBlock * BS + threadIdx.y;
2167 unsigned int index = yIndex * row_stride + xIndex;
2168
2169 if ((xIndex < N) && (yIndex < N))
2170 Ainv_block[threadIdx.x * (BS + 1) + threadIdx.y] = Ainv[index];
2171 __syncthreads();
2172 xIndex = xBlock * BS + threadIdx.x;
2173 yIndex = yBlock * BS + threadIdx.y;
2174 index = 4 * yIndex * row_stride + xIndex;
2175 if ((xIndex < N) && (yIndex < N))
2176 {
2177 grad_lapl_block[0][index_local] += gl_array[index + 0 * row_stride] * Ainv_block[index_local];
2178 grad_lapl_block[1][index_local] += gl_array[index + 1 * row_stride] * Ainv_block[index_local];
2179 grad_lapl_block[2][index_local] += gl_array[index + 2 * row_stride] * Ainv_block[index_local];
2180 grad_lapl_block[3][index_local] += gl_array[index + 3 * row_stride] * Ainv_block[index_local];
2181 }
2182 __syncthreads();
2183 }
2184 // Now, we have to do the reduction across the lapl_blocks
2185 for (int s = BS >> 1; s > 0; s >>= 1)
2186 {
2187 if (threadIdx.x < s)
2188 {
2189 grad_lapl_block[0][index_local] += grad_lapl_block[0][index_local + s];
2190 grad_lapl_block[1][index_local] += grad_lapl_block[1][index_local + s];
2191 grad_lapl_block[2][index_local] += grad_lapl_block[2][index_local + s];
2192 grad_lapl_block[3][index_local] += grad_lapl_block[3][index_local + s];
2193 }
2194 __syncthreads();
2195 }
2196 // unsigned int yIndex = yBlock * RATIO_BS + threadIdx.x;
2197 // if (threadIdx.y == 0 && yIndex < N) {
2198 // out[4*yIndex+0] = grad_lapl_block[0][threadIdx.x][0];
2199 // out[4*yIndex+1] = grad_lapl_block[1][threadIdx.x][0];
2200 // out[4*yIndex+2] = grad_lapl_block[2][threadIdx.x][0];
2201 // out[4*yIndex+3] = grad_lapl_block[3][threadIdx.x][0];
2202 // }
2203 //unsigned int yIndex = 4*yBlock*RATIO_BS + 4*threadIdx.y + threadIdx.x;
2204 unsigned int ix = BS * threadIdx.y + threadIdx.x;
2205 unsigned int yIndex = BS * yBlock + (ix >> 2);
2206 if (ix < BS * 4 && yIndex < N)
2207 out[4 * BS * yBlock + ix] = grad_lapl_block[ix & 3][(ix >> 2) * (BS + 1)];
2208 // IMPORTANT!!!
2209 __syncthreads();
2210 }
2211 }
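// Host-side sketch of what all_ratios_grad_lapl_kernel produces for one matrix
// (illustrative only; the helper name is hypothetical).  For each row i the
// output packs [ratio_gradx ratio_grady ratio_gradz ratio_lapl] contiguously:
//   out[4*i + k] = sum_j gl[(4*i + k) * row_stride + j] * Ainv[j * row_stride + i]
// which is the same sum test_all_grad_lapl_kernel() checks on the CPU further below.
template<typename T>
static inline void calc_grad_lapl_cpu_reference(const T* Ainv, const T* gl, T* out, int N, int row_stride)
{
  for (int i = 0; i < N; i++)
    for (int k = 0; k < 4; k++)
    {
      T sum = T();
      for (int j = 0; j < N; j++)
        sum += gl[(4 * i + k) * row_stride + j] * Ainv[j * row_stride + i];
      out[4 * i + k] = sum;
    }
}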
2212
2213 void calc_grad_lapl(float* Ainv_list[], float* grad_lapl_list[], float* out_list[], int N, int row_stride, int num_mats)
2214 {
2215 dim3 dimBlock(RATIO_BS, RATIO_BS);
2216 dim3 dimGrid(num_mats);
2217 all_ratios_grad_lapl_kernel<float, RATIO_BS>
2218 <<<dimGrid, dimBlock>>>(Ainv_list, grad_lapl_list, out_list, N, row_stride);
2219 }
2220
2221
2222 void calc_grad_lapl(double* Ainv_list[],
2223 double* grad_lapl_list[],
2224 double* out_list[],
2225 int N,
2226 int row_stride,
2227 int num_mats)
2228 {
2229 dim3 dimBlock(RATIO_BS, RATIO_BS);
2230 dim3 dimGrid(num_mats);
2231 all_ratios_grad_lapl_kernel<double, RATIO_BS>
2232 <<<dimGrid, dimBlock>>>(Ainv_list, grad_lapl_list, out_list, N, row_stride);
2233 }
2234
2235
2236 #ifdef QMC_COMPLEX
2237 void calc_grad_lapl(std::complex<float>* Ainv_list[],
2238 std::complex<float>* grad_lapl_list[],
2239 std::complex<float>* out_list[],
2240 int N,
2241 int row_stride,
2242 int num_mats)
2243 {
2244 dim3 dimBlock(RATIO_BS, RATIO_BS);
2245 dim3 dimGrid(num_mats);
2246
2247 all_ratios_grad_lapl_kernel<thrust::complex<float>, RATIO_BS>
2248 <<<dimGrid, dimBlock>>>((thrust::complex<float>**)Ainv_list,
2249 (thrust::complex<float>**)grad_lapl_list,
2250 (thrust::complex<float>**)out_list,
2251 N,
2252 row_stride);
2253 }
2254
2255
2256 void calc_grad_lapl(std::complex<double>* Ainv_list[],
2257 std::complex<double>* grad_lapl_list[],
2258 std::complex<double>* out_list[],
2259 int N,
2260 int row_stride,
2261 int num_mats)
2262 {
2263 dim3 dimBlock(RATIO_BS, RATIO_BS);
2264 dim3 dimGrid(num_mats);
2265
2266 all_ratios_grad_lapl_kernel<thrust::complex<double>, RATIO_BS>
2267 <<<dimGrid, dimBlock>>>((thrust::complex<double>**)Ainv_list,
2268 (thrust::complex<double>**)grad_lapl_list,
2269 (thrust::complex<double>**)out_list,
2270 N,
2271 row_stride);
2272 }
2273 #endif
2274
2275 #define COPY_BS 256
2276
2277 template<typename T>
2278 __global__ void multi_copy(T** dest, T** src, int len)
2279 {
2280 __shared__ T *mysrc, *mydest;
2281 if (threadIdx.x == 0)
2282 {
2283 mysrc = src[blockIdx.y];
2284 mydest = dest[blockIdx.y];
2285 }
2286 __syncthreads();
2287 int i = blockIdx.x * COPY_BS + threadIdx.x;
2288 if (i < len)
2289 mydest[i] = mysrc[i];
2290 }
2291
2292
2293 template<typename T>
2294 __global__ void multi_copy(T** buff, int dest_off, int src_off, int len)
2295 {
2296 __shared__ T *mysrc, *mydest;
2297 if (threadIdx.x == 0)
2298 {
2299 T* ptr = buff[blockIdx.y];
2300 mysrc = ptr + src_off;
2301 mydest = ptr + dest_off;
2302 }
2303 __syncthreads();
2304 int i = blockIdx.x * COPY_BS + threadIdx.x;
2305 if (i < len)
2306 mydest[i] = mysrc[i];
2307 }
2308
2309
2310 void multi_copy(float* dest[], float* src[], int len, int num)
2311 {
2312 dim3 dimBlock(COPY_BS);
2313 dim3 dimGrid((len + COPY_BS - 1) / COPY_BS, num);
2314 multi_copy<float><<<dimGrid, dimBlock>>>(dest, src, len);
2315 }
2316
2317 void multi_copy(double* dest[], double* src[], int len, int num)
2318 {
2319 dim3 dimBlock(COPY_BS);
2320 dim3 dimGrid(len / COPY_BS, num);
2321 if (len % COPY_BS)
2322 dimGrid.x++;
2323 multi_copy<double><<<dimGrid, dimBlock>>>(dest, src, len);
2324 }
2325
2326
2327 #ifdef QMC_COMPLEX
2328 void multi_copy(std::complex<float>* dest[], std::complex<float>* src[], int len, int num)
2329 {
2330 dim3 dimBlock(COPY_BS);
2331 dim3 dimGrid((len + COPY_BS - 1) / COPY_BS, num);
2332
2333 multi_copy<thrust::complex<float>>
2334 <<<dimGrid, dimBlock>>>((thrust::complex<float>**)dest, (thrust::complex<float>**)src, len);
2335 }
2336
2337 void multi_copy(std::complex<double>* dest[], std::complex<double>* src[], int len, int num)
2338 {
2339 dim3 dimBlock(COPY_BS);
2340 dim3 dimGrid((len + COPY_BS - 1) / COPY_BS, num);
2341
2342 multi_copy<thrust::complex<double>>
2343 <<<dimGrid, dimBlock>>>((thrust::complex<double>**)dest, (thrust::complex<double>**)src, len);
2344 }
2345 #endif
2346
2347
2348 void multi_copy(float* buff[], int dest_off, int src_off, int len, int num)
2349 {
2350 dim3 dimBlock(COPY_BS);
2351 dim3 dimGrid((len + COPY_BS - 1) / COPY_BS, num);
2352 multi_copy<float><<<dimGrid, dimBlock>>>(buff, dest_off, src_off, len);
2353 }
2354
2355 void multi_copy(double* buff[], int dest_off, int src_off, int len, int num)
2356 {
2357 dim3 dimBlock(COPY_BS);
2358 dim3 dimGrid(len / COPY_BS, num);
2359 if (len % COPY_BS)
2360 dimGrid.x++;
2361 multi_copy<double><<<dimGrid, dimBlock>>>(buff, dest_off, src_off, len);
2362 }
2363
2364
2365 #ifdef QMC_COMPLEX
2366 void multi_copy(std::complex<float>* buff[], int dest_off, int src_off, int len, int num)
2367 {
2368 dim3 dimBlock(COPY_BS);
2369 dim3 dimGrid((len + COPY_BS - 1) / COPY_BS, num);
2370
2371 multi_copy<thrust::complex<float>><<<dimGrid, dimBlock>>>((thrust::complex<float>**)buff, dest_off, src_off, len);
2372 }
2373
2374 void multi_copy(std::complex<double>* buff[], int dest_off, int src_off, int len, int num)
2375 {
2376 dim3 dimBlock(COPY_BS);
2377 dim3 dimGrid((len + COPY_BS - 1) / COPY_BS, num);
2378
2379 multi_copy<thrust::complex<double>><<<dimGrid, dimBlock>>>((thrust::complex<double>**)buff, dest_off, src_off, len);
2380 }
2381 #endif
2382
2383
2384 #include <stdlib.h>
2385 #include <time.h>
2386
2387 void test_all_ratios_kernel()
2388 {
2389 int N = 128;
2390 float *A, *A_d, *Ainv, *Ainv_d, *ratio, *ratio_d;
2391 cudaMalloc((void**)&A_d, N * N * sizeof(float));
2392 cudaMalloc((void**)&Ainv_d, N * N * sizeof(float));
2393 cudaMalloc((void**)&ratio_d, 1 * N * sizeof(float));
2394 A = (float*)malloc(N * N * sizeof(float));
2395 Ainv = (float*)malloc(N * N * sizeof(float));
2396 ratio = (float*)malloc(1 * N * sizeof(float));
2397 // float ratio2[N];
2398 for (int i = 0; i < N; i++)
2399 for (int j = 0; j < N; j++)
2400 {
2401 A[i * N + j] = 1.0f + drand48();
2402 Ainv[i * N + j] = 1.0f + drand48();
2403 }
2404 cudaMemcpyAsync(A_d, A, N * N * sizeof(float), cudaMemcpyHostToDevice);
2405 cudaMemcpyAsync(Ainv_d, Ainv, N * N * sizeof(float), cudaMemcpyHostToDevice);
2406 float **A_list, **A_list_d, **Ainv_list, **Ainv_list_d, **ratio_list, **ratio_list_d;
2407 int numMats = 2000;
2408 cudaMalloc((void**)&A_list_d, numMats * sizeof(float*));
2409 cudaMalloc((void**)&Ainv_list_d, numMats * sizeof(float*));
2410 cudaMalloc((void**)&ratio_list_d, numMats * sizeof(float*));
2411 A_list = (float**)malloc(numMats * sizeof(float*));
2412 Ainv_list = (float**)malloc(numMats * sizeof(float*));
2413 ratio_list = (float**)malloc(numMats * sizeof(float*));
2414 for (int i = 0; i < numMats; i++)
2415 {
2416 A_list[i] = A_d;
2417 Ainv_list[i] = Ainv_d;
2418 ratio_list[i] = ratio_d;
2419 }
2420 cudaMemcpyAsync(A_list_d, A_list, numMats * sizeof(float*), cudaMemcpyHostToDevice);
2421 cudaMemcpyAsync(Ainv_list_d, Ainv_list, numMats * sizeof(float*), cudaMemcpyHostToDevice);
2422 cudaMemcpyAsync(ratio_list_d, ratio_list, numMats * sizeof(float*), cudaMemcpyHostToDevice);
2423 clock_t start = clock();
2424 for (int i = 0; i < 1000; i++)
2425 calc_all_ratios(Ainv_list_d, A_list_d, ratio_list_d, N, N, numMats);
2426 clock_t end = clock();
2427 double time = (double)(end - start) / (double)CLOCKS_PER_SEC;
2428 fprintf(stderr, "start = %ld\n", start);
2429 fprintf(stderr, "end = %ld\n", end);
2430 double rate = 1000.0 / time;
2431 fprintf(stderr, "Rate = %1.2f generations per second.\n", rate);
2432 cudaMemcpy(ratio, ratio_d, N * sizeof(float), cudaMemcpyDeviceToHost);
2433 // for (int i=0; i<N; i++) {
2434 // ratio2[i] = 0.0f;
2435 // for (int j=0; j<N; j++)
2436 // ratio2[i] += A[i*N+j]*Ainv[j*N+i];
2437 // fprintf (stderr, "%3d %10.6f %10.6f\n", i, ratio2[i], ratio[i]);
2438 // }
2439 }
2440
2441
2442 void test_all_grad_lapl_kernel()
2443 {
2444   const int N = 128;
2445 float *A, *A_d, *Ainv, *Ainv_d, *ratio, *ratio_d;
2446 cudaMalloc((void**)&A_d, 4 * N * N * sizeof(float));
2447 cudaMalloc((void**)&Ainv_d, N * N * sizeof(float));
2448 cudaMalloc((void**)&ratio_d, 4 * N * sizeof(float));
2449 A = (float*)malloc(4 * N * N * sizeof(float));
2450 Ainv = (float*)malloc(1 * N * N * sizeof(float));
2451 ratio = (float*)malloc(4 * N * sizeof(float));
2452 float ratio2[4 * N];
2453 for (int i = 0; i < N; i++)
2454 for (int j = 0; j < N; j++)
2455 {
2456 Ainv[i * N + j] = 1.0f + drand48();
2457 for (int k = 0; k < 4; k++)
2458 A[4 * (i * N + j) + k] = 1.0f + drand48();
2459 }
2460 cudaMemcpyAsync(A_d, A, 4 * N * N * sizeof(float), cudaMemcpyHostToDevice);
2461 cudaMemcpyAsync(Ainv_d, Ainv, 1 * N * N * sizeof(float), cudaMemcpyHostToDevice);
2462 float **A_list, **A_list_d, **Ainv_list, **Ainv_list_d, **ratio_list, **ratio_list_d;
2463 int numMats = 2000;
2464 cudaMalloc((void**)&A_list_d, numMats * sizeof(float*));
2465 cudaMalloc((void**)&Ainv_list_d, numMats * sizeof(float*));
2466 cudaMalloc((void**)&ratio_list_d, numMats * sizeof(float*));
2467 A_list = (float**)malloc(numMats * sizeof(float*));
2468 Ainv_list = (float**)malloc(numMats * sizeof(float*));
2469 ratio_list = (float**)malloc(numMats * sizeof(float*));
2470 for (int i = 0; i < numMats; i++)
2471 {
2472 A_list[i] = A_d;
2473 Ainv_list[i] = Ainv_d;
2474 ratio_list[i] = ratio_d;
2475 }
2476 cudaMemcpyAsync(A_list_d, A_list, numMats * sizeof(float*), cudaMemcpyHostToDevice);
2477 cudaMemcpyAsync(Ainv_list_d, Ainv_list, numMats * sizeof(float*), cudaMemcpyHostToDevice);
2478 cudaMemcpyAsync(ratio_list_d, ratio_list, numMats * sizeof(float*), cudaMemcpyHostToDevice);
2479 struct timeval tstart, tend;
2480 gettimeofday(&tstart, NULL);
2481 for (int i = 0; i < 1; i++)
2482 calc_grad_lapl(Ainv_list_d, A_list_d, ratio_list_d, N, N, numMats);
2483 cudaMemcpy(ratio, ratio_d, 4 * N * sizeof(float), cudaMemcpyDeviceToHost);
2484 gettimeofday(&tend, NULL);
2485 double start = (double)tstart.tv_sec + 1.0e-6 * (double)tstart.tv_usec;
2486 double end = (double)tend.tv_sec + 1.0e-6 * (double)tend.tv_usec;
2487 fprintf(stderr, "start = %f\n", start);
2488 fprintf(stderr, "end = %f\n", end);
2489 double rate = 100.0 / (end - start);
2490 fprintf(stderr, "Rate = %1.2f generations per second.\n", rate);
2491 for (int i = 0; i < N; i++)
2492 {
2493 for (int k = 0; k < 4; k++)
2494 ratio2[4 * i + k] = 0.0f;
2495 for (int j = 0; j < N; j++)
2496 for (int k = 0; k < 4; k++)
2497 ratio2[4 * i + k] += A[(4 * i + k) * N + j] * Ainv[j * N + i];
2498 for (int k = 0; k < 4; k++)
2499 fprintf(stderr, "%3d %10.6f %10.6f\n", 4 * i + k, ratio2[4 * i + k], ratio[4 * i + k]);
2500 }
2501 }
2502
2503
2504 template<typename T>
2505 __global__ void woodbury_update_16(T** Ainv_trans, T** delta, T** Ainv_delta, int N, int rowstride)
2506 {
2507 T *myAinv, *mydelta, *myAinv_delta;
2508 int tid = threadIdx.x;
2509 myAinv = Ainv_trans[blockIdx.y];
2510 myAinv_delta = Ainv_delta[blockIdx.y];
2511 mydelta = delta[blockIdx.y];
2512 int first_row = blockIdx.x * 16;
2513 __shared__ T Ainv_s[16][17], delta_s[2][17], Ainv_delta_s[16][17];
2514 int nb = (N + 15) / 16;
2515 for (int row = 0; row < 16; row++)
2516 if (tid < 16)
2517 Ainv_delta_s[row][tid] = 0.0f;
2518 __syncthreads();
2519 int col = tid & 15;
2520 for (int block = 0; block < nb; block++)
2521 {
2522 int nend = N - block * 16;
2523 int c = block * 16 + tid;
2524 if (tid < 16)
2525 for (int row = 0; row < 16; row++)
2526 {
2527 Ainv_s[row][col] = myAinv[(first_row + row) * rowstride + c];
2528 }
2529 __syncthreads();
2530 for (int irow = 0; irow < 8; irow++)
2531 {
2532 int odd = tid > 15;
2533 int row = 2 * irow + odd;
2534 delta_s[odd][tid] = mydelta[row * rowstride + c];
2535 if (row + first_row < N && col < nend)
2536 for (int k = 0; k < 16; k++)
2537 Ainv_delta_s[row][col] += Ainv_s[col][k] * delta_s[odd][k];
2538 }
2539 __syncthreads();
2540 }
2541 int mycol = blockIdx.x * 16 + tid;
2542 if (tid < 16 && mycol < N)
2543 for (int row = 0; row < 16; row++)
2544 myAinv_delta[row * rowstride + mycol] = Ainv_delta_s[row][tid];
2545 }
2546
2547
2548 // Requires 64 threads per block
2549 template<typename T>
2550 __device__ inline void block_inverse_16(T A[16][17])
2551 {
2552 int tid = threadIdx.x;
2553 __shared__ T Acolk[16];
2554 // if (blockIdx.y == 0 && tid == 0) {
2555 // printf ("Ablock:\n");
2556 // for (int i=0; i<16; i++) {
2557 // for (int j=0; j<16; j++)
2558 // printf ("%14.8e ", A[i][j]);
2559 // printf ("\n");
2560 // }
2561 // }
2562 for (int k = 0; k < 16; k++)
2563 {
2564 T pivotInv = 1.0f / A[k][k];
2565 if (tid < 16)
2566 {
2567 T tmp = (tid == k) ? 0.0f : -pivotInv * A[tid][k];
2568 A[tid][k] = tmp;
2569 Acolk[tid] = tmp;
2570 }
2571 __syncthreads();
2572 int row = tid >> 4;
2573 int col = tid & 0x0f;
2574 T Arowk = A[k][col];
2575 A[row + 0][col] += Arowk * Acolk[row + 0];
2576 A[row + 4][col] += Arowk * Acolk[row + 4];
2577 A[row + 8][col] += Arowk * Acolk[row + 8];
2578 A[row + 12][col] += Arowk * Acolk[row + 12];
2579 __syncthreads();
2580 if (tid < 16)
2581 {
2582 if (tid == k)
2583 A[k][tid] = pivotInv;
2584 else
2585 A[k][tid] *= pivotInv;
2586 }
2587 __syncthreads();
2588 }
2589 // if (blockIdx.y == 0 && tid == 0) {
2590 // printf ("Ainvblock:\n");
2591 // for (int i=0; i<16; i++) {
2592 // for (int j=0; j<16; j++)
2593 // printf ("%14.8e ", A[i][j]);
2594 // printf ("\n");
2595 // }
2596 // }
2597 }
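// Host-side sketch of the in-place Gauss-Jordan inversion (no pivoting) that
// block_inverse_16 performs on a 16x16 block (illustrative only; the helper
// name is hypothetical).  The 17-column padding in the device version only
// avoids shared-memory bank conflicts and does not change the arithmetic.
static inline void block_inverse_16_cpu_reference(float A[16][16])
{
  for (int k = 0; k < 16; k++)
  {
    const float pivotInv = 1.0f / A[k][k]; // no pivot search, exactly as in the kernel
    float colk[16];
    for (int i = 0; i < 16; i++)
    {
      colk[i] = (i == k) ? 0.0f : -pivotInv * A[i][k];
      A[i][k] = colk[i];
    }
    // Rank-1 update of the whole block; row k and column k are left unchanged
    // here because colk[k] and A[k][k] are both zero at this point.
    for (int i = 0; i < 16; i++)
      for (int j = 0; j < 16; j++)
        A[i][j] += A[k][j] * colk[i];
    // Finally scale the pivot row and place 1/pivot on the diagonal.
    for (int j = 0; j < 16; j++)
      A[k][j] = (j == k) ? pivotInv : A[k][j] * pivotInv;
  }
}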
2598
2599
2600 // This routine performs the first half of a Woodbury formula update
2601 // for updating 16 rows of the A matrix at a time. It assumes that
2602 // the inverse matrix is stored in transposed form. This kernel
2603 // computes transpose(Ainv) * delta, where delta is a 16xN matrix of the
2604 // changes in A. kblock indicates which block of 16 rows has been
2605 // changed. Each blockIdx.x computes a different 16x16 block for the
2606 // product.
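// For reference, the update implemented by the two kernels below follows the
// standard Woodbury identity (stated here with illustrative symbols, not taken
// from the kernels themselves).  With E the N x 16 matrix of unit columns that
// selects the changed rows and delta the 16 x N matrix of row changes,
//   (A + E*delta)^-1 = A^-1 - (A^-1*E) * (I_16 + delta*A^-1*E)^-1 * (delta*A^-1).
// woodbury_update_16a forms the Ainv_delta product (delta*A^-1 in this
// notation) and inverts the 16 x 16 block (I_16 + delta*A^-1*E);
// woodbury_update_16b then applies the resulting rank-16 correction to A^-1.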
2607 template<typename T>
2608 __global__ void woodbury_update_16a(T** Ainv_trans,
2609 T** delta,
2610 T** Ainv_delta,
2611 T** inv_block,
2612 int N,
2613 int rowstride,
2614 int kblock)
2615 {
2616 T *myAinv, *mydelta, *myAinv_delta, *myinvblock;
2617 int tid = threadIdx.x;
2618 myAinv = Ainv_trans[blockIdx.y];
2619 myAinv_delta = Ainv_delta[blockIdx.y];
2620 mydelta = delta[blockIdx.y];
2621 myinvblock = inv_block[blockIdx.y];
2622 int first_row = blockIdx.x * 32;
2623 __shared__ T Ainv1[16][17], Ainv2[16][17], delta_s[4][17], Ainv_delta1[16][17], Ainv_delta2[16][17];
2624 int nb = (N + 15) / 16;
2625 int row = tid >> 4;
2626 int col = tid & 0x0f;
2627 Ainv_delta1[row + 0][col] = Ainv_delta2[row + 0][col] = 0.0f;
2628 Ainv_delta1[row + 4][col] = Ainv_delta2[row + 4][col] = 0.0f;
2629 Ainv_delta1[row + 8][col] = Ainv_delta2[row + 8][col] = 0.0f;
2630 Ainv_delta1[row + 12][col] = Ainv_delta2[row + 12][col] = 0.0f;
2631 __syncthreads();
2632 for (int block = 0; block < nb; block++)
2633 {
2634 int c = block * 16 + col;
2635 int row = tid >> 4;
2636 for (int irow = 0; irow < 4; irow++, row += 4)
2637 {
2638 Ainv1[row][col] = myAinv[(first_row + row) * rowstride + c];
2639 Ainv2[row][col] = myAinv[(first_row + row + 16) * rowstride + c];
2640 }
2641 __syncthreads();
2642 row = tid >> 4;
2643 int row2 = row;
2644 for (int irow = 0; irow < 4; irow++, row += 4)
2645 {
2646 delta_s[row2][col] = mydelta[row * rowstride + c];
2647 T mysum1 = Ainv_delta1[row][col];
2648 T mysum2 = Ainv_delta2[row][col];
2649 if (row + first_row < N && c < N)
2650 for (int k = 0; k < 16; k++)
2651 {
2652 mysum1 += Ainv1[col][k] * delta_s[row2][k];
2653 mysum2 += Ainv2[col][k] * delta_s[row2][k];
2654 }
2655 Ainv_delta1[row][col] = mysum1;
2656 Ainv_delta2[row][col] = mysum2;
2657 }
2658 __syncthreads();
2659 }
2660 int mycol = blockIdx.x * 32 + col;
2661 row = tid >> 4;
2662 if (mycol < N)
2663 {
2664 for (int irow = 0; irow < 4; irow++, row += 4)
2665 {
2666 myAinv_delta[row * rowstride + mycol] = Ainv_delta1[row][col];
2667 myAinv_delta[row * rowstride + mycol + 16] = Ainv_delta2[row][col];
2668 }
2669 }
2670 __syncthreads();
2671 row = tid >> 4;
2672 col = tid & 0x0f;
2673 if (2 * blockIdx.x + 1 == kblock)
2674 {
2675 if (tid < 16)
2676 Ainv_delta2[tid][tid] += 1.0f;
2677 __syncthreads();
2678 block_inverse_16<T>(Ainv_delta2);
2679 for (int irow = 0; irow < 4; irow++, row += 4)
2680 myinvblock[row * 16 + col] = Ainv_delta2[row][col];
2681 }
2682 if (2 * blockIdx.x == kblock)
2683 {
2684 if (tid < 16)
2685 Ainv_delta1[tid][tid] += 1.0f;
2686 __syncthreads();
2687 block_inverse_16<T>(Ainv_delta1);
2688 for (int irow = 0; irow < 4; irow++, row += 4)
2689 myinvblock[row * 16 + col] = Ainv_delta1[row][col];
2690 }
2691 }
2692
2693
2694 template<typename T>
2695 __global__ void woodbury_update_16b(T** Ainv_trans,
2696 T** delta,
2697 T** Ainv_delta,
2698 T** inv_block,
2699 int N,
2700 int rowstride,
2701 int kblock)
2702 {
2703 int tid = threadIdx.x;
2704 __shared__ T B1[16][17], B2[16][17], B3[16][17], B4[16][17];
2705 __shared__ T *myAinv, *myAinv_delta, *myinv_block;
2706 if (tid == 0)
2707 {
2708 myAinv = Ainv_trans[blockIdx.y];
2709 myAinv_delta = Ainv_delta[blockIdx.y];
2710 myinv_block = inv_block[blockIdx.y];
2711 }
2712 __syncthreads();
2713 int row = tid >> 4;
2714 int col = tid & 0x0f;
2715 int c = blockIdx.x * 32 + col;
2716 for (int i = 0; i < 4; i++, row += 4)
2717 {
2718 B1[row][col] = myinv_block[16 * row + col];
2719 B2[row][col] = myAinv[(16 * kblock + row) * rowstride + c];
2720 }
2721 __syncthreads();
2722 row = tid >> 4;
2723 // Now, multiply Ainv block by inv_block
2724 for (int i = 0; i < 4; i++, row += 4)
2725 {
2726 T mysum = 0.0f;
2727 for (int j = 0; j < 16; j++)
2728 mysum += B2[j][row] * B1[j][col];
2729 B3[row][col] = mysum;
2730 }
2731 __syncthreads();
2732 row = tid >> 4;
2733 for (int i = 0; i < 4; i++, row += 4)
2734 B2[row][col] = myAinv[(16 * kblock + row) * rowstride + c + 16];
2735 __syncthreads();
2736 row = tid >> 4;
2737 // Now, multiply Ainv block by inv_block
2738 for (int i = 0; i < 4; i++, row += 4)
2739 {
2740 T mysum = 0.0f;
2741 for (int j = 0; j < 16; j++)
2742 mysum += B2[j][row] * B1[j][col];
2743 B4[row][col] = mysum;
2744 }
2745 // Now do outer product
2746 int nb = (N + 15) >> 4;
2747 for (int block = 0; block < nb; block++)
2748 {
2749 row = tid >> 4;
2750 col = tid & 0x0f;
2751 for (int i = 0; i < 4; i++, row += 4)
2752 B1[row][col] = myAinv_delta[row * rowstride + col + 16 * block];
2753 __syncthreads();
2754 row = tid >> 4;
2755 col = tid & 0x0f;
2756 for (int irow = 0; irow < 4; irow++, row += 4)
2757 {
2758 T mysum3 = myAinv[(16 * block + row) * rowstride + c];
2759 T mysum4 = myAinv[(16 * block + row) * rowstride + c + 16];
2760 for (int k = 0; k < 16; k++)
2761 {
2762 mysum3 -= B3[col][k] * B1[k][row];
2763 mysum4 -= B4[col][k] * B1[k][row];
2764 }
2765 myAinv[(16 * block + row) * rowstride + c] = mysum3;
2766 myAinv[(16 * block + row) * rowstride + c + 16] = mysum4;
2767 }
2768 }
2769 }
2770
2771
2772 template<typename T>
2773 __global__ void woodbury_update_32(T** Ainv_trans, T** delta, T** Ainv_delta, int N, int rowstride)
2774 {
2775 T *myAinv, *mydelta, *myAinv_delta;
2776 int tid = threadIdx.x;
2777 myAinv = Ainv_trans[blockIdx.y];
2778 myAinv_delta = Ainv_delta[blockIdx.y];
2779 mydelta = delta[blockIdx.y];
2780 int first_row = blockIdx.x * 32;
2781 __shared__ T Ainv_s[32][33], delta_s[32][33], Ainv_delta_s[32][33];
2782 int nb = (N + 31) / 32;
2783 for (int row = 0; row < 32; row++)
2784 if (tid < 32)
2785 Ainv_delta_s[row][tid] = 0.0f;
2786 __syncthreads();
2787 int col = tid;
2788 for (int block = 0; block < nb; block++)
2789 {
2790 int nend = N - block * 32;
2791 int c = block * 32 + tid;
2792 for (int row = 0; row < 32; row++)
2793 {
2794 Ainv_s[row][tid] = myAinv[(first_row + row) * rowstride + c];
2795 delta_s[row][tid] = mydelta[row * rowstride + c];
2796 }
2797 __syncthreads();
2798 for (int row = 0; row < 32; row++)
2799 {
2800 if (row + first_row < N && col < nend)
2801 for (int k = 0; k < 32; k++)
2802 Ainv_delta_s[row][col] += Ainv_s[row][k] * delta_s[col][k];
2803 }
2804 __syncthreads();
2805 }
2806 int mycol = blockIdx.x * 32 + tid;
2807 if (mycol < N)
2808 for (int row = 0; row < 32; row++)
2809 myAinv_delta[row * rowstride + mycol] = Ainv_delta_s[row][tid];
2810 }
2811
2812
2813 #ifdef CUDA_TEST_MAIN
2814
2815 #include <omp.h>
2816 #include "lapacke.h"
2817
2818 #define MAT_SIZE 256
2819 #define NUM_MATS 512
2820
2821 // Compute inverse of n*n matrix A via LAPACKE
2822 void MatrixInverse(double* A, int n)
2823 {
2824   lapack_int* ipiv_array = new lapack_int[n];
2825 lapack_int ierror;
2826 ierror = LAPACKE_dgetrf(LAPACK_COL_MAJOR, n, n, A, n, ipiv_array);
2827 if (ierror != 0)
2828 {
2829 fprintf(stderr, "dgetrf failure. Error %i\n", ierror);
2830 exit(1);
2831 }
2832 ierror = LAPACKE_dgetri(LAPACK_COL_MAJOR, n, A, n, ipiv_array);
2833 if (ierror != 0)
2834 {
2835 fprintf(stderr, "dgetri failure. Error %i\n", ierror);
2836 exit(1);
2837 }
2838 delete[] ipiv_array;
2839 return;
2840 }
2841
2842 void test_update()
2843 {
2844 int const N = MAT_SIZE;
2845 double *A, *Ainv;
2846 int numMats = NUM_MATS;
2847 float *A_h, *Ainv_h, *u_h;
2848 float *Ainv_d, *Ainv_u_d, *Ainv_colk_d, *u_d;
2849 A = (double*)malloc(N * N * sizeof(double));
2850 Ainv = (double*)malloc(N * N * sizeof(double));
2851 Ainv_h = (float*)malloc(N * N * sizeof(float));
2852 A_h = (float*)malloc(N * N * sizeof(float));
2853 u_h = (float*)malloc(N * sizeof(float));
2854 cudaMalloc((void**)&Ainv_d, N * N * sizeof(float));
2855 cudaMalloc((void**)&Ainv_d, N * N * sizeof(float));
2856 cudaMalloc((void**)&u_d, N * sizeof(float));
2857 cudaMalloc((void**)&Ainv_u_d, N * sizeof(float));
2858 cudaMalloc((void**)&Ainv_colk_d, N * sizeof(float));
2859 float **AinvList, **Ainv_uList, **AList, **Ainv_colkList, **uList;
2860 AList = (float**)malloc(NUM_MATS * sizeof(float*));
2861 AinvList = (float**)malloc(NUM_MATS * sizeof(float*));
2862 Ainv_uList = (float**)malloc(NUM_MATS * sizeof(float*));
2863 Ainv_colkList = (float**)malloc(NUM_MATS * sizeof(float*));
2864 uList = (float**)malloc(NUM_MATS * sizeof(float*));
2865 float **AList_d, **AinvList_d, **Ainv_uList_d, **Ainv_colkList_d, **uList_d;
2866 cudaMalloc((void**)&AList_d, numMats * sizeof(float*));
2867 cudaMalloc((void**)&AinvList_d, numMats * sizeof(float*));
2868 cudaMalloc((void**)&Ainv_uList_d, numMats * sizeof(float*));
2869 cudaMalloc((void**)&Ainv_colkList_d, numMats * sizeof(float*));
2870 cudaMalloc((void**)&uList_d, numMats * sizeof(float*));
2871 for (int mat = 0; mat < numMats; mat++)
2872 {
2873 cudaMalloc((void**)&(AList[mat]), N * N * sizeof(float) + 1000);
2874 cudaMalloc((void**)&(AinvList[mat]), N * N * sizeof(float) + 1000);
2875 cudaMalloc((void**)&(Ainv_uList[mat]), N * sizeof(float) + 1000);
2876 cudaMalloc((void**)&(Ainv_colkList[mat]), N * sizeof(float) + 1000);
2877 cudaMalloc((void**)&(uList[mat]), N * sizeof(float) + 1000);
2878 }
2879 cudaMemcpyAsync(AList_d, AList, numMats * sizeof(float*), cudaMemcpyHostToDevice);
2880 cudaMemcpyAsync(AinvList_d, AinvList, numMats * sizeof(float*), cudaMemcpyHostToDevice);
2881 cudaMemcpyAsync(Ainv_uList_d, Ainv_uList, numMats * sizeof(float*), cudaMemcpyHostToDevice);
2882 cudaMemcpyAsync(Ainv_colkList_d, Ainv_colkList, numMats * sizeof(float*), cudaMemcpyHostToDevice);
2883 cudaMemcpyAsync(uList_d, uList, numMats * sizeof(float*), cudaMemcpyHostToDevice);
2884 srand48((long int)12341313);
2885 int row = 0;
2886 for (int mat = 0; mat < numMats; mat++)
2887 {
2888 if (mat == 0)
2889 {
2890 for (int i = 0; i < N; i++)
2891 {
2892 u_h[i] = drand48();
2893 for (int j = 0; j < N; j++)
2894 A[i * N + j] = Ainv[i * N + j] = A_h[i * N + j] = drand48();
2895 }
2896 // for (int i=0; i<N; i++)
2897 // u_h[i] = A_h[row*N+i];
2898 MatrixInverse(Ainv, N);
2899 for (int i = 0; i < N; i++)
2900 for (int j = 0; j < N; j++)
2901 Ainv_h[i * N + j] = (float)Ainv[i * N + j];
2902 }
2903 // for (int i=0; i<N; i++)
2904 // u_h[i] = A_h[row*N+i];
2905 cudaMemcpyAsync(AList[mat], A_h, N * N * sizeof(float), cudaMemcpyHostToDevice);
2906 cudaMemcpyAsync(AinvList[mat], Ainv_h, N * N * sizeof(float), cudaMemcpyHostToDevice);
2907 cudaMemcpyAsync(uList[mat], u_h, N * sizeof(float), cudaMemcpyHostToDevice);
2908 }
2909 dim3 dimBlock2(64);
2910 dim3 dimGrid2((N + 63) / 64, NUM_MATS);
2911 double start = omp_get_wtime();
2912 for (int i = 0; i < 100; i++)
2913 {
2914 update_inverse_cuda1<float, 64>
2915 <<<dimGrid2, dimBlock2>>>(AList_d, AinvList_d, uList_d, Ainv_uList_d, Ainv_colkList_d, N, N, row);
2916 update_inverse_cuda2<float, 64>
2917 <<<dimGrid2, dimBlock2>>>(AList_d, AinvList_d, uList_d, Ainv_uList_d, Ainv_colkList_d, N, N, row);
2918 }
2919 cudaDeviceSynchronize();
2920 double end = omp_get_wtime();
2921 fprintf(stderr, "Rate = %12.8f updates per second.\n", (double)(100 * NUM_MATS) / (end - start));
2922 cudaMemcpy(Ainv_h, AinvList[0], N * N * sizeof(float), cudaMemcpyDeviceToHost);
2923 /* for (int j=0; j<16; j++)
2924 for (int i=0; i<N; i++)
2925 A[(row+j)*N+i] += delta_h[j*N+i];
2926 for (int i=0; i<N; i++)
2927 for (int j=0; j<N; j++) {
2928 double ident = 0.0;
2929 for (int k=0; k<N; k++)
2930 ident += Ainv_h[i*N+k]*A[k*N+j];
2931 if ((i==j && fabs(ident - 1.0) > 1.0e-4) ||
2932 (i!=j && fabs(ident) > 1.0e-4))
2933 fprintf (stderr, "Error in matrix inverse, (%d, %d) = %1.8f\n", i, j, ident);
2934 }*/
2935 fprintf(stderr, "Finished.\n");
2936 }
2937
2938 void test_update_transpose()
2939 {
2940 const int N = MAT_SIZE;
2941 double *A, *Ainv;
2942 int numMats = NUM_MATS;
2943 float *A_h, *Ainv_h, *u_h;
2944 float *Ainv_d, *Ainv_u_d, *Ainv_colk_d, *u_d;
2945 A = (double*)malloc(N * N * sizeof(double));
2946 Ainv = (double*)malloc(N * N * sizeof(double));
2947 Ainv_h = (float*)malloc(N * N * sizeof(float));
2948 A_h = (float*)malloc(N * N * sizeof(float));
2949 u_h = (float*)malloc(N * sizeof(float));
2950 cudaMalloc((void**)&Ainv_d, N * N * sizeof(float));
2951 cudaMalloc((void**)&Ainv_d, N * N * sizeof(float));
2952 cudaMalloc((void**)&u_d, N * sizeof(float));
2953 cudaMalloc((void**)&Ainv_u_d, N * sizeof(float));
2954 cudaMalloc((void**)&Ainv_colk_d, N * sizeof(float));
2955 float **AinvList, **Ainv_uList, **AList, **Ainv_colkList, **uList;
2956 AList = (float**)malloc(NUM_MATS * sizeof(float*));
2957 AinvList = (float**)malloc(NUM_MATS * sizeof(float*));
2958 Ainv_uList = (float**)malloc(NUM_MATS * sizeof(float*));
2959 Ainv_colkList = (float**)malloc(NUM_MATS * sizeof(float*));
2960 uList = (float**)malloc(NUM_MATS * sizeof(float*));
2961 float **AList_d, **AinvList_d, **Ainv_uList_d, **Ainv_colkList_d, **uList_d;
2962 cudaMalloc((void**)&AList_d, numMats * sizeof(float*));
2963 cudaMalloc((void**)&AinvList_d, numMats * sizeof(float*));
2964 cudaMalloc((void**)&Ainv_uList_d, numMats * sizeof(float*));
2965 cudaMalloc((void**)&Ainv_colkList_d, numMats * sizeof(float*));
2966 cudaMalloc((void**)&uList_d, numMats * sizeof(float*));
2967 for (int mat = 0; mat < numMats; mat++)
2968 {
2969 cudaMalloc((void**)&(AList[mat]), N * N * sizeof(float));
2970 cudaMalloc((void**)&(AinvList[mat]), N * N * sizeof(float));
2971 cudaMalloc((void**)&(Ainv_uList[mat]), N * sizeof(float));
2972 cudaMalloc((void**)&(Ainv_colkList[mat]), N * sizeof(float));
2973 cudaMalloc((void**)&(uList[mat]), N * sizeof(float));
2974 }
2975 cudaMemcpyAsync(AList_d, AList, numMats * sizeof(float*), cudaMemcpyHostToDevice);
2976 cudaMemcpyAsync(AinvList_d, AinvList, numMats * sizeof(float*), cudaMemcpyHostToDevice);
2977 cudaMemcpyAsync(Ainv_uList_d, Ainv_uList, numMats * sizeof(float*), cudaMemcpyHostToDevice);
2978 cudaMemcpyAsync(Ainv_colkList_d, Ainv_colkList, numMats * sizeof(float*), cudaMemcpyHostToDevice);
2979 cudaMemcpyAsync(uList_d, uList, numMats * sizeof(float*), cudaMemcpyHostToDevice);
2980 srand48((long int)12341313);
2981 int row = 1;
2982 for (int mat = 0; mat < numMats; mat++)
2983 {
2984 if (mat == 0)
2985 {
2986 for (int i = 0; i < N; i++)
2987 {
2988 for (int j = 0; j < N; j++)
2989 A[i * N + j] = Ainv[i * N + j] = A_h[i * N + j] = drand48();
2990 // u_h[i] = drand48();
2991 }
2992 for (int j = 0; j < N; j++)
2993 u_h[j] = drand48(); //A[N*row+j];
2994 MatrixInverse(Ainv, N);
2995 for (int i = 0; i < N; i++)
2996 for (int j = 0; j < N; j++)
2997 Ainv_h[j * N + i] = (float)Ainv[i * N + j];
2998 }
2999 // for (int i=0; i<N; i++)
3000 // u_h[i] = A_h[row*N+i];
3001 cudaMemcpyAsync(AList[mat], A_h, N * N * sizeof(float), cudaMemcpyHostToDevice);
3002 cudaMemcpyAsync(AinvList[mat], Ainv_h, N * N * sizeof(float), cudaMemcpyHostToDevice);
3003 cudaMemcpyAsync(uList[mat], u_h, N * sizeof(float), cudaMemcpyHostToDevice);
3004 }
3005 dim3 dimBlock(DET_BLOCK_SIZE);
3006 dim3 dimGrid(NUM_MATS);
3007 clock_t start = clock();
3008 for (int i = 0; i < 1000; i++)
3009 {
3010 update_inverse_transpose_cuda<float, DET_BLOCK_SIZE, N>
3011 <<<dimGrid, dimBlock>>>(AList_d, AinvList_d, uList_d, N, N, row);
3012 // update_inverse_transpose_cuda_2pass<float,DET_BLOCK_SIZE,N><<<dimGrid,dimBlock>>>
3013 // (AList_d, AinvList_d, uList_d, N, N, row);
3014 }
3015 cudaDeviceSynchronize();
3016 clock_t end = clock();
3017 fprintf(stderr, "Rate = %12.8f updates per second.\n",
3018 (double)(1000 * NUM_MATS) / ((double)(end - start) / (double)CLOCKS_PER_SEC));
3019 cudaMemcpy(Ainv_h, AinvList[1], N * N * sizeof(float), cudaMemcpyDeviceToHost);
3020 for (int i = 0; i < N; i++)
3021 A[row * N + i] = u_h[i];
3022 for (int i = 0; i < N; i++)
3023 for (int j = 0; j < N; j++)
3024 {
3025 double ident = 0.0;
3026 for (int k = 0; k < N; k++)
3027 ident += Ainv_h[k * N + i] * A[k * N + j];
3028 if ((i == j && fabs(ident - 1.0) > 1.0e-4) || (i != j && fabs(ident) > 1.0e-4))
3029 fprintf(stderr, "Error in matrix inverse, (%d, %d) = %1.8f\n", i, j, ident);
3030 }
3031 fprintf(stderr, "Finished.\n");
3032 }
3033
3034
3035 void test_woodbury()
3036 {
3037 int const N = MAT_SIZE;
3038   int const M = 16;
3039 int updateBlock = 3;
3040 double *A, *Ainv, *Anew, *Anew_inv;
3041 int numMats = NUM_MATS;
3042 float *A_h, *Ainv_h, *delta_h, *Ainv_delta_h, *Anew_h, *Anew_inv_h;
3043
3044 float *Ainv_d, *Ainv_delta_d, *Ainv_colk_d, *delta_d;
3045 A = (double*)malloc(N * N * sizeof(double));
3046 Anew = (double*)malloc(N * N * sizeof(double));
3047 Ainv = (double*)malloc(N * N * sizeof(double));
3048 Anew_inv = (double*)malloc(N * N * sizeof(double));
3049 Ainv_h = (float*)malloc(N * N * sizeof(float));
3050 A_h = (float*)malloc(N * N * sizeof(float));
3051 delta_h = (float*)malloc(N * M * sizeof(float));
3052 Ainv_delta_h = (float*)malloc(N * M * sizeof(float));
3053 Anew_h = (float*)malloc(N * N * sizeof(float));
3054 Anew_inv_h = (float*)malloc(N * N * sizeof(float));
3055 cudaMalloc((void**)&Ainv_d, N * N * sizeof(float));
3056 cudaMalloc((void**)&delta_d, N * M * sizeof(float));
3057 cudaMalloc((void**)&Ainv_delta_d, N * M * sizeof(float));
3058 cudaMalloc((void**)&Ainv_colk_d, N * sizeof(float));
3059 float **AinvList, **Ainv_deltaList, **AList, **Ainv_colkList, **deltaList, **invBlockList;
3060 AList = (float**)malloc(NUM_MATS * sizeof(float*));
3061 AinvList = (float**)malloc(NUM_MATS * sizeof(float*));
3062 Ainv_deltaList = (float**)malloc(NUM_MATS * sizeof(float*));
3063 Ainv_colkList = (float**)malloc(NUM_MATS * sizeof(float*));
3064 deltaList = (float**)malloc(NUM_MATS * sizeof(float*));
3065 invBlockList = (float**)malloc(NUM_MATS * sizeof(float*));
3066 float **AList_d, **AinvList_d, **Ainv_deltaList_d, **Ainv_colkList_d, **deltaList_d, **invBlockList_d;
3067 cudaMalloc((void**)&AinvList_d, numMats * sizeof(float*));
3068 cudaMalloc((void**)&Ainv_deltaList_d, numMats * sizeof(float*));
3069 cudaMalloc((void**)&deltaList_d, numMats * sizeof(float*));
3070 cudaMalloc((void**)&invBlockList_d, numMats * sizeof(float*));
3071 for (int mat = 0; mat < numMats; mat++)
3072 {
3073 // cudaMalloc((void**)&(AList[mat]) , N*N*sizeof(float)+1000);
3074 cudaMalloc((void**)&(AinvList[mat]), N * N * sizeof(float) + 1000);
3075 cudaMalloc((void**)&(Ainv_deltaList[mat]), N * M * sizeof(float) + 1000);
3076 cudaMalloc((void**)&(deltaList[mat]), N * M * sizeof(float) + 1000);
3077 cudaMalloc((void**)&(invBlockList[mat]), M * M * sizeof(float) + 1000);
3078 cudaError_t err = cudaGetLastError();
3079 if (err != cudaSuccess)
3080 {
3081 fprintf(stderr, "CUDA error in test_woodbury malloc:\n %s\n", cudaGetErrorString(err));
3082 abort();
3083 }
3084 }
3085 cudaMemcpyAsync(AinvList_d, AinvList, numMats * sizeof(float*), cudaMemcpyHostToDevice);
3086 cudaMemcpyAsync(Ainv_deltaList_d, Ainv_deltaList, numMats * sizeof(float*), cudaMemcpyHostToDevice);
3087 cudaMemcpyAsync(deltaList_d, deltaList, numMats * sizeof(float*), cudaMemcpyHostToDevice);
3088 cudaMemcpyAsync(invBlockList_d, invBlockList, numMats * sizeof(float*), cudaMemcpyHostToDevice);
3089 srand48((long int)12341313);
3090 int row = 0;
3091 for (int mat = 0; mat < numMats; mat++)
3092 {
3093 if (mat == 0)
3094 {
3095 for (int i = 0; i < N; i++)
3096 {
3097 for (int j = 0; j < N; j++)
3098 {
3099 A[i * N + j] = Ainv[i * N + j] = A_h[i * N + j] = drand48();
3100 Anew[i * N + j] = Anew_inv[i * N + j] = A[i * N + j];
3101 }
3102 }
3103 for (int i = 0; i < 16; i++)
3104 for (int j = 0; j < N; j++)
3105 {
3106 delta_h[i * N + j] = drand48() - 0.5;
3107 Anew[(updateBlock * 16 + i) * N + j] = Anew_inv[(updateBlock * 16 + i) * N + j] =
3108 A[(updateBlock * 16 + i) * N + j] + delta_h[i * N + j];
3109 }
3110 // for (int i=0; i<N; i++)
3111 // delta_h[i] = A_h[row*N+i];
3112 MatrixInverse(Ainv, N);
3113 MatrixInverse(Anew_inv, N);
3114 for (int i = 0; i < N; i++)
3115 for (int j = 0; j < N; j++)
3116 {
3117 Ainv_h[i * N + j] = (float)Ainv[j * N + i];
3118 Anew_inv_h[i * N + j] = (float)Anew_inv[j * N + i];
3119 }
3120 }
3121 // for (int i=0; i<N; i++)
3122 // delta_h[i] = A_h[row*N+i];
3123 // cudaMemcpyAsync (AList[mat], A_h, N*N*sizeof(float),
3124 // cudaMemcpyHostToDevice);
3125 cudaMemcpyAsync(AinvList[mat], Ainv_h, N * N * sizeof(float), cudaMemcpyHostToDevice);
3126 cudaMemcpyAsync(deltaList[mat], delta_h, N * M * sizeof(float), cudaMemcpyHostToDevice);
3127 }
3128 cudaError_t err = cudaGetLastError();
3129 if (err != cudaSuccess)
3130 {
3131 fprintf(stderr, "CUDA error in test_woodbury memcopy's:\n %s\n", cudaGetErrorString(err));
3132 abort();
3133 }
3134 dim3 dimBlock2(64);
3135 dim3 dimGrida((N + 15) / 16, numMats);
3136 dim3 dimGridb((N + 31) / 32, numMats);
3137 //dim3 dimGrid2((N/32), numMats);
3138 double start = omp_get_wtime();
3139 for (int i = 0; i < 100; i++)
3140 {
3141 woodbury_update_16a<float>
3142 <<<dimGridb, dimBlock2>>>(AinvList_d, deltaList_d, Ainv_deltaList_d, invBlockList_d, N, N, updateBlock);
3143 woodbury_update_16b<float>
3144 <<<dimGridb, dimBlock2>>>(AinvList_d, deltaList_d, Ainv_deltaList_d, invBlockList_d, N, N, updateBlock);
3145 }
3146 cudaDeviceSynchronize();
3147 err = cudaGetLastError();
3148 if (err != cudaSuccess)
3149 {
3150 fprintf(stderr, "CUDA error in woodbury_update_16:\n %s\n", cudaGetErrorString(err));
3151 abort();
3152 }
3153 double end = omp_get_wtime();
3154 fprintf(stderr, "Rate = %12.8f updates per second.\n", (double)(100 * NUM_MATS) / (end - start));
3155 fprintf(stderr, "About to copy %ld back\n", N * M * sizeof(float));
3156 cudaMemcpy(Ainv_delta_h, Ainv_deltaList[0], N * M * sizeof(float), cudaMemcpyDeviceToHost);
3157 fprintf(stderr, "About to copy %ld back\n", N * N * sizeof(float));
3158 cudaMemcpy(Anew_inv_h, AinvList[0], N * N * sizeof(float), cudaMemcpyDeviceToHost);
3159 err = cudaGetLastError();
3160 if (err != cudaSuccess)
3161 {
3162 fprintf(stderr, "CUDA error in test_woodbury memcopy back:\n %s\n", cudaGetErrorString(err));
3163 abort();
3164 }
3165 fprintf(stderr, "Copied result back.\n");
3166 float Ainv_delta[N * M];
3167 for (int i = 0; i < N * M; i++)
3168 Ainv_delta[i] = 0.0;
3169 for (int i = 0; i < 16; i++)
3170 for (int j = 0; j < N; j++)
3171 for (int k = 0; k < N; k++)
3172 Ainv_delta[i * N + j] += Ainv_h[j * N + k] * delta_h[i * N + k];
3173 fprintf(stderr, "Ainv_delta_cpu = %1.8e\n", Ainv_delta[51]);
3174 fprintf(stderr, "Ainv_delta_gpu = %1.8e\n", Ainv_delta_h[51]);
3175 int i = 10;
3176 int j = 17;
3177 FILE* fcpu = fopen("Ainv_cpu.dat", "w");
3178 FILE* fgpu = fopen("Ainv_gpu.dat", "w");
3179 for (int i = 0; i < N; i++)
3180 {
3181 for (int j = 0; j < N; j++)
3182 {
3183 fprintf(fcpu, "%16.8e ", Anew_inv[i * N + j]);
3184 fprintf(fgpu, "%16.8e ", Anew_inv_h[j * N + i]);
3185 }
3186 fprintf(fcpu, "\n");
3187 fprintf(fgpu, "\n");
3188 }
3189 fprintf(stderr, "Anew_inv cpu = %1.8e\n", Anew_inv[i * N + j]);
3190 fprintf(stderr, "Anew_inv_gpu = %1.8e\n", Anew_inv_h[j * N + i]);
3191 // cudaMemcpy (Ainv_h, AinvList[0], N*N*sizeof(float),cudaMemcpyDeviceToHost);
3192 // for (int i=0; i<N; i++)
3193 // A[row*N+i] = delta_h[i];
3194 // for (int i=0; i<N; i++)
3195 // for (int j=0; j<N; j++) {
3196 // double ident = 0.0;
3197 // for (int k=0; k<N; k++)
3198 // ident += Ainv_h[i*N+k]*A[k*N+j];
3199 // if ((i==j && fabs(ident - 1.0) > 1.0e-4) ||
3200 // (i!=j && fabs(ident) > 1.0e-4))
3201 // fprintf (stderr, "Error in matrix inverse, (%d, %d) = %1.8f\n", i, j, ident);
3202 // }
3203 fprintf(stderr, "Finished.\n");
3204 }
3205
3206
3207 // Note: compile with:
3208 // nvcc -o determinant_update -DCUDA_TEST_MAIN -arch=sm_70 determinant_update.cu ../../../Platforms/CUDA_legacy/gpu_misc.cpp -I../../../Platforms -I../../../../../build_llvmdev_legacy_cuda_full_precision/src/ -lcublas -lopenblas -Xcompiler -fopenmp
3209 // The build_llvm.../src path on the line above should be the location of a config.h produced by QMCPACK cmake
3210 int main()
3211 {
3212 //test_all_ratios_kernel();
3213 // test_all_grad_lapl_kernel();
3214 test_update();
3215 // test_update_transpose();
3216 test_woodbury();
3217
3218 return 0;
3219 }
3220
3221 #endif
3222