//////////////////////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source License.
// See LICENSE file in top directory for details.
//
// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
//
// File developed by: Ken Esler, kpesler@gmail.com, University of Illinois at Urbana-Champaign
//                    Jeremy McMinnis, jmcminis@gmail.com, University of Illinois at Urbana-Champaign
//                    Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois at Urbana-Champaign
//                    Paul R. C. Kent, kentpr@ornl.gov, Oak Ridge National Laboratory
//                    Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//
// File created by: Ken Esler, kpesler@gmail.com, University of Illinois at Urbana-Champaign
//////////////////////////////////////////////////////////////////////////////////////


#include "determinant_update.h"
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/time.h>
#ifdef QMC_COMPLEX
#include <thrust/complex.h>
// Prior to CUDA 9.0, bulk::uninitialized_array was included with the Thrust library.
//#include <thrust/system/cuda/detail/bulk/uninitialized.hpp>
#include "uninitialized_array.hpp"
#endif
#include "CUDA_legacy/gpu_misc.h"

#define DET_BLOCK_SIZE 8

template<typename T, int BS>
__global__ void update_inverse_cuda1(updateJob* updateList, int N, int rowstride)
{
  __shared__ T *A, *Ainv, *u, *Ainv_delta, *Ainv_colk;
  __shared__ int k;
  if (threadIdx.x == 0)
  {
    updateJob job = updateList[blockIdx.y];
    A             = (T*)job.A;
    Ainv          = (T*)job.Ainv;
    u             = (T*)job.newRow;
    Ainv_delta    = (T*)job.AinvDelta;
    Ainv_colk     = (T*)job.AinvColk;
    k             = job.iat;
  }
  __syncthreads();
  // Store the product Ainv * u in shared memory
  T Ainv_delta_tid;
  __shared__ T Ainv_colk_shared[BS], delta[BS];
  Ainv_delta_tid = 0.0f;
  __syncthreads();
  int col       = blockIdx.x * BS + threadIdx.x;
  int numblocks = (N + BS - 1) / BS;
  int kBlock    = k / BS;
  // If the column I need to pull from Ainv is in this thread block
  // domain, do the following
  __syncthreads();
  for (int block = 0; block < numblocks; block++)
  {
    delta[threadIdx.x] = u[block * BS + threadIdx.x] - A[k * rowstride + block * BS + threadIdx.x];
    __syncthreads();
    int istop = min(BS, N - block * BS);
    for (int i = 0; i < istop; i++)
    {
      int row = block * BS + i;
      T a     = Ainv[row * rowstride + col];
      if (col == k)
        Ainv_colk_shared[i] = a;
      Ainv_delta_tid += a * delta[i];
      __syncthreads();
    }
    if (blockIdx.x == kBlock)
      if (block * BS + threadIdx.x < N)
        Ainv_colk[block * BS + threadIdx.x] = Ainv_colk_shared[threadIdx.x];
    __syncthreads();
  }
  // Write the data back to global memory
  if (col < N)
    Ainv_delta[col] = Ainv_delta_tid;
  __syncthreads();
}


template<typename T, int BS>
__global__ void update_inverse_cuda2(updateJob* updateList, int N, int rowstride)
{
  __shared__ T *A, *u, *Ainv, *Ainv_delta, *Ainv_colk;
  int tid = threadIdx.x;
  __shared__ int k;
  if (threadIdx.x == 0)
  {
    updateJob job = updateList[blockIdx.y];
    A             = (T*)job.A;
    u             = (T*)job.newRow;
    Ainv          = (T*)job.Ainv;
    Ainv_delta    = (T*)job.AinvDelta;
    Ainv_colk     = (T*)job.AinvColk;
    k             = job.iat;
  }
  __syncthreads();
  T Ainv_delta_tid;
  __shared__ T Ainv_colk_shared[BS];
  int col = blockIdx.x * BS + threadIdx.x;
  // Read the data back from global memory
  Ainv_delta_tid                = Ainv_delta[col];
  Ainv_colk_shared[threadIdx.x] = Ainv_colk[col];
  if (col < N)
    A[k * rowstride + col] = u[col];
  __syncthreads();
  __shared__ T prefact;
  if (threadIdx.x == 0)
    prefact = -1.0f / (1.0f + Ainv_delta[k]);
  int numblocks = N / BS + ((N % BS) ? 1 : 0);
  __syncthreads();
  for (int block = 0; block < numblocks; block++)
  {
    Ainv_colk_shared[tid] = prefact * Ainv_colk[block * BS + threadIdx.x];
    __syncthreads();
    T* Ainv_row = Ainv + block * BS * rowstride + col;
    int istop   = min(BS, N - block * BS);
    if (col < N)
      for (int i = 0; i < istop; i++, Ainv_row += rowstride)
        *Ainv_row += Ainv_delta_tid * Ainv_colk_shared[i];
    __syncthreads();
  }
}


void update_inverse_cuda(updateJob updateList[], float dummy, int N, int rowstride, int numWalkers)
{
  const int BS1 = 64;
  const int BS2 = 64;
  int NB1       = (N + BS1 - 1) / BS1;
  int NB2       = (N + BS2 - 1) / BS2;
  dim3 dimBlock1(BS1);
  dim3 dimGrid1(NB1, numWalkers);
  dim3 dimBlock2(BS2);
  dim3 dimGrid2(NB2, numWalkers);
  update_inverse_cuda1<float, BS1><<<dimGrid1, dimBlock1>>>(updateList, N, rowstride);
  update_inverse_cuda2<float, BS2><<<dimGrid2, dimBlock2>>>(updateList, N, rowstride);
}


void update_inverse_cuda(updateJob updateList[], double dummy, int N, int rowstride, int numWalkers)
{
  const int BS1 = 32;
  const int BS2 = 32;
  int NB1       = N / BS1 + ((N % BS1) ? 1 : 0);
  int NB2       = N / BS2 + ((N % BS2) ? 1 : 0);
  dim3 dimBlock1(BS1);
  dim3 dimGrid1(NB1, numWalkers);
  dim3 dimBlock2(BS2);
  dim3 dimGrid2(NB2, numWalkers);
  update_inverse_cuda1<double, BS1><<<dimGrid1, dimBlock1>>>(updateList, N, rowstride);
  update_inverse_cuda2<double, BS2><<<dimGrid2, dimBlock2>>>(updateList, N, rowstride);
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    fprintf(stderr, "CUDA error in update_inverse_cuda:\n  %s\n", cudaGetErrorString(err));
    abort();
  }
}

/* Update of Ainv after rank-1 update to A, using Sherman-Morrison algorithm,
   part 1. See also K. P. Esler, J. Kim, D. M. Ceperley, and L. Shulenburger:
   Accelerating Quantum Monte Carlo Simulations of Real Materials on GPU
   Clusters. Computing in Science & Engineering, Vol. 14, No. 1, Jan/Feb 2012,
   pp. 40-51 (section "Inverses and Updates").

   This replaces the plain Sherman-Morrison update with the Fahy variant, as
   in the CPU code; see Eq. (26) of S. Fahy, X. W. Wang, and Steven G. Louie,
   Phys. Rev. B 42, 3503 (1990).

   u is the new row replacing the k-th row of A. Compute u * Ainv (gemv) and
   return it via u_Ainv. Copy out the k-th column of Ainv as a contiguous
   vector Ainv_colk for use by the second part of the update process. The
   dimension of the square matrices A and Ainv is N, and the stride
   (in elements) between consecutive rows of Ainv (and A) is rowstride.
   All matrices are stored in row-major order.
*/
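/* In matrix form: writing a_k for the old k-th row of A and delta = u - a_k,
   the row replacement is A' = A + e_k delta^T, and Sherman-Morrison gives
     Ainv' = Ainv - (Ainv e_k)(delta^T Ainv) / (1 + delta^T Ainv e_k).
   Because a_k Ainv = e_k^T, the denominator equals u Ainv e_k, i.e. the k-th
   element of the u_Ainv vector computed below; that is the Fahy form used by
   these kernels. */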
template<typename T, int BS>
__device__ __forceinline__ void update_inverse_core1(T* __restrict__ A,
                                                     const T* __restrict__ Ainv,
                                                     const T* __restrict__ u,
                                                     T* __restrict__ u_Ainv,
                                                     T* __restrict__ Ainv_colk,
                                                     int k,
                                                     int N,
                                                     int rowstride)
{
#ifdef QMC_COMPLEX
  __shared__ uninitialized_array<T, BS> delta;
#else
  __shared__ T delta[BS];
#endif
  T sum(0);
  int tidx      = threadIdx.x;
  int col_Ainv  = blockIdx.x * BS + tidx;
  int numBlocks = (N + BS - 1) / BS;
  for (int block = 0; block < numBlocks; block++)
  {
    int blockStart = block * BS;
    int col_A      = blockStart + tidx;
    if (col_A < N)
    {
      delta[tidx]              = u[col_A];
      A[k * rowstride + col_A] = u[col_A];
    }
    __syncthreads();
    for (int i = 0; i < min(BS, N - blockStart); i++)
    {
      const int row_Ainv = blockStart + i;
      sum += delta[i] * Ainv[row_Ainv * rowstride + col_Ainv];
    }
    __syncthreads();
  }
  // Write this thread's segment of the row vector u*Ainv back to global memory
  if (col_Ainv < N)
  {
    u_Ainv[col_Ainv] = sum;
    // save the column k of Ainv
    Ainv_colk[col_Ainv] = Ainv[col_Ainv * rowstride + k];
  }
}

/* Update of Ainv after rank-1 update to A, using Sherman-Morrison algorithm,
   part 2. See also K. P. Esler, J. Kim, D. M. Ceperley, and L. Shulenburger:
   Accelerating Quantum Monte Carlo Simulations of Real Materials on GPU
   Clusters. Computing in Science & Engineering, Vol. 14, No. 1, Jan/Feb 2012,
   pp. 40-51 (section "Inverses and Updates").
   This replaces the plain Sherman-Morrison update with the Fahy variant, as
   in the CPU code; see Eq. (26) of S. Fahy, X. W. Wang, and Steven G. Louie,
   Phys. Rev. B 42, 3503 (1990).

   Ainv * ek, the k-th column of Ainv, was extracted in the first step and is
   passed in as the column vector Ainv_colk. Step 1 also computed the row
   vector u*Ainv, passed in via u_Ainv. We multiply (ger) Ainv_colk with
   (u*Ainv - ek^T), scale the result by -1/(u*Ainv*ek), and add it to Ainv.
   u*Ainv*ek is simply the k-th element of u_Ainv; subtracting ek^T amounts
   to subtracting 1 from that element. The dimension of the square matrices
   A and Ainv is N, and the stride (in elements) between consecutive rows of
   Ainv and A is rowstride. All matrices are stored in row-major order.
*/
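/* Element-wise, the update performed below is
     Ainv'[i][j] = Ainv[i][j] - Ainv_colk[i] * (u_Ainv[j] - d_kj) / u_Ainv[k],
   where d_kj is the Kronecker delta. prefact holds -1/u_Ainv[k], and the
   subtraction of d_kj appears as the "- T(1)" applied at column k. */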
template<typename T, int BS>
__device__ __forceinline__ void update_inverse_core2(T* __restrict__ Ainv,
                                                     const T* __restrict__ u_Ainv,
                                                     const T* __restrict__ Ainv_colk,
                                                     int k,
                                                     int N,
                                                     int rowstride)
{
  T u_Ainv_shared;
#ifdef QMC_COMPLEX
  __shared__ uninitialized_array<T, BS> Ainv_colk_shared;
#else
  __shared__ T Ainv_colk_shared[BS];
#endif
  T prefact;
  int tidx      = threadIdx.x;
  int col_Ainv  = blockIdx.x * BS + tidx;
  int numBlocks = (N + BS - 1) / BS;
  // Cache this thread's element of the row vector u*Ainv; the k-th row of A
  // itself was already replaced by u in part 1 of the update.
  if (col_Ainv < N)
    u_Ainv_shared = u_Ainv[col_Ainv];
  if (col_Ainv == k)
    u_Ainv_shared = u_Ainv[k] - T(1);
  prefact = -T(1) / u_Ainv[k];
  for (int block = 0; block < numBlocks; block++)
  {
    int blockStart = block * BS;
    int row_Ainv;
    row_Ainv = blockStart + tidx;
    // cache and scale next segment of k-th column of Ainv
    __syncthreads();
    if (row_Ainv < N)
    {
      Ainv_colk_shared[tidx] = prefact * Ainv_colk[row_Ainv];
    }
    __syncthreads();
    if (col_Ainv < N)
    {
      for (int i = 0; i < min(BS, N - blockStart); i++)
      {
        row_Ainv = blockStart + i;
        // update one segment of current row of Ainv
        Ainv[row_Ainv * rowstride + col_Ainv] += u_Ainv_shared * Ainv_colk_shared[i];
      }
    }
  }
}

template<typename T, int BS>
__device__ __forceinline__ void update_inverse_core2_subblock(T* __restrict__ Ainv,
                                                              const T* __restrict__ u_Ainv,
                                                              const T* __restrict__ Ainv_colk,
                                                              int k,
                                                              int N,
                                                              int rowstride)
{
  T u_Ainv_shared;
#ifdef QMC_COMPLEX
  __shared__ uninitialized_array<T, BS> Ainv_colk_shared;
#else
  __shared__ T Ainv_colk_shared[BS];
#endif
  T prefact;
  int tidx     = threadIdx.x;
  int col_Ainv = blockIdx.y * BS + tidx;
  // int numBlocks = (N + BS - 1) / BS;
  // Cache this thread's element of the row vector u*Ainv; the k-th row of A
  // itself was already replaced by u in part 1 of the update.
  if (col_Ainv < N)
    u_Ainv_shared = u_Ainv[col_Ainv];
  if (col_Ainv == k)
    u_Ainv_shared = u_Ainv[k] - T(1);
  prefact              = -T(1) / u_Ainv[k];
  const int blockStart = blockIdx.z * BS;
  int row_Ainv;
  row_Ainv = blockStart + tidx;
  // cache and scale next segment of k-th column of Ainv
  __syncthreads();
  if (row_Ainv < N)
  {
    Ainv_colk_shared[tidx] = prefact * Ainv_colk[row_Ainv];
  }
  __syncthreads();
  if (col_Ainv < N)
  {
    for (int i = 0; i < min(BS, N - blockStart); i++)
    {
      row_Ainv = blockStart + i;
      // update one segment of current row of Ainv
      Ainv[row_Ainv * rowstride + col_Ainv] += u_Ainv_shared * Ainv_colk_shared[i];
    }
  }
}

/////////////////////////////////////////////////
// New version with fewer PCI transfers needed //
/////////////////////////////////////////////////
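/* In this version each walker owns a single pooled device buffer: data[w]
   points to the buffer of walker w, and the *_off arguments give the element
   offsets of A, Ainv, the proposed row u, u*Ainv, and the saved k-th column
   of Ainv within it. Only the pointer array and the iat array then need to
   cross the bus per update, rather than one pointer list per matrix. */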

template<typename T, int BS>
__global__ void update_inverse_kernel1(T** data,
                                       int* iat,
                                       int A_off,
                                       int Ainv_off,
                                       int newRow_off,
                                       int AinvDelta_off,
                                       int AinvColk_off,
                                       int N,
                                       int rowstride)
{
  T* const sdata = data[blockIdx.y];
  T* A           = sdata + A_off;         // A
  const T* Ainv  = sdata + Ainv_off;      // Ainv
  const T* u     = sdata + newRow_off;    // new k-th row of A
  T* u_Ainv      = sdata + AinvDelta_off; // u * Ainv
  T* Ainv_colk   = sdata + AinvColk_off;  // k-th column of orig. Ainv
  int k          = iat[blockIdx.y];
  update_inverse_core1<T, BS>(A, Ainv, u, u_Ainv, Ainv_colk, k, N, rowstride);
}

template<typename T, int BS>
__global__ void update_inverse_kernel2(T** data,
                                       int* iat,
                                       int A_off,
                                       int Ainv_off,
                                       int newRow_off,
                                       int AinvDelta_off,
                                       int AinvColk_off,
                                       int N,
                                       int rowstride)

{
  T* const sdata     = data[blockIdx.y];
  T* Ainv            = sdata + Ainv_off;      // Ainv
  const T* u_Ainv    = sdata + AinvDelta_off; // u * Ainv
  const T* Ainv_colk = sdata + AinvColk_off;  // k-th column of orig. Ainv
  int k              = iat[blockIdx.y];
  update_inverse_core2<T, BS>(Ainv, u_Ainv, Ainv_colk, k, N, rowstride);
}

void update_inverse_cuda(float** data,
                         int iat[],
                         int A_off,
                         int Ainv_off,
                         int newRow_off,
                         int AinvDelta_off,
                         int AinvColk_off,
                         int N,
                         int rowstride,
                         int numWalkers)
{
  const int BS1 = 128;
  const int BS2 = 128;
  int NB1       = (N + BS1 - 1) / BS1;
  int NB2       = (N + BS2 - 1) / BS2;
  dim3 dimBlock1(BS1);
  dim3 dimGrid1(NB1, numWalkers);
  dim3 dimBlock2(BS2);
  dim3 dimGrid2(NB2, numWalkers);
  update_inverse_kernel1<float, BS1>
      <<<dimGrid1, dimBlock1>>>(data, iat, A_off, Ainv_off, newRow_off, AinvDelta_off, AinvColk_off, N, rowstride);
  update_inverse_kernel2<float, BS2>
      <<<dimGrid2, dimBlock2>>>(data, iat, A_off, Ainv_off, newRow_off, AinvDelta_off, AinvColk_off, N, rowstride);
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    fprintf(stderr, "CUDA error in update_inverse_cuda:\n  %s\n", cudaGetErrorString(err));
    abort();
  }
}

void update_inverse_cuda(double** data,
                         int iat[],
                         int A_off,
                         int Ainv_off,
                         int newRow_off,
                         int AinvDelta_off,
                         int AinvColk_off,
                         int N,
                         int rowstride,
                         int numWalkers)
{
  const int BS1 = 32;
  const int BS2 = 32;
  int NB1       = (N + BS1 - 1) / BS1;
  int NB2       = (N + BS2 - 1) / BS2;
  dim3 dimBlock1(BS1);
  dim3 dimGrid1(NB1, numWalkers);
  dim3 dimBlock2(BS2);
  dim3 dimGrid2(NB2, numWalkers);
  update_inverse_kernel1<double, BS1>
      <<<dimGrid1, dimBlock1>>>(data, iat, A_off, Ainv_off, newRow_off, AinvDelta_off, AinvColk_off, N, rowstride);
  update_inverse_kernel2<double, BS2>
      <<<dimGrid2, dimBlock2>>>(data, iat, A_off, Ainv_off, newRow_off, AinvDelta_off, AinvColk_off, N, rowstride);
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    fprintf(stderr, "CUDA error in update_inverse_cuda:\n  %s\n", cudaGetErrorString(err));
    abort();
  }
}

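/* Usage sketch (illustrative only; the d_* names are hypothetical, not part
   of this API): with all walkers' matrices packed into pooled buffers,

     // float** d_data: device array of per-walker buffer pointers
     // int*    d_iat : device array of electron (row) indices, one per walker
     update_inverse_cuda(d_data, d_iat, A_off, Ainv_off, newRow_off,
                         AinvDelta_off, AinvColk_off, N, rowstride, numWalkers);

   updates every walker's inverse with two batched kernel launches. */
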
#ifdef QMC_COMPLEX

void update_inverse_cuda(std::complex<float>** data,
                         int iat[],
                         int A_off,
                         int Ainv_off,
                         int newRow_off,
                         int AinvDelta_off,
                         int AinvColk_off,
                         int N,
                         int rowstride,
                         int numWalkers)
{
  const int BS1 = 128;
  const int BS2 = 128;
  int NB1       = (N + BS1 - 1) / BS1;
  int NB2       = (N + BS2 - 1) / BS2;
  dim3 dimBlock1(BS1);
  dim3 dimGrid1(NB1, numWalkers);
  dim3 dimBlock2(BS2);
  dim3 dimGrid2(NB2, numWalkers);

  update_inverse_kernel1<thrust::complex<float>, BS1><<<dimGrid1, dimBlock1>>>((thrust::complex<float>**)data,
                                                                               iat,
                                                                               A_off,
                                                                               Ainv_off,
                                                                               newRow_off,
                                                                               AinvDelta_off,
                                                                               AinvColk_off,
                                                                               N,
                                                                               rowstride);
  update_inverse_kernel2<thrust::complex<float>, BS2><<<dimGrid2, dimBlock2>>>((thrust::complex<float>**)data,
                                                                               iat,
                                                                               A_off,
                                                                               Ainv_off,
                                                                               newRow_off,
                                                                               AinvDelta_off,
                                                                               AinvColk_off,
                                                                               N,
                                                                               rowstride);
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    fprintf(stderr, "CUDA error in update_inverse_cuda:\n  %s\n", cudaGetErrorString(err));
    abort();
  }
}

void update_inverse_cuda(std::complex<double>** data,
                         int iat[],
                         int A_off,
                         int Ainv_off,
                         int newRow_off,
                         int AinvDelta_off,
                         int AinvColk_off,
                         int N,
                         int rowstride,
                         int numWalkers)
{
  const int BS1 = 32;
  const int BS2 = 32;
  int NB1       = (N + BS1 - 1) / BS1;
  int NB2       = (N + BS2 - 1) / BS2;
  dim3 dimBlock1(BS1);
  dim3 dimGrid1(NB1, numWalkers);
  dim3 dimBlock2(BS2);
  dim3 dimGrid2(NB2, numWalkers);

  update_inverse_kernel1<thrust::complex<double>, BS1><<<dimGrid1, dimBlock1>>>((thrust::complex<double>**)data,
                                                                                iat,
                                                                                A_off,
                                                                                Ainv_off,
                                                                                newRow_off,
                                                                                AinvDelta_off,
                                                                                AinvColk_off,
                                                                                N,
                                                                                rowstride);
  update_inverse_kernel2<thrust::complex<double>, BS2><<<dimGrid2, dimBlock2>>>((thrust::complex<double>**)data,
                                                                                iat,
                                                                                A_off,
                                                                                Ainv_off,
                                                                                newRow_off,
                                                                                AinvDelta_off,
                                                                                AinvColk_off,
                                                                                N,
                                                                                rowstride);
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    fprintf(stderr, "CUDA error in update_inverse_cuda:\n  %s\n", cudaGetErrorString(err));
    abort();
  }
}


#endif


template<typename T, int BS>
__global__ void update_inverse_kernel1(T** data,
                                       int k,
                                       int A_off,
                                       int Ainv_off,
                                       int newRow_off,
                                       int AinvDelta_off,
                                       int AinvColk_off,
                                       int N,
                                       int rowstride)
{
  T* const sdata = data[blockIdx.y];
  T* A           = sdata + A_off;         // A
  const T* Ainv  = sdata + Ainv_off;      // Ainv
  const T* u     = sdata + newRow_off;    // new k-th row of A
  T* u_Ainv      = sdata + AinvDelta_off; // u * Ainv
  T* Ainv_colk   = sdata + AinvColk_off;  // k-th column of orig. Ainv
  update_inverse_core1<T, BS>(A, Ainv, u, u_Ainv, Ainv_colk, k, N, rowstride);
}

template<typename T, int BS>
__global__ void update_inverse_kernel2(T** data,
                                       int k,
                                       int A_off,
                                       int Ainv_off,
                                       int newRow_off,
                                       int AinvDelta_off,
                                       int AinvColk_off,
                                       int N,
                                       int rowstride)

{
  T* const sdata     = data[blockIdx.y];
  T* Ainv            = sdata + Ainv_off;      // Ainv
  const T* u_Ainv    = sdata + AinvDelta_off; // u * Ainv
  const T* Ainv_colk = sdata + AinvColk_off;  // k-th column of orig. Ainv
  // A and the new row u are not needed here: update_inverse_core1 already
  // copied u into the k-th row of A, and update_inverse_core2 takes only
  // Ainv, u*Ainv, and the saved k-th column of Ainv.
  update_inverse_core2<T, BS>(Ainv, u_Ainv, Ainv_colk, k, N, rowstride);
}

template<typename T, int BS>
__global__ void update_inverse_kernel2_subblock(T** data,
                                                int k,
                                                int A_off,
                                                int Ainv_off,
                                                int newRow_off,
                                                int AinvDelta_off,
                                                int AinvColk_off,
                                                int N,
                                                int rowstride)

{
  T* const sdata     = data[blockIdx.x];
  T* Ainv            = sdata + Ainv_off;      // Ainv
  const T* u_Ainv    = sdata + AinvDelta_off; // u * Ainv
  const T* Ainv_colk = sdata + AinvColk_off;  // k-th column of orig. Ainv
  update_inverse_core2_subblock<T, BS>(Ainv, u_Ainv, Ainv_colk, k, N, rowstride);
}

void update_inverse_cuda(float** data,
                         int iat,
                         int A_off,
                         int Ainv_off,
                         int newRow_off,
                         int AinvDelta_off,
                         int AinvColk_off,
                         int N,
                         int rowstride,
                         int numWalkers)
{
  const int BS1 = 128;
  const int BS2 = 128;
  int NB1       = (N + BS1 - 1) / BS1;
  int NB2       = (N + BS2 - 1) / BS2;
  dim3 dimBlock1(BS1);
  dim3 dimGrid1(NB1, numWalkers);
  dim3 dimBlock2(BS2);
  dim3 dimGrid2(NB2, numWalkers);
  update_inverse_kernel1<float, BS1>
      <<<dimGrid1, dimBlock1>>>(data, iat, A_off, Ainv_off, newRow_off, AinvDelta_off, AinvColk_off, N, rowstride);
  /* reference implementation, replaced by _subblock version. Ye Luo
  update_inverse_kernel2<float,BS2><<<dimGrid2,dimBlock2>>>
  (data, iat, A_off, Ainv_off, newRow_off, AinvDelta_off, AinvColk_off,
   N, rowstride);
  */
  dim3 dimGrid3(numWalkers, NB2, NB2);
  update_inverse_kernel2_subblock<float, BS2>
      <<<dimGrid3, dimBlock2>>>(data, iat, A_off, Ainv_off, newRow_off, AinvDelta_off, AinvColk_off, N, rowstride);
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    fprintf(stderr, "CUDA error in update_inverse_cuda:\n  %s\n", cudaGetErrorString(err));
    abort();
  }
}

void update_inverse_cuda(double** data,
                         int iat,
                         int A_off,
                         int Ainv_off,
                         int newRow_off,
                         int AinvDelta_off,
                         int AinvColk_off,
                         int N,
                         int rowstride,
                         int numWalkers)
{
  const int BS1 = 64;
  const int BS2 = 64;
  int NB1       = (N + BS1 - 1) / BS1;
  int NB2       = (N + BS2 - 1) / BS2;
  dim3 dimBlock1(BS1);
  dim3 dimGrid1(NB1, numWalkers);
  dim3 dimBlock2(BS2);
  dim3 dimGrid2(NB2, numWalkers);
  update_inverse_kernel1<double, BS1>
      <<<dimGrid1, dimBlock1>>>(data, iat, A_off, Ainv_off, newRow_off, AinvDelta_off, AinvColk_off, N, rowstride);
  /* reference implementation, replaced by _subblock version. Ye Luo
  update_inverse_kernel2<double,BS2><<<dimGrid2,dimBlock2>>>
  (data, iat, A_off, Ainv_off, newRow_off, AinvDelta_off, AinvColk_off,
   N, rowstride);
  */
  dim3 dimGrid3(numWalkers, NB2, NB2);
  update_inverse_kernel2_subblock<double, BS2>
      <<<dimGrid3, dimBlock2>>>(data, iat, A_off, Ainv_off, newRow_off, AinvDelta_off, AinvColk_off, N, rowstride);
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    fprintf(stderr, "CUDA error in update_inverse_cuda:\n  %s\n", cudaGetErrorString(err));
    abort();
  }
}

#ifdef QMC_COMPLEX
void update_inverse_cuda(std::complex<float>** data,
                         int iat,
                         int A_off,
                         int Ainv_off,
                         int newRow_off,
                         int AinvDelta_off,
                         int AinvColk_off,
                         int N,
                         int rowstride,
                         int numWalkers)
{
  const int BS1 = 128;
  const int BS2 = 128;
  int NB1       = (N + BS1 - 1) / BS1;
  int NB2       = (N + BS2 - 1) / BS2;
  dim3 dimBlock1(BS1);
  dim3 dimGrid1(NB1, numWalkers);
  dim3 dimBlock2(BS2);
  dim3 dimGrid2(NB2, numWalkers);

  update_inverse_kernel1<thrust::complex<float>, BS1><<<dimGrid1, dimBlock1>>>((thrust::complex<float>**)data,
                                                                               iat,
                                                                               A_off,
                                                                               Ainv_off,
                                                                               newRow_off,
                                                                               AinvDelta_off,
                                                                               AinvColk_off,
                                                                               N,
                                                                               rowstride);
  /* reference implementation, replaced by _subblock version. Ye Luo
  update_inverse_kernel2<thrust::complex<float>,BS2><<<dimGrid2,dimBlock2>>>
  ((thrust::complex<float>**)data, iat, A_off, Ainv_off, newRow_off, AinvDelta_off, AinvColk_off,
   N, rowstride);
  */
  dim3 dimGrid3(numWalkers, NB2, NB2);
  update_inverse_kernel2_subblock<thrust::complex<float>, BS2><<<dimGrid3, dimBlock2>>>((thrust::complex<float>**)data,
                                                                                        iat,
                                                                                        A_off,
                                                                                        Ainv_off,
                                                                                        newRow_off,
                                                                                        AinvDelta_off,
                                                                                        AinvColk_off,
                                                                                        N,
                                                                                        rowstride);
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    fprintf(stderr, "CUDA error in update_inverse_cuda:\n  %s\n", cudaGetErrorString(err));
    abort();
  }
}

void update_inverse_cuda(std::complex<double>** data,
                         int iat,
                         int A_off,
                         int Ainv_off,
                         int newRow_off,
                         int AinvDelta_off,
                         int AinvColk_off,
                         int N,
                         int rowstride,
                         int numWalkers)
{
  const int BS1 = 64;
  const int BS2 = 64;
  int NB1       = (N + BS1 - 1) / BS1;
  int NB2       = (N + BS2 - 1) / BS2;
  dim3 dimBlock1(BS1);
  dim3 dimGrid1(NB1, numWalkers);
  dim3 dimBlock2(BS2);
  dim3 dimGrid2(NB2, numWalkers);

  update_inverse_kernel1<thrust::complex<double>, BS1><<<dimGrid1, dimBlock1>>>((thrust::complex<double>**)data,
                                                                                iat,
                                                                                A_off,
                                                                                Ainv_off,
                                                                                newRow_off,
                                                                                AinvDelta_off,
                                                                                AinvColk_off,
                                                                                N,
                                                                                rowstride);
  /* reference implementation, replaced by _subblock version. Ye Luo
  update_inverse_kernel2<thrust::complex<double>,BS2><<<dimGrid2,dimBlock2>>>
  ((thrust::complex<double>**)data, iat, A_off, Ainv_off, newRow_off, AinvDelta_off, AinvColk_off,
   N, rowstride);
  */
  dim3 dimGrid3(numWalkers, NB2, NB2);
  update_inverse_kernel2_subblock<thrust::complex<double>, BS2>
      <<<dimGrid3, dimBlock2>>>((thrust::complex<double>**)data,
                                iat,
                                A_off,
                                Ainv_off,
                                newRow_off,
                                AinvDelta_off,
                                AinvColk_off,
                                N,
                                rowstride);
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    fprintf(stderr, "CUDA error in update_inverse_cuda:\n  %s\n", cudaGetErrorString(err));
    abort();
  }
}

#endif


// The first kernel just computes AinvT * u and also stores the kth
// col of Ainv in global memory
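// The work is split across two kernel launches because the rank-1 update may
// not begin until u*Ainv and the saved column are complete for every block,
// and this legacy code path has no grid-wide barrier within a single kernel;
// the kernel boundary provides that synchronization.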
template<typename T, int BS>
__global__ void update_inverse_cuda1(T** A_g,
                                     T** Ainv_g,
                                     T** u_g,
                                     T** Ainv_delta_g,
                                     T** Ainv_colk_g,
                                     int N,
                                     int rowstride,
                                     int k)
{
  __shared__ T *A, *Ainv, *u, *Ainv_delta, *Ainv_colk;
  if (threadIdx.x == 0)
  {
    A          = A_g[blockIdx.y];
    Ainv       = Ainv_g[blockIdx.y];
    u          = u_g[blockIdx.y];
    Ainv_delta = Ainv_delta_g[blockIdx.y];
    Ainv_colk  = Ainv_colk_g[blockIdx.y];
  }
  __syncthreads();
  // Store the product Ainv * u in shared memory
  T Ainv_delta_tid;
  __shared__ T Ainv_colk_shared[BS], delta[BS];
  Ainv_delta_tid = 0.0f;
  __syncthreads();
  int col       = blockIdx.x * BS + threadIdx.x;
  int numblocks = N / BS + ((N % BS) ? 1 : 0);
  int kBlock    = k / BS;
  // If the column I need to pull from Ainv is in this thread block
  // domain, do the following
  __syncthreads();
  for (int block = 0; block < numblocks; block++)
  {
    delta[threadIdx.x] = u[block * BS + threadIdx.x] - A[k * rowstride + block * BS + threadIdx.x];
    __syncthreads();
    for (int i = 0; i < BS; i++)
    {
      int row = block * BS + i;
      T a     = Ainv[row * rowstride + col];
      if (col == k)
        Ainv_colk_shared[i] = a;
      Ainv_delta_tid += a * delta[i];
      __syncthreads();
    }
    if (blockIdx.x == kBlock)
      Ainv_colk[block * BS + threadIdx.x] = Ainv_colk_shared[threadIdx.x];
    __syncthreads();
  }
  __syncthreads();
  // Write the data back to global memory
  Ainv_delta[col] = Ainv_delta_tid;
  __syncthreads();
}


template<typename T, int BS>
__global__ void update_inverse_cuda2(T** A_g,
                                     T** Ainv_g,
                                     T** u_g,
                                     T** Ainv_delta_g,
                                     T** Ainv_colk_g,
                                     int N,
                                     int rowstride,
                                     int k)
{
  __shared__ T *A, *u, *Ainv, *Ainv_delta, *Ainv_colk;
  int tid = threadIdx.x;
  if (threadIdx.x == 0)
  {
    A          = A_g[blockIdx.y];
    u          = u_g[blockIdx.y];
    Ainv       = Ainv_g[blockIdx.y];
    Ainv_delta = Ainv_delta_g[blockIdx.y];
    Ainv_colk  = Ainv_colk_g[blockIdx.y];
  }
  __syncthreads();
  T Ainv_delta_tid;
  __shared__ T Ainv_colk_shared[BS];
  int col = blockIdx.x * BS + threadIdx.x;
  // Read the data back from global memory
  Ainv_delta_tid                = Ainv_delta[col];
  Ainv_colk_shared[threadIdx.x] = Ainv_colk[col];
  if (col < N)
    A[k * rowstride + col] = u[col];
  __syncthreads();
  __shared__ T prefact;
  if (threadIdx.x == 0)
    prefact = -1.0f / (1.0f + Ainv_delta[k]);
  int numblocks = N / BS + ((N % BS) ? 1 : 0);
  __syncthreads();
  for (int block = 0; block < numblocks; block++)
  {
    Ainv_colk_shared[tid] = prefact * Ainv_colk[block * BS + threadIdx.x];
    __syncthreads();
    T* Ainv_row = Ainv + block * BS * rowstride + col;
    for (int i = 0; i < BS; i++, Ainv_row += rowstride)
      *Ainv_row += Ainv_delta_tid * Ainv_colk_shared[i];
    __syncthreads();
  }
}


void update_inverse_cuda(float* A_g[],
                         float* Ainv_g[],
                         float* u_g[],
                         float* Ainv_delta_g[],
                         float* Ainv_colk_g[],
                         int N,
                         int rowstride,
                         int iat,
                         int numWalkers)
{
  const int BS1 = 64;
  const int BS2 = 64;
  int NB1       = N / BS1 + ((N % BS1) ? 1 : 0);
  int NB2       = N / BS2 + ((N % BS2) ? 1 : 0);
  dim3 dimBlock1(BS1);
  dim3 dimGrid1(NB1, numWalkers);
  dim3 dimBlock2(BS2);
  dim3 dimGrid2(NB2, numWalkers);
  update_inverse_cuda1<float, BS1>
      <<<dimGrid1, dimBlock1>>>(A_g, Ainv_g, u_g, Ainv_delta_g, Ainv_colk_g, N, rowstride, iat);
  update_inverse_cuda2<float, BS2>
      <<<dimGrid2, dimBlock2>>>(A_g, Ainv_g, u_g, Ainv_delta_g, Ainv_colk_g, N, rowstride, iat);
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    fprintf(stderr, "CUDA error in update_inverse_cuda:\n  %s\n", cudaGetErrorString(err));
    abort();
  }
}

void update_inverse_cuda(double* A_g[],
                         double* Ainv_g[],
                         double* u_g[],
                         double* Ainv_delta_g[],
                         double* Ainv_colk_g[],
                         int N,
                         int rowstride,
                         int iat,
                         int numWalkers)
{
  const int BS1 = 32;
  const int BS2 = 32;
  int NB1       = N / BS1 + ((N % BS1) ? 1 : 0);
  int NB2       = N / BS2 + ((N % BS2) ? 1 : 0);
  dim3 dimBlock1(BS1);
  dim3 dimGrid1(NB1, numWalkers);
  dim3 dimBlock2(BS2);
  dim3 dimGrid2(NB2, numWalkers);
  update_inverse_cuda1<double, BS1>
      <<<dimGrid1, dimBlock1>>>(A_g, Ainv_g, u_g, Ainv_delta_g, Ainv_colk_g, N, rowstride, iat);
  update_inverse_cuda2<double, BS2>
      <<<dimGrid2, dimBlock2>>>(A_g, Ainv_g, u_g, Ainv_delta_g, Ainv_colk_g, N, rowstride, iat);
}


template<typename T, int BS, int MAXN>
__global__ void update_inverse_transpose_cuda(T** A_g, T** AinvT_g, T** u_g, int N, int row_stride, int elec)
{
  __shared__ T AinvT_row[MAXN], Ainv_colk[MAXN], delta[MAXN];
  int numBlocks = N / blockDim.x + ((N % blockDim.x) ? 1 : 0);
  //int numBlocks = 4;
  __shared__ T *A, *AinvT, *u;
  if (threadIdx.x == 0)
  {
    A     = A_g[blockIdx.x];
    AinvT = AinvT_g[blockIdx.x];
    u     = u_g[blockIdx.x];
  }
  __syncthreads();
  T prefactor;
  __shared__ T sum[BS];
  sum[threadIdx.x] = 0.0f;
  for (int block = 0; block < numBlocks; block++)
  {
    int off                    = block * BS + threadIdx.x;
    T u1                       = u[off];
    delta[off]                 = u1 - A[elec * row_stride + off];
    Ainv_colk[off]             = AinvT[elec * row_stride + off];
    A[elec * row_stride + off] = u1;
    sum[threadIdx.x] += (Ainv_colk[off] * delta[off]);
  }
  __syncthreads();
  for (int s = (BS >> 1); s > 0; s >>= 1)
  {
    __syncthreads();
    if (threadIdx.x < s)
      sum[threadIdx.x] += sum[threadIdx.x + s];
  }
  __syncthreads();
  prefactor = -1.0f / (1.0f + sum[0]);
  for (int row = 0; row < N; row++)
  {
    // First load row into shared memory
    sum[threadIdx.x] = 0.0;
    for (int block = 0; block < numBlocks; block++)
    {
      int off        = block * BS + threadIdx.x;
      AinvT_row[off] = AinvT[row * row_stride + off];
      sum[threadIdx.x] += (AinvT_row[off] * delta[off]);
    }
    // Now sum across row to get Ainv_delta
    for (int s = BS >> 1; s > 0; s >>= 1)
    {
      __syncthreads();
      if (threadIdx.x < s)
        sum[threadIdx.x] += sum[threadIdx.x + s];
    }
    __syncthreads();
    // sum[0] now has the AinvT * delta
    // Add on outer product
    for (int block = 0; block < numBlocks; block++)
    {
      int off                       = BS * block + threadIdx.x;
      AinvT[row * row_stride + off] = AinvT_row[off] + prefactor * sum[0] * Ainv_colk[off];
    }
    __syncthreads();
  }
}


template<typename T, int BS, int MAXN>
__global__ void update_inverse_transpose_cuda_2pass(T** A_g, T** AinvT_g, T** u_g, int N, int row_stride, int elec)
{
  __shared__ T Ainv_colk[MAXN], delta[MAXN];
  int numBlocks = N / blockDim.x + ((N % blockDim.x) ? 1 : 0);
  //int numBlocks = 4;
  __shared__ T *A, *AinvT, *u;
  if (threadIdx.x == 0)
  {
    A     = A_g[blockIdx.x];
    AinvT = AinvT_g[blockIdx.x];
    u     = u_g[blockIdx.x];
  }
  __syncthreads();
  T prefactor;
  __shared__ T sum[BS];
  sum[threadIdx.x] = 0.0f;
  for (int block = 0; block < numBlocks; block++)
  {
    int off                    = block * BS + threadIdx.x;
    T u1                       = u[off];
    delta[off]                 = u1 - A[elec * row_stride + off];
    Ainv_colk[off]             = AinvT[elec * row_stride + off];
    A[elec * row_stride + off] = u1;
    sum[threadIdx.x] += (Ainv_colk[off] * delta[off]);
  }
  __syncthreads();
  for (int s = (BS >> 1); s > 0; s >>= 1)
  {
    __syncthreads();
    if (threadIdx.x < s && threadIdx.y == 0)
      sum[threadIdx.x] += sum[threadIdx.x + s];
  }
  __syncthreads();
  prefactor = -1.0f / (1.0f + sum[0]);
  __shared__ T sum2[BS][BS + 1];
  for (int b1 = 0; b1 < numBlocks; b1++)
  {
    sum[threadIdx.x] = 0.0f;
    for (int i = 0; i < BS; i++)
      sum2[i][threadIdx.x] = 0.0f;
    // Compute Ainv * delta;
    for (int i = 0; i < BS; i++)
    {
      int row = b1 * BS + i;
      for (int b2 = 0; b2 < numBlocks; b2++)
      {
        int col = b2 * BS + threadIdx.x;
        sum2[i][threadIdx.x] += AinvT[row * row_stride + col] * delta[col];
      }
    }
    __syncthreads();
    for (int i = 0; i < BS; i++)
      sum[threadIdx.x] += prefactor * sum2[threadIdx.x][i];
    // Outer product
    for (int i = 0; i < BS; i++)
    {
      int row = b1 * BS + i;
      for (int b2 = 0; b2 < numBlocks; b2++)
      {
        int col = b2 * BS + threadIdx.x;
        AinvT[row * row_stride + col] += sum[i] * Ainv_colk[col];
      }
    }
  }
}


template<typename T, int BS>
__global__ void calc_ratios_transpose(T** AinvT_list,
                                      T** new_row_list,
                                      T* ratio_out,
                                      int N,
                                      int row_stride,
                                      int elec,
                                      int numMats)
{
  __shared__ T *AinvT[BS], *new_row[BS];
  int matNum = blockIdx.x * BS + threadIdx.x;
  if (matNum < numMats)
  {
    AinvT[threadIdx.x]   = AinvT_list[matNum] + row_stride * BS;
    new_row[threadIdx.x] = new_row_list[matNum];
  }
  __shared__ T AinvT_phi[BS][BS + 1];
  __shared__ T ratio[BS];
  ratio[threadIdx.x] = 0.0;
  int numBlocks      = N / BS;
  if (numBlocks * BS < N)
    numBlocks++;
  for (int block = 0; block < numBlocks; block++)
  {
    int col = block * BS + threadIdx.x;
    // First, read the data into shared memory
    for (int i = 0; i < BS; i++)
      AinvT_phi[i][threadIdx.x] = (AinvT[i])[col] * (new_row[i])[col];
    __syncthreads();
    // Now sum
    for (int i = 0; i < BS; i++)
      ratio[threadIdx.x] += AinvT_phi[threadIdx.x][i];
  }
  if (matNum < numMats)
    ratio_out[matNum] = ratio[threadIdx.x];
}


template<typename T, int BS>
__global__ void calc_ratios(T** Ainv_list, T** new_row_list, T* ratio, int N, int row_stride, int elec)
{
  int tid = threadIdx.x;
  int col = /*blockIdx.x*BS * */ tid;
  __shared__ T *Ainv, *new_row;
  if (tid == 0)
  {
    Ainv    = Ainv_list[blockIdx.x];
    new_row = new_row_list[blockIdx.x];
  }
  __syncthreads();
#ifdef QMC_COMPLEX
  __shared__ uninitialized_array<T, BS> new_row_shared;
#else
  __shared__ T new_row_shared[BS];
#endif
  if (col < N)
    new_row_shared[tid] = new_row[tid];
#ifdef QMC_COMPLEX
  __shared__ uninitialized_array<T, BS> Ainv_colk_shared;
#else
  __shared__ T Ainv_colk_shared[BS];
#endif
  // This is *highly* uncoalesced, but we just have to eat it to allow
  // other kernels to operate quickly.
  if (col < N)
    Ainv_colk_shared[tid] = Ainv[col * row_stride + elec];
  __syncthreads();
#ifdef QMC_COMPLEX
  __shared__ uninitialized_array<T, BS> Ainv_new_row;
#else
  __shared__ T Ainv_new_row[BS];
#endif
  if (col < N)
    Ainv_new_row[tid] = Ainv_colk_shared[tid] * new_row_shared[tid];
  __syncthreads();
  // Now, we have to dot
  for (unsigned int s = BS / 2; s > 0; s >>= 1)
  {
    if (tid < s && (tid + s) < N)
      Ainv_new_row[tid] += Ainv_new_row[tid + s];
    __syncthreads();
  }
  if (tid == 0)
    ratio[blockIdx.x] = Ainv_new_row[0];
}
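
/* The acceptance ratio follows from the matrix determinant lemma: replacing
   row k (= elec) of A by u gives det(A') / det(A) = u . (Ainv e_k), the dot
   product of the proposed row with the k-th column of Ainv, which is exactly
   what calc_ratios accumulates in Ainv_new_row above. */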
1163 
1164 
determinant_ratios_cuda(float * Ainv_list[],float * new_row_list[],float * ratios,int N,int row_stride,int iat,int numWalkers)1165 void determinant_ratios_cuda(float* Ainv_list[],
1166                              float* new_row_list[],
1167                              float* ratios,
1168                              int N,
1169                              int row_stride,
1170                              int iat,
1171                              int numWalkers)
1172 {
1173   dim3 dimBlock(N);
1174   dim3 dimGrid(numWalkers);
1175   if (N <= 32)
1176     calc_ratios<float, 32><<<dimGrid, dimBlock>>>(Ainv_list, new_row_list, ratios, N, row_stride, iat);
1177   else if (N <= 64)
1178     calc_ratios<float, 64><<<dimGrid, dimBlock>>>(Ainv_list, new_row_list, ratios, N, row_stride, iat);
1179   else if (N <= 128)
1180     calc_ratios<float, 128><<<dimGrid, dimBlock>>>(Ainv_list, new_row_list, ratios, N, row_stride, iat);
1181   else if (N <= 256)
1182     calc_ratios<float, 256><<<dimGrid, dimBlock>>>(Ainv_list, new_row_list, ratios, N, row_stride, iat);
1183   else if (N <= 512)
1184     calc_ratios<float, 512><<<dimGrid, dimBlock>>>(Ainv_list, new_row_list, ratios, N, row_stride, iat);
1185   else if (N <= 1024)
1186     calc_ratios<float, 1024><<<dimGrid, dimBlock>>>(Ainv_list, new_row_list, ratios, N, row_stride, iat);
1187   else
1188   {
1189     fprintf(stdout, "Error:  N too large for CUDA evaluation.\n");
1190     abort();
1191   }
1192 }
1193 
determinant_ratios_cuda(double * Ainv_list[],double * new_row_list[],double * ratios,int N,int row_stride,int iat,int numWalkers)1194 void determinant_ratios_cuda(double* Ainv_list[],
1195                              double* new_row_list[],
1196                              double* ratios,
1197                              int N,
1198                              int row_stride,
1199                              int iat,
1200                              int numWalkers)
1201 {
1202   dim3 dimBlock(N);
1203   dim3 dimGrid(numWalkers);
1204   if (N <= 32)
1205     calc_ratios<double, 32><<<dimGrid, dimBlock>>>(Ainv_list, new_row_list, ratios, N, row_stride, iat);
1206   else if (N <= 64)
1207     calc_ratios<double, 64><<<dimGrid, dimBlock>>>(Ainv_list, new_row_list, ratios, N, row_stride, iat);
1208   else if (N <= 128)
1209     calc_ratios<double, 128><<<dimGrid, dimBlock>>>(Ainv_list, new_row_list, ratios, N, row_stride, iat);
1210   else if (N <= 256)
1211     calc_ratios<double, 256><<<dimGrid, dimBlock>>>(Ainv_list, new_row_list, ratios, N, row_stride, iat);
1212   else if (N <= 512)
1213     calc_ratios<double, 512><<<dimGrid, dimBlock>>>(Ainv_list, new_row_list, ratios, N, row_stride, iat);
1214   else
1215   {
1216     fprintf(stdout, "Error:  N too large for CUDA evaluation.\n");
1217     abort();
1218   }
1219 }
1220 
1221 #ifdef QMC_COMPLEX
determinant_ratios_cuda(std::complex<float> * Ainv_list[],std::complex<float> * new_row_list[],std::complex<float> * ratios,int N,int row_stride,int iat,int numWalkers)1222 void determinant_ratios_cuda(std::complex<float>* Ainv_list[],
1223                              std::complex<float>* new_row_list[],
1224                              std::complex<float>* ratios,
1225                              int N,
1226                              int row_stride,
1227                              int iat,
1228                              int numWalkers)
1229 {
1230   dim3 dimBlock(N);
1231   dim3 dimGrid(numWalkers);
1232 
1233   if (N <= 32)
1234     calc_ratios<thrust::complex<float>, 32><<<dimGrid, dimBlock>>>((thrust::complex<float>**)Ainv_list,
1235                                                                    (thrust::complex<float>**)new_row_list,
1236                                                                    (thrust::complex<float>*)ratios,
1237                                                                    N,
1238                                                                    row_stride,
1239                                                                    iat);
1240   else if (N <= 64)
1241     calc_ratios<thrust::complex<float>, 64><<<dimGrid, dimBlock>>>((thrust::complex<float>**)Ainv_list,
1242                                                                    (thrust::complex<float>**)new_row_list,
1243                                                                    (thrust::complex<float>*)ratios,
1244                                                                    N,
1245                                                                    row_stride,
1246                                                                    iat);
1247   else if (N <= 128)
1248     calc_ratios<thrust::complex<float>, 128><<<dimGrid, dimBlock>>>((thrust::complex<float>**)Ainv_list,
1249                                                                     (thrust::complex<float>**)new_row_list,
1250                                                                     (thrust::complex<float>*)ratios,
1251                                                                     N,
1252                                                                     row_stride,
1253                                                                     iat);
1254   else if (N <= 256)
1255     calc_ratios<thrust::complex<float>, 256><<<dimGrid, dimBlock>>>((thrust::complex<float>**)Ainv_list,
1256                                                                     (thrust::complex<float>**)new_row_list,
1257                                                                     (thrust::complex<float>*)ratios,
1258                                                                     N,
1259                                                                     row_stride,
1260                                                                     iat);
1261   else if (N <= 512)
1262     calc_ratios<thrust::complex<float>, 512><<<dimGrid, dimBlock>>>((thrust::complex<float>**)Ainv_list,
1263                                                                     (thrust::complex<float>**)new_row_list,
1264                                                                     (thrust::complex<float>*)ratios,
1265                                                                     N,
1266                                                                     row_stride,
1267                                                                     iat);
1268   else if (N <= 1024)
1269     calc_ratios<thrust::complex<float>, 1024><<<dimGrid, dimBlock>>>((thrust::complex<float>**)Ainv_list,
1270                                                                      (thrust::complex<float>**)new_row_list,
1271                                                                      (thrust::complex<float>*)ratios,
1272                                                                      N,
1273                                                                      row_stride,
1274                                                                      iat);
1275   else
1276   {
1277     fprintf(stdout, "Error:  N too large for CUDA evaluation.\n");
1278     abort();
1279   }
1280 }
1281 
1282 void determinant_ratios_cuda(std::complex<double>* Ainv_list[],
1283                              std::complex<double>* new_row_list[],
1284                              std::complex<double>* ratios,
1285                              int N,
1286                              int row_stride,
1287                              int iat,
1288                              int numWalkers)
1289 {
1290   dim3 dimBlock(N);
1291   dim3 dimGrid(numWalkers);
1292 
1293   if (N <= 32)
1294     calc_ratios<thrust::complex<double>, 32><<<dimGrid, dimBlock>>>((thrust::complex<double>**)Ainv_list,
1295                                                                     (thrust::complex<double>**)new_row_list,
1296                                                                     (thrust::complex<double>*)ratios,
1297                                                                     N,
1298                                                                     row_stride,
1299                                                                     iat);
1300   else if (N <= 64)
1301     calc_ratios<thrust::complex<double>, 64><<<dimGrid, dimBlock>>>((thrust::complex<double>**)Ainv_list,
1302                                                                     (thrust::complex<double>**)new_row_list,
1303                                                                     (thrust::complex<double>*)ratios,
1304                                                                     N,
1305                                                                     row_stride,
1306                                                                     iat);
1307   else if (N <= 128)
1308     calc_ratios<thrust::complex<double>, 128><<<dimGrid, dimBlock>>>((thrust::complex<double>**)Ainv_list,
1309                                                                      (thrust::complex<double>**)new_row_list,
1310                                                                      (thrust::complex<double>*)ratios,
1311                                                                      N,
1312                                                                      row_stride,
1313                                                                      iat);
1314   else if (N <= 256)
1315     calc_ratios<thrust::complex<double>, 256><<<dimGrid, dimBlock>>>((thrust::complex<double>**)Ainv_list,
1316                                                                      (thrust::complex<double>**)new_row_list,
1317                                                                      (thrust::complex<double>*)ratios,
1318                                                                      N,
1319                                                                      row_stride,
1320                                                                      iat);
1321   else if (N <= 512)
1322     calc_ratios<thrust::complex<double>, 512><<<dimGrid, dimBlock>>>((thrust::complex<double>**)Ainv_list,
1323                                                                      (thrust::complex<double>**)new_row_list,
1324                                                                      (thrust::complex<double>*)ratios,
1325                                                                      N,
1326                                                                      row_stride,
1327                                                                      iat);
1328   else
1329   {
1330     fprintf(stdout, "Error:  N too large for CUDA evaluation.\n");
1331     abort();
1332   }
1333 }
1334 #endif
1335 
1336 
1337 template<typename T, int BS>
1338 __global__ void calc_ratio_grad_lapl(T** Ainv_list,
1339                                      T** new_row_list,
1340                                      T** grad_lapl_list,
1341                                      T* ratio_grad_lapl,
1342                                      int N,
1343                                      int row_stride,
1344                                      int elec)
1345 {
1346   int tid = threadIdx.x;
1347   int NB  = N / BS + ((N % BS) ? 1 : 0);
1348   __shared__ T *Ainv, *new_row, *grad_lapl;
1349   if (tid == 0)
1350   {
1351     Ainv      = Ainv_list[blockIdx.x];
1352     new_row   = new_row_list[blockIdx.x];
1353     grad_lapl = grad_lapl_list[blockIdx.x];
1354   }
1355   const int BS1 = BS + 1;
1356   const int BS2 = 2 * BS1;
1357   const int BS3 = 3 * BS1;
1358   const int BS4 = 4 * BS1;
1359   __syncthreads();
1360 #ifdef QMC_COMPLEX
1361   __shared__ uninitialized_array<T, BS> Ainv_colk_shared;
1362   __shared__ uninitialized_array<T, 5 * BS1> ratio_prod;
1363 #else
1364   __shared__ T Ainv_colk_shared[BS];
1365   __shared__ T ratio_prod[5 * BS1];
1366 #endif
1367   ratio_prod[tid]       = 0.0f;
1368   ratio_prod[BS1 + tid] = 0.0f;
1369   ratio_prod[BS2 + tid] = 0.0f;
1370   ratio_prod[BS3 + tid] = 0.0f;
1371   ratio_prod[BS4 + tid] = 0.0f;
1372   // This is *highly* uncoalesced, but we just have to eat it to allow
1373   // other kernels to operate quickly.
1374   __syncthreads();
1375   for (int block = 0; block < NB; block++)
1376   {
1377     int col = block * BS + tid;
1378     if (col < N)
1379       Ainv_colk_shared[tid] = Ainv[col * row_stride + elec];
1380     __syncthreads();
1381     if (col < N)
1382     {
1383       ratio_prod[tid] += Ainv_colk_shared[tid] * new_row[col];
1384       ratio_prod[BS1 + tid] += Ainv_colk_shared[tid] * grad_lapl[0 * row_stride + col];
1385       ratio_prod[BS2 + tid] += Ainv_colk_shared[tid] * grad_lapl[1 * row_stride + col];
1386       ratio_prod[BS3 + tid] += Ainv_colk_shared[tid] * grad_lapl[2 * row_stride + col];
1387       ratio_prod[BS4 + tid] += Ainv_colk_shared[tid] * grad_lapl[3 * row_stride + col];
1388     }
1389     __syncthreads();
1390   }
1391   // Now, we have to sum
1392   for (unsigned int s = BS / 2; s > 0; s >>= 1)
1393   {
1394     if (tid < s)
1395     {
1396       ratio_prod[tid] += ratio_prod[tid + s];             // Value
1397       ratio_prod[BS1 + tid] += ratio_prod[BS1 + tid + s]; // grad_x
1398       ratio_prod[BS2 + tid] += ratio_prod[BS2 + tid + s]; // grad_y
1399       ratio_prod[BS3 + tid] += ratio_prod[BS3 + tid + s]; // grad_z
1400       ratio_prod[BS4 + tid] += ratio_prod[BS4 + tid + s]; // lapl
1401     }
1402     __syncthreads();
1403   }
1404   // Subtract off gradient^2 from laplacian
1405   if (tid == 0)
1406   {
1407     ratio_prod[BS4] -=
1408         (ratio_prod[BS1] * ratio_prod[BS1] + ratio_prod[BS2] * ratio_prod[BS2] + ratio_prod[BS3] * ratio_prod[BS3]);
1409   }
1410   __syncthreads();
1411   // Present gradient and laplacian are w.r.t old position.  Divide by
1412   // ratio to make it w.r.t. new position
1413   if (tid < 4)
1414     ratio_prod[(tid + 1) * BS1] /= ratio_prod[0];
1415   __syncthreads();
1416   if (tid < 5)
1417     ratio_grad_lapl[5 * blockIdx.x + tid] = ratio_prod[tid * BS1];
1418 }
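
// Math notes for the kernel above (identities only; the code is unchanged): with
// k = elec and u = new_row, the matrix determinant lemma gives the ratio
//   R = det(A') / det(A) = sum_j Ainv(j, k) * u(j),
// which is what ratio_prod[0] accumulates.  Contracting the same column of Ainv with
// gradient rows 0-2 of grad_lapl yields grad(det A') / det(A); the division by R for
// tid < 4 converts this to grad(det A') / det(A'), i.e. the gradient at the proposed
// position.  The laplacian entry follows the analogous contraction with row 3, with
// the gradient^2 term subtracted first as the comment above notes.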
1419 
1420 
1421 template<typename T, int BS>
1422 __global__ void calc_ratio_grad_lapl(T** Ainv_list,
1423                                      T** new_row_list,
1424                                      T** grad_lapl_list,
1425                                      T* ratio_grad_lapl,
1426                                      int N,
1427                                      int row_stride,
1428                                      int* elec_list)
1429 {
1430   int tid = threadIdx.x;
1431   int NB  = N / BS + ((N % BS) ? 1 : 0);
1432   __shared__ T *Ainv, *new_row, *grad_lapl;
1433   __shared__ int elec;
1434   if (tid == 0)
1435   {
1436     Ainv      = Ainv_list[blockIdx.x];
1437     new_row   = new_row_list[blockIdx.x];
1438     grad_lapl = grad_lapl_list[blockIdx.x];
1439     elec      = elec_list[blockIdx.x];
1440   }
1441   __syncthreads();
1442   const int BS1 = BS + 1;
1443   const int BS2 = 2 * BS1;
1444   const int BS3 = 3 * BS1;
1445   const int BS4 = 4 * BS1;
1446 #ifdef QMC_COMPLEX
1447   __shared__ uninitialized_array<T, BS> Ainv_colk_shared;
1448   __shared__ uninitialized_array<T, 5 * BS1> ratio_prod;
1449 #else
1450   __shared__ T Ainv_colk_shared[BS];
1451   __shared__ T ratio_prod[5 * BS1];
1452 #endif
1453   ratio_prod[tid]       = 0.0f;
1454   ratio_prod[BS1 + tid] = 0.0f;
1455   ratio_prod[BS2 + tid] = 0.0f;
1456   ratio_prod[BS3 + tid] = 0.0f;
1457   ratio_prod[BS4 + tid] = 0.0f;
1458   // This is *highly* uncoalesced, but we just have to eat it to allow
1459   // other kernels to operate quickly.
1460   __syncthreads();
1461   for (int block = 0; block < NB; block++)
1462   {
1463     int col = block * BS + tid;
1464     if (col < N)
1465       Ainv_colk_shared[tid] = Ainv[col * row_stride + elec];
1466     __syncthreads();
1467     if (col < N)
1468     {
1469       ratio_prod[tid] += Ainv_colk_shared[tid] * new_row[col];
1470       ratio_prod[BS1 + tid] += Ainv_colk_shared[tid] * grad_lapl[0 * row_stride + col];
1471       ratio_prod[BS2 + tid] += Ainv_colk_shared[tid] * grad_lapl[1 * row_stride + col];
1472       ratio_prod[BS3 + tid] += Ainv_colk_shared[tid] * grad_lapl[2 * row_stride + col];
1473       ratio_prod[BS4 + tid] += Ainv_colk_shared[tid] * grad_lapl[3 * row_stride + col];
1474     }
1475     __syncthreads();
1476   }
1477   // Now, we have to sum
1478   for (unsigned int s = BS / 2; s > 0; s >>= 1)
1479   {
1480     if (tid < s)
1481     {
1482       ratio_prod[tid] += ratio_prod[tid + s];             // Value
1483       ratio_prod[BS1 + tid] += ratio_prod[BS1 + tid + s]; // grad_x
1484       ratio_prod[BS2 + tid] += ratio_prod[BS2 + tid + s]; // grad_y
1485       ratio_prod[BS3 + tid] += ratio_prod[BS3 + tid + s]; // grad_z
1486       ratio_prod[BS4 + tid] += ratio_prod[BS4 + tid + s]; // lapl
1487     }
1488     __syncthreads();
1489   }
1490   // Subtract off gradient^2 from laplacian
1491   if (tid == 0)
1492   {
1493     ratio_prod[BS4] -=
1494         (ratio_prod[BS1] * ratio_prod[BS1] + ratio_prod[BS2] * ratio_prod[BS2] + ratio_prod[BS3] * ratio_prod[BS3]);
1495   }
1496   __syncthreads();
1497   // Present gradient and laplacian are w.r.t old position.  Divide by
1498   // ratio to make it w.r.t. new position
1499   if (tid < 4)
1500     ratio_prod[BS1 * (tid + 1)] /= ratio_prod[0];
1501   // Copy data back for every walker
1502   if (tid < 5)
1503     ratio_grad_lapl[5 * blockIdx.x + tid] = ratio_prod[tid * BS1];
1504 }
1505 
1506 
1507 void determinant_ratios_grad_lapl_cuda(float* Ainv_list[],
1508                                        float* new_row_list[],
1509                                        float* grad_lapl_list[],
1510                                        float ratios_grad_lapl[],
1511                                        int N,
1512                                        int row_stride,
1513                                        int iat,
1514                                        int numWalkers)
1515 {
1516   const int BS = 32;
1517   dim3 dimBlock(BS);
1518   dim3 dimGrid(numWalkers);
1519   calc_ratio_grad_lapl<float, BS><<<dimGrid, dimBlock, 0, gpu::kernelStream>>>(Ainv_list,
1520                                                                                new_row_list,
1521                                                                                grad_lapl_list,
1522                                                                                ratios_grad_lapl,
1523                                                                                N,
1524                                                                                row_stride,
1525                                                                                iat);
1526 }
1527 
1528 
1529 void determinant_ratios_grad_lapl_cuda(double* Ainv_list[],
1530                                        double* new_row_list[],
1531                                        double* grad_lapl_list[],
1532                                        double ratios_grad_lapl[],
1533                                        int N,
1534                                        int row_stride,
1535                                        int iat,
1536                                        int numWalkers)
1537 {
1538   const int BS = 32;
1539   dim3 dimBlock(BS);
1540   dim3 dimGrid(numWalkers);
1541   calc_ratio_grad_lapl<double, BS><<<dimGrid, dimBlock, 0, gpu::kernelStream>>>(Ainv_list,
1542                                                                                 new_row_list,
1543                                                                                 grad_lapl_list,
1544                                                                                 ratios_grad_lapl,
1545                                                                                 N,
1546                                                                                 row_stride,
1547                                                                                 iat);
1548 }
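
// Host-side unpacking sketch (hypothetical helper, not used elsewhere): each walker
// receives five consecutive values, in the order written by calc_ratio_grad_lapl above.
static void unpack_ratio_grad_lapl(const double* buf, int walker, double& ratio, double grad[3], double& lapl)
{
  ratio   = buf[5 * walker + 0]; // determinant ratio
  grad[0] = buf[5 * walker + 1]; // gradient x (already divided by the ratio)
  grad[1] = buf[5 * walker + 2]; // gradient y
  grad[2] = buf[5 * walker + 3]; // gradient z
  lapl    = buf[5 * walker + 4]; // laplacian entry, with the gradient^2 term removed
}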
1549 
1550 
1551 #ifdef QMC_COMPLEX
1552 void determinant_ratios_grad_lapl_cuda(std::complex<float>* Ainv_list[],
1553                                        std::complex<float>* new_row_list[],
1554                                        std::complex<float>* grad_lapl_list[],
1555                                        std::complex<float> ratios_grad_lapl[],
1556                                        int N,
1557                                        int row_stride,
1558                                        int iat,
1559                                        int numWalkers)
1560 {
1561   const int BS = 32;
1562   dim3 dimBlock(BS);
1563   dim3 dimGrid(numWalkers);
1564 
1565   calc_ratio_grad_lapl<thrust::complex<float>, BS>
1566       <<<dimGrid, dimBlock, 0, gpu::kernelStream>>>((thrust::complex<float>**)Ainv_list,
1567                                                     (thrust::complex<float>**)new_row_list,
1568                                                     (thrust::complex<float>**)grad_lapl_list,
1569                                                     (thrust::complex<float>*)ratios_grad_lapl,
1570                                                     N,
1571                                                     row_stride,
1572                                                     iat);
1573 }
1574 
1575 
1576 void determinant_ratios_grad_lapl_cuda(std::complex<double>* Ainv_list[],
1577                                        std::complex<double>* new_row_list[],
1578                                        std::complex<double>* grad_lapl_list[],
1579                                        std::complex<double> ratios_grad_lapl[],
1580                                        int N,
1581                                        int row_stride,
1582                                        int iat,
1583                                        int numWalkers)
1584 {
1585   const int BS = 32;
1586   dim3 dimBlock(BS);
1587   dim3 dimGrid(numWalkers);
1588 
1589   calc_ratio_grad_lapl<thrust::complex<double>, BS>
1590       <<<dimGrid, dimBlock, 0, gpu::kernelStream>>>((thrust::complex<double>**)Ainv_list,
1591                                                     (thrust::complex<double>**)new_row_list,
1592                                                     (thrust::complex<double>**)grad_lapl_list,
1593                                                     (thrust::complex<double>*)ratios_grad_lapl,
1594                                                     N,
1595                                                     row_stride,
1596                                                     iat);
1597 }
1598 #endif
1599 
1600 
1601 void determinant_ratios_grad_lapl_cuda(float* Ainv_list[],
1602                                        float* new_row_list[],
1603                                        float* grad_lapl_list[],
1604                                        float ratios_grad_lapl[],
1605                                        int N,
1606                                        int row_stride,
1607                                        int iat_list[],
1608                                        int numWalkers)
1609 {
1610   const int BS = 32;
1611   dim3 dimBlock(BS);
1612   dim3 dimGrid(numWalkers);
1613   calc_ratio_grad_lapl<float, BS><<<dimGrid, dimBlock, 0, gpu::kernelStream>>>(Ainv_list,
1614                                                                                new_row_list,
1615                                                                                grad_lapl_list,
1616                                                                                ratios_grad_lapl,
1617                                                                                N,
1618                                                                                row_stride,
1619                                                                                iat_list);
1620 }
1621 
1622 
1623 void determinant_ratios_grad_lapl_cuda(double* Ainv_list[],
1624                                        double* new_row_list[],
1625                                        double* grad_lapl_list[],
1626                                        double ratios_grad_lapl[],
1627                                        int N,
1628                                        int row_stride,
1629                                        int iat_list[],
1630                                        int numWalkers)
1631 {
1632   const int BS = 32;
1633   dim3 dimBlock(BS);
1634   dim3 dimGrid(numWalkers);
1635   calc_ratio_grad_lapl<double, BS><<<dimGrid, dimBlock, 0, gpu::kernelStream>>>(Ainv_list,
1636                                                                                 new_row_list,
1637                                                                                 grad_lapl_list,
1638                                                                                 ratios_grad_lapl,
1639                                                                                 N,
1640                                                                                 row_stride,
1641                                                                                 iat_list);
1642 }
1643 
1644 
1645 #ifdef QMC_COMPLEX
1646 void determinant_ratios_grad_lapl_cuda(std::complex<float>* Ainv_list[],
1647                                        std::complex<float>* new_row_list[],
1648                                        std::complex<float>* grad_lapl_list[],
1649                                        std::complex<float> ratios_grad_lapl[],
1650                                        int N,
1651                                        int row_stride,
1652                                        int iat_list[],
1653                                        int numWalkers)
1654 {
1655   const int BS = 32;
1656   dim3 dimBlock(BS);
1657   dim3 dimGrid(numWalkers);
1658 
1659   calc_ratio_grad_lapl<thrust::complex<float>, BS>
1660       <<<dimGrid, dimBlock, 0, gpu::kernelStream>>>((thrust::complex<float>**)Ainv_list,
1661                                                     (thrust::complex<float>**)new_row_list,
1662                                                     (thrust::complex<float>**)grad_lapl_list,
1663                                                     (thrust::complex<float>*)ratios_grad_lapl,
1664                                                     N,
1665                                                     row_stride,
1666                                                     iat_list);
1667 }
1668 
1669 
1670 void determinant_ratios_grad_lapl_cuda(std::complex<double>* Ainv_list[],
1671                                        std::complex<double>* new_row_list[],
1672                                        std::complex<double>* grad_lapl_list[],
1673                                        std::complex<double> ratios_grad_lapl[],
1674                                        int N,
1675                                        int row_stride,
1676                                        int iat_list[],
1677                                        int numWalkers)
1678 {
1679   const int BS = 32;
1680   dim3 dimBlock(BS);
1681   dim3 dimGrid(numWalkers);
1682 
1683   calc_ratio_grad_lapl<thrust::complex<double>, BS>
1684       <<<dimGrid, dimBlock, 0, gpu::kernelStream>>>((thrust::complex<double>**)Ainv_list,
1685                                                     (thrust::complex<double>**)new_row_list,
1686                                                     (thrust::complex<double>**)grad_lapl_list,
1687                                                     (thrust::complex<double>*)ratios_grad_lapl,
1688                                                     N,
1689                                                     row_stride,
1690                                                     iat_list);
1691 }
1692 #endif
1693 
1694 
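// The kernel below is a gradient-only variant of calc_ratio_grad_lapl: it contracts
// column elec of Ainv with gradient rows 0-2 of the grad_lapl storage, so that
//   grad[3 * walker + d] = sum_j Ainv(j, elec) * grad_lapl(d, j),
// the d-th component of grad(det A) / det(A) for electron elec (Jacobi's formula),
// skipping the value and laplacian work.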
1695 template<typename T, int BS>
1696 __global__ void calc_grad_kernel(T** Ainv_list, T** grad_lapl_list, T* grad, int N, int row_stride, int elec)
1697 {
1698   int tid = threadIdx.x;
1699   int NB  = N / BS + ((N % BS) ? 1 : 0);
1700   __shared__ T *Ainv, *grad_lapl;
1701   if (tid == 0)
1702   {
1703     Ainv      = Ainv_list[blockIdx.x];
1704     grad_lapl = grad_lapl_list[blockIdx.x] + 4 * elec * row_stride;
1705   }
1706   __syncthreads();
1707   const int BS1 = BS + 1;
1708   const int BS2 = 2 * BS1;
1709 #ifdef QMC_COMPLEX
1710   __shared__ uninitialized_array<T, BS> Ainv_colk_shared;
1711   __shared__ uninitialized_array<T, 3 * BS1> ratio_prod;
1712 #else
1713   __shared__ T Ainv_colk_shared[BS];
1714   __shared__ T ratio_prod[3 * BS1];
1715 #endif
1716   ratio_prod[tid]       = 0.0f;
1717   ratio_prod[BS1 + tid] = 0.0f;
1718   ratio_prod[BS2 + tid] = 0.0f;
1719   // This is *highly* uncoalesced, but we just have to eat it to allow
1720   // other kernels to operate quickly.
1721   __syncthreads();
1722   for (int block = 0; block < NB; block++)
1723   {
1724     int col = block * BS + tid;
1725     if (col < N)
1726       Ainv_colk_shared[tid] = Ainv[col * row_stride + elec];
1727     __syncthreads();
1728     if (col < N)
1729     {
1730       ratio_prod[tid] += Ainv_colk_shared[tid] * grad_lapl[0 * row_stride + col];
1731       ratio_prod[BS1 + tid] += Ainv_colk_shared[tid] * grad_lapl[1 * row_stride + col];
1732       ratio_prod[BS2 + tid] += Ainv_colk_shared[tid] * grad_lapl[2 * row_stride + col];
1733     }
1734     __syncthreads();
1735   }
1736   // Now, we have to sum
1737   for (unsigned int s = BS / 2; s > 0; s >>= 1)
1738   {
1739     if (tid < s)
1740     {
1741       ratio_prod[tid] += ratio_prod[tid + s];             // grad_x
1742       ratio_prod[BS1 + tid] += ratio_prod[BS1 + tid + s]; // grad_y
1743       ratio_prod[BS2 + tid] += ratio_prod[BS2 + tid + s]; // grad_z
1744     }
1745     __syncthreads();
1746   }
1747   if (tid < 3)
1748     grad[3 * blockIdx.x + tid] = ratio_prod[tid * BS1];
1749 }
1750 
1751 void calc_gradient(float* Ainv_list[],
1752                    float* grad_lapl_list[],
1753                    float grad[],
1754                    int N,
1755                    int row_stride,
1756                    int elec,
1757                    int numWalkers)
1758 {
1759   const int BS = 32;
1760   dim3 dimBlock(BS);
1761   dim3 dimGrid(numWalkers);
1762   calc_grad_kernel<float, BS>
1763       <<<dimGrid, dimBlock, 0, gpu::kernelStream>>>(Ainv_list, grad_lapl_list, grad, N, row_stride, elec);
1764 }
1765 
1766 void calc_gradient(double* Ainv_list[],
1767                    double* grad_lapl_list[],
1768                    double grad[],
1769                    int N,
1770                    int row_stride,
1771                    int elec,
1772                    int numWalkers)
1773 {
1774   const int BS = 32;
1775   dim3 dimBlock(BS);
1776   dim3 dimGrid(numWalkers);
1777   calc_grad_kernel<double, BS>
1778       <<<dimGrid, dimBlock, 0, gpu::kernelStream>>>(Ainv_list, grad_lapl_list, grad, N, row_stride, elec);
1779 }
1780 
1781 #ifdef QMC_COMPLEX
1782 void calc_gradient(std::complex<float>* Ainv_list[],
1783                    std::complex<float>* grad_lapl_list[],
1784                    std::complex<float> grad[],
1785                    int N,
1786                    int row_stride,
1787                    int elec,
1788                    int numWalkers)
1789 {
1790   const int BS = 32;
1791   dim3 dimBlock(BS);
1792   dim3 dimGrid(numWalkers);
1793 
1794   calc_grad_kernel<thrust::complex<float>, BS>
1795       <<<dimGrid, dimBlock, 0, gpu::kernelStream>>>((thrust::complex<float>**)Ainv_list,
1796                                                     (thrust::complex<float>**)grad_lapl_list,
1797                                                     (thrust::complex<float>*)grad,
1798                                                     N,
1799                                                     row_stride,
1800                                                     elec);
1801 }
1802 
1803 void calc_gradient(std::complex<double>* Ainv_list[],
1804                    std::complex<double>* grad_lapl_list[],
1805                    std::complex<double> grad[],
1806                    int N,
1807                    int row_stride,
1808                    int elec,
1809                    int numWalkers)
1810 {
1811   const int BS = 32;
1812   dim3 dimBlock(BS);
1813   dim3 dimGrid(numWalkers);
1814 
1815   calc_grad_kernel<thrust::complex<double>, BS>
1816       <<<dimGrid, dimBlock, 0, gpu::kernelStream>>>((thrust::complex<double>**)Ainv_list,
1817                                                     (thrust::complex<double>**)grad_lapl_list,
1818                                                     (thrust::complex<double>*)grad,
1819                                                     N,
1820                                                     row_stride,
1821                                                     elec);
1822 }
1823 #endif
1824 
1825 
1826 #define RATIO_BS 16
1827 
1828 template<typename T>
1829 __global__ void all_ratios_kernel(T** Ainv_list, T** new_mat_list, T** ratio_list, int N, int row_stride)
1830 {
1831   __shared__ T *Ainv, *new_mat, *ratio;
1832   if (threadIdx.x == 0 && threadIdx.y == 0)
1833   {
1834     Ainv    = Ainv_list[blockIdx.x];
1835     new_mat = new_mat_list[blockIdx.x];
1836     ratio   = ratio_list[blockIdx.x];
1837   }
1838   __shared__ T Ainv_block[RATIO_BS][RATIO_BS + 1];
1839   // __shared__ T new_block[RATIO_BS][RATIO_BS+1];
1840   __shared__ T ratio_block[RATIO_BS][RATIO_BS + 1];
1841   unsigned int numBlocks = N >> 4;
1842   if (N & 15)
1843     numBlocks++;
1844   for (unsigned int yBlock = 0; yBlock < numBlocks; yBlock++)
1845   {
1846     ratio_block[threadIdx.y][threadIdx.x] = 0.0f;
1847     __syncthreads();
1848     for (unsigned int xBlock = 0; xBlock < numBlocks; xBlock++)
1849     {
1850       unsigned int xIndex = yBlock * RATIO_BS + threadIdx.x;
1851       unsigned int yIndex = xBlock * RATIO_BS + threadIdx.y;
1852       unsigned int index  = yIndex * row_stride + xIndex;
1853       if ((xIndex < N) && (yIndex < N))
1854         Ainv_block[threadIdx.x][threadIdx.y] = Ainv[index];
1855       __syncthreads();
1856       xIndex = xBlock * RATIO_BS + threadIdx.x;
1857       yIndex = yBlock * RATIO_BS + threadIdx.y;
1858       index  = yIndex * row_stride + xIndex;
1859       if ((xIndex < N) && (yIndex < N))
1860         ratio_block[threadIdx.y][threadIdx.x] += new_mat[index] * Ainv_block[threadIdx.y][threadIdx.x];
1861       __syncthreads();
1862     }
1863     __syncthreads();
1864     // Now, we have to do the reduction across the ratio_blocks
1865     if (threadIdx.x < 8)
1866       ratio_block[threadIdx.y][threadIdx.x] += ratio_block[threadIdx.y][threadIdx.x + 8];
    __syncthreads(); // the implicit warp-synchronous reduction is unsafe on Volta and later; sync each step
1867     if (threadIdx.x < 4)
1868       ratio_block[threadIdx.y][threadIdx.x] += ratio_block[threadIdx.y][threadIdx.x + 4];
    __syncthreads();
1869     if (threadIdx.x < 2)
1870       ratio_block[threadIdx.y][threadIdx.x] += ratio_block[threadIdx.y][threadIdx.x + 2];
    __syncthreads();
1871     if (threadIdx.x < 1)
1872       ratio_block[threadIdx.y][threadIdx.x] += ratio_block[threadIdx.y][threadIdx.x + 1];
1873     __syncthreads();
1874     if (threadIdx.y == 0 && (yBlock * RATIO_BS + threadIdx.x) < N)
1875       ratio[yBlock * RATIO_BS + threadIdx.x] = ratio_block[threadIdx.x][0];
1876   }
1877 }
1878 
1879 
1880 void calc_all_ratios(float* Ainv_list[],
1881                      float* new_mat_list[],
1882                      float* ratio_list[],
1883                      int N,
1884                      int row_stride,
1885                      int num_mats)
1886 {
1887   dim3 dimBlock(RATIO_BS, RATIO_BS);
1888   dim3 dimGrid(num_mats);
1889   all_ratios_kernel<float><<<dimGrid, dimBlock>>>(Ainv_list, new_mat_list, ratio_list, N, row_stride);
1890 }
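
// CPU reference for the kernel above (a validation sketch, mirroring the commented-out
// check in test_all_ratios_kernel below): ratio[i] is the i-th diagonal element of the
// product new_mat * Ainv.
static void calc_all_ratios_reference(const float* Ainv, const float* new_mat, float* ratio, int N, int row_stride)
{
  for (int i = 0; i < N; i++)
  {
    float sum = 0.0f;
    for (int j = 0; j < N; j++)
      sum += new_mat[i * row_stride + j] * Ainv[j * row_stride + i];
    ratio[i] = sum;
  }
}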
1891 
1892 
1893 const int MAX_RATIO_ROWS = 20;
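// Sizing note: ratio_sum in the kernel below holds MAX_RATIO_ROWS * (BS + 1) elements
// of shared memory, so each walker may request at most MAX_RATIO_ROWS ratios per call.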
1894 
1895 template<typename T, int BS>
1896 __global__ void calc_many_ratios_kernel(T** Ainv_list,
1897                                         T** new_row_list,
1898                                         T** ratio_list,
1899                                         int* num_ratio_list,
1900                                         int N,
1901                                         int row_stride,
1902                                         int* elec_list)
1903 {
1904   int tid = threadIdx.x;
1905   __shared__ T *Ainv, *new_rows, *ratios;
1906   __shared__ int num_ratios, elec;
1907   if (tid == 0)
1908   {
1909     Ainv       = Ainv_list[blockIdx.x];
1910     new_rows   = new_row_list[blockIdx.x];
1911     num_ratios = num_ratio_list[blockIdx.x];
1912     ratios     = ratio_list[blockIdx.x];
1913     elec       = elec_list[blockIdx.x];
1914   }
1915   __syncthreads();
1916   int NB = N / BS + ((N % BS) ? 1 : 0);
1917 #ifdef QMC_COMPLEX
1918   __shared__ uninitialized_array<T, BS> Ainv_shared;
1919 #else
1920   __shared__ T Ainv_shared[BS];
1921 #endif
1922   const int BS1 = BS + 1;
1923 //  __shared__ T row[BS];
1924 // We use BS+1 to avoid bank conflicts in the writing.
1925 #ifdef QMC_COMPLEX
1926   __shared__ uninitialized_array<T, MAX_RATIO_ROWS * BS1> ratio_sum;
1927 #else
1928   __shared__ T ratio_sum[MAX_RATIO_ROWS * BS1];
1929 #endif
1930   for (int iratio = 0; iratio < num_ratios; iratio++)
1931     ratio_sum[iratio * BS1 + tid] = (T)0.0;
1932   __syncthreads();
1933   for (int block = 0; block < NB; block++)
1934   {
1935     int off   = block * BS + tid;
1936     bool mask = off < N;
1937     if (mask)
1938       Ainv_shared[tid] = Ainv[off * row_stride + elec];
1939     __syncthreads();
1940     for (int iratio = 0; iratio < num_ratios; iratio++)
1941       if (mask)
1942         ratio_sum[iratio * BS1 + tid] += Ainv_shared[tid] * new_rows[iratio * row_stride + off];
1943     __syncthreads();
1944   }
1945   // now, sum up ratios
1946   for (int iratio = 0; iratio < num_ratios; iratio++)
1947   {
1948     for (int s = BS >> 1; s > 0; s >>= 1)
1949     {
1950       if (tid < s)
1951         ratio_sum[iratio * BS1 + tid] += ratio_sum[iratio * BS1 + tid + s];
1952       __syncthreads();
1953     }
1954   }
1955   // Store sums in parallel
1956   if (tid < num_ratios)
1957     ratios[tid] = ratio_sum[tid * BS1];
1958 }
1959 
1960 void calc_many_ratios(float* Ainv_list[],
1961                       float* new_row_list[],
1962                       float* ratio_list[],
1963                       int num_ratio_list[],
1964                       int N,
1965                       int row_stride,
1966                       int elec_list[],
1967                       int numWalkers)
1968 {
1969   const int BS = 32;
1970   dim3 dimBlock(BS);
1971   dim3 dimGrid(numWalkers);
1972   calc_many_ratios_kernel<float, BS>
1973       <<<dimGrid, dimBlock>>>(Ainv_list, new_row_list, ratio_list, num_ratio_list, N, row_stride, elec_list);
1974 }
1975 
1976 void calc_many_ratios(double* Ainv_list[],
1977                       double* new_row_list[],
1978                       double* ratio_list[],
1979                       int num_ratio_list[],
1980                       int N,
1981                       int row_stride,
1982                       int elec_list[],
1983                       int numWalkers)
1984 {
1985   const int BS = 32;
1986   dim3 dimBlock(BS);
1987   dim3 dimGrid(numWalkers);
1988   calc_many_ratios_kernel<double, BS>
1989       <<<dimGrid, dimBlock>>>(Ainv_list, new_row_list, ratio_list, num_ratio_list, N, row_stride, elec_list);
1990 }
1991 
1992 
1993 #ifdef QMC_COMPLEX
1994 void calc_many_ratios(std::complex<float>* Ainv_list[],
1995                       std::complex<float>* new_row_list[],
1996                       std::complex<float>* ratio_list[],
1997                       int num_ratio_list[],
1998                       int N,
1999                       int row_stride,
2000                       int elec_list[],
2001                       int numWalkers)
2002 {
2003   const int BS = 32;
2004   dim3 dimBlock(BS);
2005   dim3 dimGrid(numWalkers);
2006 
2007   calc_many_ratios_kernel<thrust::complex<float>, BS><<<dimGrid, dimBlock>>>((thrust::complex<float>**)(Ainv_list),
2008                                                                              (thrust::complex<float>**)new_row_list,
2009                                                                              (thrust::complex<float>**)ratio_list,
2010                                                                              num_ratio_list,
2011                                                                              N,
2012                                                                              row_stride,
2013                                                                              elec_list);
2014 }
2015 
2016 
2017 void calc_many_ratios(std::complex<double>* Ainv_list[],
2018                       std::complex<double>* new_row_list[],
2019                       std::complex<double>* ratio_list[],
2020                       int num_ratio_list[],
2021                       int N,
2022                       int row_stride,
2023                       int elec_list[],
2024                       int numWalkers)
2025 {
2026   const int BS = 32;
2027   dim3 dimBlock(BS);
2028   dim3 dimGrid(numWalkers);
2029 
2030   calc_many_ratios_kernel<thrust::complex<double>, BS><<<dimGrid, dimBlock>>>((thrust::complex<double>**)Ainv_list,
2031                                                                               (thrust::complex<double>**)new_row_list,
2032                                                                               (thrust::complex<double>**)ratio_list,
2033                                                                               num_ratio_list,
2034                                                                               N,
2035                                                                               row_stride,
2036                                                                               elec_list);
2037 }
2038 #endif
2039 
2040 
2041 #define SCALE_BS 64
2042 
2043 //__constant__ float GGt[3][3];
2044 
2045 
2046 template<typename T>
2047 __global__ void scale_grad_lapl_kernel(T** grad_list, T** hess_list, T** grad_lapl_list, T* Linv, int N)
2048 {
2049   __shared__ T gradBlock[3][SCALE_BS];
2050   __shared__ T hessBlock[6][SCALE_BS];
2051   //  __shared__ float outBlock [4][SCALE_BS];
2052   __shared__ T G[3][3], GGt[3][3];
2053   __shared__ T *grad, *hess, *out;
2054   if (threadIdx.x == 0)
2055   {
2056     grad = grad_list[blockIdx.y];
2057     hess = hess_list[blockIdx.y];
2058     out  = grad_lapl_list[blockIdx.y];
2059   }
2060   int i = threadIdx.x / 3;
2061   int j = threadIdx.x % 3;
2062   if (threadIdx.x < 9)
2063     G[i][j] = Linv[threadIdx.x];
2064   __syncthreads();
2065   if (threadIdx.x < 9)
2066   {
2067     GGt[i][j] = (G[i][0] * G[0][j] + G[i][1] * G[1][j] + G[i][2] * G[2][j]);
2068   }
2069   // Load the gradients into shared memory
2070   for (int i = 0; i < 3; i++)
2071   {
2072     unsigned int gIndex = (3 * blockIdx.x + i) * SCALE_BS + threadIdx.x;
2073     if (gIndex < 3 * N)
2074       gradBlock[i][threadIdx.x] = grad[gIndex];
2075   }
2076   // Load the hessian into shared memory
2077   for (int i = 0; i < 6; i++)
2078   {
2079     unsigned int hIndex = (6 * blockIdx.x + i) * SCALE_BS + threadIdx.x;
2080     if (hIndex < 6 * N)
2081       hessBlock[i][threadIdx.x] = hess[hIndex];
2082   }
  __syncthreads(); // make GGt (computed by threads 0-8 above) visible to all threads before use below
2083   // Now, loop through the rows that I own and compute the
2084   // dimensioned gradients and laplacians from the
2085   // dimensionless gradients and Hessians.
2086   int row = blockIdx.x * SCALE_BS;
2087   T val;
2088   // x component of gradient
2089   val =
2090       (G[0][0] * gradBlock[0][threadIdx.x] + G[0][1] * gradBlock[1][threadIdx.x] + G[0][2] * gradBlock[2][threadIdx.x]);
2091   out[row + 0 * N + threadIdx.x] = val;
2092   // y component of gradient
2093   val =
2094       (G[1][0] * gradBlock[0][threadIdx.x] + G[1][1] * gradBlock[1][threadIdx.x] + G[1][2] * gradBlock[2][threadIdx.x]);
2095   out[row + 1 * N + threadIdx.x] = val;
2096   // z component of gradient
2097   val =
2098       (G[2][0] * gradBlock[0][threadIdx.x] + G[2][1] * gradBlock[1][threadIdx.x] + G[2][2] * gradBlock[2][threadIdx.x]);
2099   out[row + 2 * N + threadIdx.x] = val;
2100   // Hessian = H00 H01 H02 H11 H12 H22
2101   // Matrix = [0 1 2]
2102   //          [1 3 4]
2103   //          [2 4 5]
2104   // laplacian = Trace(GGt*Hessian)
2105   val                            = (GGt[0][0] * hessBlock[0][threadIdx.x] + GGt[0][1] * hessBlock[1][threadIdx.x] +
2106          GGt[0][2] * hessBlock[2][threadIdx.x] + GGt[1][0] * hessBlock[1][threadIdx.x] +
2107          GGt[1][1] * hessBlock[3][threadIdx.x] + GGt[1][2] * hessBlock[4][threadIdx.x] +
2108          GGt[2][0] * hessBlock[2][threadIdx.x] + GGt[2][1] * hessBlock[4][threadIdx.x] +
2109          GGt[2][2] * hessBlock[5][threadIdx.x]);
2110   out[row + 3 * N + threadIdx.x] = val;
2111 }
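
// Worked special case for the kernel above: with Linv equal to the identity matrix,
// G = GGt = I, so the gradient rows pass through unchanged and the laplacian reduces to
// the trace of the packed Hessian: lapl = H00 + H11 + H22 = hess[0] + hess[3] + hess[5].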
2112 
2113 
2114 // This function reads the vectors pointed to by grad_list and
2115 // hess_list.  These are in memory as
2116 // [grad0_x grad0_y grad0_z grad1_x grad1_y ... ] and
2117 // [hess0_xx hess0_xy hess0_xy hess0_yy hess0_yz hess0_zz ...]
2118 // It then writes the data into memory as
2119 // [grad0_x grad1_x ... grad(N-1)_x grad0_y ... grad(N-1)_z lapl0
2120 // lapl1...]
2121 
2122 void scale_grad_lapl(float* grad_list[],
2123                      float* hess_list[],
2124                      float* grad_lapl_list[],
2125                      float Linv[],
2126                      int N,
2127                      int num_walkers)
2128 {
2129   dim3 dimBlock(SCALE_BS);
2130   dim3 dimGrid(N / SCALE_BS, num_walkers);
2131   if (N % SCALE_BS)
2132     dimGrid.x++;
2133   scale_grad_lapl_kernel<float><<<dimGrid, dimBlock>>>(grad_list, hess_list, grad_lapl_list, Linv, N);
2134 }
2135 
2136 
2137 template<typename T, int BS>
2138 __global__ void all_ratios_grad_lapl_kernel(T** Ainv_list, T** grad_lapl_list, T** out_list, int N, int row_stride)
2139 {
2140   __shared__ T *Ainv, *gl_array, *out;
2141   if (threadIdx.x == 0 && threadIdx.y == 0)
2142   {
2143     Ainv     = Ainv_list[blockIdx.x];
2144     gl_array = grad_lapl_list[blockIdx.x];
2145     out      = out_list[blockIdx.x];
2146   }
2147   __syncthreads();
2148 #ifdef QMC_COMPLEX
2149   __shared__ uninitialized_array<T, BS*(BS + 1)> Ainv_block;
2150   __shared__ uninitialized_array<T, BS*(BS + 1)> grad_lapl_block[4];
2151 #else
2152   __shared__ T Ainv_block[BS * (BS + 1)];
2153   __shared__ T grad_lapl_block[4][BS * (BS + 1)];
2154 #endif
2155   const unsigned int numBlocks   = (N + BS - 1) / BS;
2156   const unsigned int index_local = threadIdx.y * (BS + 1) + threadIdx.x;
2157   for (unsigned int yBlock = 0; yBlock < numBlocks; yBlock++)
2158   {
2159     grad_lapl_block[0][index_local] = 0.0f;
2160     grad_lapl_block[1][index_local] = 0.0f;
2161     grad_lapl_block[2][index_local] = 0.0f;
2162     grad_lapl_block[3][index_local] = 0.0f;
2163     for (unsigned int xBlock = 0; xBlock < numBlocks; xBlock++)
2164     {
2165       unsigned int xIndex = yBlock * BS + threadIdx.x;
2166       unsigned int yIndex = xBlock * BS + threadIdx.y;
2167       unsigned int index  = yIndex * row_stride + xIndex;
2168 
2169       if ((xIndex < N) && (yIndex < N))
2170         Ainv_block[threadIdx.x * (BS + 1) + threadIdx.y] = Ainv[index];
2171       __syncthreads();
2172       xIndex = xBlock * BS + threadIdx.x;
2173       yIndex = yBlock * BS + threadIdx.y;
2174       index  = 4 * yIndex * row_stride + xIndex;
2175       if ((xIndex < N) && (yIndex < N))
2176       {
2177         grad_lapl_block[0][index_local] += gl_array[index + 0 * row_stride] * Ainv_block[index_local];
2178         grad_lapl_block[1][index_local] += gl_array[index + 1 * row_stride] * Ainv_block[index_local];
2179         grad_lapl_block[2][index_local] += gl_array[index + 2 * row_stride] * Ainv_block[index_local];
2180         grad_lapl_block[3][index_local] += gl_array[index + 3 * row_stride] * Ainv_block[index_local];
2181       }
2182       __syncthreads();
2183     }
2184     // Now, we have to do the reduction across the lapl_blocks
2185     for (int s = BS >> 1; s > 0; s >>= 1)
2186     {
2187       if (threadIdx.x < s)
2188       {
2189         grad_lapl_block[0][index_local] += grad_lapl_block[0][index_local + s];
2190         grad_lapl_block[1][index_local] += grad_lapl_block[1][index_local + s];
2191         grad_lapl_block[2][index_local] += grad_lapl_block[2][index_local + s];
2192         grad_lapl_block[3][index_local] += grad_lapl_block[3][index_local + s];
2193       }
2194       __syncthreads();
2195     }
2196     // unsigned int yIndex = yBlock * RATIO_BS + threadIdx.x;
2197     // if (threadIdx.y == 0 && yIndex < N) {
2198     //   out[4*yIndex+0] = grad_lapl_block[0][threadIdx.x][0];
2199     //   out[4*yIndex+1] = grad_lapl_block[1][threadIdx.x][0];
2200     //   out[4*yIndex+2] = grad_lapl_block[2][threadIdx.x][0];
2201     //   out[4*yIndex+3] = grad_lapl_block[3][threadIdx.x][0];
2202     // }
2203     //unsigned int yIndex = 4*yBlock*RATIO_BS + 4*threadIdx.y + threadIdx.x;
2204     unsigned int ix     = BS * threadIdx.y + threadIdx.x;
2205     unsigned int yIndex = BS * yBlock + (ix >> 2);
2206     if (ix < BS * 4 && yIndex < N)
2207       out[4 * BS * yBlock + ix] = grad_lapl_block[ix & 3][(ix >> 2) * (BS + 1)];
2208     // IMPORTANT!!!
2209     __syncthreads();
2210   }
2211 }
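
// Output layout, read off the indexing above (ix & 3 picks the component, ix >> 2 the
// row): each walker's out buffer interleaves four values per orbital row,
//   out[4*i + {0, 1, 2, 3}] = {grad_x, grad_y, grad_z, lapl} for row i.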
2212 
2213 void calc_grad_lapl(float* Ainv_list[], float* grad_lapl_list[], float* out_list[], int N, int row_stride, int num_mats)
2214 {
2215   dim3 dimBlock(RATIO_BS, RATIO_BS);
2216   dim3 dimGrid(num_mats);
2217   all_ratios_grad_lapl_kernel<float, RATIO_BS>
2218       <<<dimGrid, dimBlock>>>(Ainv_list, grad_lapl_list, out_list, N, row_stride);
2219 }
2220 
2221 
2222 void calc_grad_lapl(double* Ainv_list[],
2223                     double* grad_lapl_list[],
2224                     double* out_list[],
2225                     int N,
2226                     int row_stride,
2227                     int num_mats)
2228 {
2229   dim3 dimBlock(RATIO_BS, RATIO_BS);
2230   dim3 dimGrid(num_mats);
2231   all_ratios_grad_lapl_kernel<double, RATIO_BS>
2232       <<<dimGrid, dimBlock>>>(Ainv_list, grad_lapl_list, out_list, N, row_stride);
2233 }
2234 
2235 
2236 #ifdef QMC_COMPLEX
2237 void calc_grad_lapl(std::complex<float>* Ainv_list[],
2238                     std::complex<float>* grad_lapl_list[],
2239                     std::complex<float>* out_list[],
2240                     int N,
2241                     int row_stride,
2242                     int num_mats)
2243 {
2244   dim3 dimBlock(RATIO_BS, RATIO_BS);
2245   dim3 dimGrid(num_mats);
2246 
2247   all_ratios_grad_lapl_kernel<thrust::complex<float>, RATIO_BS>
2248       <<<dimGrid, dimBlock>>>((thrust::complex<float>**)Ainv_list,
2249                               (thrust::complex<float>**)grad_lapl_list,
2250                               (thrust::complex<float>**)out_list,
2251                               N,
2252                               row_stride);
2253 }
2254 
2255 
2256 void calc_grad_lapl(std::complex<double>* Ainv_list[],
2257                     std::complex<double>* grad_lapl_list[],
2258                     std::complex<double>* out_list[],
2259                     int N,
2260                     int row_stride,
2261                     int num_mats)
2262 {
2263   dim3 dimBlock(RATIO_BS, RATIO_BS);
2264   dim3 dimGrid(num_mats);
2265 
2266   all_ratios_grad_lapl_kernel<thrust::complex<double>, RATIO_BS>
2267       <<<dimGrid, dimBlock>>>((thrust::complex<double>**)Ainv_list,
2268                               (thrust::complex<double>**)grad_lapl_list,
2269                               (thrust::complex<double>**)out_list,
2270                               N,
2271                               row_stride);
2272 }
2273 #endif
2274 
2275 #define COPY_BS 256
2276 
2277 template<typename T>
2278 __global__ void multi_copy(T** dest, T** src, int len)
2279 {
2280   __shared__ T *mysrc, *mydest;
2281   if (threadIdx.x == 0)
2282   {
2283     mysrc  = src[blockIdx.y];
2284     mydest = dest[blockIdx.y];
2285   }
2286   __syncthreads();
2287   int i = blockIdx.x * COPY_BS + threadIdx.x;
2288   if (i < len)
2289     mydest[i] = mysrc[i];
2290 }
2291 
2292 
2293 template<typename T>
2294 __global__ void multi_copy(T** buff, int dest_off, int src_off, int len)
2295 {
2296   __shared__ T *mysrc, *mydest;
2297   if (threadIdx.x == 0)
2298   {
2299     T* ptr = buff[blockIdx.y];
2300     mysrc  = ptr + src_off;
2301     mydest = ptr + dest_off;
2302   }
2303   __syncthreads();
2304   int i = blockIdx.x * COPY_BS + threadIdx.x;
2305   if (i < len)
2306     mydest[i] = mysrc[i];
2307 }
2308 
2309 
2310 void multi_copy(float* dest[], float* src[], int len, int num)
2311 {
2312   dim3 dimBlock(COPY_BS);
2313   dim3 dimGrid((len + COPY_BS - 1) / COPY_BS, num);
2314   multi_copy<float><<<dimGrid, dimBlock>>>(dest, src, len);
2315 }
2316 
2317 void multi_copy(double* dest[], double* src[], int len, int num)
2318 {
2319   dim3 dimBlock(COPY_BS);
2320   dim3 dimGrid(len / COPY_BS, num);
2321   if (len % COPY_BS)
2322     dimGrid.x++;
2323   multi_copy<double><<<dimGrid, dimBlock>>>(dest, src, len);
2324 }
2325 
2326 
2327 #ifdef QMC_COMPLEX
2328 void multi_copy(std::complex<float>* dest[], std::complex<float>* src[], int len, int num)
2329 {
2330   dim3 dimBlock(COPY_BS);
2331   dim3 dimGrid((len + COPY_BS - 1) / COPY_BS, num);
2332 
2333   multi_copy<thrust::complex<float>>
2334       <<<dimGrid, dimBlock>>>((thrust::complex<float>**)dest, (thrust::complex<float>**)src, len);
2335 }
2336 
2337 void multi_copy(std::complex<double>* dest[], std::complex<double>* src[], int len, int num)
2338 {
2339   dim3 dimBlock(COPY_BS);
2340   dim3 dimGrid((len + COPY_BS - 1) / COPY_BS, num);
2341 
2342   multi_copy<thrust::complex<double>>
2343       <<<dimGrid, dimBlock>>>((thrust::complex<double>**)dest, (thrust::complex<double>**)src, len);
2344 }
2345 #endif
2346 
2347 
2348 void multi_copy(float* buff[], int dest_off, int src_off, int len, int num)
2349 {
2350   dim3 dimBlock(COPY_BS);
2351   dim3 dimGrid((len + COPY_BS - 1) / COPY_BS, num);
2352   multi_copy<float><<<dimGrid, dimBlock>>>(buff, dest_off, src_off, len);
2353 }
2354 
2355 void multi_copy(double* buff[], int dest_off, int src_off, int len, int num)
2356 {
2357   dim3 dimBlock(COPY_BS);
2358   dim3 dimGrid(len / COPY_BS, num);
2359   if (len % COPY_BS)
2360     dimGrid.x++;
2361   multi_copy<double><<<dimGrid, dimBlock>>>(buff, dest_off, src_off, len);
2362 }
2363 
2364 
2365 #ifdef QMC_COMPLEX
2366 void multi_copy(std::complex<float>* buff[], int dest_off, int src_off, int len, int num)
2367 {
2368   dim3 dimBlock(COPY_BS);
2369   dim3 dimGrid((len + COPY_BS - 1) / COPY_BS, num);
2370 
2371   multi_copy<thrust::complex<float>><<<dimGrid, dimBlock>>>((thrust::complex<float>**)buff, dest_off, src_off, len);
2372 }
2373 
2374 void multi_copy(std::complex<double>* buff[], int dest_off, int src_off, int len, int num)
2375 {
2376   dim3 dimBlock(COPY_BS);
2377   dim3 dimGrid((len + COPY_BS - 1) / COPY_BS, num);
2378 
2379   multi_copy<thrust::complex<double>><<<dimGrid, dimBlock>>>((thrust::complex<double>**)buff, dest_off, src_off, len);
2380 }
2381 #endif
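
// The offset variants above copy within a single per-walker buffer: for every walker w,
// buff[w][dest_off + i] = buff[w][src_off + i] for 0 <= i < len, e.g. moving a scratch
// region into place inside one device allocation (an assumed use; the kernels are generic).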
2382 
2383 
2384 #include <stdlib.h>
2385 #include <time.h>
2386 
2387 void test_all_ratios_kernel()
2388 {
2389   int N = 128;
2390   float *A, *A_d, *Ainv, *Ainv_d, *ratio, *ratio_d;
2391   cudaMalloc((void**)&A_d, N * N * sizeof(float));
2392   cudaMalloc((void**)&Ainv_d, N * N * sizeof(float));
2393   cudaMalloc((void**)&ratio_d, 1 * N * sizeof(float));
2394   A     = (float*)malloc(N * N * sizeof(float));
2395   Ainv  = (float*)malloc(N * N * sizeof(float));
2396   ratio = (float*)malloc(1 * N * sizeof(float));
2397   //  float ratio2[N];
2398   for (int i = 0; i < N; i++)
2399     for (int j = 0; j < N; j++)
2400     {
2401       A[i * N + j]    = 1.0f + drand48();
2402       Ainv[i * N + j] = 1.0f + drand48();
2403     }
2404   cudaMemcpyAsync(A_d, A, N * N * sizeof(float), cudaMemcpyHostToDevice);
2405   cudaMemcpyAsync(Ainv_d, Ainv, N * N * sizeof(float), cudaMemcpyHostToDevice);
2406   float **A_list, **A_list_d, **Ainv_list, **Ainv_list_d, **ratio_list, **ratio_list_d;
2407   int numMats = 2000;
2408   cudaMalloc((void**)&A_list_d, numMats * sizeof(float*));
2409   cudaMalloc((void**)&Ainv_list_d, numMats * sizeof(float*));
2410   cudaMalloc((void**)&ratio_list_d, numMats * sizeof(float*));
2411   A_list     = (float**)malloc(numMats * sizeof(float*));
2412   Ainv_list  = (float**)malloc(numMats * sizeof(float*));
2413   ratio_list = (float**)malloc(numMats * sizeof(float*));
2414   for (int i = 0; i < numMats; i++)
2415   {
2416     A_list[i]     = A_d;
2417     Ainv_list[i]  = Ainv_d;
2418     ratio_list[i] = ratio_d;
2419   }
2420   cudaMemcpyAsync(A_list_d, A_list, numMats * sizeof(float*), cudaMemcpyHostToDevice);
2421   cudaMemcpyAsync(Ainv_list_d, Ainv_list, numMats * sizeof(float*), cudaMemcpyHostToDevice);
2422   cudaMemcpyAsync(ratio_list_d, ratio_list, numMats * sizeof(float*), cudaMemcpyHostToDevice);
2423   clock_t start = clock();
2424   for (int i = 0; i < 1000; i++)
2425     calc_all_ratios(Ainv_list_d, A_list_d, ratio_list_d, N, N, numMats);
2426   clock_t end = clock();
2427   double time = (double)(end - start) / (double)CLOCKS_PER_SEC;
2428   fprintf(stderr, "start = %ld\n", start);
2429   fprintf(stderr, "end = %ld\n", end);
2430   double rate = 1000.0 / time;
2431   fprintf(stderr, "Rate = %1.2f generations per second.\n", rate);
2432   cudaMemcpy(ratio, ratio_d, N * sizeof(float), cudaMemcpyDeviceToHost);
2433   // for (int i=0; i<N; i++) {
2434   //   ratio2[i] = 0.0f;
2435   //   for (int j=0; j<N; j++)
2436   //     ratio2[i] += A[i*N+j]*Ainv[j*N+i];
2437   //   fprintf (stderr, "%3d  %10.6f  %10.6f\n", i, ratio2[i], ratio[i]);
2438   // }
2439 }
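
// Background for the check commented out above (an added sketch, not from the
// original file): when row i of the Slater matrix is replaced by a proposed
// row u, the ratio of the new to the old determinant reduces to a single dot
// product with column i of the inverse,
//   det(A') / det(A) = sum_j u[j] * Ainv[j][i],
// which is the quantity calc_all_ratios evaluates for every matrix in the
// batch.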


void test_all_grad_lapl_kernel()
{
  const int N = 128; // const so the ratio2 array below has a constant bound
  float *A, *A_d, *Ainv, *Ainv_d, *ratio, *ratio_d;
  cudaMalloc((void**)&A_d, 4 * N * N * sizeof(float));
  cudaMalloc((void**)&Ainv_d, N * N * sizeof(float));
  cudaMalloc((void**)&ratio_d, 4 * N * sizeof(float));
  A     = (float*)malloc(4 * N * N * sizeof(float));
  Ainv  = (float*)malloc(1 * N * N * sizeof(float));
  ratio = (float*)malloc(4 * N * sizeof(float));
  float ratio2[4 * N];
  for (int i = 0; i < N; i++)
    for (int j = 0; j < N; j++)
    {
      Ainv[i * N + j] = 1.0f + drand48();
      for (int k = 0; k < 4; k++)
        A[4 * (i * N + j) + k] = 1.0f + drand48();
    }
  cudaMemcpyAsync(A_d, A, 4 * N * N * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpyAsync(Ainv_d, Ainv, 1 * N * N * sizeof(float), cudaMemcpyHostToDevice);
  float **A_list, **A_list_d, **Ainv_list, **Ainv_list_d, **ratio_list, **ratio_list_d;
  int numMats = 2000;
  cudaMalloc((void**)&A_list_d, numMats * sizeof(float*));
  cudaMalloc((void**)&Ainv_list_d, numMats * sizeof(float*));
  cudaMalloc((void**)&ratio_list_d, numMats * sizeof(float*));
  A_list     = (float**)malloc(numMats * sizeof(float*));
  Ainv_list  = (float**)malloc(numMats * sizeof(float*));
  ratio_list = (float**)malloc(numMats * sizeof(float*));
  for (int i = 0; i < numMats; i++)
  {
    A_list[i]     = A_d;
    Ainv_list[i]  = Ainv_d;
    ratio_list[i] = ratio_d;
  }
  cudaMemcpyAsync(A_list_d, A_list, numMats * sizeof(float*), cudaMemcpyHostToDevice);
  cudaMemcpyAsync(Ainv_list_d, Ainv_list, numMats * sizeof(float*), cudaMemcpyHostToDevice);
  cudaMemcpyAsync(ratio_list_d, ratio_list, numMats * sizeof(float*), cudaMemcpyHostToDevice);
  struct timeval tstart, tend;
  gettimeofday(&tstart, NULL);
  for (int i = 0; i < 1; i++)
    calc_grad_lapl(Ainv_list_d, A_list_d, ratio_list_d, N, N, numMats);
  cudaMemcpy(ratio, ratio_d, 4 * N * sizeof(float), cudaMemcpyDeviceToHost);
  gettimeofday(&tend, NULL);
  double start = (double)tstart.tv_sec + 1.0e-6 * (double)tstart.tv_usec;
  double end   = (double)tend.tv_sec + 1.0e-6 * (double)tend.tv_usec;
  fprintf(stderr, "start = %f\n", start);
  fprintf(stderr, "end = %f\n", end);
  double rate = 1.0 / (end - start); // a single launch is timed above
  fprintf(stderr, "Rate = %1.2f generations per second.\n", rate);
  for (int i = 0; i < N; i++)
  {
    for (int k = 0; k < 4; k++)
      ratio2[4 * i + k] = 0.0f;
    for (int j = 0; j < N; j++)
      for (int k = 0; k < 4; k++)
        ratio2[4 * i + k] += A[(4 * i + k) * N + j] * Ainv[j * N + i];
    for (int k = 0; k < 4; k++)
      fprintf(stderr, "%3d  %10.6f  %10.6f\n", 4 * i + k, ratio2[4 * i + k], ratio[4 * i + k]);
  }
}
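
// Note (added explanation): in the check above, A packs four components per
// orbital -- three gradient components plus the Laplacian -- so for electron
// i the kernel forms the four quantities
//   ratio[4*i + k] = sum_j A_k[i][j] * Ainv[j][i],  k = 0..3,
// i.e. the gradient and Laplacian of the determinant divided by its value.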


template<typename T>
__global__ void woodbury_update_16(T** Ainv_trans, T** delta, T** Ainv_delta, int N, int rowstride)
{
  T *myAinv, *mydelta, *myAinv_delta;
  int tid       = threadIdx.x;
  myAinv        = Ainv_trans[blockIdx.y];
  myAinv_delta  = Ainv_delta[blockIdx.y];
  mydelta       = delta[blockIdx.y];
  int first_row = blockIdx.x * 16;
  __shared__ T Ainv_s[16][17], delta_s[2][17], Ainv_delta_s[16][17];
  int nb = (N + 15) / 16;
  for (int row = 0; row < 16; row++)
    if (tid < 16)
      Ainv_delta_s[row][tid] = 0.0f;
  __syncthreads();
  int col = tid & 15;
  for (int block = 0; block < nb; block++)
  {
    int nend = N - block * 16;
    int c    = block * 16 + tid;
    if (tid < 16)
      for (int row = 0; row < 16; row++)
      {
        Ainv_s[row][col] = myAinv[(first_row + row) * rowstride + c];
      }
    __syncthreads();
    for (int irow = 0; irow < 8; irow++)
    {
      int odd = tid > 15;
      int row = 2 * irow + odd;
      // Stage with col (tid & 15) rather than tid so the second half-warp
      // stays inside delta_s[2][17]; each half-warp stages one row of delta.
      delta_s[odd][col] = mydelta[row * rowstride + block * 16 + col];
      if (row + first_row < N && col < nend)
        for (int k = 0; k < 16; k++)
          Ainv_delta_s[row][col] += Ainv_s[col][k] * delta_s[odd][k];
    }
    __syncthreads();
  }
  int mycol = blockIdx.x * 16 + tid;
  if (tid < 16 && mycol < N)
    for (int row = 0; row < 16; row++)
      myAinv_delta[row * rowstride + mycol] = Ainv_delta_s[row][tid];
}


// Require 64 threads per block
template<typename T>
__device__ inline void block_inverse_16(T A[16][17])
{
  int tid = threadIdx.x;
  __shared__ T Acolk[16];
  // if (blockIdx.y == 0 && tid == 0) {
  //   printf ("Ablock:\n");
  //   for (int i=0; i<16; i++) {
  //     for (int j=0; j<16; j++)
  // 	printf ("%14.8e ", A[i][j]);
  //     printf ("\n");
  //   }
  // }
  for (int k = 0; k < 16; k++)
  {
    T pivotInv = 1.0f / A[k][k];
    if (tid < 16)
    {
      T tmp      = (tid == k) ? 0.0f : -pivotInv * A[tid][k];
      A[tid][k]  = tmp;
      Acolk[tid] = tmp;
    }
    __syncthreads();
    int row = tid >> 4;
    int col = tid & 0x0f;
    T Arowk = A[k][col];
    A[row + 0][col] += Arowk * Acolk[row + 0];
    A[row + 4][col] += Arowk * Acolk[row + 4];
    A[row + 8][col] += Arowk * Acolk[row + 8];
    A[row + 12][col] += Arowk * Acolk[row + 12];
    __syncthreads();
    if (tid < 16)
    {
      if (tid == k)
        A[k][tid] = pivotInv;
      else
        A[k][tid] *= pivotInv;
    }
    __syncthreads();
  }
  // if (blockIdx.y == 0 && tid == 0) {
  //   printf ("Ainvblock:\n");
  //   for (int i=0; i<16; i++) {
  //     for (int j=0; j<16; j++)
  // 	printf ("%14.8e ", A[i][j]);
  //     printf ("\n");
  //   }
  // }
}
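
// Host-side reference sketch (added; not part of the original kernels): the
// same in-place Gauss-Jordan inversion as block_inverse_16 above, written
// serially so the device result can be unit-tested against it. Like the
// kernel, it does no pivoting, so it assumes the 16x16 block is
// well-conditioned.
template<typename T>
void block_inverse_16_host(T A[16][17])
{
  for (int k = 0; k < 16; k++)
  {
    T pivotInv = T(1) / A[k][k];
    T Acolk[16];
    // Form column k of the update, zeroed on the pivot row.
    for (int i = 0; i < 16; i++)
    {
      Acolk[i] = (i == k) ? T(0) : -pivotInv * A[i][k];
      A[i][k]  = Acolk[i];
    }
    // Rank-1 update of the block; row k is unchanged since Acolk[k] == 0.
    for (int i = 0; i < 16; i++)
      for (int j = 0; j < 16; j++)
        A[i][j] += A[k][j] * Acolk[i];
    // Rescale the pivot row.
    for (int j = 0; j < 16; j++)
      A[k][j] = (j == k) ? pivotInv : A[k][j] * pivotInv;
  }
}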


// This routine performs the first half of a Woodbury formula update
// for updating 16 rows at a time of the A matrix.  It assumes that
// the inverse matrix is stored in transposed form.  This kernel
// computes transpose(Ainv) * delta, where delta is a 16xN matrix of the
// changes in A.  kblock indicates which block of 16 rows has been
// changed.  Each blockIdx.x computes a different 16x16 block of the
// product.
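//
// Sketch of the identity being applied (added for reference): writing the
// update as A' = A + E^T * delta, with E the 16xN selector of the changed
// rows, the Woodbury formula gives
//   inv(A') = inv(A) - inv(A) E^T inv(I + delta inv(A) E^T) delta inv(A).
// woodbury_update_16a produces Ainv_delta = delta * inv(A) and, for the
// changed block, inverts the 16x16 matrix (I + delta inv(A) E^T);
// woodbury_update_16b then applies the outer-product correction.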
template<typename T>
__global__ void woodbury_update_16a(T** Ainv_trans,
                                    T** delta,
                                    T** Ainv_delta,
                                    T** inv_block,
                                    int N,
                                    int rowstride,
                                    int kblock)
{
  T *myAinv, *mydelta, *myAinv_delta, *myinvblock;
  int tid       = threadIdx.x;
  myAinv        = Ainv_trans[blockIdx.y];
  myAinv_delta  = Ainv_delta[blockIdx.y];
  mydelta       = delta[blockIdx.y];
  myinvblock    = inv_block[blockIdx.y];
  int first_row = blockIdx.x * 32;
  __shared__ T Ainv1[16][17], Ainv2[16][17], delta_s[4][17], Ainv_delta1[16][17], Ainv_delta2[16][17];
  int nb                    = (N + 15) / 16;
  int row                   = tid >> 4;
  int col                   = tid & 0x0f;
  Ainv_delta1[row + 0][col] = Ainv_delta2[row + 0][col] = 0.0f;
  Ainv_delta1[row + 4][col] = Ainv_delta2[row + 4][col] = 0.0f;
  Ainv_delta1[row + 8][col] = Ainv_delta2[row + 8][col] = 0.0f;
  Ainv_delta1[row + 12][col] = Ainv_delta2[row + 12][col] = 0.0f;
  __syncthreads();
  for (int block = 0; block < nb; block++)
  {
    int c   = block * 16 + col;
    int row = tid >> 4;
    for (int irow = 0; irow < 4; irow++, row += 4)
    {
      Ainv1[row][col] = myAinv[(first_row + row) * rowstride + c];
      Ainv2[row][col] = myAinv[(first_row + row + 16) * rowstride + c];
    }
    __syncthreads();
    row      = tid >> 4;
    int row2 = row;
    for (int irow = 0; irow < 4; irow++, row += 4)
    {
      delta_s[row2][col] = mydelta[row * rowstride + c];
      T mysum1           = Ainv_delta1[row][col];
      T mysum2           = Ainv_delta2[row][col];
      if (row + first_row < N && c < N)
        for (int k = 0; k < 16; k++)
        {
          mysum1 += Ainv1[col][k] * delta_s[row2][k];
          mysum2 += Ainv2[col][k] * delta_s[row2][k];
        }
      Ainv_delta1[row][col] = mysum1;
      Ainv_delta2[row][col] = mysum2;
    }
    __syncthreads();
  }
  int mycol = blockIdx.x * 32 + col;
  row       = tid >> 4;
  if (mycol < N)
  {
    for (int irow = 0; irow < 4; irow++, row += 4)
    {
      myAinv_delta[row * rowstride + mycol]      = Ainv_delta1[row][col];
      myAinv_delta[row * rowstride + mycol + 16] = Ainv_delta2[row][col];
    }
  }
  __syncthreads();
  row = tid >> 4;
  col = tid & 0x0f;
  if (2 * blockIdx.x + 1 == kblock)
  {
    if (tid < 16)
      Ainv_delta2[tid][tid] += 1.0f;
    __syncthreads();
    block_inverse_16<T>(Ainv_delta2);
    for (int irow = 0; irow < 4; irow++, row += 4)
      myinvblock[row * 16 + col] = Ainv_delta2[row][col];
  }
  if (2 * blockIdx.x == kblock)
  {
    if (tid < 16)
      Ainv_delta1[tid][tid] += 1.0f;
    __syncthreads();
    block_inverse_16<T>(Ainv_delta1);
    for (int irow = 0; irow < 4; irow++, row += 4)
      myinvblock[row * 16 + col] = Ainv_delta1[row][col];
  }
}


template<typename T>
__global__ void woodbury_update_16b(T** Ainv_trans,
                                    T** delta,
                                    T** Ainv_delta,
                                    T** inv_block,
                                    int N,
                                    int rowstride,
                                    int kblock)
{
  int tid = threadIdx.x;
  __shared__ T B1[16][17], B2[16][17], B3[16][17], B4[16][17];
  __shared__ T *myAinv, *myAinv_delta, *myinv_block;
  if (tid == 0)
  {
    myAinv       = Ainv_trans[blockIdx.y];
    myAinv_delta = Ainv_delta[blockIdx.y];
    myinv_block  = inv_block[blockIdx.y];
  }
  __syncthreads();
  int row = tid >> 4;
  int col = tid & 0x0f;
  int c   = blockIdx.x * 32 + col;
  for (int i = 0; i < 4; i++, row += 4)
  {
    B1[row][col] = myinv_block[16 * row + col];
    B2[row][col] = myAinv[(16 * kblock + row) * rowstride + c];
  }
  __syncthreads();
  row = tid >> 4;
  // Now, multiply Ainv block by inv_block
  for (int i = 0; i < 4; i++, row += 4)
  {
    T mysum = 0.0f;
    for (int j = 0; j < 16; j++)
      mysum += B2[j][row] * B1[j][col];
    B3[row][col] = mysum;
  }
  __syncthreads();
  row = tid >> 4;
  for (int i = 0; i < 4; i++, row += 4)
    B2[row][col] = myAinv[(16 * kblock + row) * rowstride + c + 16];
  __syncthreads();
  row = tid >> 4;
  // Now, multiply Ainv block by inv_block
  for (int i = 0; i < 4; i++, row += 4)
  {
    T mysum = 0.0f;
    for (int j = 0; j < 16; j++)
      mysum += B2[j][row] * B1[j][col];
    B4[row][col] = mysum;
  }
  // Now do outer product
  int nb = (N + 15) >> 4;
  for (int block = 0; block < nb; block++)
  {
    row = tid >> 4;
    col = tid & 0x0f;
    for (int i = 0; i < 4; i++, row += 4)
      B1[row][col] = myAinv_delta[row * rowstride + col + 16 * block];
    __syncthreads();
    row = tid >> 4;
    col = tid & 0x0f;
    for (int irow = 0; irow < 4; irow++, row += 4)
    {
      T mysum3 = myAinv[(16 * block + row) * rowstride + c];
      T mysum4 = myAinv[(16 * block + row) * rowstride + c + 16];
      for (int k = 0; k < 16; k++)
      {
        mysum3 -= B3[col][k] * B1[k][row];
        mysum4 -= B4[col][k] * B1[k][row];
      }
      myAinv[(16 * block + row) * rowstride + c]      = mysum3;
      myAinv[(16 * block + row) * rowstride + c + 16] = mysum4;
    }
  }
}


template<typename T>
__global__ void woodbury_update_32(T** Ainv_trans, T** delta, T** Ainv_delta, int N, int rowstride)
{
  T *myAinv, *mydelta, *myAinv_delta;
  int tid       = threadIdx.x;
  myAinv        = Ainv_trans[blockIdx.y];
  myAinv_delta  = Ainv_delta[blockIdx.y];
  mydelta       = delta[blockIdx.y];
  int first_row = blockIdx.x * 32;
  __shared__ T Ainv_s[32][33], delta_s[32][33], Ainv_delta_s[32][33];
  int nb = (N + 31) / 32;
  for (int row = 0; row < 32; row++)
    if (tid < 32)
      Ainv_delta_s[row][tid] = 0.0f;
  __syncthreads();
  int col = tid;
  for (int block = 0; block < nb; block++)
  {
    int nend = N - block * 32;
    int c    = block * 32 + tid;
    for (int row = 0; row < 32; row++)
    {
      Ainv_s[row][tid]  = myAinv[(first_row + row) * rowstride + c];
      delta_s[row][tid] = mydelta[row * rowstride + c];
    }
    __syncthreads();
    for (int row = 0; row < 32; row++)
    {
      if (row + first_row < N && col < nend)
        for (int k = 0; k < 32; k++)
          Ainv_delta_s[row][col] += Ainv_s[row][k] * delta_s[col][k];
    }
    __syncthreads();
  }
  int mycol = blockIdx.x * 32 + tid;
  if (mycol < N)
    for (int row = 0; row < 32; row++)
      myAinv_delta[row * rowstride + mycol] = Ainv_delta_s[row][tid];
}


#ifdef CUDA_TEST_MAIN

#include <omp.h>
#include "lapacke.h"

#define MAT_SIZE 256
#define NUM_MATS 512

// Compute inverse of n*n matrix A via LAPACKE
void MatrixInverse(double* A, int n)
{
  lapack_int* ipiv_array = new lapack_int[n];
  lapack_int ierror;
  ierror = LAPACKE_dgetrf(LAPACK_COL_MAJOR, n, n, A, n, ipiv_array);
  if (ierror != 0)
  {
    fprintf(stderr, "dgetrf failure. Error %i\n", ierror);
    exit(1);
  }
  ierror = LAPACKE_dgetri(LAPACK_COL_MAJOR, n, A, n, ipiv_array);
  if (ierror != 0)
  {
    fprintf(stderr, "dgetri failure. Error %i\n", ierror);
    exit(1);
  }
  delete[] ipiv_array;
  return;
}

void test_update()
{
  int const N = MAT_SIZE;
  double *A, *Ainv;
  int numMats = NUM_MATS;
  float *A_h, *Ainv_h, *u_h;
  float *Ainv_d, *Ainv_u_d, *Ainv_colk_d, *u_d;
  A      = (double*)malloc(N * N * sizeof(double));
  Ainv   = (double*)malloc(N * N * sizeof(double));
  Ainv_h = (float*)malloc(N * N * sizeof(float));
  A_h    = (float*)malloc(N * N * sizeof(float));
  u_h    = (float*)malloc(N * sizeof(float));
  cudaMalloc((void**)&Ainv_d, N * N * sizeof(float));
  cudaMalloc((void**)&u_d, N * sizeof(float));
  cudaMalloc((void**)&Ainv_u_d, N * sizeof(float));
  cudaMalloc((void**)&Ainv_colk_d, N * sizeof(float));
  float **AinvList, **Ainv_uList, **AList, **Ainv_colkList, **uList;
  AList         = (float**)malloc(NUM_MATS * sizeof(float*));
  AinvList      = (float**)malloc(NUM_MATS * sizeof(float*));
  Ainv_uList    = (float**)malloc(NUM_MATS * sizeof(float*));
  Ainv_colkList = (float**)malloc(NUM_MATS * sizeof(float*));
  uList         = (float**)malloc(NUM_MATS * sizeof(float*));
  float **AList_d, **AinvList_d, **Ainv_uList_d, **Ainv_colkList_d, **uList_d;
  cudaMalloc((void**)&AList_d, numMats * sizeof(float*));
  cudaMalloc((void**)&AinvList_d, numMats * sizeof(float*));
  cudaMalloc((void**)&Ainv_uList_d, numMats * sizeof(float*));
  cudaMalloc((void**)&Ainv_colkList_d, numMats * sizeof(float*));
  cudaMalloc((void**)&uList_d, numMats * sizeof(float*));
  for (int mat = 0; mat < numMats; mat++)
  {
    cudaMalloc((void**)&(AList[mat]), N * N * sizeof(float) + 1000);
    cudaMalloc((void**)&(AinvList[mat]), N * N * sizeof(float) + 1000);
    cudaMalloc((void**)&(Ainv_uList[mat]), N * sizeof(float) + 1000);
    cudaMalloc((void**)&(Ainv_colkList[mat]), N * sizeof(float) + 1000);
    cudaMalloc((void**)&(uList[mat]), N * sizeof(float) + 1000);
  }
  cudaMemcpyAsync(AList_d, AList, numMats * sizeof(float*), cudaMemcpyHostToDevice);
  cudaMemcpyAsync(AinvList_d, AinvList, numMats * sizeof(float*), cudaMemcpyHostToDevice);
  cudaMemcpyAsync(Ainv_uList_d, Ainv_uList, numMats * sizeof(float*), cudaMemcpyHostToDevice);
  cudaMemcpyAsync(Ainv_colkList_d, Ainv_colkList, numMats * sizeof(float*), cudaMemcpyHostToDevice);
  cudaMemcpyAsync(uList_d, uList, numMats * sizeof(float*), cudaMemcpyHostToDevice);
  srand48((long int)12341313);
  int row = 0;
  for (int mat = 0; mat < numMats; mat++)
  {
    if (mat == 0)
    {
      for (int i = 0; i < N; i++)
      {
        u_h[i] = drand48();
        for (int j = 0; j < N; j++)
          A[i * N + j] = Ainv[i * N + j] = A_h[i * N + j] = drand48();
      }
      // for (int i=0; i<N; i++)
      // 	u_h[i] = A_h[row*N+i];
      MatrixInverse(Ainv, N);
      for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
          Ainv_h[i * N + j] = (float)Ainv[i * N + j];
    }
    // for (int i=0; i<N; i++)
    //   u_h[i] = A_h[row*N+i];
    cudaMemcpyAsync(AList[mat], A_h, N * N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpyAsync(AinvList[mat], Ainv_h, N * N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpyAsync(uList[mat], u_h, N * sizeof(float), cudaMemcpyHostToDevice);
  }
  dim3 dimBlock2(64);
  dim3 dimGrid2((N + 63) / 64, NUM_MATS);
  double start = omp_get_wtime();
  for (int i = 0; i < 100; i++)
  {
    update_inverse_cuda1<float, 64>
        <<<dimGrid2, dimBlock2>>>(AList_d, AinvList_d, uList_d, Ainv_uList_d, Ainv_colkList_d, N, N, row);
    update_inverse_cuda2<float, 64>
        <<<dimGrid2, dimBlock2>>>(AList_d, AinvList_d, uList_d, Ainv_uList_d, Ainv_colkList_d, N, N, row);
  }
  cudaDeviceSynchronize();
  double end = omp_get_wtime();
  fprintf(stderr, "Rate = %12.8f updates per second.\n", (double)(100 * NUM_MATS) / (end - start));
  cudaMemcpy(Ainv_h, AinvList[0], N * N * sizeof(float), cudaMemcpyDeviceToHost);
  /*  for (int j=0; j<16; j++)
    for (int i=0; i<N; i++)
      A[(row+j)*N+i] += delta_h[j*N+i];
  for (int i=0; i<N; i++)
    for (int j=0; j<N; j++) {
      double ident = 0.0;
      for (int k=0; k<N; k++)
  	ident += Ainv_h[i*N+k]*A[k*N+j];
      if ((i==j && fabs(ident - 1.0) > 1.0e-4) ||
      	  (i!=j && fabs(ident) > 1.0e-4))
      	fprintf (stderr, "Error in matrix inverse, (%d, %d) = %1.8f\n", i, j, ident);
  }*/
  fprintf(stderr, "Finished.\n");
}
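
// Background for the two-pass update exercised above (an added sketch): for a
// single-row replacement A' = A + e_k (u - a_k)^T, the Sherman-Morrison
// formula gives
//   inv(A') = inv(A) - [inv(A) e_k] [(u - a_k)^T inv(A)] / (1 + (u - a_k)^T inv(A) e_k),
// so pass 1 forms the row vector (u - a_k)^T inv(A) and extracts column k of
// inv(A), and pass 2 applies the outer-product correction.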

void test_update_transpose()
{
  const int N = MAT_SIZE;
  double *A, *Ainv;
  int numMats = NUM_MATS;
  float *A_h, *Ainv_h, *u_h;
  float *Ainv_d, *Ainv_u_d, *Ainv_colk_d, *u_d;
  A      = (double*)malloc(N * N * sizeof(double));
  Ainv   = (double*)malloc(N * N * sizeof(double));
  Ainv_h = (float*)malloc(N * N * sizeof(float));
  A_h    = (float*)malloc(N * N * sizeof(float));
  u_h    = (float*)malloc(N * sizeof(float));
  cudaMalloc((void**)&Ainv_d, N * N * sizeof(float));
  cudaMalloc((void**)&u_d, N * sizeof(float));
  cudaMalloc((void**)&Ainv_u_d, N * sizeof(float));
  cudaMalloc((void**)&Ainv_colk_d, N * sizeof(float));
  float **AinvList, **Ainv_uList, **AList, **Ainv_colkList, **uList;
  AList         = (float**)malloc(NUM_MATS * sizeof(float*));
  AinvList      = (float**)malloc(NUM_MATS * sizeof(float*));
  Ainv_uList    = (float**)malloc(NUM_MATS * sizeof(float*));
  Ainv_colkList = (float**)malloc(NUM_MATS * sizeof(float*));
  uList         = (float**)malloc(NUM_MATS * sizeof(float*));
  float **AList_d, **AinvList_d, **Ainv_uList_d, **Ainv_colkList_d, **uList_d;
  cudaMalloc((void**)&AList_d, numMats * sizeof(float*));
  cudaMalloc((void**)&AinvList_d, numMats * sizeof(float*));
  cudaMalloc((void**)&Ainv_uList_d, numMats * sizeof(float*));
  cudaMalloc((void**)&Ainv_colkList_d, numMats * sizeof(float*));
  cudaMalloc((void**)&uList_d, numMats * sizeof(float*));
  for (int mat = 0; mat < numMats; mat++)
  {
    cudaMalloc((void**)&(AList[mat]), N * N * sizeof(float));
    cudaMalloc((void**)&(AinvList[mat]), N * N * sizeof(float));
    cudaMalloc((void**)&(Ainv_uList[mat]), N * sizeof(float));
    cudaMalloc((void**)&(Ainv_colkList[mat]), N * sizeof(float));
    cudaMalloc((void**)&(uList[mat]), N * sizeof(float));
  }
  cudaMemcpyAsync(AList_d, AList, numMats * sizeof(float*), cudaMemcpyHostToDevice);
  cudaMemcpyAsync(AinvList_d, AinvList, numMats * sizeof(float*), cudaMemcpyHostToDevice);
  cudaMemcpyAsync(Ainv_uList_d, Ainv_uList, numMats * sizeof(float*), cudaMemcpyHostToDevice);
  cudaMemcpyAsync(Ainv_colkList_d, Ainv_colkList, numMats * sizeof(float*), cudaMemcpyHostToDevice);
  cudaMemcpyAsync(uList_d, uList, numMats * sizeof(float*), cudaMemcpyHostToDevice);
  srand48((long int)12341313);
  int row = 1;
  for (int mat = 0; mat < numMats; mat++)
  {
    if (mat == 0)
    {
      for (int i = 0; i < N; i++)
      {
        for (int j = 0; j < N; j++)
          A[i * N + j] = Ainv[i * N + j] = A_h[i * N + j] = drand48();
        //	u_h[i] = drand48();
      }
      for (int j = 0; j < N; j++)
        u_h[j] = drand48(); //A[N*row+j];
      MatrixInverse(Ainv, N);
      for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
          Ainv_h[j * N + i] = (float)Ainv[i * N + j];
    }
    // for (int i=0; i<N; i++)
    //   u_h[i] = A_h[row*N+i];
    cudaMemcpyAsync(AList[mat], A_h, N * N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpyAsync(AinvList[mat], Ainv_h, N * N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpyAsync(uList[mat], u_h, N * sizeof(float), cudaMemcpyHostToDevice);
  }
  dim3 dimBlock(DET_BLOCK_SIZE);
  dim3 dimGrid(NUM_MATS);
  clock_t start = clock();
  for (int i = 0; i < 1000; i++)
  {
    update_inverse_transpose_cuda<float, DET_BLOCK_SIZE, N>
        <<<dimGrid, dimBlock>>>(AList_d, AinvList_d, uList_d, N, N, row);
    // update_inverse_transpose_cuda_2pass<float,DET_BLOCK_SIZE,N><<<dimGrid,dimBlock>>>
    //   (AList_d, AinvList_d, uList_d, N, N, row);
  }
  cudaDeviceSynchronize();
  clock_t end = clock();
  fprintf(stderr, "Rate = %12.8f updates per second.\n",
          (double)(1000 * NUM_MATS) / ((double)(end - start) / (double)CLOCKS_PER_SEC));
  cudaMemcpy(Ainv_h, AinvList[1], N * N * sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < N; i++)
    A[row * N + i] = u_h[i];
  for (int i = 0; i < N; i++)
    for (int j = 0; j < N; j++)
    {
      double ident = 0.0;
      for (int k = 0; k < N; k++)
        ident += Ainv_h[k * N + i] * A[k * N + j];
      if ((i == j && fabs(ident - 1.0) > 1.0e-4) || (i != j && fabs(ident) > 1.0e-4))
        fprintf(stderr, "Error in matrix inverse, (%d, %d) = %1.8f\n", i, j, ident);
    }
  fprintf(stderr, "Finished.\n");
}


void test_woodbury()
{
  int const N     = MAT_SIZE;
  int const M     = 16; // const so the check array below has a constant bound
  int updateBlock = 3;
  double *A, *Ainv, *Anew, *Anew_inv;
  int numMats = NUM_MATS;
  float *A_h, *Ainv_h, *delta_h, *Ainv_delta_h, *Anew_h, *Anew_inv_h;
  float *Ainv_d, *Ainv_delta_d, *Ainv_colk_d, *delta_d;
  A            = (double*)malloc(N * N * sizeof(double));
  Anew         = (double*)malloc(N * N * sizeof(double));
  Ainv         = (double*)malloc(N * N * sizeof(double));
  Anew_inv     = (double*)malloc(N * N * sizeof(double));
  Ainv_h       = (float*)malloc(N * N * sizeof(float));
  A_h          = (float*)malloc(N * N * sizeof(float));
  delta_h      = (float*)malloc(N * M * sizeof(float));
  Ainv_delta_h = (float*)malloc(N * M * sizeof(float));
  Anew_h       = (float*)malloc(N * N * sizeof(float));
  Anew_inv_h   = (float*)malloc(N * N * sizeof(float));
  cudaMalloc((void**)&Ainv_d, N * N * sizeof(float));
  cudaMalloc((void**)&delta_d, N * M * sizeof(float));
  cudaMalloc((void**)&Ainv_delta_d, N * M * sizeof(float));
  cudaMalloc((void**)&Ainv_colk_d, N * sizeof(float));
  float **AinvList, **Ainv_deltaList, **AList, **Ainv_colkList, **deltaList, **invBlockList;
  AList          = (float**)malloc(NUM_MATS * sizeof(float*));
  AinvList       = (float**)malloc(NUM_MATS * sizeof(float*));
  Ainv_deltaList = (float**)malloc(NUM_MATS * sizeof(float*));
  Ainv_colkList  = (float**)malloc(NUM_MATS * sizeof(float*));
  deltaList      = (float**)malloc(NUM_MATS * sizeof(float*));
  invBlockList   = (float**)malloc(NUM_MATS * sizeof(float*));
  float **AList_d, **AinvList_d, **Ainv_deltaList_d, **Ainv_colkList_d, **deltaList_d, **invBlockList_d;
  cudaMalloc((void**)&AinvList_d, numMats * sizeof(float*));
  cudaMalloc((void**)&Ainv_deltaList_d, numMats * sizeof(float*));
  cudaMalloc((void**)&deltaList_d, numMats * sizeof(float*));
  cudaMalloc((void**)&invBlockList_d, numMats * sizeof(float*));
  for (int mat = 0; mat < numMats; mat++)
  {
    //    cudaMalloc((void**)&(AList[mat])        , N*N*sizeof(float)+1000);
    cudaMalloc((void**)&(AinvList[mat]), N * N * sizeof(float) + 1000);
    cudaMalloc((void**)&(Ainv_deltaList[mat]), N * M * sizeof(float) + 1000);
    cudaMalloc((void**)&(deltaList[mat]), N * M * sizeof(float) + 1000);
    cudaMalloc((void**)&(invBlockList[mat]), M * M * sizeof(float) + 1000);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
      fprintf(stderr, "CUDA error in test_woodbury malloc:\n  %s\n", cudaGetErrorString(err));
      abort();
    }
  }
  cudaMemcpyAsync(AinvList_d, AinvList, numMats * sizeof(float*), cudaMemcpyHostToDevice);
  cudaMemcpyAsync(Ainv_deltaList_d, Ainv_deltaList, numMats * sizeof(float*), cudaMemcpyHostToDevice);
  cudaMemcpyAsync(deltaList_d, deltaList, numMats * sizeof(float*), cudaMemcpyHostToDevice);
  cudaMemcpyAsync(invBlockList_d, invBlockList, numMats * sizeof(float*), cudaMemcpyHostToDevice);
  srand48((long int)12341313);
  int row = 0;
  for (int mat = 0; mat < numMats; mat++)
  {
    if (mat == 0)
    {
      for (int i = 0; i < N; i++)
      {
        for (int j = 0; j < N; j++)
        {
          A[i * N + j] = Ainv[i * N + j] = A_h[i * N + j] = drand48();
          Anew[i * N + j] = Anew_inv[i * N + j] = A[i * N + j];
        }
      }
      for (int i = 0; i < 16; i++)
        for (int j = 0; j < N; j++)
        {
          delta_h[i * N + j]                   = drand48() - 0.5;
          Anew[(updateBlock * 16 + i) * N + j] = Anew_inv[(updateBlock * 16 + i) * N + j] =
              A[(updateBlock * 16 + i) * N + j] + delta_h[i * N + j];
        }
      // for (int i=0; i<N; i++)
      // 	delta_h[i] = A_h[row*N+i];
      MatrixInverse(Ainv, N);
      MatrixInverse(Anew_inv, N);
      for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
        {
          Ainv_h[i * N + j]     = (float)Ainv[j * N + i];
          Anew_inv_h[i * N + j] = (float)Anew_inv[j * N + i];
        }
    }
    // for (int i=0; i<N; i++)
    //   delta_h[i] = A_h[row*N+i];
    // cudaMemcpyAsync (AList[mat], A_h, N*N*sizeof(float),
    // 		cudaMemcpyHostToDevice);
    cudaMemcpyAsync(AinvList[mat], Ainv_h, N * N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpyAsync(deltaList[mat], delta_h, N * M * sizeof(float), cudaMemcpyHostToDevice);
  }
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    fprintf(stderr, "CUDA error in test_woodbury memcopy's:\n  %s\n", cudaGetErrorString(err));
    abort();
  }
  dim3 dimBlock2(64);
  dim3 dimGrida((N + 15) / 16, numMats);
  dim3 dimGridb((N + 31) / 32, numMats);
  //dim3 dimGrid2((N/32), numMats);
  double start = omp_get_wtime();
  for (int i = 0; i < 100; i++)
  {
    woodbury_update_16a<float>
        <<<dimGridb, dimBlock2>>>(AinvList_d, deltaList_d, Ainv_deltaList_d, invBlockList_d, N, N, updateBlock);
    woodbury_update_16b<float>
        <<<dimGridb, dimBlock2>>>(AinvList_d, deltaList_d, Ainv_deltaList_d, invBlockList_d, N, N, updateBlock);
  }
  cudaDeviceSynchronize();
  err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    fprintf(stderr, "CUDA error in woodbury_update_16:\n  %s\n", cudaGetErrorString(err));
    abort();
  }
  double end = omp_get_wtime();
  fprintf(stderr, "Rate = %12.8f updates per second.\n", (double)(100 * NUM_MATS) / (end - start));
  fprintf(stderr, "About to copy %zu back\n", N * M * sizeof(float));
  cudaMemcpy(Ainv_delta_h, Ainv_deltaList[0], N * M * sizeof(float), cudaMemcpyDeviceToHost);
  fprintf(stderr, "About to copy %zu back\n", N * N * sizeof(float));
  cudaMemcpy(Anew_inv_h, AinvList[0], N * N * sizeof(float), cudaMemcpyDeviceToHost);
  err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    fprintf(stderr, "CUDA error in test_woodbury memcopy back:\n  %s\n", cudaGetErrorString(err));
    abort();
  }
  fprintf(stderr, "Copied result back.\n");
  float Ainv_delta[N * M];
  for (int i = 0; i < N * M; i++)
    Ainv_delta[i] = 0.0;
  for (int i = 0; i < 16; i++)
    for (int j = 0; j < N; j++)
      for (int k = 0; k < N; k++)
        Ainv_delta[i * N + j] += Ainv_h[j * N + k] * delta_h[i * N + k];
  fprintf(stderr, "Ainv_delta_cpu = %1.8e\n", Ainv_delta[51]);
  fprintf(stderr, "Ainv_delta_gpu = %1.8e\n", Ainv_delta_h[51]);
  int i      = 10;
  int j      = 17;
  FILE* fcpu = fopen("Ainv_cpu.dat", "w");
  FILE* fgpu = fopen("Ainv_gpu.dat", "w");
  for (int i = 0; i < N; i++)
  {
    for (int j = 0; j < N; j++)
    {
      fprintf(fcpu, "%16.8e ", Anew_inv[i * N + j]);
      fprintf(fgpu, "%16.8e ", Anew_inv_h[j * N + i]);
    }
    fprintf(fcpu, "\n");
    fprintf(fgpu, "\n");
  }
  fclose(fcpu);
  fclose(fgpu);
  fprintf(stderr, "Anew_inv cpu = %1.8e\n", Anew_inv[i * N + j]);
  fprintf(stderr, "Anew_inv_gpu = %1.8e\n", Anew_inv_h[j * N + i]);
  // cudaMemcpy (Ainv_h, AinvList[0], N*N*sizeof(float),cudaMemcpyDeviceToHost);
  // for (int i=0; i<N; i++)
  //   A[row*N+i] = delta_h[i];
  // for (int i=0; i<N; i++)
  //   for (int j=0; j<N; j++) {
  //     double ident = 0.0;
  //     for (int k=0; k<N; k++)
  // 	ident += Ainv_h[i*N+k]*A[k*N+j];
  //     if ((i==j && fabs(ident - 1.0) > 1.0e-4) ||
  //     	  (i!=j && fabs(ident) > 1.0e-4))
  //     	fprintf (stderr, "Error in matrix inverse, (%d, %d) = %1.8f\n", i, j, ident);
  //   }
  fprintf(stderr, "Finished.\n");
}


// Note: compile with:
// nvcc -o determinant_update -DCUDA_TEST_MAIN -arch=sm_70 determinant_update.cu ../../../Platforms/CUDA_legacy/gpu_misc.cpp -I../../../Platforms -I../../../../../build_llvmdev_legacy_cuda_full_precision/src/ -lcublas -lopenblas -Xcompiler -fopenmp
// The build_llvm.../src path on the line above should be the location of a config.h produced by the QMCPACK cmake configuration.
int main()
{
  //test_all_ratios_kernel();
  // test_all_grad_lapl_kernel();
  test_update();
  // test_update_transpose();
  test_woodbury();

  return 0;
}

#endif