1 //////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source
3 // License.  See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
6 //
7 // File developed by:
8 //    Lawrence Livermore National Laboratory
9 //
10 // File created by:
11 // Miguel A. Morales, moralessilva2@llnl.gov
12 //    Lawrence Livermore National Laboratory
13 ////////////////////////////////////////////////////////////////////////////////
14 
15 #ifndef AFQMC_LAPACK_GPU_HPP
16 #define AFQMC_LAPACK_GPU_HPP
17 
#include <cassert>
#include <iostream>
#include <stdexcept>
#include <vector>

#include "AFQMC/Utilities/type_conversion.hpp"
#include "AFQMC/Memory/custom_pointers.hpp"
#include "AFQMC/Memory/arch.hpp"
#include "AFQMC/Numerics/detail/CUDA/cublas_wrapper.hpp"
#include "AFQMC/Numerics/detail/CUDA/cusolver_wrapper.hpp"
#include "AFQMC/Numerics/detail/CUDA/Kernels/setIdentity.cuh"
25 
26 namespace device
27 {
28 using qmcplusplus::afqmc::remove_complex;
29 
30 template<typename T, typename R, typename I>
hevr(char JOBZ,char RANGE,char UPLO,int N,device_pointer<T> A,int LDA,T VL,T VU,int IL,int IU,T ABSTOL,int & M,device_pointer<T> W,device_pointer<T> Z,int LDZ,device_pointer<I> ISUPPZ,device_pointer<T> WORK,int & LWORK,device_pointer<R> RWORK,int & LRWORK,device_pointer<I> IWORK,int & LIWORK,int & INFO)31 inline static void hevr(char JOBZ,
32                         char RANGE,
33                         char UPLO,
34                         int N,
35                         device_pointer<T> A,
36                         int LDA,
37                         T VL,
38                         T VU,
39                         int IL,
40                         int IU,
41                         T ABSTOL,
42                         int& M,
43                         device_pointer<T> W,
44                         device_pointer<T> Z,
45                         int LDZ,
46                         device_pointer<I> ISUPPZ,
47                         device_pointer<T> WORK,
48                         int& LWORK,
49                         device_pointer<R> RWORK,
50                         int& LRWORK,
51                         device_pointer<I> IWORK,
52                         int& LIWORK,
53                         int& INFO)
54 {
55   throw std::runtime_error("Error: hevr not implemented in gpu.");
56 }
57 
58 // getrf_bufferSize
59 template<typename T>
getrf_bufferSize(const int n,const int m,device_pointer<T> a,int lda,int & lwork)60 inline static void getrf_bufferSize(const int n, const int m, device_pointer<T> a, int lda, int& lwork)
61 {
62   cusolver::cusolver_getrf_bufferSize(*a.handles.cusolverDn_handle, n, m, to_address(a), lda, &lwork);
63 }
64 
65 template<typename T, typename R, typename I>
getrf(const int n,const int m,device_pointer<T> && a,int lda,device_pointer<I> && piv,int & st,device_pointer<R> work)66 inline static void getrf(const int n,
67                          const int m,
68                          device_pointer<T>&& a,
69                          int lda,
70                          device_pointer<I>&& piv,
71                          int& st,
72                          device_pointer<R> work)
73 {
74   cusolverStatus_t status = cusolver::cusolver_getrf(*a.handles.cusolverDn_handle, n, m, to_address(a), lda,
75                                                      to_address(work), to_address(piv), to_address(piv) + n);
76   arch::memcopy(&st, to_address(piv) + n, sizeof(int), arch::memcopyD2H);
77   if (CUSOLVER_STATUS_SUCCESS != status)
78   {
79     std::cerr << " cublas_getrf status, info: " << status << " " << st << std::endl;
80     std::cerr.flush();
81     throw std::runtime_error("Error: cublas_getrf returned error code.");
82   }
83 }
84 
85 // getrfBatched
86 template<typename T, typename I>
getrfBatched(const int n,device_pointer<T> * a,int lda,device_pointer<I> piv,device_pointer<I> info,int batchSize)87 inline static void getrfBatched(const int n,
88                                 device_pointer<T>* a,
89                                 int lda,
90                                 device_pointer<I> piv,
91                                 device_pointer<I> info,
92                                 int batchSize)
93 {
94   T** A_d;
95   T** A_h;
96   A_h = new T*[batchSize];
97   for (int i = 0; i < batchSize; i++)
98     A_h[i] = to_address(a[i]);
99   arch::malloc((void**)&A_d, batchSize * sizeof(*A_h));
100   arch::memcopy(A_d, A_h, batchSize * sizeof(*A_h), arch::memcopyH2D);
101   cublasStatus_t status = cublas::cublas_getrfBatched(*(a[0]).handles.cublas_handle, n, A_d, lda, to_address(piv),
102                                                       to_address(info), batchSize);
103   if (CUBLAS_STATUS_SUCCESS != status)
104     throw std::runtime_error("Error: cublas_getrf returned error code.");
105   arch::free(A_d);
106   delete[] A_h;
107 }
108 
109 // getri_bufferSize
110 template<typename T>
getri_bufferSize(int n,device_pointer<T> a,int lda,int & lwork)111 inline static void getri_bufferSize(int n, device_pointer<T> a, int lda, int& lwork)
112 {
113   // gpu uses getrs to invert matrix, which requires n*n workspace
114   lwork = n * n;
115 }
116 
117 // write separate query function to avoid hack!!!
118 template<typename T, typename R, typename I>
getri(int n,device_pointer<T> a,int lda,device_pointer<I> piv,device_pointer<R> work,int n1,int & status)119 inline static void getri(int n,
120                          device_pointer<T> a,
121                          int lda,
122                          device_pointer<I> piv,
123                          device_pointer<R> work,
124                          int n1,
125                          int& status)
126 {
127   if (n1 < n * n)
128     throw std::runtime_error("Error: getri<GPU_MEMORY_POINTER_TYPE> required buffer space of n*n.");
129   if (lda != n)
130     throw std::runtime_error("Error: getri<GPU_MEMORY_POINTER_TYPE> required lda = 1.");
131 
132   int* info;
133   arch::malloc((void**)&info, sizeof(int), "lapack_cuda_gpu_ptr::getri");
134 
135   kernels::set_identity(n, n, to_address(work), n);
136   if (CUSOLVER_STATUS_SUCCESS !=
137       cusolver::cusolver_getrs(*a.handles.cusolverDn_handle, CUBLAS_OP_N, n, n, to_address(a), lda, to_address(piv),
138                                to_address(work), n, info))
139     throw std::runtime_error("Error: cusolver_getrs returned error code.");
140   arch::memcopy(to_address(a), to_address(work), n * n * sizeof(T), arch::memcopyD2D);
141   arch::memcopy(&status, info, sizeof(int), arch::memcopyD2H);
142   arch::free(info);
143 }
144 
145 // getriBatched
146 template<typename T, typename I>
getriBatched(int n,device_pointer<T> * a,int lda,device_pointer<I> piv,device_pointer<T> * ainv,int ldc,device_pointer<I> info,int batchSize)147 inline static void getriBatched(int n,
148                                 device_pointer<T>* a,
149                                 int lda,
150                                 device_pointer<I> piv,
151                                 device_pointer<T>* ainv,
152                                 int ldc,
153                                 device_pointer<I> info,
154                                 int batchSize)
155 {
156   T **A_d, **C_d;
157   T **A_h, **C_h;
158   A_h = new T*[batchSize];
159   C_h = new T*[batchSize];
160   for (int i = 0; i < batchSize; i++)
161   {
162     A_h[i] = to_address(a[i]);
163     C_h[i] = to_address(ainv[i]);
164   }
165   arch::malloc((void**)&A_d, batchSize * sizeof(*A_h));
166   arch::malloc((void**)&C_d, batchSize * sizeof(*C_h));
167   arch::memcopy(A_d, A_h, batchSize * sizeof(*A_h), arch::memcopyH2D);
168   arch::memcopy(C_d, C_h, batchSize * sizeof(*C_h), arch::memcopyH2D);
169   cublasStatus_t status = cublas::cublas_getriBatched(*(a[0]).handles.cublas_handle, n, A_d, lda, to_address(piv), C_d,
170                                                       ldc, to_address(info), batchSize);
171   if (CUBLAS_STATUS_SUCCESS != status)
172     throw std::runtime_error("Error: cublas_getri returned error code.");
173   arch::free(A_d);
174   arch::free(C_d);
175   delete[] A_h;
176   delete[] C_h;
177 }
178 
179 // matinveBatched
180 template<typename T1, typename T2, typename I>
matinvBatched(int n,device_pointer<T1> * a,int lda,device_pointer<T2> * ainv,int lda_inv,device_pointer<I> info,int batchSize)181 inline static void matinvBatched(int n,
182                                  device_pointer<T1>* a,
183                                  int lda,
184                                  device_pointer<T2>* ainv,
185                                  int lda_inv,
186                                  device_pointer<I> info,
187                                  int batchSize)
188 {
189   T1 **A_d, **A_h;
190   T2 **C_d, **C_h;
191   A_h = new T1*[batchSize];
192   C_h = new T2*[batchSize];
193   for (int i = 0; i < batchSize; i++)
194   {
195     A_h[i] = to_address(a[i]);
196     C_h[i] = to_address(ainv[i]);
197   }
198   arch::malloc((void**)&A_d, batchSize * sizeof(*A_h));
199   arch::malloc((void**)&C_d, batchSize * sizeof(*C_h));
200   arch::memcopy(A_d, A_h, batchSize * sizeof(*A_h), arch::memcopyH2D);
201   arch::memcopy(C_d, C_h, batchSize * sizeof(*C_h), arch::memcopyH2D);
202   cublasStatus_t status = cublas::cublas_matinvBatched(*(a[0]).handles.cublas_handle, n, A_d, lda, C_d, lda_inv,
203                                                        to_address(info), batchSize);
204   if (CUBLAS_STATUS_SUCCESS != status)
205     throw std::runtime_error("Error: cublas_matinv returned error code.");
206   arch::free(A_d);
207   arch::free(C_d);
208   delete[] A_h;
209   delete[] C_h;
210 }
211 
212 // geqrf
213 template<typename T>
geqrf_bufferSize(int m,int n,device_pointer<T> a,int lda,int & lwork)214 inline static void geqrf_bufferSize(int m, int n, device_pointer<T> a, int lda, int& lwork)
215 {
216   if (CUSOLVER_STATUS_SUCCESS !=
217       cusolver::cusolver_geqrf_bufferSize(*a.handles.cusolverDn_handle, m, n, to_address(a), lda, &lwork))
218     throw std::runtime_error("Error: cusolver_geqrf_bufferSize returned error code.");
219 }
220 
221 template<typename T>
geqrf(int M,int N,device_pointer<T> A,const int LDA,device_pointer<T> TAU,device_pointer<T> WORK,int LWORK,int & INFO)222 inline static void geqrf(int M,
223                          int N,
224                          device_pointer<T> A,
225                          const int LDA,
226                          device_pointer<T> TAU,
227                          device_pointer<T> WORK,
228                          int LWORK,
229                          int& INFO)
230 {
231   // allocating here for now
232   int* piv;
233   arch::malloc((void**)&piv, sizeof(int), "lapack_cuda_gpu_ptr::geqrf");
234 
235   cusolverStatus_t status = cusolver::cusolver_geqrf(*A.handles.cusolverDn_handle, M, N, to_address(A), LDA,
236                                                      to_address(TAU), to_address(WORK), LWORK, piv);
237   arch::memcopy(&INFO, piv, sizeof(int), arch::memcopyD2H);
238   if (CUSOLVER_STATUS_SUCCESS != status)
239   {
240     int st;
241     std::cerr << " cublas_geqrf status, info: " << status << " " << INFO << std::endl;
242     std::cerr.flush();
243     throw std::runtime_error("Error: cublas_geqrf returned error code.");
244   }
245   arch::free(piv);
246 }
247 
248 // gelqf
249 template<typename T>
gelqf_bufferSize(int m,int n,device_pointer<T> a,int lda,int & lwork)250 inline static void gelqf_bufferSize(int m, int n, device_pointer<T> a, int lda, int& lwork)
251 {
252   lwork = 0;
253 }
254 
255 template<typename T>
gelqf(int M,int N,device_pointer<T> A,const int LDA,device_pointer<T> TAU,device_pointer<T> WORK,int LWORK,int & INFO)256 inline static void gelqf(int M,
257                          int N,
258                          device_pointer<T> A,
259                          const int LDA,
260                          device_pointer<T> TAU,
261                          device_pointer<T> WORK,
262                          int LWORK,
263                          int& INFO)
264 {
265   throw std::runtime_error("Error: gelqf not implemented in CUDA backend. \n");
266 }
267 
268 // gqr
269 template<typename T>
gqr_bufferSize(int m,int n,int k,device_pointer<T> a,int lda,int & lwork)270 static void gqr_bufferSize(int m, int n, int k, device_pointer<T> a, int lda, int& lwork)
271 {
272   if (CUSOLVER_STATUS_SUCCESS !=
273       cusolver::cusolver_gqr_bufferSize(*a.handles.cusolverDn_handle, m, n, k, to_address(a), lda, &lwork))
274     throw std::runtime_error("Error: cusolver_gqr_bufferSize returned error code.");
275 }
276 
277 template<typename T>
gqr(int M,int N,int K,device_pointer<T> A,const int LDA,device_pointer<T> TAU,device_pointer<T> WORK,int LWORK,int & INFO)278 void static gqr(int M,
279                 int N,
280                 int K,
281                 device_pointer<T> A,
282                 const int LDA,
283                 device_pointer<T> TAU,
284                 device_pointer<T> WORK,
285                 int LWORK,
286                 int& INFO)
287 {
288   // allocating here for now
289   int* piv;
290   arch::malloc((void**)&piv, sizeof(int), "lapack_cuda_gpu_ptr::gqr");
291 
292   cusolverStatus_t status = cusolver::cusolver_gqr(*A.handles.cusolverDn_handle, M, N, K, to_address(A), LDA,
293                                                    to_address(TAU), to_address(WORK), LWORK, piv);
294   arch::memcopy(&INFO, piv, sizeof(int), arch::memcopyD2H);
295   if (CUSOLVER_STATUS_SUCCESS != status)
296   {
297     int st;
298     std::cerr << " cublas_gqr status, info: " << status << " " << INFO << std::endl;
299     std::cerr.flush();
300     throw std::runtime_error("Error: cublas_gqr returned error code.");
301   }
302   arch::free(piv);
303 }
304 
305 template<typename T, typename I>
gqrStrided(int M,int N,int K,device_pointer<T> A,const int LDA,const int Astride,device_pointer<T> TAU,const int Tstride,device_pointer<T> WORK,int LWORK,device_pointer<I> info,int batchSize)306 void static gqrStrided(int M,
307                        int N,
308                        int K,
309                        device_pointer<T> A,
310                        const int LDA,
311                        const int Astride,
312                        device_pointer<T> TAU,
313                        const int Tstride,
314                        device_pointer<T> WORK,
315                        int LWORK,
316                        device_pointer<I> info,
317                        int batchSize)
318 {
319   cusolverStatus_t status =
320       cusolver::cusolver_gqr_strided(*A.handles.cusolverDn_handle, M, N, K, to_address(A), LDA, Astride,
321                                      to_address(TAU), Tstride, to_address(WORK), LWORK, to_address(info), batchSize);
322   if (CUSOLVER_STATUS_SUCCESS != status)
323   {
324     std::cerr << " cublas_gqr_strided status: " << status << std::endl;
325     std::cerr.flush();
326     throw std::runtime_error("Error: cublas_gqr_strided returned error code.");
327   }
328 }
329 
330 // glq
331 template<typename T>
glq_bufferSize(int m,int n,int k,device_pointer<T> a,int lda,int & lwork)332 static void glq_bufferSize(int m, int n, int k, device_pointer<T> a, int lda, int& lwork)
333 {
334   lwork = 0;
335 }
336 
337 template<typename T>
glq(int M,int N,int K,device_pointer<T> A,const int LDA,device_pointer<T> TAU,device_pointer<T> WORK,int LWORK,int & INFO)338 void static glq(int M,
339                 int N,
340                 int K,
341                 device_pointer<T> A,
342                 const int LDA,
343                 device_pointer<T> TAU,
344                 device_pointer<T> WORK,
345                 int LWORK,
346                 int& INFO)
347 {
348   throw std::runtime_error("Error: glq not implemented in CUDA backend. \n");
349 }
350 
351 // batched geqrf
352 template<typename T, typename I>
geqrfBatched(int M,int N,device_pointer<T> * A,const int LDA,device_pointer<T> * TAU,device_pointer<I> info,int batchSize)353 inline static void geqrfBatched(int M,
354                                 int N,
355                                 device_pointer<T>* A,
356                                 const int LDA,
357                                 device_pointer<T>* TAU,
358                                 device_pointer<I> info,
359                                 int batchSize)
360 {
361   T** B_h = new T*[2 * batchSize];
362   T** A_h(B_h);
363   T** T_h(B_h + batchSize);
364   for (int i = 0; i < batchSize; i++)
365     A_h[i] = to_address(A[i]);
366   for (int i = 0; i < batchSize; i++)
367     T_h[i] = to_address(TAU[i]);
368   T** B_d;
369   std::vector<int> inf(batchSize);
370   arch::malloc((void**)&B_d, 2 * batchSize * sizeof(*B_h));
371   arch::memcopy(B_d, B_h, 2 * batchSize * sizeof(*B_h), arch::memcopyH2D);
372   T** A_d(B_d);
373   T** T_d(B_d + batchSize);
374   cublasStatus_t status = cublas::cublas_geqrfBatched(*(A[0]).handles.cublas_handle, M, N, A_d, LDA, T_d,
375                                                       to_address(inf.data()), batchSize);
376   if (CUBLAS_STATUS_SUCCESS != status)
377     throw std::runtime_error("Error: cublas_geqrfBatched returned error code.");
378   arch::free(B_d);
379   delete[] B_h;
380 }
381 
382 template<typename T, typename I>
geqrfStrided(int M,int N,device_pointer<T> A,const int LDA,const int Astride,device_pointer<T> TAU,const int Tstride,device_pointer<I> info,int batchSize)383 inline static void geqrfStrided(int M,
384                                 int N,
385                                 device_pointer<T> A,
386                                 const int LDA,
387                                 const int Astride,
388                                 device_pointer<T> TAU,
389                                 const int Tstride,
390                                 device_pointer<I> info,
391                                 int batchSize)
392 {
393   /*
394     T **B_h = new T*[2*batchSize];
395     T **A_h(B_h);
396     T **T_h(B_h+batchSize);
397     for(int i=0; i<batchSize; i++)
398       A_h[i] = to_address(A)+i*Astride;
399     for(int i=0; i<batchSize; i++)
400       T_h[i] = to_address(TAU)+i*Tstride;
401     T **B_d;
402     arch::malloc((void **)&B_d,  2*batchSize*sizeof(*B_h));
403     arch::memcopy(B_d, B_h, 2*batchSize*sizeof(*B_h), arch::memcopyH2D);
404     T **A_d(B_d);
405     T **T_d(B_d+batchSize);
406 */
407 
408   std::vector<int> inf(batchSize);
409   T** A_h = new T*[batchSize];
410   T** T_h = new T*[batchSize];
411   for (int i = 0; i < batchSize; i++)
412     A_h[i] = to_address(A) + i * Astride;
413   for (int i = 0; i < batchSize; i++)
414     T_h[i] = to_address(TAU) + i * Tstride;
415   T **A_d, **T_d;
416   arch::malloc((void**)&A_d, batchSize * sizeof(*A_h));
417   arch::memcopy(A_d, A_h, batchSize * sizeof(*A_h), arch::memcopyH2D);
418   arch::malloc((void**)&T_d, batchSize * sizeof(*T_h));
419   arch::memcopy(T_d, T_h, batchSize * sizeof(*T_h), arch::memcopyH2D);
420   cublasStatus_t status =
421       cublas::cublas_geqrfBatched(*A.handles.cublas_handle, M, N, A_d, LDA, T_d, to_address(inf.data()), batchSize);
422   for (int i = 0; i < batchSize; i++)
423     assert(inf[i] == 0);
424   if (CUBLAS_STATUS_SUCCESS != status)
425     throw std::runtime_error("Error: cublas_geqrfBatched returned error code.");
426   arch::free(A_d);
427   delete[] A_h;
428   arch::free(T_d);
429   delete[] T_h;
430 }
431 
432 // gesvd_bufferSize
433 template<typename T>
gesvd_bufferSize(const int m,const int n,device_pointer<T> a,int & lwork)434 inline static void gesvd_bufferSize(const int m, const int n, device_pointer<T> a, int& lwork)
435 {
436   cusolver::cusolver_gesvd_bufferSize(*a.handles.cusolverDn_handle, m, n, to_address(a), &lwork);
437 }
438 
439 template<typename T, typename R>
gesvd(char jobU,char jobVT,const int m,const int n,device_pointer<T> && A,int lda,device_pointer<R> && S,device_pointer<T> && U,int ldu,device_pointer<T> && VT,int ldvt,device_pointer<T> && W,int lw,int & st)440 inline static void gesvd(char jobU,
441                          char jobVT,
442                          const int m,
443                          const int n,
444                          device_pointer<T>&& A,
445                          int lda,
446                          device_pointer<R>&& S,
447                          device_pointer<T>&& U,
448                          int ldu,
449                          device_pointer<T>&& VT,
450                          int ldvt,
451                          device_pointer<T>&& W,
452                          int lw,
453                          int& st)
454 {
455   int* devSt;
456   arch::malloc((void**)&devSt, sizeof(int));
457   cusolverStatus_t status =
458       cusolver::cusolver_gesvd(*A.handles.cusolverDn_handle, jobU, jobVT, m, n, to_address(A), lda, to_address(S),
459                                to_address(U), ldu, to_address(VT), ldvt, to_address(W), lw, devSt);
460   arch::memcopy(&st, devSt, sizeof(int), arch::memcopyD2H);
461   if (CUSOLVER_STATUS_SUCCESS != status)
462   {
463     std::cerr << " cublas_gesvd status, info: " << status << " " << st << std::endl;
464     std::cerr.flush();
465     throw std::runtime_error("Error: cublas_gesvd returned error code.");
466   }
467   arch::free(devSt);
468 }
469 
470 template<typename T, typename R>
gesvd(char jobU,char jobVT,const int m,const int n,device_pointer<T> && A,int lda,device_pointer<R> && S,device_pointer<T> && U,int ldu,device_pointer<T> && VT,int ldvt,device_pointer<T> && W,int lw,device_pointer<R> && RW,int & st)471 inline static void gesvd(char jobU,
472                          char jobVT,
473                          const int m,
474                          const int n,
475                          device_pointer<T>&& A,
476                          int lda,
477                          device_pointer<R>&& S,
478                          device_pointer<T>&& U,
479                          int ldu,
480                          device_pointer<T>&& VT,
481                          int ldvt,
482                          device_pointer<T>&& W,
483                          int lw,
484                          device_pointer<R>&& RW,
485                          int& st)
486 {
487   int* devSt;
488   arch::malloc((void**)&devSt, sizeof(int));
489   cusolverStatus_t status =
490       cusolver::cusolver_gesvd(*A.handles.cusolverDn_handle, jobU, jobVT, m, n, to_address(A), lda, to_address(S),
491                                to_address(U), ldu, to_address(VT), ldvt, to_address(W), lw, devSt);
492   arch::memcopy(&st, devSt, sizeof(int), arch::memcopyD2H);
493   if (CUSOLVER_STATUS_SUCCESS != status)
494   {
495     std::cerr << " cublas_gesvd status, info: " << status << " " << st << std::endl;
496     std::cerr.flush();
497     throw std::runtime_error("Error: cublas_gesvd returned error code.");
498   }
499   arch::free(devSt);
500 }
501 
502 
503 } // namespace device
504 
505 #endif
506