1 //////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source
3 // License. See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
6 //
7 // File developed by:
8 // Lawrence Livermore National Laboratory
9 //
10 // File created by:
11 // Miguel A. Morales, moralessilva2@llnl.gov
12 // Lawrence Livermore National Laboratory
13 ////////////////////////////////////////////////////////////////////////////////
14
15 #ifndef AFQMC_LAPACK_GPU_HPP
16 #define AFQMC_LAPACK_GPU_HPP
17
18 #include <cassert>
19 #include "AFQMC/Utilities/type_conversion.hpp"
20 #include "AFQMC/Memory/custom_pointers.hpp"
21 #include "AFQMC/Memory/arch.hpp"
22 #include "AFQMC/Numerics/detail/CUDA/cublas_wrapper.hpp"
23 #include "AFQMC/Numerics/detail/CUDA/cusolver_wrapper.hpp"
24 #include "AFQMC/Numerics/detail/CUDA/Kernels/setIdentity.cuh"
25
26 namespace device
27 {
28 using qmcplusplus::afqmc::remove_complex;
29
30 template<typename T, typename R, typename I>
hevr(char JOBZ,char RANGE,char UPLO,int N,device_pointer<T> A,int LDA,T VL,T VU,int IL,int IU,T ABSTOL,int & M,device_pointer<T> W,device_pointer<T> Z,int LDZ,device_pointer<I> ISUPPZ,device_pointer<T> WORK,int & LWORK,device_pointer<R> RWORK,int & LRWORK,device_pointer<I> IWORK,int & LIWORK,int & INFO)31 inline static void hevr(char JOBZ,
32 char RANGE,
33 char UPLO,
34 int N,
35 device_pointer<T> A,
36 int LDA,
37 T VL,
38 T VU,
39 int IL,
40 int IU,
41 T ABSTOL,
42 int& M,
43 device_pointer<T> W,
44 device_pointer<T> Z,
45 int LDZ,
46 device_pointer<I> ISUPPZ,
47 device_pointer<T> WORK,
48 int& LWORK,
49 device_pointer<R> RWORK,
50 int& LRWORK,
51 device_pointer<I> IWORK,
52 int& LIWORK,
53 int& INFO)
54 {
55 throw std::runtime_error("Error: hevr not implemented in gpu.");
56 }
57
58 // getrf_bufferSize
59 template<typename T>
getrf_bufferSize(const int n,const int m,device_pointer<T> a,int lda,int & lwork)60 inline static void getrf_bufferSize(const int n, const int m, device_pointer<T> a, int lda, int& lwork)
61 {
62 cusolver::cusolver_getrf_bufferSize(*a.handles.cusolverDn_handle, n, m, to_address(a), lda, &lwork);
63 }
64
65 template<typename T, typename R, typename I>
getrf(const int n,const int m,device_pointer<T> && a,int lda,device_pointer<I> && piv,int & st,device_pointer<R> work)66 inline static void getrf(const int n,
67 const int m,
68 device_pointer<T>&& a,
69 int lda,
70 device_pointer<I>&& piv,
71 int& st,
72 device_pointer<R> work)
73 {
74 cusolverStatus_t status = cusolver::cusolver_getrf(*a.handles.cusolverDn_handle, n, m, to_address(a), lda,
75 to_address(work), to_address(piv), to_address(piv) + n);
76 arch::memcopy(&st, to_address(piv) + n, sizeof(int), arch::memcopyD2H);
77 if (CUSOLVER_STATUS_SUCCESS != status)
78 {
79 std::cerr << " cublas_getrf status, info: " << status << " " << st << std::endl;
80 std::cerr.flush();
81 throw std::runtime_error("Error: cublas_getrf returned error code.");
82 }
83 }
84
85 // getrfBatched
86 template<typename T, typename I>
getrfBatched(const int n,device_pointer<T> * a,int lda,device_pointer<I> piv,device_pointer<I> info,int batchSize)87 inline static void getrfBatched(const int n,
88 device_pointer<T>* a,
89 int lda,
90 device_pointer<I> piv,
91 device_pointer<I> info,
92 int batchSize)
93 {
94 T** A_d;
95 T** A_h;
96 A_h = new T*[batchSize];
97 for (int i = 0; i < batchSize; i++)
98 A_h[i] = to_address(a[i]);
99 arch::malloc((void**)&A_d, batchSize * sizeof(*A_h));
100 arch::memcopy(A_d, A_h, batchSize * sizeof(*A_h), arch::memcopyH2D);
101 cublasStatus_t status = cublas::cublas_getrfBatched(*(a[0]).handles.cublas_handle, n, A_d, lda, to_address(piv),
102 to_address(info), batchSize);
103 if (CUBLAS_STATUS_SUCCESS != status)
104 throw std::runtime_error("Error: cublas_getrf returned error code.");
105 arch::free(A_d);
106 delete[] A_h;
107 }
108
109 // getri_bufferSize
110 template<typename T>
getri_bufferSize(int n,device_pointer<T> a,int lda,int & lwork)111 inline static void getri_bufferSize(int n, device_pointer<T> a, int lda, int& lwork)
112 {
113 // gpu uses getrs to invert matrix, which requires n*n workspace
114 lwork = n * n;
115 }
116
117 // write separate query function to avoid hack!!!
118 template<typename T, typename R, typename I>
getri(int n,device_pointer<T> a,int lda,device_pointer<I> piv,device_pointer<R> work,int n1,int & status)119 inline static void getri(int n,
120 device_pointer<T> a,
121 int lda,
122 device_pointer<I> piv,
123 device_pointer<R> work,
124 int n1,
125 int& status)
126 {
127 if (n1 < n * n)
128 throw std::runtime_error("Error: getri<GPU_MEMORY_POINTER_TYPE> required buffer space of n*n.");
129 if (lda != n)
130 throw std::runtime_error("Error: getri<GPU_MEMORY_POINTER_TYPE> required lda = 1.");
131
132 int* info;
133 arch::malloc((void**)&info, sizeof(int), "lapack_cuda_gpu_ptr::getri");
134
135 kernels::set_identity(n, n, to_address(work), n);
136 if (CUSOLVER_STATUS_SUCCESS !=
137 cusolver::cusolver_getrs(*a.handles.cusolverDn_handle, CUBLAS_OP_N, n, n, to_address(a), lda, to_address(piv),
138 to_address(work), n, info))
139 throw std::runtime_error("Error: cusolver_getrs returned error code.");
140 arch::memcopy(to_address(a), to_address(work), n * n * sizeof(T), arch::memcopyD2D);
141 arch::memcopy(&status, info, sizeof(int), arch::memcopyD2H);
142 arch::free(info);
143 }
144
145 // getriBatched
146 template<typename T, typename I>
getriBatched(int n,device_pointer<T> * a,int lda,device_pointer<I> piv,device_pointer<T> * ainv,int ldc,device_pointer<I> info,int batchSize)147 inline static void getriBatched(int n,
148 device_pointer<T>* a,
149 int lda,
150 device_pointer<I> piv,
151 device_pointer<T>* ainv,
152 int ldc,
153 device_pointer<I> info,
154 int batchSize)
155 {
156 T **A_d, **C_d;
157 T **A_h, **C_h;
158 A_h = new T*[batchSize];
159 C_h = new T*[batchSize];
160 for (int i = 0; i < batchSize; i++)
161 {
162 A_h[i] = to_address(a[i]);
163 C_h[i] = to_address(ainv[i]);
164 }
165 arch::malloc((void**)&A_d, batchSize * sizeof(*A_h));
166 arch::malloc((void**)&C_d, batchSize * sizeof(*C_h));
167 arch::memcopy(A_d, A_h, batchSize * sizeof(*A_h), arch::memcopyH2D);
168 arch::memcopy(C_d, C_h, batchSize * sizeof(*C_h), arch::memcopyH2D);
169 cublasStatus_t status = cublas::cublas_getriBatched(*(a[0]).handles.cublas_handle, n, A_d, lda, to_address(piv), C_d,
170 ldc, to_address(info), batchSize);
171 if (CUBLAS_STATUS_SUCCESS != status)
172 throw std::runtime_error("Error: cublas_getri returned error code.");
173 arch::free(A_d);
174 arch::free(C_d);
175 delete[] A_h;
176 delete[] C_h;
177 }
178
179 // matinveBatched
180 template<typename T1, typename T2, typename I>
matinvBatched(int n,device_pointer<T1> * a,int lda,device_pointer<T2> * ainv,int lda_inv,device_pointer<I> info,int batchSize)181 inline static void matinvBatched(int n,
182 device_pointer<T1>* a,
183 int lda,
184 device_pointer<T2>* ainv,
185 int lda_inv,
186 device_pointer<I> info,
187 int batchSize)
188 {
189 T1 **A_d, **A_h;
190 T2 **C_d, **C_h;
191 A_h = new T1*[batchSize];
192 C_h = new T2*[batchSize];
193 for (int i = 0; i < batchSize; i++)
194 {
195 A_h[i] = to_address(a[i]);
196 C_h[i] = to_address(ainv[i]);
197 }
198 arch::malloc((void**)&A_d, batchSize * sizeof(*A_h));
199 arch::malloc((void**)&C_d, batchSize * sizeof(*C_h));
200 arch::memcopy(A_d, A_h, batchSize * sizeof(*A_h), arch::memcopyH2D);
201 arch::memcopy(C_d, C_h, batchSize * sizeof(*C_h), arch::memcopyH2D);
202 cublasStatus_t status = cublas::cublas_matinvBatched(*(a[0]).handles.cublas_handle, n, A_d, lda, C_d, lda_inv,
203 to_address(info), batchSize);
204 if (CUBLAS_STATUS_SUCCESS != status)
205 throw std::runtime_error("Error: cublas_matinv returned error code.");
206 arch::free(A_d);
207 arch::free(C_d);
208 delete[] A_h;
209 delete[] C_h;
210 }
211
212 // geqrf
213 template<typename T>
geqrf_bufferSize(int m,int n,device_pointer<T> a,int lda,int & lwork)214 inline static void geqrf_bufferSize(int m, int n, device_pointer<T> a, int lda, int& lwork)
215 {
216 if (CUSOLVER_STATUS_SUCCESS !=
217 cusolver::cusolver_geqrf_bufferSize(*a.handles.cusolverDn_handle, m, n, to_address(a), lda, &lwork))
218 throw std::runtime_error("Error: cusolver_geqrf_bufferSize returned error code.");
219 }
220
221 template<typename T>
geqrf(int M,int N,device_pointer<T> A,const int LDA,device_pointer<T> TAU,device_pointer<T> WORK,int LWORK,int & INFO)222 inline static void geqrf(int M,
223 int N,
224 device_pointer<T> A,
225 const int LDA,
226 device_pointer<T> TAU,
227 device_pointer<T> WORK,
228 int LWORK,
229 int& INFO)
230 {
231 // allocating here for now
232 int* piv;
233 arch::malloc((void**)&piv, sizeof(int), "lapack_cuda_gpu_ptr::geqrf");
234
235 cusolverStatus_t status = cusolver::cusolver_geqrf(*A.handles.cusolverDn_handle, M, N, to_address(A), LDA,
236 to_address(TAU), to_address(WORK), LWORK, piv);
237 arch::memcopy(&INFO, piv, sizeof(int), arch::memcopyD2H);
238 if (CUSOLVER_STATUS_SUCCESS != status)
239 {
240 int st;
241 std::cerr << " cublas_geqrf status, info: " << status << " " << INFO << std::endl;
242 std::cerr.flush();
243 throw std::runtime_error("Error: cublas_geqrf returned error code.");
244 }
245 arch::free(piv);
246 }
247
248 // gelqf
249 template<typename T>
gelqf_bufferSize(int m,int n,device_pointer<T> a,int lda,int & lwork)250 inline static void gelqf_bufferSize(int m, int n, device_pointer<T> a, int lda, int& lwork)
251 {
252 lwork = 0;
253 }
254
255 template<typename T>
gelqf(int M,int N,device_pointer<T> A,const int LDA,device_pointer<T> TAU,device_pointer<T> WORK,int LWORK,int & INFO)256 inline static void gelqf(int M,
257 int N,
258 device_pointer<T> A,
259 const int LDA,
260 device_pointer<T> TAU,
261 device_pointer<T> WORK,
262 int LWORK,
263 int& INFO)
264 {
265 throw std::runtime_error("Error: gelqf not implemented in CUDA backend. \n");
266 }
267
268 // gqr
269 template<typename T>
gqr_bufferSize(int m,int n,int k,device_pointer<T> a,int lda,int & lwork)270 static void gqr_bufferSize(int m, int n, int k, device_pointer<T> a, int lda, int& lwork)
271 {
272 if (CUSOLVER_STATUS_SUCCESS !=
273 cusolver::cusolver_gqr_bufferSize(*a.handles.cusolverDn_handle, m, n, k, to_address(a), lda, &lwork))
274 throw std::runtime_error("Error: cusolver_gqr_bufferSize returned error code.");
275 }
276
277 template<typename T>
gqr(int M,int N,int K,device_pointer<T> A,const int LDA,device_pointer<T> TAU,device_pointer<T> WORK,int LWORK,int & INFO)278 void static gqr(int M,
279 int N,
280 int K,
281 device_pointer<T> A,
282 const int LDA,
283 device_pointer<T> TAU,
284 device_pointer<T> WORK,
285 int LWORK,
286 int& INFO)
287 {
288 // allocating here for now
289 int* piv;
290 arch::malloc((void**)&piv, sizeof(int), "lapack_cuda_gpu_ptr::gqr");
291
292 cusolverStatus_t status = cusolver::cusolver_gqr(*A.handles.cusolverDn_handle, M, N, K, to_address(A), LDA,
293 to_address(TAU), to_address(WORK), LWORK, piv);
294 arch::memcopy(&INFO, piv, sizeof(int), arch::memcopyD2H);
295 if (CUSOLVER_STATUS_SUCCESS != status)
296 {
297 int st;
298 std::cerr << " cublas_gqr status, info: " << status << " " << INFO << std::endl;
299 std::cerr.flush();
300 throw std::runtime_error("Error: cublas_gqr returned error code.");
301 }
302 arch::free(piv);
303 }
304
305 template<typename T, typename I>
gqrStrided(int M,int N,int K,device_pointer<T> A,const int LDA,const int Astride,device_pointer<T> TAU,const int Tstride,device_pointer<T> WORK,int LWORK,device_pointer<I> info,int batchSize)306 void static gqrStrided(int M,
307 int N,
308 int K,
309 device_pointer<T> A,
310 const int LDA,
311 const int Astride,
312 device_pointer<T> TAU,
313 const int Tstride,
314 device_pointer<T> WORK,
315 int LWORK,
316 device_pointer<I> info,
317 int batchSize)
318 {
319 cusolverStatus_t status =
320 cusolver::cusolver_gqr_strided(*A.handles.cusolverDn_handle, M, N, K, to_address(A), LDA, Astride,
321 to_address(TAU), Tstride, to_address(WORK), LWORK, to_address(info), batchSize);
322 if (CUSOLVER_STATUS_SUCCESS != status)
323 {
324 std::cerr << " cublas_gqr_strided status: " << status << std::endl;
325 std::cerr.flush();
326 throw std::runtime_error("Error: cublas_gqr_strided returned error code.");
327 }
328 }
329
330 // glq
331 template<typename T>
glq_bufferSize(int m,int n,int k,device_pointer<T> a,int lda,int & lwork)332 static void glq_bufferSize(int m, int n, int k, device_pointer<T> a, int lda, int& lwork)
333 {
334 lwork = 0;
335 }
336
337 template<typename T>
glq(int M,int N,int K,device_pointer<T> A,const int LDA,device_pointer<T> TAU,device_pointer<T> WORK,int LWORK,int & INFO)338 void static glq(int M,
339 int N,
340 int K,
341 device_pointer<T> A,
342 const int LDA,
343 device_pointer<T> TAU,
344 device_pointer<T> WORK,
345 int LWORK,
346 int& INFO)
347 {
348 throw std::runtime_error("Error: glq not implemented in CUDA backend. \n");
349 }
350
351 // batched geqrf
352 template<typename T, typename I>
geqrfBatched(int M,int N,device_pointer<T> * A,const int LDA,device_pointer<T> * TAU,device_pointer<I> info,int batchSize)353 inline static void geqrfBatched(int M,
354 int N,
355 device_pointer<T>* A,
356 const int LDA,
357 device_pointer<T>* TAU,
358 device_pointer<I> info,
359 int batchSize)
360 {
361 T** B_h = new T*[2 * batchSize];
362 T** A_h(B_h);
363 T** T_h(B_h + batchSize);
364 for (int i = 0; i < batchSize; i++)
365 A_h[i] = to_address(A[i]);
366 for (int i = 0; i < batchSize; i++)
367 T_h[i] = to_address(TAU[i]);
368 T** B_d;
369 std::vector<int> inf(batchSize);
370 arch::malloc((void**)&B_d, 2 * batchSize * sizeof(*B_h));
371 arch::memcopy(B_d, B_h, 2 * batchSize * sizeof(*B_h), arch::memcopyH2D);
372 T** A_d(B_d);
373 T** T_d(B_d + batchSize);
374 cublasStatus_t status = cublas::cublas_geqrfBatched(*(A[0]).handles.cublas_handle, M, N, A_d, LDA, T_d,
375 to_address(inf.data()), batchSize);
376 if (CUBLAS_STATUS_SUCCESS != status)
377 throw std::runtime_error("Error: cublas_geqrfBatched returned error code.");
378 arch::free(B_d);
379 delete[] B_h;
380 }
381
382 template<typename T, typename I>
geqrfStrided(int M,int N,device_pointer<T> A,const int LDA,const int Astride,device_pointer<T> TAU,const int Tstride,device_pointer<I> info,int batchSize)383 inline static void geqrfStrided(int M,
384 int N,
385 device_pointer<T> A,
386 const int LDA,
387 const int Astride,
388 device_pointer<T> TAU,
389 const int Tstride,
390 device_pointer<I> info,
391 int batchSize)
392 {
393 /*
394 T **B_h = new T*[2*batchSize];
395 T **A_h(B_h);
396 T **T_h(B_h+batchSize);
397 for(int i=0; i<batchSize; i++)
398 A_h[i] = to_address(A)+i*Astride;
399 for(int i=0; i<batchSize; i++)
400 T_h[i] = to_address(TAU)+i*Tstride;
401 T **B_d;
402 arch::malloc((void **)&B_d, 2*batchSize*sizeof(*B_h));
403 arch::memcopy(B_d, B_h, 2*batchSize*sizeof(*B_h), arch::memcopyH2D);
404 T **A_d(B_d);
405 T **T_d(B_d+batchSize);
406 */
407
408 std::vector<int> inf(batchSize);
409 T** A_h = new T*[batchSize];
410 T** T_h = new T*[batchSize];
411 for (int i = 0; i < batchSize; i++)
412 A_h[i] = to_address(A) + i * Astride;
413 for (int i = 0; i < batchSize; i++)
414 T_h[i] = to_address(TAU) + i * Tstride;
415 T **A_d, **T_d;
416 arch::malloc((void**)&A_d, batchSize * sizeof(*A_h));
417 arch::memcopy(A_d, A_h, batchSize * sizeof(*A_h), arch::memcopyH2D);
418 arch::malloc((void**)&T_d, batchSize * sizeof(*T_h));
419 arch::memcopy(T_d, T_h, batchSize * sizeof(*T_h), arch::memcopyH2D);
420 cublasStatus_t status =
421 cublas::cublas_geqrfBatched(*A.handles.cublas_handle, M, N, A_d, LDA, T_d, to_address(inf.data()), batchSize);
422 for (int i = 0; i < batchSize; i++)
423 assert(inf[i] == 0);
424 if (CUBLAS_STATUS_SUCCESS != status)
425 throw std::runtime_error("Error: cublas_geqrfBatched returned error code.");
426 arch::free(A_d);
427 delete[] A_h;
428 arch::free(T_d);
429 delete[] T_h;
430 }
431
432 // gesvd_bufferSize
433 template<typename T>
gesvd_bufferSize(const int m,const int n,device_pointer<T> a,int & lwork)434 inline static void gesvd_bufferSize(const int m, const int n, device_pointer<T> a, int& lwork)
435 {
436 cusolver::cusolver_gesvd_bufferSize(*a.handles.cusolverDn_handle, m, n, to_address(a), &lwork);
437 }
438
439 template<typename T, typename R>
gesvd(char jobU,char jobVT,const int m,const int n,device_pointer<T> && A,int lda,device_pointer<R> && S,device_pointer<T> && U,int ldu,device_pointer<T> && VT,int ldvt,device_pointer<T> && W,int lw,int & st)440 inline static void gesvd(char jobU,
441 char jobVT,
442 const int m,
443 const int n,
444 device_pointer<T>&& A,
445 int lda,
446 device_pointer<R>&& S,
447 device_pointer<T>&& U,
448 int ldu,
449 device_pointer<T>&& VT,
450 int ldvt,
451 device_pointer<T>&& W,
452 int lw,
453 int& st)
454 {
455 int* devSt;
456 arch::malloc((void**)&devSt, sizeof(int));
457 cusolverStatus_t status =
458 cusolver::cusolver_gesvd(*A.handles.cusolverDn_handle, jobU, jobVT, m, n, to_address(A), lda, to_address(S),
459 to_address(U), ldu, to_address(VT), ldvt, to_address(W), lw, devSt);
460 arch::memcopy(&st, devSt, sizeof(int), arch::memcopyD2H);
461 if (CUSOLVER_STATUS_SUCCESS != status)
462 {
463 std::cerr << " cublas_gesvd status, info: " << status << " " << st << std::endl;
464 std::cerr.flush();
465 throw std::runtime_error("Error: cublas_gesvd returned error code.");
466 }
467 arch::free(devSt);
468 }
469
470 template<typename T, typename R>
gesvd(char jobU,char jobVT,const int m,const int n,device_pointer<T> && A,int lda,device_pointer<R> && S,device_pointer<T> && U,int ldu,device_pointer<T> && VT,int ldvt,device_pointer<T> && W,int lw,device_pointer<R> && RW,int & st)471 inline static void gesvd(char jobU,
472 char jobVT,
473 const int m,
474 const int n,
475 device_pointer<T>&& A,
476 int lda,
477 device_pointer<R>&& S,
478 device_pointer<T>&& U,
479 int ldu,
480 device_pointer<T>&& VT,
481 int ldvt,
482 device_pointer<T>&& W,
483 int lw,
484 device_pointer<R>&& RW,
485 int& st)
486 {
487 int* devSt;
488 arch::malloc((void**)&devSt, sizeof(int));
489 cusolverStatus_t status =
490 cusolver::cusolver_gesvd(*A.handles.cusolverDn_handle, jobU, jobVT, m, n, to_address(A), lda, to_address(S),
491 to_address(U), ldu, to_address(VT), ldvt, to_address(W), lw, devSt);
492 arch::memcopy(&st, devSt, sizeof(int), arch::memcopyD2H);
493 if (CUSOLVER_STATUS_SUCCESS != status)
494 {
495 std::cerr << " cublas_gesvd status, info: " << status << " " << st << std::endl;
496 std::cerr.flush();
497 throw std::runtime_error("Error: cublas_gesvd returned error code.");
498 }
499 arch::free(devSt);
500 }
501
502
503 } // namespace device
504
505 #endif
506