1 //////////////////////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source License.
3 // See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
6 //
7 // File developed by: Ken Esler, kpesler@gmail.com, University of Illinois at Urbana-Champaign
8 // Jeremy McMinnis, jmcminis@gmail.com, University of Illinois at Urbana-Champaign
9 // Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois at Urbana-Champaign
10 // Ying Wai Li, yingwaili@ornl.gov, Oak Ridge National Laboratory
11 // Ye Luo, yeluo@anl.gov, Argonne National Laboratory
12 //
13 // File created by: Ken Esler, kpesler@gmail.com, University of Illinois at Urbana-Champaign
14 //////////////////////////////////////////////////////////////////////////////////////
15
16
17 // ============ Matrix inversion using cuBLAS library ============ //
18 //
19 // To compile as a standalone test program:
20 //
21 // 1. Make sure libcublas.so is in the search path
22 // 2. cd to build/ directory
23 // 3. For real numbers, compile with
24 // nvcc -o cuda_inverse -arch=sm_35 -lcublas -DCUDA_TEST_MAIN
25 // ../src/Numerics/CUDA/cuda_inverse.cu
26 //
27 // For complex numbers, compile with
28 // nvcc -o cuda_inverse -arch=sm_35 -lcublas -DCUDA_TEST_MAIN
29 // -DQMC_COMPLEX=1 ../src/Numerics/CUDA/cuda_inverse.cu
30 //
31 // =============================================================== //
32
33 #include <cstdio>
34 #include <unistd.h>
35 #include <sstream>
36 #include <vector>
37 #include <iostream>
38 #include <complex>
39 #include <cuda.h>
40 #include <cublas_v2.h>
41 #include <cuComplex.h>
42
43 #define CONVERT_BS 256
44 #define INVERSE_BS 16
45
46
callAndCheckError(cudaError_t cudaFunc,const int line)47 void callAndCheckError(cudaError_t cudaFunc, const int line)
48 {
49 if (cudaFunc != cudaSuccess)
50 {
51 fprintf(stderr, "CUDA error in %s, line %d \n", __FILE__, line);
52 fprintf(stderr, "CUDA error message : %s \n", cudaGetErrorString(cudaFunc));
53 fflush(stderr);
54 abort();
55 }
56 }
57
callAndCheckError(cublasStatus_t cublasFunc,const int line)58 void callAndCheckError(cublasStatus_t cublasFunc, const int line)
59 {
60 if (cublasFunc != CUBLAS_STATUS_SUCCESS)
61 {
62 fprintf(stderr, "CUBLAS error in %s, line %d \n", __FILE__, line);
63 fprintf(stderr, "CUBLAS error message: ");
64 switch (cublasFunc)
65 {
66 case CUBLAS_STATUS_NOT_INITIALIZED:
67 fprintf(stderr, "CUBLAS_STATUS_NOT_INITIALIZED\n");
68 break;
69 case CUBLAS_STATUS_ALLOC_FAILED:
70 fprintf(stderr, "CUBLAS_STATUS_ALLOC_FAILED\n");
71 break;
72 case CUBLAS_STATUS_INVALID_VALUE:
73 fprintf(stderr, "CUBLAS_STATUS_INVALID_VALUE\n");
74 break;
75 case CUBLAS_STATUS_ARCH_MISMATCH:
76 fprintf(stderr, "CUBLAS_STATUS_ARCH_MISMATCH\n");
77 break;
78 case CUBLAS_STATUS_MAPPING_ERROR:
79 fprintf(stderr, "CUBLAS_STATUS_MAPPING_ERROR\n");
80 break;
81 case CUBLAS_STATUS_EXECUTION_FAILED:
82 fprintf(stderr, "CUBLAS_STATUS_EXECUTION_FAILED\n");
83 break;
84 case CUBLAS_STATUS_INTERNAL_ERROR:
85 fprintf(stderr, "CUBLAS_STATUS_INTERNAL_ERROR\n");
86 break;
87 #if (CUDA_VERSION >= 6050)
88 case CUBLAS_STATUS_NOT_SUPPORTED:
89 fprintf(stderr, "CUBLAS_STATUS_NOT_SUPPORTED\n");
90 break;
91 case CUBLAS_STATUS_LICENSE_ERROR:
92 fprintf(stderr, "CUBLAS_STATUS_LICENSE_ERROR\n");
93 break;
94 #endif
95 default:
96 fprintf(stderr, "unknown\n");
97 }
98 fflush(stderr);
99 abort();
100 }
101 }
102
103 // Convert matrix elements from one type (Tsrc) in the source matrix to
104 // another type (Tdest) and put them in the destination matrix
105 // (assumed src and dest have the same dimensions)
106 template<typename Tdest, typename Tsrc>
convert(Tdest ** dest_list,Tsrc ** src_list,int len)107 __global__ void convert(Tdest** dest_list, Tsrc** src_list, int len)
108 {
109 __shared__ Tsrc* mysrc;
110 __shared__ Tdest* mydest;
111 if (threadIdx.x == 0)
112 {
113 mysrc = src_list[blockIdx.y];
114 mydest = dest_list[blockIdx.y];
115 }
116 __syncthreads();
117 int i = blockIdx.x * CONVERT_BS + threadIdx.x;
118 if (i < len)
119 mydest[i] = (Tdest)mysrc[i];
120 }
121
122 // Convert for complex numbers
123 template<typename Tdest, typename Tdest2, typename Tsrc>
convert_complex(Tdest ** dest_list,Tsrc ** src_list,int len)124 __global__ void convert_complex(Tdest** dest_list, Tsrc** src_list, int len)
125 {
126 __shared__ Tsrc* mysrc;
127 __shared__ Tdest* mydest;
128 if (threadIdx.x == 0)
129 {
130 mysrc = src_list[blockIdx.y];
131 mydest = dest_list[blockIdx.y];
132 }
133 __syncthreads();
134 int i = blockIdx.x * CONVERT_BS + threadIdx.x;
135 if (i < len)
136 {
137 mydest[i].x = (Tdest2)mysrc[i].x;
138 mydest[i].y = (Tdest2)mysrc[i].y;
139 }
140 }
141
142 // Four matrix inversion functions
143 // 1. for float matrices
144 // useHigherPrecision = false --> single precision operations
145 // useHigherPrecision = true --> double precision operations (default)
cublas_inverse(cublasHandle_t handle,float * Alist_d[],float * Ainvlist_d[],float * AWorklist_d[],float * AinvWorklist_d[],int * PivotArray,int * infoArray,int N,int rowStride,int numMats,bool useHigherPrecision)146 void cublas_inverse(cublasHandle_t handle,
147 float* Alist_d[],
148 float* Ainvlist_d[],
149 float* AWorklist_d[],
150 float* AinvWorklist_d[],
151 int* PivotArray,
152 int* infoArray,
153 int N,
154 int rowStride,
155 int numMats,
156 bool useHigherPrecision)
157 {
158 // Info array tells if a matrix inversion is successful
159 // = 0 : successful
160 // = k : U(k,k) = 0; inversion failed
161
162 // If double precision operations are desired...
163 if (useHigherPrecision)
164 {
165 // (i) convert elements in Alist from float to double, put them in AWorklist
166 dim3 dimBlockConvert(CONVERT_BS);
167 dim3 dimGridConvert((N * rowStride + (CONVERT_BS - 1)) / CONVERT_BS, numMats);
168 convert<<<dimGridConvert, dimBlockConvert>>>((double**)AWorklist_d, Alist_d, N * rowStride);
169
170 // (ii) call cublas to do matrix inversion
171 // LU decomposition
172 callAndCheckError(cublasDgetrfBatched(handle, N, (double**)AWorklist_d, rowStride, PivotArray, infoArray, numMats),
173 __LINE__);
174
175 // Inversion
176 #if (CUDA_VERSION >= 6050)
177 callAndCheckError(cublasDgetriBatched(handle, N, (const double**)AWorklist_d, rowStride, PivotArray,
178 (double**)AinvWorklist_d, rowStride, infoArray + numMats, numMats),
179 __LINE__);
180 #else
181 callAndCheckError(cublasDgetriBatched(handle, N, (double**)AWorklist_d, rowStride, PivotArray,
182 (double**)AinvWorklist_d, rowStride, infoArray + numMats, numMats),
183 __LINE__);
184 #endif
185
186 // (iii) convert results back to single precision
187 convert<<<dimGridConvert, dimBlockConvert>>>(Ainvlist_d, (double**)AinvWorklist_d, N * rowStride);
188 }
189 // else, carry out single precision operations
190 else
191 {
192 // Call cublas to do matrix inversion
193 // LU decomposition
194 callAndCheckError(cublasSgetrfBatched(handle, N, Alist_d, rowStride, PivotArray, infoArray, numMats), __LINE__);
195
196 // Inversion
197 #if (CUDA_VERSION >= 6050)
198 callAndCheckError(cublasSgetriBatched(handle, N, (const float**)Alist_d, rowStride, PivotArray, Ainvlist_d,
199 rowStride, infoArray + numMats, numMats),
200 __LINE__);
201 #else
202 callAndCheckError(cublasSgetriBatched(handle, N, Alist_d, rowStride, PivotArray, Ainvlist_d, rowStride,
203 infoArray + numMats, numMats),
204 __LINE__);
205 #endif
206 }
207
208 cudaDeviceSynchronize();
209 }
210
211 // 2. for double matrices
cublas_inverse(cublasHandle_t handle,double * Alist_d[],double * Ainvlist_d[],double * AWorklist_d[],double * AinvWorklist_d[],int * PivotArray,int * infoArray,int N,int rowStride,int numMats,bool useHigherPrecision)212 void cublas_inverse(cublasHandle_t handle,
213 double* Alist_d[],
214 double* Ainvlist_d[],
215 double* AWorklist_d[],
216 double* AinvWorklist_d[],
217 int* PivotArray,
218 int* infoArray,
219 int N,
220 int rowStride,
221 int numMats,
222 bool useHigherPrecision)
223 {
224 // Info array tells if a matrix inversion is successful
225 // = 0 : successful
226 // = k : U(k,k) = 0; inversion failed
227
228 // (i) copy all the elements of Alist to AWorklist
229 dim3 dimBlockConvert(CONVERT_BS);
230 dim3 dimGridConvert((N * rowStride + (CONVERT_BS - 1)) / CONVERT_BS, numMats);
231 convert<<<dimGridConvert, dimBlockConvert>>>(AWorklist_d, Alist_d, N * rowStride);
232
233 // (ii) call cublas functions to do inversion
234 // LU decomposition
235 callAndCheckError(cublasDgetrfBatched(handle, N, AWorklist_d, rowStride, PivotArray, infoArray, numMats), __LINE__);
236
237 // Inversion
238 #if (CUDA_VERSION >= 6050)
239 callAndCheckError(cublasDgetriBatched(handle, N, (const double**)AWorklist_d, rowStride, PivotArray, Ainvlist_d,
240 rowStride, infoArray + numMats, numMats),
241 __LINE__);
242 #else
243 callAndCheckError(cublasDgetriBatched(handle, N, AWorklist_d, rowStride, PivotArray, Ainvlist_d, rowStride,
244 infoArray + numMats, numMats),
245 __LINE__);
246 #endif
247
248 cudaDeviceSynchronize();
249 }
250
251 // 3. for complex float matrices
252 // useHigherPrecision = false --> single precision operations
253 // useHigherPrecision = true --> double precision operations (default)
cublas_inverse(cublasHandle_t handle,std::complex<float> * Alist_d[],std::complex<float> * Ainvlist_d[],std::complex<float> * AWorklist_d[],std::complex<float> * AinvWorklist_d[],int * PivotArray,int * infoArray,int N,int rowStride,int numMats,bool useHigherPrecision)254 void cublas_inverse(cublasHandle_t handle,
255 std::complex<float>* Alist_d[],
256 std::complex<float>* Ainvlist_d[],
257 std::complex<float>* AWorklist_d[],
258 std::complex<float>* AinvWorklist_d[],
259 int* PivotArray,
260 int* infoArray,
261 int N,
262 int rowStride,
263 int numMats,
264 bool useHigherPrecision)
265 {
266 // Info array tells if a matrix inversion is successful
267 // = 0 : successful
268 // = k : U(k,k) = 0; inversion failed
269
270 // If double precision operations are desired...
271 if (useHigherPrecision)
272 {
273 // (i) convert elements in Alist from float to double, put them in AWorklist
274 dim3 dimBlockConvert(CONVERT_BS);
275 dim3 dimGridConvert((N * rowStride + (CONVERT_BS - 1)) / CONVERT_BS, numMats);
276 convert_complex<cuDoubleComplex, double, cuComplex>
277 <<<dimGridConvert, dimBlockConvert>>>((cuDoubleComplex**)AWorklist_d, (cuComplex**)Alist_d, N * rowStride);
278
279 // (ii) call cublas to do matrix inversion
280 // LU decomposition
281 callAndCheckError(cublasZgetrfBatched(handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, PivotArray, infoArray,
282 numMats),
283 __LINE__);
284 // Inversion
285 #if (CUDA_VERSION >= 6050)
286 callAndCheckError(cublasZgetriBatched(handle, N, (const cuDoubleComplex**)AWorklist_d, rowStride, PivotArray,
287 (cuDoubleComplex**)AinvWorklist_d, rowStride, infoArray + numMats, numMats),
288 __LINE__);
289 #else
290 callAndCheckError(cublasZgetriBatched(handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, PivotArray,
291 (cuDoubleComplex**)AinvWorklist_d, rowStride, infoArray + numMats, numMats),
292 __LINE__);
293 #endif
294
295 // (iii) convert results back to single precision
296 convert_complex<cuComplex, float, cuDoubleComplex>
297 <<<dimGridConvert, dimBlockConvert>>>((cuComplex**)Ainvlist_d, (cuDoubleComplex**)AinvWorklist_d,
298 N * rowStride);
299 }
300 // else, carry out single precision operations
301 else
302 {
303 // Call cublas to do matrix inversion
304 // LU decomposition
305 callAndCheckError(cublasCgetrfBatched(handle, N, (cuComplex**)Alist_d, rowStride, PivotArray, infoArray, numMats),
306 __LINE__);
307
308 // Inversion
309 #if (CUDA_VERSION >= 6050)
310 callAndCheckError(cublasCgetriBatched(handle, N, (const cuComplex**)Alist_d, rowStride, PivotArray,
311 (cuComplex**)Ainvlist_d, rowStride, infoArray + numMats, numMats),
312 __LINE__);
313 #else
314 callAndCheckError(cublasCgetriBatched(handle, N, (cuComplex**)Alist_d, rowStride, PivotArray,
315 (cuComplex**)Ainvlist_d, rowStride, infoArray + numMats, numMats),
316 __LINE__);
317 #endif
318 }
319
320 cudaDeviceSynchronize();
321 }
322
323 // 4. for complex double matrices
cublas_inverse(cublasHandle_t handle,std::complex<double> * Alist_d[],std::complex<double> * Ainvlist_d[],std::complex<double> * AWorklist_d[],std::complex<double> * AinvWorklist_d[],int * PivotArray,int * infoArray,int N,int rowStride,int numMats,bool useHigherPrecision)324 void cublas_inverse(cublasHandle_t handle,
325 std::complex<double>* Alist_d[],
326 std::complex<double>* Ainvlist_d[],
327 std::complex<double>* AWorklist_d[],
328 std::complex<double>* AinvWorklist_d[],
329 int* PivotArray,
330 int* infoArray,
331 int N,
332 int rowStride,
333 int numMats,
334 bool useHigherPrecision)
335 {
336 // Info array tells if a matrix inversion is successful
337 // = 0 : successful
338 // = k : U(k,k) = 0; inversion failed
339
340 // (i) copy all the elements of Alist to AWorklist
341 dim3 dimBlockConvert(CONVERT_BS);
342 dim3 dimGridConvert((N * rowStride + (CONVERT_BS - 1)) / CONVERT_BS, numMats);
343 convert_complex<cuDoubleComplex, double, cuDoubleComplex>
344 <<<dimGridConvert, dimBlockConvert>>>((cuDoubleComplex**)AWorklist_d, (cuDoubleComplex**)Alist_d, N * rowStride);
345
346 // (ii) call cublas to do matrix inversion
347 // LU decomposition
348 callAndCheckError(cublasZgetrfBatched(handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, PivotArray, infoArray,
349 numMats),
350 __LINE__);
351 // Inversion
352 #if (CUDA_VERSION >= 6050)
353 callAndCheckError(cublasZgetriBatched(handle, N, (const cuDoubleComplex**)AWorklist_d, rowStride, PivotArray,
354 (cuDoubleComplex**)Ainvlist_d, rowStride, infoArray + numMats, numMats),
355 __LINE__);
356 #else
357 callAndCheckError(cublasZgetriBatched(handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, PivotArray,
358 (cuDoubleComplex**)Ainvlist_d, rowStride, infoArray + numMats, numMats),
359 __LINE__);
360 #endif
361
362 cudaDeviceSynchronize();
363 }
364
365
366 //////////////////////////////////////////////////////
367 // Test routines //
368 //////////////////////////////////////////////////////
369
370 #ifdef CUDA_TEST_MAIN
371
372 template<typename T>
test_cublas_inverse(int matSize,int numMats)373 void test_cublas_inverse(int matSize, int numMats)
374 {
375 // Initialize cublas
376 cublasHandle_t handle;
377 callAndCheckError(cublasCreate(&handle), __LINE__);
378
379 srand48((long)12394);
380 int N = matSize;
381 int row_stride = (matSize + 15) / 16 * 16;
382 T **Alist, **AWorklist;
383 T **Alist_d, **AWorklist_d;
384 T **Clist, **CWorklist;
385 T **Clist_d, **CWorklist_d;
386
387 // Allocate arrays of pointers (one set on host, one set on device)
388 // pointing to the starting address (on device) of each matrix and its buffer
389 // (similar to DiracDeterminantCUDA)
390 Alist = (T**)malloc(numMats * sizeof(T*));
391 callAndCheckError(cudaMalloc((void**)&Alist_d, numMats * sizeof(T*)), __LINE__);
392
393 AWorklist = (T**)malloc(numMats * sizeof(T*));
394 callAndCheckError(cudaMalloc((void**)&AWorklist_d, numMats * sizeof(T*)), __LINE__);
395
396 Clist = (T**)malloc(numMats * sizeof(T*));
397 callAndCheckError(cudaMalloc((void**)&Clist_d, numMats * sizeof(T*)), __LINE__);
398
399 CWorklist = (T**)malloc(numMats * sizeof(T*));
400 callAndCheckError(cudaMalloc((void**)&CWorklist_d, numMats * sizeof(T*)), __LINE__);
401
402 // Generate matrices filled with random numbers
403 T* A = (T*)malloc(sizeof(T) * numMats * N * row_stride);
404
405 for (int j = 0; j < numMats; j++)
406 for (int i = 0; i < N * row_stride; i++)
407 {
408 #ifndef QMC_COMPLEX
409 A[j * N * row_stride + i] = 1.0 * (drand48() - 0.5);
410 #else
411 A[j * N * row_stride + i] = T(1.0 * (drand48() - 0.5), 1.0 * (drand48() - 0.5));
412 #endif
413 }
414
415 // Allocate memory on device for each matrix
416 for (int mat = 0; mat < numMats; mat++)
417 {
418 callAndCheckError(cudaMalloc((void**)&(Alist[mat]), N * row_stride * sizeof(T)), __LINE__);
419
420 callAndCheckError(cudaMemcpyAsync(Alist[mat], &A[mat * N * row_stride], N * row_stride * sizeof(T),
421 cudaMemcpyHostToDevice),
422 __LINE__);
423
424 callAndCheckError(cudaMalloc((void**)&(AWorklist[mat]), 2 * N * row_stride * sizeof(T)), __LINE__);
425
426 callAndCheckError(cudaMalloc((void**)&(Clist[mat]), N * row_stride * sizeof(T)), __LINE__);
427
428 callAndCheckError(cudaMalloc((void**)&(CWorklist[mat]), 2 * N * row_stride * sizeof(T)), __LINE__);
429 }
430
431 // Copy the starting address of each matrix
432 callAndCheckError(cudaMemcpyAsync(Alist_d, Alist, numMats * sizeof(T*), cudaMemcpyHostToDevice), __LINE__);
433
434 callAndCheckError(cudaMemcpyAsync(AWorklist_d, AWorklist, numMats * sizeof(T*), cudaMemcpyHostToDevice), __LINE__);
435
436 callAndCheckError(cudaMemcpyAsync(Clist_d, Clist, numMats * sizeof(T*), cudaMemcpyHostToDevice), __LINE__);
437
438 callAndCheckError(cudaMemcpyAsync(CWorklist_d, CWorklist, numMats * sizeof(T*), cudaMemcpyHostToDevice), __LINE__);
439
440 cudaDeviceSynchronize();
441
442 clock_t start = clock();
443
444 // Call cublas functions to do inversion
445 cublas_inverse(handle, Alist_d, Clist_d, AWorklist_d, CWorklist_d, N, row_stride, numMats, true);
446 cudaDeviceSynchronize();
447
448 clock_t end = clock();
449 double t = double(end - start) / double(CLOCKS_PER_SEC) / double(numMats);
450 double rate = 1.0 / t;
451 fprintf(stderr, "Rate is %1.3f matrix inversions per second.\n", rate);
452
453 // Copy A^(-1) back to host memory Ainv; one matrix at a time
454 // Calculate error of A^(-1)A from unit matrix I
455 for (int mat = 0; mat < numMats; mat++)
456 {
457 T Ainv[N * row_stride];
458 callAndCheckError(cudaMemcpy(Ainv, Clist[mat], N * row_stride * sizeof(T), cudaMemcpyDeviceToHost), __LINE__);
459
460 double error = 0.0;
461 for (int i = 0; i < N; i++)
462 for (int j = 0; j < N; j++)
463 {
464 T val = 0.0;
465 for (int k = 0; k < N; k++)
466 val += Ainv[i * row_stride + k] * A[mat * N * row_stride + k * row_stride + j];
467 double diff = (i == j) ? (1.0f - std::real(val)) : std::real(val);
468 error += diff * diff;
469 }
470 fprintf(stderr, "error = %1.8e\n", sqrt(error / (double)(N * N)));
471 }
472
473 // Finalize cublas
474 callAndCheckError(cublasDestroy(handle), __LINE__);
475
476 // Free resources on both host and device
477 for (int mat = 0; mat < numMats; mat++)
478 {
479 cudaFree(Alist[mat]);
480 cudaFree(Clist[mat]);
481 cudaFree(AWorklist[mat]);
482 cudaFree(CWorklist[mat]);
483 }
484 cudaFree(Alist_d);
485 cudaFree(Clist_d);
486 cudaFree(AWorklist_d);
487 cudaFree(CWorklist_d);
488 free(Alist);
489 free(Clist);
490 free(AWorklist);
491 free(CWorklist);
492 free(A);
493
494 // Reset device. Required for memory leak debugging
495 cudaDeviceReset();
496 }
497
498
main(int argc,char ** argv)499 int main(int argc, char** argv)
500 {
501 int matSize = 0;
502 int numMats = 0;
503
504 if (argc == 3)
505 {
506 matSize = atoi(argv[1]);
507 numMats = atoi(argv[2]);
508 }
509 else
510 {
511 printf("Usage: ./cuda_inverse [matrix size] [number of matrices]\n");
512 exit(1);
513 }
514
515 #ifndef QMC_COMPLEX
516 test_cublas_inverse<double>(matSize, numMats);
517 test_cublas_inverse<float>(matSize, numMats);
518 #else
519 test_cublas_inverse<std::complex<double>>(matSize, numMats);
520 test_cublas_inverse<std::complex<float>>(matSize, numMats);
521 #endif
522
523 return 0;
524 }
525
526 #endif
527