//////////////////////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source License.
// See LICENSE file in top directory for details.
//
// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
//
// File developed by: Ken Esler, kpesler@gmail.com, University of Illinois at Urbana-Champaign
//                    Jeremy McMinnis, jmcminis@gmail.com, University of Illinois at Urbana-Champaign
//                    Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois at Urbana-Champaign
//                    Ying Wai Li, yingwaili@ornl.gov, Oak Ridge National Laboratory
//                    Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//
// File created by:  Ken Esler, kpesler@gmail.com, University of Illinois at Urbana-Champaign
//////////////////////////////////////////////////////////////////////////////////////


// ============ Matrix inversion using the cuBLAS library ============ //
//
// To compile as a standalone test program:
//
// 1. Make sure libcublas.so is in the linker search path
// 2. cd to the build/ directory
// 3. For real numbers, compile with
//    nvcc -o cuda_inverse -arch=sm_35 -lcublas -DCUDA_TEST_MAIN
//         ../src/Numerics/CUDA/cuda_inverse.cu
//
//    For complex numbers, compile with
//    nvcc -o cuda_inverse -arch=sm_35 -lcublas -DCUDA_TEST_MAIN
//         -DQMC_COMPLEX=1 ../src/Numerics/CUDA/cuda_inverse.cu
//
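//    To run the resulting binary (argument parsing is in main() below):
//    ./cuda_inverse [matrix size] [number of matrices]
//    e.g. ./cuda_inverse 128 64   (sizes chosen here purely for illustration)
//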
// =============================================================== //

#include <cstdio>
#include <cstdlib> // srand48/drand48, atoi, malloc/free, abort
#include <cmath>   // sqrt
#include <ctime>   // clock, CLOCKS_PER_SEC
#include <unistd.h>
#include <sstream>
#include <vector>
#include <iostream>
#include <complex>
#include <cuda.h>
#include <cublas_v2.h>
#include <cuComplex.h>

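// Thread-block sizes: CONVERT_BS threads per block for the type-conversion
// kernels below. INVERSE_BS is not referenced in this translation unit and is
// presumably kept for code that shares these constants.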
#define CONVERT_BS 256
#define INVERSE_BS 16


void callAndCheckError(cudaError_t cudaFunc, const int line)
{
  if (cudaFunc != cudaSuccess)
  {
    fprintf(stderr, "CUDA error in %s, line %d \n", __FILE__, line);
    fprintf(stderr, "CUDA error message : %s \n", cudaGetErrorString(cudaFunc));
    fflush(stderr);
    abort();
  }
}
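// Typical call site, as used throughout this file (ptr and bytes are
// placeholders):
//   callAndCheckError(cudaMalloc((void**)&ptr, bytes), __LINE__);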

void callAndCheckError(cublasStatus_t cublasFunc, const int line)
{
  if (cublasFunc != CUBLAS_STATUS_SUCCESS)
  {
    fprintf(stderr, "CUBLAS error in %s, line %d \n", __FILE__, line);
    fprintf(stderr, "CUBLAS error message: ");
    switch (cublasFunc)
    {
    case CUBLAS_STATUS_NOT_INITIALIZED:
      fprintf(stderr, "CUBLAS_STATUS_NOT_INITIALIZED\n");
      break;
    case CUBLAS_STATUS_ALLOC_FAILED:
      fprintf(stderr, "CUBLAS_STATUS_ALLOC_FAILED\n");
      break;
    case CUBLAS_STATUS_INVALID_VALUE:
      fprintf(stderr, "CUBLAS_STATUS_INVALID_VALUE\n");
      break;
    case CUBLAS_STATUS_ARCH_MISMATCH:
      fprintf(stderr, "CUBLAS_STATUS_ARCH_MISMATCH\n");
      break;
    case CUBLAS_STATUS_MAPPING_ERROR:
      fprintf(stderr, "CUBLAS_STATUS_MAPPING_ERROR\n");
      break;
    case CUBLAS_STATUS_EXECUTION_FAILED:
      fprintf(stderr, "CUBLAS_STATUS_EXECUTION_FAILED\n");
      break;
    case CUBLAS_STATUS_INTERNAL_ERROR:
      fprintf(stderr, "CUBLAS_STATUS_INTERNAL_ERROR\n");
      break;
#if (CUDA_VERSION >= 6050)
    case CUBLAS_STATUS_NOT_SUPPORTED:
      fprintf(stderr, "CUBLAS_STATUS_NOT_SUPPORTED\n");
      break;
    case CUBLAS_STATUS_LICENSE_ERROR:
      fprintf(stderr, "CUBLAS_STATUS_LICENSE_ERROR\n");
      break;
#endif
    default:
      fprintf(stderr, "unknown\n");
    }
    fflush(stderr);
    abort();
  }
}

// Convert matrix elements from one type (Tsrc) in the source matrix to
// another type (Tdest) and put them in the destination matrix
// (src and dest are assumed to have the same dimensions)
template<typename Tdest, typename Tsrc>
__global__ void convert(Tdest** dest_list, Tsrc** src_list, int len)
{
  __shared__ Tsrc* mysrc;
  __shared__ Tdest* mydest;
  if (threadIdx.x == 0)
  {
    mysrc  = src_list[blockIdx.y];
    mydest = dest_list[blockIdx.y];
  }
  __syncthreads();
  int i = blockIdx.x * CONVERT_BS + threadIdx.x;
  if (i < len)
    mydest[i] = (Tdest)mysrc[i];
}
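// Launch pattern used by the drivers below: a 2D grid in which grid.x tiles
// the len elements of one matrix in CONVERT_BS-sized chunks and grid.y selects
// the matrix within the batch. For example:
//   dim3 block(CONVERT_BS);
//   dim3 grid((len + CONVERT_BS - 1) / CONVERT_BS, numMats);
//   convert<<<grid, block>>>(dest_list, src_list, len);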

// The same conversion for complex types; Tdest2 is the scalar component type
// of Tdest (e.g. double when Tdest is cuDoubleComplex)
template<typename Tdest, typename Tdest2, typename Tsrc>
__global__ void convert_complex(Tdest** dest_list, Tsrc** src_list, int len)
{
  __shared__ Tsrc* mysrc;
  __shared__ Tdest* mydest;
  if (threadIdx.x == 0)
  {
    mysrc  = src_list[blockIdx.y];
    mydest = dest_list[blockIdx.y];
  }
  __syncthreads();
  int i = blockIdx.x * CONVERT_BS + threadIdx.x;
  if (i < len)
  {
    mydest[i].x = (Tdest2)mysrc[i].x;
    mydest[i].y = (Tdest2)mysrc[i].y;
  }
}
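// For instance, the single-to-double promotion used below is instantiated as
//   convert_complex<cuDoubleComplex, double, cuComplex>
//       <<<grid, block>>>(dest_list, src_list, len);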

// Four matrix inversion functions
// 1. for float matrices
//    useHigherPrecision = false --> single precision operations
//    useHigherPrecision = true  --> double precision operations  (default)
void cublas_inverse(cublasHandle_t handle,
                    float* Alist_d[],
                    float* Ainvlist_d[],
                    float* AWorklist_d[],
                    float* AinvWorklist_d[],
                    int* PivotArray,
                    int* infoArray,
                    int N,
                    int rowStride,
                    int numMats,
                    bool useHigherPrecision)
{
  // infoArray reports whether each matrix inversion succeeded:
  // = 0 : successful
  // = k : U(k,k) = 0; inversion failed
  // Entries [0, numMats) receive the getrf statuses and entries
  // [numMats, 2*numMats) the getri statuses, so infoArray must have room for
  // 2 * numMats ints.

  // If double precision operations are desired...
  if (useHigherPrecision)
  {
    // (i)   convert elements in Alist from float to double, put them in AWorklist
    dim3 dimBlockConvert(CONVERT_BS);
    dim3 dimGridConvert((N * rowStride + (CONVERT_BS - 1)) / CONVERT_BS, numMats);
    convert<<<dimGridConvert, dimBlockConvert>>>((double**)AWorklist_d, Alist_d, N * rowStride);

    // (ii)  call cublas to do matrix inversion
    //       LU decomposition
    callAndCheckError(cublasDgetrfBatched(handle, N, (double**)AWorklist_d, rowStride, PivotArray, infoArray, numMats),
                      __LINE__);

    //       Inversion
#if (CUDA_VERSION >= 6050)
    callAndCheckError(cublasDgetriBatched(handle, N, (const double**)AWorklist_d, rowStride, PivotArray,
                                          (double**)AinvWorklist_d, rowStride, infoArray + numMats, numMats),
                      __LINE__);
#else
    callAndCheckError(cublasDgetriBatched(handle, N, (double**)AWorklist_d, rowStride, PivotArray,
                                          (double**)AinvWorklist_d, rowStride, infoArray + numMats, numMats),
                      __LINE__);
#endif

    // (iii) convert results back to single precision
    convert<<<dimGridConvert, dimBlockConvert>>>(Ainvlist_d, (double**)AinvWorklist_d, N * rowStride);
  }
  // else, carry out single precision operations
  else
  {
    // Call cublas to do matrix inversion
    // LU decomposition
    callAndCheckError(cublasSgetrfBatched(handle, N, Alist_d, rowStride, PivotArray, infoArray, numMats), __LINE__);

    // Inversion
#if (CUDA_VERSION >= 6050)
    callAndCheckError(cublasSgetriBatched(handle, N, (const float**)Alist_d, rowStride, PivotArray, Ainvlist_d,
                                          rowStride, infoArray + numMats, numMats),
                      __LINE__);
#else
    callAndCheckError(cublasSgetriBatched(handle, N, Alist_d, rowStride, PivotArray, Ainvlist_d, rowStride,
                                          infoArray + numMats, numMats),
                      __LINE__);
#endif
  }

  cudaDeviceSynchronize();
}
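// A host-side success check a caller might perform afterwards (a sketch;
// assumes infoArray lives in device memory with the 2 * numMats entries
// described above):
//   std::vector<int> info(2 * numMats);
//   cudaMemcpy(info.data(), infoArray, info.size() * sizeof(int),
//              cudaMemcpyDeviceToHost);
//   for (int m = 0; m < numMats; m++)
//     if (info[m] != 0 || info[m + numMats] != 0)
//       fprintf(stderr, "inversion of matrix %d failed\n", m);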

// 2. for double matrices
//    (useHigherPrecision is accepted for interface uniformity but unused here;
//    double is already the higher precision)
void cublas_inverse(cublasHandle_t handle,
                    double* Alist_d[],
                    double* Ainvlist_d[],
                    double* AWorklist_d[],
                    double* AinvWorklist_d[],
                    int* PivotArray,
                    int* infoArray,
                    int N,
                    int rowStride,
                    int numMats,
                    bool useHigherPrecision)
{
  // Info array tells if a matrix inversion is successful
  // = 0 : successful
  // = k : U(k,k) = 0; inversion failed

  // (i)   copy all the elements of Alist to AWorklist
  dim3 dimBlockConvert(CONVERT_BS);
  dim3 dimGridConvert((N * rowStride + (CONVERT_BS - 1)) / CONVERT_BS, numMats);
  convert<<<dimGridConvert, dimBlockConvert>>>(AWorklist_d, Alist_d, N * rowStride);

  // (ii)  call cublas functions to do inversion
  //       LU decomposition
  callAndCheckError(cublasDgetrfBatched(handle, N, AWorklist_d, rowStride, PivotArray, infoArray, numMats), __LINE__);

  //       Inversion
#if (CUDA_VERSION >= 6050)
  callAndCheckError(cublasDgetriBatched(handle, N, (const double**)AWorklist_d, rowStride, PivotArray, Ainvlist_d,
                                        rowStride, infoArray + numMats, numMats),
                    __LINE__);
#else
  callAndCheckError(cublasDgetriBatched(handle, N, AWorklist_d, rowStride, PivotArray, Ainvlist_d, rowStride,
                                        infoArray + numMats, numMats),
                    __LINE__);
#endif

  cudaDeviceSynchronize();
}

// 3. for complex float matrices
//    useHigherPrecision = false --> single precision operations
//    useHigherPrecision = true  --> double precision operations  (default)
void cublas_inverse(cublasHandle_t handle,
                    std::complex<float>* Alist_d[],
                    std::complex<float>* Ainvlist_d[],
                    std::complex<float>* AWorklist_d[],
                    std::complex<float>* AinvWorklist_d[],
                    int* PivotArray,
                    int* infoArray,
                    int N,
                    int rowStride,
                    int numMats,
                    bool useHigherPrecision)
{
  // Info array tells if a matrix inversion is successful
  // = 0 : successful
  // = k : U(k,k) = 0; inversion failed

  // If double precision operations are desired...
  if (useHigherPrecision)
  {
    // (i)   convert elements in Alist from float to double, put them in AWorklist
    dim3 dimBlockConvert(CONVERT_BS);
    dim3 dimGridConvert((N * rowStride + (CONVERT_BS - 1)) / CONVERT_BS, numMats);
    convert_complex<cuDoubleComplex, double, cuComplex>
        <<<dimGridConvert, dimBlockConvert>>>((cuDoubleComplex**)AWorklist_d, (cuComplex**)Alist_d, N * rowStride);

    // (ii)  call cublas to do matrix inversion
    //       LU decomposition
    callAndCheckError(cublasZgetrfBatched(handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, PivotArray, infoArray,
                                          numMats),
                      __LINE__);
    //       Inversion
#if (CUDA_VERSION >= 6050)
    callAndCheckError(cublasZgetriBatched(handle, N, (const cuDoubleComplex**)AWorklist_d, rowStride, PivotArray,
                                          (cuDoubleComplex**)AinvWorklist_d, rowStride, infoArray + numMats, numMats),
                      __LINE__);
#else
    callAndCheckError(cublasZgetriBatched(handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, PivotArray,
                                          (cuDoubleComplex**)AinvWorklist_d, rowStride, infoArray + numMats, numMats),
                      __LINE__);
#endif

    // (iii) convert results back to single precision
    convert_complex<cuComplex, float, cuDoubleComplex>
        <<<dimGridConvert, dimBlockConvert>>>((cuComplex**)Ainvlist_d, (cuDoubleComplex**)AinvWorklist_d,
                                              N * rowStride);
  }
  // else, carry out single precision operations
  else
  {
    // Call cublas to do matrix inversion
    // LU decomposition
    callAndCheckError(cublasCgetrfBatched(handle, N, (cuComplex**)Alist_d, rowStride, PivotArray, infoArray, numMats),
                      __LINE__);

    // Inversion
#if (CUDA_VERSION >= 6050)
    callAndCheckError(cublasCgetriBatched(handle, N, (const cuComplex**)Alist_d, rowStride, PivotArray,
                                          (cuComplex**)Ainvlist_d, rowStride, infoArray + numMats, numMats),
                      __LINE__);
#else
    callAndCheckError(cublasCgetriBatched(handle, N, (cuComplex**)Alist_d, rowStride, PivotArray,
                                          (cuComplex**)Ainvlist_d, rowStride, infoArray + numMats, numMats),
                      __LINE__);
#endif
  }

  cudaDeviceSynchronize();
}

// 4. for complex double matrices
//    (useHigherPrecision is accepted for interface uniformity but unused here;
//    double is already the higher precision)
void cublas_inverse(cublasHandle_t handle,
                    std::complex<double>* Alist_d[],
                    std::complex<double>* Ainvlist_d[],
                    std::complex<double>* AWorklist_d[],
                    std::complex<double>* AinvWorklist_d[],
                    int* PivotArray,
                    int* infoArray,
                    int N,
                    int rowStride,
                    int numMats,
                    bool useHigherPrecision)
{
  // Info array tells if a matrix inversion is successful
  // = 0 : successful
  // = k : U(k,k) = 0; inversion failed

  // (i)   copy all the elements of Alist to AWorklist
  dim3 dimBlockConvert(CONVERT_BS);
  dim3 dimGridConvert((N * rowStride + (CONVERT_BS - 1)) / CONVERT_BS, numMats);
  convert_complex<cuDoubleComplex, double, cuDoubleComplex>
      <<<dimGridConvert, dimBlockConvert>>>((cuDoubleComplex**)AWorklist_d, (cuDoubleComplex**)Alist_d, N * rowStride);

  // (ii)  call cublas to do matrix inversion
  //       LU decomposition
  callAndCheckError(cublasZgetrfBatched(handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, PivotArray, infoArray,
                                        numMats),
                    __LINE__);
  //       Inversion
#if (CUDA_VERSION >= 6050)
  callAndCheckError(cublasZgetriBatched(handle, N, (const cuDoubleComplex**)AWorklist_d, rowStride, PivotArray,
                                        (cuDoubleComplex**)Ainvlist_d, rowStride, infoArray + numMats, numMats),
                    __LINE__);
#else
  callAndCheckError(cublasZgetriBatched(handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, PivotArray,
                                        (cuDoubleComplex**)Ainvlist_d, rowStride, infoArray + numMats, numMats),
                    __LINE__);
#endif

  cudaDeviceSynchronize();
}
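// Note on buffer sizes: when useHigherPrecision is true and the matrices are
// single precision, AWorklist_d/AinvWorklist_d hold double-precision copies,
// so each per-matrix work buffer must provide at least N * rowStride *
// sizeof(double) bytes. This is presumably why the test below allocates
// 2 * N * row_stride * sizeof(T) per work buffer.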


//////////////////////////////////////////////////////
//                  Test routines                   //
//////////////////////////////////////////////////////

#ifdef CUDA_TEST_MAIN

template<typename T>
void test_cublas_inverse(int matSize, int numMats)
{
  // Initialize cublas
  cublasHandle_t handle;
  callAndCheckError(cublasCreate(&handle), __LINE__);

  srand48((long)12394);
  int N          = matSize;
  int row_stride = (matSize + 15) / 16 * 16; // pad the leading dimension to a multiple of 16
  T **Alist, **AWorklist;
  T **Alist_d, **AWorklist_d;
  T **Clist, **CWorklist;
  T **Clist_d, **CWorklist_d;

  // Allocate arrays of pointers (one set on host, one set on device)
  // pointing to the starting address (on device) of each matrix and its buffer
  // (similar to DiracDeterminantCUDA)
  Alist = (T**)malloc(numMats * sizeof(T*));
  callAndCheckError(cudaMalloc((void**)&Alist_d, numMats * sizeof(T*)), __LINE__);

  AWorklist = (T**)malloc(numMats * sizeof(T*));
  callAndCheckError(cudaMalloc((void**)&AWorklist_d, numMats * sizeof(T*)), __LINE__);

  Clist = (T**)malloc(numMats * sizeof(T*));
  callAndCheckError(cudaMalloc((void**)&Clist_d, numMats * sizeof(T*)), __LINE__);

  CWorklist = (T**)malloc(numMats * sizeof(T*));
  callAndCheckError(cudaMalloc((void**)&CWorklist_d, numMats * sizeof(T*)), __LINE__);

  // Pivot indices (numMats * N ints) and status flags (2 * numMats ints:
  // getrf statuses followed by getri statuses) for the batched solvers
  int* PivotArray_d;
  int* infoArray_d;
  callAndCheckError(cudaMalloc((void**)&PivotArray_d, numMats * N * sizeof(int)), __LINE__);
  callAndCheckError(cudaMalloc((void**)&infoArray_d, 2 * numMats * sizeof(int)), __LINE__);

  // Generate matrices filled with random numbers
  T* A = (T*)malloc(sizeof(T) * numMats * N * row_stride);

  for (int j = 0; j < numMats; j++)
    for (int i = 0; i < N * row_stride; i++)
    {
#ifndef QMC_COMPLEX
      A[j * N * row_stride + i] = 1.0 * (drand48() - 0.5);
#else
      A[j * N * row_stride + i] = T(1.0 * (drand48() - 0.5), 1.0 * (drand48() - 0.5));
#endif
    }

  // Allocate memory on device for each matrix
  for (int mat = 0; mat < numMats; mat++)
  {
    callAndCheckError(cudaMalloc((void**)&(Alist[mat]), N * row_stride * sizeof(T)), __LINE__);

    callAndCheckError(cudaMemcpyAsync(Alist[mat], &A[mat * N * row_stride], N * row_stride * sizeof(T),
                                      cudaMemcpyHostToDevice),
                      __LINE__);

    callAndCheckError(cudaMalloc((void**)&(AWorklist[mat]), 2 * N * row_stride * sizeof(T)), __LINE__);

    callAndCheckError(cudaMalloc((void**)&(Clist[mat]), N * row_stride * sizeof(T)), __LINE__);

    callAndCheckError(cudaMalloc((void**)&(CWorklist[mat]), 2 * N * row_stride * sizeof(T)), __LINE__);
  }

  // Copy the starting address of each matrix
  callAndCheckError(cudaMemcpyAsync(Alist_d, Alist, numMats * sizeof(T*), cudaMemcpyHostToDevice), __LINE__);

  callAndCheckError(cudaMemcpyAsync(AWorklist_d, AWorklist, numMats * sizeof(T*), cudaMemcpyHostToDevice), __LINE__);

  callAndCheckError(cudaMemcpyAsync(Clist_d, Clist, numMats * sizeof(T*), cudaMemcpyHostToDevice), __LINE__);

  callAndCheckError(cudaMemcpyAsync(CWorklist_d, CWorklist, numMats * sizeof(T*), cudaMemcpyHostToDevice), __LINE__);

  cudaDeviceSynchronize();

  clock_t start = clock();

  // Call cublas functions to do inversion
  cublas_inverse(handle, Alist_d, Clist_d, AWorklist_d, CWorklist_d, PivotArray_d, infoArray_d, N, row_stride, numMats,
                 true);
  cudaDeviceSynchronize();

  clock_t end = clock();
  double t    = double(end - start) / double(CLOCKS_PER_SEC) / double(numMats);
  double rate = 1.0 / t;
  fprintf(stderr, "Rate is %1.3f matrix inversions per second.\n", rate);

  // Copy A^(-1) back to host memory Ainv; one matrix at a time
  // Calculate the RMS deviation of A^(-1)A from the unit matrix I
  for (int mat = 0; mat < numMats; mat++)
  {
    T* Ainv = (T*)malloc(N * row_stride * sizeof(T)); // heap allocation avoids a variable-length array
    callAndCheckError(cudaMemcpy(Ainv, Clist[mat], N * row_stride * sizeof(T), cudaMemcpyDeviceToHost), __LINE__);

    double error = 0.0;
    for (int i = 0; i < N; i++)
      for (int j = 0; j < N; j++)
      {
        T val = 0.0;
        for (int k = 0; k < N; k++)
          val += Ainv[i * row_stride + k] * A[mat * N * row_stride + k * row_stride + j];
        double diff = (i == j) ? (1.0 - std::real(val)) : std::real(val);
        error += diff * diff;
      }
    fprintf(stderr, "error = %1.8e\n", sqrt(error / (double)(N * N)));
    free(Ainv);
  }

  // Finalize cublas
  callAndCheckError(cublasDestroy(handle), __LINE__);

  // Free resources on both host and device
  for (int mat = 0; mat < numMats; mat++)
  {
    cudaFree(Alist[mat]);
    cudaFree(Clist[mat]);
    cudaFree(AWorklist[mat]);
    cudaFree(CWorklist[mat]);
  }
  cudaFree(Alist_d);
  cudaFree(Clist_d);
  cudaFree(AWorklist_d);
  cudaFree(CWorklist_d);
  cudaFree(PivotArray_d);
  cudaFree(infoArray_d);
  free(Alist);
  free(Clist);
  free(AWorklist);
  free(CWorklist);
  free(A);

  // Reset device. Required for memory leak debugging
  cudaDeviceReset();
}


int main(int argc, char** argv)
{
  int matSize = 0;
  int numMats = 0;

  if (argc == 3)
  {
    matSize = atoi(argv[1]);
    numMats = atoi(argv[2]);
  }
  else
  {
    printf("Usage: ./cuda_inverse [matrix size] [number of matrices]\n");
    exit(1);
  }

#ifndef QMC_COMPLEX
  test_cublas_inverse<double>(matSize, numMats);
  test_cublas_inverse<float>(matSize, numMats);
#else
  test_cublas_inverse<std::complex<double>>(matSize, numMats);
  test_cublas_inverse<std::complex<float>>(matSize, numMats);
#endif

  return 0;
}

#endif