// Copyright (C) 2017  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CuSOLVER_CU_
#define DLIB_DNN_CuSOLVER_CU_

#ifdef DLIB_USE_CUDA

#include "cusolver_dlibapi.h"
#include <cublas_v2.h>
#include <cusolverDn.h>
#include "cuda_utils.h"

// ----------------------------------------------------------------------------------------

static const char* cusolver_get_error_string(cusolverStatus_t s)
{
    switch(s)
    {
        case CUSOLVER_STATUS_NOT_INITIALIZED:
            return "CUDA Runtime API initialization failed.";
        case CUSOLVER_STATUS_ALLOC_FAILED:
            return "CUDA Resources could not be allocated.";
        default:
            return "A call to cuSolver failed";
    }
}

// Check the return value of a call to the cuSolver runtime for an error condition.
#define CHECK_CUSOLVER(call)                                                      \
do{                                                                              \
    const cusolverStatus_t error = call;                                         \
    if (error != CUSOLVER_STATUS_SUCCESS)                                        \
    {                                                                          \
        std::ostringstream sout;                                               \
        sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". ";\
        sout << "code: " << error << ", reason: " << cusolver_get_error_string(error);\
        throw dlib::cusolver_error(sout.str());                                \
    }                                                                          \
}while(false)
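
// Illustrative sketch (comment only, not part of the library): CHECK_CUSOLVER is meant to
// wrap any cuSolver call that returns a cusolverStatus_t, turning a failure into a
// dlib::cusolver_error exception.  Assuming a handle variable `h`, a use could look like:
//
//     cusolverDnHandle_t h;
//     CHECK_CUSOLVER(cusolverDnCreate(&h));    // throws on failure instead of returning a status
//     // ... call other cusolverDn* routines with h ...
//     CHECK_CUSOLVER(cusolverDnDestroy(h));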

// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------

namespace dlib
{
    namespace cuda
    {

    // -----------------------------------------------------------------------------------

        class cusolver_context
        {
        public:
            // not copyable
            cusolver_context(const cusolver_context&) = delete;
            cusolver_context& operator=(const cusolver_context&) = delete;

            cusolver_context()
            {
                handles.resize(16);
            }
            ~cusolver_context()
            {
                for (auto h : handles)
                {
                    if (h)
                        cusolverDnDestroy(h);
                }
            }

            cusolverDnHandle_t get_handle (
            )
            {
                int new_device_id;
                CHECK_CUDA(cudaGetDevice(&new_device_id));
                // make room for more devices if needed
                if (new_device_id >= (long)handles.size())
                    handles.resize(new_device_id+16);

                // If we don't have a handle already for this device then make one
                if (!handles[new_device_id])
                    CHECK_CUSOLVER(cusolverDnCreate(&handles[new_device_id]));

                // Finally, return the handle for the current device
                return handles[new_device_id];
            }

        private:

            std::vector<cusolverDnHandle_t> handles;
        };

        static cusolverDnHandle_t context()
        {
            thread_local cusolver_context c;
            return c.get_handle();
        }
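
        // Illustrative note (a sketch of intended use, not a guarantee): context() gives each
        // thread its own cusolver_context, and that context lazily creates one
        // cusolverDnHandle_t per CUDA device.  Code that hops between devices might do:
        //
        //     CHECK_CUDA(cudaSetDevice(0));
        //     cusolverDnHandle_t h0 = context();   // handle created for and bound to device 0
        //     CHECK_CUDA(cudaSetDevice(1));
        //     cusolverDnHandle_t h1 = context();   // a different handle, bound to device 1
        //
        // Each thread's handles are destroyed automatically when that thread exits.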

    // ------------------------------------------------------------------------------------
    // ------------------------------------------------------------------------------------
    // ------------------------------------------------------------------------------------

        __global__ void _cuda_set_to_identity_matrix(float* m, size_t nr)
        {
            for (auto j : grid_stride_range(0, nr*nr))
            {
                if (j%(nr+1) == 0)
                    m[j] = 1;
                else
                    m[j] = 0;
            }
        }

        void set_to_identity_matrix (
            tensor& m
        )
        {
            DLIB_CASSERT(m.size() == m.num_samples()*m.num_samples());
            launch_kernel(_cuda_set_to_identity_matrix, max_jobs(m.size()), m.device(), m.num_samples());
        }
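
        // Worked example (comment only): the kernel above views the nr x nr matrix as a flat
        // row-major array in which element (r,c) lives at index r*nr + c, so the diagonal
        // entries (r,r) sit at indices r*(nr+1), exactly the indices with j%(nr+1) == 0.
        // With nr == 3, indices 0, 4, and 8 are set to 1 and everything else to 0.
        // A hypothetical caller:
        //
        //     resizable_tensor I;
        //     I.set_size(3,3);            // num_samples()==3 and size()==9, so the assert holds
        //     set_to_identity_matrix(I);  // I now holds the 3x3 identity matrix on the device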

    // ------------------------------------------------------------------------------------

        inv::~inv()
        {
            sync_if_needed();
        }

    // ------------------------------------------------------------------------------------

        void inv::
        operator() (
            const tensor& m_,
            resizable_tensor& out
        )
        {
            DLIB_CASSERT(m_.size() == m_.num_samples()*m_.num_samples(), "Input matrix must be square if you want to invert it.");
            m = m_;

            out.copy_size(m);
            set_to_identity_matrix(out);

            const int nc = m.num_samples();
            int Lwork;
            CHECK_CUSOLVER(cusolverDnSgetrf_bufferSize(context(), nc, nc, m.device(), nc, &Lwork));

            if (Lwork > (int)workspace.size())
            {
                sync_if_needed();
                workspace = cuda_data_ptr<float>(Lwork);
            }
            if (nc > (int)Ipiv.size())
            {
                sync_if_needed();
                Ipiv = cuda_data_ptr<int>(nc);
            }
            if (info.size() != 1)
            {
                info = cuda_data_ptr<int>(1);
            }

            CHECK_CUSOLVER(cusolverDnSgetrf(context(), nc, nc, m.device(), nc, workspace, Ipiv, info));
            CHECK_CUSOLVER(cusolverDnSgetrs(context(), CUBLAS_OP_N, nc, nc, m.device(), nc, Ipiv, out.device(), nc, info));
            did_work_lately = true;
        }
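
        // Illustrative usage sketch (comment only; everything except inv itself is hypothetical):
        // operator() LU-factorizes a copy of the input with cusolverDnSgetrf() and then solves
        // against an identity right-hand side with cusolverDnSgetrs(), so `out` receives the
        // inverse.  A caller might look like:
        //
        //     dlib::cuda::inv inverter;
        //     resizable_tensor A, Ainv;
        //     A.set_size(4,4);                       // a 4x4 matrix of floats
        //     // ... fill A ...
        //     inverter(A, Ainv);                     // launches the factorization and solve
        //     if (inverter.get_last_status() != 0)
        //         std::cout << "cuSolver reported a problem (e.g. a singular matrix)" << std::endl;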

    // ------------------------------------------------------------------------------------

        int inv::
        get_last_status(
        )
        {
            std::vector<int> linfo;
            memcpy(linfo, info);
            if (linfo.size() != 0)
                return linfo[0];
            else
                return 0;
        }
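
        // Note (descriptive, based on cuSolver's documented getrf/getrs devInfo convention):
        // the value returned above is the devInfo output written by the most recent cuSolver
        // call in operator().  In that convention, 0 means success, a positive value from the
        // factorization indicates a zero pivot (i.e. a singular matrix, so the computed
        // "inverse" is not valid), and a negative value indicates an invalid argument.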

    // ------------------------------------------------------------------------------------

        void inv::
        sync_if_needed()
        {
            if (did_work_lately)
            {
                did_work_lately = false;
                // make sure we wait until any previous kernel launches have finished
                // before we do something like deallocate the GPU memory.
                cudaDeviceSynchronize();
            }
        }

    // ------------------------------------------------------------------------------------

    }
}

#endif // DLIB_USE_CUDA

#endif // DLIB_DNN_CuSOLVER_CU_