// Copyright (C) 2017  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CuSOLVER_CU_
#define DLIB_DNN_CuSOLVER_CU_

#ifdef DLIB_USE_CUDA

#include "cusolver_dlibapi.h"
#include <cublas_v2.h>
#include <cusolverDn.h>
#include "cuda_utils.h"

// ----------------------------------------------------------------------------------------

static const char* cusolver_get_error_string(cusolverStatus_t s)
{
    switch(s)
    {
        case CUSOLVER_STATUS_NOT_INITIALIZED:
            return "CUDA Runtime API initialization failed.";
        case CUSOLVER_STATUS_ALLOC_FAILED:
            return "CUDA Resources could not be allocated.";
        default:
            return "A call to cuSolver failed";
    }
}

// Check the return value of a call to the cuSolver runtime for an error condition.
#define CHECK_CUSOLVER(call)                                                    \
do{                                                                             \
    const cusolverStatus_t error = call;                                        \
    if (error != CUSOLVER_STATUS_SUCCESS)                                       \
    {                                                                           \
        std::ostringstream sout;                                                \
        sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". ";\
        sout << "code: " << error << ", reason: " << cusolver_get_error_string(error);\
        throw dlib::cusolver_error(sout.str());                                 \
    }                                                                           \
}while(false)
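
// Every cuSolver call in this file is wrapped in CHECK_CUSOLVER so that a failing
// status code becomes a dlib::cusolver_error exception.  A hypothetical use outside
// this file would look like:
//
//   cusolverDnHandle_t h;
//   CHECK_CUSOLVER(cusolverDnCreate(&h));   // throws on failure instead of returning a status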

// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------

namespace dlib
{
    namespace cuda
    {

    // -----------------------------------------------------------------------------------

        // Holds one cuSolver handle per CUDA device.  Handles are created lazily the
        // first time get_handle() is called while that device is selected.
        class cusolver_context
        {
        public:
            // not copyable
            cusolver_context(const cusolver_context&) = delete;
            cusolver_context& operator=(const cusolver_context&) = delete;

            cusolver_context()
            {
                handles.resize(16);
            }
            ~cusolver_context()
            {
                for (auto h : handles)
                {
                    if (h)
                        cusolverDnDestroy(h);
                }
            }

            cusolverDnHandle_t get_handle (
            )
            {
                int new_device_id;
                CHECK_CUDA(cudaGetDevice(&new_device_id));
                // make room for more devices if needed
                if (new_device_id >= (long)handles.size())
                    handles.resize(new_device_id+16);

                // If we don't have a handle already for this device then make one
                if (!handles[new_device_id])
                    CHECK_CUSOLVER(cusolverDnCreate(&handles[new_device_id]));

                // Finally, return the handle for the current device
                return handles[new_device_id];
            }

        private:

            std::vector<cusolverDnHandle_t> handles;
        };

        // Returns a cuSolver handle for the currently selected CUDA device.  The
        // cusolver_context is thread_local, so each thread keeps its own set of handles.
        static cusolverDnHandle_t context()
        {
            thread_local cusolver_context c;
            return c.get_handle();
        }

    // ------------------------------------------------------------------------------------
    // ------------------------------------------------------------------------------------
    // ------------------------------------------------------------------------------------

        __global__ void _cuda_set_to_identity_matrix(float* m, size_t nr)
        {
            for (auto j : grid_stride_range(0, nr*nr))
            {
                if (j%(nr+1) == 0)
                    m[j] = 1;
                else
                    m[j] = 0;
            }
        }

        void set_to_identity_matrix (
            tensor& m
        )
        {
            DLIB_CASSERT(m.size() == m.num_samples()*m.num_samples());
            launch_kernel(_cuda_set_to_identity_matrix, max_jobs(m.size()), m.device(), m.num_samples());
        }
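
        // Note on the kernel above: m is treated as a flattened nr x nr matrix, so the
        // linear indices of the diagonal entries are exactly the multiples of nr+1
        // (e.g. for nr == 3 the diagonal lives at indices 0, 4, and 8).  The grid-stride
        // loop therefore writes 1 at those positions and 0 everywhere else.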

    // ------------------------------------------------------------------------------------

        inv::~inv()
        {
            sync_if_needed();
        }

    // ------------------------------------------------------------------------------------

        void inv::
        operator() (
            const tensor& m_,
            resizable_tensor& out
        )
        {
            DLIB_CASSERT(m_.size() == m_.num_samples()*m_.num_samples(), "Input matrix must be square if you want to invert it.");
            m = m_;

            // We compute the inverse by solving m*out == I, so start out as the identity matrix.
            out.copy_size(m);
            set_to_identity_matrix(out);

            const int nc = m.num_samples();
            int Lwork;
            CHECK_CUSOLVER(cusolverDnSgetrf_bufferSize(context(), nc, nc, m.device(), nc, &Lwork));

            // Grow the scratch buffers if this matrix is bigger than anything we have seen before.
            if (Lwork > (int)workspace.size())
            {
                sync_if_needed();
                workspace = cuda_data_ptr<float>(Lwork);
            }
            if (nc > (int)Ipiv.size())
            {
                sync_if_needed();
                Ipiv = cuda_data_ptr<int>(nc);
            }
            if (info.size() != 1)
            {
                info = cuda_data_ptr<int>(1);
            }

            // LU factorize m in place, then solve against the identity to obtain the inverse.
            CHECK_CUSOLVER(cusolverDnSgetrf(context(), nc, nc, m.device(), nc, workspace, Ipiv, info));
            CHECK_CUSOLVER(cusolverDnSgetrs(context(), CUBLAS_OP_N, nc, nc, m.device(), nc, Ipiv, out.device(), nc, info));
            did_work_lately = true;
        }
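
        // A minimal usage sketch (hypothetical, not part of this file): assuming a square
        // matrix stored as a tensor whose size equals num_samples()*num_samples(), one
        // might invert it like this:
        //
        //   dlib::cuda::inv inverter;
        //   resizable_tensor A(4,1,1,4), A_inv;
        //   // ... fill A with the matrix to invert ...
        //   inverter(A, A_inv);                 // A_inv now holds the inverse of A
        //   int status = inverter.get_last_status();  // see get_last_status() below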

    // ------------------------------------------------------------------------------------

        int inv::
        get_last_status(
        )
        {
            std::vector<int> linfo;
            // Copy the device-side info value back to the host.  This is dlib's memcpy
            // overload for cuda_data_ptr, not std::memcpy.
            memcpy(linfo, info);
            if (linfo.size() != 0)
                return linfo[0];
            else
                return 0;
        }
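
        // For reference, cuSOLVER uses the LAPACK convention for the devInfo buffer used
        // above: 0 means success, a negative value -i means the i-th argument was invalid,
        // and for getrf a positive value i means U(i,i) is exactly zero, i.e. the matrix
        // is singular and its inverse is not valid.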

    // ------------------------------------------------------------------------------------

        void inv::
        sync_if_needed()
        {
            if (did_work_lately)
            {
                did_work_lately = false;
                // make sure we wait until any previous kernel launches have finished
                // before we do something like deallocate the GPU memory.
                cudaDeviceSynchronize();
            }
        }

    // ------------------------------------------------------------------------------------

    }
}

#endif // DLIB_USE_CUDA

#endif // DLIB_DNN_CuSOLVER_CU_