// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP

#include "error.hpp"
#include "stream.hpp"
#include "pointer.hpp"

#include <opencv2/core.hpp>

#include <cublas_v2.h>

#include <cstddef>
#include <memory>
#include <utility>

#define CUDA4DNN_CHECK_CUBLAS(call) \
    ::cv::dnn::cuda4dnn::csl::cublas::detail::check((call), CV_Func, __FILE__, __LINE__)

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cublas {

    /** @brief exception class for errors thrown by the cuBLAS API */
    class cuBLASException : public CUDAException {
    public:
        using CUDAException::CUDAException;
    };

    namespace detail {
        static void check(cublasStatus_t status, const char* func, const char* file, int line) {
            auto cublasGetErrorString = [](cublasStatus_t err) {
                switch (err) {
                case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
                case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
                case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
                case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
                case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
                case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
                case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
                case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
                case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
                case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR";
                }
                return "UNKNOWN_CUBLAS_ERROR";
            };

            if (status != CUBLAS_STATUS_SUCCESS)
                throw cuBLASException(Error::GpuApiCallError, cublasGetErrorString(status), func, file, line);
        }
    }
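
    /* Illustrative usage of the error-checking macro (a minimal sketch, not part of
     * the original header): every raw cuBLAS call is wrapped so that a failing
     * status surfaces as a cuBLASException carrying the call site, e.g.
     *
     *     cublasHandle_t raw_handle;
     *     CUDA4DNN_CHECK_CUBLAS(cublasCreate(&raw_handle));
     */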

    /** non-copyable cuBLAS smart handle
     *
     * UniqueHandle is a smart non-sharable wrapper for a cuBLAS handle which ensures that the handle
     * is destroyed after use. The handle must always be associated with a non-default stream. The stream
     * must be specified during construction.
     *
     * Refer to the stream API for the rationale behind forcing non-default streams.
     */
    class UniqueHandle {
    public:
        UniqueHandle() noexcept : handle{ nullptr } { }
        UniqueHandle(const UniqueHandle&) = delete;
        UniqueHandle(UniqueHandle&& other) noexcept {
            stream = std::move(other.stream);
            handle = other.handle;
            other.handle = nullptr;
        }

        /** creates a cuBLAS handle and associates it with the stream specified
         *
         * Exception Guarantee: Basic
         */
        UniqueHandle(Stream strm) : stream(std::move(strm)) {
            CV_Assert(stream);
            CUDA4DNN_CHECK_CUBLAS(cublasCreate(&handle));
            try {
                CUDA4DNN_CHECK_CUBLAS(cublasSetStream(handle, stream.get()));
            } catch (...) {
                /* cublasDestroy won't throw if a valid handle is passed */
                CUDA4DNN_CHECK_CUBLAS(cublasDestroy(handle));
                throw;
            }
        }

        ~UniqueHandle() noexcept {
            if (handle) {
                /* cublasDestroy won't throw if a valid handle is passed */
                CUDA4DNN_CHECK_CUBLAS(cublasDestroy(handle));
            }
        }

        UniqueHandle& operator=(const UniqueHandle&) = delete;
        UniqueHandle& operator=(UniqueHandle&& other) noexcept {
            CV_Assert(other);
            if (&other != this) {
                UniqueHandle(std::move(*this)); /* destroy current handle */
                stream = std::move(other.stream);
                handle = other.handle;
                other.handle = nullptr;
            }
            return *this;
        }

        /** returns the raw cuBLAS handle */
        cublasHandle_t get() const noexcept {
            CV_Assert(handle);
            return handle;
        }

        /** returns true if the handle is valid */
        explicit operator bool() const noexcept { return static_cast<bool>(handle); }

    private:
        Stream stream;
        cublasHandle_t handle;
    };
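
    /* Illustrative usage (a hypothetical sketch, not part of the original header):
     * UniqueHandle is move-only, so ownership of the underlying cublasHandle_t
     * transfers on move and the moved-from object becomes empty.
     *
     *     UniqueHandle owner(stream);            // assumes a valid csl::Stream `stream`
     *     UniqueHandle taken(std::move(owner));  // `taken` now owns the handle
     *     CV_Assert(!owner && taken);
     */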

    /** @brief sharable cuBLAS smart handle
     *
     * Handle is a smart sharable wrapper for a cuBLAS handle which ensures that the handle
     * is destroyed after all references to the handle are destroyed. The handle must always
     * be associated with a non-default stream. The stream must be specified during construction.
     *
     * @note Moving a Handle object to another invalidates the former
     */
    class Handle {
    public:
        Handle() = default;
        Handle(const Handle&) = default;
        Handle(Handle&&) = default;

        /** creates a cuBLAS handle and associates it with the stream specified
         *
         * Exception Guarantee: Basic
         */
        Handle(Stream strm) : handle(std::make_shared<UniqueHandle>(std::move(strm))) { }

        Handle& operator=(const Handle&) = default;
        Handle& operator=(Handle&&) = default;

        /** returns true if the handle is valid */
        explicit operator bool() const noexcept { return static_cast<bool>(handle); }

        /** returns the raw cuBLAS handle */
        cublasHandle_t get() const noexcept {
            CV_Assert(handle);
            return handle->get();
        }

    private:
        std::shared_ptr<UniqueHandle> handle;
    };
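
    /* Illustrative usage (a minimal sketch, not part of the original header; it
     * assumes csl::Stream offers a constructor that creates a new non-default
     * stream). Copies of a Handle share the same underlying cuBLAS handle:
     *
     *     Stream stream(true);              // hypothetical: create a non-default stream
     *     Handle handle(stream);            // cuBLAS handle bound to `stream`
     *     Handle copy = handle;             // shares ownership; no new cublasHandle_t
     *     cublasHandle_t raw = copy.get();  // raw handle for direct cuBLAS calls
     */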

    /** @brief GEMM for column-major matrices
     *
     * \f$ C = \alpha AB + \beta C \f$
     *
     * @tparam          T           matrix element type (must be `half` or `float`)
     *
     * @param           handle      valid cuBLAS Handle
     * @param           transa      use the transpose of A in the computation
     * @param           transb      use the transpose of B in the computation
     * @param           rows_c      number of rows in C
     * @param           cols_c      number of columns in C
     * @param           common_dim  common dimension of A (or A transposed) and B (or B transposed)
     * @param           alpha       scale factor for AB
     * @param[in]       A           pointer to column-major matrix A in device memory
     * @param           lda         leading dimension of matrix A
     * @param[in]       B           pointer to column-major matrix B in device memory
     * @param           ldb         leading dimension of matrix B
     * @param           beta        scale factor for C
     * @param[in,out]   C           pointer to column-major matrix C in device memory
     * @param           ldc         leading dimension of matrix C
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void gemm(const Handle& handle,
        bool transa, bool transb,
        std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
        T alpha, const DevicePtr<const T> A, std::size_t lda,
        const DevicePtr<const T> B, std::size_t ldb,
        T beta, const DevicePtr<T> C, std::size_t ldc);

    template <> inline
    void gemm<half>(const Handle& handle,
        bool transa, bool transb,
        std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
        half alpha, const DevicePtr<const half> A, std::size_t lda,
        const DevicePtr<const half> B, std::size_t ldb,
        half beta, const DevicePtr<half> C, std::size_t ldc)
    {
        CV_Assert(handle);

        auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N,
            opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
        int irows_c = static_cast<int>(rows_c),
            icols_c = static_cast<int>(cols_c),
            icommon_dim = static_cast<int>(common_dim),
            ilda = static_cast<int>(lda),
            ildb = static_cast<int>(ldb),
            ildc = static_cast<int>(ldc);

        CUDA4DNN_CHECK_CUBLAS(
            cublasHgemm(
                handle.get(),
                opa, opb,
                irows_c, icols_c, icommon_dim,
                &alpha, A.get(), ilda,
                B.get(), ildb,
                &beta, C.get(), ildc
            )
        );
    }

    template <> inline
    void gemm<float>(const Handle& handle,
        bool transa, bool transb,
        std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
        float alpha, const DevicePtr<const float> A, std::size_t lda,
        const DevicePtr<const float> B, std::size_t ldb,
        float beta, const DevicePtr<float> C, std::size_t ldc)
    {
        CV_Assert(handle);

        auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N,
            opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
        int irows_c = static_cast<int>(rows_c),
            icols_c = static_cast<int>(cols_c),
            icommon_dim = static_cast<int>(common_dim),
            ilda = static_cast<int>(lda),
            ildb = static_cast<int>(ldb),
            ildc = static_cast<int>(ldc);

        CUDA4DNN_CHECK_CUBLAS(
            cublasSgemm(
                handle.get(),
                opa, opb,
                irows_c, icols_c, icommon_dim,
                &alpha, A.get(), ilda,
                B.get(), ildb,
                &beta, C.get(), ildc
            )
        );
    }
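
    /* Illustrative usage (a minimal sketch, not part of the original header; M, N, K
     * and the device buffers are assumed). The API is column-major, so a row-major
     * M x N product C = A * B can be computed as C^T = B^T * A^T by passing B in the
     * A slot and A in the B slot; a row-major matrix reinterpreted as column-major
     * is already its transpose.
     *
     *     // row-major: A is M x K, B is K x N, C is M x N
     *     gemm<float>(handle, false, false,
     *                 N, M, K,        // C^T is a column-major N x M matrix
     *                 1.f, B, N,      // B^T: column-major N x K, leading dimension N
     *                 A, K,           // A^T: column-major K x M, leading dimension K
     *                 0.f, C, N);     // C^T: leading dimension N
     */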

    /** @brief Strided batched GEMM for column-major matrices
     *
     * \f$ C_i = \alpha A_i B_i + \beta C_i \f$ for a stack of matrices A, B and C indexed by i
     *
     * @tparam          T           matrix element type (must be `half` or `float`)
     *
     * @param           handle      valid cuBLAS Handle
     * @param           transa      use the transpose of A_i in the computation
     * @param           transb      use the transpose of B_i in the computation
     * @param           rows_c      number of rows in C_i
     * @param           cols_c      number of columns in C_i
     * @param           common_dim  common dimension of A_i (or A_i transposed) and B_i (or B_i transposed)
     * @param           alpha       scale factor for A_i B_i
     * @param[in]       A           pointer to stack of column-major matrices A in device memory
     * @param           lda         leading dimension of matrix A_i
     * @param           strideA     stride between matrices in A
     * @param[in]       B           pointer to stack of column-major matrices B in device memory
     * @param           ldb         leading dimension of matrix B_i
     * @param           strideB     stride between matrices in B
     * @param           beta        scale factor for C_i
     * @param[in,out]   C           pointer to stack of column-major matrices C in device memory
     * @param           ldc         leading dimension of matrix C_i
     * @param           strideC     stride between matrices in C
     * @param           batchCount  number of matrices in the batch
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void gemmStridedBatched(const Handle& handle,
        bool transa, bool transb,
        std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
        T alpha, const DevicePtr<const T> A, std::size_t lda, std::size_t strideA,
        const DevicePtr<const T> B, std::size_t ldb, std::size_t strideB,
        T beta, const DevicePtr<T> C, std::size_t ldc, std::size_t strideC,
        std::size_t batchCount);

    template <> inline
    void gemmStridedBatched<half>(const Handle& handle,
        bool transa, bool transb,
        std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
        half alpha, const DevicePtr<const half> A, std::size_t lda, std::size_t strideA,
        const DevicePtr<const half> B, std::size_t ldb, std::size_t strideB,
        half beta, const DevicePtr<half> C, std::size_t ldc, std::size_t strideC,
        std::size_t batchCount)
    {
        CV_Assert(handle);

        const auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N,
                   opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
        const auto irows_c = static_cast<int>(rows_c),
                   icols_c = static_cast<int>(cols_c),
                   icommon_dim = static_cast<int>(common_dim),
                   ilda = static_cast<int>(lda),
                   ildb = static_cast<int>(ldb),
                   ildc = static_cast<int>(ldc);

        const auto batch_count = static_cast<int>(batchCount);
        const auto stride_a = static_cast<long long int>(strideA),
                   stride_b = static_cast<long long int>(strideB),
                   stride_c = static_cast<long long int>(strideC);

        CV_Assert(stride_c >= irows_c * icols_c); // output matrices must not overlap

        CUDA4DNN_CHECK_CUBLAS(
            cublasHgemmStridedBatched(
                handle.get(),
                opa, opb,
                irows_c, icols_c, icommon_dim,
                &alpha, A.get(), ilda, stride_a,
                B.get(), ildb, stride_b,
                &beta, C.get(), ildc, stride_c,
                batch_count
            )
        );
    }

    template <> inline
    void gemmStridedBatched<float>(const Handle& handle,
        bool transa, bool transb,
        std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
        float alpha, const DevicePtr<const float> A, std::size_t lda, std::size_t strideA,
        const DevicePtr<const float> B, std::size_t ldb, std::size_t strideB,
        float beta, const DevicePtr<float> C, std::size_t ldc, std::size_t strideC,
        std::size_t batchCount)
    {
        CV_Assert(handle);

        const auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N,
                   opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
        const auto irows_c = static_cast<int>(rows_c),
                   icols_c = static_cast<int>(cols_c),
                   icommon_dim = static_cast<int>(common_dim),
                   ilda = static_cast<int>(lda),
                   ildb = static_cast<int>(ldb),
                   ildc = static_cast<int>(ldc);

        const auto batch_count = static_cast<int>(batchCount);
        const auto stride_a = static_cast<long long int>(strideA),
                   stride_b = static_cast<long long int>(strideB),
                   stride_c = static_cast<long long int>(strideC);

        CV_Assert(stride_c >= irows_c * icols_c); // output matrices must not overlap

        CUDA4DNN_CHECK_CUBLAS(
            cublasSgemmStridedBatched(
                handle.get(),
                opa, opb,
                irows_c, icols_c, icommon_dim,
                &alpha, A.get(), ilda, stride_a,
                B.get(), ildb, stride_b,
                &beta, C.get(), ildc, stride_c,
                batch_count
            )
        );
    }
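
    /* Illustrative usage (a minimal sketch, not part of the original header;
     * `handle`, the device buffers and `n` are assumed): `n` independent
     * column-major GEMMs over densely packed batches, where matrix i starts
     * `stride * i` elements (not bytes) past the base pointer.
     *
     *     // each A_i is rows_c x common_dim, B_i is common_dim x cols_c, C_i is rows_c x cols_c
     *     gemmStridedBatched<float>(handle, false, false,
     *                               rows_c, cols_c, common_dim,
     *                               1.f, A, rows_c, rows_c * common_dim,
     *                               B, common_dim, common_dim * cols_c,
     *                               0.f, C, rows_c, rows_c * cols_c, n);
     */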

}}}}} /* namespace cv::dnn::cuda4dnn::csl::cublas */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP */