// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP

#include "error.hpp"
#include "stream.hpp"
#include "pointer.hpp"

#include <opencv2/core.hpp>

#include <cublas_v2.h>

#include <cstddef>
#include <memory>
#include <utility>

/* evaluates a cuBLAS API call and throws cuBLASException if the returned status is not success */
#define CUDA4DNN_CHECK_CUBLAS(call) \
    ::cv::dnn::cuda4dnn::csl::cublas::detail::check((call), CV_Func, __FILE__, __LINE__)

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cublas {

    /** @brief exception class for errors thrown by the cuBLAS API */
    class cuBLASException : public CUDAException {
    public:
        using CUDAException::CUDAException;
    };

    namespace detail {
        /** checks a cuBLAS status code and throws cuBLASException on failure
         *
         * @param status status code returned by a cuBLAS API call
         * @param func   name of the calling function (used in the error message)
         * @param file   source file of the call site
         * @param line   source line of the call site
         */
        static void check(cublasStatus_t status, const char* func, const char* file, int line) {
            /* maps a cuBLAS status code to its symbolic name */
            auto cublasGetErrorString = [](cublasStatus_t err) {
                switch (err) {
                case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
                case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
                case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
                case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
                case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
                case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
                case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
                case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
                case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
                case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR";
                }
                return "UNKNOWN_CUBLAS_ERROR";
            };

            if (status != CUBLAS_STATUS_SUCCESS)
                throw cuBLASException(Error::GpuApiCallError, cublasGetErrorString(status), func, file, line);
        }
    }

    /** non-copyable cuBLAS smart handle
     *
     * UniqueHandle is a smart non-sharable wrapper for cuBLAS handle which ensures that the handle
     * is destroyed after use. The handle must always be associated with a non-default stream. The stream
     * must be specified during construction.
     *
     * Refer to stream API for more information for the choice of forcing non-default streams.
     */
    class UniqueHandle {
    public:
        UniqueHandle() noexcept : handle{ nullptr } { }
        UniqueHandle(UniqueHandle&) = delete;
        UniqueHandle(UniqueHandle&& other) noexcept {
            /* steal ownership: the moved-from object is left empty (null handle) */
            stream = std::move(other.stream);
            handle = other.handle;
            other.handle = nullptr;
        }

        /** creates a cuBLAS handle and associates it with the stream specified
         *
         * Exception Guarantee: Basic
         */
        UniqueHandle(Stream strm) : stream(std::move(strm)) {
            CV_Assert(stream);
            CUDA4DNN_CHECK_CUBLAS(cublasCreate(&handle));
            try {
                CUDA4DNN_CHECK_CUBLAS(cublasSetStream(handle, stream.get()));
            } catch (...) {
                /* cublasDestroy won't throw if a valid handle is passed */
                CUDA4DNN_CHECK_CUBLAS(cublasDestroy(handle));
                throw;
            }
        }

        ~UniqueHandle() noexcept {
            if (handle) {
                /* cublasDestroy won't throw if a valid handle is passed */
                CUDA4DNN_CHECK_CUBLAS(cublasDestroy(handle));
            }
        }

        UniqueHandle& operator=(const UniqueHandle&) = delete;
        UniqueHandle& operator=(UniqueHandle&& other) noexcept {
            /* NOTE(review): CV_Assert can throw inside this noexcept function (would terminate);
             * this relies on callers never move-assigning from an empty handle — confirm at call sites */
            CV_Assert(other);
            if (&other != this) {
                UniqueHandle(std::move(*this)); /* destroy current handle */
                stream = std::move(other.stream);
                handle = other.handle;
                other.handle = nullptr;
            }
            return *this;
        }

        /** returns the raw cuBLAS handle */
        cublasHandle_t get() const noexcept {
            CV_Assert(handle);
            return handle;
        }

        /** returns true if the handle is valid */
        explicit operator bool() const noexcept { return static_cast<bool>(handle); }

    private:
        Stream stream; /* kept alive so the stream outlives the handle bound to it */
        cublasHandle_t handle;
    };

    /** @brief sharable cuBLAS smart handle
     *
     * Handle is a smart sharable wrapper for cuBLAS handle which ensures that the handle
     * is destroyed after all references to the handle are destroyed. The handle must always
     * be associated with a non-default stream. The stream must be specified during construction.
126 * 127 * @note Moving a Handle object to another invalidates the former 128 */ 129 class Handle { 130 public: 131 Handle() = default; 132 Handle(const Handle&) = default; 133 Handle(Handle&&) = default; 134 135 /** creates a cuBLAS handle and associates it with the stream specified 136 * 137 * Exception Guarantee: Basic 138 */ Handle(Stream strm)139 Handle(Stream strm) : handle(std::make_shared<UniqueHandle>(std::move(strm))) { } 140 141 Handle& operator=(const Handle&) = default; 142 Handle& operator=(Handle&&) = default; 143 144 /** returns true if the handle is valid */ operator bool() const145 explicit operator bool() const noexcept { return static_cast<bool>(handle); } 146 147 /** returns the raw cuBLAS handle */ get() const148 cublasHandle_t get() const noexcept { 149 CV_Assert(handle); 150 return handle->get(); 151 } 152 153 private: 154 std::shared_ptr<UniqueHandle> handle; 155 }; 156 157 /** @brief GEMM for colummn-major matrices 158 * 159 * \f$ C = \alpha AB + \beta C \f$ 160 * 161 * @tparam T matrix element type (must be `half` or `float`) 162 * 163 * @param handle valid cuBLAS Handle 164 * @param transa use transposed matrix of A for computation 165 * @param transb use transposed matrix of B for computation 166 * @param rows_c number of rows in C 167 * @param cols_c number of columns in C 168 * @param common_dim common dimension of A (or trans A) and B (or trans B) 169 * @param alpha scale factor for AB 170 * @param[in] A pointer to column-major matrix A in device memory 171 * @param lda leading dimension of matrix A 172 * @param[in] B pointer to column-major matrix B in device memory 173 * @param ldb leading dimension of matrix B 174 * @param beta scale factor for C 175 * @param[in,out] C pointer to column-major matrix C in device memory 176 * @param ldc leading dimension of matrix C 177 * 178 * Exception Guarantee: Basic 179 */ 180 template <class T> 181 void gemm(const Handle& handle, 182 bool transa, bool transb, 183 std::size_t rows_c, 
std::size_t cols_c, std::size_t common_dim, 184 T alpha, const DevicePtr<const T> A, std::size_t lda, 185 const DevicePtr<const T> B, std::size_t ldb, 186 T beta, const DevicePtr<T> C, std::size_t ldc); 187 188 template <> inline gemm(const Handle & handle,bool transa,bool transb,std::size_t rows_c,std::size_t cols_c,std::size_t common_dim,half alpha,const DevicePtr<const half> A,std::size_t lda,const DevicePtr<const half> B,std::size_t ldb,half beta,const DevicePtr<half> C,std::size_t ldc)189 void gemm<half>(const Handle& handle, 190 bool transa, bool transb, 191 std::size_t rows_c, std::size_t cols_c, std::size_t common_dim, 192 half alpha, const DevicePtr<const half> A, std::size_t lda, 193 const DevicePtr<const half> B, std::size_t ldb, 194 half beta, const DevicePtr<half> C, std::size_t ldc) 195 { 196 CV_Assert(handle); 197 198 auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N, 199 opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N; 200 int irows_c = static_cast<int>(rows_c), 201 icols_c = static_cast<int>(cols_c), 202 icommon_dim = static_cast<int>(common_dim), 203 ilda = static_cast<int>(lda), 204 ildb = static_cast<int>(ldb), 205 ildc = static_cast<int>(ldc); 206 207 CUDA4DNN_CHECK_CUBLAS( 208 cublasHgemm( 209 handle.get(), 210 opa, opb, 211 irows_c, icols_c, icommon_dim, 212 &alpha, A.get(), ilda, 213 B.get(), ildb, 214 &beta, C.get(), ildc 215 ) 216 ); 217 } 218 219 template <> inline gemm(const Handle & handle,bool transa,bool transb,std::size_t rows_c,std::size_t cols_c,std::size_t common_dim,float alpha,const DevicePtr<const float> A,std::size_t lda,const DevicePtr<const float> B,std::size_t ldb,float beta,const DevicePtr<float> C,std::size_t ldc)220 void gemm<float>(const Handle& handle, 221 bool transa, bool transb, 222 std::size_t rows_c, std::size_t cols_c, std::size_t common_dim, 223 float alpha, const DevicePtr<const float> A, std::size_t lda, 224 const DevicePtr<const float> B, std::size_t ldb, 225 float beta, const DevicePtr<float> C, std::size_t ldc) 226 { 
227 CV_Assert(handle); 228 229 auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N, 230 opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N; 231 int irows_c = static_cast<int>(rows_c), 232 icols_c = static_cast<int>(cols_c), 233 icommon_dim = static_cast<int>(common_dim), 234 ilda = static_cast<int>(lda), 235 ildb = static_cast<int>(ldb), 236 ildc = static_cast<int>(ldc); 237 238 CUDA4DNN_CHECK_CUBLAS( 239 cublasSgemm( 240 handle.get(), 241 opa, opb, 242 irows_c, icols_c, icommon_dim, 243 &alpha, A.get(), ilda, 244 B.get(), ildb, 245 &beta, C.get(), ildc 246 ) 247 ); 248 } 249 250 /** @brief Strided batched GEMM for colummn-major matrices 251 * 252 * \f$ C_i = \alpha A_i B_i + \beta C_i \f$ for a stack of matrices A, B and C indexed by i 253 * 254 * @tparam T matrix element type (must be `half` or `float`) 255 * 256 * @param handle valid cuBLAS Handle 257 * @param transa use transposed matrix of A_i for computation 258 * @param transb use transposed matrix of B_i for computation 259 * @param rows_c number of rows in C_i 260 * @param cols_c number of columns in C_i 261 * @param common_dim common dimension of A_i (or trans A_i) and B_i (or trans B_i) 262 * @param alpha scale factor for A_i B_i 263 * @param[in] A pointer to stack of column-major matrices A in device memory 264 * @param lda leading dimension of matrix A_i 265 * @param strideA stride between matrices in A 266 * @param[in] B pointer to stack of column-major matrices B in device memory 267 * @param ldb leading dimension of matrix B_i 268 * @param strideB stride between matrices in B 269 * @param beta scale factor for C_i 270 * @param[in,out] C pointer to stack of column-major matrices C in device memory 271 * @param ldc leading dimension of matrix C_i 272 * @param strideC stride between matrices in C 273 * @param batchCount number of matrices in the batch 274 * 275 * Exception Guarantee: Basic 276 */ 277 template <class T> 278 void gemmStridedBatched(const Handle& handle, 279 bool transa, bool transb, 280 std::size_t 
rows_c, std::size_t cols_c, std::size_t common_dim, 281 T alpha, const DevicePtr<const T> A, std::size_t lda, std::size_t strideA, 282 const DevicePtr<const T> B, std::size_t ldb, std::size_t strideB, 283 T beta, const DevicePtr<T> C, std::size_t ldc, std::size_t strideC, 284 std::size_t batchCount); 285 286 template <> inline gemmStridedBatched(const Handle & handle,bool transa,bool transb,std::size_t rows_c,std::size_t cols_c,std::size_t common_dim,half alpha,const DevicePtr<const half> A,std::size_t lda,std::size_t strideA,const DevicePtr<const half> B,std::size_t ldb,std::size_t strideB,half beta,const DevicePtr<half> C,std::size_t ldc,std::size_t strideC,std::size_t batchCount)287 void gemmStridedBatched<half>(const Handle& handle, 288 bool transa, bool transb, 289 std::size_t rows_c, std::size_t cols_c, std::size_t common_dim, 290 half alpha, const DevicePtr<const half> A, std::size_t lda, std::size_t strideA, 291 const DevicePtr<const half> B, std::size_t ldb, std::size_t strideB, 292 half beta, const DevicePtr<half> C, std::size_t ldc, std::size_t strideC, 293 std::size_t batchCount) 294 { 295 CV_Assert(handle); 296 297 const auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N, 298 opb = transb ? 
CUBLAS_OP_T : CUBLAS_OP_N; 299 const auto irows_c = static_cast<int>(rows_c), 300 icols_c = static_cast<int>(cols_c), 301 icommon_dim = static_cast<int>(common_dim), 302 ilda = static_cast<int>(lda), 303 ildb = static_cast<int>(ldb), 304 ildc = static_cast<int>(ldc); 305 306 const auto batch_count = static_cast<int>(batchCount); 307 const auto stride_a = static_cast<long long int>(strideA), 308 stride_b = static_cast<long long int>(strideB), 309 stride_c = static_cast<long long int>(strideC); 310 311 CV_Assert(stride_c >= irows_c * icols_c); // output matrices must not overlap 312 313 CUDA4DNN_CHECK_CUBLAS( 314 cublasHgemmStridedBatched( 315 handle.get(), 316 opa, opb, 317 irows_c, icols_c, icommon_dim, 318 &alpha, A.get(), ilda, stride_a, 319 B.get(), ildb, stride_b, 320 &beta, C.get(), ildc, stride_c, 321 batch_count 322 ) 323 ); 324 } 325 326 template <> inline gemmStridedBatched(const Handle & handle,bool transa,bool transb,std::size_t rows_c,std::size_t cols_c,std::size_t common_dim,float alpha,const DevicePtr<const float> A,std::size_t lda,std::size_t strideA,const DevicePtr<const float> B,std::size_t ldb,std::size_t strideB,float beta,const DevicePtr<float> C,std::size_t ldc,std::size_t strideC,std::size_t batchCount)327 void gemmStridedBatched<float>(const Handle& handle, 328 bool transa, bool transb, 329 std::size_t rows_c, std::size_t cols_c, std::size_t common_dim, 330 float alpha, const DevicePtr<const float> A, std::size_t lda, std::size_t strideA, 331 const DevicePtr<const float> B, std::size_t ldb, std::size_t strideB, 332 float beta, const DevicePtr<float> C, std::size_t ldc, std::size_t strideC, 333 std::size_t batchCount) 334 { 335 CV_Assert(handle); 336 337 const auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N, 338 opb = transb ? 
CUBLAS_OP_T : CUBLAS_OP_N; 339 const auto irows_c = static_cast<int>(rows_c), 340 icols_c = static_cast<int>(cols_c), 341 icommon_dim = static_cast<int>(common_dim), 342 ilda = static_cast<int>(lda), 343 ildb = static_cast<int>(ldb), 344 ildc = static_cast<int>(ldc); 345 346 const auto batch_count = static_cast<int>(batchCount); 347 const auto stride_a = static_cast<long long int>(strideA), 348 stride_b = static_cast<long long int>(strideB), 349 stride_c = static_cast<long long int>(strideC); 350 351 CV_Assert(stride_c >= irows_c * icols_c); // output matrices must not overlap 352 353 CUDA4DNN_CHECK_CUBLAS( 354 cublasSgemmStridedBatched( 355 handle.get(), 356 opa, opb, 357 irows_c, icols_c, icommon_dim, 358 &alpha, A.get(), ilda, stride_a, 359 B.get(), ildb, stride_b, 360 &beta, C.get(), ildc, stride_c, 361 batch_count 362 ) 363 ); 364 } 365 366 }}}}} /* namespace cv::dnn::cuda4dnn::csl::cublas */ 367 368 #endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP */ 369