1 // This file is part of OpenCV project. 2 // It is subject to the license terms in the LICENSE file found in the top-level directory 3 // of this distribution and at http://opencv.org/license.html. 4 5 #ifndef OPENCV_DNN_SRC_OP_CUDA_HPP 6 #define OPENCV_DNN_SRC_OP_CUDA_HPP 7 8 #ifdef HAVE_CUDA 9 #include "cuda4dnn/csl/stream.hpp" 10 #include "cuda4dnn/csl/event.hpp" 11 #include "cuda4dnn/csl/cublas.hpp" 12 #include "cuda4dnn/csl/cudnn.hpp" 13 #include "cuda4dnn/csl/tensor.hpp" 14 #include "cuda4dnn/csl/memory.hpp" 15 #include "cuda4dnn/csl/workspace.hpp" 16 #include "cuda4dnn/kernels/fp_conversion.hpp" 17 #endif 18 19 #include <opencv2/dnn/shape_utils.hpp> 20 #include <opencv2/core.hpp> 21 22 #include <cstddef> 23 #include <memory> 24 #include <iterator> 25 26 namespace cv { namespace dnn { 27 IS_DNN_CUDA_TARGET(int id)28 constexpr bool IS_DNN_CUDA_TARGET(int id) { 29 return id == DNN_TARGET_CUDA_FP16 || id == DNN_TARGET_CUDA; 30 } 31 haveCUDA()32 constexpr bool haveCUDA() { 33 #ifdef HAVE_CUDA 34 return true; 35 #else 36 return false; 37 #endif 38 } 39 40 #ifdef HAVE_CUDA 41 namespace cuda4dnn { namespace csl { 42 struct CSLContext { 43 Stream stream; 44 cublas::Handle cublas_handle; 45 cudnn::Handle cudnn_handle; 46 }; 47 48 /** @brief creates Tensor object from cv::Mat (only the header is created, i.e. no data is copied) 49 * 50 * \tparam T element type for the tensor 51 * \param[in] mat cv::Mat from which the shape must be inferred 52 * 53 * \return a Tensor object with the shape of \p mat 54 */ 55 template <class T> makeTensorHeader(const Mat & mat)56 Tensor<T> makeTensorHeader(const Mat& mat) { 57 auto sizes = shape(mat); 58 return Tensor<T>(std::begin(sizes), std::end(sizes)); 59 } 60 61 /** @brief copies data from a cv::Mat to TensorType 62 * 63 * \tparam T the type of the elements contained in TensorType object 64 * 65 * \param[in] srcMat source matrix 66 * \param[out] destTensor destination tensor 67 * \param stream CUDA stream to use for the memory transfer 68 * 69 * The memory copy starts from beginning \p srcMat. The number of elements copied is 70 * equal to the number of elements in \p destTensor. 71 * 72 * Pre-conditions: 73 * - \p srcMat must contain elements of type CV_32F 74 * - the size of \p srcMat must be larger than or equal to the size of \p destTensor 75 * 76 * @note best performance when \p srcMat is continuous and page-locked 77 * @note blocks calling thread if \p srcMat is not page-locked 78 */ 79 template <class T> 80 void copyMatToTensor(const Mat& srcMat, const TensorSpan<T> destTensor, const Stream& stream); 81 82 template <> inline copyMatToTensor(const Mat & srcMat,const TensorSpan<half> destTensor,const Stream & stream)83 void copyMatToTensor(const Mat& srcMat, const TensorSpan<half> destTensor, const Stream& stream) { 84 /* should perhaps convert cv::Mat of different type to the required type and copy */ 85 CV_Assert(srcMat.type() == CV_32F); 86 CV_Assert(srcMat.total() >= destTensor.size()); 87 88 Mat temp; 89 srcMat.convertTo(temp, CV_16F); 90 CV_Assert(temp.isContinuous()); 91 92 memcpy<half>(destTensor.get(), reinterpret_cast<half*>(temp.data), destTensor.size(), stream); 93 } 94 95 template <> inline copyMatToTensor(const Mat & srcMat,const TensorSpan<float> destTensor,const Stream & stream)96 void copyMatToTensor(const Mat& srcMat, const TensorSpan<float> destTensor, const Stream& stream) { 97 /* should perhaps convert cv::Mat of different type to the required type and copy */ 98 CV_Assert(srcMat.type() == CV_32F); 99 CV_Assert(srcMat.total() >= destTensor.size()); 100 101 Mat temp = srcMat.isContinuous() ? srcMat : srcMat.clone(); 102 CV_Assert(temp.isContinuous()); 103 104 memcpy<float>(destTensor.get(), reinterpret_cast<float*>(temp.data), destTensor.size(), stream); 105 } 106 107 /** @brief copies data from a TensorType to a cv::Mat 108 * 109 * \tparam T the type of the elements contained in TensorType object 110 * 111 * \param[in] srcTensor source tensor 112 * \param[out] destMat destination matrix 113 * \param stream CUDA stream to use for the memory transfer 114 * 115 * The entire memory block held by the \p srcTensor is copied to \p destMat. 116 * 117 * Pre-conditions: 118 * - \p destMat must contain elements of type CV_32F 119 * - the size of \p destMat must be larger than or equal to the size of \p srcTensor 120 * 121 * @note best performance when \p destMat is continuous and page-locked 122 * @note blocks calling thread if \p destMat is not page-locked 123 */ 124 template <class T> 125 void copyTensorToMat(TensorView<T> srcTensor, Mat& destMat, const Stream& stream); 126 127 template <> inline copyTensorToMat(TensorView<half> srcTensor,Mat & destMat,const Stream & stream)128 void copyTensorToMat(TensorView<half> srcTensor, Mat& destMat, const Stream& stream) { 129 CV_Assert(destMat.type() == CV_32F); 130 CV_Assert(destMat.total() >= srcTensor.size()); 131 132 Mat temp(shape(destMat), CV_16F); 133 CV_Assert(temp.isContinuous()); 134 135 memcpy<half>(reinterpret_cast<half*>(temp.data), srcTensor.get(), srcTensor.size(), stream); 136 137 temp.convertTo(destMat, CV_32F); 138 } 139 140 template <> inline copyTensorToMat(TensorView<float> srcTensor,Mat & destMat,const Stream & stream)141 void copyTensorToMat(TensorView<float> srcTensor, Mat& destMat, const Stream& stream) { 142 CV_Assert(destMat.type() == CV_32F); 143 CV_Assert(destMat.total() >= srcTensor.size()); 144 145 Mat temp = destMat.isContinuous() ? destMat : destMat.clone(); 146 CV_Assert(temp.isContinuous()); 147 148 memcpy<float>(reinterpret_cast<float*>(temp.data), srcTensor.get(), srcTensor.size(), stream); 149 150 if (temp.data != destMat.data) 151 temp.copyTo(destMat); 152 } 153 }} /* namespace cuda4dnn::csl */ 154 155 /** base class for CUDA operation nodes (for all supported targets) */ 156 class CUDABackendNode : public BackendNode { 157 public: CUDABackendNode()158 CUDABackendNode() : BackendNode(DNN_BACKEND_CUDA) { } ~CUDABackendNode()159 virtual ~CUDABackendNode() { } 160 161 virtual void forward( 162 const std::vector<cv::Ptr<BackendWrapper>>& inputs, 163 const std::vector<cv::Ptr<BackendWrapper>>& outputs, 164 cuda4dnn::csl::Workspace& workspace) = 0; 165 get_workspace_memory_in_bytes() const166 virtual std::size_t get_workspace_memory_in_bytes() const noexcept { return 0; } 167 }; 168 169 /** @brief utility function which creates CUDA node of correct type from `targetId` 170 * 171 * CUDA operation nodes take the type of data they operate on as a template parameter. 172 * For example, ConcatOp<float> is an operation node which concats tensors of `float` type 173 * into a tensor of `float` type. 174 * 175 * This utility function aids the creation of nodes of different types and eliminates the 176 * need for CUDA target constants (`DNN_TARGET_XXX`) to appear in the operation code which 177 * reduces coupling between modules. 178 * 179 * Example: 180 * template <class T> 181 * class ConcatOp : public CUDABackendNode; 182 * 183 * // returns a cv::Ptr to a ConcatOp<half> object 184 * auto node = make_cuda_node<ConcatOp>(DNN_TARGET_CUDA_FP16, axis); 185 * 186 * // returns a cv::Ptr to a ConcatOp<float> object 187 * auto node = make_cuda_node<ConcatOp>(DNN_TARGET_CUDA, axis); 188 */ 189 template <template <class> class NodeType, class ...Args> make_cuda_node(int targetId,Args &&...args)190 cv::Ptr<BackendNode> make_cuda_node(int targetId, Args&& ...args) { 191 switch (targetId) 192 { 193 case DNN_TARGET_CUDA_FP16: 194 return Ptr<BackendNode>(new NodeType<half>(std::forward<Args>(args)...)); 195 case DNN_TARGET_CUDA: 196 return Ptr<BackendNode>(new NodeType<float>(std::forward<Args>(args)...)); 197 default: 198 CV_Assert(IS_DNN_CUDA_TARGET(targetId)); 199 } 200 return Ptr<BackendNode>(); 201 } 202 203 /* base class for all CUDA backend/target wrappers */ 204 class CUDABackendWrapper : public BackendWrapper { 205 public: CUDABackendWrapper(int targetId)206 CUDABackendWrapper(int targetId) : BackendWrapper(DNN_BACKEND_CUDA, targetId) { } ~CUDABackendWrapper()207 virtual ~CUDABackendWrapper() { } 208 209 void copyToHost() override = 0; 210 virtual void copyToHostInBackground() = 0; 211 void setHostDirty() override = 0; 212 213 virtual void copyToDevice() = 0; 214 virtual void setDeviceDirty() = 0; 215 216 virtual MatShape getShape() const noexcept = 0; 217 virtual std::size_t getRank() const noexcept = 0; 218 219 /** @note setting the stream updates the stream for all wrappers which use the same tensor */ 220 virtual void setStream(cuda4dnn::csl::Stream stream, cuda4dnn::csl::Stream h2d_stream) noexcept = 0; 221 222 virtual void update(const MatShape& shape, std::size_t offset) = 0; 223 }; 224 225 namespace cuda4dnn { namespace detail { 226 227 template <class U> 228 void convert_D2H(const cv::Mat& mat, cuda4dnn::csl::View<U> view, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream); 229 230 template <> inline convert_D2H(const cv::Mat & mat,cuda4dnn::csl::View<half> view,cuda4dnn::csl::ManagedPtr<float> & device_temp,const cuda4dnn::csl::Stream & stream)231 void convert_D2H<half>(const cv::Mat& mat, cuda4dnn::csl::View<half> view, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream) { 232 if (device_temp.size() < view.size()) 233 device_temp.reset(view.size()); 234 auto temp_span = cuda4dnn::csl::Span<float>(device_temp.get(), view.size()); 235 236 cuda4dnn::kernels::fp16_to_fp32(stream, temp_span, view); 237 cuda4dnn::csl::memcpy<float>(reinterpret_cast<float*>(mat.data), temp_span.data(), view.size(), stream); 238 } 239 240 template <> inline convert_D2H(const cv::Mat & mat,cuda4dnn::csl::View<float> view,cuda4dnn::csl::ManagedPtr<float> & device_temp,const cuda4dnn::csl::Stream & stream)241 void convert_D2H<float>(const cv::Mat& mat, cuda4dnn::csl::View<float> view, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream) { 242 cuda4dnn::csl::memcpy<float>(reinterpret_cast<float*>(mat.data), view.data(), view.size(), stream); 243 } 244 245 template <class U> 246 void convert_D2H_background(const cv::Mat& mat, cuda4dnn::csl::View<U> view, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream, const cuda4dnn::csl::Stream& d2h_stream, cuda4dnn::csl::Event& d2h_event); 247 248 template <> inline convert_D2H_background(const cv::Mat & mat,cuda4dnn::csl::View<half> view,cuda4dnn::csl::ManagedPtr<float> & device_temp,const cuda4dnn::csl::Stream & stream,const cuda4dnn::csl::Stream & d2h_stream,cuda4dnn::csl::Event & d2h_event)249 void convert_D2H_background<half>(const cv::Mat& mat, cuda4dnn::csl::View<half> view, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream, const cuda4dnn::csl::Stream& d2h_stream, cuda4dnn::csl::Event& d2h_event) { 250 if (device_temp.size() < view.size()) 251 device_temp.reset(view.size()); 252 auto temp_span = cuda4dnn::csl::Span<float>(device_temp.get(), view.size()); 253 254 /* The conversion kernel should can be executed in the background stream for better 255 * performance. We do it in the inference stream to prevent an unexplained performance 256 * regression on RTX 2080 Ti. Executing conversion kernel in the background stream causes 257 * everything to slow down (even operations that appear before the background transfer). 258 * 259 * TODO: identify the cause and move conversion kernel to the background stream 260 */ 261 cuda4dnn::kernels::fp16_to_fp32(stream, temp_span, view); 262 263 d2h_event.record(stream); // mark position in inference stream 264 cuda4dnn::csl::StreamWaitOnEvent(d2h_stream, d2h_event); // don't start transfer until data is available 265 cuda4dnn::csl::memcpy<float>(reinterpret_cast<float*>(mat.data), temp_span.data(), view.size(), d2h_stream); 266 } 267 268 template <> inline convert_D2H_background(const cv::Mat & mat,cuda4dnn::csl::View<float> view,cuda4dnn::csl::ManagedPtr<float> & device_temp,const cuda4dnn::csl::Stream & stream,const cuda4dnn::csl::Stream & d2h_stream,cuda4dnn::csl::Event & d2h_event)269 void convert_D2H_background<float>(const cv::Mat& mat, cuda4dnn::csl::View<float> view, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream, const cuda4dnn::csl::Stream& d2h_stream, cuda4dnn::csl::Event& d2h_event) { 270 d2h_event.record(stream); 271 cuda4dnn::csl::StreamWaitOnEvent(d2h_stream, d2h_event); 272 cuda4dnn::csl::memcpy<float>(reinterpret_cast<float*>(mat.data), view.data(), view.size(), d2h_stream); 273 } 274 275 template <class U> 276 void convert_H2D(cuda4dnn::csl::Span<U> span, const cv::Mat& mat, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream); 277 278 template <> inline convert_H2D(cuda4dnn::csl::Span<half> span,const cv::Mat & mat,cuda4dnn::csl::ManagedPtr<float> & device_temp,const cuda4dnn::csl::Stream & stream)279 void convert_H2D<half>(cuda4dnn::csl::Span<half> span, const cv::Mat& mat, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream) { 280 if (device_temp.size() < span.size()) 281 device_temp.reset(span.size()); 282 auto temp_span = cuda4dnn::csl::Span<float>(device_temp.get(), span.size()); 283 284 cuda4dnn::csl::memcpy<float>(temp_span.data(), reinterpret_cast<float*>(mat.data), span.size(), stream); 285 cuda4dnn::kernels::fp32_to_fp16(stream, span, temp_span); 286 } 287 288 template <> inline convert_H2D(cuda4dnn::csl::Span<float> span,const cv::Mat & mat,cuda4dnn::csl::ManagedPtr<float> & device_temp,const cuda4dnn::csl::Stream & stream)289 void convert_H2D<float>(cuda4dnn::csl::Span<float> span, const cv::Mat& mat, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream) { 290 cuda4dnn::csl::memcpy<float>(span.data(), reinterpret_cast<float*>(mat.data), span.size(), stream); 291 } 292 }} /* namespace cuda4dnn::detail */ 293 294 template <class T, int TargetID> 295 class GenericCUDABackendWrapper final : public CUDABackendWrapper { 296 public: 297 using value_type = T; 298 using tensor_span_type = cuda4dnn::csl::TensorSpan<value_type>; 299 using tensor_view_type = cuda4dnn::csl::TensorView<value_type>; 300 301 /* Pre-conditions: 302 * - there must be no other instance of `GenericCUDABackendWrapper` which wraps the host memory used by `m` 303 * - the host memory must remain allocated throughout the lifetime of this object 304 * 305 * Post-conditions: 306 * - the host memory used by \p m "may" be page-locked 307 */ GenericCUDABackendWrapper(Mat & m)308 GenericCUDABackendWrapper(Mat& m) 309 : CUDABackendWrapper(TargetID) 310 { 311 shape = cv::dnn::shape(m); 312 offset = 0; 313 314 shared_block = std::make_shared<shared_block_type>(); 315 shared_block->host_dirty = true; 316 shared_block->device_dirty = false; 317 318 shared_block->host = m; 319 320 try { 321 shared_block->memGuard = cuda4dnn::csl::MemoryLockGuard(m.data, m.total() * m.elemSize()); 322 } catch (...) { 323 /* a common reason for failure is that the host system (for example, a Jetson device) does not support it */ 324 /* we ignore the failure as this is just an optimization and not a requirement */ 325 } 326 327 shared_block->device = cuda4dnn::csl::ManagedPtr<T>(m.total()); 328 } 329 GenericCUDABackendWrapper(const Ptr<BackendWrapper> & base_,const MatShape & shape_)330 GenericCUDABackendWrapper(const Ptr<BackendWrapper>& base_, const MatShape& shape_) 331 : CUDABackendWrapper(TargetID) 332 { 333 const Ptr<GenericCUDABackendWrapper> base = base_.dynamicCast<GenericCUDABackendWrapper>(); 334 CV_Assert(base); 335 336 shape = shape_; 337 offset = 0; 338 shared_block = base->shared_block; 339 340 auto numel = total(shape_); 341 if (numel > shared_block->device.size()) 342 { 343 /* if the host memory was already page-locked, release it and register again with the new size */ 344 shared_block->memGuard = cuda4dnn::csl::MemoryLockGuard(); 345 try { 346 CV_Assert(shared_block->host.type() == CV_32F); 347 shared_block->memGuard = cuda4dnn::csl::MemoryLockGuard(shared_block->host.data, numel * sizeof(float)); 348 } catch (...) { 349 /* a common reason for failure is that the host system (for example, a Jetson device) does not support it */ 350 /* we ignore the failure as this is just an optimization and not a requirement */ 351 } 352 shared_block->device.reset(numel); 353 } 354 } 355 create(Mat & m)356 static Ptr<BackendWrapper> create(Mat& m) { 357 return Ptr<BackendWrapper>(new GenericCUDABackendWrapper(m)); 358 } 359 create(const Ptr<BackendWrapper> & base,const MatShape & shape)360 static Ptr<BackendWrapper> create(const Ptr<BackendWrapper>& base, const MatShape& shape) { 361 return Ptr<BackendWrapper>(new GenericCUDABackendWrapper(base, shape)); 362 } 363 copyToHost()364 void copyToHost() override { 365 if (shared_block->device_dirty) { 366 CV_Assert(offset == 0); /* we cannot track each piece of the memory separately */ 367 368 shared_block->host_dirty = false; 369 shared_block->device_dirty = false; 370 371 /* If the wrapper is being reused, the device tensor might be larger in size than the wrapper. 372 * Using the device tensor does not give incorrect code but leads to unused region of memory being copied. 373 * 374 * We use a view to ensure that only the required region of memory is copied. 375 */ 376 auto view = tensor_view_type(shared_block->device.get(), std::begin(shape), std::end(shape)); 377 378 auto& mat = shared_block->host; 379 CV_Assert(mat.isContinuous()); 380 CV_Assert(mat.type() == CV_32F); 381 382 cuda4dnn::detail::convert_D2H<T>(mat, view, shared_block->device_temp, shared_block->stream); 383 shared_block->stream.synchronize(); 384 } else if(shared_block->d2h_event && shared_block->d2h_event.busy()) { 385 /* wait for the background copy to finish */ 386 shared_block->d2h_event.synchronize(); 387 } 388 } 389 copyToHostInBackground()390 void copyToHostInBackground() override { 391 CV_Assert(shared_block->d2h_stream); 392 if (shared_block->device_dirty) { 393 shared_block->host_dirty = false; 394 shared_block->device_dirty = false; 395 396 auto view = tensor_view_type(shared_block->device.get(), std::begin(shape), std::end(shape)); 397 398 auto& mat = shared_block->host; 399 CV_Assert(mat.isContinuous()); 400 CV_Assert(mat.type() == CV_32F); 401 402 if (!shared_block->d2h_event) 403 shared_block->d2h_event = cuda4dnn::csl::Event(true); 404 cuda4dnn::detail::convert_D2H_background<T>(mat, view, shared_block->device_temp, shared_block->stream, shared_block->d2h_stream, shared_block->d2h_event); 405 shared_block->d2h_event.record(shared_block->d2h_stream); // record position so that we can check status later 406 } 407 } 408 setHostDirty()409 void setHostDirty() override { 410 shared_block->device_dirty = false; 411 shared_block->host_dirty = true; 412 } 413 copyToDevice()414 void copyToDevice() override { 415 if (shared_block->host_dirty) { 416 CV_Assert(offset == 0); /* we cannot track each piece of the memory separately */ 417 418 shared_block->host_dirty = false; 419 shared_block->device_dirty = false; 420 421 auto span = tensor_span_type(shared_block->device.get(), std::begin(shape), std::end(shape)); 422 423 auto& mat = shared_block->host; 424 CV_Assert(mat.isContinuous()); 425 CV_Assert(mat.type() == CV_32F); 426 427 cuda4dnn::detail::convert_H2D<T>(span, mat, shared_block->device_temp, shared_block->stream); 428 } 429 } 430 setDeviceDirty()431 void setDeviceDirty() override { 432 shared_block->device_dirty = true; 433 shared_block->host_dirty = false; 434 } 435 getShape() const436 MatShape getShape() const noexcept override { return shape; } 437 getRank() const438 std::size_t getRank() const noexcept override { return shape.size(); } 439 setStream(cuda4dnn::csl::Stream stream,cuda4dnn::csl::Stream d2h_stream)440 void setStream(cuda4dnn::csl::Stream stream, cuda4dnn::csl::Stream d2h_stream) noexcept override { 441 shared_block->stream = std::move(stream); 442 shared_block->d2h_stream = std::move(d2h_stream); 443 } 444 update(const MatShape & shape_,std::size_t offset_)445 void update(const MatShape& shape_, std::size_t offset_) override { 446 auto total = std::accumulate(std::begin(shape_), std::end(shape_), 1, std::multiplies<MatShape::value_type>()); 447 if (offset_ + total > shared_block->device.size()) { 448 CV_Error(Error::BadOffset, "shape and offset provided can potentially leads to OOB access"); 449 } 450 shape = shape_; 451 offset = offset_; 452 } 453 getMutableHostMat()454 cv::Mat getMutableHostMat() noexcept { 455 CV_Assert(offset == 0); /* we cannot track each piece of the memory separately */ 456 copyToHost(); 457 setHostDirty(); 458 return shared_block->host; 459 } 460 getImmutableHostMat() const461 const cv::Mat getImmutableHostMat() const noexcept { 462 CV_Assert(offset == 0); /* we cannot track each piece of the memory separately */ 463 copyToHost(); 464 return shared_block->host; 465 } 466 467 /* Optimization Note: use getSpan() and getView() judiciously 468 * 469 * getSpan() is meant to be used when the memory is going to be modified 470 * getView() is meant to be used when the memory is only going to be read 471 * 472 * getSpan() marks the device memory as dirty but getView() does not 473 * 474 * getView() implicitly performs host to device memory transfer if required 475 * getSpan() does not perform any synchronization (use copyToDevice if sync. is required) 476 */ getSpan()477 tensor_span_type getSpan() noexcept { 478 setDeviceDirty(); 479 return tensor_span_type(shared_block->device.get() + offset, std::begin(shape), std::end(shape)); 480 } 481 getView()482 tensor_view_type getView() noexcept { 483 copyToDevice(); 484 return tensor_view_type(shared_block->device.get() + offset, std::begin(shape), std::end(shape)); 485 } 486 487 private: 488 /* The same tensor memory can be reused by different layers whenever possible. 489 * Hence, it is possible for different backend wrappers to point to the same memory. 490 * However, it may use only a part of that memory and have a different shape. 491 * 492 * We store the common information such as device tensor and its corresponding host memory in 493 * a shared block. The shared block is shared by all backend wrappers which use the same memory. 494 * The shape, which can be different for different wrappers, is stored as a member object. 495 */ 496 497 MatShape shape; 498 std::size_t offset; 499 500 struct shared_block_type { 501 bool host_dirty; 502 bool device_dirty; 503 504 cv::Mat host; 505 cuda4dnn::csl::MemoryLockGuard memGuard; /* keeps host memory page-locked if possible */ 506 507 cuda4dnn::csl::ManagedPtr<T> device; 508 cuda4dnn::csl::ManagedPtr<float> device_temp; /* use for conversions */ 509 cuda4dnn::csl::Stream stream; 510 511 cuda4dnn::csl::Event d2h_event; 512 cuda4dnn::csl::Stream d2h_stream; 513 }; 514 515 std::shared_ptr<shared_block_type> shared_block; 516 }; 517 518 using CUDABackendWrapperFP16 = GenericCUDABackendWrapper<half, DNN_TARGET_CUDA_FP16>; 519 using CUDABackendWrapperFP32 = GenericCUDABackendWrapper<float, DNN_TARGET_CUDA>; 520 521 template <class T> struct GetCUDABackendWrapperType_ { }; 522 template <> struct GetCUDABackendWrapperType_<half> { typedef CUDABackendWrapperFP16 type; }; 523 template <> struct GetCUDABackendWrapperType_<float> { typedef CUDABackendWrapperFP32 type; }; 524 525 template <class T> 526 using GetCUDABackendWrapperType = typename GetCUDABackendWrapperType_<T>::type; 527 528 #endif 529 }} /* namespace cv::dnn */ 530 531 #endif /* OPENCV_DNN_SRC_OP_CUDA_HPP */ 532