// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_OP_CUDA_HPP
#define OPENCV_DNN_SRC_OP_CUDA_HPP

#ifdef HAVE_CUDA
#include "cuda4dnn/csl/stream.hpp"
#include "cuda4dnn/csl/event.hpp"
#include "cuda4dnn/csl/cublas.hpp"
#include "cuda4dnn/csl/cudnn.hpp"
#include "cuda4dnn/csl/tensor.hpp"
#include "cuda4dnn/csl/memory.hpp"
#include "cuda4dnn/csl/workspace.hpp"
#include "cuda4dnn/kernels/fp_conversion.hpp"
#endif

#include <opencv2/dnn/shape_utils.hpp>
#include <opencv2/core.hpp>

#include <cstddef>
#include <memory>
#include <iterator>
#include <functional> // std::multiplies
#include <numeric>    // std::accumulate
#include <utility>    // std::forward, std::move

namespace cv { namespace dnn {

    constexpr bool IS_DNN_CUDA_TARGET(int id) {
        return id == DNN_TARGET_CUDA_FP16 || id == DNN_TARGET_CUDA;
    }

    constexpr bool haveCUDA() {
#ifdef HAVE_CUDA
        return true;
#else
        return false;
#endif
    }

#ifdef HAVE_CUDA
    namespace cuda4dnn { namespace csl {
        struct CSLContext {
            Stream stream;
            cublas::Handle cublas_handle;
            cudnn::Handle cudnn_handle;
        };

        /** @brief creates a Tensor object from a cv::Mat (only the header is created, i.e. no data is copied)
         *
         * \tparam      T   element type for the tensor
         * \param[in]   mat cv::Mat from which the shape must be inferred
         *
         * \return a Tensor object with the shape of \p mat
         */
        template <class T>
        Tensor<T> makeTensorHeader(const Mat& mat) {
            auto sizes = shape(mat);
            return Tensor<T>(std::begin(sizes), std::end(sizes));
        }
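
        /* Usage sketch (illustrative only, not part of the original documentation).
         * Assumes an existing CSL stream `stream` and a continuous CV_32F host matrix
         * `blob`; both names are placeholders:
         *
         *     auto tensor = makeTensorHeader<float>(blob);  // shape taken from `blob`, no data copied yet
         *     copyMatToTensor<float>(blob, tensor, stream); // explicit host-to-device copy (declared below)
         */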

        /** @brief copies data from a cv::Mat to a TensorType
         *
         * \tparam  T   the type of the elements contained in the TensorType object
         *
         * \param[in]   srcMat      source matrix
         * \param[out]  destTensor  destination tensor
         * \param       stream      CUDA stream to use for the memory transfer
         *
         * The memory copy starts from the beginning of \p srcMat. The number of elements copied is
         * equal to the number of elements in \p destTensor.
         *
         * Pre-conditions:
         * - \p srcMat must contain elements of type CV_32F
         * - the size of \p srcMat must be larger than or equal to the size of \p destTensor
         *
         * @note best performance when \p srcMat is continuous and page-locked
         * @note blocks the calling thread if \p srcMat is not page-locked
         */
        template <class T>
        void copyMatToTensor(const Mat& srcMat, const TensorSpan<T> destTensor, const Stream& stream);

        template <> inline
        void copyMatToTensor(const Mat& srcMat, const TensorSpan<half> destTensor, const Stream& stream) {
            /* TODO: perhaps convert a cv::Mat of a different type to the required type and then copy */
            CV_Assert(srcMat.type() == CV_32F);
            CV_Assert(srcMat.total() >= destTensor.size());

            Mat temp;
            srcMat.convertTo(temp, CV_16F);
            CV_Assert(temp.isContinuous());

            memcpy<half>(destTensor.get(), reinterpret_cast<half*>(temp.data), destTensor.size(), stream);
        }

        template <> inline
        void copyMatToTensor(const Mat& srcMat, const TensorSpan<float> destTensor, const Stream& stream) {
            /* TODO: perhaps convert a cv::Mat of a different type to the required type and then copy */
            CV_Assert(srcMat.type() == CV_32F);
            CV_Assert(srcMat.total() >= destTensor.size());

            Mat temp = srcMat.isContinuous() ? srcMat : srcMat.clone();
            CV_Assert(temp.isContinuous());

            memcpy<float>(destTensor.get(), reinterpret_cast<float*>(temp.data), destTensor.size(), stream);
        }

        /** @brief copies data from a TensorType to a cv::Mat
         *
         * \tparam  T   the type of the elements contained in the TensorType object
         *
         * \param[in]   srcTensor   source tensor
         * \param[out]  destMat     destination matrix
         * \param       stream      CUDA stream to use for the memory transfer
         *
         * The entire memory block held by the \p srcTensor is copied to \p destMat.
         *
         * Pre-conditions:
         * - \p destMat must contain elements of type CV_32F
         * - the size of \p destMat must be larger than or equal to the size of \p srcTensor
         *
         * @note best performance when \p destMat is continuous and page-locked
         * @note blocks the calling thread if \p destMat is not page-locked
         */
        template <class T>
        void copyTensorToMat(TensorView<T> srcTensor, Mat& destMat, const Stream& stream);

        template <> inline
        void copyTensorToMat(TensorView<half> srcTensor, Mat& destMat, const Stream& stream) {
            CV_Assert(destMat.type() == CV_32F);
            CV_Assert(destMat.total() >= srcTensor.size());

            Mat temp(shape(destMat), CV_16F);
            CV_Assert(temp.isContinuous());

            memcpy<half>(reinterpret_cast<half*>(temp.data), srcTensor.get(), srcTensor.size(), stream);

            temp.convertTo(destMat, CV_32F);
        }

        template <> inline
        void copyTensorToMat(TensorView<float> srcTensor, Mat& destMat, const Stream& stream) {
            CV_Assert(destMat.type() == CV_32F);
            CV_Assert(destMat.total() >= srcTensor.size());

            Mat temp = destMat.isContinuous() ? destMat : destMat.clone();
            CV_Assert(temp.isContinuous());

            memcpy<float>(reinterpret_cast<float*>(temp.data), srcTensor.get(), srcTensor.size(), stream);

            if (temp.data != destMat.data)
                temp.copyTo(destMat);
        }
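
        /* Usage sketch (illustrative only). Assumes `tensor` holds the device result
         * and `result` is a CV_32F Mat with at least as many elements; the copy is
         * enqueued on `stream`, so synchronize before reading `result` on the host:
         *
         *     copyTensorToMat<float>(tensor, result, stream);
         *     stream.synchronize();
         */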
    }} /* namespace cuda4dnn::csl */

    /** base class for CUDA operation nodes (for all supported targets) */
    class CUDABackendNode : public BackendNode {
    public:
        CUDABackendNode() : BackendNode(DNN_BACKEND_CUDA) { }
        virtual ~CUDABackendNode() { }

        virtual void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            cuda4dnn::csl::Workspace& workspace) = 0;

        virtual std::size_t get_workspace_memory_in_bytes() const noexcept { return 0; }
    };
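
    /* A minimal sketch of a concrete node (illustrative only; `ExampleOp` is a
     * hypothetical name and is not defined anywhere in this codebase):
     *
     *     template <class T>
     *     class ExampleOp final : public CUDABackendNode {
     *     public:
     *         void forward(
     *             const std::vector<cv::Ptr<BackendWrapper>>& inputs,
     *             const std::vector<cv::Ptr<BackendWrapper>>& outputs,
     *             cuda4dnn::csl::Workspace& workspace) override
     *         {
     *             // obtain spans/views from the wrappers and enqueue kernels here
     *         }
     *     };
     */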

    /** @brief utility function which creates a CUDA node of the correct type from `targetId`
     *
     * CUDA operation nodes take the type of data they operate on as a template parameter.
     * For example, ConcatOp<float> is an operation node which concatenates tensors of `float` type
     * into a tensor of `float` type.
     *
     * This utility function aids the creation of nodes of different types and eliminates the
     * need for CUDA target constants (`DNN_TARGET_XXX`) to appear in the operation code, which
     * reduces coupling between modules.
     *
     * Example:
     * template <class T>
     * class ConcatOp : public CUDABackendNode;
     *
     * // returns a cv::Ptr to a ConcatOp<half> object
     * auto node = make_cuda_node<ConcatOp>(DNN_TARGET_CUDA_FP16, axis);
     *
     * // returns a cv::Ptr to a ConcatOp<float> object
     * auto node = make_cuda_node<ConcatOp>(DNN_TARGET_CUDA, axis);
     */
    template <template <class> class NodeType, class ...Args>
    cv::Ptr<BackendNode> make_cuda_node(int targetId, Args&& ...args) {
        switch (targetId)
        {
        case DNN_TARGET_CUDA_FP16:
            return Ptr<BackendNode>(new NodeType<half>(std::forward<Args>(args)...));
        case DNN_TARGET_CUDA:
            return Ptr<BackendNode>(new NodeType<float>(std::forward<Args>(args)...));
        default:
            CV_Assert(IS_DNN_CUDA_TARGET(targetId));
        }
        return Ptr<BackendNode>();
    }

    /* base class for all CUDA backend/target wrappers */
    class CUDABackendWrapper : public BackendWrapper {
    public:
        CUDABackendWrapper(int targetId) : BackendWrapper(DNN_BACKEND_CUDA, targetId) { }
        virtual ~CUDABackendWrapper() { }

        void copyToHost() override = 0;
        virtual void copyToHostInBackground() = 0;
        void setHostDirty() override = 0;

        virtual void copyToDevice() = 0;
        virtual void setDeviceDirty() = 0;

        virtual MatShape getShape() const noexcept = 0;
        virtual std::size_t getRank() const noexcept = 0;

        /** @note setting the stream updates the stream for all wrappers which use the same tensor */
        virtual void setStream(cuda4dnn::csl::Stream stream, cuda4dnn::csl::Stream h2d_stream) noexcept = 0;

        virtual void update(const MatShape& shape, std::size_t offset) = 0;
    };

    namespace cuda4dnn { namespace detail {

        template <class U>
        void convert_D2H(const cv::Mat& mat, cuda4dnn::csl::View<U> view, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream);

        template <> inline
        void convert_D2H<half>(const cv::Mat& mat, cuda4dnn::csl::View<half> view, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream) {
            if (device_temp.size() < view.size())
                device_temp.reset(view.size());
            auto temp_span = cuda4dnn::csl::Span<float>(device_temp.get(), view.size());

            cuda4dnn::kernels::fp16_to_fp32(stream, temp_span, view);
            cuda4dnn::csl::memcpy<float>(reinterpret_cast<float*>(mat.data), temp_span.data(), view.size(), stream);
        }

        template <> inline
        void convert_D2H<float>(const cv::Mat& mat, cuda4dnn::csl::View<float> view, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream) {
            cuda4dnn::csl::memcpy<float>(reinterpret_cast<float*>(mat.data), view.data(), view.size(), stream);
        }

        template <class U>
        void convert_D2H_background(const cv::Mat& mat, cuda4dnn::csl::View<U> view, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream, const cuda4dnn::csl::Stream& d2h_stream, cuda4dnn::csl::Event& d2h_event);

        template <> inline
        void convert_D2H_background<half>(const cv::Mat& mat, cuda4dnn::csl::View<half> view, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream, const cuda4dnn::csl::Stream& d2h_stream, cuda4dnn::csl::Event& d2h_event) {
            if (device_temp.size() < view.size())
                device_temp.reset(view.size());
            auto temp_span = cuda4dnn::csl::Span<float>(device_temp.get(), view.size());

            /* The conversion kernel could be executed in the background stream for better
             * performance, but we run it in the inference stream to prevent an unexplained
             * performance regression on RTX 2080 Ti. Executing the conversion kernel in the
             * background stream causes everything to slow down (even operations that appear
             * before the background transfer).
             *
             * TODO: identify the cause and move the conversion kernel to the background stream
             */
            cuda4dnn::kernels::fp16_to_fp32(stream, temp_span, view);

            d2h_event.record(stream); // mark position in inference stream
            cuda4dnn::csl::StreamWaitOnEvent(d2h_stream, d2h_event); // don't start transfer until data is available
            cuda4dnn::csl::memcpy<float>(reinterpret_cast<float*>(mat.data), temp_span.data(), view.size(), d2h_stream);
        }

        template <> inline
        void convert_D2H_background<float>(const cv::Mat& mat, cuda4dnn::csl::View<float> view, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream, const cuda4dnn::csl::Stream& d2h_stream, cuda4dnn::csl::Event& d2h_event) {
            d2h_event.record(stream);
            cuda4dnn::csl::StreamWaitOnEvent(d2h_stream, d2h_event);
            cuda4dnn::csl::memcpy<float>(reinterpret_cast<float*>(mat.data), view.data(), view.size(), d2h_stream);
        }

        template <class U>
        void convert_H2D(cuda4dnn::csl::Span<U> span, const cv::Mat& mat, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream);

        template <> inline
        void convert_H2D<half>(cuda4dnn::csl::Span<half> span, const cv::Mat& mat, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream) {
            if (device_temp.size() < span.size())
                device_temp.reset(span.size());
            auto temp_span = cuda4dnn::csl::Span<float>(device_temp.get(), span.size());

            cuda4dnn::csl::memcpy<float>(temp_span.data(), reinterpret_cast<float*>(mat.data), span.size(), stream);
            cuda4dnn::kernels::fp32_to_fp16(stream, span, temp_span);
        }

        template <> inline
        void convert_H2D<float>(cuda4dnn::csl::Span<float> span, const cv::Mat& mat, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream) {
            cuda4dnn::csl::memcpy<float>(span.data(), reinterpret_cast<float*>(mat.data), span.size(), stream);
        }
    }} /* namespace cuda4dnn::detail */

    template <class T, int TargetID>
    class GenericCUDABackendWrapper final : public CUDABackendWrapper {
    public:
        using value_type = T;
        using tensor_span_type = cuda4dnn::csl::TensorSpan<value_type>;
        using tensor_view_type = cuda4dnn::csl::TensorView<value_type>;

        /* Pre-conditions:
         * - there must be no other instance of `GenericCUDABackendWrapper` which wraps the host memory used by `m`
         * - the host memory must remain allocated throughout the lifetime of this object
         *
         * Post-conditions:
         * - the host memory used by \p m "may" be page-locked
         */
        GenericCUDABackendWrapper(Mat& m)
            : CUDABackendWrapper(TargetID)
        {
            shape = cv::dnn::shape(m);
            offset = 0;

            shared_block = std::make_shared<shared_block_type>();
            shared_block->host_dirty = true;
            shared_block->device_dirty = false;

            shared_block->host = m;

            try {
                shared_block->memGuard = cuda4dnn::csl::MemoryLockGuard(m.data, m.total() * m.elemSize());
            } catch (...) {
                /* a common reason for failure is that the host system (for example, a Jetson device) does not support it */
                /* we ignore the failure as this is just an optimization and not a requirement */
            }

            shared_block->device = cuda4dnn::csl::ManagedPtr<T>(m.total());
        }

        GenericCUDABackendWrapper(const Ptr<BackendWrapper>& base_, const MatShape& shape_)
            : CUDABackendWrapper(TargetID)
        {
            const Ptr<GenericCUDABackendWrapper> base = base_.dynamicCast<GenericCUDABackendWrapper>();
            CV_Assert(base);

            shape = shape_;
            offset = 0;
            shared_block = base->shared_block;

            auto numel = total(shape_);
            if (numel > shared_block->device.size())
            {
                /* if the host memory was already page-locked, release it and register again with the new size */
                shared_block->memGuard = cuda4dnn::csl::MemoryLockGuard();
                try {
                    CV_Assert(shared_block->host.type() == CV_32F);
                    shared_block->memGuard = cuda4dnn::csl::MemoryLockGuard(shared_block->host.data, numel * sizeof(float));
                } catch (...) {
                    /* a common reason for failure is that the host system (for example, a Jetson device) does not support it */
                    /* we ignore the failure as this is just an optimization and not a requirement */
                }
                shared_block->device.reset(numel);
            }
        }

        static Ptr<BackendWrapper> create(Mat& m) {
            return Ptr<BackendWrapper>(new GenericCUDABackendWrapper(m));
        }

        static Ptr<BackendWrapper> create(const Ptr<BackendWrapper>& base, const MatShape& shape) {
            return Ptr<BackendWrapper>(new GenericCUDABackendWrapper(base, shape));
        }

        void copyToHost() override {
            if (shared_block->device_dirty) {
                CV_Assert(offset == 0); /* we cannot track each piece of the memory separately */

                shared_block->host_dirty = false;
                shared_block->device_dirty = false;

                /* If the wrapper is being reused, the device tensor might be larger in size than the wrapper.
                 * Using the device tensor does not give incorrect code but leads to an unused region of memory being copied.
                 *
                 * We use a view to ensure that only the required region of memory is copied.
                 */
                auto view = tensor_view_type(shared_block->device.get(), std::begin(shape), std::end(shape));

                auto& mat = shared_block->host;
                CV_Assert(mat.isContinuous());
                CV_Assert(mat.type() == CV_32F);

                cuda4dnn::detail::convert_D2H<T>(mat, view, shared_block->device_temp, shared_block->stream);
                shared_block->stream.synchronize();
            } else if (shared_block->d2h_event && shared_block->d2h_event.busy()) {
                /* wait for the background copy to finish */
                shared_block->d2h_event.synchronize();
            }
        }

        void copyToHostInBackground() override {
            CV_Assert(shared_block->d2h_stream);
            if (shared_block->device_dirty) {
                shared_block->host_dirty = false;
                shared_block->device_dirty = false;

                auto view = tensor_view_type(shared_block->device.get(), std::begin(shape), std::end(shape));

                auto& mat = shared_block->host;
                CV_Assert(mat.isContinuous());
                CV_Assert(mat.type() == CV_32F);

                if (!shared_block->d2h_event)
                    shared_block->d2h_event = cuda4dnn::csl::Event(true);
                cuda4dnn::detail::convert_D2H_background<T>(mat, view, shared_block->device_temp, shared_block->stream, shared_block->d2h_stream, shared_block->d2h_event);
                shared_block->d2h_event.record(shared_block->d2h_stream); // record position so that we can check status later
            }
        }

        void setHostDirty() override {
            shared_block->device_dirty = false;
            shared_block->host_dirty = true;
        }

        void copyToDevice() override {
            if (shared_block->host_dirty) {
                CV_Assert(offset == 0); /* we cannot track each piece of the memory separately */

                shared_block->host_dirty = false;
                shared_block->device_dirty = false;

                auto span = tensor_span_type(shared_block->device.get(), std::begin(shape), std::end(shape));

                auto& mat = shared_block->host;
                CV_Assert(mat.isContinuous());
                CV_Assert(mat.type() == CV_32F);

                cuda4dnn::detail::convert_H2D<T>(span, mat, shared_block->device_temp, shared_block->stream);
            }
        }

        void setDeviceDirty() override {
            shared_block->device_dirty = true;
            shared_block->host_dirty = false;
        }

        MatShape getShape() const noexcept override { return shape; }

        std::size_t getRank() const noexcept override { return shape.size(); }

        void setStream(cuda4dnn::csl::Stream stream, cuda4dnn::csl::Stream d2h_stream) noexcept override {
            shared_block->stream = std::move(stream);
            shared_block->d2h_stream = std::move(d2h_stream);
        }

        void update(const MatShape& shape_, std::size_t offset_) override {
            auto total = std::accumulate(std::begin(shape_), std::end(shape_), 1, std::multiplies<MatShape::value_type>());
            if (offset_ + total > shared_block->device.size()) {
                CV_Error(Error::BadOffset, "the provided shape and offset can potentially lead to out-of-bounds access");
            }
            shape = shape_;
            offset = offset_;
        }

        cv::Mat getMutableHostMat() noexcept {
            CV_Assert(offset == 0); /* we cannot track each piece of the memory separately */
            copyToHost();
            setHostDirty();
            return shared_block->host;
        }

        const cv::Mat getImmutableHostMat() const noexcept {
            CV_Assert(offset == 0); /* we cannot track each piece of the memory separately */
            /* copyToHost() is not const; cast away constness of `this` for the host synchronization only */
            const_cast<GenericCUDABackendWrapper*>(this)->copyToHost();
            return shared_block->host;
        }

        /* Optimization Note: use getSpan() and getView() judiciously
         *
         * getSpan() is meant to be used when the memory is going to be modified
         * getView() is meant to be used when the memory is only going to be read
         *
         * getSpan() marks the device memory as dirty but getView() does not
         *
         * getView() implicitly performs host to device memory transfer if required
         * getSpan() does not perform any synchronization (use copyToDevice if sync. is required)
         */
        tensor_span_type getSpan() noexcept {
            setDeviceDirty();
            return tensor_span_type(shared_block->device.get() + offset, std::begin(shape), std::end(shape));
        }

        tensor_view_type getView() noexcept {
            copyToDevice();
            return tensor_view_type(shared_block->device.get() + offset, std::begin(shape), std::end(shape));
        }
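
        /* Usage sketch inside a node's forward() (illustrative only; the wrapper and
         * tensor names are placeholders, and the wrappers must have been created for
         * the same value_type as the node):
         *
         *     auto input_wrapper  = inputs[0].dynamicCast<GetCUDABackendWrapperType<T>>();
         *     auto output_wrapper = outputs[0].dynamicCast<GetCUDABackendWrapperType<T>>();
         *
         *     auto input  = input_wrapper->getView();   // read-only; triggers host-to-device copy if needed
         *     auto output = output_wrapper->getSpan();  // writable; marks device memory as dirty
         */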

    private:
        /* The same tensor memory can be reused by different layers whenever possible.
         * Hence, it is possible for different backend wrappers to point to the same memory.
         * However, a wrapper may use only a part of that memory and have a different shape.
         *
         * We store the common information, such as the device tensor and its corresponding host memory,
         * in a shared block. The shared block is shared by all backend wrappers which use the same memory.
         * The shape, which can be different for different wrappers, is stored as a member object.
         */

        MatShape shape;
        std::size_t offset;

        struct shared_block_type {
            bool host_dirty;
            bool device_dirty;

            cv::Mat host;
            cuda4dnn::csl::MemoryLockGuard memGuard; /* keeps host memory page-locked if possible */

            cuda4dnn::csl::ManagedPtr<T> device;
            cuda4dnn::csl::ManagedPtr<float> device_temp; /* used for conversions */
            cuda4dnn::csl::Stream stream;

            cuda4dnn::csl::Event d2h_event;
            cuda4dnn::csl::Stream d2h_stream;
        };

        std::shared_ptr<shared_block_type> shared_block;
    };

    using CUDABackendWrapperFP16 = GenericCUDABackendWrapper<half, DNN_TARGET_CUDA_FP16>;
    using CUDABackendWrapperFP32 = GenericCUDABackendWrapper<float, DNN_TARGET_CUDA>;

    template <class T> struct GetCUDABackendWrapperType_ { };
    template <> struct GetCUDABackendWrapperType_<half> { typedef CUDABackendWrapperFP16 type; };
    template <> struct GetCUDABackendWrapperType_<float> { typedef CUDABackendWrapperFP32 type; };

    template <class T>
    using GetCUDABackendWrapperType = typename GetCUDABackendWrapperType_<T>::type;

#endif
}} /* namespace cv::dnn */

#endif /* OPENCV_DNN_SRC_OP_CUDA_HPP */