// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_POINTER_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_POINTER_HPP

#include "nvcc_defs.hpp"
#include "error.hpp"
#include "stream.hpp"

#include <opencv2/core.hpp>

#include <cuda_runtime_api.h>

#include <cstddef>
#include <cstdint>
#include <type_traits>
#include <ostream>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
    /** @brief provides a type-safe device pointer
     *
     * DevicePtr wraps a raw pointer and mimics its behaviour. It does not implicitly convert
     * to a raw pointer. This ensures that accidental mixing of host and device pointers does not happen.
     *
     * It is meant to point to locations in device memory. Hence, it provides dereferencing and
     * array subscript capability for device code only.
     *
     * A `const DevicePtr<T>` represents an immutable pointer to mutable memory.
     * A `DevicePtr<const T>` represents a mutable pointer to immutable memory.
     * A `const DevicePtr<const T>` represents an immutable pointer to immutable memory.
     *
     * A `DevicePtr<T>` can implicitly convert to `DevicePtr<const T>`.
     *
     * Specializations:
     * - DevicePtr<void>/DevicePtr<const void> do not support pointer arithmetic (but relational operators are provided)
     * - any device pointer pointing to mutable memory is implicitly convertible to DevicePtr<void>
     * - any device pointer is implicitly convertible to DevicePtr<const void>
     * - DevicePtr<void> can be explicitly converted to any device pointer
     * - DevicePtr<const void> can be explicitly converted to any device pointer pointing to immutable memory
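     *
     * A minimal usage sketch (illustrative only; the raw device allocation is an assumption,
     * not something provided by this header):
     *
     *      void* raw = nullptr;
     *      cudaMalloc(&raw, 16 * sizeof(float));              // assumed device allocation
     *      DevicePtr<float> ptr{static_cast<float*>(raw)};    // explicit wrap of a raw device pointer
     *      DevicePtr<const float> cptr = ptr;                 // mutable -> immutable (implicit)
     *      DevicePtr<void> vptr = ptr;                        // mutable -> DevicePtr<void> (implicit)
     *      auto typed = static_cast<DevicePtr<float>>(vptr);  // DevicePtr<void> -> typed (explicit)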
     */
    template <class T>
    class DevicePtr {
        static_assert(std::is_standard_layout<T>::value, "T must satisfy StandardLayoutType");

    public:
        using element_type = T;
        using difference_type = std::ptrdiff_t;
        using pointer = typename std::add_pointer<element_type>::type;
        using reference = typename std::add_lvalue_reference<element_type>::type;

        DevicePtr() = default;
        CUDA4DNN_HOST_DEVICE explicit DevicePtr(pointer ptr_) noexcept : ptr{ ptr_ } { }

        CUDA4DNN_HOST_DEVICE DevicePtr operator=(pointer ptr_) noexcept { ptr = ptr_; return *this; }

        CUDA4DNN_HOST_DEVICE pointer get() const noexcept { return ptr; }

        CUDA4DNN_DEVICE reference operator[](difference_type idx) const noexcept { return get()[idx]; }
        CUDA4DNN_DEVICE reference operator*() const noexcept { return *get(); }
        CUDA4DNN_DEVICE pointer operator->() const noexcept { return get(); }

        template<class U = T, typename std::enable_if<!std::is_const<U>::value, bool>::type = true>
        CUDA4DNN_HOST_DEVICE operator DevicePtr<typename std::add_const<U>::type>() const noexcept {
            return DevicePtr<typename std::add_const<U>::type>{ptr};
        }

        CUDA4DNN_HOST_DEVICE explicit operator bool() const noexcept { return ptr; }

        CUDA4DNN_HOST_DEVICE DevicePtr operator++() noexcept {
            ++ptr;
            return *this;
        }

        CUDA4DNN_HOST_DEVICE DevicePtr operator++(int) noexcept {
            auto tmp = DevicePtr(*this);
            ptr++;
            return tmp;
        }

        CUDA4DNN_HOST_DEVICE DevicePtr operator--() noexcept {
            --ptr;
            return *this;
        }

        CUDA4DNN_HOST_DEVICE DevicePtr operator--(int) noexcept {
            auto tmp = DevicePtr(*this);
            ptr--;
            return tmp;
        }

        CUDA4DNN_HOST_DEVICE DevicePtr operator+=(std::ptrdiff_t offset) noexcept {
            ptr += offset;
            return *this;
        }

        CUDA4DNN_HOST_DEVICE DevicePtr operator-=(std::ptrdiff_t offset) noexcept {
            ptr -= offset;
            return *this;
        }

        CUDA4DNN_HOST_DEVICE friend DevicePtr operator+(DevicePtr lhs, std::ptrdiff_t offset) noexcept {
            return lhs += offset;
        }

        CUDA4DNN_HOST_DEVICE friend DevicePtr operator-(DevicePtr lhs, std::ptrdiff_t offset) noexcept {
            return lhs -= offset;
        }

        CUDA4DNN_HOST_DEVICE friend difference_type operator-(DevicePtr lhs, DevicePtr rhs) noexcept {
            return lhs.ptr - rhs.ptr;
        }

        CUDA4DNN_HOST_DEVICE friend bool operator==(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr == rhs.ptr; }
        CUDA4DNN_HOST_DEVICE friend bool operator!=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs == rhs); }
        CUDA4DNN_HOST_DEVICE friend bool operator<(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr < rhs.ptr; }
        CUDA4DNN_HOST_DEVICE friend bool operator>(DevicePtr lhs, DevicePtr rhs) noexcept { return rhs < lhs; }
        CUDA4DNN_HOST_DEVICE friend bool operator<=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(rhs < lhs); }
        CUDA4DNN_HOST_DEVICE friend bool operator>=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs < rhs); }

        CUDA4DNN_HOST_DEVICE explicit operator pointer() const noexcept { return ptr; }

        CUDA4DNN_HOST friend void swap(DevicePtr& lhs, DevicePtr& rhs) noexcept {
            using std::swap;
            swap(lhs.ptr, rhs.ptr);
        }

        template <class U, class V>
        CUDA4DNN_HOST friend std::basic_ostream<U, V>& operator<<(std::basic_ostream<U, V>& os, DevicePtr other) {
            os << other.get() << " (device)";
            return os;
        }

    private:
        pointer ptr;
    };
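
    /* Dereferencing and subscripting are compiled for device code only. A minimal sketch of
     * how a kernel might consume a DevicePtr (the kernel and its launch are illustrative
     * assumptions, not part of this header):
     *
     *      __global__ void scale(DevicePtr<float> data, std::size_t n, float alpha) {
     *          auto i = blockIdx.x * blockDim.x + threadIdx.x;
     *          if (i < n)
     *              data[i] *= alpha;   // operator[] is available in device code
     *      }
     *
     *      // host side: scale<<<blocks, threads>>>(device_ptr, n, 2.0f);
     */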

    template <>
    class DevicePtr<const void> {
    public:
        using element_type = const void;
        using pointer = typename std::add_pointer<element_type>::type;

        DevicePtr() = default;

        /* host const void pointer to const void device pointer */
        CUDA4DNN_HOST_DEVICE explicit DevicePtr(pointer ptr_) noexcept : ptr{ ptr_ } { }

        /* allow any device pointer to be implicitly converted to const void device pointer */
        template <class T>
        CUDA4DNN_HOST_DEVICE DevicePtr(DevicePtr<T> ptr_) noexcept : ptr{ ptr_.get() } { }

        CUDA4DNN_HOST_DEVICE DevicePtr operator=(pointer ptr_) noexcept { ptr = ptr_; return *this; }

        CUDA4DNN_HOST_DEVICE pointer get() const noexcept { return ptr; }

        CUDA4DNN_HOST_DEVICE explicit operator bool() const noexcept { return ptr; }

        CUDA4DNN_HOST_DEVICE friend bool operator==(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr == rhs.ptr; }
        CUDA4DNN_HOST_DEVICE friend bool operator!=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs == rhs); }
        CUDA4DNN_HOST_DEVICE friend bool operator<(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr < rhs.ptr; }
        CUDA4DNN_HOST_DEVICE friend bool operator>(DevicePtr lhs, DevicePtr rhs) noexcept { return rhs < lhs; }
        CUDA4DNN_HOST_DEVICE friend bool operator<=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(rhs < lhs); }
        CUDA4DNN_HOST_DEVICE friend bool operator>=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs < rhs); }

        /* explicit conversion into host void pointer */
        CUDA4DNN_HOST_DEVICE explicit operator pointer() const noexcept { return ptr; }

        /* const void device pointer can be explicitly cast into any const device pointer type */
        template <class T, typename std::enable_if<std::is_const<T>::value, bool>::type = true>
        CUDA4DNN_HOST_DEVICE explicit operator DevicePtr<T>() const noexcept {
            return DevicePtr<T>(static_cast<T*>(ptr));
        }

        CUDA4DNN_HOST friend void swap(DevicePtr& lhs, DevicePtr& rhs) noexcept {
            using std::swap;
            swap(lhs.ptr, rhs.ptr);
        }

        template <class U, class V>
        CUDA4DNN_HOST friend std::basic_ostream<U, V>& operator<<(std::basic_ostream<U, V>& os, DevicePtr other) {
            os << other.get() << " (device)";
            return os;
        }

    private:
        pointer ptr;
    };

    template <>
    class DevicePtr<void> {
    public:
        using element_type = void;
        using pointer = typename std::add_pointer<element_type>::type;

        DevicePtr() = default;

        /* host pointer to device pointer */
        CUDA4DNN_HOST_DEVICE explicit DevicePtr(pointer ptr_) noexcept : ptr{ ptr_ } { }

        /* allow any device pointer to mutable memory to be implicitly converted to void device pointer */
        template <class T, typename std::enable_if<!std::is_const<T>::value, bool>::type = false>
        CUDA4DNN_HOST_DEVICE DevicePtr(DevicePtr<T> ptr_) noexcept : ptr { ptr_.get() } { }

        CUDA4DNN_HOST_DEVICE DevicePtr operator=(pointer ptr_) noexcept { ptr = ptr_; return *this; }

        CUDA4DNN_HOST_DEVICE pointer get() const noexcept { return ptr; }

        CUDA4DNN_HOST_DEVICE operator DevicePtr<const void>() const noexcept { return DevicePtr<const void>{ptr}; }

        CUDA4DNN_HOST_DEVICE explicit operator bool() const noexcept { return ptr; }

        CUDA4DNN_HOST_DEVICE friend bool operator==(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr == rhs.ptr; }
        CUDA4DNN_HOST_DEVICE friend bool operator!=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs == rhs); }
        CUDA4DNN_HOST_DEVICE friend bool operator<(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr < rhs.ptr; }
        CUDA4DNN_HOST_DEVICE friend bool operator>(DevicePtr lhs, DevicePtr rhs) noexcept { return rhs < lhs; }
        CUDA4DNN_HOST_DEVICE friend bool operator<=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(rhs < lhs); }
        CUDA4DNN_HOST_DEVICE friend bool operator>=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs < rhs); }

        /* explicit conversion into host void pointer */
        CUDA4DNN_HOST_DEVICE explicit operator pointer() const noexcept { return ptr; }

        /* void device pointer can be explicitly cast into any device pointer type */
        template <class T>
        CUDA4DNN_HOST_DEVICE explicit operator DevicePtr<T>() const noexcept {
            return DevicePtr<T>(static_cast<T*>(ptr));
        }

        CUDA4DNN_HOST friend void swap(DevicePtr& lhs, DevicePtr& rhs) noexcept {
            using std::swap;
            swap(lhs.ptr, rhs.ptr);
        }

        template <class U, class V>
        CUDA4DNN_HOST friend std::basic_ostream<U, V>& operator<<(std::basic_ostream<U, V>& os, DevicePtr other) {
            os << other.get() << " (device)";
            return os;
        }

    private:
        pointer ptr;
    };
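
    /** @brief checks whether the address held by \p ptr is a multiple of \p alignment (in bytes),
     * e.g. `is_aligned(ptr, 16)` tests whether \p ptr is suitable for 16-byte aligned accesses
     */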
    template <class T>
    bool is_aligned(DevicePtr<const T> ptr, std::size_t alignment) {
        auto addr = reinterpret_cast<std::intptr_t>(ptr.get());
        return addr % alignment == 0;
    }

    /** copies \p n elements from \p src to \p dest
     *
     * \param[in]   src     device pointer
     * \param[out]  dest    host pointer
     *
     * Pre-conditions:
     * - memory pointed by \p dest and \p src must be large enough to hold \p n elements
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memcpy(T *dest, DevicePtr<const T> src, std::size_t n) {
        if (n <= 0) {
            CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
        }

        CUDA4DNN_CHECK_CUDA(cudaMemcpy(dest, src.get(), n * sizeof(T), cudaMemcpyDefault));
    }

    /** copies \p n elements from \p src to \p dest
     *
     * \param[in]   src     host pointer
     * \param[out]  dest    device pointer
     *
     * Pre-conditions:
     * - memory pointed by \p dest and \p src must be large enough to hold \p n elements
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memcpy(DevicePtr<T> dest, const T* src, std::size_t n) {
        if (n <= 0) {
            CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
        }

        CUDA4DNN_CHECK_CUDA(cudaMemcpy(dest.get(), src, n * sizeof(T), cudaMemcpyDefault));
    }

    /** copies \p n elements from \p src to \p dest
     *
     * \param[in]   src     device pointer
     * \param[out]  dest    device pointer
     *
     * Pre-conditions:
     * - memory pointed by \p dest and \p src must be large enough to hold \p n elements
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memcpy(DevicePtr<T> dest, DevicePtr<const T> src, std::size_t n) {
        if (n <= 0) {
            CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
        }

        CUDA4DNN_CHECK_CUDA(cudaMemcpy(dest.get(), src.get(), n * sizeof(T), cudaMemcpyDefault));
    }

    /** sets \p n elements to \p ch in \p dest
     *
     * \param[out]  dest    device pointer
     * \param[in]   ch      8-bit value to fill the device memory with
     *
     * Pre-conditions:
     * - memory pointed by \p dest must be large enough to hold \p n elements
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memset(DevicePtr<T> dest, std::int8_t ch, std::size_t n) {
        if (n <= 0) {
            CV_Error(Error::StsBadArg, "number of elements to set is zero or negative");
        }

        CUDA4DNN_CHECK_CUDA(cudaMemset(dest.get(), ch, n * sizeof(T)));
    }
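
    /* A minimal usage sketch of the synchronous helpers above (the size, the host buffer and the
     * device allocation are illustrative assumptions, not part of this header):
     *
     *      std::size_t n = 1024;
     *      std::vector<float> host(n, 1.0f);
     *      void* raw = nullptr;
     *      cudaMalloc(&raw, n * sizeof(float));                     // assumed device allocation
     *      DevicePtr<float> device{static_cast<float*>(raw)};
     *
     *      memcpy(device, host.data(), n);                          // host -> device
     *      memset(device, 0, n);                                    // fill n elements with zero bytes
     *      memcpy(host.data(), DevicePtr<const float>(device), n);  // device -> host
     *
     * The explicit DevicePtr<const float> conversion is needed because template argument deduction
     * does not consider the implicit mutable-to-const conversion.
     */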

    /** copies \p n elements from \p src to \p dest asynchronously
     *
     * \param[in]   src     device pointer
     * \param[out]  dest    host pointer
     * \param       stream  CUDA stream that has to be used for the memory transfer
     *
     * Pre-conditions:
     * - memory pointed by \p dest and \p src must be large enough to hold \p n elements
     * - \p dest points to page-locked memory
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memcpy(T *dest, DevicePtr<const T> src, std::size_t n, const Stream& stream) {
        if (n <= 0) {
            CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
        }

        CUDA4DNN_CHECK_CUDA(cudaMemcpyAsync(dest, src.get(), n * sizeof(T), cudaMemcpyDefault, stream.get()));
    }

    /** copies \p n elements from \p src to \p dest asynchronously
     *
     * \param[in]   src     host pointer
     * \param[out]  dest    device pointer
     * \param       stream  CUDA stream that has to be used for the memory transfer
     *
     * Pre-conditions:
     * - memory pointed by \p dest and \p src must be large enough to hold \p n elements
     * - \p src points to page-locked memory
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memcpy(DevicePtr<T> dest, const T *src, std::size_t n, const Stream& stream) {
        if (n <= 0) {
            CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
        }

        CUDA4DNN_CHECK_CUDA(cudaMemcpyAsync(dest.get(), src, n * sizeof(T), cudaMemcpyDefault, stream.get()));
    }

    /** copies \p n elements from \p src to \p dest asynchronously
     *
     * \param[in]   src     device pointer
     * \param[out]  dest    device pointer
     * \param       stream  CUDA stream that has to be used for the memory transfer
     *
     * Pre-conditions:
     * - memory pointed by \p dest and \p src must be large enough to hold \p n elements
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memcpy(DevicePtr<T> dest, DevicePtr<const T> src, std::size_t n, const Stream& stream) {
        if (n <= 0) {
            CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
        }

        CUDA4DNN_CHECK_CUDA(cudaMemcpyAsync(dest.get(), src.get(), n * sizeof(T), cudaMemcpyDefault, stream.get()));
    }

    /** sets \p n elements to \p ch in \p dest asynchronously
     *
     * \param[out]  dest    device pointer
     * \param[in]   ch      8-bit value to fill the device memory with
     * \param       stream  CUDA stream that has to be used for the memory operation
     *
     * Pre-conditions:
     * - memory pointed by \p dest must be large enough to hold \p n elements
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memset(DevicePtr<T> dest, std::int8_t ch, std::size_t n, const Stream& stream) {
        if (n <= 0) {
            CV_Error(Error::StsBadArg, "number of elements to set is zero or negative");
        }

        CUDA4DNN_CHECK_CUDA(cudaMemsetAsync(dest.get(), ch, n * sizeof(T), stream.get()));
    }
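
    /* A minimal usage sketch of the asynchronous overloads above. It assumes a csl::Stream named
     * `stream` (from stream.hpp), a device pointer `device` as in the previous sketch, and a
     * page-locked host buffer `pinned` of at least `n` floats (e.g. obtained via cudaMallocHost):
     *
     *      memcpy(device, pinned, n, stream);                          // host -> device, async
     *      memset(device, 0, n, stream);                               // async fill
     *      memcpy(pinned, DevicePtr<const float>(device), n, stream);  // device -> host, async
     *      stream.synchronize();  // assumed Stream API; wait before reading `pinned` on the host
     */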

}}}} /* namespace cv::dnn::cuda4dnn::csl */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_POINTER_HPP */