1 // This file is part of OpenCV project. 2 // It is subject to the license terms in the LICENSE file found in the top-level directory 3 // of this distribution and at http://opencv.org/license.html. 4 5 #ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_POINTER_HPP 6 #define OPENCV_DNN_SRC_CUDA4DNN_CSL_POINTER_HPP 7 8 #include "nvcc_defs.hpp" 9 #include "error.hpp" 10 #include "stream.hpp" 11 12 #include <opencv2/core.hpp> 13 14 #include <cuda_runtime_api.h> 15 16 #include <cstddef> 17 #include <type_traits> 18 #include <ostream> 19 20 namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { 21 22 /** @brief provides a type-safe device pointer 23 * 24 * DevicePtr wraps a raw pointer and mimics its behaviour. It does not implicitly convert 25 * to a raw pointer. This ensures that accidental mixing of host and device pointers do not happen. 26 * 27 * It is meant to point to locations in device memory. Hence, it provides dereferencing or 28 * array subscript capability for device code only. 29 * 30 * A `const DevicePtr<T>` represents an immutable pointer to a mutable memory. 31 * A `DevicePtr<const T>` represents a mutable pointer to an immutable memory. 32 * A `const DevicePtr<const T>` represents an immutable pointer to an immutable memory. 33 * 34 * A `DevicePtr<T>` can implicitly convert to `DevicePtr<const T>`. 35 * 36 * Specializations: 37 * - DevicePtr<void>/DevicePtr<const void> do not support pointer arithmetic (but relational operators are provided) 38 * - any device pointer pointing to mutable memory is implicitly convertible to DevicePtr<void> 39 * - any device pointer is implicitly convertible to DevicePtr<const void> 40 * - DevicePtr<void> can be explicitly converted to any device pointer 41 * - DevicePtr<const void> can be explicitly converted to any device pointer pointing to immutable memory 42 */ 43 template <class T> 44 class DevicePtr { 45 static_assert(std::is_standard_layout<T>::value, "T must satisfy StandardLayoutType"); 46 47 public: 48 using element_type = T; 49 using difference_type = std::ptrdiff_t; 50 using pointer = typename std::add_pointer<element_type>::type; 51 using reference = typename std::add_lvalue_reference<element_type>::type; 52 53 DevicePtr() = default; DevicePtr(pointer ptr_)54 CUDA4DNN_HOST_DEVICE explicit DevicePtr(pointer ptr_) noexcept : ptr{ ptr_ } { } 55 operator =(pointer ptr_)56 CUDA4DNN_HOST_DEVICE DevicePtr operator=(pointer ptr_) noexcept { ptr = ptr_; return *this; } 57 get() const58 CUDA4DNN_HOST_DEVICE pointer get() const noexcept { return ptr; }; 59 operator [](difference_type idx) const60 CUDA4DNN_DEVICE reference operator[](difference_type idx) const noexcept { return get()[idx]; } operator *() const61 CUDA4DNN_DEVICE reference operator*() const noexcept { return *get(); } operator ->() const62 CUDA4DNN_DEVICE pointer operator->() const noexcept { return get(); } 63 64 template<class U = T, typename std::enable_if<!std::is_const<U>::value, bool>::type = true> operator DevicePtr<typename std::add_const<U>::type>() const65 CUDA4DNN_HOST_DEVICE operator DevicePtr<typename std::add_const<U>::type>() const noexcept { 66 return DevicePtr<typename std::add_const<U>::type>{ptr}; 67 } 68 operator bool() const69 CUDA4DNN_HOST_DEVICE explicit operator bool() const noexcept { return ptr; } 70 operator ++()71 CUDA4DNN_HOST_DEVICE DevicePtr operator++() noexcept { 72 ++ptr; 73 return *this; 74 } 75 operator ++(int)76 CUDA4DNN_HOST_DEVICE DevicePtr operator++(int) noexcept { 77 auto tmp = DevicePtr(*this); 78 ptr++; 79 return tmp; 80 } 81 operator --()82 CUDA4DNN_HOST_DEVICE DevicePtr operator--() noexcept { 83 --ptr; 84 return *this; 85 } 86 operator --(int)87 CUDA4DNN_HOST_DEVICE DevicePtr operator--(int) noexcept { 88 auto tmp = DevicePtr(*this); 89 ptr--; 90 return tmp; 91 } 92 operator +=(std::ptrdiff_t offset)93 CUDA4DNN_HOST_DEVICE DevicePtr operator+=(std::ptrdiff_t offset) noexcept { 94 ptr += offset; 95 return *this; 96 } 97 operator -=(std::ptrdiff_t offset)98 CUDA4DNN_HOST_DEVICE DevicePtr operator-=(std::ptrdiff_t offset) noexcept { 99 ptr -= offset; 100 return *this; 101 } 102 operator +(DevicePtr lhs,std::ptrdiff_t offset)103 CUDA4DNN_HOST_DEVICE friend DevicePtr operator+(DevicePtr lhs, std::ptrdiff_t offset) noexcept { 104 return lhs += offset; 105 } 106 operator -(DevicePtr lhs,std::ptrdiff_t offset)107 CUDA4DNN_HOST_DEVICE friend DevicePtr operator-(DevicePtr lhs, std::ptrdiff_t offset) noexcept { 108 return lhs -= offset; 109 } 110 operator -(DevicePtr lhs,DevicePtr rhs)111 CUDA4DNN_HOST_DEVICE friend difference_type operator-(DevicePtr lhs, DevicePtr rhs) noexcept { 112 return lhs.ptr - rhs.ptr; 113 } 114 operator ==(DevicePtr lhs,DevicePtr rhs)115 CUDA4DNN_HOST_DEVICE friend bool operator==(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr == rhs.ptr; } operator !=(DevicePtr lhs,DevicePtr rhs)116 CUDA4DNN_HOST_DEVICE friend bool operator!=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs == rhs); } operator <(DevicePtr lhs,DevicePtr rhs)117 CUDA4DNN_HOST_DEVICE friend bool operator<(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr < rhs.ptr; } operator >(DevicePtr lhs,DevicePtr rhs)118 CUDA4DNN_HOST_DEVICE friend bool operator>(DevicePtr lhs, DevicePtr rhs) noexcept { return rhs < lhs; } operator <=(DevicePtr lhs,DevicePtr rhs)119 CUDA4DNN_HOST_DEVICE friend bool operator<=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(rhs < lhs); } operator >=(DevicePtr lhs,DevicePtr rhs)120 CUDA4DNN_HOST_DEVICE friend bool operator>=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs < rhs); } 121 operator pointer() const122 CUDA4DNN_HOST_DEVICE explicit operator pointer() const noexcept { return ptr; } 123 swap(DevicePtr & lhs,DevicePtr & rhs)124 CUDA4DNN_HOST friend void swap(DevicePtr& lhs, DevicePtr& rhs) noexcept { 125 using std::swap; 126 swap(lhs.ptr, rhs.ptr); 127 } 128 129 template <class U, class V> operator <<(std::basic_ostream<U,V> & os,DevicePtr other)130 CUDA4DNN_HOST friend std::basic_ostream<U, V>& operator<<(std::basic_ostream<U, V>& os, DevicePtr other) { 131 os << other.get() << " (device)"; 132 return os; 133 } 134 135 private: 136 pointer ptr; 137 }; 138 139 template <> 140 class DevicePtr<const void> { 141 public: 142 using element_type = const void; 143 using pointer = typename std::add_pointer<element_type>::type; 144 145 DevicePtr() = default; 146 147 /* host const void pointer to const void device pointer */ DevicePtr(pointer ptr_)148 CUDA4DNN_HOST_DEVICE explicit DevicePtr(pointer ptr_) noexcept : ptr{ ptr_ } { } 149 150 /* allow any device pointer to be implicitly convereted to void device pointer */ 151 template <class T> DevicePtr(DevicePtr<T> ptr_)152 CUDA4DNN_HOST_DEVICE DevicePtr(DevicePtr<T> ptr_) noexcept : ptr{ ptr_.get() } { } 153 operator =(pointer ptr_)154 CUDA4DNN_HOST_DEVICE DevicePtr operator=(pointer ptr_) noexcept { ptr = ptr_; return *this; } 155 get() const156 CUDA4DNN_HOST_DEVICE pointer get() const noexcept { return ptr; }; 157 operator bool() const158 CUDA4DNN_HOST_DEVICE explicit operator bool() const noexcept { return ptr; } 159 operator ==(DevicePtr lhs,DevicePtr rhs)160 CUDA4DNN_HOST_DEVICE friend bool operator==(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr == rhs.ptr; } operator !=(DevicePtr lhs,DevicePtr rhs)161 CUDA4DNN_HOST_DEVICE friend bool operator!=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs == rhs); } operator <(DevicePtr lhs,DevicePtr rhs)162 CUDA4DNN_HOST_DEVICE friend bool operator<(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr < rhs.ptr; } operator >(DevicePtr lhs,DevicePtr rhs)163 CUDA4DNN_HOST_DEVICE friend bool operator>(DevicePtr lhs, DevicePtr rhs) noexcept { return rhs < lhs; } operator <=(DevicePtr lhs,DevicePtr rhs)164 CUDA4DNN_HOST_DEVICE friend bool operator<=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(rhs < lhs); } operator >=(DevicePtr lhs,DevicePtr rhs)165 CUDA4DNN_HOST_DEVICE friend bool operator>=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs < rhs); } 166 167 /* explicit conversion into host void pointer */ operator pointer() const168 CUDA4DNN_HOST_DEVICE explicit operator pointer() const noexcept { return ptr; } 169 170 /* const void device pointer can be explicitly casted into any const device pointer type */ 171 template <class T, typename std::enable_if<std::is_const<T>::value, bool>::type = true> operator DevicePtr<T>() const172 CUDA4DNN_HOST_DEVICE explicit operator DevicePtr<T>() const noexcept { 173 return static_cast<T*>(ptr); 174 } 175 swap(DevicePtr & lhs,DevicePtr & rhs)176 CUDA4DNN_HOST friend void swap(DevicePtr& lhs, DevicePtr& rhs) noexcept { 177 using std::swap; 178 swap(lhs.ptr, rhs.ptr); 179 } 180 181 template <class U, class V> operator <<(std::basic_ostream<U,V> & os,DevicePtr other)182 CUDA4DNN_HOST friend std::basic_ostream<U, V>& operator<<(std::basic_ostream<U, V>& os, DevicePtr other) { 183 os << other.get() << " (device)"; 184 return os; 185 } 186 187 private: 188 pointer ptr; 189 }; 190 191 template <> 192 class DevicePtr<void> { 193 public: 194 using element_type = void; 195 using pointer = typename std::add_pointer<element_type>::type; 196 197 DevicePtr() = default; 198 199 /* host pointer to device pointer */ DevicePtr(pointer ptr_)200 CUDA4DNN_HOST_DEVICE explicit DevicePtr(pointer ptr_) noexcept : ptr{ ptr_ } { } 201 202 /* allow any device pointer to mutable memory to be implicitly convereted to void device pointer */ 203 template <class T, typename std::enable_if<!std::is_const<T>::value, bool>::type = false> DevicePtr(DevicePtr<T> ptr_)204 CUDA4DNN_HOST_DEVICE DevicePtr(DevicePtr<T> ptr_) noexcept : ptr { ptr_.get() } { } 205 operator =(pointer ptr_)206 CUDA4DNN_HOST_DEVICE DevicePtr operator=(pointer ptr_) noexcept { ptr = ptr_; return *this; } 207 get() const208 CUDA4DNN_HOST_DEVICE pointer get() const noexcept { return ptr; }; 209 operator DevicePtr<const void>() const210 CUDA4DNN_HOST_DEVICE operator DevicePtr<const void>() const noexcept { return DevicePtr<const void>{ptr}; } 211 operator bool() const212 CUDA4DNN_HOST_DEVICE explicit operator bool() const noexcept { return ptr; } 213 operator ==(DevicePtr lhs,DevicePtr rhs)214 CUDA4DNN_HOST_DEVICE friend bool operator==(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr == rhs.ptr; } operator !=(DevicePtr lhs,DevicePtr rhs)215 CUDA4DNN_HOST_DEVICE friend bool operator!=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs == rhs); } operator <(DevicePtr lhs,DevicePtr rhs)216 CUDA4DNN_HOST_DEVICE friend bool operator<(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr < rhs.ptr; } operator >(DevicePtr lhs,DevicePtr rhs)217 CUDA4DNN_HOST_DEVICE friend bool operator>(DevicePtr lhs, DevicePtr rhs) noexcept { return rhs < lhs; } operator <=(DevicePtr lhs,DevicePtr rhs)218 CUDA4DNN_HOST_DEVICE friend bool operator<=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(rhs < lhs); } operator >=(DevicePtr lhs,DevicePtr rhs)219 CUDA4DNN_HOST_DEVICE friend bool operator>=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs < rhs); } 220 221 /* explicit conversion into host void pointer */ operator pointer() const222 CUDA4DNN_HOST_DEVICE explicit operator pointer() const noexcept { return ptr; } 223 224 /* void device pointer can be explicitly casted into any device pointer type */ 225 template <class T> operator DevicePtr<T>() const226 CUDA4DNN_HOST_DEVICE explicit operator DevicePtr<T>() const noexcept { 227 return DevicePtr<T>(static_cast<T*>(ptr)); 228 } 229 swap(DevicePtr & lhs,DevicePtr & rhs)230 CUDA4DNN_HOST friend void swap(DevicePtr& lhs, DevicePtr& rhs) noexcept { 231 using std::swap; 232 swap(lhs.ptr, rhs.ptr); 233 } 234 235 template <class U, class V> operator <<(std::basic_ostream<U,V> & os,DevicePtr other)236 CUDA4DNN_HOST friend std::basic_ostream<U, V>& operator<<(std::basic_ostream<U, V>& os, DevicePtr other) { 237 os << other.get() << " (device)"; 238 return os; 239 } 240 241 private: 242 pointer ptr; 243 }; 244 245 template <class T> is_aligned(DevicePtr<const T> ptr,std::size_t alignment)246 bool is_aligned(DevicePtr<const T> ptr, std::size_t alignment) { 247 auto addr = reinterpret_cast<std::intptr_t>(ptr.get()); 248 return addr % alignment == 0; 249 } 250 251 /** copies \p n elements from \p src to \p dest4 252 * 253 * \param[in] src device pointer 254 * \param[out] dest host pointer 255 * 256 * Pre-conditions: 257 * - memory pointed by \p dest and \p src must be large enough to hold \p n elements 258 * 259 * Exception Guarantee: Basic 260 */ 261 template <class T> memcpy(T * dest,DevicePtr<const T> src,std::size_t n)262 void memcpy(T *dest, DevicePtr<const T> src, std::size_t n) { 263 if (n <= 0) { 264 CV_Error(Error::StsBadArg, "number of elements to copy is zero or negtaive"); 265 } 266 267 CUDA4DNN_CHECK_CUDA(cudaMemcpy(dest, src.get(), n * sizeof(T), cudaMemcpyDefault)); 268 } 269 270 /** copies \p n elements from \p src to \p dest 271 * 272 * \param[in] src host pointer 273 * \param[out] dest device pointer 274 * 275 * Pre-conditions: 276 * - memory pointed by \p dest and \p src must be large enough to hold \p n elements 277 * 278 * Exception Guarantee: Basic 279 */ 280 template <class T> memcpy(DevicePtr<T> dest,const T * src,std::size_t n)281 void memcpy(DevicePtr<T> dest, const T* src, std::size_t n) { 282 if (n <= 0) { 283 CV_Error(Error::StsBadArg, "number of elements to copy is zero or negtaive"); 284 } 285 286 CUDA4DNN_CHECK_CUDA(cudaMemcpy(dest.get(), src, n * sizeof(T), cudaMemcpyDefault)); 287 } 288 289 /** copies \p n elements from \p src to \p dest 290 * 291 * \param[in] src device pointer 292 * \param[out] dest device pointer 293 * 294 * Pre-conditions: 295 * - memory pointed by \p dest and \p src must be large enough to hold \p n elements 296 * 297 * Exception Guarantee: Basic 298 */ 299 template <class T> memcpy(DevicePtr<T> dest,DevicePtr<const T> src,std::size_t n)300 void memcpy(DevicePtr<T> dest, DevicePtr<const T> src, std::size_t n) { 301 if (n <= 0) { 302 CV_Error(Error::StsBadArg, "number of elements to copy is zero or negtaive"); 303 } 304 305 CUDA4DNN_CHECK_CUDA(cudaMemcpy(dest.get(), src.get(), n * sizeof(T), cudaMemcpyDefault)); 306 } 307 308 /** sets \p n elements to \p ch in \p dest 309 * 310 * \param[in] src device pointer 311 * \param[out] ch 8-bit value to fill the device memory with 312 * 313 * Pre-conditions: 314 * - memory pointed by \p dest must be large enough to hold \p n elements 315 * 316 * Exception Guarantee: Basic 317 */ 318 template <class T> memset(DevicePtr<T> dest,std::int8_t ch,std::size_t n)319 void memset(DevicePtr<T> dest, std::int8_t ch, std::size_t n) { 320 if (n <= 0) { 321 CV_Error(Error::StsBadArg, "number of elements to copy is zero or negtaive"); 322 } 323 324 CUDA4DNN_CHECK_CUDA(cudaMemset(dest.get(), ch, n * sizeof(T))); 325 } 326 327 /** copies \p n elements from \p src to \p dest asynchronously 328 * 329 * \param[in] src device pointer 330 * \param[out] dest host pointer 331 * \param stream CUDA stream that has to be used for the memory transfer 332 * 333 * Pre-conditions: 334 * - memory pointed by \p dest and \p src must be large enough to hold \p n elements 335 * - \p dest points to page-locked memory 336 * 337 * Exception Guarantee: Basic 338 */ 339 template <class T> memcpy(T * dest,DevicePtr<const T> src,std::size_t n,const Stream & stream)340 void memcpy(T *dest, DevicePtr<const T> src, std::size_t n, const Stream& stream) { 341 if (n <= 0) { 342 CV_Error(Error::StsBadArg, "number of elements to copy is zero or negtaive"); 343 } 344 345 CUDA4DNN_CHECK_CUDA(cudaMemcpyAsync(dest, src.get(), n * sizeof(T), cudaMemcpyDefault, stream.get())); 346 } 347 348 /** copies data from memory pointed by \p src to \p dest asynchronously 349 * 350 * \param[in] src host pointer 351 * \param[out] dest device pointer 352 * \param stream CUDA stream that has to be used for the memory transfer 353 * 354 * Pre-conditions: 355 * - memory pointed by \p dest and \p src must be large enough to hold \p n elements 356 * - \p src points to page-locked memory 357 * 358 * Exception Guarantee: Basic 359 */ 360 template <class T> memcpy(DevicePtr<T> dest,const T * src,std::size_t n,const Stream & stream)361 void memcpy(DevicePtr<T> dest, const T *src, std::size_t n, const Stream& stream) { 362 if (n <= 0) { 363 CV_Error(Error::StsBadArg, "number of elements to copy is zero or negtaive"); 364 } 365 366 CUDA4DNN_CHECK_CUDA(cudaMemcpyAsync(dest.get(), src, n * sizeof(T), cudaMemcpyDefault, stream.get())); 367 } 368 369 /** copies \p n elements from \p src to \p dest asynchronously 370 * 371 * \param[in] src device pointer 372 * \param[out] dest device pointer 373 * \param stream CUDA stream that has to be used for the memory transfer 374 * 375 * Pre-conditions: 376 * - memory pointed by \p dest and \p src must be large enough to hold \p n elements 377 * 378 * Exception Guarantee: Basic 379 */ 380 template <class T> memcpy(DevicePtr<T> dest,DevicePtr<const T> src,std::size_t n,const Stream & stream)381 void memcpy(DevicePtr<T> dest, DevicePtr<const T> src, std::size_t n, const Stream& stream) { 382 if (n <= 0) { 383 CV_Error(Error::StsBadArg, "number of elements to copy is zero or negtaive"); 384 } 385 386 CUDA4DNN_CHECK_CUDA(cudaMemcpyAsync(dest.get(), src.get(), n * sizeof(T), cudaMemcpyDefault, stream.get())); 387 } 388 389 /** sets \p n elements to \p ch in \p dest asynchronously 390 * 391 * \param[in] src device pointer 392 * \param[out] ch 8-bit value to fill the device memory with 393 * \param stream CUDA stream that has to be used for the memory operation 394 * 395 * Pre-conditions: 396 * - memory pointed by \p dest must be large enough to hold \p n elements 397 * 398 * Exception Guarantee: Basic 399 */ 400 template <class T> memset(DevicePtr<T> dest,std::int8_t ch,std::size_t n,const Stream & stream)401 void memset(DevicePtr<T> dest, std::int8_t ch, std::size_t n, const Stream& stream) { 402 if (n <= 0) { 403 CV_Error(Error::StsBadArg, "number of elements to copy is zero or negtaive"); 404 } 405 406 CUDA4DNN_CHECK_CUDA(cudaMemsetAsync(dest.get(), ch, n * sizeof(T), stream.get())); 407 } 408 409 }}}} /* namespace cv::dnn::cuda4dnn::csl */ 410 411 #endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_POINTER_HPP */ 412