/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef OPENCV_CORE_CUDA_HPP
#define OPENCV_CORE_CUDA_HPP

#ifndef __cplusplus
#  error cuda.hpp header must be compiled as C++
#endif

#include "opencv2/core.hpp"
#include "opencv2/core/cuda_types.hpp"

/**
  @defgroup cuda CUDA-accelerated Computer Vision
  @{
    @defgroup cudacore Core part
    @{
      @defgroup cudacore_init Initialization and Information
      @defgroup cudacore_struct Data Structures
    @}
  @}
 */

namespace cv { namespace cuda {

//! @addtogroup cudacore_struct
//! @{

//===================================================================================
// GpuMat
//===================================================================================

/** @brief Base storage class for GPU memory with reference counting.

Its interface matches the Mat interface with the following limitations:

-   no arbitrary dimensions support (only 2D)
-   no functions that return references to their data (because references on GPU are not valid for
    CPU)
-   no expression templates technique support

Beware that the latter limitation may lead to overloaded matrix operators that cause memory
allocations. The GpuMat class is convertible to cuda::PtrStepSz and cuda::PtrStep so it can be
passed directly to the kernel.

@note In contrast with Mat, in most cases GpuMat::isContinuous() == false . This means that rows are
aligned to a size depending on the hardware. Single-row GpuMat is always a continuous matrix.

@note It is not recommended to leave static or global GpuMat variables allocated, that is, to rely
on their destructors. The destruction order of such variables and of the CUDA context is undefined,
and the GPU memory release function returns an error if the CUDA context has already been destroyed.

Some member functions are described as a "Blocking Call" while some are described as a
"Non-Blocking Call". Blocking functions are synchronous to host. It is guaranteed that the GPU
operation is finished when the function returns. However, non-blocking functions are asynchronous to
host. Those functions may return even if the GPU operation is not finished.

Compared to their blocking counterpart, non-blocking functions accept Stream as an additional
argument. If a non-default stream is passed, the GPU operation may overlap with operations in other
streams.
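
A minimal usage sketch, assuming a CUDA-enabled OpenCV build ("input.png" is an illustrative
placeholder):
@code
cv::Mat h_src = cv::imread("input.png", cv::IMREAD_GRAYSCALE);
cv::cuda::GpuMat d_src;
d_src.upload(h_src);                                    // blocking host-to-device copy
cv::cuda::GpuMat d_roi = d_src(cv::Rect(0, 0, 32, 32)); // O(1) header, no data copied
cv::Mat h_dst;
d_roi.download(h_dst);                                  // blocking device-to-host copy
@endcode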

@sa Mat
 */
class CV_EXPORTS_W GpuMat
{
public:
    class CV_EXPORTS_W Allocator
    {
    public:
        virtual ~Allocator() {}

        // allocator must fill data, step and refcount fields
        virtual bool allocate(GpuMat* mat, int rows, int cols, size_t elemSize) = 0;
        virtual void free(GpuMat* mat) = 0;
    };

    //! default allocator
    CV_WRAP static GpuMat::Allocator* defaultAllocator();
    CV_WRAP static void setDefaultAllocator(GpuMat::Allocator* allocator);

    //! default constructor
    CV_WRAP explicit GpuMat(GpuMat::Allocator* allocator = GpuMat::defaultAllocator());

    //! constructs GpuMat of the specified size and type
    CV_WRAP GpuMat(int rows, int cols, int type, GpuMat::Allocator* allocator = GpuMat::defaultAllocator());
    CV_WRAP GpuMat(Size size, int type, GpuMat::Allocator* allocator = GpuMat::defaultAllocator());

    //! constructs GpuMat and fills it with the specified value s
    CV_WRAP GpuMat(int rows, int cols, int type, Scalar s, GpuMat::Allocator* allocator = GpuMat::defaultAllocator());
    CV_WRAP GpuMat(Size size, int type, Scalar s, GpuMat::Allocator* allocator = GpuMat::defaultAllocator());

    //! copy constructor
    CV_WRAP GpuMat(const GpuMat& m);

    //! constructor for GpuMat headers pointing to user-allocated data
    GpuMat(int rows, int cols, int type, void* data, size_t step = Mat::AUTO_STEP);
    GpuMat(Size size, int type, void* data, size_t step = Mat::AUTO_STEP);

    //! creates a GpuMat header for a part of the bigger matrix
    CV_WRAP GpuMat(const GpuMat& m, Range rowRange, Range colRange);
    CV_WRAP GpuMat(const GpuMat& m, Rect roi);

    //! builds GpuMat from host memory (Blocking call)
    CV_WRAP explicit GpuMat(InputArray arr, GpuMat::Allocator* allocator = GpuMat::defaultAllocator());

    //! destructor - calls release()
    ~GpuMat();

    //! assignment operator
    GpuMat& operator =(const GpuMat& m);

    //! allocates new GpuMat data unless the GpuMat already has specified size and type
    CV_WRAP void create(int rows, int cols, int type);
    CV_WRAP void create(Size size, int type);

    //! decreases reference counter, deallocates the data when reference counter reaches 0
    void release();

    //! swaps with other smart pointer
    CV_WRAP void swap(GpuMat& mat);

    /** @brief Performs data upload to GpuMat (Blocking call)

    This function copies data from host memory to device memory. As a blocking call, it is
    guaranteed that the copy operation is finished when this function returns.
    */
    CV_WRAP void upload(InputArray arr);

    /** @brief Performs data upload to GpuMat (Non-Blocking call)

    This function copies data from host memory to device memory. As a non-blocking call, this
    function may return even if the copy operation is not finished.

    The copy operation may be overlapped with operations in other non-default streams if \p stream is
    not the default stream and \p arr is HostMem allocated with HostMem::PAGE_LOCKED option.
    */
    CV_WRAP void upload(InputArray arr, Stream& stream);

    /** @brief Performs data download from GpuMat (Blocking call)

    This function copies data from device memory to host memory. As a blocking call, it is
    guaranteed that the copy operation is finished when this function returns.
    */
    CV_WRAP void download(OutputArray dst) const;

    /** @brief Performs data download from GpuMat (Non-Blocking call)

    This function copies data from device memory to host memory. As a non-blocking call, this
    function may return even if the copy operation is not finished.

    The copy operation may be overlapped with operations in other non-default streams if \p stream is
    not the default stream and \p dst is HostMem allocated with HostMem::PAGE_LOCKED option.
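
    A minimal sketch of an asynchronous round trip (sizes and contents are illustrative):
    @code
    cv::cuda::Stream stream;
    cv::cuda::HostMem h_src(480, 640, CV_8UC1, cv::cuda::HostMem::PAGE_LOCKED);
    cv::cuda::HostMem h_dst(480, 640, CV_8UC1, cv::cuda::HostMem::PAGE_LOCKED);
    cv::cuda::GpuMat d_buf;
    d_buf.upload(h_src, stream);   // asynchronous host-to-device copy
    d_buf.download(h_dst, stream); // asynchronous device-to-host copy
    stream.waitForCompletion();    // synchronize before reading h_dst
    @endcode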
    */
    CV_WRAP void download(OutputArray dst, Stream& stream) const;

    //! returns deep copy of the GpuMat, i.e. the data is copied
    CV_WRAP GpuMat clone() const;

    //! copies the GpuMat content to device memory (Blocking call)
    CV_WRAP void copyTo(OutputArray dst) const;

    //! copies the GpuMat content to device memory (Non-Blocking call)
    CV_WRAP void copyTo(OutputArray dst, Stream& stream) const;

    //! copies those GpuMat elements to "dst" that are marked with non-zero mask elements (Blocking call)
    CV_WRAP void copyTo(OutputArray dst, InputArray mask) const;

    //! copies those GpuMat elements to "dst" that are marked with non-zero mask elements (Non-Blocking call)
    CV_WRAP void copyTo(OutputArray dst, InputArray mask, Stream& stream) const;

    //! sets all of the GpuMat elements to s (Blocking call)
    CV_WRAP GpuMat& setTo(Scalar s);

    //! sets all of the GpuMat elements to s (Non-Blocking call)
    CV_WRAP GpuMat& setTo(Scalar s, Stream& stream);

    //! sets some of the GpuMat elements to s, according to the mask (Blocking call)
    CV_WRAP GpuMat& setTo(Scalar s, InputArray mask);

    //! sets some of the GpuMat elements to s, according to the mask (Non-Blocking call)
    CV_WRAP GpuMat& setTo(Scalar s, InputArray mask, Stream& stream);

    //! converts GpuMat to another datatype (Blocking call)
    CV_WRAP void convertTo(OutputArray dst, int rtype) const;

    //! converts GpuMat to another datatype (Non-Blocking call)
    CV_WRAP void convertTo(OutputArray dst, int rtype, Stream& stream) const;

    //! converts GpuMat to another datatype with scaling (Blocking call)
    CV_WRAP void convertTo(OutputArray dst, int rtype, double alpha, double beta = 0.0) const;

    //! converts GpuMat to another datatype with scaling (Non-Blocking call)
    CV_WRAP void convertTo(OutputArray dst, int rtype, double alpha, Stream& stream) const;

    //! converts GpuMat to another datatype with scaling (Non-Blocking call)
    CV_WRAP void convertTo(OutputArray dst, int rtype, double alpha, double beta, Stream& stream) const;

    CV_WRAP void assignTo(GpuMat& m, int type = -1) const;

    //! returns pointer to y-th row
    uchar* ptr(int y = 0);
    const uchar* ptr(int y = 0) const;

    //! template version of the above method
    template<typename _Tp> _Tp* ptr(int y = 0);
    template<typename _Tp> const _Tp* ptr(int y = 0) const;

    template <typename _Tp> operator PtrStepSz<_Tp>() const;
    template <typename _Tp> operator PtrStep<_Tp>() const;

    //! returns a new GpuMat header for the specified row
    CV_WRAP GpuMat row(int y) const;

    //! returns a new GpuMat header for the specified column
    CV_WRAP GpuMat col(int x) const;

    //! ... for the specified row span
    CV_WRAP GpuMat rowRange(int startrow, int endrow) const;
    CV_WRAP GpuMat rowRange(Range r) const;

    //! ... for the specified column span
    CV_WRAP GpuMat colRange(int startcol, int endcol) const;
    CV_WRAP GpuMat colRange(Range r) const;

    //! extracts a rectangular sub-GpuMat (this is a generalized form of row, rowRange etc.)
    GpuMat operator ()(Range rowRange, Range colRange) const;
    GpuMat operator ()(Rect roi) const;

    //! creates alternative GpuMat header for the same data, with different
    //! number of channels and/or different number of rows
    CV_WRAP GpuMat reshape(int cn, int rows = 0) const;

    //! locates GpuMat header within a parent GpuMat
    CV_WRAP void locateROI(Size& wholeSize, Point& ofs) const;

    //! moves/resizes the current GpuMat ROI inside the parent GpuMat
    CV_WRAP GpuMat& adjustROI(int dtop, int dbottom, int dleft, int dright);

    //! returns true iff the GpuMat data is continuous
    //! (i.e. when there are no gaps between successive rows)
    CV_WRAP bool isContinuous() const;

    //! returns element size in bytes
    CV_WRAP size_t elemSize() const;

    //! returns the size of element channel in bytes
    CV_WRAP size_t elemSize1() const;

    //! returns element type
    CV_WRAP int type() const;

    //! returns element depth
    CV_WRAP int depth() const;

    //! returns number of channels
    CV_WRAP int channels() const;

    //! returns step/elemSize1()
    CV_WRAP size_t step1() const;

    //! returns GpuMat size : width == number of columns, height == number of rows
    CV_WRAP Size size() const;

    //! returns true if GpuMat data is NULL
    CV_WRAP bool empty() const;

    //! returns pointer to cuda memory
    CV_WRAP void* cudaPtr() const;

    //! internal use method: updates the continuity flag
    CV_WRAP void updateContinuityFlag();

    /*! includes several bit-fields:
    - the magic signature
    - continuity flag
    - depth
    - number of channels
    */
    int flags;

    //! the number of rows and columns
    int rows, cols;

    //! a distance between successive rows in bytes; includes the gap if any
    CV_PROP size_t step;

    //! pointer to the data
    uchar* data;

    //! pointer to the reference counter;
    //! when GpuMat points to user-allocated data, the pointer is NULL
    int* refcount;

    //! helper fields used in locateROI and adjustROI
    uchar* datastart;
    const uchar* dataend;

    //! allocator
    Allocator* allocator;
};

struct CV_EXPORTS_W GpuData
{
    explicit GpuData(size_t _size);
    ~GpuData();

    GpuData(const GpuData&) = delete;
    GpuData& operator=(const GpuData&) = delete;

    GpuData(GpuData&&) = delete;
    GpuData& operator=(GpuData&&) = delete;

    uchar* data;
    size_t size;
};

class CV_EXPORTS_W GpuMatND
{
public:
    using SizeArray  = std::vector<int>;
    using StepArray  = std::vector<size_t>;
    using IndexArray = std::vector<int>;

    //! destructor
    ~GpuMatND();

    //! default constructor
    GpuMatND();

    /** @overload
    @param size Array of integers specifying an n-dimensional array shape.
    @param type Array type. Use CV_8UC1, ..., CV_16FC4 to create 1-4 channel matrices, or
    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
    */
    GpuMatND(SizeArray size, int type);

    /** @overload
    @param size Array of integers specifying an n-dimensional array shape.
    @param type Array type. Use CV_8UC1, ..., CV_16FC4 to create 1-4 channel matrices, or
    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
    @param data Pointer to the user data. Matrix constructors that take data and step parameters do not
    allocate matrix data. Instead, they just initialize the matrix header that points to the specified
    data, which means that no data is copied. This operation is very efficient and can be used to
    process external data using OpenCV functions. The external data is not automatically deallocated, so
    you should take care of it.
    @param step Array of size.size()-1 steps in case of a multi-dimensional array (the last step is always
    set to the element size). If not specified, the matrix is assumed to be continuous.
    */
    GpuMatND(SizeArray size, int type, void* data, StepArray step = StepArray());

    /** @brief Allocates GPU memory.
    Suppose there is some GPU memory already allocated. In that case, this method may choose to reuse that
    GPU memory under the specific condition: it must be of the same size and type, not externally allocated,
    the GPU memory is continuous (i.e., isContinuous() is true), and is not a sub-matrix of another GpuMatND
    (i.e., isSubmatrix() is false). In other words, this method guarantees that the GPU memory allocated by
    this method is always continuous and is not a sub-region of another GpuMatND.
    */
    void create(SizeArray size, int type);

    void release();

    void swap(GpuMatND& m) noexcept;

    /** @brief Creates a full copy of the array and the underlying data.
    The method creates a full copy of the array. It mimics the behavior of Mat::clone(), i.e.
    the original step is not taken into account. So, the array copy is a continuous array
    occupying total()\*elemSize() bytes.
    */
    GpuMatND clone() const;

    /** @overload
    This overload is non-blocking, so it may return even if the copy operation is not finished.
    */
    GpuMatND clone(Stream& stream) const;

    /** @brief Extracts a sub-matrix.
    The operator makes a new header for the specified sub-array of \*this.
    The operator is an O(1) operation, that is, no matrix data is copied.
    @param ranges Array of selected ranges along each dimension.
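
    A minimal sketch (shape and ranges are illustrative; assumes Range::all() selects the full
    extent of a dimension):
    @code
    cv::cuda::GpuMatND volume({4, 64, 64}, CV_32FC1); // 4 planes of 64x64 floats
    // take plane 1, all rows, columns 8..31: a view, no data is copied
    cv::cuda::GpuMatND sub = volume({cv::Range(1, 2), cv::Range::all(), cv::Range(8, 32)});
    @endcode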
    */
    GpuMatND operator()(const std::vector<Range>& ranges) const;

    /** @brief Creates a GpuMat header for a 2D plane part of an n-dim matrix.
    @note The returned GpuMat is constructed with the constructor for user-allocated data.
    That is, it does not perform reference counting.
    @note This function does not increment this GpuMatND's reference counter.
    */
    GpuMat createGpuMatHeader(IndexArray idx, Range rowRange, Range colRange) const;

    /** @overload
    Creates a GpuMat header if this GpuMatND is effectively 2D.
    @note The returned GpuMat is constructed with the constructor for user-allocated data.
    That is, it does not perform reference counting.
    @note This function does not increment this GpuMatND's reference counter.
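
    A sketch of viewing an effectively 2D GpuMatND as a GpuMat (assuming a two-dimensional shape
    qualifies as effectively 2D; the size and type are illustrative):
    @code
    cv::cuda::GpuMatND nd({480, 640}, CV_8UC1);
    cv::cuda::GpuMat plane = nd.createGpuMatHeader(); // non-owning 480x640 header
    plane.setTo(cv::Scalar(0));                       // writes into the GpuMatND's memory
    @endcode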
    */
    GpuMat createGpuMatHeader() const;

    /** @brief Extracts a 2D plane part of an n-dim matrix.
    It differs from createGpuMatHeader(IndexArray, Range, Range) in that it clones a part of this
    GpuMatND to the returned GpuMat.
    @note This operator does not increment this GpuMatND's reference counter.
    */
    GpuMat operator()(IndexArray idx, Range rowRange, Range colRange) const;

    /** @brief Extracts a 2D plane part of an n-dim matrix if this GpuMatND is effectively 2D.
    It differs from createGpuMatHeader() in that it clones a part of this GpuMatND.
    @note This operator does not increment this GpuMatND's reference counter.
    */
    operator GpuMat() const;

    GpuMatND(const GpuMatND&) = default;
    GpuMatND& operator=(const GpuMatND&) = default;

#if defined(__GNUC__) && __GNUC__ < 5
    // error: function '...' defaulted on its first declaration with an exception-specification
    // that differs from the implicit declaration '...'

    GpuMatND(GpuMatND&&) = default;
    GpuMatND& operator=(GpuMatND&&) = default;
#else
    GpuMatND(GpuMatND&&) noexcept = default;
    GpuMatND& operator=(GpuMatND&&) noexcept = default;
#endif

    void upload(InputArray src);
    void upload(InputArray src, Stream& stream);
    void download(OutputArray dst) const;
    void download(OutputArray dst, Stream& stream) const;

    //! returns true iff the GpuMatND data is continuous
    //! (i.e. when there are no gaps between successive rows)
    bool isContinuous() const;

    //! returns true if the matrix is a sub-matrix of another matrix
    bool isSubmatrix() const;

    //! returns element size in bytes
    size_t elemSize() const;

    //! returns the size of element channel in bytes
    size_t elemSize1() const;

    //! returns true if data is null
    bool empty() const;

    //! returns true if not empty and points to external (user-allocated) gpu memory
    bool external() const;

    //! returns pointer to the first byte of the GPU memory
    uchar* getDevicePtr() const;

    //! returns the total number of array elements
    size_t total() const;

    //! returns the size of underlying memory in bytes
    size_t totalMemSize() const;

    //! returns element type
    int type() const;

private:
    //! internal use
    void setFields(SizeArray size, int type, StepArray step = StepArray());

public:
    /*! includes several bit-fields:
    - the magic signature
    - continuity flag
    - depth
    - number of channels
    */
    int flags;

    //! matrix dimensionality
    int dims;

    //! shape of this array
    SizeArray size;

    /*! step values
    Their semantics is identical to the semantics of step for Mat.
    */
    StepArray step;

private:
    /*! internal use
    If this GpuMatND holds external memory, this is empty.
    */
    std::shared_ptr<GpuData> data_;

    /*! internal use
    If this GpuMatND manages memory with reference counting, this value is
    always equal to data_->data. If this GpuMatND holds external memory,
    data_ is empty and data points to the external memory.
    */
    uchar* data;

    /*! internal use
    If this GpuMatND is a sub-matrix of a larger matrix, this value is the
    difference of the first byte between the sub-matrix and the whole matrix.
    */
    size_t offset;
};

/** @brief Creates a continuous matrix.

@param rows Row count.
@param cols Column count.
@param type Type of the matrix.
@param arr Destination matrix. This parameter changes only if it has a proper type and area (
\f$\texttt{rows} \times \texttt{cols}\f$ ).

Matrix is called continuous if its elements are stored continuously, that is, without gaps at the
end of each row.
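
A minimal sketch (the size and type are illustrative):
@code
cv::cuda::GpuMat buf;
cv::cuda::createContinuous(480, 640, CV_8UC3, buf);
CV_Assert(buf.isContinuous()); // rows are packed, no padding between them
@endcode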
 */
CV_EXPORTS_W void createContinuous(int rows, int cols, int type, OutputArray arr);

/** @brief Ensures that the size of a matrix is big enough and the matrix has a proper type.

@param rows Minimum desired number of rows.
@param cols Minimum desired number of columns.
@param type Desired matrix type.
@param arr Destination matrix.

The function does not reallocate memory if the matrix has proper attributes already.
 */
CV_EXPORTS_W void ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr);

/** @brief BufferPool for use with CUDA streams

BufferPool utilizes Stream's allocator to create new buffers for GpuMat's. It is
only useful when enabled with #setBufferPoolUsage.

@code
setBufferPoolUsage(true);
@endcode

@note #setBufferPoolUsage must be called \em before any Stream declaration.

Users may specify a custom allocator for the Stream and may implement their own stream based
functions utilizing the same underlying GPU memory management.

If a custom allocator is not specified, BufferPool utilizes StackAllocator by
default. StackAllocator allocates a chunk of GPU device memory beforehand,
and when GpuMat is declared later on, it is given the pre-allocated memory.
This kind of strategy reduces the number of calls for memory allocating APIs
such as cudaMalloc or cudaMallocPitch.

Below is an example that utilizes BufferPool with StackAllocator:

@code
#include <opencv2/opencv.hpp>

using namespace cv;
using namespace cv::cuda;

int main()
{
    setBufferPoolUsage(true);                               // Tell OpenCV that we are going to utilize BufferPool
    setBufferPoolConfig(getDevice(), 1024 * 1024 * 64, 2);  // Allocate 64 MB, 2 stacks (default is 10 MB, 5 stacks)

    Stream stream1, stream2;                                // Each stream uses 1 stack
    BufferPool pool1(stream1), pool2(stream2);

    GpuMat d_src1 = pool1.getBuffer(4096, 4096, CV_8UC1);   // 16MB
    GpuMat d_dst1 = pool1.getBuffer(4096, 4096, CV_8UC3);   // 48MB, pool1 is now full

    GpuMat d_src2 = pool2.getBuffer(1024, 1024, CV_8UC1);   // 1MB
    GpuMat d_dst2 = pool2.getBuffer(1024, 1024, CV_8UC3);   // 3MB

    cvtColor(d_src1, d_dst1, cv::COLOR_GRAY2BGR, 0, stream1);
    cvtColor(d_src2, d_dst2, cv::COLOR_GRAY2BGR, 0, stream2);
}
@endcode

If we allocate another GpuMat on pool1 in the above example, it will be carried out by
the DefaultAllocator since the stack for pool1 is full.

@code
GpuMat d_add1 = pool1.getBuffer(1024, 1024, CV_8UC1);   // Stack for pool1 is full, memory is allocated with DefaultAllocator
@endcode

If a third stream is declared in the above example, allocating with #getBuffer
within that stream will also be carried out by the DefaultAllocator because we've run out of
stacks.

@code
Stream stream3;                                         // Only 2 stacks were allocated, we've run out of stacks
BufferPool pool3(stream3);
GpuMat d_src3 = pool3.getBuffer(1024, 1024, CV_8UC1);   // Memory is allocated with DefaultAllocator
@endcode

@warning When utilizing StackAllocator, deallocation order is important.

Just like a stack, deallocation must be done in LIFO order. Below is an example of
erroneous usage that violates the LIFO rule. If OpenCV is compiled in Debug mode, this
sample code will emit a CV_Assert error.

@code
int main()
{
    setBufferPoolUsage(true);                               // Tell OpenCV that we are going to utilize BufferPool
    Stream stream;                                          // A default size (10 MB) stack is allocated to this stream
    BufferPool pool(stream);

    GpuMat mat1 = pool.getBuffer(1024, 1024, CV_8UC1);      // Allocate mat1 (1MB)
    GpuMat mat2 = pool.getBuffer(1024, 1024, CV_8UC1);      // Allocate mat2 (1MB)

    mat1.release();                                         // erroneous usage : mat2 must be deallocated before mat1
}
@endcode

Since C++ local variables are destroyed in the reverse order of construction,
the code sample below satisfies the LIFO rule. Local GpuMat's are deallocated
and the corresponding memory is automatically returned to the pool for later usage.

@code
int main()
{
    setBufferPoolUsage(true);                               // Tell OpenCV that we are going to utilize BufferPool
    setBufferPoolConfig(getDevice(), 1024 * 1024 * 64, 2);  // Allocate 64 MB, 2 stacks (default is 10 MB, 5 stacks)

    Stream stream1, stream2;                                // Each stream uses 1 stack
    BufferPool pool1(stream1), pool2(stream2);

    for (int i = 0; i < 10; i++)
    {
        GpuMat d_src1 = pool1.getBuffer(4096, 4096, CV_8UC1);   // 16MB
        GpuMat d_dst1 = pool1.getBuffer(4096, 4096, CV_8UC3);   // 48MB, pool1 is now full

        GpuMat d_src2 = pool2.getBuffer(1024, 1024, CV_8UC1);   // 1MB
        GpuMat d_dst2 = pool2.getBuffer(1024, 1024, CV_8UC3);   // 3MB

        d_src1.setTo(Scalar(i), stream1);
        d_src2.setTo(Scalar(i), stream2);

        cvtColor(d_src1, d_dst1, cv::COLOR_GRAY2BGR, 0, stream1);
        cvtColor(d_src2, d_dst2, cv::COLOR_GRAY2BGR, 0, stream2);
        // The order of destruction of the local variables is:
        //   d_dst2 => d_src2 => d_dst1 => d_src1
        // LIFO rule is satisfied, this code runs without error
    }
}
@endcode
 */
class CV_EXPORTS_W BufferPool
{
public:

    //! Gets the BufferPool for the given stream.
    explicit BufferPool(Stream& stream);

    //! Allocates a new GpuMat of given size and type.
    CV_WRAP GpuMat getBuffer(int rows, int cols, int type);

    //! Allocates a new GpuMat of given size and type.
    CV_WRAP GpuMat getBuffer(Size size, int type) { return getBuffer(size.height, size.width, type); }

    //! Returns the allocator associated with the stream.
    CV_WRAP Ptr<GpuMat::Allocator> getAllocator() const { return allocator_; }

private:
    Ptr<GpuMat::Allocator> allocator_;
};

//! BufferPool management (must be called before Stream creation)
CV_EXPORTS_W void setBufferPoolUsage(bool on);
CV_EXPORTS_W void setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount);

//===================================================================================
// HostMem
//===================================================================================

/** @brief Class with reference counting wrapping special memory type allocation functions from CUDA.

Its interface is also Mat-like but with additional memory type parameters.

-   **PAGE_LOCKED** sets a page locked memory type used commonly for fast and asynchronous
    uploading/downloading data from/to GPU.
-   **SHARED** specifies a zero copy memory allocation that enables mapping the host memory to GPU
    address space, if supported.
-   **WRITE_COMBINED** sets the write combined buffer that is not cached by CPU. Such buffers are
    used to supply GPU with data when GPU only reads it. The advantage is a better CPU cache
    utilization.

@note Allocation size of such memory types is usually limited. For more details, see *CUDA 2.2
Pinned Memory APIs* document or *CUDA C Programming Guide*.
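
A minimal sketch of allocating page-locked memory for asynchronous transfers (size and type are
illustrative):
@code
cv::cuda::HostMem page_locked(480, 640, CV_8UC1, cv::cuda::HostMem::PAGE_LOCKED);
cv::Mat header = page_locked.createMatHeader(); // cv::Mat view, no reference counting
// fill `header` on the CPU, then upload without blocking the host:
cv::cuda::Stream stream;
cv::cuda::GpuMat d_img;
d_img.upload(page_locked, stream);
@endcode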
 */
class CV_EXPORTS_W HostMem
{
public:
    enum AllocType { PAGE_LOCKED = 1, SHARED = 2, WRITE_COMBINED = 4 };

    static MatAllocator* getAllocator(HostMem::AllocType alloc_type = HostMem::AllocType::PAGE_LOCKED);

    CV_WRAP explicit HostMem(HostMem::AllocType alloc_type = HostMem::AllocType::PAGE_LOCKED);

    HostMem(const HostMem& m);

    CV_WRAP HostMem(int rows, int cols, int type, HostMem::AllocType alloc_type = HostMem::AllocType::PAGE_LOCKED);
    CV_WRAP HostMem(Size size, int type, HostMem::AllocType alloc_type = HostMem::AllocType::PAGE_LOCKED);

    //! creates from host memory, copying the data
    CV_WRAP explicit HostMem(InputArray arr, HostMem::AllocType alloc_type = HostMem::AllocType::PAGE_LOCKED);

    ~HostMem();

    HostMem& operator =(const HostMem& m);

    //! swaps with other smart pointer
    CV_WRAP void swap(HostMem& b);

    //! returns deep copy of the matrix, i.e. the data is copied
    CV_WRAP HostMem clone() const;

    //! allocates new matrix data unless the matrix already has specified size and type.
    CV_WRAP void create(int rows, int cols, int type);
    void create(Size size, int type);

    //! creates alternative HostMem header for the same data, with different
    //! number of channels and/or different number of rows
    CV_WRAP HostMem reshape(int cn, int rows = 0) const;

    //! decrements reference counter and releases memory if needed.
    void release();

    //! returns matrix header with disabled reference counting for HostMem data.
    CV_WRAP Mat createMatHeader() const;

    /** @brief Maps CPU memory to GPU address space and creates the cuda::GpuMat header without reference counting
    for it.

    This can be done only if memory was allocated with the SHARED flag and if it is supported by the
    hardware. Laptops often share video and CPU memory, so address spaces can be mapped, which
    eliminates an extra copy.
    */
    GpuMat createGpuMatHeader() const;

    // Please see cv::Mat for descriptions
    CV_WRAP bool isContinuous() const;
    CV_WRAP size_t elemSize() const;
    CV_WRAP size_t elemSize1() const;
    CV_WRAP int type() const;
    CV_WRAP int depth() const;
    CV_WRAP int channels() const;
    CV_WRAP size_t step1() const;
    CV_WRAP Size size() const;
    CV_WRAP bool empty() const;

    // Please see cv::Mat for descriptions
    int flags;
    int rows, cols;
    CV_PROP size_t step;

    uchar* data;
    int* refcount;

    uchar* datastart;
    const uchar* dataend;

    AllocType alloc_type;
};

/** @brief Page-locks the memory of matrix and maps it for the device(s).

@param m Input matrix.
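
A minimal sketch (the matrix contents are illustrative):
@code
cv::Mat frame(480, 640, CV_8UC1);
cv::cuda::registerPageLocked(frame);   // pins the existing allocation
// asynchronous uploads of `frame` can now overlap with other GPU work
cv::cuda::unregisterPageLocked(frame); // unpin before the Mat is destroyed
@endcode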
 */
CV_EXPORTS_W void registerPageLocked(Mat& m);

/** @brief Unmaps the memory of matrix and makes it pageable again.

@param m Input matrix.
 */
CV_EXPORTS_W void unregisterPageLocked(Mat& m);

//===================================================================================
// Stream
//===================================================================================

/** @brief This class encapsulates a queue of asynchronous calls.

@note Currently, you may face problems if an operation is enqueued twice with different data. Some
functions use the constant GPU memory, and the next call may update the memory before the previous one
has been finished. But calling different operations asynchronously is safe because each operation
has its own constant buffer. Memory copy/upload/download/set operations to the buffers you hold are
also safe.

@note The Stream class is not thread-safe. Please use different Stream objects for different CPU threads.

@code
void thread1()
{
    cv::cuda::Stream stream1;
    cv::cuda::func1(..., stream1);
}

void thread2()
{
    cv::cuda::Stream stream2;
    cv::cuda::func2(..., stream2);
}
@endcode

@note By default, all CUDA routines are launched in the Stream::Null() object if the stream is not
specified by the user. In a multi-threading environment, the stream objects must be passed
explicitly (see the previous note).
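
A minimal sketch of enqueueing work and synchronizing (h_img is an illustrative host matrix):
@code
cv::cuda::Stream stream;
cv::cuda::GpuMat d_img;
d_img.upload(h_img, stream);    // enqueue an asynchronous copy
if (!stream.queryIfComplete())
    stream.waitForCompletion(); // block the CPU thread until the queue drains
@endcode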
 */
class CV_EXPORTS_W Stream
{
    typedef void (Stream::*bool_type)() const;
    void this_type_does_not_support_comparisons() const {}

public:
    typedef void (*StreamCallback)(int status, void* userData);

    //! creates a new asynchronous stream
    CV_WRAP Stream();

    //! creates a new asynchronous stream with custom allocator
    CV_WRAP Stream(const Ptr<GpuMat::Allocator>& allocator);

    /** @brief creates a new Stream using the cudaFlags argument to determine the behaviors of the stream

    @note The cudaFlags parameter is passed to the underlying API cudaStreamCreateWithFlags() and
    supports the same parameter values.
    @code
    // creates an OpenCV cuda::Stream that manages an asynchronous, non-blocking,
    // non-default CUDA stream
    cv::cuda::Stream cvStream(cudaStreamNonBlocking);
    @endcode
    */
    CV_WRAP Stream(const size_t cudaFlags);

    /** @brief Returns true if the current stream queue is finished. Otherwise, it returns false.
    */
    CV_WRAP bool queryIfComplete() const;

    /** @brief Blocks the current CPU thread until all operations in the stream are complete.
    */
    CV_WRAP void waitForCompletion();

    /** @brief Makes a compute stream wait on an event.
    */
    CV_WRAP void waitEvent(const Event& event);

    /** @brief Adds a callback to be called on the host after all currently enqueued items in the stream have
    completed.

    @note Callbacks must not make any CUDA API calls. Callbacks must not perform any synchronization
    that may depend on outstanding device work or other callbacks that are not mandated to run earlier.
    Callbacks without a mandated order (in independent streams) execute in undefined order and may be
    serialized.
    */
    void enqueueHostCallback(StreamCallback callback, void* userData);

    //! return Stream object for default CUDA stream
    CV_WRAP static Stream& Null();

    //! returns true if stream object is not default (!= 0)
    operator bool_type() const;

    //! return Pointer to CUDA stream
    CV_WRAP void* cudaPtr() const;

    class Impl;

private:
    Ptr<Impl> impl_;
    Stream(const Ptr<Impl>& impl);

    friend struct StreamAccessor;
    friend class BufferPool;
    friend class DefaultDeviceInitializer;
};

class CV_EXPORTS_W Event
{
public:
    enum CreateFlags
    {
        DEFAULT        = 0x00,  /**< Default event flag */
        BLOCKING_SYNC  = 0x01,  /**< Event uses blocking synchronization */
        DISABLE_TIMING = 0x02,  /**< Event will not record timing data */
        INTERPROCESS   = 0x04   /**< Event is suitable for interprocess use. DISABLE_TIMING must be set */
    };

    CV_WRAP explicit Event(Event::CreateFlags flags = Event::CreateFlags::DEFAULT);

    //! records an event
    CV_WRAP void record(Stream& stream = Stream::Null());

    //! queries an event's status
    CV_WRAP bool queryIfComplete() const;

    //! waits for an event to complete
    CV_WRAP void waitForCompletion();

    //! computes the elapsed time between events
    CV_WRAP static float elapsedTime(const Event& start, const Event& end);

    class Impl;

private:
    Ptr<Impl> impl_;
    Event(const Ptr<Impl>& impl);

    friend struct EventAccessor;
};

//! @} cudacore_struct

//===================================================================================
// Initialization & Info
//===================================================================================

//! @addtogroup cudacore_init
//! @{

/** @brief Returns the number of installed CUDA-enabled devices.

Use this function before any other CUDA function calls. If OpenCV is compiled without CUDA support,
this function returns 0. If the CUDA driver is not installed, or is incompatible, this function
returns -1.
*/
CV_EXPORTS_W int getCudaEnabledDeviceCount();

/** @brief Sets a device and initializes it for the current thread.

@param device System index of a CUDA device starting with 0.

If the call of this function is omitted, a default device is initialized at the first CUDA usage.
*/
CV_EXPORTS_W void setDevice(int device);

/** @brief Returns the current device index set by cuda::setDevice or initialized by default.
*/
CV_EXPORTS_W int getDevice();

/** @brief Explicitly destroys and cleans up all resources associated with the current device in the current
process.

Any subsequent API call to this device will reinitialize the device.
*/
CV_EXPORTS_W void resetDevice();

/** @brief Enumeration providing CUDA computing features.
*/
enum FeatureSet
{
    FEATURE_SET_COMPUTE_10 = 10,
    FEATURE_SET_COMPUTE_11 = 11,
    FEATURE_SET_COMPUTE_12 = 12,
    FEATURE_SET_COMPUTE_13 = 13,
    FEATURE_SET_COMPUTE_20 = 20,
    FEATURE_SET_COMPUTE_21 = 21,
    FEATURE_SET_COMPUTE_30 = 30,
    FEATURE_SET_COMPUTE_32 = 32,
    FEATURE_SET_COMPUTE_35 = 35,
    FEATURE_SET_COMPUTE_50 = 50,

    GLOBAL_ATOMICS = FEATURE_SET_COMPUTE_11,
    SHARED_ATOMICS = FEATURE_SET_COMPUTE_12,
    NATIVE_DOUBLE = FEATURE_SET_COMPUTE_13,
    WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30,
    DYNAMIC_PARALLELISM = FEATURE_SET_COMPUTE_35
};

//! checks whether current device supports the given feature
CV_EXPORTS bool deviceSupports(FeatureSet feature_set);

/** @brief Class providing a set of static methods to check what NVIDIA\* card architecture the CUDA module was
built for.

According to the CUDA C Programming Guide Version 3.2: "PTX code produced for some specific compute
capability can always be compiled to binary code of greater or equal compute capability".
*/
class CV_EXPORTS_W TargetArchs
{
public:
    /** @brief The following method checks whether the module was built with the support of the given feature:

    @param feature_set Features to be checked. See cuda::FeatureSet.
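
    A minimal sketch of guarding a feature-dependent code path:
    @code
    if (cv::cuda::TargetArchs::builtWith(cv::cuda::GLOBAL_ATOMICS) &&
        cv::cuda::deviceSupports(cv::cuda::GLOBAL_ATOMICS))
    {
        // both the module build and the current device support global atomics
    }
    @endcode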
    */
    static bool builtWith(FeatureSet feature_set);

    /** @brief There is a set of methods to check whether the module contains intermediate (PTX) or binary CUDA
    code for the given architecture(s):

    @param major Major compute capability version.
    @param minor Minor compute capability version.
    */
    CV_WRAP static bool has(int major, int minor);
    CV_WRAP static bool hasPtx(int major, int minor);
    CV_WRAP static bool hasBin(int major, int minor);

    CV_WRAP static bool hasEqualOrLessPtx(int major, int minor);
    CV_WRAP static bool hasEqualOrGreater(int major, int minor);
    CV_WRAP static bool hasEqualOrGreaterPtx(int major, int minor);
    CV_WRAP static bool hasEqualOrGreaterBin(int major, int minor);
};

/** @brief Class providing functionality for querying the specified GPU properties.
*/
class CV_EXPORTS_W DeviceInfo
{
public:
    //! creates DeviceInfo object for the current GPU
    CV_WRAP DeviceInfo();

    /** @brief The constructors.

    @param device_id System index of the CUDA device starting with 0.

    Constructs the DeviceInfo object for the specified device. If the device_id parameter is
    omitted, it constructs an object for the current device.
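
    A minimal sketch of querying the current device (assumes &lt;iostream&gt; is included):
    @code
    cv::cuda::DeviceInfo info(cv::cuda::getDevice());
    if (info.isCompatible())
        std::cout << info.name() << ": "
                  << (info.totalGlobalMem() >> 20) << " MB global memory\n";
    @endcode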
    */
    CV_WRAP DeviceInfo(int device_id);

    /** @brief Returns system index of the CUDA device starting with 0.
    */
    CV_WRAP int deviceID() const;

    //! ASCII string identifying device
    const char* name() const;

    //! global memory available on device in bytes
    CV_WRAP size_t totalGlobalMem() const;

    //! shared memory available per block in bytes
    CV_WRAP size_t sharedMemPerBlock() const;

    //! 32-bit registers available per block
    CV_WRAP int regsPerBlock() const;

    //! warp size in threads
    CV_WRAP int warpSize() const;

    //! maximum pitch in bytes allowed by memory copies
    CV_WRAP size_t memPitch() const;

    //! maximum number of threads per block
    CV_WRAP int maxThreadsPerBlock() const;

    //! maximum size of each dimension of a block
    CV_WRAP Vec3i maxThreadsDim() const;

    //! maximum size of each dimension of a grid
    CV_WRAP Vec3i maxGridSize() const;

    //! clock frequency in kilohertz
    CV_WRAP int clockRate() const;

    //! constant memory available on device in bytes
    CV_WRAP size_t totalConstMem() const;

    //! major compute capability
    CV_WRAP int majorVersion() const;

    //! minor compute capability
    CV_WRAP int minorVersion() const;

    //! alignment requirement for textures
    CV_WRAP size_t textureAlignment() const;

    //! pitch alignment requirement for texture references bound to pitched memory
    CV_WRAP size_t texturePitchAlignment() const;

    //! number of multiprocessors on device
    CV_WRAP int multiProcessorCount() const;

    //! specifies whether there is a run time limit on kernels
    CV_WRAP bool kernelExecTimeoutEnabled() const;

    //! device is integrated as opposed to discrete
    CV_WRAP bool integrated() const;

    //! device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer
    CV_WRAP bool canMapHostMemory() const;

    enum ComputeMode
    {
        ComputeModeDefault,         /**< default compute mode (Multiple threads can use cudaSetDevice with this device) */
        ComputeModeExclusive,       /**< compute-exclusive-thread mode (Only one thread in one process will be able to use cudaSetDevice with this device) */
        ComputeModeProhibited,      /**< compute-prohibited mode (No threads can use cudaSetDevice with this device) */
        ComputeModeExclusiveProcess /**< compute-exclusive-process mode (Many threads in one process will be able to use cudaSetDevice with this device) */
    };

    //! compute mode
    CV_WRAP DeviceInfo::ComputeMode computeMode() const;

    //! maximum 1D texture size
    CV_WRAP int maxTexture1D() const;

    //! maximum 1D mipmapped texture size
    CV_WRAP int maxTexture1DMipmap() const;

    //! maximum size for 1D textures bound to linear memory
    CV_WRAP int maxTexture1DLinear() const;

    //! maximum 2D texture dimensions
    CV_WRAP Vec2i maxTexture2D() const;

    //! maximum 2D mipmapped texture dimensions
    CV_WRAP Vec2i maxTexture2DMipmap() const;

    //! maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory
    CV_WRAP Vec3i maxTexture2DLinear() const;

    //! maximum 2D texture dimensions if texture gather operations have to be performed
    CV_WRAP Vec2i maxTexture2DGather() const;

    //! maximum 3D texture dimensions
    CV_WRAP Vec3i maxTexture3D() const;

    //! maximum Cubemap texture dimensions
    CV_WRAP int maxTextureCubemap() const;

    //! maximum 1D layered texture dimensions
    CV_WRAP Vec2i maxTexture1DLayered() const;

    //! maximum 2D layered texture dimensions
    CV_WRAP Vec3i maxTexture2DLayered() const;

    //! maximum Cubemap layered texture dimensions
    CV_WRAP Vec2i maxTextureCubemapLayered() const;

    //! maximum 1D surface size
    CV_WRAP int maxSurface1D() const;

    //! maximum 2D surface dimensions
    CV_WRAP Vec2i maxSurface2D() const;

    //! maximum 3D surface dimensions
    CV_WRAP Vec3i maxSurface3D() const;

    //! maximum 1D layered surface dimensions
    CV_WRAP Vec2i maxSurface1DLayered() const;

    //! maximum 2D layered surface dimensions
    CV_WRAP Vec3i maxSurface2DLayered() const;

    //! maximum Cubemap surface dimensions
    CV_WRAP int maxSurfaceCubemap() const;

    //! maximum Cubemap layered surface dimensions
    CV_WRAP Vec2i maxSurfaceCubemapLayered() const;

    //! alignment requirements for surfaces
    CV_WRAP size_t surfaceAlignment() const;

    //! device can possibly execute multiple kernels concurrently
    CV_WRAP bool concurrentKernels() const;

    //! device has ECC support enabled
    CV_WRAP bool ECCEnabled() const;

    //! PCI bus ID of the device
    CV_WRAP int pciBusID() const;

    //! PCI device ID of the device
    CV_WRAP int pciDeviceID() const;

    //! PCI domain ID of the device
    CV_WRAP int pciDomainID() const;

    //! true if device is a Tesla device using TCC driver, false otherwise
    CV_WRAP bool tccDriver() const;

    //! number of asynchronous engines
    CV_WRAP int asyncEngineCount() const;

    //! device shares a unified address space with the host
    CV_WRAP bool unifiedAddressing() const;

    //! peak memory clock frequency in kilohertz
    CV_WRAP int memoryClockRate() const;

    //! global memory bus width in bits
    CV_WRAP int memoryBusWidth() const;

    //! size of L2 cache in bytes
    CV_WRAP int l2CacheSize() const;

    //! maximum resident threads per multiprocessor
    CV_WRAP int maxThreadsPerMultiProcessor() const;

    //! gets free and total device memory
    CV_WRAP void queryMemory(size_t& totalMemory, size_t& freeMemory) const;
    CV_WRAP size_t freeMemory() const;
    CV_WRAP size_t totalMemory() const;

    /** @brief Provides information on CUDA feature support.

    @param feature_set Features to be checked. See cuda::FeatureSet.

    This function returns true if the device has the specified CUDA feature. Otherwise, it returns false.
    */
    bool supports(FeatureSet feature_set) const;

    /** @brief Checks the CUDA module and device compatibility.

    This function returns true if the CUDA module can be run on the specified device. Otherwise, it
    returns false.
    */
    CV_WRAP bool isCompatible() const;

private:
    int device_id_;
};

CV_EXPORTS_W void printCudaDeviceInfo(int device);
CV_EXPORTS_W void printShortCudaDeviceInfo(int device);

/** @brief Converts an array to half precision floating point numbers.

@param _src input array.
@param _dst output array.
@param stream Stream for the asynchronous version.
@sa convertFp16
*/
CV_EXPORTS void convertFp16(InputArray _src, OutputArray _dst, Stream& stream = Stream::Null());

//! @} cudacore_init

}} // namespace cv { namespace cuda {


#include "opencv2/core/cuda.inl.hpp"

#endif /* OPENCV_CORE_CUDA_HPP */