1 // This file is part of OpenCV project. 2 // It is subject to the license terms in the LICENSE file found in the top-level directory 3 // of this distribution and at http://opencv.org/license.html. 4 5 #ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_LRN_HPP 6 #define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_LRN_HPP 7 8 #include "cudnn.hpp" 9 10 #include "../pointer.hpp" 11 #include "../workspace.hpp" 12 13 #include <opencv2/core.hpp> 14 15 #include <cudnn.h> 16 17 #include <cstddef> 18 19 namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn { 20 21 class LRNDescriptor { 22 public: 23 enum class LRNType { 24 ACROSS_CHANNELS, 25 WITHIN_CHANNEL 26 }; 27 LRNDescriptor()28 LRNDescriptor() noexcept : descriptor{ nullptr } { } 29 LRNDescriptor(const LRNDescriptor&) = delete; LRNDescriptor(LRNDescriptor && other)30 LRNDescriptor(LRNDescriptor&& other) noexcept 31 : descriptor{ other.descriptor }, type{ other.type } { 32 other.descriptor = nullptr; 33 } 34 35 /** sets up a LRN descriptor 36 * 37 * @param local_size size of the normalization window 38 * @param alpha variance scaling parameter 39 * @param beta power parameter 40 * @param k bias parameter 41 * 42 * @note \p alpha is divided by the window width in across channels mode 43 * @note \p alpha is divided by the (window width)^spatialDimensions in within channel mode 44 * 45 * @note the \p alpha, \p beta and \p k will be type casted to the tensor datatype during operation 46 * 47 * Exception Guarantee: Basic 48 */ LRNDescriptor(std::size_t local_size,double alpha,double beta,double k,LRNType type_)49 LRNDescriptor(std::size_t local_size, double alpha, double beta, double k, LRNType type_) { 50 constructor(local_size, alpha, beta, k, type_); 51 } 52 ~LRNDescriptor()53 ~LRNDescriptor() noexcept { 54 if (descriptor != nullptr) { 55 /* cudnnDestroyLRNDescriptor will not fail for a valid descriptor */ 56 CUDA4DNN_CHECK_CUDNN(cudnnDestroyLRNDescriptor(descriptor)); 57 } 58 } 59 60 LRNDescriptor& operator=(const LRNDescriptor&) = delete; operator =(LRNDescriptor && other)61 LRNDescriptor& operator=(LRNDescriptor&& other) noexcept { 62 descriptor = other.descriptor; 63 type = other.type; 64 other.descriptor = nullptr; 65 return *this; 66 }; 67 get() const68 cudnnLRNDescriptor_t get() const noexcept { return descriptor; } getType() const69 LRNType getType() const noexcept { return type; } 70 71 private: constructor(std::size_t local_size,double alpha,double beta,double k,LRNType type_)72 void constructor(std::size_t local_size, double alpha, double beta, double k, LRNType type_) { 73 CV_Assert(CUDNN_LRN_MIN_N <= local_size && local_size <= CUDNN_LRN_MAX_N); 74 75 type = type_; 76 77 CUDA4DNN_CHECK_CUDNN(cudnnCreateLRNDescriptor(&descriptor)); 78 try { 79 CUDA4DNN_CHECK_CUDNN( 80 cudnnSetLRNDescriptor( 81 descriptor, 82 local_size, 83 alpha, 84 beta, 85 k 86 ) 87 ); 88 } catch (...) { 89 /* cudnnDestroyLRNDescriptor will not fail for a valid descriptor */ 90 CUDA4DNN_CHECK_CUDNN(cudnnDestroyLRNDescriptor(descriptor)); 91 throw; 92 } 93 } 94 95 cudnnLRNDescriptor_t descriptor; 96 LRNType type; 97 }; 98 99 /** @brief performs local response normalization 100 * 101 * dstValue = alpha * result + beta * priorDstValue 102 * 103 * @tparam T element type (must be `half` or `float`) 104 * 105 * @param handle valid cuDNN Handle 106 * @param lrnDesc LRN description 107 * @param inputDesc tensor descriptor describing the input 108 * @param[in] inputPtr pointer to input tensor in device memory 109 * @param alpha result scale factor 110 * @param beta previous value scale factor 111 * @param outputDesc tensor descriptor describing the output 112 * @param[out] outputPtr pointer to output tensor in device memory 113 * @param workspace workspace memory which meets the requirements of \p convAlgo 114 * 115 * Exception Guarantee: Basic 116 */ 117 template <class T> LRNForward(const Handle & handle,const LRNDescriptor & lrnDesc,const TensorDescriptor<T> & inputDesc,DevicePtr<const T> inputPtr,T alpha,T beta,const TensorDescriptor<T> & outputDesc,DevicePtr<T> outputPtr,WorkspaceInstance workspace)118 void LRNForward( 119 const Handle& handle, 120 const LRNDescriptor& lrnDesc, 121 const TensorDescriptor<T>& inputDesc, 122 DevicePtr<const T> inputPtr, 123 T alpha, T beta, 124 const TensorDescriptor<T>& outputDesc, 125 DevicePtr<T> outputPtr, 126 WorkspaceInstance workspace) 127 { 128 CV_Assert(handle); 129 130 if (lrnDesc.getType() == LRNDescriptor::LRNType::ACROSS_CHANNELS) { 131 CUDA4DNN_CHECK_CUDNN( 132 cudnnLRNCrossChannelForward( 133 handle.get(), 134 lrnDesc.get(), CUDNN_LRN_CROSS_CHANNEL_DIM1, 135 &alpha, inputDesc.get(), inputPtr.get(), 136 &beta, outputDesc.get(), outputPtr.get() 137 ) 138 ); 139 } else if (lrnDesc.getType() == LRNDescriptor::LRNType::WITHIN_CHANNEL) { 140 std::size_t size; 141 CUDA4DNN_CHECK_CUDNN(cudnnGetTensorSizeInBytes(inputDesc.get(), &size)); 142 143 DevicePtr<void> temp1 = workspace.get_span<half>(size).data(); 144 DevicePtr<void> temp2 = workspace.get_span<half>(size).data(); 145 146 CUDA4DNN_CHECK_CUDNN( 147 cudnnDivisiveNormalizationForward( 148 handle.get(), 149 lrnDesc.get(), CUDNN_DIVNORM_PRECOMPUTED_MEANS, 150 &alpha, inputDesc.get(), inputPtr.get(), 151 NULL, 152 static_cast<void*>(temp1), static_cast<void*>(temp2), 153 &beta, outputDesc.get(), outputPtr.get() 154 ) 155 ); 156 } 157 } 158 159 template <> inline LRNForward(const Handle & handle,const LRNDescriptor & lrnDesc,const TensorDescriptor<half> & inputDesc,DevicePtr<const half> inputPtr,half alpha,half beta,const TensorDescriptor<half> & outputDesc,DevicePtr<half> outputPtr,WorkspaceInstance workspace)160 void LRNForward( 161 const Handle& handle, 162 const LRNDescriptor& lrnDesc, 163 const TensorDescriptor<half>& inputDesc, 164 DevicePtr<const half> inputPtr, 165 half alpha, half beta, 166 const TensorDescriptor<half>& outputDesc, 167 DevicePtr<half> outputPtr, 168 WorkspaceInstance workspace) 169 { 170 CV_Assert(handle); 171 172 /* we specalize for fp16 as the scaling factors must be provided as `float` */ 173 float alpha_ = alpha, beta_ = beta; 174 if (lrnDesc.getType() == LRNDescriptor::LRNType::ACROSS_CHANNELS) { 175 CUDA4DNN_CHECK_CUDNN( 176 cudnnLRNCrossChannelForward( 177 handle.get(), 178 lrnDesc.get(), CUDNN_LRN_CROSS_CHANNEL_DIM1, 179 &alpha_, inputDesc.get(), inputPtr.get(), 180 &beta_, outputDesc.get(), outputPtr.get() 181 ) 182 ); 183 } else if (lrnDesc.getType() == LRNDescriptor::LRNType::WITHIN_CHANNEL) { 184 std::size_t size; 185 CUDA4DNN_CHECK_CUDNN(cudnnGetTensorSizeInBytes(inputDesc.get(), &size)); 186 187 DevicePtr<void> temp1 = workspace.get_span<half>(size).data(); 188 DevicePtr<void> temp2 = workspace.get_span<half>(size).data(); 189 190 CUDA4DNN_CHECK_CUDNN( 191 cudnnDivisiveNormalizationForward( 192 handle.get(), 193 lrnDesc.get(), CUDNN_DIVNORM_PRECOMPUTED_MEANS, 194 &alpha_, inputDesc.get(), inputPtr.get(), 195 NULL, 196 static_cast<void*>(temp1), static_cast<void*>(temp2), 197 &beta_, outputDesc.get(), outputPtr.get() 198 ) 199 ); 200 } 201 } 202 203 }}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */ 204 205 #endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_LRN_HPP */ 206