1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html.
4 
5 #ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_LRN_HPP
6 #define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_LRN_HPP
7 
8 #include "cudnn.hpp"
9 
10 #include "../pointer.hpp"
11 #include "../workspace.hpp"
12 
13 #include <opencv2/core.hpp>
14 
15 #include <cudnn.h>
16 
17 #include <cstddef>
18 
19 namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {
20 
21     class LRNDescriptor {
22     public:
23         enum class LRNType {
24             ACROSS_CHANNELS,
25             WITHIN_CHANNEL
26         };
27 
LRNDescriptor()28         LRNDescriptor() noexcept : descriptor{ nullptr } { }
29         LRNDescriptor(const LRNDescriptor&) = delete;
LRNDescriptor(LRNDescriptor && other)30         LRNDescriptor(LRNDescriptor&& other) noexcept
31             : descriptor{ other.descriptor }, type{ other.type } {
32             other.descriptor = nullptr;
33         }
34 
35         /** sets up a LRN descriptor
36          *
37          * @param local_size    size of the normalization window
38          * @param alpha         variance scaling parameter
39          * @param beta          power parameter
40          * @param k             bias parameter
41          *
42          * @note \p alpha is divided by the window width in across channels mode
43          * @note \p alpha is divided by the (window width)^spatialDimensions in within channel mode
44          *
45          * @note the \p alpha, \p beta and \p k will be type casted to the tensor datatype during operation
46          *
47          * Exception Guarantee: Basic
48          */
LRNDescriptor(std::size_t local_size,double alpha,double beta,double k,LRNType type_)49         LRNDescriptor(std::size_t local_size, double alpha, double beta, double k, LRNType type_) {
50             constructor(local_size, alpha, beta, k, type_);
51         }
52 
~LRNDescriptor()53         ~LRNDescriptor() noexcept {
54             if (descriptor != nullptr) {
55                 /* cudnnDestroyLRNDescriptor will not fail for a valid descriptor */
56                 CUDA4DNN_CHECK_CUDNN(cudnnDestroyLRNDescriptor(descriptor));
57             }
58         }
59 
60         LRNDescriptor& operator=(const LRNDescriptor&) = delete;
operator =(LRNDescriptor && other)61         LRNDescriptor& operator=(LRNDescriptor&& other) noexcept {
62             descriptor = other.descriptor;
63             type = other.type;
64             other.descriptor = nullptr;
65             return *this;
66         };
67 
get() const68         cudnnLRNDescriptor_t get() const noexcept { return descriptor; }
getType() const69         LRNType getType() const noexcept { return type; }
70 
71     private:
constructor(std::size_t local_size,double alpha,double beta,double k,LRNType type_)72         void constructor(std::size_t local_size, double alpha, double beta, double k, LRNType type_) {
73             CV_Assert(CUDNN_LRN_MIN_N <= local_size && local_size <= CUDNN_LRN_MAX_N);
74 
75             type = type_;
76 
77             CUDA4DNN_CHECK_CUDNN(cudnnCreateLRNDescriptor(&descriptor));
78             try {
79                 CUDA4DNN_CHECK_CUDNN(
80                     cudnnSetLRNDescriptor(
81                         descriptor,
82                         local_size,
83                         alpha,
84                         beta,
85                         k
86                     )
87                );
88             } catch (...) {
89                 /* cudnnDestroyLRNDescriptor will not fail for a valid descriptor */
90                 CUDA4DNN_CHECK_CUDNN(cudnnDestroyLRNDescriptor(descriptor));
91                 throw;
92             }
93         }
94 
95         cudnnLRNDescriptor_t descriptor;
96         LRNType type;
97     };
98 
99     /** @brief performs local response normalization
100      *
101      * dstValue = alpha * result + beta * priorDstValue
102      *
103      * @tparam          T           element type (must be `half` or `float`)
104      *
105      * @param           handle      valid cuDNN Handle
106      * @param           lrnDesc     LRN description
107      * @param           inputDesc   tensor descriptor describing the input
108      * @param[in]       inputPtr    pointer to input tensor in device memory
109      * @param           alpha       result scale factor
110      * @param           beta        previous value scale factor
111      * @param           outputDesc  tensor descriptor describing the output
112      * @param[out]      outputPtr   pointer to output tensor in device memory
113      * @param           workspace   workspace memory which meets the requirements of \p convAlgo
114      *
115      * Exception Guarantee: Basic
116      */
117     template <class T>
LRNForward(const Handle & handle,const LRNDescriptor & lrnDesc,const TensorDescriptor<T> & inputDesc,DevicePtr<const T> inputPtr,T alpha,T beta,const TensorDescriptor<T> & outputDesc,DevicePtr<T> outputPtr,WorkspaceInstance workspace)118     void LRNForward(
119         const Handle& handle,
120         const LRNDescriptor& lrnDesc,
121         const TensorDescriptor<T>& inputDesc,
122         DevicePtr<const T> inputPtr,
123         T alpha, T beta,
124         const TensorDescriptor<T>& outputDesc,
125         DevicePtr<T> outputPtr,
126         WorkspaceInstance workspace)
127     {
128         CV_Assert(handle);
129 
130         if (lrnDesc.getType() == LRNDescriptor::LRNType::ACROSS_CHANNELS) {
131             CUDA4DNN_CHECK_CUDNN(
132                 cudnnLRNCrossChannelForward(
133                     handle.get(),
134                     lrnDesc.get(), CUDNN_LRN_CROSS_CHANNEL_DIM1,
135                     &alpha, inputDesc.get(), inputPtr.get(),
136                     &beta, outputDesc.get(), outputPtr.get()
137                 )
138             );
139         } else if (lrnDesc.getType() == LRNDescriptor::LRNType::WITHIN_CHANNEL) {
140             std::size_t size;
141             CUDA4DNN_CHECK_CUDNN(cudnnGetTensorSizeInBytes(inputDesc.get(), &size));
142 
143             DevicePtr<void> temp1 = workspace.get_span<half>(size).data();
144             DevicePtr<void> temp2 = workspace.get_span<half>(size).data();
145 
146             CUDA4DNN_CHECK_CUDNN(
147                 cudnnDivisiveNormalizationForward(
148                     handle.get(),
149                     lrnDesc.get(), CUDNN_DIVNORM_PRECOMPUTED_MEANS,
150                     &alpha, inputDesc.get(), inputPtr.get(),
151                     NULL,
152                     static_cast<void*>(temp1), static_cast<void*>(temp2),
153                     &beta, outputDesc.get(), outputPtr.get()
154                 )
155             );
156         }
157     }
158 
159     template <> inline
LRNForward(const Handle & handle,const LRNDescriptor & lrnDesc,const TensorDescriptor<half> & inputDesc,DevicePtr<const half> inputPtr,half alpha,half beta,const TensorDescriptor<half> & outputDesc,DevicePtr<half> outputPtr,WorkspaceInstance workspace)160     void LRNForward(
161        const Handle& handle,
162        const LRNDescriptor& lrnDesc,
163        const TensorDescriptor<half>& inputDesc,
164        DevicePtr<const half> inputPtr,
165        half alpha, half beta,
166        const TensorDescriptor<half>& outputDesc,
167        DevicePtr<half> outputPtr,
168         WorkspaceInstance workspace)
169     {
170         CV_Assert(handle);
171 
172         /* we specalize for fp16 as the scaling factors must be provided as `float` */
173         float alpha_ = alpha, beta_ = beta;
174         if (lrnDesc.getType() == LRNDescriptor::LRNType::ACROSS_CHANNELS) {
175             CUDA4DNN_CHECK_CUDNN(
176                 cudnnLRNCrossChannelForward(
177                     handle.get(),
178                     lrnDesc.get(), CUDNN_LRN_CROSS_CHANNEL_DIM1,
179                     &alpha_, inputDesc.get(), inputPtr.get(),
180                     &beta_, outputDesc.get(), outputPtr.get()
181                 )
182             );
183         } else if (lrnDesc.getType() == LRNDescriptor::LRNType::WITHIN_CHANNEL) {
184             std::size_t size;
185             CUDA4DNN_CHECK_CUDNN(cudnnGetTensorSizeInBytes(inputDesc.get(), &size));
186 
187             DevicePtr<void> temp1 = workspace.get_span<half>(size).data();
188             DevicePtr<void> temp2 = workspace.get_span<half>(size).data();
189 
190             CUDA4DNN_CHECK_CUDNN(
191                 cudnnDivisiveNormalizationForward(
192                     handle.get(),
193                     lrnDesc.get(), CUDNN_DIVNORM_PRECOMPUTED_MEANS,
194                     &alpha_, inputDesc.get(), inputPtr.get(),
195                     NULL,
196                     static_cast<void*>(temp1), static_cast<void*>(temp2),
197                     &beta_, outputDesc.get(), outputPtr.get()
198                 )
199             );
200         }
201     }
202 
203 }}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */
204 
205 #endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_LRN_HPP */
206