1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html.
4 
5 #ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSPOSE_CONVOLUTION_HPP
6 #define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSPOSE_CONVOLUTION_HPP
7 
8 #include "cudnn.hpp"
9 #include "convolution.hpp"
10 
11 #include "../pointer.hpp"
12 #include "../workspace.hpp"
13 
14 #include <cudnn.h>
15 
16 #include <cstddef>
17 
18 namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {
19 
20     /** wrapper around a transpose convolution algorithm
21      *
22      * @tparam  T   type of elements being transpose-convolved
23      */
24     template <class T>
25     class TransposeConvolutionAlgorithm {
26     public:
TransposeConvolutionAlgorithm()27         TransposeConvolutionAlgorithm() noexcept : workspace_size{ 0 } { }
28         TransposeConvolutionAlgorithm(TransposeConvolutionAlgorithm&) = default;
29         TransposeConvolutionAlgorithm(TransposeConvolutionAlgorithm&&) = default;
30 
TransposeConvolutionAlgorithm(const Handle & handle,const ConvolutionDescriptor<T> & convDesc,const FilterDescriptor<T> & filterDesc,const TensorDescriptor<T> & inputDesc,const TensorDescriptor<T> & outputDesc)31         TransposeConvolutionAlgorithm(
32             const Handle& handle,
33             const ConvolutionDescriptor<T>& convDesc,
34             const FilterDescriptor<T>& filterDesc,
35             const TensorDescriptor<T>& inputDesc,
36             const TensorDescriptor<T>& outputDesc)
37         {
38 #if CUDNN_MAJOR >= 8
39             int requestedAlgoCount = 0, returnedAlgoCount = 0;
40             CUDA4DNN_CHECK_CUDNN(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(handle.get(), &requestedAlgoCount));
41             std::vector<cudnnConvolutionBwdDataAlgoPerf_t> results(requestedAlgoCount);
42             CUDA4DNN_CHECK_CUDNN(
43                 cudnnGetConvolutionBackwardDataAlgorithm_v7(
44                     handle.get(),
45                     filterDesc.get(), inputDesc.get(), convDesc.get(), outputDesc.get(),
46                     requestedAlgoCount,
47                     &returnedAlgoCount,
48                     &results[0]
49                 )
50             );
51 
52             size_t free_memory, total_memory;
53             CUDA4DNN_CHECK_CUDA(cudaMemGetInfo(&free_memory, &total_memory));
54 
55             bool found_conv_algorithm = false;
56             for (int i = 0; i < returnedAlgoCount; i++)
57             {
58                 if (results[i].status == CUDNN_STATUS_SUCCESS &&
59                     results[i].algo != CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED &&
60                     results[i].memory < free_memory)
61                 {
62                     found_conv_algorithm = true;
63                     dalgo = results[i].algo;
64                     workspace_size = results[i].memory;
65                     break;
66                 }
67             }
68 
69             if (!found_conv_algorithm)
70                 CV_Error (cv::Error::GpuApiCallError, "cuDNN did not return a suitable algorithm for transpose convolution.");
71 #else
72             CUDA4DNN_CHECK_CUDNN(
73                 cudnnGetConvolutionBackwardDataAlgorithm(
74                     handle.get(),
75                     filterDesc.get(), inputDesc.get(), convDesc.get(), outputDesc.get(),
76                     CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
77                     0, /* no memory limit */
78                     &dalgo
79                 )
80             );
81 
82             CUDA4DNN_CHECK_CUDNN(
83                 cudnnGetConvolutionBackwardDataWorkspaceSize(
84                     handle.get(),
85                     filterDesc.get(), inputDesc.get(), convDesc.get(), outputDesc.get(),
86                     dalgo, &workspace_size
87                 )
88             );
89 #endif
90         }
91 
92         TransposeConvolutionAlgorithm& operator=(const TransposeConvolutionAlgorithm&) = default;
93         TransposeConvolutionAlgorithm& operator=(TransposeConvolutionAlgorithm&& other) = default;
94 
get() const95         cudnnConvolutionBwdDataAlgo_t get() const noexcept { return dalgo; }
96 
get_workspace_size() const97         std::size_t get_workspace_size() const noexcept { return workspace_size; }
98 
99     private:
100         cudnnConvolutionBwdDataAlgo_t dalgo;
101         std::size_t workspace_size;
102     };
103 
104     /** @brief performs transpose convolution
105       *
106       * dstValue = alpha * result + beta * priorDstValue
107       *
108       * @tparam          T              transpose convolution element type (must be `half` or `float`)
109       *
110       * @param           handle         valid cuDNN Handle
111       * @param           convDesc       convolution description
112       * @param           transConvAlgo  algorithm to use for convolution
113       * @param           workspace      workspace memory which meets the requirements of \p convAlgo
114       * @param           filterDesc     filter descriptor
115       * @param[in]       filterPtr      pointer to device memory containing the filters
116       * @param           inputDesc      tensor descriptor describing the input
117       * @param[in]       inputPtr       pointer to input tensor in device memory
118       * @param           alpha          result scale factor
119       * @param           beta           previous value scale factor
120       * @param           outputDesc     tensor descriptor describing the output
121       * @param[out]      outputPtr      pointer to output tensor in device memory
122       *
123       * Exception Guarantee: Basic
124       */
125     template <class T>
transpose_convolve(const Handle & handle,const ConvolutionDescriptor<T> & convDesc,const TransposeConvolutionAlgorithm<T> & transConvAlgo,WorkspaceInstance workspace,const FilterDescriptor<T> & filterDesc,DevicePtr<const T> filterPtr,const TensorDescriptor<T> & inputDesc,DevicePtr<const T> inputPtr,T alpha,T beta,const TensorDescriptor<T> & outputDesc,DevicePtr<T> outputPtr)126     void transpose_convolve(
127         const Handle& handle,
128         const ConvolutionDescriptor<T>& convDesc,
129         const TransposeConvolutionAlgorithm<T>& transConvAlgo,
130         WorkspaceInstance workspace,
131         const FilterDescriptor<T>& filterDesc,
132         DevicePtr<const T> filterPtr,
133         const TensorDescriptor<T>& inputDesc,
134         DevicePtr<const T> inputPtr,
135         T alpha, T beta,
136         const TensorDescriptor<T>& outputDesc,
137         DevicePtr<T> outputPtr)
138     {
139         CUDA4DNN_CHECK_CUDNN(
140             cudnnConvolutionBackwardData(
141                 handle.get(),
142                 &alpha,
143                 filterDesc.get(), filterPtr.get(),
144                 inputDesc.get(), inputPtr.get(),
145                 convDesc.get(), transConvAlgo.get(),
146                 static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
147                 &beta, outputDesc.get(), outputPtr.get()
148             )
149         );
150     }
151 
152     template <> inline
transpose_convolve(const Handle & handle,const ConvolutionDescriptor<half> & convDesc,const TransposeConvolutionAlgorithm<half> & convAlgo,WorkspaceInstance workspace,const FilterDescriptor<half> & filterDesc,DevicePtr<const half> filterPtr,const TensorDescriptor<half> & inputDesc,DevicePtr<const half> inputPtr,half alpha,half beta,const TensorDescriptor<half> & outputDesc,DevicePtr<half> outputPtr)153     void transpose_convolve(
154         const Handle& handle,
155         const ConvolutionDescriptor<half>& convDesc,
156         const TransposeConvolutionAlgorithm<half>& convAlgo,
157         WorkspaceInstance workspace,
158         const FilterDescriptor<half>& filterDesc,
159         DevicePtr<const half> filterPtr,
160         const TensorDescriptor<half>& inputDesc,
161         DevicePtr<const half> inputPtr,
162         half alpha, half beta,
163         const TensorDescriptor<half>& outputDesc,
164         DevicePtr<half> outputPtr)
165     {
166         /* we specalize for fp16 as the scaling factors must be provided as `float` */
167         float alpha_ = alpha, beta_ = beta;
168         CUDA4DNN_CHECK_CUDNN(
169             cudnnConvolutionBackwardData(
170                 handle.get(),
171                 &alpha_,
172                 filterDesc.get(), filterPtr.get(),
173                 inputDesc.get(), inputPtr.get(),
174                 convDesc.get(), convAlgo.get(),
175                 static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
176                 &beta_, outputDesc.get(), outputPtr.get()
177             )
178         );
179     }
180 
181 }}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */
182 
183 #endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSPOSE_CONVOLUTION_HPP */
184