// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSPOSE_CONVOLUTION_HPP
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSPOSE_CONVOLUTION_HPP

#include "cudnn.hpp"
#include "convolution.hpp"

#include "../pointer.hpp"
#include "../workspace.hpp"

#include <cudnn.h>

#include <cstddef>
#include <vector>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {

    /** wrapper around a transpose convolution algorithm
     *
     * @tparam T type of elements being transpose-convolved
     */
    template <class T>
    class TransposeConvolutionAlgorithm {
    public:
        TransposeConvolutionAlgorithm() noexcept : workspace_size{ 0 } { }
        TransposeConvolutionAlgorithm(TransposeConvolutionAlgorithm&) = default;
        TransposeConvolutionAlgorithm(TransposeConvolutionAlgorithm&&) = default;

        TransposeConvolutionAlgorithm(
            const Handle& handle,
            const ConvolutionDescriptor<T>& convDesc,
            const FilterDescriptor<T>& filterDesc,
            const TensorDescriptor<T>& inputDesc,
            const TensorDescriptor<T>& outputDesc)
        {
#if CUDNN_MAJOR >= 8
            int requestedAlgoCount = 0, returnedAlgoCount = 0;
            CUDA4DNN_CHECK_CUDNN(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(handle.get(), &requestedAlgoCount));
            std::vector<cudnnConvolutionBwdDataAlgoPerf_t> results(requestedAlgoCount);
            CUDA4DNN_CHECK_CUDNN(
                cudnnGetConvolutionBackwardDataAlgorithm_v7(
                    handle.get(),
                    filterDesc.get(), inputDesc.get(), convDesc.get(), outputDesc.get(),
                    requestedAlgoCount,
                    &returnedAlgoCount,
                    &results[0]
                )
            );

            size_t free_memory, total_memory;
            CUDA4DNN_CHECK_CUDA(cudaMemGetInfo(&free_memory, &total_memory));

            bool found_conv_algorithm = false;
            for (int i = 0; i < returnedAlgoCount; i++)
            {
                if (results[i].status == CUDNN_STATUS_SUCCESS &&
                    results[i].algo != CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED &&
                    results[i].memory < free_memory)
                {
                    found_conv_algorithm = true;
                    dalgo = results[i].algo;
                    workspace_size = results[i].memory;
                    break;
                }
            }

            if (!found_conv_algorithm)
                CV_Error(cv::Error::GpuApiCallError, "cuDNN did not return a suitable algorithm for transpose convolution.");
#else
            CUDA4DNN_CHECK_CUDNN(
                cudnnGetConvolutionBackwardDataAlgorithm(
                    handle.get(),
                    filterDesc.get(), inputDesc.get(), convDesc.get(), outputDesc.get(),
                    CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
                    0, /* no memory limit */
                    &dalgo
                )
            );

            CUDA4DNN_CHECK_CUDNN(
                cudnnGetConvolutionBackwardDataWorkspaceSize(
                    handle.get(),
                    filterDesc.get(), inputDesc.get(), convDesc.get(), outputDesc.get(),
                    dalgo, &workspace_size
                )
            );
#endif
        }

        TransposeConvolutionAlgorithm& operator=(const TransposeConvolutionAlgorithm&) = default;
        TransposeConvolutionAlgorithm& operator=(TransposeConvolutionAlgorithm&& other) = default;

        cudnnConvolutionBwdDataAlgo_t get() const noexcept { return dalgo; }

        std::size_t get_workspace_size() const noexcept { return workspace_size; }

    private:
        cudnnConvolutionBwdDataAlgo_t dalgo;
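        /* scratch memory, in bytes, required by the selected algorithm */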
        std::size_t workspace_size;
    };

    /** @brief performs transpose convolution
     *
     * dstValue = alpha * result + beta * priorDstValue
     *
     * @tparam T transpose convolution element type (must be `half` or `float`)
     *
     * @param handle valid cuDNN Handle
     * @param convDesc convolution descriptor
     * @param transConvAlgo algorithm to use for the transpose convolution
     * @param workspace workspace memory which meets the requirements of \p transConvAlgo
     * @param filterDesc filter descriptor
     * @param[in] filterPtr pointer to device memory containing the filters
     * @param inputDesc tensor descriptor describing the input
     * @param[in] inputPtr pointer to input tensor in device memory
     * @param alpha result scale factor
     * @param beta previous value scale factor
     * @param outputDesc tensor descriptor describing the output
     * @param[out] outputPtr pointer to output tensor in device memory
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void transpose_convolve(
        const Handle& handle,
        const ConvolutionDescriptor<T>& convDesc,
        const TransposeConvolutionAlgorithm<T>& transConvAlgo,
        WorkspaceInstance workspace,
        const FilterDescriptor<T>& filterDesc,
        DevicePtr<const T> filterPtr,
        const TensorDescriptor<T>& inputDesc,
        DevicePtr<const T> inputPtr,
        T alpha, T beta,
        const TensorDescriptor<T>& outputDesc,
        DevicePtr<T> outputPtr)
    {
        CUDA4DNN_CHECK_CUDNN(
            cudnnConvolutionBackwardData(
                handle.get(),
                &alpha,
                filterDesc.get(), filterPtr.get(),
                inputDesc.get(), inputPtr.get(),
                convDesc.get(), transConvAlgo.get(),
                static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
                &beta, outputDesc.get(), outputPtr.get()
            )
        );
    }

    template <> inline
    void transpose_convolve(
        const Handle& handle,
        const ConvolutionDescriptor<half>& convDesc,
        const TransposeConvolutionAlgorithm<half>& convAlgo,
        WorkspaceInstance workspace,
        const FilterDescriptor<half>& filterDesc,
        DevicePtr<const half> filterPtr,
        const TensorDescriptor<half>& inputDesc,
        DevicePtr<const half> inputPtr,
        half alpha, half beta,
        const TensorDescriptor<half>& outputDesc,
        DevicePtr<half> outputPtr)
    {
        /* we specialize for fp16 as the scaling factors must be provided as `float` */
        float alpha_ = alpha, beta_ = beta;
        CUDA4DNN_CHECK_CUDNN(
            cudnnConvolutionBackwardData(
                handle.get(),
                &alpha_,
                filterDesc.get(), filterPtr.get(),
                inputDesc.get(), inputPtr.get(),
                convDesc.get(), convAlgo.get(),
                static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
                &beta_, outputDesc.get(), outputPtr.get()
            )
        );
    }

}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */
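/* Illustrative usage sketch (not part of the upstream wrapper API): the helper below shows how
 * TransposeConvolutionAlgorithm and transpose_convolve are meant to be combined. The name
 * `select_and_transpose_convolve` is hypothetical, and the descriptors, device pointers and
 * workspace are assumed to have been prepared by the caller; the body only uses entities
 * declared in this file.
 */
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {

    template <class T>
    void select_and_transpose_convolve(
        const Handle& handle,
        const ConvolutionDescriptor<T>& convDesc,
        const FilterDescriptor<T>& filterDesc, DevicePtr<const T> filterPtr,
        const TensorDescriptor<T>& inputDesc, DevicePtr<const T> inputPtr,
        T alpha, T beta,
        const TensorDescriptor<T>& outputDesc, DevicePtr<T> outputPtr,
        WorkspaceInstance workspace)
    {
        /* query cuDNN for a suitable backward-data algorithm; the wrapper also records
         * how much scratch memory that algorithm needs
         */
        TransposeConvolutionAlgorithm<T> algo(handle, convDesc, filterDesc, inputDesc, outputDesc);

        /* the caller-provided workspace must satisfy the selected algorithm's requirement */
        if (workspace.size_in_bytes() < algo.get_workspace_size())
            CV_Error(cv::Error::StsBadArg, "workspace is smaller than the selected algorithm requires");

        /* dstValue = alpha * result + beta * priorDstValue; alpha = 1, beta = 0 overwrites the output */
        transpose_convolve<T>(
            handle, convDesc, algo, workspace,
            filterDesc, filterPtr,
            inputDesc, inputPtr,
            alpha, beta,
            outputDesc, outputPtr);
    }

}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */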
#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSPOSE_CONVOLUTION_HPP */