/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_NVIDIA_CUDNN_POOLING_IMPL_HPP
#define GPU_NVIDIA_CUDNN_POOLING_IMPL_HPP

#include <algorithm>

#include <cudnn.h>

#include "common/pooling_pd.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {

struct cudnn_pooling_impl_base_t {
    virtual status_t init(const pooling_pd_t *pd) = 0;

    virtual ~cudnn_pooling_impl_base_t() {
        for (size_t i = 0; i < NUM_IO; ++i) {
            if (tensor_descs_[i]) {
                CUDNN_EXECUTE_FUNC_V(
                        cudnnDestroyTensorDescriptor, tensor_descs_[i]);
            }
        }

        if (pool_desc_) {
            CUDNN_EXECUTE_FUNC_V(cudnnDestroyPoolingDescriptor, pool_desc_);
        }
    }

    virtual void execute(cudnnHandle_t handle, void *x, void *y, void *ws_x,
            void *ws_y) const = 0;

protected:
    status_t init_common(const pooling_pd_t *pd) {
        // Promote 1D pooling (3D tensors) to 2D, since the descriptors
        // below are built with at least 4 dimensions.
        ndims_ = std::max(4, pd->ndims());
        kernel_ndims_ = ndims_ - 2;

        // Only 1D, 2D and 3D pooling is supported by cuDNN.
        if (kernel_ndims_ > 3) { return status::unimplemented; }

        // cuDNN requires symmetric padding. Configurations where the
        // padding at the beginning of a dimension exceeds the padding at
        // its end still work as expected. However, when the padding at the
        // end of any dimension exceeds the padding at its beginning, the
        // "average include padding" algorithm produces wrong results,
        // because the data is rearranged incorrectly under the
        // symmetric-padding assumption. For example, padL = 1 with
        // padR = 0 is accepted, while padL = 0 with padR = 1 is not.
        // Such configurations therefore return status::unimplemented.
        if (pd->desc()->alg_kind == alg_kind::pooling_avg_include_padding
                && (pd->padL() < pd->padR() || pd->padT() < pd->padB()
                        || pd->padFront() < pd->padBack())) {
            return status::unimplemented;
        }

        is_training_ = pd->desc()->prop_kind == prop_kind::forward_training;
        bool is_fwd = pd->is_fwd();
        auto src_md = is_fwd ? pd->src_md() : pd->diff_src_md();
        auto dst_md = is_fwd ? pd->dst_md() : pd->diff_dst_md();

        // Nothing to configure when any dimension is zero.
        if (has_zero_dims(src_md->dims, pd->ndims())
                || has_zero_dims(dst_md->dims, pd->ndims())) {
            return status::success;
        }

        if (is_training_) {
            // Remember the source and destination sizes so that a
            // workspace holding copies of x and y can be sized for the
            // backward pass.
            auto src_wrap = memory_desc_wrapper(src_md);
            auto dst_wrap = memory_desc_wrapper(dst_md);
            x_size_bytes_ = src_wrap.size();
            y_size_bytes_ = dst_wrap.size();
        }

        convert_dims(src_md->padded_dims, dims_[src], pd->ndims());
        convert_dims(dst_md->padded_dims, dims_[dst], pd->ndims());

        convert_dims(src_md->format_desc.blocking.strides, strides_[src],
                pd->ndims());
        convert_dims(dst_md->format_desc.blocking.strides, strides_[dst],
                pd->ndims());

        convert_dims(pd->desc()->kernel, kernel_dims_, kernel_ndims_);

        // 1D pooling
        if (pd->ndims() == 3) {
            // Convert dims from the [n, c, w, 1] layout produced by
            // convert_dims to the [n, c, 1, w] layout cuDNN expects.
            dims_[src][3] = dims_[src][2];
            dims_[src][2] = 1;

            dims_[dst][3] = dims_[dst][2];
            dims_[dst][2] = 1;

            // Set the kernel dimensions to [1, kw].
            kernel_dims_[1] = kernel_dims_[0];
            kernel_dims_[0] = 1;
        }
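
        // Illustration (hypothetical shapes, not taken from this code): a
        // 1D max pooling over src = [n = 8, c = 16, w = 32] with kernel = 3
        // reaches this point as dims_[src] = [8, 16, 32, 1]; the swap above
        // turns it into [8, 16, 1, 32] with kernel_dims_ = [1, 3], which
        // cuDNN treats as a 2D problem with a unit height axis.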

        if (ndims_ == 4) {
            kernel_padding_[0] = static_cast<int>(pd->padT());
            kernel_padding_[1] = static_cast<int>(pd->padL());

            kernel_strides_[0] = static_cast<int>(pd->KSH());
            kernel_strides_[1] = static_cast<int>(pd->KSW());
        } else {
            kernel_padding_[0] = static_cast<int>(pd->padFront());
            kernel_padding_[1] = static_cast<int>(pd->padT());
            kernel_padding_[2] = static_cast<int>(pd->padL());

            kernel_strides_[0] = static_cast<int>(pd->KSD());
            kernel_strides_[1] = static_cast<int>(pd->KSH());
            kernel_strides_[2] = static_cast<int>(pd->KSW());
        }

        CHECK(convert_data_type(src_md, &data_types_[src]));
        CHECK(convert_data_type(dst_md, &data_types_[dst]));

        CHECK(convert_alg_kind(pd->desc()->alg_kind, &pool_mode_));

        cudnnTensorFormat_t src_format, dst_format;
        CHECK(get_format(src_md, src_format));
        CHECK(get_format(dst_md, dst_format));

        CHECK(create_and_set_tensor_descriptor_ex(&tensor_descs_[src],
                src_format, data_types_[src], ndims_, dims_[src]));
        CHECK(create_and_set_tensor_descriptor_ex(&tensor_descs_[dst],
                dst_format, data_types_[dst], ndims_, dims_[dst]));

        CHECK(create_and_set_pooling_descriptor(pd));

        return status::success;
    }

    status_t create_and_set_pooling_descriptor(const pooling_pd_t *pd) {
        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreatePoolingDescriptor, &pool_desc_));

        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetPoolingNdDescriptor, pool_desc_,
                pool_mode_, CUDNN_PROPAGATE_NAN, kernel_ndims_, kernel_dims_,
                kernel_padding_, kernel_strides_));

        return status::success;
    }

    status_t convert_alg_kind(
            alg_kind_t alg_kind, cudnnPoolingMode_t *cudnn_alg_kind) const {
        switch (alg_kind) {
            case alg_kind::pooling_max:
                *cudnn_alg_kind = CUDNN_POOLING_MAX;
                break;
            case alg_kind::pooling_avg_include_padding:
                *cudnn_alg_kind = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
                break;
            case alg_kind::pooling_avg_exclude_padding:
                *cudnn_alg_kind = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
                break;
            default: return status::unimplemented;
        }

        return status::success;
    }

    enum io { src = 0, dst, NUM_IO };
    cudnnDataType_t data_types_[NUM_IO];
    cudnnTensorDescriptor_t tensor_descs_[NUM_IO] = {};
    // Initialized to nullptr so that the destructor is safe to run even
    // when init_common() returns before the descriptor is created.
    cudnnPoolingDescriptor_t pool_desc_ = nullptr;
    cudnnPoolingMode_t pool_mode_ = CUDNN_POOLING_MAX;
    int dims_[NUM_IO][DNNL_MAX_NDIMS];
    int strides_[NUM_IO][DNNL_MAX_NDIMS];
    int kernel_dims_[DNNL_MAX_NDIMS];
    int kernel_padding_[DNNL_MAX_NDIMS];
    int kernel_strides_[DNNL_MAX_NDIMS];
    const float alpha_ = 1.f, beta_ = 0.f;
    int ndims_, kernel_ndims_;
    bool is_training_ = false;
    std::size_t x_size_bytes_ = 0, y_size_bytes_ = 0;
};
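
// Usage sketch (illustrative only; `pd`, `handle` and the buffer names are
// hypothetical stand-ins for what the cuDNN pooling primitive passes in):
//
//     cudnn_pooling_fwd_impl_t fwd;
//     CHECK(fwd.init(pd)); // builds the tensor and pooling descriptors
//     fwd.execute(handle, x, y, ws_x, ws_y); // ws_* written only in training
//
//     cudnn_pooling_bwd_impl_t bwd;
//     CHECK(bwd.init(pd));
//     bwd.execute(handle, diff_x, diff_y, ws_x, ws_y); // reads ws_* back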

struct cudnn_pooling_fwd_impl_t : public cudnn_pooling_impl_base_t {
    status_t init(const pooling_pd_t *pd) override {
        return cudnn_pooling_impl_base_t::init_common(pd);
    }

    void execute(cudnnHandle_t handle, void *x, void *y, void *ws_x,
            void *ws_y) const override {

        CUDNN_EXECUTE_FUNC(cudnnPoolingForward, handle, pool_desc_, &alpha_,
                tensor_descs_[src], x, &beta_, tensor_descs_[dst], y);

        if (is_training_) {
            // Copy x and y into the workspace so that they can be used in
            // the backward pass. With alpha_ = 1 and beta_ = 0,
            // cudnnAddTensor computes ws = 1 * in + 0 * ws, i.e. a plain
            // copy.
            CUDNN_EXECUTE_FUNC(cudnnAddTensor, handle, &alpha_,
                    tensor_descs_[src], x, &beta_, tensor_descs_[src], ws_x);
            CUDNN_EXECUTE_FUNC(cudnnAddTensor, handle, &alpha_,
                    tensor_descs_[dst], y, &beta_, tensor_descs_[dst], ws_y);
        }
    }
};

struct cudnn_pooling_bwd_impl_t : public cudnn_pooling_impl_base_t {
    status_t init(const pooling_pd_t *pd) override {
        return cudnn_pooling_impl_base_t::init_common(pd);
    }

    void execute(cudnnHandle_t handle, void *dx, void *dy, void *ws_x,
            void *ws_y) const override {

        // ws_x and ws_y hold the forward source and destination saved by
        // cudnn_pooling_fwd_impl_t::execute(); cudnnPoolingBackward needs
        // both to compute the source gradient.
        CUDNN_EXECUTE_FUNC(cudnnPoolingBackward, handle, pool_desc_, &alpha_,
                tensor_descs_[dst], ws_y, tensor_descs_[dst], dy,
                tensor_descs_[src], ws_x, &beta_, tensor_descs_[src], dx);
    }
};

} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif