/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_NVIDIA_CUDNN_POOLING_IMPL_HPP
#define GPU_NVIDIA_CUDNN_POOLING_IMPL_HPP

#include <cudnn.h>

#include "gpu/nvidia/sycl_cuda_utils.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {

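// Base class shared by the forward and backward pooling implementations.
// It translates oneDNN pooling parameters into their cuDNN equivalents,
// owns the cuDNN tensor and pooling descriptors, and releases them in the
// destructor.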
struct cudnn_pooling_impl_base_t {
    virtual status_t init(const pooling_pd_t *pd) = 0;

    virtual ~cudnn_pooling_impl_base_t() {
        for (size_t i = 0; i < NUM_IO; ++i) {
            if (tensor_descs_[i]) {
                CUDNN_EXECUTE_FUNC_V(
                        cudnnDestroyTensorDescriptor, tensor_descs_[i]);
            }
        }

        if (pool_desc_) {
            CUDNN_EXECUTE_FUNC_V(cudnnDestroyPoolingDescriptor, pool_desc_);
        }
    }

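    // x/y are the source and destination (or diff_src/diff_dst) buffers;
    // ws_x/ws_y point into the workspace holding the copies of the forward
    // source and destination tensors needed by the backward pass.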
    virtual void execute(cudnnHandle_t handle, void *x, void *y, void *ws_x,
            void *ws_y) const = 0;

protected:
    status_t init_common(const pooling_pd_t *pd) {
        ndims_ = std::max(4, pd->ndims());
        kernel_ndims_ = ndims_ - 2;

        // Only 1D, 2D and 3D pooling is supported by cuDNN
        if (kernel_ndims_ > 3) { return status::unimplemented; }

        // cuDNN requires symmetric padding. Configurations where the padding
        // at the beginning of a dimension is larger than the padding at its
        // end still work as expected, but when the padding at the end is
        // larger, the data ends up rearranged incorrectly and the results
        // are wrong. This affects the "average include padding" algorithm,
        // so such configurations are reported as unimplemented.
        if (pd->desc()->alg_kind == alg_kind::pooling_avg_include_padding
                && (pd->padL() < pd->padR() || pd->padT() < pd->padB()
                        || pd->padFront() < pd->padBack())) {
            return status::unimplemented;
        }

        is_training_ = pd->desc()->prop_kind == prop_kind::forward_training;
        bool is_fwd = pd->is_fwd();
        auto src_md = is_fwd ? pd->src_md() : pd->diff_src_md();
        auto dst_md = is_fwd ? pd->dst_md() : pd->diff_dst_md();

        if (has_zero_dims(src_md->dims, pd->ndims())
                || has_zero_dims(dst_md->dims, pd->ndims())) {
            return status::success;
        }

        if (is_training_) {
            auto src_wrap = memory_desc_wrapper(src_md);
            auto dst_wrap = memory_desc_wrapper(dst_md);
            x_size_bytes_ = src_wrap.size();
            y_size_bytes_ = dst_wrap.size();
        }

        convert_dims(src_md->padded_dims, dims_[src], pd->ndims());
        convert_dims(dst_md->padded_dims, dims_[dst], pd->ndims());

        convert_dims(src_md->format_desc.blocking.strides, strides_[src],
                pd->ndims());
        convert_dims(dst_md->format_desc.blocking.strides, strides_[dst],
                pd->ndims());

        convert_dims(pd->desc()->kernel, kernel_dims_, kernel_ndims_);

        // If 1D pooling
        if (pd->ndims() == 3) {
            // Convert to [n, c, 1, w] since the current format is
            // [n, c, w, 1]
            dims_[src][3] = dims_[src][2];
            dims_[src][2] = 1;

            dims_[dst][3] = dims_[dst][2];
            dims_[dst][2] = 1;

            // Set kernel dimensions to [1, kw]
            kernel_dims_[1] = kernel_dims_[0];
            kernel_dims_[0] = 1;
        }

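        // cuDNN takes spatial-only padding and stride arrays: two entries
        // (H, W) for 2D pooling and three entries (D, H, W) for 3D pooling.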
        if (ndims_ == 4) {
            kernel_padding_[0] = static_cast<int>(pd->padT());
            kernel_padding_[1] = static_cast<int>(pd->padL());

            kernel_strides_[0] = static_cast<int>(pd->KSH());
            kernel_strides_[1] = static_cast<int>(pd->KSW());
        } else {
            kernel_padding_[0] = static_cast<int>(pd->padFront());
            kernel_padding_[1] = static_cast<int>(pd->padT());
            kernel_padding_[2] = static_cast<int>(pd->padL());

            kernel_strides_[0] = static_cast<int>(pd->KSD());
            kernel_strides_[1] = static_cast<int>(pd->KSH());
            kernel_strides_[2] = static_cast<int>(pd->KSW());
        }

        CHECK(convert_data_type(src_md, &data_types_[src]));
        CHECK(convert_data_type(dst_md, &data_types_[dst]));

        CHECK(convert_alg_kind(pd->desc()->alg_kind, &pool_mode_));

        cudnnTensorFormat_t src_format, dst_format;
        CHECK(get_format(src_md, src_format));
        CHECK(get_format(dst_md, dst_format));

        CHECK(create_and_set_tensor_descriptor_ex(&tensor_descs_[src],
                src_format, data_types_[src], ndims_, dims_[src]));
        CHECK(create_and_set_tensor_descriptor_ex(&tensor_descs_[dst],
                dst_format, data_types_[dst], ndims_, dims_[dst]));

        CHECK(create_and_set_pooling_descriptor(pd));

        return status::success;
    }

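    // Builds the cuDNN pooling descriptor from the kernel, padding and
    // stride parameters collected in init_common().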
    status_t create_and_set_pooling_descriptor(const pooling_pd_t *pd) {
        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreatePoolingDescriptor, &pool_desc_));

        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetPoolingNdDescriptor, pool_desc_,
                pool_mode_, CUDNN_PROPAGATE_NAN, kernel_ndims_, kernel_dims_,
                kernel_padding_, kernel_strides_));

        return status::success;
    }

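    // Maps the oneDNN pooling algorithm kind to its cuDNN counterpart.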
    status_t convert_alg_kind(
            alg_kind_t alg_kind, cudnnPoolingMode_t *cudnn_alg_kind) const {
        switch (alg_kind) {
            case alg_kind::pooling_max:
                *cudnn_alg_kind = CUDNN_POOLING_MAX;
                break;
            case alg_kind::pooling_avg_include_padding:
                *cudnn_alg_kind = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
                break;
            case alg_kind::pooling_avg_exclude_padding:
                *cudnn_alg_kind = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
                break;
            default: return status::unimplemented;
        }

        return status::success;
    }

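    // cuDNN descriptors and converted parameters shared by the forward and
    // backward implementations.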
    enum io { src = 0, dst, NUM_IO };
    cudnnDataType_t data_types_[NUM_IO];
    cudnnTensorDescriptor_t tensor_descs_[NUM_IO] = {};
    cudnnPoolingDescriptor_t pool_desc_ = nullptr;
    cudnnPoolingMode_t pool_mode_ = CUDNN_POOLING_MAX;
    int dims_[NUM_IO][DNNL_MAX_NDIMS];
    int strides_[NUM_IO][DNNL_MAX_NDIMS];
    int kernel_dims_[DNNL_MAX_NDIMS];
    int kernel_padding_[DNNL_MAX_NDIMS];
    int kernel_strides_[DNNL_MAX_NDIMS];
    const float alpha_ = 1.f, beta_ = 0.f;
    int ndims_, kernel_ndims_;
    bool is_training_ = false;
    std::size_t x_size_bytes_ = 0, y_size_bytes_ = 0;
};

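// Forward pooling: runs cudnnPoolingForward and, in training mode, stores
// copies of the source and destination tensors in the workspace for use by
// the backward pass.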
struct cudnn_pooling_fwd_impl_t : public cudnn_pooling_impl_base_t {
    status_t init(const pooling_pd_t *pd) override {
        return cudnn_pooling_impl_base_t::init_common(pd);
    }

    void execute(cudnnHandle_t handle, void *x, void *y, void *ws_x,
            void *ws_y) const override {

        CUDNN_EXECUTE_FUNC(cudnnPoolingForward, handle, pool_desc_, &alpha_,
                tensor_descs_[src], x, &beta_, tensor_descs_[dst], y);

        if (is_training_) {
            // Copy x and y into workspace so that they can be used
            // in the backward pass
            CUDNN_EXECUTE_FUNC(cudnnAddTensor, handle, &alpha_,
                    tensor_descs_[src], x, &beta_, tensor_descs_[src], ws_x);
            CUDNN_EXECUTE_FUNC(cudnnAddTensor, handle, &alpha_,
                    tensor_descs_[dst], y, &beta_, tensor_descs_[dst], ws_y);
        }
    }
};

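// Backward pooling: cudnnPoolingBackward requires the forward source (ws_x)
// and destination (ws_y) tensors, which were saved in the workspace during
// the forward pass.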
struct cudnn_pooling_bwd_impl_t : public cudnn_pooling_impl_base_t {
    status_t init(const pooling_pd_t *pd) override {
        return cudnn_pooling_impl_base_t::init_common(pd);
    }

    void execute(cudnnHandle_t handle, void *dx, void *dy, void *ws_x,
            void *ws_y) const override {

        CUDNN_EXECUTE_FUNC(cudnnPoolingBackward, handle, pool_desc_, &alpha_,
                tensor_descs_[dst], ws_y, tensor_descs_[dst], dy,
                tensor_descs_[src], ws_x, &beta_, tensor_descs_[src], dx);
    }
};

} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif