/*******************************************************************************
* Copyright 2020-2021 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_NVIDIA_CUDNN_CONVOLUTION_IMPL_HPP
#define GPU_NVIDIA_CUDNN_CONVOLUTION_IMPL_HPP

#include "cudnn.h"

#include "common/c_types_map.hpp"
#include "common/convolution_pd.hpp"
#include "gpu/nvidia/cudnn_conv_filter_adjustment_base.hpp"
#include "gpu/nvidia/cudnn_convolution_pd.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_scoped_context.hpp"
#include "gpu/nvidia/sycl_cuda_stream.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {

struct cudnn_convolution_impl_base_t
    : public cudnn_conv_filter_adjustment_base_t {
protected:
    enum io { x = 0, bias, weights, y, NUM_IO };
    memory_desc_t dnnl_descs[NUM_IO];
    cudnnConvolutionDescriptor_t conv_desc;
    int padding[CUDNN_DIM_MAX];
    int dilation[CUDNN_DIM_MAX];
    cudnnTensorDescriptor_t descs[NUM_IO];
    cudnnDataType_t data_types[NUM_IO];
    int ndims[NUM_IO];
    int dims[NUM_IO][DNNL_MAX_NDIMS];
    int strides[NUM_IO + 1][DNNL_MAX_NDIMS];
    int filter_strides[DNNL_MAX_NDIMS];
    cudnnTensorFormat_t formats[NUM_IO];
    bool filter_needs_transform = false;
    cudnnFilterDescriptor_t weights_desc;
    float alpha = 0.f;
    float beta = 0.f;
    int group_count = 1;
    bool with_groups = false;
    size_t scratchpad_size = 0;
    bool with_bias = false;

    bool do_scaling = false;
    float output_scaling = 1.0f;
    cudnnDataType_t computation_data_type = CUDNN_DATA_FLOAT;
    cudnnDataType_t reorder_type = CUDNN_DATA_INT8;

public:
    virtual ~cudnn_convolution_impl_base_t() {
        CUDNN_EXECUTE_FUNC_V(cudnnDestroyFilterDescriptor, weights_desc);
        CUDNN_EXECUTE_FUNC_V(cudnnDestroyConvolutionDescriptor, conv_desc);
        for (size_t i = 0; i < io::NUM_IO; i++) {
            CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, descs[i]);
        }
    }
    virtual status_t configure_alg_kind(engine_t *, convolution_pd_t *pd) = 0;

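    // Returns true when the weights memory descriptor matches a layout that
    // can be passed to cuDNN directly: the plain "abx" (oihw-like) tags or
    // the NHWC-style [g]owi/[g]ohwi/[g]odhwi tags. Any other layout triggers
    // the filter transformation path set up in configure_parameters().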
    virtual bool supported_filter_format(const memory_desc_t *md) const {
        const memory_desc_wrapper mem_wrapper(md);

        return (mem_wrapper.matches_one_of_tag(format_tag::ab, format_tag::abc,
                        format_tag::abcd, format_tag::abcde, format_tag::abcdef)
                || (with_groups ? mem_wrapper.matches_one_of_tag(
                            format_tag::gowi, format_tag::gohwi,
                            format_tag::godhwi)
                                : mem_wrapper.matches_one_of_tag(
                                        format_tag::owi, format_tag::ohwi,
                                        format_tag::odhwi)));
    }

    bool using_transformed_filter() const { return filter_needs_transform; }
    bool with_scratchpad() const { return scratchpad_size > 0; }

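    // Common initialization: converts the oneDNN memory descriptors into
    // cuDNN descriptors, validates the output shape against what cuDNN
    // computes, picks the convolution algorithm and books the scratchpad.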
    virtual status_t init(engine_t *engine, convolution_pd_t *pd,
            bool use_scratch_dst = false) {
        CHECK(configure_parameters(pd, use_scratch_dst));
        CHECK(create_cudnn_descs(pd));
        CHECK(check_output_dims());
        CHECK(configure_alg_kind(engine, pd));
        CHECK(init_scratchpad(engine, pd));

        return status::success;
    }

    virtual status_t init_zero_dims(convolution_pd_t *pd) {
        return status::success;
    }
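    // Converts oneDNN dims/strides into the int arrays cuDNN expects.
    // 3D (1D-spatial) tensors are padded up to 4D with a unit H dimension and
    // dense strides are recomputed for the chosen NCHW/NHWC format; otherwise
    // the blocking strides of the memory descriptor are copied directly.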
    void get_dims_and_strides(int io) {
        convert_dims(
                dnnl_descs[io].dims, dims[io], dnnl_descs[io].ndims, ndims[io]);
        if (ndims[io] > dnnl_descs[io].ndims) {
            std::swap(dims[io][ndims[io] - 1], dims[io][ndims[io] - 2]);
            if (ndims[io] == 4) {
                if (formats[io] == CUDNN_TENSOR_NHWC) {
                    propagate_strides(strides[io], dims[io], {1, 3, 2, 0});
                } else {
                    propagate_strides(strides[io], dims[io], {3, 2, 1, 0});
                }
            }
        } else {
            convert_dims(dnnl_descs[io].format_desc.blocking.strides,
                    strides[io], dnnl_descs[io].ndims, ndims[io]);
        }
    }
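    // Reads the problem parameters from the primitive descriptor: padding,
    // dilation, strides, output scales, data types and tensor formats for
    // src/weights/dst/bias. If the weights layout is not directly supported
    // by cuDNN, a filter transformation to the src format is set up instead.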
    status_t configure_parameters(
            const convolution_pd_t *pd, bool use_scratch_dst) {
        if (pd->ndims() > CUDNN_DIM_MAX) { return status::invalid_arguments; }
        CHECK(set_padding_and_dilation(pd));
        with_groups = pd->with_groups();
        with_bias = pd->with_bias();
        alpha = 1.0f;
        beta = 0.0f;
        output_scaling = pd->attr()->output_scales_.scales_[0];
        do_scaling = output_scaling != 1.f;
        dnnl_descs[x] = *pd->invariant_src_md();
        dnnl_descs[weights] = *pd->invariant_wei_md();
        dnnl_descs[y] = *pd->invariant_dst_md();
        if (with_bias) dnnl_descs[bias] = *pd->invariant_bia_md();

        ndims[x] = std::max(dnnl_descs[x].ndims, 4);
        ndims[weights] = std::max(dnnl_descs[weights].ndims, 4 + with_groups);
        ndims[y] = std::max(dnnl_descs[y].ndims, 4);

        CHECK(convert_data_type(&dnnl_descs[x], &data_types[x]));
        CHECK(convert_data_type(&dnnl_descs[weights], &data_types[weights]));
        CHECK(convert_data_type(&dnnl_descs[y], &data_types[y]));

        CHECK(get_formats());
        set_compute_format();
        get_dims_and_strides(x);
        get_dims_and_strides(weights);
        get_dims_and_strides(y);

        if (!supported_filter_format(&dnnl_descs[weights])) {
            set_filter_format(
                    ndims[weights], dims[weights], strides[NUM_IO], formats[x]);
            CHECK(init_filter_transformation(data_types[weights],
                    ndims[weights], dims[weights], strides[weights],
                    strides[NUM_IO]));
            filter_needs_transform = true;
            // we transform the filter based on src format
            formats[weights] = formats[x];
        } else {
            CHECK(get_filter_format());
            get_dims_and_strides(weights);
        }
        if (with_groups) {
            dims[weights][1] *= pd->G();
            ndims[weights] = std::max(4, ndims[weights] - with_groups);
        }

        if (with_bias) {
            ndims[bias] = dnnl_descs[bias].ndims;
            CHECK(convert_data_type(&dnnl_descs[bias], &data_types[bias]));
            convert_dims(
                    dnnl_descs[bias].dims, dims[bias], ndims[bias], ndims[y]);
            std::swap(dims[bias][0], dims[bias][1]);
            convert_dims(dnnl_descs[bias].format_desc.blocking.strides,
                    strides[bias], ndims[bias], ndims[y]);
            ndims[bias] = ndims[y];
        }

        return status::success;
    }

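    // Creates the cuDNN convolution, tensor and filter descriptors from the
    // dims/strides computed in configure_parameters().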
    status_t create_cudnn_descs(const convolution_pd_t *pd) {
        CHECK(create_and_set_convolution_desc(pd));
        CHECK(create_and_set_tensor_descriptor(
                &descs[x], data_types[x], ndims[x], dims[x], strides[x]));
        CHECK(create_and_set_filter_descriptor(&weights_desc, formats[weights],
                data_types[weights], ndims[weights],
                dims[weights] + with_groups, strides[weights]));
        CHECK(create_and_set_tensor_descriptor(
                &descs[y], data_types[y], ndims[y], dims[y], strides[y]));

        if (with_bias) {
            CHECK(create_and_set_tensor_descriptor(&descs[bias],
                    data_types[bias], ndims[bias], dims[bias], strides[bias]));
        }

        return status::success;
    }

    virtual status_t init_scratchpad(engine_t *engine, convolution_pd_t *pd) {
        if (filter_needs_transform) {
            auto sz = memory_desc_wrapper(&dnnl_descs[weights]).size();
            auto data_size
                    = types::data_type_size(pd->invariant_wei_md(0)->data_type);
            pd->scratchpad_registry().registrar().book(
                    memory_tracking::names::key_conv_cudnn_filter, sz,
                    data_size);
        }
        return status::success;
    }

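    // Sets up the Nd convolution descriptor. oneDNN convolutions map to
    // cuDNN's CUDNN_CROSS_CORRELATION mode; the group count is forwarded to
    // cuDNN when grouped convolution is requested.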
    status_t create_and_set_convolution_desc(const convolution_pd_t *pd) {
        CUDNN_EXECUTE_FUNC_V(cudnnCreateConvolutionDescriptor, &conv_desc);
        CUDNN_EXECUTE_FUNC_V(cudnnSetConvolutionNdDescriptor, conv_desc,
                ndims[x] - 2, padding, filter_strides, dilation,
                cudnnConvolutionMode_t::CUDNN_CROSS_CORRELATION,
                computation_data_type);
        // Check for groups and set group count if necessary
        if (with_groups) {
            group_count = pd->G();
            if (group_count > 1)
                CHECK(CUDNN_EXECUTE_FUNC_S(
                        cudnnSetConvolutionGroupCount, conv_desc, group_count));
        }
        return status::success;
    }

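    // Converts oneDNN padding/dilation/stride queries into the int arrays
    // cuDNN expects. oneDNN reports "extra" dilation (KD* == 0 for a dense
    // filter) while cuDNN counts the filter element itself, hence the "+ 1"
    // on every dilation entry.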
    status_t set_padding_and_dilation(const convolution_pd_t *pd) {
        int actual_ndims = pd->ndims();
        if (actual_ndims == 3) {
            padding[0] = 0;
            padding[1] = static_cast<int>(pd->padL());
            dilation[0] = 1;
            dilation[1] = static_cast<int>(pd->KDW() + 1);

            filter_strides[0] = 1;
            filter_strides[1] = static_cast<int>(pd->KSW());
        } else if (actual_ndims == 4) {
            padding[0] = static_cast<int>(pd->padT());
            padding[1] = static_cast<int>(pd->padL());

            dilation[0] = static_cast<int>(pd->KDH() + 1);
            dilation[1] = static_cast<int>(pd->KDW() + 1);

            filter_strides[0] = static_cast<int>(pd->KSH());
            filter_strides[1] = static_cast<int>(pd->KSW());
        } else {
            padding[0] = static_cast<int>(pd->padFront());
            padding[1] = static_cast<int>(pd->padT());
            padding[2] = static_cast<int>(pd->padL());

            dilation[0] = static_cast<int>(pd->KDD() + 1);
            dilation[1] = static_cast<int>(pd->KDH() + 1);
            dilation[2] = static_cast<int>(pd->KDW() + 1);

            filter_strides[0] = static_cast<int>(pd->KSD());
            filter_strides[1] = static_cast<int>(pd->KSH());
            filter_strides[2] = static_cast<int>(pd->KSW());
        }
        return status::success;
    }

    virtual void execute(
            cudnnHandle_t handle, const std::vector<void *> &args) const = 0;

    void execute_sum(cudnnHandle_t handle, void *x, void *y, float alpha_,
            float beta_) const {
        float alpha = alpha_;
        float beta = beta_;
        CUDNN_EXECUTE_FUNC_V(cudnnAddTensor, handle, &alpha, descs[io::y], x,
                &beta, descs[io::y], y);
    }

    void execute_scale(cudnnHandle_t handle, void *y) const {
        if (do_scaling) {
            CUDNN_EXECUTE_FUNC_V(
                    cudnnScaleTensor, handle, descs[io::y], y, &output_scaling);
        }
    }

    void execute_set_weights_bias(
            cudnnHandle_t handle, void *weights, void *bias, float value) {
        CUDNN_EXECUTE_FUNC_V(
                cudnnSetTensor, handle, descs[io::weights], weights, &value);
        if (bias) {
            CUDNN_EXECUTE_FUNC_V(
                    cudnnSetTensor, handle, descs[io::bias], bias, &value);
        }
    }

    bool with_eltwise(const convolution_pd_t *pd, int position) const {
        return pd->attr()->post_ops_.contain(primitive_kind::eltwise, position);
    }

    status_t check_output_dims() const {
        int expected_dims[CUDNN_DIM_MAX] = {};
        CUDNN_EXECUTE_FUNC_V(cudnnGetConvolutionNdForwardOutputDim, conv_desc,
                descs[x], weights_desc, ndims[y], &expected_dims[0]);
        for (size_t i = 0; i < ndims[y]; i++) {
            if (dims[y][i] != expected_dims[i]) return status::unimplemented;
        }
        return status::success;
    }

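    // int8 convolutions accumulate in int32; everything else accumulates in
    // the destination data type.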
    void set_compute_format() {
        if (data_types[x] == CUDNN_DATA_INT8) {
            computation_data_type = CUDNN_DATA_INT32;
        } else {
            computation_data_type = data_types[y];
        }
    }

    status_t get_filter_format() {
        memory_desc_wrapper wrapper(&dnnl_descs[weights]);
        if (wrapper.matches_one_of_tag(format_tag::ab, format_tag::abc,
                    format_tag::abcd, format_tag::abcde, format_tag::abcdef)) {
            formats[weights] = cudnnTensorFormat_t::CUDNN_TENSOR_NCHW;
        } else if ((!with_groups
                           && wrapper.matches_one_of_tag(format_tag::owi,
                                   format_tag::ohwi, format_tag::odhwi))
                || (with_groups
                        && wrapper.matches_one_of_tag(format_tag::gowi,
                                format_tag::gohwi, format_tag::godhwi))) {
            formats[weights] = cudnnTensorFormat_t::CUDNN_TENSOR_NHWC;
        } else {
            return status::unimplemented;
        }

        return status::success;
    }

    status_t get_formats() {
        CHECK(get_format(&dnnl_descs[x], formats[x]));
        CHECK(get_format(&dnnl_descs[y], formats[y]));
        return status::success;
    }

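    // Stride propagation for NHWC filters. Grouped filters need different
    // permutations to keep the group dimension outermost, so this overrides
    // the non-grouped base-class version only for the with_groups case.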
    void set_filter_nhwc(int filter_ndims, int *transform_filter_strides,
            int *filter_dims) override {
        if (with_groups) {
            switch (filter_ndims) {
                case 4: // Convert to krsc
                    return propagate_strides(transform_filter_strides,
                            filter_dims, {2, 3, 1, 0});
                case 5:
                    return propagate_strides(transform_filter_strides,
                            filter_dims, {2, 4, 3, 1, 0});
                case 6:
                    return propagate_strides(transform_filter_strides,
                            filter_dims, {2, 5, 4, 3, 1, 0});
            }
        } else {
            cudnn_conv_filter_adjustment_base_t::set_filter_nhwc(
                    filter_ndims, transform_filter_strides, filter_dims);
        }
    }
};

struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
protected:
    cudnnActivationDescriptor_t activation_desc = nullptr;
    cudnnActivationDescriptor_t eltwise_desc = nullptr;
    cudnnTensorDescriptor_t reorder_dst_desc = nullptr;
    cudnnConvolutionFwdAlgo_t fwd_alg_kind;
    std::vector<cudnnConvolutionFwdAlgoPerf_t> perf;
    int requested_algo_count = 0;
    int returned_algo_count = 0;
    int num_post_ops = 0;
    primitive_kind_t post_ops[2];
    bool need_reorder = false;
    bool use_temp_dst = false;
    float sum_scale = 1.0f;

public:
    virtual ~cudnn_convolution_impl_fwd_t() {
        if (activation_desc)
            CUDNN_EXECUTE_FUNC_V(
                    cudnnDestroyActivationDescriptor, activation_desc);
        if (eltwise_desc)
            CUDNN_EXECUTE_FUNC_V(
                    cudnnDestroyActivationDescriptor, eltwise_desc);
        if (reorder_dst_desc)
            CUDNN_EXECUTE_FUNC_V(
                    cudnnDestroyTensorDescriptor, reorder_dst_desc);
    }

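    // Records the post-op chain (at most two entries: sum and/or eltwise).
    // int8 destinations are promoted to f32 for the post-op computation and
    // reordered back to int8 afterwards, which requires an extra dst
    // descriptor of the original (int8) type.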
    status_t configure_post_ops(convolution_pd_t *pd) {
        auto &p = pd->attr()->post_ops_;
        num_post_ops = p.len();
        if (data_types[y] == CUDNN_DATA_INT8 && p.len() > 0) {
            data_types[y] = CUDNN_DATA_FLOAT;
            need_reorder = true;
        }
        for (size_t i = 0; i < p.len(); i++) {
            post_ops[i] = p.entry_[i].kind;
            if (post_ops[i] == dnnl_eltwise) {
                CHECK(create_and_set_eltwise_descriptor(pd));
            }
            if (post_ops[i] == dnnl_sum) { sum_scale = p.entry_[i].sum.scale; }
        }

        if (need_reorder)
            CHECK(create_and_set_tensor_descriptor_ex(&reorder_dst_desc,
                    formats[y], reorder_type, ndims[y], dims[y]));

        return status::success;
    }

    status_t init(engine_t *engine, convolution_pd_t *pd,
            bool use_scratch_dst) override {
        use_temp_dst = use_scratch_dst;
        CHECK(configure_parameters(pd, use_temp_dst));
        CHECK(configure_post_ops(pd));
        CHECK(create_cudnn_descs(pd));
        CHECK(configure_alg_kind(engine, pd));
        CHECK(init_scratchpad(engine, pd));

        return status::success;
    }

    void execute_reorder(cudnnHandle_t handle, void *src, void *dst,
            bool flip_formats) const {
        const float alpha = 1.0f;
        const float beta = 0.0f;
        if (flip_formats) {
            CUDNN_EXECUTE_FUNC_V(cudnnTransformTensor, handle, &alpha,
                    reorder_dst_desc, src, &beta, descs[y], dst);
        } else {
            CUDNN_EXECUTE_FUNC_V(cudnnTransformTensor, handle, &alpha, descs[y],
                    src, &beta, reorder_dst_desc, dst);
        }
    }

    void execute_eltwise(cudnnHandle_t handle, void *src, void *dst) const {
        float alpha = 1.0f;
        float beta = 0.0f;
        CUDNN_EXECUTE_FUNC_V(cudnnActivationForward, handle, eltwise_desc,
                &alpha, descs[io::y], src, &beta, descs[io::y], dst);
    }

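    // Argument layout (filled in by the convolution primitive): args[0] = src,
    // args[1] = weights, args[2] = dst, args[3] = bias, args[4] = algorithm
    // scratchpad, args[5] = filter-transform scratchpad, args[6] = post-op
    // scratch buffer, args[7] = post-op reorder buffer. For int8 with bias
    // the fused cudnnConvolutionBiasActivationForward path is used; otherwise
    // the convolution, bias add, scaling and post-ops run as separate calls.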
    void execute(cudnnHandle_t handle,
            const std::vector<void *> &args) const override {
        auto x = args[0], weights = args[1], y = args[2], bias = args[3],
             scratchpad = args[4], post_op_scratch = args[6],
             post_op_reorder = args[7];
        void *output = use_temp_dst ? post_op_scratch : y;
        if (using_transformed_filter()) {
            auto w_scratch = args[5];
            transform_filter(handle, weights, w_scratch);
            weights = w_scratch;
        }
        if (computation_data_type == CUDNN_DATA_INT32 && bias) {
            CUDNN_EXECUTE_FUNC_V(cudnnConvolutionBiasActivationForward, handle,
                    &alpha, descs[io::x], x, weights_desc, weights, conv_desc,
                    fwd_alg_kind, scratchpad, scratchpad_size, &beta,
                    descs[io::y], output, descs[io::bias], bias,
                    activation_desc, descs[io::y], output);
        } else {
            const float bias_alpha = 1.0f;
            const float bias_beta = 1.0f;
            CUDNN_EXECUTE_FUNC_V(cudnnConvolutionForward, handle, &alpha,
                    descs[io::x], x, weights_desc, weights, conv_desc,
                    fwd_alg_kind, scratchpad, scratchpad_size, &beta,
                    descs[io::y], output);
            if (with_bias) {
                CUDNN_EXECUTE_FUNC_V(cudnnAddTensor, handle, &bias_alpha,
                        descs[io::bias], bias, &bias_beta, descs[io::y],
                        output);
            }
        }
        execute_scale(handle, output);
        for (int i = 0; i < num_post_ops; i++) {
            bool last_op = i == num_post_ops - 1 && !need_reorder;
            if (last_op) output = y;
            switch (post_ops[i]) {
                case dnnl_sum:
                    if (need_reorder) {
                        execute_reorder(handle, y, post_op_reorder, true);
                        execute_sum(handle, post_op_reorder, post_op_scratch,
                                sum_scale, 1.0f);
                    } else if (last_op) {
                        execute_sum(
                                handle, post_op_scratch, y, 1.0f, sum_scale);
                    } else {
                        execute_sum(
                                handle, y, post_op_scratch, sum_scale, 1.0f);
                    }
                    break;

                case dnnl_eltwise:
                    execute_eltwise(handle, post_op_scratch, output);
                    break;
            }
        }

        if (need_reorder) {
            execute_reorder(handle, post_op_scratch, y, false);
        }
    }
    status_t init_scratchpad(engine_t *engine, convolution_pd_t *pd) override {
        auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(engine);
        stream_t *service_stream;
        CHECK(sycl_engine.get_service_stream(service_stream));

        auto cuda_stream
                = utils::downcast<sycl_cuda_stream_t *>(service_stream);
        auto handle = cuda_stream->get_cudnn_handle();

        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnGetConvolutionForwardWorkspaceSize,
                handle, descs[x], weights_desc, conv_desc, descs[y],
                fwd_alg_kind, &scratchpad_size));
        if (scratchpad_size > 0)
            pd->scratchpad_registry().registrar().book(
                    memory_tracking::names::key_conv_cudnn_algo,
                    scratchpad_size, size_t(1));

        return cudnn_convolution_impl_base_t::init_scratchpad(engine, pd);
    }
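    // Queries cuDNN for candidate forward algorithms and picks the first
    // successful one that matches the requested oneDNN alg_kind
    // (direct/winograd, or auto). int8 problems are restricted to
    // IMPLICIT_PRECOMP_GEMM; see the comment inside the loop.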
    status_t configure_alg_kind(
            engine_t *engine, convolution_pd_t *pd) override {
        auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(engine);
        cuda_sycl_scoped_context_handler_t sc(sycl_engine);
        stream_t *service_stream;
        CHECK(sycl_engine.get_service_stream(service_stream));

        auto cuda_stream
                = utils::downcast<sycl_cuda_stream_t *>(service_stream);
        auto handle = cuda_stream->get_cudnn_handle();

        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnGetConvolutionForwardAlgorithmMaxCount,
                handle, &requested_algo_count));
        perf.resize(requested_algo_count);
        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnFindConvolutionForwardAlgorithm, handle,
                descs[x], weights_desc, conv_desc, descs[y],
                requested_algo_count, &returned_algo_count, perf.data()));

        auto submit_status = CUDNN_STATUS_NOT_SUPPORTED;
        for (size_t i = 0; i < returned_algo_count; i++) {
            submit_status = perf[i].status;
            if (submit_status == CUDNN_STATUS_SUCCESS) {
                // cudnnFindConvolutionForwardAlgorithm can erroneously report
                // algorithms for int8 which do not work, so ensure that we
                // only allow CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
                // in this case.
                if (computation_data_type == CUDNN_DATA_INT32
                        && perf[i].algo
                                != CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) {
                    continue;
                }
                switch (pd->desc()->alg_kind) {
                    case dnnl_convolution_auto:
                        if (utils::one_of(perf[i].algo,
                                    CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
                                    CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
                                    CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM)) {
                            utils::downcast<cudnn_convolution_fwd_pd_t *>(pd)
                                    ->set_alg_kind(dnnl_convolution_direct);
                        } else {
                            utils::downcast<cudnn_convolution_fwd_pd_t *>(pd)
                                    ->set_alg_kind(dnnl_convolution_winograd);
                        }
                        break;
                    case dnnl_convolution_direct:
                        if (!utils::one_of(perf[i].algo,
                                    CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
                                    CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
                                    CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM))
                            continue;
                        break;
                    case dnnl_convolution_winograd:
                        if (!utils::one_of(perf[i].algo,
                                    CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
                                    CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED))
                            continue;
                        break;
                    default: return status::unimplemented;
                }
                fwd_alg_kind = perf[i].algo;
                CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetConvolutionMathType,
                        conv_desc, perf[i].mathType));
                break;
            }
        }

        if (submit_status != CUDNN_STATUS_SUCCESS) return status::unimplemented;

        if (fwd_alg_kind == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) {
            CHECK(CUDNN_EXECUTE_FUNC_S(
                    cudnnCreateActivationDescriptor, &activation_desc));
            CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetActivationDescriptor,
                    activation_desc,
                    cudnnActivationMode_t::CUDNN_ACTIVATION_IDENTITY,
                    CUDNN_NOT_PROPAGATE_NAN, 1.0));
        }

        return status::success;
    }

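    // Maps the oneDNN eltwise post-op onto a cuDNN activation descriptor.
    // Only activations with a direct cuDNN equivalent (tanh, elu, relu,
    // logistic, bounded_relu) are supported.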
    status_t create_and_set_eltwise_descriptor(const convolution_pd_t *pd) {

        CHECK(CUDNN_EXECUTE_FUNC_S(
                cudnnCreateActivationDescriptor, &eltwise_desc));

        cudnnActivationMode_t act_mode;
        switch (eltwise_algorithm_kind(pd)) {
            case alg_kind::eltwise_tanh:
                act_mode = CUDNN_ACTIVATION_TANH;
                break;
            case alg_kind::eltwise_elu: act_mode = CUDNN_ACTIVATION_ELU; break;
            case alg_kind::eltwise_relu:
                act_mode = CUDNN_ACTIVATION_RELU;
                break;
            case alg_kind::eltwise_logistic:
                act_mode = CUDNN_ACTIVATION_SIGMOID;
                break;
            case alg_kind::eltwise_bounded_relu:
                act_mode = CUDNN_ACTIVATION_CLIPPED_RELU;
                break;
            default: return status::unimplemented;
        }
        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetActivationDescriptor, eltwise_desc,
                act_mode, cudnnNanPropagation_t::CUDNN_NOT_PROPAGATE_NAN,
                eltwise_alpha(pd)));

        return status::success;
    }

    dnnl::impl::alg_kind_t eltwise_algorithm_kind(
            const convolution_pd_t *pd) const {
        const int eltwise_idx
                = pd->attr()->post_ops_.find(primitive_kind::eltwise);
        return pd->attr()->post_ops_.entry_[eltwise_idx].eltwise.alg;
    }

    float eltwise_alpha(const convolution_pd_t *pd) const {
        const int eltwise_idx
                = pd->attr()->post_ops_.find(primitive_kind::eltwise);
        return pd->attr()->post_ops_.entry_[eltwise_idx].eltwise.alpha;
    }
};

struct cudnn_convolution_impl_bwd_data_t
    : public cudnn_convolution_impl_base_t {
protected:
    cudnnConvolutionBwdDataAlgo_t bwd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
    std::vector<cudnnConvolutionBwdDataAlgoPerf_t> perf;
    int requested_algo_count = 0;
    int returned_algo_count = 0;
    status_t configure_alg_kind(
            engine_t *engine, convolution_pd_t *pd) override {
        auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(engine);
        cuda_sycl_scoped_context_handler_t sc(sycl_engine);
        stream_t *service_stream;
        CHECK(sycl_engine.get_service_stream(service_stream));

        auto cuda_stream
                = utils::downcast<sycl_cuda_stream_t *>(service_stream);
        auto handle = cuda_stream->get_cudnn_handle();

        CHECK(CUDNN_EXECUTE_FUNC_S(
                cudnnGetConvolutionBackwardDataAlgorithmMaxCount, handle,
                &requested_algo_count));
        perf.resize(requested_algo_count);
        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnFindConvolutionBackwardDataAlgorithm,
                handle, weights_desc, descs[y], conv_desc, descs[x],
                requested_algo_count, &returned_algo_count, perf.data()));
        for (size_t i = 0; i < returned_algo_count; i++) {
            if (perf[i].status == CUDNN_STATUS_SUCCESS) {
                switch (pd->desc()->alg_kind) {
                    case dnnl_convolution_auto:
                        if (utils::one_of(perf[i].algo,
                                    CUDNN_CONVOLUTION_BWD_DATA_ALGO_0,
                                    CUDNN_CONVOLUTION_BWD_DATA_ALGO_1)) {
                            utils::downcast<cudnn_convolution_bwd_data_pd_t *>(
                                    pd)
                                    ->set_alg_kind(dnnl_convolution_direct);
                        } else {
                            utils::downcast<cudnn_convolution_bwd_data_pd_t *>(
                                    pd)
                                    ->set_alg_kind(dnnl_convolution_winograd);
                        }
                        break;
                    case dnnl_convolution_direct:
                        if (!utils::one_of(perf[i].algo,
                                    CUDNN_CONVOLUTION_BWD_DATA_ALGO_0,
                                    CUDNN_CONVOLUTION_BWD_DATA_ALGO_1))
                            continue;
                        break;
                    case dnnl_convolution_winograd:
                        if (!utils::one_of(perf[i].algo,
                                    CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD,
                                    CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED))
                            continue;
                        break;
                    default: return status::unimplemented;
                }
                bwd_algo = perf[i].algo;
                CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetConvolutionMathType,
                        conv_desc, perf[i].mathType));
                break;
            } else {
                return status::unimplemented;
            }
        }

        return status::success;
    }

    status_t init_scratchpad(engine_t *engine, convolution_pd_t *pd) override {
        auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(engine);
        stream_t *service_stream;
        CHECK(sycl_engine.get_service_stream(service_stream));

        auto cuda_stream
                = utils::downcast<sycl_cuda_stream_t *>(service_stream);
        auto handle = cuda_stream->get_cudnn_handle();

        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnGetConvolutionBackwardDataWorkspaceSize,
                handle, weights_desc, descs[io::y], conv_desc, descs[io::x],
                bwd_algo, &scratchpad_size));
        if (scratchpad_size > 0)
            pd->scratchpad_registry().registrar().book(
                    memory_tracking::names::key_conv_cudnn_algo,
                    scratchpad_size, size_t(1));

        return cudnn_convolution_impl_base_t::init_scratchpad(engine, pd);
    }

    void execute(cudnnHandle_t handle,
            const std::vector<void *> &args) const override {
        auto x = args[0], weights = args[1], y = args[2], bias = args[3],
             scratchpad = args[4];
        if (using_transformed_filter()) {
            auto w_scratch = args[5];
            transform_filter(handle, weights, w_scratch);
            weights = w_scratch;
        }
        const float bias_alpha = 1.0f;
        const float bias_beta = 1.0f;
        CUDNN_EXECUTE_FUNC_V(cudnnConvolutionBackwardData, handle, &alpha,
                weights_desc, weights, descs[io::y], y, conv_desc, bwd_algo,
                scratchpad, scratchpad_size, &beta, descs[io::x], x);
        if (with_bias) {
            CUDNN_EXECUTE_FUNC_V(cudnnAddTensor, handle, &bias_alpha,
                    descs[io::bias], bias, &bias_beta, descs[io::x], x);
        }
    }
};

struct cudnn_convolution_impl_bwd_weights_t
    : public cudnn_convolution_impl_base_t {
protected:
    cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo
            = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
    std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> perf;
    int requested_algo_count = 0;
    int returned_algo_count = 0;

public:
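    // Minimal setup used when the problem contains zero-sized dimensions:
    // only the weights (and bias) tensor descriptors are created so that the
    // outputs can be zero-filled (see execute_set_weights_bias()); no
    // convolution descriptor or algorithm selection is needed.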
    status_t init_zero_dims(convolution_pd_t *pd) override {
        if (pd->ndims() > CUDNN_DIM_MAX) { return status::invalid_arguments; }
        dnnl_descs[weights] = *pd->invariant_wei_md();
        CHECK(get_format(&dnnl_descs[weights], formats[weights], true));
        ndims[y] = pd->invariant_dst_md()->ndims;
        ndims[weights] = dnnl_descs[weights].ndims - pd->with_groups();
        CHECK(convert_data_type(&dnnl_descs[weights], &data_types[weights]));
        convert_dims(dnnl_descs[weights].dims + pd->with_groups(),
                dims[weights], ndims[weights]);
        ndims[weights] = std::max(4, ndims[weights]);
        convert_dims(dnnl_descs[weights].format_desc.blocking.strides,
                strides[weights], ndims[weights]);
        CHECK(create_and_set_tensor_descriptor(&descs[weights],
                data_types[weights], ndims[weights], dims[weights],
                strides[weights]));

        if (pd->with_bias()) {
            dnnl_descs[bias] = *pd->invariant_bia_md();
            ndims[bias] = dnnl_descs[bias].ndims;
            CHECK(convert_data_type(&dnnl_descs[bias], &data_types[bias]));
            convert_dims(dnnl_descs[bias].padded_dims, dims[bias], ndims[bias],
                    ndims[y]);
            std::swap(dims[bias][0], dims[bias][1]);
            convert_dims(dnnl_descs[bias].format_desc.blocking.strides,
                    strides[bias], ndims[bias], ndims[weights]);
            ndims[bias] = ndims[y];
            CHECK(create_and_set_tensor_descriptor(&descs[bias],
                    data_types[bias], ndims[bias], dims[bias], strides[bias]));
        }
        return status::success;
    }
    status_t configure_alg_kind(
            engine_t *engine, convolution_pd_t *pd) override {
        auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(engine);
        cuda_sycl_scoped_context_handler_t sc(sycl_engine);
        stream_t *service_stream;
        CHECK(sycl_engine.get_service_stream(service_stream));

        auto cuda_stream
                = utils::downcast<sycl_cuda_stream_t *>(service_stream);
        auto handle = cuda_stream->get_cudnn_handle();

        CHECK(CUDNN_EXECUTE_FUNC_S(
                cudnnGetConvolutionBackwardFilterAlgorithmMaxCount, handle,
                &requested_algo_count));
        perf.resize(requested_algo_count);
        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnFindConvolutionBackwardFilterAlgorithm,
                handle, descs[x], descs[y], conv_desc, weights_desc,
                requested_algo_count, &returned_algo_count, perf.data()));
        for (size_t i = 0; i < returned_algo_count; i++) {
            if (perf[i].status == CUDNN_STATUS_SUCCESS) {
                switch (pd->desc()->alg_kind) {
                    case dnnl_convolution_auto:
                        if (utils::one_of(perf[i].algo,
                                    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0,
                                    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1,
                                    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3)) {
                            utils::downcast<
                                    cudnn_convolution_bwd_weights_pd_t *>(pd)
                                    ->set_alg_kind(dnnl_convolution_direct);
                        } else {
                            utils::downcast<
                                    cudnn_convolution_bwd_weights_pd_t *>(pd)
                                    ->set_alg_kind(dnnl_convolution_winograd);
                        }
                        break;
                    case dnnl_convolution_direct:
                        if (!utils::one_of(perf[i].algo,
                                    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0,
                                    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1,
                                    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3))
                            continue;
                        break;
                    case dnnl_convolution_winograd:
                        if (!utils::one_of(perf[i].algo,
                                    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD,
                                    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED))
                            continue;
                        break;
                    default: return status::unimplemented;
                }
                bwd_filter_algo = perf[i].algo;
                CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetConvolutionMathType,
                        conv_desc, perf[i].mathType));
                break;
            } else {
                return status::unimplemented;
            }
        }

        return status::success;
    }

    status_t init_scratchpad(engine_t *engine, convolution_pd_t *pd) override {
        auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(engine);
        stream_t *service_stream;
        CHECK(sycl_engine.get_service_stream(service_stream));

        auto cuda_stream
                = utils::downcast<sycl_cuda_stream_t *>(service_stream);
        auto handle = cuda_stream->get_cudnn_handle();

        CHECK(CUDNN_EXECUTE_FUNC_S(
                cudnnGetConvolutionBackwardFilterWorkspaceSize, handle,
                descs[io::x], descs[io::y], conv_desc, weights_desc,
                bwd_filter_algo, &scratchpad_size));
        if (scratchpad_size > 0)
            pd->scratchpad_registry().registrar().book(
                    memory_tracking::names::key_conv_cudnn_algo,
                    scratchpad_size, size_t(1));

        return cudnn_convolution_impl_base_t::init_scratchpad(engine, pd);
    }

    void execute(cudnnHandle_t handle,
            const std::vector<void *> &args) const override {
        auto x = args[0], weights = args[1], y = args[2], bias = args[3],
             scratchpad = args[4];
        auto filter = weights;
        if (using_transformed_filter()) {
            auto w_scratch = args[5];
            transform_filter(handle, weights, w_scratch);
            filter = w_scratch;
        }
        const float bias_alpha = 1.0f;
        const float bias_beta = 0.0f;
        CUDNN_EXECUTE_FUNC_V(cudnnConvolutionBackwardFilter, handle, &alpha,
                descs[io::x], x, descs[io::y], y, conv_desc, bwd_filter_algo,
                scratchpad, scratchpad_size, &beta, weights_desc, filter);
        if (with_bias) {
            CUDNN_EXECUTE_FUNC_V(cudnnConvolutionBackwardBias, handle,
                    &bias_alpha, descs[io::y], y, &bias_beta, descs[io::bias],
                    bias);
        }
        if (using_transformed_filter()) {
            undo_transform_filter(handle, filter, weights);
        }
    }
};

} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif