/*******************************************************************************
* Copyright 2020-2021 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_NVIDIA_CUDNN_CONVOLUTION_IMPL_HPP
#define GPU_NVIDIA_CUDNN_CONVOLUTION_IMPL_HPP

#include "cudnn.h"

#include "common/c_types_map.hpp"
#include "common/convolution_pd.hpp"
#include "gpu/nvidia/cudnn_conv_filter_adjustment_base.hpp"
#include "gpu/nvidia/cudnn_convolution_pd.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_scoped_context.hpp"
#include "gpu/nvidia/sycl_cuda_stream.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {

struct cudnn_convolution_impl_base_t
    : public cudnn_conv_filter_adjustment_base_t {
protected:
    enum io { x = 0, bias, weights, y, NUM_IO };
    memory_desc_t dnnl_descs[NUM_IO];
    cudnnConvolutionDescriptor_t conv_desc;
    int padding[CUDNN_DIM_MAX];
    int dilation[CUDNN_DIM_MAX];
    cudnnTensorDescriptor_t descs[NUM_IO];
    cudnnDataType_t data_types[NUM_IO];
    int ndims[NUM_IO];
    int dims[NUM_IO][DNNL_MAX_NDIMS];
    int strides[NUM_IO + 1][DNNL_MAX_NDIMS];
    int filter_strides[DNNL_MAX_NDIMS];
    cudnnTensorFormat_t formats[NUM_IO];
    bool filter_needs_transform = false;
    cudnnFilterDescriptor_t weights_desc;
    float alpha = 0.f;
    float beta = 0.f;
    int group_count = 1;
    bool with_groups = false;
    size_t scratchpad_size = 0;
    bool with_bias = false;

    bool do_scaling = false;
    float output_scaling = 1.0f;
    cudnnDataType_t computation_data_type = CUDNN_DATA_FLOAT;
    cudnnDataType_t reorder_type = CUDNN_DATA_INT8;

public:
    virtual ~cudnn_convolution_impl_base_t() {
        CUDNN_EXECUTE_FUNC_V(cudnnDestroyFilterDescriptor, weights_desc);
        CUDNN_EXECUTE_FUNC_V(cudnnDestroyConvolutionDescriptor, conv_desc);
        for (size_t i = 0; i < io::NUM_IO; i++) {
            CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, descs[i]);
        }
    }

    virtual status_t configure_alg_kind(engine_t *, convolution_pd_t *pd) = 0;

    virtual bool supported_filter_format(const memory_desc_t *md) const {
        const memory_desc_wrapper mem_wrapper(md);

        return (mem_wrapper.matches_one_of_tag(format_tag::ab, format_tag::abc,
                        format_tag::abcd, format_tag::abcde,
                        format_tag::abcdef)
                || (with_groups ? mem_wrapper.matches_one_of_tag(
                            format_tag::gowi, format_tag::gohwi,
                            format_tag::godhwi)
                                : mem_wrapper.matches_one_of_tag(
                                        format_tag::owi, format_tag::ohwi,
                                        format_tag::odhwi)));
    }

    bool using_transformed_filter() const { return filter_needs_transform; }
    bool with_scratchpad() const { return scratchpad_size > 0; }

    virtual status_t init(engine_t *engine, convolution_pd_t *pd,
            bool use_scratch_dst = false) {
        CHECK(configure_parameters(pd, use_scratch_dst));
        CHECK(create_cudnn_descs(pd));
        CHECK(check_output_dims());
        CHECK(configure_alg_kind(engine, pd));
        CHECK(init_scratchpad(engine, pd));

        return status::success;
    }

    virtual status_t init_zero_dims(convolution_pd_t *pd) {
        return status::success;
    }

    void get_dims_and_strides(int io) {
        convert_dims(
                dnnl_descs[io].dims, dims[io], dnnl_descs[io].ndims, ndims[io]);
        if (ndims[io] > dnnl_descs[io].ndims) {
            std::swap(dims[io][ndims[io] - 1], dims[io][ndims[io] - 2]);
            if (ndims[io] == 4) {
                if (formats[io] == CUDNN_TENSOR_NHWC) {
                    propagate_strides(strides[io], dims[io], {1, 3, 2, 0});
                } else {
                    propagate_strides(strides[io], dims[io], {3, 2, 1, 0});
                }
            }
        } else {
            convert_dims(dnnl_descs[io].format_desc.blocking.strides,
                    strides[io], dnnl_descs[io].ndims, ndims[io]);
        }
    }

    status_t configure_parameters(
            const convolution_pd_t *pd, bool use_scratch_dst) {
        if (pd->ndims() > CUDNN_DIM_MAX) { return status::invalid_arguments; }
        CHECK(set_padding_and_dilation(pd));
        with_groups = pd->with_groups();
        with_bias = pd->with_bias();
        alpha = 1.0f;
        beta = 0.0f;
        output_scaling = pd->attr()->output_scales_.scales_[0];
        do_scaling = output_scaling != 1.f;
        dnnl_descs[x] = *pd->invariant_src_md();
        dnnl_descs[weights] = *pd->invariant_wei_md();
        dnnl_descs[y] = *pd->invariant_dst_md();
        if (with_bias) dnnl_descs[bias] = *pd->invariant_bia_md();

        ndims[x] = std::max(dnnl_descs[x].ndims, 4);
        ndims[weights] = std::max(dnnl_descs[weights].ndims, 4 + with_groups);
        ndims[y] = std::max(dnnl_descs[y].ndims, 4);

        CHECK(convert_data_type(&dnnl_descs[x], &data_types[x]));
        CHECK(convert_data_type(&dnnl_descs[weights], &data_types[weights]));
        CHECK(convert_data_type(&dnnl_descs[y], &data_types[y]));

        CHECK(get_formats());
        set_compute_format();
        get_dims_and_strides(x);
        get_dims_and_strides(weights);
        get_dims_and_strides(y);

        if (!supported_filter_format(&dnnl_descs[weights])) {
            set_filter_format(
                    ndims[weights], dims[weights], strides[NUM_IO], formats[x]);
            CHECK(init_filter_transformation(data_types[weights],
                    ndims[weights], dims[weights], strides[weights],
                    strides[NUM_IO]));
            filter_needs_transform = true;
            // we transform the filter based on src format
            formats[weights] = formats[x];
        } else {
            CHECK(get_filter_format());
            get_dims_and_strides(weights);
        }
        if (with_groups) {
            dims[weights][1] *= pd->G();
            ndims[weights] = std::max(4, ndims[weights] - with_groups);
        }

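        // cuDNN broadcasts the bias as a 1xCx1(x1...) tensor, so pad the 1D
        // oneDNN bias up to the dst rank and move the channel count into the
        // second dimension.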
        if (with_bias) {
            ndims[bias] = dnnl_descs[bias].ndims;
            CHECK(convert_data_type(&dnnl_descs[bias], &data_types[bias]));
            convert_dims(
                    dnnl_descs[bias].dims, dims[bias], ndims[bias], ndims[y]);
            std::swap(dims[bias][0], dims[bias][1]);
            convert_dims(dnnl_descs[bias].format_desc.blocking.strides,
                    strides[bias], ndims[bias], ndims[y]);
            ndims[bias] = ndims[y];
        }

        return status::success;
    }

    status_t create_cudnn_descs(const convolution_pd_t *pd) {
        CHECK(create_and_set_convolution_desc(pd));
        CHECK(create_and_set_tensor_descriptor(
                &descs[x], data_types[x], ndims[x], dims[x], strides[x]));
        CHECK(create_and_set_filter_descriptor(&weights_desc, formats[weights],
                data_types[weights], ndims[weights],
                dims[weights] + with_groups, strides[weights]));
        CHECK(create_and_set_tensor_descriptor(
                &descs[y], data_types[y], ndims[y], dims[y], strides[y]));

        if (with_bias) {
            CHECK(create_and_set_tensor_descriptor(&descs[bias],
                    data_types[bias], ndims[bias], dims[bias], strides[bias]));
        }

        return status::success;
    }

    virtual status_t init_scratchpad(engine_t *engine, convolution_pd_t *pd) {
        if (filter_needs_transform) {
            auto sz = memory_desc_wrapper(&dnnl_descs[weights]).size();
            auto data_size
                    = types::data_type_size(pd->invariant_wei_md(0)->data_type);
            pd->scratchpad_registry().registrar().book(
                    memory_tracking::names::key_conv_cudnn_filter, sz,
                    data_size);
        }
        return status::success;
    }

    status_t create_and_set_convolution_desc(const convolution_pd_t *pd) {
        CUDNN_EXECUTE_FUNC_V(cudnnCreateConvolutionDescriptor, &conv_desc);
        CUDNN_EXECUTE_FUNC_V(cudnnSetConvolutionNdDescriptor, conv_desc,
                ndims[x] - 2, padding, filter_strides, dilation,
                cudnnConvolutionMode_t::CUDNN_CROSS_CORRELATION,
                computation_data_type);
        // Check for groups and set group count if necessary
        if (with_groups) {
            group_count = pd->G();
            if (group_count > 1)
                CHECK(CUDNN_EXECUTE_FUNC_S(
                        cudnnSetConvolutionGroupCount, conv_desc, group_count));
        }
        return status::success;
    }

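    // oneDNN stores dilation as the number of skipped elements (0 = dense),
    // while cuDNN expects a multiplier (1 = dense), hence KD*() + 1 below.
    // 1D convolutions are expressed as 2D with a unit-sized dummy dimension.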
    status_t set_padding_and_dilation(const convolution_pd_t *pd) {
        int actual_ndims = pd->ndims();
        if (actual_ndims == 3) {
            padding[0] = 0;
            padding[1] = static_cast<int>(pd->padL());
            dilation[0] = 1;
            dilation[1] = static_cast<int>(pd->KDW() + 1);

            filter_strides[0] = 1;
            filter_strides[1] = static_cast<int>(pd->KSW());
        } else if (actual_ndims == 4) {
            padding[0] = static_cast<int>(pd->padT());
            padding[1] = static_cast<int>(pd->padL());

            dilation[0] = static_cast<int>(pd->KDH() + 1);
            dilation[1] = static_cast<int>(pd->KDW() + 1);

            filter_strides[0] = static_cast<int>(pd->KSH());
            filter_strides[1] = static_cast<int>(pd->KSW());
        } else {
            padding[0] = static_cast<int>(pd->padFront());
            padding[1] = static_cast<int>(pd->padT());
            padding[2] = static_cast<int>(pd->padL());

            dilation[0] = static_cast<int>(pd->KDD() + 1);
            dilation[1] = static_cast<int>(pd->KDH() + 1);
            dilation[2] = static_cast<int>(pd->KDW() + 1);

            filter_strides[0] = static_cast<int>(pd->KSD());
            filter_strides[1] = static_cast<int>(pd->KSH());
            filter_strides[2] = static_cast<int>(pd->KSW());
        }
        return status::success;
    }

    virtual void execute(
            cudnnHandle_t handle, const std::vector<void *> &args) const = 0;

    void execute_sum(cudnnHandle_t handle, void *x, void *y, float alpha_,
            float beta_) const {
        float alpha = alpha_;
        float beta = beta_;
        CUDNN_EXECUTE_FUNC_V(cudnnAddTensor, handle, &alpha, descs[io::y], x,
                &beta, descs[io::y], y);
    }

    void execute_scale(cudnnHandle_t handle, void *y) const {
        if (do_scaling) {
            CUDNN_EXECUTE_FUNC_V(
                    cudnnScaleTensor, handle, descs[io::y], y, &output_scaling);
        }
    }

    void execute_set_weights_bias(
            cudnnHandle_t handle, void *weights, void *bias, float value) {
        CUDNN_EXECUTE_FUNC_V(
                cudnnSetTensor, handle, descs[io::weights], weights, &value);
        if (bias) {
            CUDNN_EXECUTE_FUNC_V(
                    cudnnSetTensor, handle, descs[io::bias], bias, &value);
        }
    }

    bool with_eltwise(const convolution_pd_t *pd, int position) const {
        return pd->attr()->post_ops_.contain(primitive_kind::eltwise, position);
    }

    status_t check_output_dims() const {
        int expected_dims[CUDNN_DIM_MAX] = {};
        CUDNN_EXECUTE_FUNC_V(cudnnGetConvolutionNdForwardOutputDim, conv_desc,
                descs[x], weights_desc, ndims[y], &expected_dims[0]);
        for (size_t i = 0; i < ndims[y]; i++) {
            if (dims[y][i] != expected_dims[i]) return status::unimplemented;
        }
        return status::success;
    }

    void set_compute_format() {
        if (data_types[x] == CUDNN_DATA_INT8) {
            computation_data_type = CUDNN_DATA_INT32;
        } else {
            computation_data_type = data_types[y];
        }
    }

    status_t get_filter_format() {
        memory_desc_wrapper wrapper(&dnnl_descs[weights]);
        if (wrapper.matches_one_of_tag(format_tag::ab, format_tag::abc,
                    format_tag::abcd, format_tag::abcde, format_tag::abcdef)) {
            formats[weights] = cudnnTensorFormat_t::CUDNN_TENSOR_NCHW;
        } else if ((!with_groups
                           && wrapper.matches_one_of_tag(format_tag::owi,
                                   format_tag::ohwi, format_tag::odhwi))
                || (with_groups
                        && wrapper.matches_one_of_tag(format_tag::gowi,
                                format_tag::gohwi, format_tag::godhwi))) {
            formats[weights] = cudnnTensorFormat_t::CUDNN_TENSOR_NHWC;
        } else {
            return status::unimplemented;
        }

        return status::success;
    }

    status_t get_formats() {
        CHECK(get_format(&dnnl_descs[x], formats[x]));
        CHECK(get_format(&dnnl_descs[y], formats[y]));
        return status::success;
    }

    void set_filter_nhwc(int filter_ndims, int *transform_filter_strides,
            int *filter_dims) override {
        if (with_groups) {
            switch (filter_ndims) {
                case 4: // Convert to krsc
                    return propagate_strides(transform_filter_strides,
                            filter_dims, {2, 3, 1, 0});
                case 5:
                    return propagate_strides(transform_filter_strides,
                            filter_dims, {2, 4, 3, 1, 0});
                case 6:
                    return propagate_strides(transform_filter_strides,
                            filter_dims, {2, 5, 4, 3, 1, 0});
            }
        } else {
            cudnn_conv_filter_adjustment_base_t::set_filter_nhwc(
                    filter_ndims, transform_filter_strides, filter_dims);
        }
    }
};

struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
protected:
    cudnnActivationDescriptor_t activation_desc = nullptr;
    cudnnActivationDescriptor_t eltwise_desc = nullptr;
    cudnnTensorDescriptor_t reorder_dst_desc = nullptr;
    cudnnConvolutionFwdAlgo_t fwd_alg_kind;
    std::vector<cudnnConvolutionFwdAlgoPerf_t> perf;
    int requested_algo_count = 0;
    int returned_algo_count = 0;
    int num_post_ops = 0;
    primitive_kind_t post_ops[2];
    bool need_reorder = false;
    bool use_temp_dst = false;
    float sum_scale = 1.0f;

public:
    virtual ~cudnn_convolution_impl_fwd_t() {
        if (activation_desc)
            CUDNN_EXECUTE_FUNC_V(
                    cudnnDestroyActivationDescriptor, activation_desc);
        if (eltwise_desc)
            CUDNN_EXECUTE_FUNC_V(
                    cudnnDestroyActivationDescriptor, eltwise_desc);
        if (reorder_dst_desc)
            CUDNN_EXECUTE_FUNC_V(
                    cudnnDestroyTensorDescriptor, reorder_dst_desc);
    }

    status_t configure_post_ops(convolution_pd_t *pd) {
        auto &p = pd->attr()->post_ops_;
        num_post_ops = p.len();
        if (data_types[y] == CUDNN_DATA_INT8 && p.len() > 0) {
            data_types[y] = CUDNN_DATA_FLOAT;
            need_reorder = true;
        }
        for (size_t i = 0; i < p.len(); i++) {
            post_ops[i] = p.entry_[i].kind;
            if (post_ops[i] == dnnl_eltwise) {
                CHECK(create_and_set_eltwise_descriptor(pd));
            }
            if (post_ops[i] == dnnl_sum) { sum_scale = p.entry_[i].sum.scale; }
        }

        if (need_reorder)
            CHECK(create_and_set_tensor_descriptor_ex(&reorder_dst_desc,
                    formats[y], reorder_type, ndims[y], dims[y]));

        return status::success;
    }

    status_t init(engine_t *engine, convolution_pd_t *pd,
            bool use_scratch_dst) override {
        use_temp_dst = use_scratch_dst;
        CHECK(configure_parameters(pd, use_temp_dst));
        CHECK(configure_post_ops(pd));
        CHECK(create_cudnn_descs(pd));
        CHECK(configure_alg_kind(engine, pd));
        CHECK(init_scratchpad(engine, pd));

        return status::success;
    }

    void execute_reorder(cudnnHandle_t handle, void *src, void *dst,
            bool flip_formats) const {
        const float alpha = 1.0f;
        const float beta = 0.0f;
        if (flip_formats) {
            CUDNN_EXECUTE_FUNC_V(cudnnTransformTensor, handle, &alpha,
                    reorder_dst_desc, src, &beta, descs[y], dst);
        } else {
            CUDNN_EXECUTE_FUNC_V(cudnnTransformTensor, handle, &alpha, descs[y],
                    src, &beta, reorder_dst_desc, dst);
        }
    }

    void execute_eltwise(cudnnHandle_t handle, void *src, void *dst) const {
        float alpha = 1.0f;
        float beta = 0.0f;
        CUDNN_EXECUTE_FUNC_V(cudnnActivationForward, handle, eltwise_desc,
                &alpha, descs[io::y], src, &beta, descs[io::y], dst);
    }

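    // Forward pass: run the convolution (fused with bias and identity
    // activation on the int8 path), apply output scaling, then the sum and
    // eltwise post-ops, and finally reorder back to the int8 dst if needed.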
    void execute(cudnnHandle_t handle,
            const std::vector<void *> &args) const override {
        auto x = args[0], weights = args[1], y = args[2], bias = args[3],
             scratchpad = args[4], post_op_scratch = args[6],
             post_op_reorder = args[7];
        void *output = use_temp_dst ? post_op_scratch : y;
        if (using_transformed_filter()) {
            auto w_scratch = args[5];
            transform_filter(handle, weights, w_scratch);
            weights = w_scratch;
        }
        if (computation_data_type == CUDNN_DATA_INT32 && bias) {
            CUDNN_EXECUTE_FUNC_V(cudnnConvolutionBiasActivationForward, handle,
                    &alpha, descs[io::x], x, weights_desc, weights, conv_desc,
                    fwd_alg_kind, scratchpad, scratchpad_size, &beta,
                    descs[io::y], output, descs[io::bias], bias,
                    activation_desc, descs[io::y], output);
        } else {
            const float bias_alpha = 1.0f;
            const float bias_beta = 1.0f;
            CUDNN_EXECUTE_FUNC_V(cudnnConvolutionForward, handle, &alpha,
                    descs[io::x], x, weights_desc, weights, conv_desc,
                    fwd_alg_kind, scratchpad, scratchpad_size, &beta,
                    descs[io::y], output);
            if (with_bias) {
                CUDNN_EXECUTE_FUNC_V(cudnnAddTensor, handle, &bias_alpha,
                        descs[io::bias], bias, &bias_beta, descs[io::y],
                        output);
            }
        }
        execute_scale(handle, output);
        for (int i = 0; i < num_post_ops; i++) {
            bool last_op = i == num_post_ops - 1 && !need_reorder;
            if (last_op) output = y;
            switch (post_ops[i]) {
                case dnnl_sum:
                    if (need_reorder) {
                        execute_reorder(handle, y, post_op_reorder, true);
                        execute_sum(handle, post_op_reorder, post_op_scratch,
                                sum_scale, 1.0f);
                    } else if (last_op) {
                        execute_sum(
                                handle, post_op_scratch, y, 1.0f, sum_scale);
                    } else {
                        execute_sum(
                                handle, y, post_op_scratch, sum_scale, 1.0f);
                    }
                    break;

                case dnnl_eltwise:
                    execute_eltwise(handle, post_op_scratch, output);
                    break;
            }
        }

        if (need_reorder) {
            execute_reorder(handle, post_op_scratch, y, false);
        }
    }

    status_t init_scratchpad(engine_t *engine, convolution_pd_t *pd) {
        auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(engine);
        stream_t *service_stream;
        CHECK(sycl_engine.get_service_stream(service_stream));

        auto cuda_stream
                = utils::downcast<sycl_cuda_stream_t *>(service_stream);
        auto handle = cuda_stream->get_cudnn_handle();

        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnGetConvolutionForwardWorkspaceSize,
                handle, descs[x], weights_desc, conv_desc, descs[y],
                fwd_alg_kind, &scratchpad_size));
        if (scratchpad_size > 0)
            pd->scratchpad_registry().registrar().book(
                    memory_tracking::names::key_conv_cudnn_algo,
                    scratchpad_size, size_t(1));

        return cudnn_convolution_impl_base_t::init_scratchpad(engine, pd);
    }

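    // Ask cuDNN to benchmark the available forward algorithms and pick the
    // first successful one that is compatible with the requested oneDNN
    // algorithm kind (direct, winograd or auto).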
    status_t configure_alg_kind(
            engine_t *engine, convolution_pd_t *pd) override {
        auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(engine);
        cuda_sycl_scoped_context_handler_t sc(sycl_engine);
        stream_t *service_stream;
        CHECK(sycl_engine.get_service_stream(service_stream));

        auto cuda_stream
                = utils::downcast<sycl_cuda_stream_t *>(service_stream);
        auto handle = cuda_stream->get_cudnn_handle();

        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnGetConvolutionForwardAlgorithmMaxCount,
                handle, &requested_algo_count));
        perf.resize(requested_algo_count);
        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnFindConvolutionForwardAlgorithm, handle,
                descs[x], weights_desc, conv_desc, descs[y],
                requested_algo_count, &returned_algo_count, perf.data()));

        auto submit_status = CUDNN_STATUS_NOT_SUPPORTED;
        for (size_t i = 0; i < returned_algo_count; i++) {
            submit_status = perf[i].status;
            if (submit_status == CUDNN_STATUS_SUCCESS) {
                // cudnnFindConvolutionForwardAlgorithm can erroneously report
                // algorithms for int8 which do not work, so ensure that we
                // only allow CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
                // in this case.
                if (computation_data_type == CUDNN_DATA_INT32
                        && perf[i].algo
                                != CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) {
                    continue;
                }
                switch (pd->desc()->alg_kind) {
                    case dnnl_convolution_auto:
                        if (utils::one_of(perf[i].algo,
                                    CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
                                    CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
                                    CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM)) {
                            utils::downcast<cudnn_convolution_fwd_pd_t *>(pd)
                                    ->set_alg_kind(dnnl_convolution_direct);
                        } else {
                            utils::downcast<cudnn_convolution_fwd_pd_t *>(pd)
                                    ->set_alg_kind(dnnl_convolution_winograd);
                        }
                        break;
                    case dnnl_convolution_direct:
                        if (!utils::one_of(perf[i].algo,
                                    CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
                                    CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
                                    CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM))
                            continue;
                        break;
                    case dnnl_convolution_winograd:
                        if (!utils::one_of(perf[i].algo,
                                    CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
                                    CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED))
                            continue;
                        break;
                    default: return status::unimplemented;
                }
                fwd_alg_kind = perf[i].algo;
                CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetConvolutionMathType,
                        conv_desc, perf[i].mathType));
                break;
            }
        }

        if (submit_status != CUDNN_STATUS_SUCCESS) return status::unimplemented;

        if (fwd_alg_kind == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) {
            CHECK(CUDNN_EXECUTE_FUNC_S(
                    cudnnCreateActivationDescriptor, &activation_desc));
            CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetActivationDescriptor,
                    activation_desc,
                    cudnnActivationMode_t::CUDNN_ACTIVATION_IDENTITY,
                    CUDNN_NOT_PROPAGATE_NAN, 1.0));
        }

        return status::success;
    }

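    // Map the eltwise post-op onto the matching cuDNN activation mode;
    // algorithms without a cuDNN equivalent are reported as unimplemented.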
    status_t create_and_set_eltwise_descriptor(const convolution_pd_t *pd) {
        CHECK(CUDNN_EXECUTE_FUNC_S(
                cudnnCreateActivationDescriptor, &eltwise_desc));

        cudnnActivationMode_t act_mode;
        switch (eltwise_algorithm_kind(pd)) {
            case alg_kind::eltwise_tanh:
                act_mode = CUDNN_ACTIVATION_TANH;
                break;
            case alg_kind::eltwise_elu: act_mode = CUDNN_ACTIVATION_ELU; break;
            case alg_kind::eltwise_relu:
                act_mode = CUDNN_ACTIVATION_RELU;
                break;
            case alg_kind::eltwise_logistic:
                act_mode = CUDNN_ACTIVATION_SIGMOID;
                break;
            case alg_kind::eltwise_bounded_relu:
                act_mode = CUDNN_ACTIVATION_CLIPPED_RELU;
                break;
            default: return status::unimplemented;
        }
        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetActivationDescriptor, eltwise_desc,
                act_mode, cudnnNanPropagation_t::CUDNN_NOT_PROPAGATE_NAN,
                eltwise_alpha(pd)));

        return status::success;
    }

    dnnl::impl::alg_kind_t eltwise_algorithm_kind(
            const convolution_pd_t *pd) const {
        const int eltwise_idx
                = pd->attr()->post_ops_.find(primitive_kind::eltwise);
        return pd->attr()->post_ops_.entry_[eltwise_idx].eltwise.alg;
    }

    float eltwise_alpha(const convolution_pd_t *pd) const {
        const int eltwise_idx
                = pd->attr()->post_ops_.find(primitive_kind::eltwise);
        return pd->attr()->post_ops_.entry_[eltwise_idx].eltwise.alpha;
    }
};

struct cudnn_convolution_impl_bwd_data_t
    : public cudnn_convolution_impl_base_t {
protected:
    cudnnConvolutionBwdDataAlgo_t bwd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
    std::vector<cudnnConvolutionBwdDataAlgoPerf_t> perf;
    int requested_algo_count = 0;
    int returned_algo_count = 0;

    status_t configure_alg_kind(
            engine_t *engine, convolution_pd_t *pd) override {
        auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(engine);
        cuda_sycl_scoped_context_handler_t sc(sycl_engine);
        stream_t *service_stream;
        CHECK(sycl_engine.get_service_stream(service_stream));

        auto cuda_stream
                = utils::downcast<sycl_cuda_stream_t *>(service_stream);
        auto handle = cuda_stream->get_cudnn_handle();

        CHECK(CUDNN_EXECUTE_FUNC_S(
                cudnnGetConvolutionBackwardDataAlgorithmMaxCount, handle,
                &requested_algo_count));
        perf.resize(requested_algo_count);
        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnFindConvolutionBackwardDataAlgorithm,
                handle, weights_desc, descs[y], conv_desc, descs[x],
                requested_algo_count, &returned_algo_count, perf.data()));
        for (size_t i = 0; i < returned_algo_count; i++) {
            if (perf[i].status == CUDNN_STATUS_SUCCESS) {
                switch (pd->desc()->alg_kind) {
                    case dnnl_convolution_auto:
                        if (utils::one_of(perf[i].algo,
                                    CUDNN_CONVOLUTION_BWD_DATA_ALGO_0,
                                    CUDNN_CONVOLUTION_BWD_DATA_ALGO_1)) {
                            utils::downcast<cudnn_convolution_bwd_data_pd_t *>(
                                    pd)
                                    ->set_alg_kind(dnnl_convolution_direct);
                        } else {
                            utils::downcast<cudnn_convolution_bwd_data_pd_t *>(
                                    pd)
                                    ->set_alg_kind(dnnl_convolution_winograd);
                        }
                        break;
                    case dnnl_convolution_direct:
                        if (!utils::one_of(perf[i].algo,
                                    CUDNN_CONVOLUTION_BWD_DATA_ALGO_0,
                                    CUDNN_CONVOLUTION_BWD_DATA_ALGO_1))
                            continue;
                        break;
                    case dnnl_convolution_winograd:
                        if (!utils::one_of(perf[i].algo,
                                    CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD,
                                    CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED))
                            continue;
                        break;
                    default: return status::unimplemented;
                }
                bwd_algo = perf[i].algo;
                CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetConvolutionMathType,
                        conv_desc, perf[i].mathType));
                break;
            } else {
                return status::unimplemented;
            }
        }

        return status::success;
    }

    status_t init_scratchpad(engine_t *engine, convolution_pd_t *pd) override {
        auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(engine);
        stream_t *service_stream;
        CHECK(sycl_engine.get_service_stream(service_stream));

        auto cuda_stream
                = utils::downcast<sycl_cuda_stream_t *>(service_stream);
        auto handle = cuda_stream->get_cudnn_handle();

        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnGetConvolutionBackwardDataWorkspaceSize,
                handle, weights_desc, descs[io::y], conv_desc, descs[io::x],
                bwd_algo, &scratchpad_size));
        if (scratchpad_size > 0)
            pd->scratchpad_registry().registrar().book(
                    memory_tracking::names::key_conv_cudnn_algo,
                    scratchpad_size, size_t(1));

        return cudnn_convolution_impl_base_t::init_scratchpad(engine, pd);
    }

    void execute(cudnnHandle_t handle,
            const std::vector<void *> &args) const override {
        auto x = args[0], weights = args[1], y = args[2], bias = args[3],
             scratchpad = args[4];
        if (using_transformed_filter()) {
            auto w_scratch = args[5];
            transform_filter(handle, weights, w_scratch);
            weights = w_scratch;
        }
        const float bias_alpha = 1.0f;
        const float bias_beta = 1.0f;
        CUDNN_EXECUTE_FUNC_V(cudnnConvolutionBackwardData, handle, &alpha,
                weights_desc, weights, descs[io::y], y, conv_desc, bwd_algo,
                scratchpad, scratchpad_size, &beta, descs[io::x], x);
        if (with_bias) {
            CUDNN_EXECUTE_FUNC_V(cudnnAddTensor, handle, &bias_alpha,
                    descs[io::bias], bias, &bias_beta, descs[io::x], x);
        }
    }
};

struct cudnn_convolution_impl_bwd_weights_t
    : public cudnn_convolution_impl_base_t {
protected:
    cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo
            = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
    std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> perf;
    int requested_algo_count = 0;
    int returned_algo_count = 0;

public:
    status_t init_zero_dims(convolution_pd_t *pd) override {
        if (pd->ndims() > CUDNN_DIM_MAX) { return status::invalid_arguments; }
        dnnl_descs[weights] = *pd->invariant_wei_md();
        CHECK(get_format(&dnnl_descs[weights], formats[weights], true));
        ndims[y] = pd->invariant_dst_md()->ndims;
        ndims[weights] = dnnl_descs[weights].ndims - pd->with_groups();
        CHECK(convert_data_type(&dnnl_descs[weights], &data_types[weights]));
        convert_dims(dnnl_descs[weights].dims + pd->with_groups(),
                dims[weights], ndims[weights]);
        ndims[weights] = std::max(4, ndims[weights]);
        convert_dims(dnnl_descs[weights].format_desc.blocking.strides,
                strides[weights], ndims[weights]);
        CHECK(create_and_set_tensor_descriptor(&descs[weights],
                data_types[weights], ndims[weights], dims[weights],
                strides[weights]));

        if (pd->with_bias()) {
            dnnl_descs[bias] = *pd->invariant_bia_md();
            ndims[bias] = dnnl_descs[bias].ndims;
            CHECK(convert_data_type(&dnnl_descs[bias], &data_types[bias]));
            convert_dims(dnnl_descs[bias].padded_dims, dims[bias], ndims[bias],
                    ndims[y]);
            std::swap(dims[bias][0], dims[bias][1]);
            convert_dims(dnnl_descs[bias].format_desc.blocking.strides,
                    strides[bias], ndims[bias], ndims[weights]);
            ndims[bias] = ndims[y];
            CHECK(create_and_set_tensor_descriptor(&descs[bias],
                    data_types[bias], ndims[bias], dims[bias], strides[bias]));
        }
        return status::success;
    }

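    // Same algorithm-selection strategy as the forward and backward-data
    // cases, restricted to the backward-filter algorithms.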
    virtual status_t configure_alg_kind(
            engine_t *engine, convolution_pd_t *pd) {
        auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(engine);
        cuda_sycl_scoped_context_handler_t sc(sycl_engine);
        stream_t *service_stream;
        CHECK(sycl_engine.get_service_stream(service_stream));

        auto cuda_stream
                = utils::downcast<sycl_cuda_stream_t *>(service_stream);
        auto handle = cuda_stream->get_cudnn_handle();

        CHECK(CUDNN_EXECUTE_FUNC_S(
                cudnnGetConvolutionBackwardFilterAlgorithmMaxCount, handle,
                &requested_algo_count));
        perf.resize(requested_algo_count);
        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnFindConvolutionBackwardFilterAlgorithm,
                handle, descs[x], descs[y], conv_desc, weights_desc,
                requested_algo_count, &returned_algo_count, perf.data()));
        for (size_t i = 0; i < returned_algo_count; i++) {
            if (perf[i].status == CUDNN_STATUS_SUCCESS) {
                switch (pd->desc()->alg_kind) {
                    case dnnl_convolution_auto:
                        if (utils::one_of(perf[i].algo,
                                    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0,
                                    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1,
                                    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3)) {
                            utils::downcast<
                                    cudnn_convolution_bwd_weights_pd_t *>(pd)
                                    ->set_alg_kind(dnnl_convolution_direct);
                        } else {
                            utils::downcast<
                                    cudnn_convolution_bwd_weights_pd_t *>(pd)
                                    ->set_alg_kind(dnnl_convolution_winograd);
                        }
                        break;
                    case dnnl_convolution_direct:
                        if (!utils::one_of(perf[i].algo,
                                    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0,
                                    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1,
                                    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3))
                            continue;
                        break;
                    case dnnl_convolution_winograd:
                        if (!utils::one_of(perf[i].algo,
                                    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD,
                                    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED))
                            continue;
                        break;
                    default: return status::unimplemented;
                }
                bwd_filter_algo = perf[i].algo;
                CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetConvolutionMathType,
                        conv_desc, perf[i].mathType));
                break;
            } else {
                return status::unimplemented;
            }
        }

        return status::success;
    }

    status_t init_scratchpad(engine_t *engine, convolution_pd_t *pd) override {
        auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(engine);
        stream_t *service_stream;
        CHECK(sycl_engine.get_service_stream(service_stream));

        auto cuda_stream
                = utils::downcast<sycl_cuda_stream_t *>(service_stream);
        auto handle = cuda_stream->get_cudnn_handle();

        CHECK(CUDNN_EXECUTE_FUNC_S(
                cudnnGetConvolutionBackwardFilterWorkspaceSize, handle,
                descs[io::x], descs[io::y], conv_desc, weights_desc,
                bwd_filter_algo, &scratchpad_size));
        if (scratchpad_size > 0)
            pd->scratchpad_registry().registrar().book(
                    memory_tracking::names::key_conv_cudnn_algo,
                    scratchpad_size, size_t(1));

        return cudnn_convolution_impl_base_t::init_scratchpad(engine, pd);
    }

    void execute(cudnnHandle_t handle,
            const std::vector<void *> &args) const override {
        auto x = args[0], weights = args[1], y = args[2], bias = args[3],
             scratchpad = args[4];
        auto filter = weights;
        if (using_transformed_filter()) {
            auto w_scratch = args[5];
            transform_filter(handle, weights, w_scratch);
            filter = w_scratch;
        }
        const float bias_alpha = 1.0f;
        const float bias_beta = 0.0f;
        CUDNN_EXECUTE_FUNC_V(cudnnConvolutionBackwardFilter, handle, &alpha,
                descs[io::x], x, descs[io::y], y, conv_desc, bwd_filter_algo,
                scratchpad, scratchpad_size, &beta, weights_desc, filter);
        if (with_bias) {
            CUDNN_EXECUTE_FUNC_V(cudnnConvolutionBackwardBias, handle,
                    &bias_alpha, descs[io::y], y, &bias_beta, descs[io::bias],
                    bias);
        }
        if (using_transformed_filter()) {
            undo_transform_filter(handle, filter, weights);
        }
    }
};

} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif