/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_NVIDIA_CUDNN_REORDER_IMPL_HPP
#define GPU_NVIDIA_CUDNN_REORDER_IMPL_HPP

#include "common/type_helpers.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {

struct cudnn_reorder_generic_t {
public:
    virtual status_t init(const reorder_pd_t *pd) = 0;

    virtual void execute(cudnnHandle_t handle, void *src, void *dst) const = 0;

    virtual ~cudnn_reorder_generic_t() {
        // Descriptors are created only by a successful init(); guard against
        // destroying descriptors that were never created (e.g. for
        // zero-sized memory).
        if (src_desc_)
            CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, src_desc_);
        if (dst_desc_)
            CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, dst_desc_);
    }

    int dst_offset_in_bytes() const { return dst_offset_in_bytes_; }
    int src_offset_in_bytes() const { return src_offset_in_bytes_; }

protected:
    cudnnDataType_t src_data_type_;
    cudnnDataType_t dst_data_type_;
    int ndims_;
    int dims_[DNNL_MAX_NDIMS];
    cudnnTensorDescriptor_t src_desc_ = nullptr;
    cudnnTensorDescriptor_t dst_desc_ = nullptr;
    float alpha_, beta_;
    int dst_offset_in_bytes_ = 0;
    int src_offset_in_bytes_ = 0;
};

// This structure is used when the memory format includes blocking.
struct cudnn_reorder_ex_t : public cudnn_reorder_generic_t {
public:
    status_t init(const reorder_pd_t *pd) override {
        // If any of the dimensions are zero, we should not continue with
        // creating cuDNN descriptors.
        memory_desc_wrapper wrap(pd->src_md());
        if (wrap.size() == 0) { return status::success; }

        // Validity checks
        assert(pd->dst_md()->ndims == pd->src_md()->ndims);

        get_format(pd->src_md(), src_format_);
        get_format(pd->dst_md(), dst_format_);
        dst_offset_in_bytes_ = pd->dst_md()->offset0
                * types::data_type_size(pd->dst_md()->data_type);
        src_offset_in_bytes_ = pd->src_md()->offset0
                * types::data_type_size(pd->src_md()->data_type);
        alpha_ = pd->alpha();
        beta_ = pd->beta();

        CHECK(convert_data_type(pd->src_md(), &src_data_type_));
        CHECK(convert_data_type(pd->dst_md(), &dst_data_type_));

        convert_dims(pd->src_md()->padded_dims, dims_, pd->src_md()->ndims);

        ndims_ = pd->dst_md()->ndims > 4 ? pd->dst_md()->ndims : 4;
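        // cuDNN format-based Nd tensor descriptors generally expect at least
        // four dimensions, so lower-rank shapes are padded up to 4D here;
        // convert_dims() above fills the trailing dims_ entries with 1.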

        // Create and set the tensor transform descriptor.
        CHECK(CUDNN_EXECUTE_FUNC_S(
                cudnnCreateTensorTransformDescriptor, &trans_desc_));
        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetTensorTransformDescriptor,
                trans_desc_, ndims_, dst_format_, nullptr, nullptr, nullptr,
                cudnnFoldingDirection_t::CUDNN_TRANSFORM_FOLD));
        // Create and set the source tensor descriptor.
        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreateTensorDescriptor, &src_desc_));
        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetTensorNdDescriptorEx, src_desc_,
                src_format_, src_data_type_, ndims_, dims_));
        // Create and set the destination tensor descriptor.
        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreateTensorDescriptor, &dst_desc_));
        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetTensorNdDescriptorEx, dst_desc_,
                dst_format_, dst_data_type_, ndims_, dims_));
        return status::success;
    }

    void execute(cudnnHandle_t handle, void *src, void *dst) const override {
        // cudnnTransformTensorEx() is required to support blocked formats.
        // It requires the output tensor to be in a cuDNN-supported format.
        CUDNN_EXECUTE_FUNC(cudnnTransformTensorEx, handle, trans_desc_, &alpha_,
                src_desc_, src, &beta_, dst_desc_, dst);
    }

    ~cudnn_reorder_ex_t() {
        if (trans_desc_)
            CUDNN_EXECUTE_FUNC_V(
                    cudnnDestroyTensorTransformDescriptor, trans_desc_);
    }

private:
    cudnnTensorFormat_t src_format_;
    cudnnTensorFormat_t dst_format_;
    cudnnTensorTransformDescriptor_t trans_desc_ = nullptr;

    using cudnn_reorder_generic_t::cudnn_reorder_generic_t;
};

// This structure is used when the memory format does not include blocking.
struct cudnn_reorder_stride_t : public cudnn_reorder_generic_t {
public:
    status_t init(const reorder_pd_t *pd) override {
        // If any of the dimensions are zero, we should not continue with
        // creating cuDNN descriptors.
        memory_desc_wrapper wrap(pd->src_md());
        if (wrap.size() == 0) { return status::success; }

        // Validity checks
        assert(pd->dst_md()->ndims == pd->src_md()->ndims);
        dst_offset_in_bytes_ = pd->dst_md()->offset0
                * types::data_type_size(pd->dst_md()->data_type);
        src_offset_in_bytes_ = pd->src_md()->offset0
                * types::data_type_size(pd->src_md()->data_type);
        alpha_ = pd->alpha();
        beta_ = pd->beta();

        convert_dims(pd->dst_md()->dims, dims_, pd->dst_md()->ndims);
        convert_dims(pd->src_md()->format_desc.blocking.strides, src_strides_,
                pd->src_md()->ndims);
        convert_dims(pd->dst_md()->format_desc.blocking.strides, dst_strides_,
                pd->dst_md()->ndims);
        adjust_dim_for_dnn(dims_, pd->dst_md()->ndims, pd->src_md());
        adjust_stride_for_dnn(src_strides_, pd->dst_md()->ndims, pd->src_md());
        adjust_stride_for_dnn(dst_strides_, pd->dst_md()->ndims, pd->dst_md());
        ndims_ = pd->dst_md()->ndims >= 4
                ? pd->dst_md()->ndims
                        + pd->dst_md()->format_desc.blocking.inner_nblks
                : 4;
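        // For blocked layouts, each inner block is exposed to cuDNN as an
        // extra dimension (inner_nblks of them); dims_ and the strides were
        // adjusted accordingly by adjust_dim_for_dnn() and
        // adjust_stride_for_dnn() above.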

        bool vectorized = has_different_block_size(pd->src_md(), pd->dst_md());
        CHECK(convert_data_type(pd->src_md(), &src_data_type_, vectorized));
        CHECK(convert_data_type(pd->dst_md(), &dst_data_type_, vectorized));
        // Create and set the source tensor descriptor.
        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreateTensorDescriptor, &src_desc_));
        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetTensorNdDescriptor, src_desc_,
                src_data_type_, ndims_, dims_, src_strides_));
        // Create and set the destination tensor descriptor.
        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreateTensorDescriptor, &dst_desc_));
        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetTensorNdDescriptor, dst_desc_,
                dst_data_type_, ndims_, dims_, dst_strides_));
        return status::success;
    }

    void execute(cudnnHandle_t handle, void *src, void *dst) const override {
        // cudnnTransformTensor() needs no explicit format: the layout is
        // deduced from the strides. This is useful, for example, when
        // converting from abcd to bacd.
        CUDNN_EXECUTE_FUNC(cudnnTransformTensor, handle, &alpha_, src_desc_,
                src, &beta_, dst_desc_, dst);
    }

private:
    int src_strides_[DNNL_MAX_NDIMS];
    int dst_strides_[DNNL_MAX_NDIMS];

    using cudnn_reorder_generic_t::cudnn_reorder_generic_t;
};

} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif
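
// Illustrative usage sketch (a hedged example, not part of this header): how
// a caller might drive one of the implementations above. `pd` is assumed to
// be a fully initialized reorder_pd_t and `handle` a live cudnnHandle_t; the
// device pointers src_ptr/dst_ptr are hypothetical and would come from the
// SYCL-CUDA interop in practice.
//
//     cudnn_reorder_stride_t reorder;
//     CHECK(reorder.init(pd));
//     void *src = static_cast<uint8_t *>(src_ptr)
//             + reorder.src_offset_in_bytes();
//     void *dst = static_cast<uint8_t *>(dst_ptr)
//             + reorder.dst_offset_in_bytes();
//     reorder.execute(handle, src, dst);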