/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_NVIDIA_CUDNN_REORDER_IMPL_HPP
#define GPU_NVIDIA_CUDNN_REORDER_IMPL_HPP

#include "common/type_helpers.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {

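// Common interface and shared state (tensor descriptors, data types, scaling
// factors and byte offsets) for the cuDNN-based reorder implementations below.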
struct cudnn_reorder_generic_t {
public:
    virtual status_t init(const reorder_pd_t *pd) = 0;

    virtual void execute(cudnnHandle_t handle, void *src, void *dst) const = 0;

    virtual ~cudnn_reorder_generic_t() {
        CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, src_desc_);
        CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, dst_desc_);
    }

    int dst_offset_in_bytes() { return dst_offset_in_bytes_; }
    int src_offset_in_bytes() { return src_offset_in_bytes_; }

protected:
    cudnnDataType_t src_data_type_;
    cudnnDataType_t dst_data_type_;
    int ndims_;
    int dims_[DNNL_MAX_NDIMS];
    cudnnTensorDescriptor_t src_desc_;
    cudnnTensorDescriptor_t dst_desc_;
    float alpha_, beta_;
    int dst_offset_in_bytes_ = 0;
    int src_offset_in_bytes_ = 0;
};
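// Rough usage sketch (assuming the surrounding reorder primitive): one of the
// two concrete implementations below is created and init()-ed from the
// primitive descriptor, and execute() is then called at run time with a cuDNN
// handle and the source/destination pointers, which the caller is expected to
// shift by src_offset_in_bytes()/dst_offset_in_bytes().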

// This structure is used when the memory format includes blocking
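// (i.e., when the format has inner blocks). The transformation is performed by
// cudnnTransformTensorEx() together with a cuDNN tensor transform descriptor.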
struct cudnn_reorder_ex_t : public cudnn_reorder_generic_t {
public:
    status_t init(const reorder_pd_t *pd) override {
        // If any of the dimensions is 0, we should not continue with
        // creating cuDNN descriptors.
        memory_desc_wrapper wrap(pd->src_md());
        if (wrap.size() == 0) { return status::success; }
        // Validity checks
        assert(pd->dst_md()->ndims == pd->src_md()->ndims);

        get_format(pd->src_md(), src_format_);
        get_format(pd->dst_md(), dst_format_);
        dst_offset_in_bytes_ = pd->dst_md()->offset0
                * types::data_type_size(pd->dst_md()->data_type);
        src_offset_in_bytes_ = pd->src_md()->offset0
                * types::data_type_size(pd->src_md()->data_type);
        alpha_ = pd->alpha();
        beta_ = pd->beta();

        CHECK(convert_data_type(pd->src_md(), &src_data_type_));
        CHECK(convert_data_type(pd->dst_md(), &dst_data_type_));

        convert_dims(pd->src_md()->padded_dims, dims_, pd->src_md()->ndims);

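        // cuDNN tensor descriptors must have at least 4 dimensions, so smaller
        // ranks are promoted to a 4D shape (convert_dims() above pads the
        // unused dims).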
        ndims_ = pd->dst_md()->ndims > 4 ? pd->dst_md()->ndims : 4;

        // Create and set tensor transform descriptor
        CHECK(CUDNN_EXECUTE_FUNC_S(
                cudnnCreateTensorTransformDescriptor, &trans_desc_));
        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetTensorTransformDescriptor,
                trans_desc_, ndims_, dst_format_, nullptr, nullptr, nullptr,
                cudnnFoldingDirection_t::CUDNN_TRANSFORM_FOLD));
        // Create and set source tensor descriptor
        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreateTensorDescriptor, &src_desc_));
        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetTensorNdDescriptorEx, src_desc_,
                src_format_, src_data_type_, ndims_, dims_));
        // Create and set destination tensor descriptor
        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreateTensorDescriptor, &dst_desc_));
        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetTensorNdDescriptorEx, dst_desc_,
                dst_format_, dst_data_type_, ndims_, dims_));
        return status::success;
    }

    void execute(cudnnHandle_t handle, void *src, void *dst) const override {
        // cudnnTransformTensorEx() is required to support blocking; it expects
        // the output tensor to be in a cuDNN-supported format.
        CUDNN_EXECUTE_FUNC(cudnnTransformTensorEx, handle, trans_desc_, &alpha_,
                src_desc_, src, &beta_, dst_desc_, dst);
    }

    ~cudnn_reorder_ex_t() {
        CUDNN_EXECUTE_FUNC_V(
                cudnnDestroyTensorTransformDescriptor, trans_desc_);
    }

private:
    cudnnTensorFormat_t src_format_;
    cudnnTensorFormat_t dst_format_;
    cudnnTensorTransformDescriptor_t trans_desc_;

    using cudnn_reorder_generic_t::cudnn_reorder_generic_t;
};

// This structure is used when the memory format does not include blocking
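// (plain strided formats). The transformation is performed by
// cudnnTransformTensor(), which deduces the layout from explicit strides.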
struct cudnn_reorder_stride_t : public cudnn_reorder_generic_t {
public:
    status_t init(const reorder_pd_t *pd) override {
        // If any of the dimensions is 0, we should not continue with
        // creating cuDNN descriptors.
        memory_desc_wrapper wrap(pd->src_md());
        if (wrap.size() == 0) { return status::success; }

        // Validity checks
        assert(pd->dst_md()->ndims == pd->src_md()->ndims);
        dst_offset_in_bytes_ = pd->dst_md()->offset0
                * types::data_type_size(pd->dst_md()->data_type);
        src_offset_in_bytes_ = pd->src_md()->offset0
                * types::data_type_size(pd->src_md()->data_type);
        alpha_ = pd->alpha();
        beta_ = pd->beta();

        convert_dims(pd->dst_md()->dims, dims_, pd->dst_md()->ndims);
        convert_dims(pd->src_md()->format_desc.blocking.strides, src_strides_,
                pd->src_md()->ndims);
        convert_dims(pd->dst_md()->format_desc.blocking.strides, dst_strides_,
                pd->dst_md()->ndims);
        adjust_dim_for_dnn(dims_, pd->dst_md()->ndims, pd->src_md());
        adjust_stride_for_dnn(src_strides_, pd->dst_md()->ndims, pd->src_md());
        adjust_stride_for_dnn(dst_strides_, pd->dst_md()->ndims, pd->dst_md());
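        // Rank passed to cuDNN: blocked dimensions contribute one extra dim
        // per inner block, and ranks below 4 are promoted to 4.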
        ndims_ = pd->dst_md()->ndims >= 4 ? pd->dst_md()->ndims
                        + pd->dst_md()->format_desc.blocking.inner_nblks
                                          : 4;
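        // has_different_block_size() detects whether src and dst use different
        // block sizes; the result is forwarded to convert_data_type() so the
        // chosen cuDNN data type can account for a vectorized (blocked) layout.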
        bool vectorized = has_different_block_size(pd->src_md(), pd->dst_md());
        CHECK(convert_data_type(pd->src_md(), &src_data_type_, vectorized));
        CHECK(convert_data_type(pd->dst_md(), &dst_data_type_, vectorized));
        // Create and set source tensor descriptor
        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreateTensorDescriptor, &src_desc_));
        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetTensorNdDescriptor, src_desc_,
                src_data_type_, ndims_, dims_, src_strides_));
        // Create and set destination tensor descriptor
        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreateTensorDescriptor, &dst_desc_));
        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetTensorNdDescriptor, dst_desc_,
                dst_data_type_, ndims_, dims_, dst_strides_));
        return status::success;
    }

    void execute(cudnnHandle_t handle, void *src, void *dst) const override {
        // With cudnnTransformTensor() we do not need to specify the format;
        // it is deduced from the strides. This is useful, for example, when
        // converting from abcd to bacd.
        CUDNN_EXECUTE_FUNC(cudnnTransformTensor, handle, &alpha_, src_desc_,
                src, &beta_, dst_desc_, dst);
    }

private:
    int src_strides_[DNNL_MAX_NDIMS];
    int dst_strides_[DNNL_MAX_NDIMS];

    using cudnn_reorder_generic_t::cudnn_reorder_generic_t;
};

} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif