/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_NVIDIA_CUDNN_INNER_PRODUCT_IMPL_HPP
#define GPU_NVIDIA_CUDNN_INNER_PRODUCT_IMPL_HPP

#include "cublas_v2.h"
#include "cudnn.h"

#include "common/type_helpers.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
namespace {
inline void get_4d_tensor_descriptor(
        const memory_desc_t *mem_desc1, int *dims, int *strides) {
    memory_desc_t mem_desc = *mem_desc1;

    // Force tensors with fewer than 4 dims to be 4D {n c h w}.
    using namespace format_tag;
    auto set_dim = [&]() {
        if (mem_desc.ndims == 3) {
            mem_desc.ndims = 4;
            mem_desc.dims[3] = mem_desc.dims[2];
            mem_desc.dims[2] = 1;
            mem_desc.padded_dims[3] = mem_desc.padded_dims[2];
            mem_desc.padded_dims[2] = 1;
        } else if (mem_desc.ndims == 2) {
            mem_desc.ndims = 4;
            mem_desc.dims[3] = 1;
            mem_desc.dims[2] = 1;
            mem_desc.padded_dims[3] = 1;
            mem_desc.padded_dims[2] = 1;
        }
    };
    auto &stride = mem_desc.format_desc.blocking.strides;
    auto &dim = mem_desc.dims;
    // Promote the strides of tensors with fewer than 4 dims to 4D as well.
    if (memory_desc_matches_tag(mem_desc, nwc)) {
        set_dim();
        // promoting nwc(owi) to NHWC = {wc 1 c} to {wc 1 wc c}
        mem_desc.format_desc.blocking.strides[3]
                = mem_desc.format_desc.blocking.strides[2];
        mem_desc.format_desc.blocking.strides[2]
                = mem_desc.format_desc.blocking.strides[0];
        assert(memory_desc_matches_tag(mem_desc, nhwc)
                && "Tag is not set to NHWC");
    } else if (memory_desc_matches_tag(mem_desc, ncw)) {
        set_dim();
        // promoting ncw(oiw) to NCHW = {wc w 1} to {wc w w 1}
        mem_desc.format_desc.blocking.strides[3]
                = mem_desc.format_desc.blocking.strides[2];
        mem_desc.format_desc.blocking.strides[2]
                = mem_desc.format_desc.blocking.strides[1];
        assert(memory_desc_matches_tag(mem_desc, nchw)
                && "Tag is not set to NCHW");
    } else if (memory_desc_matches_tag(mem_desc, wio)) {
        set_dim();
        // promoting wcn(wio) to HWCN = {1 n nc} to {1 n ncw nc}
        mem_desc.format_desc.blocking.strides[3]
                = mem_desc.format_desc.blocking.strides[2];
        mem_desc.format_desc.blocking.strides[2] *= mem_desc.dims[3];
        assert(memory_desc_matches_tag(mem_desc, hwio)
                && "Tag is not set to HWIO");
    } else if (memory_desc_matches_tag(mem_desc, nc)) {
        set_dim();
        // fixing strides
        // promoting nc(oi) to NCHW = {c 1} to {c 1 1 1}
        mem_desc.format_desc.blocking.strides[2]
                = mem_desc.format_desc.blocking.strides[1];
        mem_desc.format_desc.blocking.strides[3]
                = mem_desc.format_desc.blocking.strides[1];
        assert(memory_desc_matches_tag(mem_desc, nchw)
                && "Tag is not set to NCHW");
    } else if (memory_desc_matches_tag(mem_desc, cn)) {
        set_dim();
        // fixing strides cn(oi) to HWCN = {1 n} to {1 n nc nc}.
        // Note that CHWN exists as well, but for inner product
        // we convert it to HWCN. Other primitives may need
        // a different conversion.
        mem_desc.format_desc.blocking.strides[2]
                = mem_desc.format_desc.blocking.strides[1]
                * mem_desc.padded_dims[1];
        mem_desc.format_desc.blocking.strides[3]
                = mem_desc.format_desc.blocking.strides[2];
        assert(memory_desc_matches_tag(mem_desc, hwio)
                && "Tag is not set to HWIO");
    }
    convert_dnnl_dims_array(mem_desc.dims, dims, mem_desc.ndims);
    convert_dnnl_dims_array(
            mem_desc.format_desc.blocking.strides, strides, mem_desc.ndims);
}
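
// Worked example (an explanatory sketch; the names src_md, N and C are
// hypothetical): a 2D memory descriptor tagged nc with dims {N, C} and
// strides {C, 1} is promoted above to dims {N, C, 1, 1} with strides
// {C, 1, 1, 1}, which memory_desc_matches_tag() reports as nchw:
//
//     int dims[DNNL_MAX_NDIMS], strides[DNNL_MAX_NDIMS];
//     get_4d_tensor_descriptor(src_md, dims, strides);
//     // dims    = {N, C, 1, 1}
//     // strides = {C, 1, 1, 1}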
} // namespace
struct cudnn_inner_product_impl_base_t {
    // The io enum requires the weights to be the last entry so that
    // tensor_descs_ (which has no entry for the weights) stays contiguous.
    enum io { src = 0, bia, dst, wei, NUM_IO };
    cudnnDataType_t data_types_[NUM_IO + 1]; // +1 data-type for accumulation
    int ndims_;
    int dims_[NUM_IO][DNNL_MAX_NDIMS];
    // one extra stride added for the transformed filter
    int strides_[NUM_IO + 1][DNNL_MAX_NDIMS];

    cudnnTensorDescriptor_t tensor_descs_[NUM_IO - 1] = {};

    size_t workspace_size_ = 0;
    float alpha_ = 1, beta_ = 0;
    bool with_bias_;
    bool scale_bias_ = false;
    bool with_relu_ = false, with_eltwise_ = false, with_sum_ = false;
    bool filter_using_spatial_format_ = false;

    virtual bool need_to_transform_filter() const {
        return filter_using_spatial_format_;
    }

    virtual bool ip_using_scratchpad() const { return (workspace_size_ > 0); }
    bool conv_using_scale_scratchpad() const { return scale_bias_; }

    void set_bias_dims(cudnnTensorFormat_t format, int ndims, int bias_dim) {
        // Set the dimensions and strides for the bias. Note that cuDNN
        // always stores dimensions in NCDHW order, so the second dimension
        // of the bias must equal the first dimension of the filter.
        for (int i = 0; i < ndims; ++i) {
            dims_[io::bia][i] = 1;
            strides_[io::bia][i] = (format != CUDNN_TENSOR_NHWC ? 1 : bias_dim);
        }
        dims_[io::bia][1] = bias_dim;
        strides_[io::bia][1] = 1;
        strides_[io::bia][0] = bias_dim;
    }
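
    // Worked example (a sketch, assuming a 4D problem with a hypothetical
    // number of output channels OC):
    //     set_bias_dims(CUDNN_TENSOR_NCHW, /*ndims=*/4, /*bias_dim=*/OC);
    //     // dims_[io::bia]    = {1, OC, 1, 1}
    //     // strides_[io::bia] = {OC, 1, 1, 1}
    // i.e. a dense 1xOCx1x1 tensor that cuDNN can broadcast over the
    // destination.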
    virtual status_t init(engine_t * /*engine*/, inner_product_pd_t * /*pd*/,
            bool /*with_relu*/, bool /*with_eltwise*/, bool /*with_sum */,
            bool /*using_fused_path_for_blocking*/)
            = 0;

    virtual void execute(cudnnHandle_t /*handle*/,
            cublasHandle_t /*cublas_handle*/,
            const std::vector<void *> & /*args*/) const = 0;
};

struct cudnn_inner_product_fwd_base_t : public cudnn_inner_product_impl_base_t {
    float output_scales_; // alpha in gemm
    float sum_scale_; // beta in gemm
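    // Conceptually these feed the alpha/beta scaling factors of the
    // underlying GEMM/convolution call; a sketch of the intended epilogue
    // (not a call made in this header):
    //     dst = output_scales_ * (src x wei) + sum_scale_ * dst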
    float eltwise_alpha(const inner_product_pd_t *pd) const {
        const int eltwise_idx
                = pd->attr()->post_ops_.find(primitive_kind::eltwise);
        return with_eltwise_
                ? pd->attr()->post_ops_.entry_[eltwise_idx].eltwise.alpha
                : 0.0f;
    }
    float sum_scale(const inner_product_pd_t *pd) const {
        const int sum_idx = pd->attr()->post_ops_.find(primitive_kind::sum);
        return with_sum_ ? pd->attr()->post_ops_.entry_[sum_idx].sum.scale
                         : 0.0f;
    }

    dnnl::impl::alg_kind_t eltwise_algorithm_kind(
            const inner_product_pd_t *pd) const {
        const int eltwise_idx
                = pd->attr()->post_ops_.find(primitive_kind::eltwise);
        return pd->attr()->post_ops_.entry_[eltwise_idx].eltwise.alg;
    }
};

} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif