/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_NVIDIA_CUDNN_INNER_PRODUCT_IMPL_HPP
#define GPU_NVIDIA_CUDNN_INNER_PRODUCT_IMPL_HPP

#include "cublas_v2.h"
#include "cudnn.h"

#include "common/type_helpers.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
namespace {
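// Promotes a memory descriptor with fewer than 4 dims to a 4D
// {n, c, h, w} shape: dims, padded dims and blocking strides are rewritten
// so that the result matches one of the 4D tags (nhwc, nchw or hwio) and a
// 4D cuDNN tensor descriptor can be built from the returned dims/strides.
//
// Example (from the nc branch below): a 2D "nc" descriptor with dims {N, C}
// and strides {C, 1} becomes dims {N, C, 1, 1} with strides {C, 1, 1, 1},
// which matches the nchw tag.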
inline void get_4d_tensor_descriptor(
        const memory_desc_t *mem_desc1, int *dims, int *strides) {
    memory_desc_t mem_desc = *mem_desc1;

    // Force tensors with fewer than 4 dims to be 4D {n, c, h, w}.
    using namespace format_tag;
    auto set_dim = [&]() {
        if (mem_desc.ndims == 3) {
            mem_desc.ndims = 4;
            mem_desc.dims[3] = mem_desc.dims[2];
            mem_desc.dims[2] = 1;
            mem_desc.padded_dims[3] = mem_desc.padded_dims[2];
            mem_desc.padded_dims[2] = 1;
        } else if (mem_desc.ndims == 2) {
            mem_desc.ndims = 4;
            mem_desc.dims[3] = 1;
            mem_desc.dims[2] = 1;
            mem_desc.padded_dims[3] = 1;
            mem_desc.padded_dims[2] = 1;
        }
    };
    auto &stride = mem_desc.format_desc.blocking.strides;
    auto &dim = mem_desc.dims;
    // Promote the strides to 4D as well.
    if (memory_desc_matches_tag(mem_desc, nwc)) {
        set_dim();
        // promoting nwc(owi) to NHWC = {wc 1 c} to {wc 1 wc c}
        mem_desc.format_desc.blocking.strides[3]
                = mem_desc.format_desc.blocking.strides[2];
        mem_desc.format_desc.blocking.strides[2]
                = mem_desc.format_desc.blocking.strides[0];
        assert(memory_desc_matches_tag(mem_desc, nhwc)
                && "Tag is not set to NHWC");
    } else if (memory_desc_matches_tag(mem_desc, ncw)) {
        set_dim();
        // promoting ncw(oiw) to NCHW = {wc w 1} to {wc w w 1}
        mem_desc.format_desc.blocking.strides[3]
                = mem_desc.format_desc.blocking.strides[2];
        mem_desc.format_desc.blocking.strides[2]
                = mem_desc.format_desc.blocking.strides[1];
        assert(memory_desc_matches_tag(mem_desc, nchw)
                && "Tag is not set to NCHW");
    } else if (memory_desc_matches_tag(mem_desc, wio)) {
        set_dim();
        // promoting wcn(wio) to HWCN = {1 n nc} to {1 n ncw nc}
        mem_desc.format_desc.blocking.strides[3]
                = mem_desc.format_desc.blocking.strides[2];
        mem_desc.format_desc.blocking.strides[2] *= mem_desc.dims[3];
        assert(memory_desc_matches_tag(mem_desc, hwio)
                && "Tag is not set to HWIO");
    } else if (memory_desc_matches_tag(mem_desc, nc)) {
        set_dim();
        // fixing strides
        // promoting nc(oi) to NCHW = {c 1} to {c 1 1 1}
        mem_desc.format_desc.blocking.strides[2]
                = mem_desc.format_desc.blocking.strides[1];
        mem_desc.format_desc.blocking.strides[3]
                = mem_desc.format_desc.blocking.strides[1];
        assert(memory_desc_matches_tag(mem_desc, nchw)
                && "Tag is not set to NCHW");
    } else if (memory_desc_matches_tag(mem_desc, cn)) {
        set_dim();
        // fixing strides cn(io) to HWCN = {1 n} to {1 n nc nc}.
        // Note that CHWN exists as well, but for inner product
        // we convert it to HWCN. Other primitives may need
        // a different conversion.
        mem_desc.format_desc.blocking.strides[2]
                = mem_desc.format_desc.blocking.strides[1]
                * mem_desc.padded_dims[1];
        mem_desc.format_desc.blocking.strides[3]
                = mem_desc.format_desc.blocking.strides[2];
        assert(memory_desc_matches_tag(mem_desc, hwio)
                && "Tag is not set to HWIO");
    }
    convert_dnnl_dims_array(mem_desc.dims, dims, mem_desc.ndims);
    convert_dnnl_dims_array(
            mem_desc.format_desc.blocking.strides, strides, mem_desc.ndims);
}
} // namespace
struct cudnn_inner_product_impl_base_t {
    // The io enum requires the weights to be the last entry so that
    // tensor_descs_ stays contiguous.
    enum io { src = 0, bia, dst, wei, NUM_IO };
    cudnnDataType_t data_types_[NUM_IO + 1]; // +1 data type for accumulation
    int ndims_;
    int dims_[NUM_IO][DNNL_MAX_NDIMS];
    // One extra stride for the transformed filter.
    int strides_[NUM_IO + 1][DNNL_MAX_NDIMS];

    cudnnTensorDescriptor_t tensor_descs_[NUM_IO - 1] = {};

    size_t workspace_size_ = 0;
    float alpha_ = 1, beta_ = 0;
    bool with_bias_;
    bool scale_bias_ = false;
    bool with_relu_ = false, with_eltwise_ = false, with_sum_ = false;
    bool filter_using_spatial_format_ = false;

    virtual bool need_to_transform_filter() const {
        return filter_using_spatial_format_;
    }

    virtual bool ip_using_scratchpad() const { return (workspace_size_ > 0); }
    bool conv_using_scale_scratchpad() const { return scale_bias_; }

    void set_bias_dims(cudnnTensorFormat_t format, int ndims, int bias_dim) {
        // Set the dimensions and strides for the bias.
        // cuDNN always stores dimensions in NCDHW order, so the first
        // dimension of the filter must equal the second dimension of
        // the bias.
        for (int i = 0; i < ndims; ++i) {
            dims_[io::bia][i] = 1;
            strides_[io::bia][i] = (format != CUDNN_TENSOR_NHWC ? 1 : bias_dim);
        }
        dims_[io::bia][1] = bias_dim;
        strides_[io::bia][1] = 1;
        strides_[io::bia][0] = bias_dim;
    }
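
    // set_bias_dims() example: with format == CUDNN_TENSOR_NCHW, ndims == 4
    // and bias_dim == OC this produces dims {1, OC, 1, 1} and strides
    // {OC, 1, 1, 1}, i.e. a 1xOCx1x1 bias tensor broadcast over N, H and W.
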
    virtual status_t init(engine_t * /*engine*/, inner_product_pd_t * /*pd*/,
            bool /*with_relu*/, bool /*with_eltwise*/, bool /*with_sum*/,
            bool /*using_fused_path_for_blocking*/)
            = 0;

    virtual void execute(cudnnHandle_t /*handle*/,
            cublasHandle_t /*cublas_handle*/,
            const std::vector<void *> & /*args*/) const = 0;
};

struct cudnn_inner_product_fwd_base_t : public cudnn_inner_product_impl_base_t {
    float output_scales_; // alpha in gemm
    float sum_scale_; // beta in gemm
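    // A sketch of how these scales are expected to enter the GEMM-based
    // forward path (the exact cuBLAS/cuDNN calls live in the derived
    // implementations):
    //   dst = output_scales_ * (src x wei) + sum_scale_ * dst
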
    float eltwise_alpha(const inner_product_pd_t *pd) const {
        const int eltwise_idx
                = pd->attr()->post_ops_.find(primitive_kind::eltwise);
        return with_eltwise_
                ? pd->attr()->post_ops_.entry_[eltwise_idx].eltwise.alpha
                : 0.0f;
    }
    float sum_scale(const inner_product_pd_t *pd) const {
        const int sum_idx = pd->attr()->post_ops_.find(primitive_kind::sum);
        return with_sum_ ? pd->attr()->post_ops_.entry_[sum_idx].sum.scale
                         : 0.0f;
    }

    dnnl::impl::alg_kind_t eltwise_algorithm_kind(
            const inner_product_pd_t *pd) const {
        const int eltwise_idx
                = pd->attr()->post_ops_.find(primitive_kind::eltwise);
        return pd->attr()->post_ops_.entry_[eltwise_idx].eltwise.alg;
    }
};

} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif