1 /*******************************************************************************
2 * Copyright 2020-2021 Arm Ltd. and affiliates
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *******************************************************************************/
16
17 #include "cpu/aarch64/acl_convolution_utils.hpp"
18 #include "oneapi/dnnl/dnnl.hpp"
19
20 namespace dnnl {
21 namespace impl {
22 namespace cpu {
23 namespace aarch64 {
24
25 namespace acl_convolution_utils {
26
27 using namespace dnnl::impl::status;
28 using namespace dnnl::impl::utils;
29 using namespace dnnl::impl::alg_kind;
30 using namespace prop_kind;
31 using namespace data_type;
32 using uint = unsigned int;
33
acl_init_conf(acl_conv_conf_t & acp,memory_desc_t & src_md,memory_desc_t & weights_md,memory_desc_t & dst_md,memory_desc_t & bias_md,const convolution_desc_t & cd,const primitive_attr_t & attr)34 status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
35 memory_desc_t &weights_md, memory_desc_t &dst_md,
36 memory_desc_t &bias_md, const convolution_desc_t &cd,
37 const primitive_attr_t &attr) {
38
39 const memory_desc_wrapper src_d(&src_md);
40 const memory_desc_wrapper wei_d(&weights_md);
41 const memory_desc_wrapper dst_d(&dst_md);
42 const memory_desc_wrapper bia_d(&bias_md);
43
44 auto math_mode = get_fpmath_mode();
45 acp.fast_math = one_of(math_mode, fpmath_mode::bf16, fpmath_mode::any);
46
47 // Compute Library currently supports forward propagation only
48 const prop_kind_t prop_kind = cd.prop_kind;
49 const bool is_fwd = (prop_kind == dnnl_forward_training)
50 || (prop_kind == dnnl_forward_inference);
51 if (!is_fwd) return status::unimplemented;
52
53 const int with_groups = wei_d.ndims() == src_d.ndims() + 1;
54 const int ndims = src_d.ndims();
55 const bool is_1d = ndims == 3;
56 const bool is_3d = ndims == 5;
57 bool is_nspc;
58
59 // Compute Library unsupported shape scenarios
60 if (one_of(true, is_3d, is_1d, with_groups)) {
61 return status::unimplemented;
62 }
63
64 // batch size
65 const int mb = src_d.dims()[0];
66
67 // src/input channels, height, width
68 const int ic = src_d.dims()[1];
69 const int ih = src_d.dims()[ndims - 2];
70 const int iw = src_d.dims()[ndims - 1];
71
72 // dst/output channels, height, width
73 const int oc = dst_d.dims()[1];
74 const int oh = dst_d.dims()[ndims - 2];
75 const int ow = dst_d.dims()[ndims - 1];
76
77 // weights height and width
78 const int kh = wei_d.dims()[with_groups + ndims - 2];
79 const int kw = wei_d.dims()[with_groups + ndims - 1];
80
81 // height and width strides
82 const int stride_h = cd.strides[ndims - 4];
83 const int stride_w = cd.strides[ndims - 3];
84
85 // height and width dilations
86 int dilate_h = cd.dilates[ndims - 4];
87 int dilate_w = cd.dilates[ndims - 3];
88 // oneDNN dilations: dk = 1 + (k_size - 1) * (dilate_size + 1)
89 // Compute Library dilations: dk = dilate_size * (k_size - 1) + 1
90 // thus acl_dilation = oneDNN_dilation + 1
91 dilate_h += 1;
92 dilate_w += 1;
93
94 acp.dilation_info = arm_compute::Size2D(dilate_w, dilate_h);
95
96 // left, right, top, bottom padding
97 const int l_pad = cd.padding[0][1];
98 const int t_pad = cd.padding[0][0];
99 // Compute Library assumes the padding to be \geq 0, and r(b)_pad may be
100 // equal to -1 in oneDNN for some cases, when the very right (bottom)
101 // spatial elements of the input tensor are not used in the convolution.
102 // On the other hand l(t)_pad are guaranteed to be non-negative.
103 const int r_pad = std::max(static_cast<int>(cd.padding[1][1]), 0);
104 const int b_pad = std::max(static_cast<int>(cd.padding[1][0]), 0);
105
106 acp.padstride_info = arm_compute::PadStrideInfo(stride_w, stride_h,
107 static_cast<unsigned int>(l_pad), static_cast<unsigned int>(r_pad),
108 static_cast<unsigned int>(t_pad), static_cast<unsigned int>(b_pad),
109 arm_compute::DimensionRoundingType::FLOOR);
110
111 acp.with_bias = cd.bias_desc.format_kind != format_kind::undef;
112
113 auto set_or_check_tags = [&](format_tag_t desired_src_tag,
114 format_tag_t desired_dst_tag) -> status_t {
115 using namespace format_tag;
116 auto src_tag = any, dst_tag = any;
117
118 if (src_d.format_kind() == format_kind::any) {
119 CHECK(memory_desc_init_by_tag(src_md, desired_src_tag));
120 src_tag = desired_src_tag;
121 } else {
122 src_tag = memory_desc_matches_one_of_tag(src_md, nhwc, nchw);
123 }
124
125 if (dst_d.format_kind() == format_kind::any) {
126 CHECK(memory_desc_init_by_tag(dst_md, desired_dst_tag));
127 dst_tag = desired_dst_tag;
128 } else {
129 dst_tag = memory_desc_matches_one_of_tag(dst_md, nhwc, nchw);
130 }
131
132 if (acp.with_bias && bias_md.format_kind == format_kind::any)
133 CHECK(memory_desc_init_by_tag(bias_md, x));
134
135 is_nspc = utils::one_of(src_tag, nhwc);
136
137 memory_desc_t want_wei_md = weights_md;
138 auto wei_tag = is_nspc ? ohwi : oihw;
139 CHECK(memory_desc_init_by_tag(want_wei_md, wei_tag));
140
141 // Compute Library does not support mismatching layouts
142 if ((src_tag != wei_tag) || (src_tag != dst_tag))
143 return status::unimplemented;
144
145 if (weights_md.format_kind == format_kind::any) {
146 weights_md = want_wei_md;
147 }
148 return (want_wei_md == weights_md) ? status::success
149 : status::unimplemented;
150 };
151
152 auto default_dat_tag = format_tag::nhwc;
153 if (set_or_check_tags(default_dat_tag, default_dat_tag) != status::success)
154 return status::unimplemented;
155
156 const auto acl_layout = is_nspc ? arm_compute::DataLayout::NHWC
157 : arm_compute::DataLayout::NCHW;
158
159 auto acl_src_data_t = acl_common_utils::get_acl_data_t(src_d.data_type());
160 auto acl_wei_data_t = acl_common_utils::get_acl_data_t(wei_d.data_type());
161 auto acl_dst_data_t = acl_common_utils::get_acl_data_t(dst_d.data_type());
162 auto acl_bia_data_t = acl_common_utils::get_acl_data_t(bia_d.data_type());
163
164 if (acl_bia_data_t == arm_compute::DataType::UNKNOWN)
165 acl_bia_data_t = arm_compute::DataType::F32;
166
167 // clang-format off
168 acp.src_info = arm_compute::TensorInfo(
169 is_nspc ? arm_compute::TensorShape(ic, iw, ih, mb) :
170 arm_compute::TensorShape(iw, ih, ic, mb),
171 1,
172 acl_src_data_t,
173 acl_layout);
174
175 acp.wei_info = arm_compute::TensorInfo(
176 is_nspc ? arm_compute::TensorShape(ic, kw, kh, oc) :
177 arm_compute::TensorShape(kw, kh, ic, oc),
178 1,
179 acl_wei_data_t,
180 acl_layout);
181
182 acp.dst_info = arm_compute::TensorInfo(
183 is_nspc ? arm_compute::TensorShape(oc, ow, oh, mb) :
184 arm_compute::TensorShape(ow, oh, oc, mb),
185 1,
186 acl_dst_data_t,
187 acl_layout);
188
189 acp.bia_info = arm_compute::TensorInfo(
190 acp.with_bias ? arm_compute::TensorShape(oc)
191 : arm_compute::TensorShape(),
192 1,
193 acl_bia_data_t,
194 acl_layout);
195 // clang-format on
196
197 // Add quantization info to tensors
198 acp.is_int8 = utils::one_of(src_d.data_type(), s8, u8)
199 && wei_d.data_type() == s8;
200
201 if (acp.is_int8) {
202 const float *scales = attr.output_scales_.scales_;
203 acp.src_info.set_quantization_info(arm_compute::QuantizationInfo(1, 0));
204 acp.bia_info.set_quantization_info(arm_compute::QuantizationInfo(1, 0));
205 acp.wei_info.set_quantization_info(arm_compute::QuantizationInfo(1, 0));
206 acp.dst_info.set_quantization_info(
207 arm_compute::QuantizationInfo(1.0f / scales[0], 0));
208 }
209
210 // Post-convolutional operations (post-ops)
211 const auto &post_ops = attr.post_ops_;
212 // is_eltwise(true) here stands for eltwise.scale == 1.f check
213 acp.sum_with_eltwise = (post_ops.len() == 2) && post_ops.entry_[0].is_sum()
214 && post_ops.entry_[1].is_eltwise(true);
215 acp.act_info = acl_common_utils::get_acl_act(attr);
216
217 if (acp.sum_with_eltwise) {
218 // clang-format off
219 // Validate activation layer manually to check for return status
220 auto acl_al_st = arm_compute::NEActivationLayer::validate(
221 &acp.dst_info,
222 &acp.dst_info,
223 acp.act_info);
224 // clang-format on
225 if (acl_al_st.error_code() != arm_compute::ErrorCode::OK) {
226 MAYBE_REPORT_ACL_ERROR(acl_al_st.error_description().c_str());
227 return status::unimplemented;
228 }
229
230 // clang-format off
231 // Validate arithmetic addition manually to check for return status
232 auto acl_aa_st = arm_compute::NEArithmeticAddition::validate(
233 &acp.dst_info,
234 &acp.dst_info,
235 &acp.dst_info,
236 arm_compute::ConvertPolicy::SATURATE);
237 // clang-format on
238 if (acl_aa_st.error_code() != arm_compute::ErrorCode::OK) {
239 MAYBE_REPORT_ACL_ERROR(acl_aa_st.error_description().c_str());
240 return status::unimplemented;
241 }
242 }
243
244 return status::success;
245 }
246
init_conf_gemm(acl_conv_conf_t & acp,memory_desc_t & src_md,memory_desc_t & weights_md,memory_desc_t & dst_md,memory_desc_t & bias_md,const convolution_desc_t & cd,const primitive_attr_t & attr)247 status_t init_conf_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
248 memory_desc_t &weights_md, memory_desc_t &dst_md,
249 memory_desc_t &bias_md, const convolution_desc_t &cd,
250 const primitive_attr_t &attr) {
251
252 // General Compute Library checks, memory tags are also set there
253 CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));
254
255 // clang-format off
256 // Validate convolution manually to check for return status
257 auto acl_st = arm_compute::NEGEMMConvolutionLayer::validate(
258 &acp.src_info,
259 &acp.wei_info,
260 acp.with_bias ? &acp.bia_info : nullptr,
261 &acp.dst_info,
262 acp.padstride_info,
263 acp.weights_info,
264 acp.dilation_info,
265 acp.act_info,
266 acp.fast_math);
267 // clang-format on
268 if (acl_st.error_code() != arm_compute::ErrorCode::OK) {
269 MAYBE_REPORT_ACL_ERROR(acl_st.error_description().c_str());
270 return status::unimplemented;
271 }
272
273 return status::success;
274 }
275
init_conf_indirect_gemm(acl_conv_conf_t & acp,memory_desc_t & src_md,memory_desc_t & weights_md,memory_desc_t & dst_md,memory_desc_t & bias_md,const convolution_desc_t & cd,const primitive_attr_t & attr)276 status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
277 memory_desc_t &weights_md, memory_desc_t &dst_md,
278 memory_desc_t &bias_md, const convolution_desc_t &cd,
279 const primitive_attr_t &attr) {
280 // Indirect convolution results in slowdown for low thread count or 1x1
281 // kernels, so fall back to GEMM-based convolution in these cases
282 if (one_of(true, weights_md.dims[2] == 1, // kh
283 weights_md.dims[3] == 1, // kw
284 dnnl_get_max_threads() < 28)) {
285 return status::unimplemented;
286 }
287
288 CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));
289
290 // clang-format off
291 // NOTE: indirect convolution method supports only nhwc layout.
292 auto acl_st = arm_compute::NEGEMMConv2d::validate(
293 &acp.src_info,
294 &acp.wei_info,
295 acp.with_bias ? &acp.bia_info : nullptr,
296 &acp.dst_info,
297 arm_compute::Conv2dInfo(acp.padstride_info,
298 acp.dilation_info,
299 acp.act_info,
300 acp.fast_math,
301 1));
302 // clang-format on
303 if (acl_st.error_code() != arm_compute::ErrorCode::OK) {
304 MAYBE_REPORT_ACL_ERROR(acl_st.error_description().c_str());
305 return status::unimplemented;
306 }
307
308 return status::success;
309 }
310
init_conf_wino(acl_conv_conf_t & acp,memory_desc_t & src_md,memory_desc_t & weights_md,memory_desc_t & dst_md,memory_desc_t & bias_md,const convolution_desc_t & cd,const primitive_attr_t & attr)311 status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
312 memory_desc_t &weights_md, memory_desc_t &dst_md,
313 memory_desc_t &bias_md, const convolution_desc_t &cd,
314 const primitive_attr_t &attr) {
315
316 // Under these conditions, fallback to faster GEMM-based convolution
317 // unless the user explicitly specifies Winograd algorithm
318 // clang-format off
319 if (one_of(true, src_md.dims[2] > 112, // ih
320 src_md.dims[3] > 112, // iw
321 src_md.dims[1] < 64, // ic
322 dst_md.dims[1] < 64, // oc
323 dnnl_get_max_threads() > 28)
324 && cd.alg_kind == alg_kind::convolution_auto) {
325 return status::unimplemented;
326 }
327 // clang-format on
328
329 // General Compute Library checks, memory tags are also set there
330 CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));
331
332 const bool wino_shape_ok // unit strides only, no dilations
333 = (acp.padstride_info.stride() == std::pair<uint, uint> {1, 1})
334 && (acp.dilation_info == arm_compute::Size2D(1, 1));
335 if (!wino_shape_ok) return status::unimplemented;
336
337 // clang-format off
338 // Validate convolution manually to check for return status
339 auto acl_st = arm_compute::NEWinogradConvolutionLayer::validate(
340 &acp.src_info,
341 &acp.wei_info,
342 acp.with_bias ? &acp.bia_info : nullptr,
343 &acp.dst_info,
344 acp.padstride_info,
345 acp.act_info,
346 true); // enable_fast_math flag in ACL Winograd
347 // clang-format on
348 if (acl_st.error_code() != arm_compute::ErrorCode::OK) {
349 MAYBE_REPORT_ACL_ERROR(acl_st.error_description().c_str());
350 return status::unimplemented;
351 }
352
353 return status::success;
354 }
355
356 } // namespace acl_convolution_utils
357
358 } // namespace aarch64
359 } // namespace cpu
360 } // namespace impl
361 } // namespace dnnl
362