/*******************************************************************************
* Copyright 2020-2021 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "cpu/aarch64/acl_convolution_utils.hpp"
#include "oneapi/dnnl/dnnl.hpp"

namespace dnnl {
namespace impl {
namespace cpu {
namespace aarch64 {

namespace acl_convolution_utils {

using namespace dnnl::impl::status;
using namespace dnnl::impl::utils;
using namespace dnnl::impl::alg_kind;
using namespace prop_kind;
using namespace data_type;
using uint = unsigned int;

status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
        memory_desc_t &weights_md, memory_desc_t &dst_md,
        memory_desc_t &bias_md, const convolution_desc_t &cd,
        const primitive_attr_t &attr) {

    const memory_desc_wrapper src_d(&src_md);
    const memory_desc_wrapper wei_d(&weights_md);
    const memory_desc_wrapper dst_d(&dst_md);
    const memory_desc_wrapper bia_d(&bias_md);

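    // fpmath_mode::bf16 and fpmath_mode::any permit implicit down-conversion
    // of f32 to bf16, which maps onto Compute Library's fast_math mode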
    auto math_mode = get_fpmath_mode();
    acp.fast_math = one_of(math_mode, fpmath_mode::bf16, fpmath_mode::any);

    // Compute Library currently supports forward propagation only
    const prop_kind_t prop_kind = cd.prop_kind;
    const bool is_fwd = (prop_kind == dnnl_forward_training)
            || (prop_kind == dnnl_forward_inference);
    if (!is_fwd) return status::unimplemented;

    const int with_groups = wei_d.ndims() == src_d.ndims() + 1;
    const int ndims = src_d.ndims();
    const bool is_1d = ndims == 3;
    const bool is_3d = ndims == 5;
    bool is_nspc = false;

    // Compute Library unsupported shape scenarios
    if (one_of(true, is_3d, is_1d, with_groups)) {
        return status::unimplemented;
    }

    // batch size
    const int mb = src_d.dims()[0];

    // src/input  channels, height, width
    const int ic = src_d.dims()[1];
    const int ih = src_d.dims()[ndims - 2];
    const int iw = src_d.dims()[ndims - 1];

    // dst/output channels, height, width
    const int oc = dst_d.dims()[1];
    const int oh = dst_d.dims()[ndims - 2];
    const int ow = dst_d.dims()[ndims - 1];

    // weights height and width
    const int kh = wei_d.dims()[with_groups + ndims - 2];
    const int kw = wei_d.dims()[with_groups + ndims - 1];

    // height and width strides
    const int stride_h = cd.strides[ndims - 4];
    const int stride_w = cd.strides[ndims - 3];

    // height and width dilations
    int dilate_h = cd.dilates[ndims - 4];
    int dilate_w = cd.dilates[ndims - 3];
    // oneDNN dilations:          dk = 1 + (k_size - 1) * (dilate_size + 1)
    // Compute Library dilations: dk = dilate_size * (k_size - 1) + 1
    // thus acl_dilation = oneDNN_dilation + 1
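    // (e.g. a dense kernel has oneDNN dilation 0 and ACL dilation 1)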
    dilate_h += 1;
    dilate_w += 1;

    acp.dilation_info = arm_compute::Size2D(dilate_w, dilate_h);

    // left, right, top, bottom padding
    const int l_pad = cd.padding[0][1];
    const int t_pad = cd.padding[0][0];
    // Compute Library requires padding to be >= 0, whereas in oneDNN r_pad
    // (b_pad) may be -1 in cases where the rightmost (bottommost) spatial
    // elements of the input tensor are not used by the convolution. The
    // left (top) padding, on the other hand, is guaranteed to be
    // non-negative.
    const int r_pad = std::max(static_cast<int>(cd.padding[1][1]), 0);
    const int b_pad = std::max(static_cast<int>(cd.padding[1][0]), 0);

    acp.padstride_info = arm_compute::PadStrideInfo(stride_w, stride_h,
            static_cast<unsigned int>(l_pad), static_cast<unsigned int>(r_pad),
            static_cast<unsigned int>(t_pad), static_cast<unsigned int>(b_pad),
            arm_compute::DimensionRoundingType::FLOOR);

    acp.with_bias = cd.bias_desc.format_kind != format_kind::undef;

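    // Set any memory formats that were left as 'any' and check that src,
    // weights and dst all agree on an NHWC or NCHW layout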
    auto set_or_check_tags = [&](format_tag_t desired_src_tag,
                                     format_tag_t desired_dst_tag) -> status_t {
        using namespace format_tag;
        auto src_tag = any, dst_tag = any;

        if (src_d.format_kind() == format_kind::any) {
            CHECK(memory_desc_init_by_tag(src_md, desired_src_tag));
            src_tag = desired_src_tag;
        } else {
            src_tag = memory_desc_matches_one_of_tag(src_md, nhwc, nchw);
        }

        if (dst_d.format_kind() == format_kind::any) {
            CHECK(memory_desc_init_by_tag(dst_md, desired_dst_tag));
            dst_tag = desired_dst_tag;
        } else {
            dst_tag = memory_desc_matches_one_of_tag(dst_md, nhwc, nchw);
        }

        if (acp.with_bias && bias_md.format_kind == format_kind::any)
            CHECK(memory_desc_init_by_tag(bias_md, x));

        is_nspc = utils::one_of(src_tag, nhwc);

        memory_desc_t want_wei_md = weights_md;
        auto wei_tag = is_nspc ? ohwi : oihw;
        CHECK(memory_desc_init_by_tag(want_wei_md, wei_tag));

        // Compute Library does not support mismatching layouts
        if ((src_tag != wei_tag) || (src_tag != dst_tag))
            return status::unimplemented;

        if (weights_md.format_kind == format_kind::any) {
            weights_md = want_wei_md;
        }
        return (want_wei_md == weights_md) ? status::success
                                           : status::unimplemented;
    };

    auto default_dat_tag = format_tag::nhwc;
    if (set_or_check_tags(default_dat_tag, default_dat_tag) != status::success)
        return status::unimplemented;

    const auto acl_layout = is_nspc ? arm_compute::DataLayout::NHWC
                                    : arm_compute::DataLayout::NCHW;

    auto acl_src_data_t = acl_common_utils::get_acl_data_t(src_d.data_type());
    auto acl_wei_data_t = acl_common_utils::get_acl_data_t(wei_d.data_type());
    auto acl_dst_data_t = acl_common_utils::get_acl_data_t(dst_d.data_type());
    auto acl_bia_data_t = acl_common_utils::get_acl_data_t(bia_d.data_type());

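    // With no bias the bias data type resolves to UNKNOWN; fall back to F32
    // so that the (unused) TensorInfo constructed below is still valid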
    if (acl_bia_data_t == arm_compute::DataType::UNKNOWN)
        acl_bia_data_t = arm_compute::DataType::F32;

    // clang-format off
    acp.src_info = arm_compute::TensorInfo(
            is_nspc ? arm_compute::TensorShape(ic, iw, ih, mb) :
            arm_compute::TensorShape(iw, ih, ic, mb),
            1,
            acl_src_data_t,
            acl_layout);

    acp.wei_info = arm_compute::TensorInfo(
            is_nspc ? arm_compute::TensorShape(ic, kw, kh, oc) :
            arm_compute::TensorShape(kw, kh, ic, oc),
            1,
            acl_wei_data_t,
            acl_layout);

    acp.dst_info = arm_compute::TensorInfo(
            is_nspc ? arm_compute::TensorShape(oc, ow, oh, mb) :
            arm_compute::TensorShape(ow, oh, oc, mb),
            1,
            acl_dst_data_t,
            acl_layout);

    acp.bia_info = arm_compute::TensorInfo(
            acp.with_bias ? arm_compute::TensorShape(oc)
                          : arm_compute::TensorShape(),
            1,
            acl_bia_data_t,
            acl_layout);
    // clang-format on

    // Add quantization info to tensors
    acp.is_int8 = utils::one_of(src_d.data_type(), s8, u8)
            && wei_d.data_type() == s8;

    if (acp.is_int8) {
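        // src/wei/bia use identity quantization; the oneDNN output scale is
        // folded into the dst quantization info (an ACL dst scale of
        // 1/scales[0] multiplies the requantized result by scales[0])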
        const float *scales = attr.output_scales_.scales_;
        acp.src_info.set_quantization_info(arm_compute::QuantizationInfo(1, 0));
        acp.bia_info.set_quantization_info(arm_compute::QuantizationInfo(1, 0));
        acp.wei_info.set_quantization_info(arm_compute::QuantizationInfo(1, 0));
        acp.dst_info.set_quantization_info(
                arm_compute::QuantizationInfo(1.0f / scales[0], 0));
    }

    // Post-convolutional operations (post-ops)
    const auto &post_ops = attr.post_ops_;
    // is_eltwise(true) additionally checks that eltwise.scale == 1.f
    acp.sum_with_eltwise = (post_ops.len() == 2) && post_ops.entry_[0].is_sum()
            && post_ops.entry_[1].is_eltwise(true);
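    // When fused, the sum post-op maps to an in-place NEArithmeticAddition
    // on dst followed by an NEActivationLayer (both validated below)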
    acp.act_info = acl_common_utils::get_acl_act(attr);

    if (acp.sum_with_eltwise) {
        // clang-format off
        // Validate activation layer manually to check for return status
        auto acl_al_st = arm_compute::NEActivationLayer::validate(
            &acp.dst_info,
            &acp.dst_info,
            acp.act_info);
        // clang-format on
        if (acl_al_st.error_code() != arm_compute::ErrorCode::OK) {
            MAYBE_REPORT_ACL_ERROR(acl_al_st.error_description().c_str());
            return status::unimplemented;
        }

        // clang-format off
        // Validate arithmetic addition manually to check for return status
        auto acl_aa_st = arm_compute::NEArithmeticAddition::validate(
            &acp.dst_info,
            &acp.dst_info,
            &acp.dst_info,
            arm_compute::ConvertPolicy::SATURATE);
        // clang-format on
        if (acl_aa_st.error_code() != arm_compute::ErrorCode::OK) {
            MAYBE_REPORT_ACL_ERROR(acl_aa_st.error_description().c_str());
            return status::unimplemented;
        }
    }

    return status::success;
}

status_t init_conf_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
        memory_desc_t &weights_md, memory_desc_t &dst_md,
        memory_desc_t &bias_md, const convolution_desc_t &cd,
        const primitive_attr_t &attr) {

    // General Compute Library checks, memory tags are also set there
    CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));

    // clang-format off
    // Validate convolution manually to check for return status
    auto acl_st = arm_compute::NEGEMMConvolutionLayer::validate(
        &acp.src_info,
        &acp.wei_info,
        acp.with_bias ? &acp.bia_info : nullptr,
        &acp.dst_info,
        acp.padstride_info,
        acp.weights_info,
        acp.dilation_info,
        acp.act_info,
        acp.fast_math);
    // clang-format on
    if (acl_st.error_code() != arm_compute::ErrorCode::OK) {
        MAYBE_REPORT_ACL_ERROR(acl_st.error_description().c_str());
        return status::unimplemented;
    }

    return status::success;
}

status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
        memory_desc_t &weights_md, memory_desc_t &dst_md,
        memory_desc_t &bias_md, const convolution_desc_t &cd,
        const primitive_attr_t &attr) {
    // Indirect convolution results in a slowdown for low thread counts or
    // 1x1 kernels, so fall back to GEMM-based convolution in these cases
    if (one_of(true, weights_md.dims[2] == 1, // kh
                weights_md.dims[3] == 1, // kw
                dnnl_get_max_threads() < 28)) {
        return status::unimplemented;
    }

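    // General Compute Library checks, memory tags are also set there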
    CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));

    // clang-format off
    // NOTE: the indirect convolution method supports only the nhwc layout
    auto acl_st = arm_compute::NEGEMMConv2d::validate(
        &acp.src_info,
        &acp.wei_info,
        acp.with_bias ? &acp.bia_info : nullptr,
        &acp.dst_info,
        arm_compute::Conv2dInfo(acp.padstride_info,
                                acp.dilation_info,
                                acp.act_info,
                                acp.fast_math,
                                1));
    // clang-format on
    if (acl_st.error_code() != arm_compute::ErrorCode::OK) {
        MAYBE_REPORT_ACL_ERROR(acl_st.error_description().c_str());
        return status::unimplemented;
    }

    return status::success;
}

status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
        memory_desc_t &weights_md, memory_desc_t &dst_md,
        memory_desc_t &bias_md, const convolution_desc_t &cd,
        const primitive_attr_t &attr) {

    // Under these conditions, fall back to the faster GEMM-based convolution
    // unless the user explicitly requested the Winograd algorithm
    // clang-format off
    if (one_of(true, src_md.dims[2] > 112, // ih
                src_md.dims[3] > 112, // iw
                src_md.dims[1] < 64, // ic
                dst_md.dims[1] < 64, // oc
                dnnl_get_max_threads() > 28)
            && cd.alg_kind == alg_kind::convolution_auto) {
        return status::unimplemented;
    }
    // clang-format on

    // General Compute Library checks, memory tags are also set there
    CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));

    const bool wino_shape_ok // unit strides only, no dilations
            = (acp.padstride_info.stride() == std::pair<uint, uint> {1, 1})
            && (acp.dilation_info == arm_compute::Size2D(1, 1));
    if (!wino_shape_ok) return status::unimplemented;

    // clang-format off
    // Validate convolution manually to check for return status
    auto acl_st = arm_compute::NEWinogradConvolutionLayer::validate(
        &acp.src_info,
        &acp.wei_info,
        acp.with_bias ? &acp.bia_info : nullptr,
        &acp.dst_info,
        acp.padstride_info,
        acp.act_info,
        true); // enable_fast_math flag in ACL Winograd
    // clang-format on
    if (acl_st.error_code() != arm_compute::ErrorCode::OK) {
        MAYBE_REPORT_ACL_ERROR(acl_st.error_description().c_str());
        return status::unimplemented;
    }

    return status::success;
}

} // namespace acl_convolution_utils

} // namespace aarch64
} // namespace cpu
} // namespace impl
} // namespace dnnl