1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 // License Agreement
11 // For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
17 // Third party copyrights are property of their respective owners.
18 //
19 // Redistribution and use in source and binary forms, with or without modification,
20 // are permitted provided that the following conditions are met:
21 //
22 // * Redistribution's of source code must retain the above copyright notice,
23 // this list of conditions and the following disclaimer.
24 //
25 // * Redistribution's in binary form must reproduce the above copyright notice,
26 // this list of conditions and the following disclaimer in the documentation
27 // and/or other materials provided with the distribution.
28 //
29 // * The name of the copyright holders may not be used to endorse or promote products
30 // derived from this software without specific prior written permission.
31 //
32 // This software is provided by the copyright holders and contributors "as is" and
33 // any express or implied warranties, including, but not limited to, the implied
34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
35 // In no event shall the Intel Corporation or contributors be liable for any direct,
36 // indirect, incidental, special, exemplary, or consequential damages
37 // (including, but not limited to, procurement of substitute goods or services;
38 // loss of use, data, or profits; or business interruption) however caused
39 // and on any theory of liability, whether in contract, strict liability,
40 // or tort (including negligence or otherwise) arising in any way out of
41 // the use of this software, even if advised of the possibility of such damage.
42 //
43 //M*/
44
45 #ifndef OPENCV_HAL_INTRIN_NEON_HPP
46 #define OPENCV_HAL_INTRIN_NEON_HPP
47
48 #include <algorithm>
49 #include "opencv2/core/utility.hpp"
50
51 namespace cv
52 {
53
54 //! @cond IGNORED
55
56 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
57
58 #define CV_SIMD128 1
59 #if defined(__aarch64__) || defined(_M_ARM64)
60 #define CV_SIMD128_64F 1
61 #else
62 #define CV_SIMD128_64F 0
63 #endif
64
65 // The following macro checks if the code is being compiled for the
66 // AArch64 execution state of Armv8, to enable the 128-bit
67 // intrinsics. The macro `__ARM_64BIT_STATE` is the one recommended by
68 // the Arm C Language Extension (ACLE) specifications [1] to check the
// availability of 128-bit intrinsics, and it is supported by clang
70 // and gcc. The macro `_M_ARM64` is the equivalent one for Microsoft
71 // Visual Studio [2] .
72 //
73 // [1] https://developer.arm.com/documentation/101028/0012/13--Advanced-SIMD--Neon--intrinsics
74 // [2] https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros
75 #if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
76 #define CV_NEON_AARCH64 1
77 #else
78 #define CV_NEON_AARCH64 0
79 #endif
80
81 // TODO
82 #define CV_NEON_DOT 0
83
84 //////////// Utils ////////////
85
#if CV_SIMD128_64F
// Deinterleave two vectors: c receives the even-indexed lanes of (a, b)
// concatenated, d receives the odd-indexed lanes.
// AArch64 flavor: vuzp1q/vuzp2q produce each half directly.
#define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) \
inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
{ c = vuzp1q_##suffix(a, b); d = vuzp2q_##suffix(a, b); }
// Same operation for 64-bit (half-width) vectors.
#define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) \
inline void _v128_unzip(const _Tpv&a, const _Tpv&b, _Tpv& c, _Tpv& d) \
{ c = vuzp1_##suffix(a, b); d = vuzp2_##suffix(a, b); }
#else
// ARMv7 flavor: the single vuzpq instruction returns both halves as a
// two-element struct; unpack it into c and d.
#define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) \
inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
{ _Tpvx2 ab = vuzpq_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; }
#define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) \
inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
{ _Tpvx2 ab = vuzp_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; }
#endif
101
#if CV_SIMD128_64F
// Bit-for-bit reinterpret casts to/from float64x2_t, in the style of the
// standard vreinterpretq_* intrinsics. Only meaningful when the f64
// vector type exists (AArch64); a plain C-style cast is sufficient here
// because NEON vector types share the same 128-bit register layout.
#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) \
template <typename T> static inline \
_Tpv vreinterpretq_##suffix##_f64(T a) { return (_Tpv) a; } \
template <typename T> static inline \
float64x2_t vreinterpretq_f64_##suffix(T a) { return (float64x2_t) a; }
#else
// No float64x2_t on ARMv7 — expand to nothing.
#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix)
#endif
111
// Instantiate the utility helpers (unzip for full- and half-width vectors,
// plus the f64 reinterpret casts) for one element type / suffix.
#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(_Tpv, _Tpvl, suffix) \
OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix) \
OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpvl##_t, _Tpvl##x2_t, suffix) \
OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix)

// 64-bit integer lanes: no vuzp instructions exist for them, so only the
// reinterpret helpers are generated.
#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(_Tpv, _Tpvl, suffix) \
OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix)

// float64: full-width unzip only (no half-width variant, reinterprets are
// already provided by OPENCV_HAL_IMPL_NEON_REINTERPRET for other suffixes).
#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(_Tpv, _Tpvl, suffix) \
OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix)

OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint8x16, uint8x8, u8)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int8x16, int8x8, s8)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint16x8, uint16x4, u16)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int16x8, int16x4, s16)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint32x4, uint32x2, u32)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int32x4, int32x2, s32)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(float32x4, float32x2, f32)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(uint64x2, uint64x1, u64)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(int64x2, int64x1, s64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(float64x2, float64x1,f64)
#endif
135
136 //////////// Types ////////////
137
138 struct v_uint8x16
139 {
140 typedef uchar lane_type;
141 enum { nlanes = 16 };
142
v_uint8x16cv::v_uint8x16143 v_uint8x16() {}
v_uint8x16cv::v_uint8x16144 explicit v_uint8x16(uint8x16_t v) : val(v) {}
v_uint8x16cv::v_uint8x16145 v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
146 uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
147 {
148 uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
149 val = vld1q_u8(v);
150 }
get0cv::v_uint8x16151 uchar get0() const
152 {
153 return vgetq_lane_u8(val, 0);
154 }
155
156 uint8x16_t val;
157 };
158
159 struct v_int8x16
160 {
161 typedef schar lane_type;
162 enum { nlanes = 16 };
163
v_int8x16cv::v_int8x16164 v_int8x16() {}
v_int8x16cv::v_int8x16165 explicit v_int8x16(int8x16_t v) : val(v) {}
v_int8x16cv::v_int8x16166 v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
167 schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
168 {
169 schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
170 val = vld1q_s8(v);
171 }
get0cv::v_int8x16172 schar get0() const
173 {
174 return vgetq_lane_s8(val, 0);
175 }
176
177 int8x16_t val;
178 };
179
180 struct v_uint16x8
181 {
182 typedef ushort lane_type;
183 enum { nlanes = 8 };
184
v_uint16x8cv::v_uint16x8185 v_uint16x8() {}
v_uint16x8cv::v_uint16x8186 explicit v_uint16x8(uint16x8_t v) : val(v) {}
v_uint16x8cv::v_uint16x8187 v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
188 {
189 ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
190 val = vld1q_u16(v);
191 }
get0cv::v_uint16x8192 ushort get0() const
193 {
194 return vgetq_lane_u16(val, 0);
195 }
196
197 uint16x8_t val;
198 };
199
200 struct v_int16x8
201 {
202 typedef short lane_type;
203 enum { nlanes = 8 };
204
v_int16x8cv::v_int16x8205 v_int16x8() {}
v_int16x8cv::v_int16x8206 explicit v_int16x8(int16x8_t v) : val(v) {}
v_int16x8cv::v_int16x8207 v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
208 {
209 short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
210 val = vld1q_s16(v);
211 }
get0cv::v_int16x8212 short get0() const
213 {
214 return vgetq_lane_s16(val, 0);
215 }
216
217 int16x8_t val;
218 };
219
220 struct v_uint32x4
221 {
222 typedef unsigned lane_type;
223 enum { nlanes = 4 };
224
v_uint32x4cv::v_uint32x4225 v_uint32x4() {}
v_uint32x4cv::v_uint32x4226 explicit v_uint32x4(uint32x4_t v) : val(v) {}
v_uint32x4cv::v_uint32x4227 v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
228 {
229 unsigned v[] = {v0, v1, v2, v3};
230 val = vld1q_u32(v);
231 }
get0cv::v_uint32x4232 unsigned get0() const
233 {
234 return vgetq_lane_u32(val, 0);
235 }
236
237 uint32x4_t val;
238 };
239
240 struct v_int32x4
241 {
242 typedef int lane_type;
243 enum { nlanes = 4 };
244
v_int32x4cv::v_int32x4245 v_int32x4() {}
v_int32x4cv::v_int32x4246 explicit v_int32x4(int32x4_t v) : val(v) {}
v_int32x4cv::v_int32x4247 v_int32x4(int v0, int v1, int v2, int v3)
248 {
249 int v[] = {v0, v1, v2, v3};
250 val = vld1q_s32(v);
251 }
get0cv::v_int32x4252 int get0() const
253 {
254 return vgetq_lane_s32(val, 0);
255 }
256 int32x4_t val;
257 };
258
259 struct v_float32x4
260 {
261 typedef float lane_type;
262 enum { nlanes = 4 };
263
v_float32x4cv::v_float32x4264 v_float32x4() {}
v_float32x4cv::v_float32x4265 explicit v_float32x4(float32x4_t v) : val(v) {}
v_float32x4cv::v_float32x4266 v_float32x4(float v0, float v1, float v2, float v3)
267 {
268 float v[] = {v0, v1, v2, v3};
269 val = vld1q_f32(v);
270 }
get0cv::v_float32x4271 float get0() const
272 {
273 return vgetq_lane_f32(val, 0);
274 }
275 float32x4_t val;
276 };
277
278 struct v_uint64x2
279 {
280 typedef uint64 lane_type;
281 enum { nlanes = 2 };
282
v_uint64x2cv::v_uint64x2283 v_uint64x2() {}
v_uint64x2cv::v_uint64x2284 explicit v_uint64x2(uint64x2_t v) : val(v) {}
v_uint64x2cv::v_uint64x2285 v_uint64x2(uint64 v0, uint64 v1)
286 {
287 uint64 v[] = {v0, v1};
288 val = vld1q_u64(v);
289 }
get0cv::v_uint64x2290 uint64 get0() const
291 {
292 return vgetq_lane_u64(val, 0);
293 }
294 uint64x2_t val;
295 };
296
297 struct v_int64x2
298 {
299 typedef int64 lane_type;
300 enum { nlanes = 2 };
301
v_int64x2cv::v_int64x2302 v_int64x2() {}
v_int64x2cv::v_int64x2303 explicit v_int64x2(int64x2_t v) : val(v) {}
v_int64x2cv::v_int64x2304 v_int64x2(int64 v0, int64 v1)
305 {
306 int64 v[] = {v0, v1};
307 val = vld1q_s64(v);
308 }
get0cv::v_int64x2309 int64 get0() const
310 {
311 return vgetq_lane_s64(val, 0);
312 }
313 int64x2_t val;
314 };
315
#if CV_SIMD128_64F
// 128-bit vector of 2 double-precision float lanes (AArch64 only).
struct v_float64x2
{
    typedef double lane_type;
    enum { nlanes = 2 };

    // Lanes deliberately left uninitialized.
    v_float64x2() {}
    // Wrap an existing NEON register.
    explicit v_float64x2(float64x2_t v) : val(v) {}
    // Construct from individual lanes, lane 0 first.
    v_float64x2(double v0, double v1)
    {
        const double lanes[2] = {v0, v1};
        val = vld1q_f64(lanes);
    }
    // Value of lane 0.
    double get0() const { return vgetq_lane_f64(val, 0); }

    float64x2_t val;
};
#endif
336
// Per-type initializers and reinterpret casts:
//   v_setzero_<suffix>() — all lanes zero
//   v_setall_<suffix>(v) — broadcast scalar to all lanes
//   v_reinterpret_as_*   — bit-exact cast between vector types
// The same-suffix vreinterpretq (e.g. u8->u8) is an identity function so the
// macro below can be applied uniformly for every (source, dest) pair.
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \
inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \
inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \
inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \
inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vreinterpretq_u64_##suffix(v.val)); } \
inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \
inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); }

OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
#if CV_SIMD128_64F
// AArch64 only: casts into v_float64x2 (the f64 vreinterpretq helpers are
// generated by OPENCV_HAL_IMPL_NEON_REINTERPRET above).
#define OPENCV_HAL_IMPL_NEON_INIT_64(_Tpv, suffix) \
inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(vreinterpretq_f64_##suffix(v.val)); }
OPENCV_HAL_IMPL_NEON_INIT(float64x2, double, f64)
OPENCV_HAL_IMPL_NEON_INIT_64(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_INIT_64(int8x16, s8)
OPENCV_HAL_IMPL_NEON_INIT_64(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_INIT_64(int16x8, s16)
OPENCV_HAL_IMPL_NEON_INIT_64(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_INIT_64(int32x4, s32)
OPENCV_HAL_IMPL_NEON_INIT_64(uint64x2, u64)
OPENCV_HAL_IMPL_NEON_INIT_64(int64x2, s64)
OPENCV_HAL_IMPL_NEON_INIT_64(float32x4, f32)
OPENCV_HAL_IMPL_NEON_INIT_64(float64x2, f64)
#endif
375
// Narrowing "pack" operations: combine two wider vectors into one narrower
// vector (or store the narrowed half of one vector), with an optional
// rounding right shift first.
//   v_<pack>(a, b)            — narrow a and b, concatenate
//   v_<pack>_store(ptr, a)    — narrow a, store the half-vector to ptr
//   v_rshr_<pack><n>(a, b)    — rounding-shift right by n, then narrow
//   v_rshr_<pack><n>_store    — same, store variant
// `mov` / `rshr` supply the narrowing intrinsic (saturating vqmovn/vqrshrn
// for 8/16-bit results, plain vmovn/vrshrn for 32-bit results below).
#define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, pack, mov, rshr) \
inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
    hreg a1 = mov(a.val), b1 = mov(b.val); \
    return _Tpvec(vcombine_##suffix(a1, b1)); \
} \
inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
    hreg a1 = mov(a.val); \
    vst1_##suffix(ptr, a1); \
} \
template<int n> inline \
_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
    hreg a1 = rshr(a.val, n); \
    hreg b1 = rshr(b.val, n); \
    return _Tpvec(vcombine_##suffix(a1, b1)); \
} \
template<int n> inline \
void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
    hreg a1 = rshr(a.val, n); \
    vst1_##suffix(ptr, a1); \
}

OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, pack, vqmovn_u16, vqrshrn_n_u16)
OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, pack, vqmovn_s16, vqrshrn_n_s16)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, pack, vqmovn_u32, vqrshrn_n_u32)
OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, pack, vqmovn_s32, vqrshrn_n_s32)
OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, pack, vmovn_u64, vrshrn_n_u64)
OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, pack, vmovn_s64, vrshrn_n_s64)

// Signed-to-unsigned saturating packs (pack_u): vqmovun/vqrshrun clamp
// negative inputs to zero.
OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, pack_u, vqmovun_s16, vqrshrun_n_s16)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, pack_u, vqmovun_s32, vqrshrun_n_s32)
410
411 // pack boolean
412 inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
413 {
414 uint8x16_t ab = vcombine_u8(vmovn_u16(a.val), vmovn_u16(b.val));
415 return v_uint8x16(ab);
416 }
417
v_pack_b(const v_uint32x4 & a,const v_uint32x4 & b,const v_uint32x4 & c,const v_uint32x4 & d)418 inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
419 const v_uint32x4& c, const v_uint32x4& d)
420 {
421 uint16x8_t nab = vcombine_u16(vmovn_u32(a.val), vmovn_u32(b.val));
422 uint16x8_t ncd = vcombine_u16(vmovn_u32(c.val), vmovn_u32(d.val));
423 return v_uint8x16(vcombine_u8(vmovn_u16(nab), vmovn_u16(ncd)));
424 }
425
v_pack_b(const v_uint64x2 & a,const v_uint64x2 & b,const v_uint64x2 & c,const v_uint64x2 & d,const v_uint64x2 & e,const v_uint64x2 & f,const v_uint64x2 & g,const v_uint64x2 & h)426 inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
427 const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
428 const v_uint64x2& g, const v_uint64x2& h)
429 {
430 uint32x4_t ab = vcombine_u32(vmovn_u64(a.val), vmovn_u64(b.val));
431 uint32x4_t cd = vcombine_u32(vmovn_u64(c.val), vmovn_u64(d.val));
432 uint32x4_t ef = vcombine_u32(vmovn_u64(e.val), vmovn_u64(f.val));
433 uint32x4_t gh = vcombine_u32(vmovn_u64(g.val), vmovn_u64(h.val));
434
435 uint16x8_t abcd = vcombine_u16(vmovn_u32(ab), vmovn_u32(cd));
436 uint16x8_t efgh = vcombine_u16(vmovn_u32(ef), vmovn_u32(gh));
437 return v_uint8x16(vcombine_u8(vmovn_u16(abcd), vmovn_u16(efgh)));
438 }
439
// 4x4 matrix * vector product: returns m0*v[0] + m1*v[1] + m2*v[2] + m3*v[3],
// i.e. m0..m3 are the matrix columns. Implemented with by-lane
// multiply-accumulate so v never has to be splatted.
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
    float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);   // m0 * v[0]
    res = vmlaq_lane_f32(res, m1.val, vl, 1);          // + m1 * v[1]
    res = vmlaq_lane_f32(res, m2.val, vh, 0);          // + m2 * v[2]
    res = vmlaq_lane_f32(res, m3.val, vh, 1);          // + m3 * v[3]
    return v_float32x4(res);
}

// Affine variant: m0*v[0] + m1*v[1] + m2*v[2] + a (bias instead of a 4th
// column; v[3] is ignored).
inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
    float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);   // m0 * v[0]
    res = vmlaq_lane_f32(res, m1.val, vl, 1);          // + m1 * v[1]
    res = vmlaq_lane_f32(res, m2.val, vh, 0);          // + m2 * v[2]
    res = vaddq_f32(res, a.val);                       // + bias
    return v_float32x4(res);
}
463
// Generate `operator op` and the compound `operator op=` for one vector type
// from a single binary NEON intrinsic.
#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    a.val = intrin(a.val, b.val); \
    return a; \
}
474
// 8/16-bit integer + and - are SATURATING (vqadd/vqsub) per the universal
// intrinsics contract; 32/64-bit integer ops wrap around.
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint32x4, vaddq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint32x4, vsubq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint32x4, vmulq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64)
#if CV_SIMD128_64F
// AArch64 has native vector division.
OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float32x4, vdivq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float64x2, vaddq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float64x2, vsubq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float64x2, vmulq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float64x2, vdivq_f64)
#else
// ARMv7 has no vector divide: approximate a/b as a * (1/b), where 1/b is
// the vrecpe estimate refined by two Newton-Raphson steps (vrecps).
// NOTE: result is approximate, not IEEE-exact division.
inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
{
    float32x4_t reciprocal = vrecpeq_f32(b.val);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    return v_float32x4(vmulq_f32(a.val, reciprocal));
}
inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
{
    // Same reciprocal refinement as operator/, applied in place.
    float32x4_t reciprocal = vrecpeq_f32(b.val);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    a.val = vmulq_f32(a.val, reciprocal);
    return a;
}
#endif
519
520 // saturating multiply 8-bit, 16-bit
// Saturating 8/16-bit multiply: widen the products via v_mul_expand, then
// narrow back with the saturating v_pack, so overflow clamps instead of
// wrapping.
#define OPENCV_HAL_IMPL_NEON_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpwvec c, d; \
    v_mul_expand(a, b, c, d); \
    return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }

OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int8x16, v_int16x8)
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint8x16, v_uint16x8)
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int16x8, v_int32x4)
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint16x8, v_uint32x4)
535
536 // Multiply and expand
// Widening multiply: c receives the products of the low half of a and b
// (at double lane width), d the products of the high half. On AArch64 the
// vmull_high_* forms avoid an explicit vget_high extraction.
inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
                         v_int16x8& c, v_int16x8& d)
{
    c.val = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
#if CV_NEON_AARCH64
    d.val = vmull_high_s8(a.val, b.val);
#else // #if CV_NEON_AARCH64
    d.val = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
#endif // #if CV_NEON_AARCH64
}

// u8 x u8 -> u16 widening multiply.
inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
                         v_uint16x8& c, v_uint16x8& d)
{
    c.val = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
#if CV_NEON_AARCH64
    d.val = vmull_high_u8(a.val, b.val);
#else // #if CV_NEON_AARCH64
    d.val = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
#endif // #if CV_NEON_AARCH64
}

// s16 x s16 -> s32 widening multiply.
inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
    c.val = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
#if CV_NEON_AARCH64
    d.val = vmull_high_s16(a.val, b.val);
#else // #if CV_NEON_AARCH64
    d.val = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
#endif // #if CV_NEON_AARCH64
}

// u16 x u16 -> u32 widening multiply.
inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                         v_uint32x4& c, v_uint32x4& d)
{
    c.val = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
#if CV_NEON_AARCH64
    d.val = vmull_high_u16(a.val, b.val);
#else // #if CV_NEON_AARCH64
    d.val = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
#endif // #if CV_NEON_AARCH64
}

// u32 x u32 -> u64 widening multiply.
inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                         v_uint64x2& c, v_uint64x2& d)
{
    c.val = vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val));
#if CV_NEON_AARCH64
    d.val = vmull_high_u32(a.val, b.val);
#else // #if CV_NEON_AARCH64
    d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
#endif // #if CV_NEON_AARCH64
}
591
// High half of the 16x16->32 product for each lane: widen-multiply, then
// take bits [31:16] via a narrowing shift-right by 16.
inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{
    return v_int16x8(vcombine_s16(
                                  vshrn_n_s32(vmull_s16( vget_low_s16(a.val),  vget_low_s16(b.val)), 16),
                                  vshrn_n_s32(
#if CV_NEON_AARCH64
                                              vmull_high_s16(a.val, b.val)
#else // #if CV_NEON_AARCH64
                                              vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val))
#endif // #if CV_NEON_AARCH64
                                              , 16)
                                 ));
}
// Unsigned variant of v_mul_hi.
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{
    return v_uint16x8(vcombine_u16(
                                   vshrn_n_u32(vmull_u16( vget_low_u16(a.val),  vget_low_u16(b.val)), 16),
                                   vshrn_n_u32(
#if CV_NEON_AARCH64
                                               vmull_high_u16(a.val, b.val)
#else // #if CV_NEON_AARCH64
                                               vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val))
#endif // #if CV_NEON_AARCH64
                                               , 16)
                                  ));
}
618
619 //////// Dot Product ////////
620
621 // 16 >> 32
// Dot product of adjacent 16-bit lane pairs -> 32-bit:
// result[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1].
// The unzip separates even/odd lanes of a and b so two widening
// multiply-accumulates compute all pair sums.
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    int16x8_t uzp1, uzp2;
    _v128_unzip(a.val, b.val, uzp1, uzp2);
    int16x4_t a0 = vget_low_s16(uzp1);   // even lanes of a
    int16x4_t b0 = vget_high_s16(uzp1);  // even lanes of b
    int16x4_t a1 = vget_low_s16(uzp2);   // odd lanes of a
    int16x4_t b1 = vget_high_s16(uzp2);  // odd lanes of b
    int32x4_t p = vmull_s16(a0, b0);
    return v_int32x4(vmlal_s16(p, a1, b1));
}
// Same, with accumulator c added in.
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    int16x8_t uzp1, uzp2;
    _v128_unzip(a.val, b.val, uzp1, uzp2);
    int16x4_t a0 = vget_low_s16(uzp1);
    int16x4_t b0 = vget_high_s16(uzp1);
    int16x4_t a1 = vget_low_s16(uzp2);
    int16x4_t b1 = vget_high_s16(uzp2);
    int32x4_t p = vmlal_s16(c.val, a0, b0);
    return v_int32x4(vmlal_s16(p, a1, b1));
}
644
645 // 32 >> 64
// Dot product of adjacent 32-bit lane pairs -> 64-bit:
// result[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1]. Same even/odd unzip scheme as
// the 16-bit version above.
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{
    int32x4_t uzp1, uzp2;
    _v128_unzip(a.val, b.val, uzp1, uzp2);
    int32x2_t a0 = vget_low_s32(uzp1);   // even lanes of a
    int32x2_t b0 = vget_high_s32(uzp1);  // even lanes of b
    int32x2_t a1 = vget_low_s32(uzp2);   // odd lanes of a
    int32x2_t b1 = vget_high_s32(uzp2);  // odd lanes of b
    int64x2_t p = vmull_s32(a0, b0);
    return v_int64x2(vmlal_s32(p, a1, b1));
}
// Same, with accumulator c added in.
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
    int32x4_t uzp1, uzp2;
    _v128_unzip(a.val, b.val, uzp1, uzp2);
    int32x2_t a0 = vget_low_s32(uzp1);
    int32x2_t b0 = vget_high_s32(uzp1);
    int32x2_t a1 = vget_low_s32(uzp2);
    int32x2_t b1 = vget_high_s32(uzp2);
    int64x2_t p = vmlal_s32(c.val, a0, b0);
    return v_int64x2(vmlal_s32(p, a1, b1));
}
668
669 // 8 >> 32
// Dot product of 4 adjacent u8 lanes -> u32:
// result[i] = sum of a[4i+k]*b[4i+k] for k = 0..3.
// Uses the UDOT instruction when CV_NEON_DOT is enabled (currently 0, see
// the TODO above); otherwise an even/odd byte masking scheme.
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
{
#if CV_NEON_DOT
    return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val));
#else
    const uint8x16_t zero   = vreinterpretq_u8_u32(vdupq_n_u32(0));
    const uint8x16_t mask   = vreinterpretq_u8_u32(vdupq_n_u32(0x00FF00FF));
    const uint16x8_t zero32 = vreinterpretq_u16_u32(vdupq_n_u32(0));
    const uint16x8_t mask32 = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF));

    // even: products of even-indexed bytes, held in u16 lanes (mask selects
    // the even byte of each 16-bit lane, zeroing the odd byte).
    uint16x8_t even = vmulq_u16(vreinterpretq_u16_u8(vbslq_u8(mask, a.val, zero)),
                                vreinterpretq_u16_u8(vbslq_u8(mask, b.val, zero)));
    // odd: products of odd-indexed bytes (brought down by a 8-bit shift).
    uint16x8_t odd  = vmulq_u16(vshrq_n_u16(vreinterpretq_u16_u8(a.val), 8),
                                vshrq_n_u16(vreinterpretq_u16_u8(b.val), 8));

    // Repeat the even/odd split at 16-bit granularity to fold the four
    // byte-products of each 32-bit group into one u32 lane.
    uint32x4_t s0 = vaddq_u32(vreinterpretq_u32_u16(vbslq_u16(mask32, even, zero32)),
                              vreinterpretq_u32_u16(vbslq_u16(mask32, odd, zero32)));
    uint32x4_t s1 = vaddq_u32(vshrq_n_u32(vreinterpretq_u32_u16(even), 16),
                              vshrq_n_u32(vreinterpretq_u32_u16(odd), 16));
    return v_uint32x4(vaddq_u32(s0, s1));
#endif
}
// Same, with accumulator c added in.
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
                                   const v_uint32x4& c)
{
#if CV_NEON_DOT
    return v_uint32x4(vdotq_u32(c.val, a.val, b.val));
#else
    return v_dotprod_expand(a, b) + c;
#endif
}
701
// Dot product of 4 adjacent s8 lanes -> s32. SDOT when CV_NEON_DOT is
// enabled; otherwise widen-multiply both halves, then fold with two
// unzip+add rounds (16-bit pairs, then 32-bit pairs).
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
#if CV_NEON_DOT
    return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val));
#else
    int16x8_t p0 = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
    int16x8_t p1 = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
    // First fold: sum adjacent 16-bit products pairwise.
    int16x8_t uzp1, uzp2;
    _v128_unzip(p0, p1, uzp1, uzp2);
    int16x8_t sum = vaddq_s16(uzp1, uzp2);
    // Second fold: sum adjacent pairs again, widening to 32-bit.
    int16x4_t uzpl1, uzpl2;
    _v128_unzip(vget_low_s16(sum), vget_high_s16(sum), uzpl1, uzpl2);
    return v_int32x4(vaddl_s16(uzpl1, uzpl2));
#endif
}
// Same, with accumulator c added in.
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
                                  const v_int32x4& c)
{
#if CV_NEON_DOT
    return v_int32x4(vdotq_s32(c.val, a.val, b.val));
#else
    return v_dotprod_expand(a, b) + c;
#endif
}
726
727 // 16 >> 64
// Dot product of 4 adjacent u16 lanes -> u64: mask/shift splits the
// even- and odd-indexed u16 lanes, multiplies them as u32, then two
// widening pairwise additions fold each group of four into a u64 lane.
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
{
    const uint16x8_t zero = vreinterpretq_u16_u32(vdupq_n_u32(0));
    const uint16x8_t mask = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF));

    // Products of even-indexed u16 lanes, in u32 lanes.
    uint32x4_t even = vmulq_u32(vreinterpretq_u32_u16(vbslq_u16(mask, a.val, zero)),
                                vreinterpretq_u32_u16(vbslq_u16(mask, b.val, zero)));
    // Products of odd-indexed u16 lanes.
    uint32x4_t odd  = vmulq_u32(vshrq_n_u32(vreinterpretq_u32_u16(a.val), 16),
                                vshrq_n_u32(vreinterpretq_u32_u16(b.val), 16));
    uint32x4_t uzp1, uzp2;
    _v128_unzip(even, odd, uzp1, uzp2);
    uint64x2_t s0 = vaddl_u32(vget_low_u32(uzp1), vget_high_u32(uzp1));
    uint64x2_t s1 = vaddl_u32(vget_low_u32(uzp2), vget_high_u32(uzp2));
    return v_uint64x2(vaddq_u64(s0, s1));
}
// Same, with accumulator c added in.
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand(a, b) + c; }
745
// Dot product of 4 adjacent s16 lanes -> s64: widen-multiply both halves,
// then fold twice with unzip+add (32-bit pairs, then widening to 64-bit).
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
    int32x4_t p0 = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    int32x4_t p1 = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));

    // First fold: sum adjacent 32-bit products pairwise.
    int32x4_t uzp1, uzp2;
    _v128_unzip(p0, p1, uzp1, uzp2);
    int32x4_t sum = vaddq_s32(uzp1, uzp2);

    // Second fold: sum adjacent pairs, widening to 64-bit.
    int32x2_t uzpl1, uzpl2;
    _v128_unzip(vget_low_s32(sum), vget_high_s32(sum), uzpl1, uzpl2);
    return v_int64x2(vaddl_s32(uzpl1, uzpl2));
}
// Same, with accumulator c added in.
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
                                  const v_int64x2& c)
{ return v_dotprod_expand(a, b) + c; }
762
763 // 32 >> 64f
#if CV_SIMD128_64F
// s32 pair dot product converted to double lanes (AArch64 only).
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
// Same, with accumulator c added in.
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
                                    const v_float64x2& c)
{ return v_dotprod_expand(a, b) + c; }
#endif
771
772 //////// Fast Dot Product ////////
773
774 // 16 >> 32
// "Fast" dot product: same lane sums as v_dotprod but WITHOUT the
// even/odd unzip, so the output lane ORDER differs — valid where the
// caller only reduces the result. Here: low-half products accumulated
// with high-half products per lane.
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
{
#if CV_NEON_AARCH64
    int32x4_t p = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    return v_int32x4(vmlal_high_s16(p, a.val, b.val));
#else
    int16x4_t a0 = vget_low_s16(a.val);
    int16x4_t a1 = vget_high_s16(a.val);
    int16x4_t b0 = vget_low_s16(b.val);
    int16x4_t b1 = vget_high_s16(b.val);
    int32x4_t p = vmull_s16(a0, b0);
    return v_int32x4(vmlal_s16(p, a1, b1));
#endif
}
// Same, with accumulator c added in.
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
#if CV_NEON_AARCH64
    int32x4_t p = vmlal_s16(c.val, vget_low_s16(a.val), vget_low_s16(b.val));
    return v_int32x4(vmlal_high_s16(p, a.val, b.val));
#else
    int16x4_t a0 = vget_low_s16(a.val);
    int16x4_t a1 = vget_high_s16(a.val);
    int16x4_t b0 = vget_low_s16(b.val);
    int16x4_t b1 = vget_high_s16(b.val);
    int32x4_t p = vmlal_s16(c.val, a0, b0);
    return v_int32x4(vmlal_s16(p, a1, b1));
#endif
}
803
804 // 32 >> 64
v_dotprod_fast(const v_int32x4 & a,const v_int32x4 & b)805 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
806 {
807 #if CV_NEON_AARCH64
808 int64x2_t p = vmull_s32(vget_low_s32(a.val), vget_low_s32(b.val));
809 return v_int64x2(vmlal_high_s32(p, a.val, b.val));
810 #else
811 int32x2_t a0 = vget_low_s32(a.val);
812 int32x2_t a1 = vget_high_s32(a.val);
813 int32x2_t b0 = vget_low_s32(b.val);
814 int32x2_t b1 = vget_high_s32(b.val);
815 int64x2_t p = vmull_s32(a0, b0);
816 return v_int64x2(vmlal_s32(p, a1, b1));
817 #endif
818 }
v_dotprod_fast(const v_int32x4 & a,const v_int32x4 & b,const v_int64x2 & c)819 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
820 {
821 #if CV_NEON_AARCH64
822 int64x2_t p = vmlal_s32(c.val, vget_low_s32(a.val), vget_low_s32(b.val));
823 return v_int64x2(vmlal_high_s32(p, a.val, b.val));
824 #else
825 int32x2_t a0 = vget_low_s32(a.val);
826 int32x2_t a1 = vget_high_s32(a.val);
827 int32x2_t b0 = vget_low_s32(b.val);
828 int32x2_t b1 = vget_high_s32(b.val);
829 int64x2_t p = vmlal_s32(c.val, a0, b0);
830 return v_int64x2(vmlal_s32(p, a1, b1));
831 #endif
832 }
833
// 8 >> 32
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
{
#if CV_NEON_DOT
    // Single UDOT instruction when the dot-product extension is available.
    return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val));
#else
    uint16x8_t p0 = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
    uint16x8_t p1 = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
    uint32x4_t s0 = vaddl_u16(vget_low_u16(p0), vget_low_u16(p1));
    uint32x4_t s1 = vaddl_u16(vget_high_u16(p0), vget_high_u16(p1));
    return v_uint32x4(vaddq_u32(s0, s1));
#endif
}
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{
#if CV_NEON_DOT
    return v_uint32x4(vdotq_u32(c.val, a.val, b.val)); // accumulate directly via UDOT
#else
    return v_dotprod_expand_fast(a, b) + c;
#endif
}

inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{
#if CV_NEON_DOT
    return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val));
#else
    // NOTE(review): accumulating two s8*s8 products in s16 can wrap for the
    // extreme case a == b == -128 (16384+16384 > INT16_MAX); the "fast"
    // variants appear to trade exactness on such edges for speed — confirm
    // against the HAL contract.
    int16x8_t prod = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
    prod = vmlal_s8(prod, vget_high_s8(a.val), vget_high_s8(b.val));
    return v_int32x4(vaddl_s16(vget_low_s16(prod), vget_high_s16(prod)));
#endif
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{
#if CV_NEON_DOT
    return v_int32x4(vdotq_s32(c.val, a.val, b.val));
#else
    return v_dotprod_expand_fast(a, b) + c;
#endif
}

// 16 >> 64
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
{
    uint32x4_t p0 = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
    uint32x4_t p1 = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
    uint64x2_t s0 = vaddl_u32(vget_low_u32(p0), vget_high_u32(p0));
    uint64x2_t s1 = vaddl_u32(vget_low_u32(p1), vget_high_u32(p1));
    return v_uint64x2(vaddq_u64(s0, s1));
}
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }

inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{
    int32x4_t prod = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    prod = vmlal_s16(prod, vget_high_s16(a.val), vget_high_s16(b.val));
    return v_int64x2(vaddl_s32(vget_low_s32(prod), vget_high_s32(prod)));
}
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }

// 32 >> 64f
#if CV_SIMD128_64F
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod_fast(a, b)); }
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }
#endif
903
904
// Bitwise &, |, ^, ~ for all 128-bit integer vector types. &, |, ^ come from
// the generic binary-op macro; ~ is done on a u8 reinterpretation because
// bitwise NOT is type-agnostic (and vmvnq has no 64-bit form — see the
// vmvnq_s64/vmvnq_u64 helpers defined further below).
#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ \
    return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \
}

OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint8x16, u8)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint16x8, u16)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint32x4, u32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)

// Bitwise ops on f32 vectors: NEON has no float bitwise intrinsics, so lanes
// are reinterpreted as s32, operated on, and reinterpreted back.
#define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
    return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
    a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \
    return a; \
}

OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32)

// Bitwise NOT of an f32 vector (same s32-reinterpret trick).
inline v_float32x4 operator ~ (const v_float32x4& a)
{
    return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));
}
942
943 #if CV_SIMD128_64F
v_sqrt(const v_float32x4 & x)944 inline v_float32x4 v_sqrt(const v_float32x4& x)
945 {
946 return v_float32x4(vsqrtq_f32(x.val));
947 }
948
v_invsqrt(const v_float32x4 & x)949 inline v_float32x4 v_invsqrt(const v_float32x4& x)
950 {
951 v_float32x4 one = v_setall_f32(1.0f);
952 return one / v_sqrt(x);
953 }
954 #else
v_sqrt(const v_float32x4 & x)955 inline v_float32x4 v_sqrt(const v_float32x4& x)
956 {
957 float32x4_t x1 = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN));
958 float32x4_t e = vrsqrteq_f32(x1);
959 e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
960 e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
961 return v_float32x4(vmulq_f32(x.val, e));
962 }
963
v_invsqrt(const v_float32x4 & x)964 inline v_float32x4 v_invsqrt(const v_float32x4& x)
965 {
966 float32x4_t e = vrsqrteq_f32(x.val);
967 e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
968 e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
969 return v_float32x4(e);
970 }
971 #endif
972
973 #define OPENCV_HAL_IMPL_NEON_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
974 inline _Tpuvec v_abs(const _Tpsvec& a) { return v_reinterpret_as_##usuffix(_Tpsvec(vabsq_##ssuffix(a.val))); }
975
OPENCV_HAL_IMPL_NEON_ABS(v_uint8x16,v_int8x16,u8,s8)976 OPENCV_HAL_IMPL_NEON_ABS(v_uint8x16, v_int8x16, u8, s8)
977 OPENCV_HAL_IMPL_NEON_ABS(v_uint16x8, v_int16x8, u16, s16)
978 OPENCV_HAL_IMPL_NEON_ABS(v_uint32x4, v_int32x4, u32, s32)
979
980 inline v_float32x4 v_abs(v_float32x4 x)
981 { return v_float32x4(vabsq_f32(x.val)); }
982
#if CV_SIMD128_64F
// Bitwise ops on f64 vectors via s64 reinterpretation (no float bitwise intrinsics).
#define OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(bin_op, intrin) \
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
{ \
    return v_float64x2(vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val)))); \
} \
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
{ \
    a.val = vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val))); \
    return a; \
}

OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(&, vandq_s64)
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(|, vorrq_s64)
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(^, veorq_s64)

// Bitwise NOT: performed on s32 lanes since vmvnq has no 64-bit form;
// the result is identical because NOT is lane-size-agnostic.
inline v_float64x2 operator ~ (const v_float64x2& a)
{
    return v_float64x2(vreinterpretq_f64_s32(vmvnq_s32(vreinterpretq_s32_f64(a.val))));
}

inline v_float64x2 v_sqrt(const v_float64x2& x)
{
    return v_float64x2(vsqrtq_f64(x.val));
}

// 1/sqrt(x) from the exact sqrt plus a division.
inline v_float64x2 v_invsqrt(const v_float64x2& x)
{
    v_float64x2 one = v_setall_f64(1.0); // double literal (was 1.0f; same value)
    return one / v_sqrt(x);
}

inline v_float64x2 v_abs(v_float64x2 x)
{ return v_float64x2(vabsq_f64(x.val)); }
#endif
1018
1019 // TODO: exp, log, sin, cos
1020
1021 #define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \
1022 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
1023 { \
1024 return _Tpvec(intrin(a.val, b.val)); \
1025 }
1026
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16,v_min,vminq_u8)1027 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8)
1028 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8)
1029 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8)
1030 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8)
1031 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16)
1032 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16)
1033 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16)
1034 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16)
1035 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32)
1036 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32)
1037 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
1038 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
1039 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
1040 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32)
1041 #if CV_SIMD128_64F
1042 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_min, vminq_f64)
1043 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_max, vmaxq_f64)
1044 #endif
1045
1046 #if CV_SIMD128_64F
1047 inline int64x2_t vmvnq_s64(int64x2_t a)
1048 {
1049 int64x2_t vx = vreinterpretq_s64_u32(vdupq_n_u32(0xFFFFFFFF));
1050 return veorq_s64(a, vx);
1051 }
vmvnq_u64(uint64x2_t a)1052 inline uint64x2_t vmvnq_u64(uint64x2_t a)
1053 {
1054 uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
1055 return veorq_u64(a, vx);
1056 }
1057 #endif
1058 #define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
1059 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
1060 { return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
1061 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
1062 { return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \
1063 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
1064 { return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \
1065 inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
1066 { return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \
1067 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
1068 { return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \
1069 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
1070 { return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); }
1071
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16,OPENCV_HAL_NOP,u8,u8)1072 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
1073 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8)
1074 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16)
1075 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
1076 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
1077 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
1078 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
1079 #if CV_SIMD128_64F
1080 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint64x2, OPENCV_HAL_NOP, u64, u64)
1081 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64)
1082 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float64x2, vreinterpretq_f64_u64, f64, u64)
1083 #endif
1084
1085 inline v_float32x4 v_not_nan(const v_float32x4& a)
1086 { return v_float32x4(vreinterpretq_f32_u32(vceqq_f32(a.val, a.val))); }
1087 #if CV_SIMD128_64F
v_not_nan(const v_float64x2 & a)1088 inline v_float64x2 v_not_nan(const v_float64x2& a)
1089 { return v_float64x2(vreinterpretq_f64_u64(vceqq_f64(a.val, a.val))); }
1090 #endif
1091
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16,v_add_wrap,vaddq_u8)1092 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)
1093 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
1094 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
1095 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16)
1096 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
1097 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
1098 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
1099 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)
1100 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_mul_wrap, vmulq_u8)
1101 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_mul_wrap, vmulq_s8)
1102 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_mul_wrap, vmulq_u16)
1103 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mul_wrap, vmulq_s16)
1104
1105 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
1106 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
1107 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
1108 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
1109 #if CV_SIMD128_64F
1110 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_absdiff, vabdq_f64)
1111 #endif
1112
1113 /** Saturating absolute difference **/
1114 inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
1115 { return v_int8x16(vqabsq_s8(vqsubq_s8(a.val, b.val))); }
v_absdiffs(const v_int16x8 & a,const v_int16x8 & b)1116 inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
1117 { return v_int16x8(vqabsq_s16(vqsubq_s16(a.val, b.val))); }
1118
1119 #define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
1120 inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
1121 { \
1122 return _Tpvec2(cast(intrin(a.val, b.val))); \
1123 }
1124
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int8x16,v_uint8x16,vreinterpretq_u8_s8,v_absdiff,vabdq_s8)1125 OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int8x16, v_uint8x16, vreinterpretq_u8_s8, v_absdiff, vabdq_s8)
1126 OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int16x8, v_uint16x8, vreinterpretq_u16_s16, v_absdiff, vabdq_s16)
1127 OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int32x4, v_uint32x4, vreinterpretq_u32_s32, v_absdiff, vabdq_s32)
1128
1129 inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
1130 {
1131 v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
1132 return v_sqrt(x);
1133 }
1134
v_sqr_magnitude(const v_float32x4 & a,const v_float32x4 & b)1135 inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
1136 {
1137 return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
1138 }
1139
v_fma(const v_float32x4 & a,const v_float32x4 & b,const v_float32x4 & c)1140 inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
1141 {
1142 #if CV_SIMD128_64F
1143 // ARMv8, which adds support for 64-bit floating-point (so CV_SIMD128_64F is defined),
1144 // also adds FMA support both for single- and double-precision floating-point vectors
1145 return v_float32x4(vfmaq_f32(c.val, a.val, b.val));
1146 #else
1147 return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
1148 #endif
1149 }
1150
v_fma(const v_int32x4 & a,const v_int32x4 & b,const v_int32x4 & c)1151 inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1152 {
1153 return v_int32x4(vmlaq_s32(c.val, a.val, b.val));
1154 }
1155
v_muladd(const v_float32x4 & a,const v_float32x4 & b,const v_float32x4 & c)1156 inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
1157 {
1158 return v_fma(a, b, c);
1159 }
1160
v_muladd(const v_int32x4 & a,const v_int32x4 & b,const v_int32x4 & c)1161 inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1162 {
1163 return v_fma(a, b, c);
1164 }
1165
1166 #if CV_SIMD128_64F
v_magnitude(const v_float64x2 & a,const v_float64x2 & b)1167 inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
1168 {
1169 v_float64x2 x(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
1170 return v_sqrt(x);
1171 }
1172
v_sqr_magnitude(const v_float64x2 & a,const v_float64x2 & b)1173 inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
1174 {
1175 return v_float64x2(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
1176 }
1177
v_fma(const v_float64x2 & a,const v_float64x2 & b,const v_float64x2 & c)1178 inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
1179 {
1180 return v_float64x2(vfmaq_f64(c.val, a.val, b.val));
1181 }
1182
v_muladd(const v_float64x2 & a,const v_float64x2 & b,const v_float64x2 & c)1183 inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
1184 {
1185 return v_fma(a, b, c);
1186 }
1187 #endif
1188
1189 // trade efficiency for convenience
1190 #define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
1191 inline _Tpvec operator << (const _Tpvec& a, int n) \
1192 { return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
1193 inline _Tpvec operator >> (const _Tpvec& a, int n) \
1194 { return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \
1195 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
1196 { return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
1197 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
1198 { return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
1199 template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
1200 { return _Tpvec(vrshrq_n_##suffix(a.val, n)); }
1201
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16,u8,schar,s8)1202 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
1203 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
1204 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, u16, short, s16)
1205 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, s16, short, s16)
1206 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, u32, int, s32)
1207 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32)
1208 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64)
1209 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)
1210
// Whole-vector lane rotation built on VEXT (concatenate-and-extract).
// Single-argument forms shift zeros in; two-argument forms shift lanes of b in.
// The n == 0 left-rotate specializations are required because the generated
// VEXT immediate would be _Tpvec::nlanes, which is out of range.
#define OPENCV_HAL_IMPL_NEON_ROTATE_OP(_Tpvec, suffix) \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
{ return a; } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); } \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
{ CV_UNUSED(b); return a; }

OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint8x16, u8)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int8x16, s8)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint16x8, u16)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int16x8, s16)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint32x4, u32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int32x4, s32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float32x4, f32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int64x2, s64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float64x2, f64)
#endif
1237
#if defined(__clang__) && defined(__aarch64__)
// avoid LD2 instruction. details: https://github.com/opencv/opencv/issues/14863
// The low half is read through an unaligned 64-bit scalar load; the high half
// is filled with an arbitrary marker value (callers must not rely on it).
#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ \
typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; \
uint64 v = *(unaligned_uint64*)ptr; \
return _Tpvec(v_reinterpret_as_##suffix(v_uint64x2(v, (uint64)123456))); \
}
#else
// Default: load 64 bits into the low half and zero the high half.
#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr), vdup_n_##suffix((_Tp)0))); }
#endif

// Load/store suite for one vector type. Aligned and unaligned variants map to
// the same vld1q/vst1q instructions here (alignment is a hint, not enforced),
// and v_store with a StoreMode ignores the mode for the same reason.
#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_high_##suffix(a.val)); }

OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
#endif
1286
1287 inline unsigned v_reduce_sum(const v_uint8x16& a)
1288 {
1289 #if CV_NEON_AARCH64
1290 uint16_t t0 = vaddlvq_u8(a.val);
1291 return t0;
1292 #else // #if CV_NEON_AARCH64
1293 uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(a.val));
1294 uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1295 return vget_lane_u32(vpadd_u32(t1, t1), 0);
1296 #endif // #if CV_NEON_AARCH64
1297 }
v_reduce_sum(const v_int8x16 & a)1298 inline int v_reduce_sum(const v_int8x16& a)
1299 {
1300 #if CV_NEON_AARCH64
1301 int16_t t0 = vaddlvq_s8(a.val);
1302 return t0;
1303 #else // #if CV_NEON_AARCH64
1304 int32x4_t t0 = vpaddlq_s16(vpaddlq_s8(a.val));
1305 int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
1306 return vget_lane_s32(vpadd_s32(t1, t1), 0);
1307 #endif // #if CV_NEON_AARCH64
1308 }
v_reduce_sum(const v_uint16x8 & a)1309 inline unsigned v_reduce_sum(const v_uint16x8& a)
1310 {
1311 #if CV_NEON_AARCH64
1312 uint32_t t0 = vaddlvq_u16(a.val);
1313 return t0;
1314 #else // #if CV_NEON_AARCH64
1315 uint32x4_t t0 = vpaddlq_u16(a.val);
1316 uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1317 return vget_lane_u32(vpadd_u32(t1, t1), 0);
1318 #endif // #if CV_NEON_AARCH64
1319 }
v_reduce_sum(const v_int16x8 & a)1320 inline int v_reduce_sum(const v_int16x8& a)
1321 {
1322 #if CV_NEON_AARCH64
1323 int32_t t0 = vaddlvq_s16(a.val);
1324 return t0;
1325 #else // #if CV_NEON_AARCH64
1326 int32x4_t t0 = vpaddlq_s16(a.val);
1327 int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
1328 return vget_lane_s32(vpadd_s32(t1, t1), 0);
1329 #endif // #if CV_NEON_AARCH64
1330 }
1331
1332 #if CV_NEON_AARCH64
1333 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
1334 inline scalartype v_reduce_##func(const _Tpvec& a) \
1335 { \
1336 return v##vectorfunc##vq_##suffix(a.val); \
1337 }
1338 #else // #if CV_NEON_AARCH64
1339 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
1340 inline scalartype v_reduce_##func(const _Tpvec& a) \
1341 { \
1342 _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
1343 a0 = vp##vectorfunc##_##suffix(a0, a0); \
1344 a0 = vp##vectorfunc##_##suffix(a0, a0); \
1345 return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
1346 }
1347 #endif // #if CV_NEON_AARCH64
1348
OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_uint8x16,uint8x8,uchar,max,max,u8)1349 OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_uint8x16, uint8x8, uchar, max, max, u8)
1350 OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_uint8x16, uint8x8, uchar, min, min, u8)
1351 OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, max, max, s8)
1352 OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, min, min, s8)
1353
1354 #if CV_NEON_AARCH64
1355 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
1356 inline scalartype v_reduce_##func(const _Tpvec& a) \
1357 { \
1358 return v##vectorfunc##vq_##suffix(a.val); \
1359 }
1360 #else // #if CV_NEON_AARCH64
1361 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
1362 inline scalartype v_reduce_##func(const _Tpvec& a) \
1363 { \
1364 _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
1365 a0 = vp##vectorfunc##_##suffix(a0, a0); \
1366 return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
1367 }
1368 #endif // #if CV_NEON_AARCH64
1369
1370 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, max, max, u16)
1371 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, min, min, u16)
1372 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
1373 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)
1374
1375 #if CV_NEON_AARCH64
1376 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
1377 inline scalartype v_reduce_##func(const _Tpvec& a) \
1378 { \
1379 return v##vectorfunc##vq_##suffix(a.val); \
1380 }
1381 #else // #if CV_NEON_AARCH64
1382 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
1383 inline scalartype v_reduce_##func(const _Tpvec& a) \
1384 { \
1385 _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
1386 return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, vget_high_##suffix(a.val)),0); \
1387 }
1388 #endif // #if CV_NEON_AARCH64
1389
1390 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, sum, add, u32)
1391 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, max, max, u32)
1392 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, min, min, u32)
1393 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, sum, add, s32)
1394 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, max, max, s32)
1395 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, min, min, s32)
1396 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
1397 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
1398 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)
1399
1400 inline uint64 v_reduce_sum(const v_uint64x2& a)
1401 {
1402 #if CV_NEON_AARCH64
1403 return vaddvq_u64(a.val);
1404 #else // #if CV_NEON_AARCH64
1405 return vget_lane_u64(vadd_u64(vget_low_u64(a.val), vget_high_u64(a.val)),0);
1406 #endif // #if CV_NEON_AARCH64
1407 }
v_reduce_sum(const v_int64x2 & a)1408 inline int64 v_reduce_sum(const v_int64x2& a)
1409 {
1410 #if CV_NEON_AARCH64
1411 return vaddvq_s64(a.val);
1412 #else // #if CV_NEON_AARCH64
1413 return vget_lane_s64(vadd_s64(vget_low_s64(a.val), vget_high_s64(a.val)),0);
1414 #endif // #if CV_NEON_AARCH64
1415 }
1416 #if CV_SIMD128_64F
v_reduce_sum(const v_float64x2 & a)1417 inline double v_reduce_sum(const v_float64x2& a)
1418 {
1419 return vaddvq_f64(a.val);
1420 }
1421 #endif
1422
v_reduce_sum4(const v_float32x4 & a,const v_float32x4 & b,const v_float32x4 & c,const v_float32x4 & d)1423 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
1424 const v_float32x4& c, const v_float32x4& d)
1425 {
1426 #if CV_NEON_AARCH64
1427 float32x4_t ab = vpaddq_f32(a.val, b.val); // a0+a1 a2+a3 b0+b1 b2+b3
1428 float32x4_t cd = vpaddq_f32(c.val, d.val); // c0+c1 d0+d1 c2+c3 d2+d3
1429 return v_float32x4(vpaddq_f32(ab, cd)); // sumA sumB sumC sumD
1430 #else // #if CV_NEON_AARCH64
1431 float32x4x2_t ab = vtrnq_f32(a.val, b.val);
1432 float32x4x2_t cd = vtrnq_f32(c.val, d.val);
1433
1434 float32x4_t u0 = vaddq_f32(ab.val[0], ab.val[1]); // a0+a1 b0+b1 a2+a3 b2+b3
1435 float32x4_t u1 = vaddq_f32(cd.val[0], cd.val[1]); // c0+c1 d0+d1 c2+c3 d2+d3
1436
1437 float32x4_t v0 = vcombine_f32(vget_low_f32(u0), vget_low_f32(u1));
1438 float32x4_t v1 = vcombine_f32(vget_high_f32(u0), vget_high_f32(u1));
1439
1440 return v_float32x4(vaddq_f32(v0, v1));
1441 #endif // #if CV_NEON_AARCH64
1442 }
1443
// v_reduce_sad: sum of absolute differences, sum_i |a_i - b_i|, over all lanes,
// returned as a scalar.
// AArch64 path: vabdq_* gives per-lane |a-b|; a single across-vector widening
// add (vaddlvq_u8 / vaddlvq_u16 / vaddvq_u32) reduces it in one instruction.
// ARMv7 path: no across-vector reduction exists, so pairwise widening adds
// (vpaddlq_*) fold lanes down to u32, then vpadd_u32 finishes the sum and the
// result is extracted from lane 0.
// Signed variants reinterpret vabdq_s* output as unsigned: absolute
// differences are non-negative, so the unsigned reduction is reused as-is.
v_reduce_sad(const v_uint8x16 & a,const v_uint8x16 & b)1444 inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
1445 {
1446 #if CV_NEON_AARCH64
1447     uint8x16_t t0 = vabdq_u8(a.val, b.val);
1448     uint16_t t1 = vaddlvq_u8(t0);
1449     return t1;
1450 #else // #if CV_NEON_AARCH64
1451     uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vabdq_u8(a.val, b.val)));
1452     uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1453     return vget_lane_u32(vpadd_u32(t1, t1), 0);
1454 #endif // #if CV_NEON_AARCH64
1455 }
v_reduce_sad(const v_int8x16 & a,const v_int8x16 & b)1456 inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
1457 {
1458 #if CV_NEON_AARCH64
1459     uint8x16_t t0 = vreinterpretq_u8_s8(vabdq_s8(a.val, b.val));
1460     uint16_t t1 = vaddlvq_u8(t0);
1461     return t1;
1462 #else // #if CV_NEON_AARCH64
1463     uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s8(vabdq_s8(a.val, b.val))));
1464     uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1465     return vget_lane_u32(vpadd_u32(t1, t1), 0);
1466 #endif // #if CV_NEON_AARCH64
1467 }
v_reduce_sad(const v_uint16x8 & a,const v_uint16x8 & b)1468 inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
1469 {
1470 #if CV_NEON_AARCH64
1471     uint16x8_t t0 = vabdq_u16(a.val, b.val);
1472     uint32_t t1 = vaddlvq_u16(t0);
1473     return t1;
1474 #else // #if CV_NEON_AARCH64
1475     uint32x4_t t0 = vpaddlq_u16(vabdq_u16(a.val, b.val));
1476     uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1477     return vget_lane_u32(vpadd_u32(t1, t1), 0);
1478 #endif // #if CV_NEON_AARCH64
1479 }
v_reduce_sad(const v_int16x8 & a,const v_int16x8 & b)1480 inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
1481 {
1482 #if CV_NEON_AARCH64
1483     uint16x8_t t0 = vreinterpretq_u16_s16(vabdq_s16(a.val, b.val));
1484     uint32_t t1 = vaddlvq_u16(t0);
1485     return t1;
1486 #else // #if CV_NEON_AARCH64
1487     uint32x4_t t0 = vpaddlq_u16(vreinterpretq_u16_s16(vabdq_s16(a.val, b.val)));
1488     uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1489     return vget_lane_u32(vpadd_u32(t1, t1), 0);
1490 #endif // #if CV_NEON_AARCH64
1491 }
v_reduce_sad(const v_uint32x4 & a,const v_uint32x4 & b)1492 inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
1493 {
1494 #if CV_NEON_AARCH64
1495     uint32x4_t t0 = vabdq_u32(a.val, b.val);
1496     uint32_t t1 = vaddvq_u32(t0);
1497     return t1;
1498 #else // #if CV_NEON_AARCH64
1499     uint32x4_t t0 = vabdq_u32(a.val, b.val);
1500     uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1501     return vget_lane_u32(vpadd_u32(t1, t1), 0);
1502 #endif // #if CV_NEON_AARCH64
1503 }
v_reduce_sad(const v_int32x4 & a,const v_int32x4 & b)1504 inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
1505 {
1506 #if CV_NEON_AARCH64
1507     uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val));
1508     uint32_t t1 = vaddvq_u32(t0);
1509     return t1;
1510 #else // #if CV_NEON_AARCH64
1511     uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val));
1512     uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1513     return vget_lane_u32(vpadd_u32(t1, t1), 0);
1514 #endif // #if CV_NEON_AARCH64
1515 }
// Float SAD reduction: vabdq_f32 gives per-lane |a-b|; AArch64 reduces with a
// single vaddvq_f32, ARMv7 with two pairwise vpadd_f32 steps + lane extract.
v_reduce_sad(const v_float32x4 & a,const v_float32x4 & b)1516 inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
1517 {
1518 #if CV_NEON_AARCH64
1519     float32x4_t t0 = vabdq_f32(a.val, b.val);
1520     return vaddvq_f32(t0);
1521 #else // #if CV_NEON_AARCH64
1522     float32x4_t t0 = vabdq_f32(a.val, b.val);
1523     float32x2_t t1 = vpadd_f32(vget_low_f32(t0), vget_high_f32(t0));
1524     return vget_lane_f32(vpadd_f32(t1, t1), 0);
1525 #endif // #if CV_NEON_AARCH64
1526 }
1527
// v_popcount: per-lane population count. vcntq_u8 counts bits per byte; for
// wider lane types the vector is viewed as bytes, counted, then byte counts
// are folded back to the original lane width with pairwise widening adds
// (vpaddlq_u8 -> u16, -> u32, -> u64 as needed). Signed inputs are simply
// reinterpreted as unsigned bytes first (bit pattern is what matters).
v_popcount(const v_uint8x16 & a)1528 inline v_uint8x16 v_popcount(const v_uint8x16& a)
1529 { return v_uint8x16(vcntq_u8(a.val)); }
v_popcount(const v_int8x16 & a)1530 inline v_uint8x16 v_popcount(const v_int8x16& a)
1531 { return v_uint8x16(vcntq_u8(vreinterpretq_u8_s8(a.val))); }
v_popcount(const v_uint16x8 & a)1532 inline v_uint16x8 v_popcount(const v_uint16x8& a)
1533 { return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u16(a.val)))); }
v_popcount(const v_int16x8 & a)1534 inline v_uint16x8 v_popcount(const v_int16x8& a)
1535 { return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s16(a.val)))); }
v_popcount(const v_uint32x4 & a)1536 inline v_uint32x4 v_popcount(const v_uint32x4& a)
1537 { return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u32(a.val))))); }
v_popcount(const v_int32x4 & a)1538 inline v_uint32x4 v_popcount(const v_int32x4& a)
1539 { return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s32(a.val))))); }
v_popcount(const v_uint64x2 & a)1540 inline v_uint64x2 v_popcount(const v_uint64x2& a)
1541 { return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(a.val)))))); }
v_popcount(const v_int64x2 & a)1542 inline v_uint64x2 v_popcount(const v_int64x2& a)
1543 { return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s64(a.val)))))); }
1544
// v_signmask: packs the top (sign) bit of every lane into the low bits of an
// int, lane 0 -> bit 0. Common scheme in both paths: vshrq_n isolates the
// sign bit as 0/1 per lane, vshlq moves each lane's bit to its final bit
// position, then the lanes are summed (the bits are disjoint, so addition is
// the same as OR).
// AArch64: across-vector adds (vaddlvq/vaddvq) do the final combine; the u8
// variant first reorders bytes with vqtbl1q_u8 so low/high halves interleave
// correctly before the u16 widening sum.
// ARMv7: pairwise widening adds reduce to two u64 halves, and the high half's
// contribution is shifted into place in scalar code (<< 8 / << 4 / << 2 / << 1).
// Signed/float overloads just reinterpret to the unsigned form.
v_signmask(const v_uint8x16 & a)1545 inline int v_signmask(const v_uint8x16& a)
1546 {
1547 #if CV_NEON_AARCH64
1548     const int8x16_t signPosition = {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7};
1549     const uint8x16_t byteOrder = {0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15};
1550     uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), signPosition);
1551     uint8x16_t v1 = vqtbl1q_u8(v0, byteOrder);
1552     uint32_t t0 = vaddlvq_u16(vreinterpretq_u16_u8(v1));
1553     return t0;
1554 #else // #if CV_NEON_AARCH64
1555     int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
1556     uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
1557     uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
1558     return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
1559 #endif // #if CV_NEON_AARCH64
1560 }
1561 
v_signmask(const v_int8x16 & a)1562 inline int v_signmask(const v_int8x16& a)
1563 { return v_signmask(v_reinterpret_as_u8(a)); }
1564 
v_signmask(const v_uint16x8 & a)1565 inline int v_signmask(const v_uint16x8& a)
1566 {
1567 #if CV_NEON_AARCH64
1568     const int16x8_t signPosition = {0,1,2,3,4,5,6,7};
1569     uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), signPosition);
1570     uint32_t t0 = vaddlvq_u16(v0);
1571     return t0;
1572 #else // #if CV_NEON_AARCH64
1573     int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
1574     uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
1575     uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
1576     return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);
1577 #endif // #if CV_NEON_AARCH64
1578 }
v_signmask(const v_int16x8 & a)1579 inline int v_signmask(const v_int16x8& a)
1580 { return v_signmask(v_reinterpret_as_u16(a)); }
1581 
v_signmask(const v_uint32x4 & a)1582 inline int v_signmask(const v_uint32x4& a)
1583 {
1584 #if CV_NEON_AARCH64
1585     const int32x4_t signPosition = {0,1,2,3};
1586     uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), signPosition);
1587     uint32_t t0 = vaddvq_u32(v0);
1588     return t0;
1589 #else // #if CV_NEON_AARCH64
1590     int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
1591     uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
1592     uint64x2_t v1 = vpaddlq_u32(v0);
1593     return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);
1594 #endif // #if CV_NEON_AARCH64
1595 }
v_signmask(const v_int32x4 & a)1596 inline int v_signmask(const v_int32x4& a)
1597 { return v_signmask(v_reinterpret_as_u32(a)); }
v_signmask(const v_float32x4 & a)1598 inline int v_signmask(const v_float32x4& a)
1599 { return v_signmask(v_reinterpret_as_u32(a)); }
v_signmask(const v_uint64x2 & a)1600 inline int v_signmask(const v_uint64x2& a)
1601 {
1602 #if CV_NEON_AARCH64
1603     const int64x2_t signPosition = {0,1};
1604     uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), signPosition);
1605     uint64_t t0 = vaddvq_u64(v0);
1606     return t0;
1607 #else // #if CV_NEON_AARCH64
1608     int64x1_t m0 = vdup_n_s64(0);
1609     uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), vcombine_s64(m0, m0));
1610     return (int)vgetq_lane_u64(v0, 0) + ((int)vgetq_lane_u64(v0, 1) << 1);
1611 #endif // #if CV_NEON_AARCH64
1612 }
v_signmask(const v_int64x2 & a)1613 inline int v_signmask(const v_int64x2& a)
1614 { return v_signmask(v_reinterpret_as_u64(a)); }
1615 #if CV_SIMD128_64F
v_signmask(const v_float64x2 & a)1616 inline int v_signmask(const v_float64x2& a)
1617 { return v_signmask(v_reinterpret_as_u64(a)); }
1618 #endif
1619
// v_scan_forward: index of the first lane whose sign/mask bit is set, obtained
// as the number of trailing zeros of the packed v_signmask result.
v_scan_forward(const v_int8x16 & a)1620 inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); }
v_scan_forward(const v_uint8x16 & a)1621 inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); }
v_scan_forward(const v_int16x8 & a)1622 inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); }
v_scan_forward(const v_uint16x8 & a)1623 inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); }
v_scan_forward(const v_int32x4 & a)1624 inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); }
v_scan_forward(const v_uint32x4 & a)1625 inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); }
v_scan_forward(const v_float32x4 & a)1626 inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); }
v_scan_forward(const v_int64x2 & a)1627 inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); }
v_scan_forward(const v_uint64x2 & a)1628 inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); }
1629 #if CV_SIMD128_64F
v_scan_forward(const v_float64x2 & a)1630 inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); }
1631 #endif
1632
// v_check_all / v_check_any: test whether the sign bit is set in all / any
// lanes.
// AArch64 macro: the across-vector min (resp. max) has its sign bit set iff
// every (resp. some) lane does; ">> shift" extracts that bit.
// ARMv7 macro: all-path inverts the vector (vmvnq) and shifts the sign bit
// down — the result is all-zero iff every original sign bit was 1; any-path
// shifts the sign bit down directly and ORs the two 64-bit halves.
// Note: comments are kept outside the macro bodies because a comment line
// would terminate the backslash continuation.
1633 #if CV_NEON_AARCH64
1634 #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
1635 inline bool v_check_all(const v_##_Tpvec& a) \
1636 { \
1637     return (vminvq_##suffix(a.val) >> shift) != 0; \
1638 } \
1639 inline bool v_check_any(const v_##_Tpvec& a) \
1640 { \
1641     return (vmaxvq_##suffix(a.val) >> shift) != 0; \
1642 }
1643 #else // #if CV_NEON_AARCH64
1644 #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
1645 inline bool v_check_all(const v_##_Tpvec& a) \
1646 { \
1647     _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \
1648     uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
1649     return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \
1650 } \
1651 inline bool v_check_any(const v_##_Tpvec& a) \
1652 { \
1653     _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
1654     uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
1655     return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
1656 }
1657 #endif // #if CV_NEON_AARCH64
1658 
1659 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
1660 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
1661 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)
1662 
// 64-bit lanes are handled by hand: shift each lane's sign bit to bit 0, then
// AND (all) or OR (any) the two lanes in scalar code.
v_check_all(const v_uint64x2 & a)1663 inline bool v_check_all(const v_uint64x2& a)
1664 {
1665     uint64x2_t v0 = vshrq_n_u64(a.val, 63);
1666     return (vgetq_lane_u64(v0, 0) & vgetq_lane_u64(v0, 1)) == 1;
1667 }
v_check_any(const v_uint64x2 & a)1668 inline bool v_check_any(const v_uint64x2& a)
1669 {
1670     uint64x2_t v0 = vshrq_n_u64(a.val, 63);
1671     return (vgetq_lane_u64(v0, 0) | vgetq_lane_u64(v0, 1)) != 0;
1672 }
1673 
// Signed/float overloads delegate to the unsigned implementations via
// bit-pattern reinterpretation.
v_check_all(const v_int8x16 & a)1674 inline bool v_check_all(const v_int8x16& a)
1675 { return v_check_all(v_reinterpret_as_u8(a)); }
v_check_all(const v_int16x8 & a)1676 inline bool v_check_all(const v_int16x8& a)
1677 { return v_check_all(v_reinterpret_as_u16(a)); }
v_check_all(const v_int32x4 & a)1678 inline bool v_check_all(const v_int32x4& a)
1679 { return v_check_all(v_reinterpret_as_u32(a)); }
v_check_all(const v_float32x4 & a)1680 inline bool v_check_all(const v_float32x4& a)
1681 { return v_check_all(v_reinterpret_as_u32(a)); }
1682 
v_check_any(const v_int8x16 & a)1683 inline bool v_check_any(const v_int8x16& a)
1684 { return v_check_any(v_reinterpret_as_u8(a)); }
v_check_any(const v_int16x8 & a)1685 inline bool v_check_any(const v_int16x8& a)
1686 { return v_check_any(v_reinterpret_as_u16(a)); }
v_check_any(const v_int32x4 & a)1687 inline bool v_check_any(const v_int32x4& a)
1688 { return v_check_any(v_reinterpret_as_u32(a)); }
v_check_any(const v_float32x4 & a)1689 inline bool v_check_any(const v_float32x4& a)
1690 { return v_check_any(v_reinterpret_as_u32(a)); }
1691 
v_check_all(const v_int64x2 & a)1692 inline bool v_check_all(const v_int64x2& a)
1693 { return v_check_all(v_reinterpret_as_u64(a)); }
v_check_any(const v_int64x2 & a)1694 inline bool v_check_any(const v_int64x2& a)
1695 { return v_check_any(v_reinterpret_as_u64(a)); }
1696 #if CV_SIMD128_64F
v_check_all(const v_float64x2 & a)1697 inline bool v_check_all(const v_float64x2& a)
1698 { return v_check_all(v_reinterpret_as_u64(a)); }
v_check_any(const v_float64x2 & a)1699 inline bool v_check_any(const v_float64x2& a)
1700 { return v_check_any(v_reinterpret_as_u64(a)); }
1701 #endif
1702
// v_select(mask, a, b): per-bit select via NEON bit-select (vbslq) — picks
// bits from `a` where the mask bit is 1, from `b` where it is 0. The mask is
// reinterpreted to the unsigned lane type vbslq expects.
1703 #define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
1704 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1705 { \
1706     return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); \
1707 }
1708 
OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16,u8,u8)1709 OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
1710 OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
1711 OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
1712 OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
1713 OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
1714 OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
1715 OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
1716 #if CV_SIMD128_64F
1717 OPENCV_HAL_IMPL_NEON_SELECT(v_float64x2, f64, u64)
1718 #endif
1719
// Widening expansion: v_expand splits a vector into two vectors of the
// double-width lane type; v_expand_low/high take one half; v_load_expand
// loads 64 bits and widens. The AArch64 variant uses vmovl_high_* for the
// upper half (avoids a separate vget_high); the ARMv7 variant composes
// vget_high + vmovl. (Comments stay outside the macro bodies to preserve
// the backslash continuations.)
1720 #if CV_NEON_AARCH64
1721 #define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
1722 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1723 { \
1724     b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
1725     b1.val = vmovl_high_##suffix(a.val); \
1726 } \
1727 inline _Tpwvec v_expand_low(const _Tpvec& a) \
1728 { \
1729     return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \
1730 } \
1731 inline _Tpwvec v_expand_high(const _Tpvec& a) \
1732 { \
1733     return _Tpwvec(vmovl_high_##suffix(a.val)); \
1734 } \
1735 inline _Tpwvec v_load_expand(const _Tp* ptr) \
1736 { \
1737     return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
1738 }
1739 #else
1740 #define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
1741 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1742 { \
1743     b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
1744     b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
1745 } \
1746 inline _Tpwvec v_expand_low(const _Tpvec& a) \
1747 { \
1748     return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \
1749 } \
1750 inline _Tpwvec v_expand_high(const _Tpvec& a) \
1751 { \
1752     return _Tpwvec(vmovl_##suffix(vget_high_##suffix(a.val))); \
1753 } \
1754 inline _Tpwvec v_load_expand(const _Tp* ptr) \
1755 { \
1756     return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
1757 }
1758 #endif
1759 
1760 OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
1761 OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
1762 OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16)
1763 OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16)
1764 OPENCV_HAL_IMPL_NEON_EXPAND(v_uint32x4, v_uint64x2, uint, u32)
1765 OPENCV_HAL_IMPL_NEON_EXPAND(v_int32x4, v_int64x2, int, s32)
1766 
// v_load_expand_q: load 4 bytes and widen each to 32 bits (quadrupling lane
// width). The 4-byte scalar read goes through a CV_DECL_ALIGNED(1) typedef to
// tolerate unaligned pointers.
// NOTE(review): the `*(unaligned_uint*)ptr` cast still type-puns uchar/schar
// storage through unsigned int, which strict-aliasing rules technically
// disallow — presumably relied on as a compiler extension here; confirm
// against the supported-compiler list before touching.
1767 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
1768 {
1769     typedef unsigned int CV_DECL_ALIGNED(1) unaligned_uint;
1770     uint8x8_t v0 = vcreate_u8(*(unaligned_uint*)ptr);
1771     uint16x4_t v1 = vget_low_u16(vmovl_u8(v0));
1772     return v_uint32x4(vmovl_u16(v1));
1773 }
1774 
v_load_expand_q(const schar * ptr)1775 inline v_int32x4 v_load_expand_q(const schar* ptr)
1776 {
1777     typedef unsigned int CV_DECL_ALIGNED(1) unaligned_uint;
1778     int8x8_t v0 = vcreate_s8(*(unaligned_uint*)ptr);
1779     int16x4_t v1 = vget_low_s16(vmovl_s8(v0));
1780     return v_int32x4(vmovl_s16(v1));
1781 }
1782
// Zip / combine / recombine helpers.
// AArch64: vzip1q/vzip2q produce the interleaved halves directly.
// ARMv7: vzipq returns both halves in one x2 struct.
// v_combine_low/high and v_recombine are identical in both variants
// (vget_low/high + vcombine). Comments are kept outside the macro bodies so
// the backslash continuations stay intact.
1783 #if defined(__aarch64__) || defined(_M_ARM64)
1784 #define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
1785 inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
1786 { \
1787     b0.val = vzip1q_##suffix(a0.val, a1.val); \
1788     b1.val = vzip2q_##suffix(a0.val, a1.val); \
1789 } \
1790 inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1791 { \
1792     return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
1793 } \
1794 inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1795 { \
1796     return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
1797 } \
1798 inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
1799 { \
1800     c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
1801     d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
1802 }
1803 #else
1804 #define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
1805 inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
1806 { \
1807     _Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); \
1808     b0.val = p.val[0]; \
1809     b1.val = p.val[1]; \
1810 } \
1811 inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1812 { \
1813     return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
1814 } \
1815 inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1816 { \
1817     return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
1818 } \
1819 inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
1820 { \
1821     c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
1822     d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
1823 }
1824 #endif
1825 
OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16,u8)1826 OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8)
1827 OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)
1828 OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16)
1829 OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
1830 OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
1831 OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
1832 OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
1833 #if CV_SIMD128_64F
1834 OPENCV_HAL_IMPL_NEON_UNPACKS(float64x2, f64)
1835 #endif
1836
// v_reverse: reverse the order of all lanes. vrev64q_* reverses lanes within
// each 64-bit half; vextq_*(vec, vec, half) then swaps the two halves,
// completing the full reversal. Signed/float variants reinterpret to the
// unsigned implementation of the same lane width.
1837 inline v_uint8x16 v_reverse(const v_uint8x16 &a)
1838 {
1839     uint8x16_t vec = vrev64q_u8(a.val);
1840     return v_uint8x16(vextq_u8(vec, vec, 8));
1841 }
1842 
v_reverse(const v_int8x16 & a)1843 inline v_int8x16 v_reverse(const v_int8x16 &a)
1844 { return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
1845 
v_reverse(const v_uint16x8 & a)1846 inline v_uint16x8 v_reverse(const v_uint16x8 &a)
1847 {
1848     uint16x8_t vec = vrev64q_u16(a.val);
1849     return v_uint16x8(vextq_u16(vec, vec, 4));
1850 }
1851 
v_reverse(const v_int16x8 & a)1852 inline v_int16x8 v_reverse(const v_int16x8 &a)
1853 { return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
1854 
v_reverse(const v_uint32x4 & a)1855 inline v_uint32x4 v_reverse(const v_uint32x4 &a)
1856 {
1857     uint32x4_t vec = vrev64q_u32(a.val);
1858     return v_uint32x4(vextq_u32(vec, vec, 2));
1859 }
1860 
v_reverse(const v_int32x4 & a)1861 inline v_int32x4 v_reverse(const v_int32x4 &a)
1862 { return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
1863 
v_reverse(const v_float32x4 & a)1864 inline v_float32x4 v_reverse(const v_float32x4 &a)
1865 { return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
1866
// Reverse the two 64-bit lanes. Swapping the halves is expressed as a single
// 8-byte vector rotate (vextq_u64 with offset 1 yields {a1, a0}) instead of
// splitting the vector and recombining the halves.
inline v_uint64x2 v_reverse(const v_uint64x2 &a)
{
    return v_uint64x2(vextq_u64(a.val, a.val, 1));
}
1874
// 64-bit signed/double lane reversal delegates to the unsigned implementation
// via bit-pattern reinterpretation.
v_reverse(const v_int64x2 & a)1875 inline v_int64x2 v_reverse(const v_int64x2 &a)
1876 { return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
1877 
1878 #if CV_SIMD128_64F
v_reverse(const v_float64x2 & a)1879 inline v_float64x2 v_reverse(const v_float64x2 &a)
1880 { return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
1881 #endif
1882
// v_extract<s>(a, b): vector formed by lanes a[s..] followed by b[0..s-1],
// implemented directly by the vextq_* instruction.
1883 #define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \
1884 template <int s> \
1885 inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1886 { \
1887     return v_##_Tpvec(vextq_##suffix(a.val, b.val, s)); \
1888 }
1889 
OPENCV_HAL_IMPL_NEON_EXTRACT(uint8x16,u8)1890 OPENCV_HAL_IMPL_NEON_EXTRACT(uint8x16, u8)
1891 OPENCV_HAL_IMPL_NEON_EXTRACT(int8x16, s8)
1892 OPENCV_HAL_IMPL_NEON_EXTRACT(uint16x8, u16)
1893 OPENCV_HAL_IMPL_NEON_EXTRACT(int16x8, s16)
1894 OPENCV_HAL_IMPL_NEON_EXTRACT(uint32x4, u32)
1895 OPENCV_HAL_IMPL_NEON_EXTRACT(int32x4, s32)
1896 OPENCV_HAL_IMPL_NEON_EXTRACT(uint64x2, u64)
1897 OPENCV_HAL_IMPL_NEON_EXTRACT(int64x2, s64)
1898 OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
1899 #if CV_SIMD128_64F
1900 OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
1901 #endif
1902 
// v_extract_n<i>(v): scalar value of lane i (vgetq_lane_*).
1903 #define OPENCV_HAL_IMPL_NEON_EXTRACT_N(_Tpvec, _Tp, suffix) \
1904 template<int i> inline _Tp v_extract_n(_Tpvec v) { return vgetq_lane_##suffix(v.val, i); }
1905 
1906 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint8x16, uchar, u8)
1907 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int8x16, schar, s8)
1908 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint16x8, ushort, u16)
1909 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int16x8, short, s16)
1910 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint32x4, uint, u32)
1911 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int32x4, int, s32)
1912 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint64x2, uint64, u64)
1913 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int64x2, int64, s64)
1914 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float32x4, float, f32)
1915 #if CV_SIMD128_64F
1916 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float64x2, double, f64)
1917 #endif
1918
// v_broadcast_element<i>(v): splat lane i across all lanes — extract the
// scalar with v_extract_n, then rebuild with the matching v_setall_*.
1919 #define OPENCV_HAL_IMPL_NEON_BROADCAST(_Tpvec, _Tp, suffix) \
1920 template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) { _Tp t = v_extract_n<i>(v); return v_setall_##suffix(t); }
1921 
1922 OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint8x16, uchar, u8)
1923 OPENCV_HAL_IMPL_NEON_BROADCAST(v_int8x16, schar, s8)
1924 OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint16x8, ushort, u16)
1925 OPENCV_HAL_IMPL_NEON_BROADCAST(v_int16x8, short, s16)
1926 OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint32x4, uint, u32)
1927 OPENCV_HAL_IMPL_NEON_BROADCAST(v_int32x4, int, s32)
1928 OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint64x2, uint64, u64)
1929 OPENCV_HAL_IMPL_NEON_BROADCAST(v_int64x2, int64, s64)
1930 OPENCV_HAL_IMPL_NEON_BROADCAST(v_float32x4, float, f32)
1931 #if CV_SIMD128_64F
1932 OPENCV_HAL_IMPL_NEON_BROADCAST(v_float64x2, double, f64)
1933 #endif
1934
#if CV_SIMD128_64F
// v_round: per-lane round-to-nearest (ties handled by FCVTNS: to even).
// Emitted as inline asm because some toolchains lack the vcvtnq intrinsic.
inline v_int32x4 v_round(const v_float32x4& a)
{
    float32x4_t a_ = a.val;
    int32x4_t result;
    __asm__ ("fcvtns %0.4s, %1.4s"
             : "=w"(result)
             : "w"(a_)
             : /* No clobbers */);
    return v_int32x4(result);
}
#else
// v_round fallback: vcvtq_s32_f32 truncates toward zero, so add +/-0.5 with
// the sign copied from each lane before converting.
inline v_int32x4 v_round(const v_float32x4& a)
{
    // Sign-bit mask built as an unsigned splat + reinterpret: writing it as
    // vdupq_n_s32(1 << 31) left-shifts into the sign bit, which is undefined
    // behavior before C++20.
    const int32x4_t v_sign = vreinterpretq_s32_u32(vdupq_n_u32(0x80000000U)),
        v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));

    // v_addition is 0.5f carrying each lane's sign: ±0.5 depending on a.
    int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
    return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
}
#endif
// v_floor / v_ceil: vcvtq_s32_f32 truncates toward zero; the comparison mask
// is all-ones (-1 when reinterpreted as s32) in lanes where the truncated
// result overshot (floor) or undershot (ceil), so adding/subtracting the mask
// applies the +/-1 correction only where needed.
v_floor(const v_float32x4 & a)1956 inline v_int32x4 v_floor(const v_float32x4& a)
1957 {
1958     int32x4_t a1 = vcvtq_s32_f32(a.val);
1959     uint32x4_t mask = vcgtq_f32(vcvtq_f32_s32(a1), a.val);
1960     return v_int32x4(vaddq_s32(a1, vreinterpretq_s32_u32(mask)));
1961 }
1962 
v_ceil(const v_float32x4 & a)1963 inline v_int32x4 v_ceil(const v_float32x4& a)
1964 {
1965     int32x4_t a1 = vcvtq_s32_f32(a.val);
1966     uint32x4_t mask = vcgtq_f32(a.val, vcvtq_f32_s32(a1));
1967     return v_int32x4(vsubq_s32(a1, vreinterpretq_s32_u32(mask)));
1968 }
1969 
// v_trunc: truncation toward zero is the native conversion behavior.
v_trunc(const v_float32x4 & a)1970 inline v_int32x4 v_trunc(const v_float32x4& a)
1971 { return v_int32x4(vcvtq_s32_f32(a.val)); }
1972
1973 #if CV_SIMD128_64F
// float64 rounding: each produces 2 int32 results in the low half of the
// vector with the high half zeroed (single-argument forms), or packs two
// float64x2 inputs into all four lanes (two-argument v_round).
// vcvtaq_s64_f64 rounds to nearest, ties away from zero; floor/ceil apply
// the same mask-correction trick as the float32 versions, on 64-bit lanes.
v_round(const v_float64x2 & a)1974 inline v_int32x4 v_round(const v_float64x2& a)
1975 {
1976     static const int32x2_t zero = vdup_n_s32(0);
1977     return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero));
1978 }
1979 
v_round(const v_float64x2 & a,const v_float64x2 & b)1980 inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
1981 {
1982     return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), vmovn_s64(vcvtaq_s64_f64(b.val))));
1983 }
1984 
v_floor(const v_float64x2 & a)1985 inline v_int32x4 v_floor(const v_float64x2& a)
1986 {
1987     static const int32x2_t zero = vdup_n_s32(0);
1988     int64x2_t a1 = vcvtq_s64_f64(a.val);
1989     uint64x2_t mask = vcgtq_f64(vcvtq_f64_s64(a1), a.val);
1990     a1 = vaddq_s64(a1, vreinterpretq_s64_u64(mask));
1991     return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));
1992 }
1993 
v_ceil(const v_float64x2 & a)1994 inline v_int32x4 v_ceil(const v_float64x2& a)
1995 {
1996     static const int32x2_t zero = vdup_n_s32(0);
1997     int64x2_t a1 = vcvtq_s64_f64(a.val);
1998     uint64x2_t mask = vcgtq_f64(a.val, vcvtq_f64_s64(a1));
1999     a1 = vsubq_s64(a1, vreinterpretq_s64_u64(mask));
2000     return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));
2001 }
2002 
// NOTE(review): v_trunc uses vcvtaq (round-to-nearest-away) rather than the
// truncating vcvtq used by the float32 v_trunc — looks inconsistent; verify
// intended semantics against the universal-intrinsics spec before changing.
v_trunc(const v_float64x2 & a)2003 inline v_int32x4 v_trunc(const v_float64x2& a)
2004 {
2005     static const int32x2_t zero = vdup_n_s32(0);
2006     return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero));
2007 }
2008 #endif
2009
// 4x4 32-bit matrix transpose.
// AArch64 macro: two passes — first a 64-bit-element transpose via
// vtrn1q/vtrn2q on reinterpreted 64-bit lanes, then a 32-bit vtrn1q/vtrn2q
// pass (comments inline in the macro already mark the passes).
// ARMv7 macro: vtrnq pairs rows, then vcombine of low/high halves finishes
// the transpose; the in-macro comments show the lane layout at each step.
2010 #if CV_NEON_AARCH64
2011 #define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
2012 inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
2013                            const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
2014                            v_##_Tpvec& b0, v_##_Tpvec& b1, \
2015                            v_##_Tpvec& b2, v_##_Tpvec& b3) \
2016 { \
2017     /* -- Pass 1: 64b transpose */ \
2018     _Tpvec##_t t0 = vreinterpretq_##suffix##32_##suffix##64( \
2019         vtrn1q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a0.val), \
2020                             vreinterpretq_##suffix##64_##suffix##32(a2.val))); \
2021     _Tpvec##_t t1 = vreinterpretq_##suffix##32_##suffix##64( \
2022         vtrn1q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a1.val), \
2023                             vreinterpretq_##suffix##64_##suffix##32(a3.val))); \
2024     _Tpvec##_t t2 = vreinterpretq_##suffix##32_##suffix##64( \
2025         vtrn2q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a0.val), \
2026                             vreinterpretq_##suffix##64_##suffix##32(a2.val))); \
2027     _Tpvec##_t t3 = vreinterpretq_##suffix##32_##suffix##64( \
2028         vtrn2q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a1.val), \
2029                             vreinterpretq_##suffix##64_##suffix##32(a3.val))); \
2030     /* -- Pass 2: 32b transpose */ \
2031     b0.val = vtrn1q_##suffix##32(t0, t1); \
2032     b1.val = vtrn2q_##suffix##32(t0, t1); \
2033     b2.val = vtrn1q_##suffix##32(t2, t3); \
2034     b3.val = vtrn2q_##suffix##32(t2, t3); \
2035 }
2036 
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4,u)2037 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u)
2038 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s)
2039 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f)
2040 #else // #if CV_NEON_AARCH64
2041 #define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
2042 inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
2043                            const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
2044                            v_##_Tpvec& b0, v_##_Tpvec& b1, \
2045                            v_##_Tpvec& b2, v_##_Tpvec& b3) \
2046 { \
2047     /* m00 m01 m02 m03 */ \
2048     /* m10 m11 m12 m13 */ \
2049     /* m20 m21 m22 m23 */ \
2050     /* m30 m31 m32 m33 */ \
2051     _Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \
2052     _Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \
2053     /* m00 m10 m02 m12 */ \
2054     /* m01 m11 m03 m13 */ \
2055     /* m20 m30 m22 m32 */ \
2056     /* m21 m31 m23 m33 */ \
2057     b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); \
2058     b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); \
2059     b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); \
2060     b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); \
2061 }
2062 
2063 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
2064 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
2065 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
2066 #endif // #if CV_NEON_AARCH64
2067
// De-interleaving loads / interleaving stores for 2-, 3- and 4-channel data,
// mapped directly to the structured vld2q/vld3q/vld4q and vst2q/vst3q/vst4q
// instructions. The hal::StoreMode parameter is accepted for interface
// compatibility but ignored (NEON structured stores have a single form).
2068 #define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
2069 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
2070 { \
2071     _Tpvec##x2_t v = vld2q_##suffix(ptr); \
2072     a.val = v.val[0]; \
2073     b.val = v.val[1]; \
2074 } \
2075 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
2076 { \
2077     _Tpvec##x3_t v = vld3q_##suffix(ptr); \
2078     a.val = v.val[0]; \
2079     b.val = v.val[1]; \
2080     c.val = v.val[2]; \
2081 } \
2082 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
2083                                 v_##_Tpvec& c, v_##_Tpvec& d) \
2084 { \
2085     _Tpvec##x4_t v = vld4q_##suffix(ptr); \
2086     a.val = v.val[0]; \
2087     b.val = v.val[1]; \
2088     c.val = v.val[2]; \
2089     d.val = v.val[3]; \
2090 } \
2091 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
2092                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2093 { \
2094     _Tpvec##x2_t v; \
2095     v.val[0] = a.val; \
2096     v.val[1] = b.val; \
2097     vst2q_##suffix(ptr, v); \
2098 } \
2099 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
2100                                 const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2101 { \
2102     _Tpvec##x3_t v; \
2103     v.val[0] = a.val; \
2104     v.val[1] = b.val; \
2105     v.val[2] = c.val; \
2106     vst3q_##suffix(ptr, v); \
2107 } \
2108 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
2109                                 const v_##_Tpvec& c, const v_##_Tpvec& d, \
2110                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
2111 { \
2112     _Tpvec##x4_t v; \
2113     v.val[0] = a.val; \
2114     v.val[1] = b.val; \
2115     v.val[2] = c.val; \
2116     v.val[3] = d.val; \
2117     vst4q_##suffix(ptr, v); \
2118 }
2119
// 64-bit lane (de)interleave: vld2q/vst2q have no 64-bit element forms, so
// the interleaving is done manually with scalar-width 64-bit loads/stores
// (vld1_/vst1_) at the appropriate offsets, recombined via vcombine. The
// hal::StoreMode parameter is accepted for interface compatibility and
// ignored, matching the generic macro above.
2120 #define OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(tp, suffix) \
2121 inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b ) \
2122 { \
2123     tp##x1_t a0 = vld1_##suffix(ptr); \
2124     tp##x1_t b0 = vld1_##suffix(ptr + 1); \
2125     tp##x1_t a1 = vld1_##suffix(ptr + 2); \
2126     tp##x1_t b1 = vld1_##suffix(ptr + 3); \
2127     a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
2128     b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
2129 } \
2130 \
2131 inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, \
2132                                  v_##tp##x2& b, v_##tp##x2& c ) \
2133 { \
2134     tp##x1_t a0 = vld1_##suffix(ptr); \
2135     tp##x1_t b0 = vld1_##suffix(ptr + 1); \
2136     tp##x1_t c0 = vld1_##suffix(ptr + 2); \
2137     tp##x1_t a1 = vld1_##suffix(ptr + 3); \
2138     tp##x1_t b1 = vld1_##suffix(ptr + 4); \
2139     tp##x1_t c1 = vld1_##suffix(ptr + 5); \
2140     a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
2141     b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
2142     c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
2143 } \
2144 \
2145 inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b, \
2146                                  v_##tp##x2& c, v_##tp##x2& d ) \
2147 { \
2148     tp##x1_t a0 = vld1_##suffix(ptr); \
2149     tp##x1_t b0 = vld1_##suffix(ptr + 1); \
2150     tp##x1_t c0 = vld1_##suffix(ptr + 2); \
2151     tp##x1_t d0 = vld1_##suffix(ptr + 3); \
2152     tp##x1_t a1 = vld1_##suffix(ptr + 4); \
2153     tp##x1_t b1 = vld1_##suffix(ptr + 5); \
2154     tp##x1_t c1 = vld1_##suffix(ptr + 6); \
2155     tp##x1_t d1 = vld1_##suffix(ptr + 7); \
2156     a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
2157     b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
2158     c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
2159     d = v_##tp##x2(vcombine_##suffix(d0, d1)); \
2160 } \
2161 \
2162 inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
2163                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2164 { \
2165     vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
2166     vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
2167     vst1_##suffix(ptr + 2, vget_high_##suffix(a.val)); \
2168     vst1_##suffix(ptr + 3, vget_high_##suffix(b.val)); \
2169 } \
2170 \
2171 inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \
2172                                 const v_##tp##x2& b, const v_##tp##x2& c, \
2173                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2174 { \
2175     vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
2176     vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
2177     vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
2178     vst1_##suffix(ptr + 3, vget_high_##suffix(a.val)); \
2179     vst1_##suffix(ptr + 4, vget_high_##suffix(b.val)); \
2180     vst1_##suffix(ptr + 5, vget_high_##suffix(c.val)); \
2181 } \
2182 \
2183 inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
2184                                 const v_##tp##x2& c, const v_##tp##x2& d, \
2185                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2186 { \
2187     vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
2188     vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
2189     vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
2190     vst1_##suffix(ptr + 3, vget_low_##suffix(d.val)); \
2191     vst1_##suffix(ptr + 4, vget_high_##suffix(a.val)); \
2192     vst1_##suffix(ptr + 5, vget_high_##suffix(b.val)); \
2193     vst1_##suffix(ptr + 6, vget_high_##suffix(c.val)); \
2194     vst1_##suffix(ptr + 7, vget_high_##suffix(d.val)); \
2195 }
2196
// Instantiate the interleaved load/store overloads (v_load_deinterleave /
// v_store_interleave) for every 128-bit lane type supported by this backend.
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64)
#endif

// 64-bit integer lanes are expanded through the dedicated INT64 variant of
// the macro, which builds the vectors from scalar vld1_/vst1_ accesses.
OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(int64, s64)
OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(uint64, u64)
2210
2211 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
2212 {
2213 return v_float32x4(vcvtq_f32_s32(a.val));
2214 }
2215
2216 #if CV_SIMD128_64F
v_cvt_f32(const v_float64x2 & a)2217 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
2218 {
2219 float32x2_t zero = vdup_n_f32(0.0f);
2220 return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), zero));
2221 }
2222
v_cvt_f32(const v_float64x2 & a,const v_float64x2 & b)2223 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
2224 {
2225 return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), vcvt_f32_f64(b.val)));
2226 }
2227
v_cvt_f64(const v_int32x4 & a)2228 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
2229 {
2230 return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_low_s32(a.val))));
2231 }
2232
v_cvt_f64_high(const v_int32x4 & a)2233 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
2234 {
2235 return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_high_s32(a.val))));
2236 }
2237
v_cvt_f64(const v_float32x4 & a)2238 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
2239 {
2240 return v_float64x2(vcvt_f64_f32(vget_low_f32(a.val)));
2241 }
2242
v_cvt_f64_high(const v_float32x4 & a)2243 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
2244 {
2245 return v_float64x2(vcvt_f64_f32(vget_high_f32(a.val)));
2246 }
2247
v_cvt_f64(const v_int64x2 & a)2248 inline v_float64x2 v_cvt_f64(const v_int64x2& a)
2249 { return v_float64x2(vcvtq_f64_s64(a.val)); }
2250
2251 #endif
2252
2253 ////////////// Lookup table access ////////////////////
2254
v_lut(const schar * tab,const int * idx)2255 inline v_int8x16 v_lut(const schar* tab, const int* idx)
2256 {
2257 schar CV_DECL_ALIGNED(32) elems[16] =
2258 {
2259 tab[idx[ 0]],
2260 tab[idx[ 1]],
2261 tab[idx[ 2]],
2262 tab[idx[ 3]],
2263 tab[idx[ 4]],
2264 tab[idx[ 5]],
2265 tab[idx[ 6]],
2266 tab[idx[ 7]],
2267 tab[idx[ 8]],
2268 tab[idx[ 9]],
2269 tab[idx[10]],
2270 tab[idx[11]],
2271 tab[idx[12]],
2272 tab[idx[13]],
2273 tab[idx[14]],
2274 tab[idx[15]]
2275 };
2276 return v_int8x16(vld1q_s8(elems));
2277 }
v_lut_pairs(const schar * tab,const int * idx)2278 inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
2279 {
2280 schar CV_DECL_ALIGNED(32) elems[16] =
2281 {
2282 tab[idx[0]],
2283 tab[idx[0] + 1],
2284 tab[idx[1]],
2285 tab[idx[1] + 1],
2286 tab[idx[2]],
2287 tab[idx[2] + 1],
2288 tab[idx[3]],
2289 tab[idx[3] + 1],
2290 tab[idx[4]],
2291 tab[idx[4] + 1],
2292 tab[idx[5]],
2293 tab[idx[5] + 1],
2294 tab[idx[6]],
2295 tab[idx[6] + 1],
2296 tab[idx[7]],
2297 tab[idx[7] + 1]
2298 };
2299 return v_int8x16(vld1q_s8(elems));
2300 }
v_lut_quads(const schar * tab,const int * idx)2301 inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
2302 {
2303 schar CV_DECL_ALIGNED(32) elems[16] =
2304 {
2305 tab[idx[0]],
2306 tab[idx[0] + 1],
2307 tab[idx[0] + 2],
2308 tab[idx[0] + 3],
2309 tab[idx[1]],
2310 tab[idx[1] + 1],
2311 tab[idx[1] + 2],
2312 tab[idx[1] + 3],
2313 tab[idx[2]],
2314 tab[idx[2] + 1],
2315 tab[idx[2] + 2],
2316 tab[idx[2] + 3],
2317 tab[idx[3]],
2318 tab[idx[3] + 1],
2319 tab[idx[3] + 2],
2320 tab[idx[3] + 3]
2321 };
2322 return v_int8x16(vld1q_s8(elems));
2323 }
// Unsigned 8-bit gathers reuse the signed implementations via reinterpretation.
inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
2327
v_lut(const short * tab,const int * idx)2328 inline v_int16x8 v_lut(const short* tab, const int* idx)
2329 {
2330 short CV_DECL_ALIGNED(32) elems[8] =
2331 {
2332 tab[idx[0]],
2333 tab[idx[1]],
2334 tab[idx[2]],
2335 tab[idx[3]],
2336 tab[idx[4]],
2337 tab[idx[5]],
2338 tab[idx[6]],
2339 tab[idx[7]]
2340 };
2341 return v_int16x8(vld1q_s16(elems));
2342 }
v_lut_pairs(const short * tab,const int * idx)2343 inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
2344 {
2345 short CV_DECL_ALIGNED(32) elems[8] =
2346 {
2347 tab[idx[0]],
2348 tab[idx[0] + 1],
2349 tab[idx[1]],
2350 tab[idx[1] + 1],
2351 tab[idx[2]],
2352 tab[idx[2] + 1],
2353 tab[idx[3]],
2354 tab[idx[3] + 1]
2355 };
2356 return v_int16x8(vld1q_s16(elems));
2357 }
// Gather two quads of consecutive 16-bit values with vector loads:
// lanes 0..3 = tab[idx[0]..idx[0]+3], lanes 4..7 = tab[idx[1]..idx[1]+3].
inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
{
    return v_int16x8(vcombine_s16(vld1_s16(tab + idx[0]), vld1_s16(tab + idx[1])));
}
// Unsigned 16-bit gathers reuse the signed implementations via reinterpretation.
inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
2365
v_lut(const int * tab,const int * idx)2366 inline v_int32x4 v_lut(const int* tab, const int* idx)
2367 {
2368 int CV_DECL_ALIGNED(32) elems[4] =
2369 {
2370 tab[idx[0]],
2371 tab[idx[1]],
2372 tab[idx[2]],
2373 tab[idx[3]]
2374 };
2375 return v_int32x4(vld1q_s32(elems));
2376 }
// Two pairs of consecutive ints combined into one register:
// lanes {tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1]}.
inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
{
    return v_int32x4(vcombine_s32(vld1_s32(tab + idx[0]), vld1_s32(tab + idx[1])));
}
// Four consecutive ints from a single base index: one full-width load.
inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
{
    return v_int32x4(vld1q_s32(tab + idx[0]));
}
// Unsigned 32-bit gathers reuse the signed implementations via reinterpretation.
inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
2388
// Gather two 64-bit integers element-wise; vcreate_s64 materializes a
// d-register directly from the 64-bit value's bit pattern.
inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
{
    return v_int64x2(vcombine_s64(vcreate_s64(tab[idx[0]]), vcreate_s64(tab[idx[1]])));
}
// Two consecutive 64-bit integers from one base index: one full-width load.
inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
{
    return v_int64x2(vld1q_s64(tab + idx[0]));
}
// Unsigned 64-bit gathers reuse the signed implementations via reinterpretation.
inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
2399
v_lut(const float * tab,const int * idx)2400 inline v_float32x4 v_lut(const float* tab, const int* idx)
2401 {
2402 float CV_DECL_ALIGNED(32) elems[4] =
2403 {
2404 tab[idx[0]],
2405 tab[idx[1]],
2406 tab[idx[2]],
2407 tab[idx[3]]
2408 };
2409 return v_float32x4(vld1q_f32(elems));
2410 }
// Gather 2 pairs of consecutive floats: each pair is fetched with a single
// 64-bit scalar load.
inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
{
    // tab + idx[i] is only guaranteed 4-byte aligned, so the 64-bit read
    // goes through a typedef whose alignment is forced down to 1 to keep
    // the compiler from assuming 8-byte alignment.
    typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64;

    uint64 CV_DECL_ALIGNED(32) elems[2] =
    {
        *(unaligned_uint64*)(tab + idx[0]),
        *(unaligned_uint64*)(tab + idx[1])
    };
    // Bits pass through unchanged: u64 lanes reinterpreted as 4 floats.
    return v_float32x4(vreinterpretq_f32_u64(vld1q_u64(elems)));
}
// Four consecutive floats from a single base index: one full-width load.
inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
{
    return v_float32x4(vld1q_f32(tab + idx[0]));
}
2426
// Gather 4 ints using indices held in a vector register.
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
    const int i0 = vgetq_lane_s32(idxvec.val, 0);
    const int i1 = vgetq_lane_s32(idxvec.val, 1);
    const int i2 = vgetq_lane_s32(idxvec.val, 2);
    const int i3 = vgetq_lane_s32(idxvec.val, 3);
    int CV_DECL_ALIGNED(32) elems[4] = { tab[i0], tab[i1], tab[i2], tab[i3] };
    return v_int32x4(vld1q_s32(elems));
}
2438
v_lut(const unsigned * tab,const v_int32x4 & idxvec)2439 inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
2440 {
2441 unsigned CV_DECL_ALIGNED(32) elems[4] =
2442 {
2443 tab[vgetq_lane_s32(idxvec.val, 0)],
2444 tab[vgetq_lane_s32(idxvec.val, 1)],
2445 tab[vgetq_lane_s32(idxvec.val, 2)],
2446 tab[vgetq_lane_s32(idxvec.val, 3)]
2447 };
2448 return v_uint32x4(vld1q_u32(elems));
2449 }
2450
v_lut(const float * tab,const v_int32x4 & idxvec)2451 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
2452 {
2453 float CV_DECL_ALIGNED(32) elems[4] =
2454 {
2455 tab[vgetq_lane_s32(idxvec.val, 0)],
2456 tab[vgetq_lane_s32(idxvec.val, 1)],
2457 tab[vgetq_lane_s32(idxvec.val, 2)],
2458 tab[vgetq_lane_s32(idxvec.val, 3)]
2459 };
2460 return v_float32x4(vld1q_f32(elems));
2461 }
2462
v_lut_deinterleave(const float * tab,const v_int32x4 & idxvec,v_float32x4 & x,v_float32x4 & y)2463 inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
2464 {
2465 /*int CV_DECL_ALIGNED(32) idx[4];
2466 v_store(idx, idxvec);
2467
2468 float32x4_t xy02 = vcombine_f32(vld1_f32(tab + idx[0]), vld1_f32(tab + idx[2]));
2469 float32x4_t xy13 = vcombine_f32(vld1_f32(tab + idx[1]), vld1_f32(tab + idx[3]));
2470
2471 float32x4x2_t xxyy = vuzpq_f32(xy02, xy13);
2472 x = v_float32x4(xxyy.val[0]);
2473 y = v_float32x4(xxyy.val[1]);*/
2474 int CV_DECL_ALIGNED(32) idx[4];
2475 v_store_aligned(idx, idxvec);
2476
2477 x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
2478 y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
2479 }
2480
// Byte shuffles via vtbl1 on each 8-byte half. The 64-bit constants list
// source lane indices, least-significant byte first.
// 0x0705060403010200 -> lanes 0,2,1,3,4,6,5,7 per half: swaps the inner
// elements of each group of four, interleaving adjacent pairs.
inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
{
    return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0705060403010200)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0705060403010200))));
}
inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
// 0x0703060205010400 -> lanes 0,4,1,5,2,6,3,7 per half: interleaves the two
// quads of each 8-byte half.
inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
{
    return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0703060205010400)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0703060205010400))));
}
inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
2491
// 16-bit pair interleave implemented as a byte shuffle:
// 0x0706030205040100 moves byte pairs (01)(45)(23)(67), i.e. 16-bit lanes
// 0,2,1,3 within each half — swapping the middle elements of each quad.
inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
{
    return v_int16x8(vreinterpretq_s16_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)), vtbl1_s8(vget_high_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)))));
}
inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
// Quad interleave is a straight zip of the two 4-lane halves:
// lanes 0,4,1,5,2,6,3,7.
inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
{
    int16x4x2_t res = vzip_s16(vget_low_s16(vec.val), vget_high_s16(vec.val));
    return v_int16x8(vcombine_s16(res.val[0], res.val[1]));
}
inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
2503
// 32-bit pair interleave: zip the two 2-lane halves, yielding lanes 0,2,1,3.
inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
{
    int32x2x2_t res = vzip_s32(vget_low_s32(vec.val), vget_high_s32(vec.val));
    return v_int32x4(vcombine_s32(res.val[0], res.val[1]));
}
// Unsigned and float variants reuse the signed shuffle via reinterpretation.
inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
2511
// Pack the first three elements of each group of four toward the low lanes
// (e.g. lanes 0,1,2,4,5,6,... become contiguous). The vtbl constants list
// source byte indices LSB-first; the two placeholder bytes produced at the
// front are then discarded by vextq_s8(..., 2).
// NOTE(review): the resulting tail lanes appear unspecified — confirm that
// callers only consume the packed prefix.
inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
{
    return v_int8x16(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0605040201000000)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0807060504020100))), vdupq_n_s8(0), 2));
}
inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }

// Same packing for 16-bit lanes: compacts lanes 0,1,2 and 4,5,6 into the
// low six lanes via a byte shuffle on the low half plus a 2-byte extract.
inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
{
    return v_int16x8(vreinterpretq_s16_s8(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0504030201000000)), vget_high_s8(vreinterpretq_s8_s16(vec.val))), vdupq_n_s8(0), 2)));
}
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }

// For 32-bit lanes a 128-bit register holds exactly one triplet (plus pad),
// so packing is the identity.
inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
2527
#if CV_SIMD128_64F
// Gather two doubles element-wise: result[i] = tab[idx[i]].
inline v_float64x2 v_lut(const double* tab, const int* idx)
{
    double CV_DECL_ALIGNED(32) elems[2] =
    {
        tab[idx[0]],
        tab[idx[1]]
    };
    return v_float64x2(vld1q_f64(elems));
}

// Two consecutive doubles from a single base index: one full-width load.
inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
{
    return v_float64x2(vld1q_f64(tab + idx[0]));
}

// Gather two doubles using the two low index lanes of idxvec
// (lanes 2 and 3 are ignored).
inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
    double CV_DECL_ALIGNED(32) elems[2] =
    {
        tab[vgetq_lane_s32(idxvec.val, 0)],
        tab[vgetq_lane_s32(idxvec.val, 1)],
    };
    return v_float64x2(vld1q_f64(elems));
}

// Gather two (x, y) double pairs: x[i] = tab[idx[i]], y[i] = tab[idx[i] + 1].
// Only the two low lanes of idxvec are used.
inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);

    x = v_float64x2(tab[idx[0]], tab[idx[1]]);
    y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
}
#endif
2563
////// FP16 support ///////
#if CV_FP16
// Load 4 half-precision values and widen them to float32 in hardware.
inline v_float32x4 v_load_expand(const float16_t* ptr)
{
    float16x4_t v =
#ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro
    (float16x4_t)vld1_s16((const short*)ptr);
#else
    vld1_f16((const __fp16*)ptr);
#endif
    return v_float32x4(vcvt_f32_f16(v));
}

// Narrow 4 float32 values to half precision and store them.
inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
    float16x4_t hv = vcvt_f16_f32(v.val);

#ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro
    vst1_s16((short*)ptr, (int16x4_t)hv);
#else
    vst1_f16((__fp16*)ptr, hv);
#endif
}
#else
// Scalar fallbacks used when hardware FP16 conversion is unavailable;
// float16_t performs the per-element conversions.
inline v_float32x4 v_load_expand(const float16_t* ptr)
{
    const int N = 4;
    float buf[N];
    for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i];
    return v_load(buf);
}

inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
    const int N = 4;
    float buf[N];
    v_store(buf, v);
    for( int i = 0; i < N; i++ ) ptr[i] = float16_t(buf[i]);
}
#endif
2604
inline void v_cleanup() {} // no per-use SIMD state to reset on NEON (unlike e.g. x86 _mm_empty)
2606
2607 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
2608
2609 //! @endcond
2610
2611 }
2612
2613 #endif
2614