/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef OPENCV_HAL_INTRIN_NEON_HPP
#define OPENCV_HAL_INTRIN_NEON_HPP

#include <algorithm>
#include "opencv2/core/utility.hpp"

namespace cv
{

//! @cond IGNORED

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

#define CV_SIMD128 1
#if defined(__aarch64__) || defined(_M_ARM64)
#define CV_SIMD128_64F 1
#else
#define CV_SIMD128_64F 0
#endif

// The following macro checks if the code is being compiled for the
// AArch64 execution state of Armv8, to enable the 128-bit
// intrinsics. The macro `__ARM_64BIT_STATE` is the one recommended by
// the Arm C Language Extension (ACLE) specifications [1] to check the
// availability of 128-bit intrinsics, and it is supported by clang
// and gcc. The macro `_M_ARM64` is the equivalent one for Microsoft
// Visual Studio [2].
//
// [1] https://developer.arm.com/documentation/101028/0012/13--Advanced-SIMD--Neon--intrinsics
// [2] https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros
#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
#define CV_NEON_AARCH64 1
#else
#define CV_NEON_AARCH64 0
#endif
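
// Illustrative sketch (not part of this header's API): CV_NEON_AARCH64 is used below to
// guard intrinsics that only exist in the AArch64 execution state, with a plain NEON
// fallback otherwise. The horizontal add of a float32x4_t `v` is just an assumed example:
//
//   #if CV_NEON_AARCH64
//       float s = vaddvq_f32(v);                  // A64-only across-vector add
//   #else
//       float32x2_t t = vadd_f32(vget_low_f32(v), vget_high_f32(v));
//       t = vpadd_f32(t, t);                      // pairwise add of the two halves
//       float s = vget_lane_f32(t, 0);
//   #endif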

// TODO
#define CV_NEON_DOT 0

//////////// Utils ////////////

#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) \
    inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
    { c = vuzp1q_##suffix(a, b); d = vuzp2q_##suffix(a, b); }
#define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) \
    inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
    { c = vuzp1_##suffix(a, b); d = vuzp2_##suffix(a, b); }
#else
#define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) \
    inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
    { _Tpvx2 ab = vuzpq_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; }
#define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) \
    inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
    { _Tpvx2 ab = vuzp_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; }
#endif

#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) \
    template <typename T> static inline \
    _Tpv vreinterpretq_##suffix##_f64(T a) { return (_Tpv) a; } \
    template <typename T> static inline \
    float64x2_t vreinterpretq_f64_##suffix(T a) { return (float64x2_t) a; }
#else
#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix)
#endif

#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(_Tpv, _Tpvl, suffix) \
    OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix) \
    OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpvl##_t, _Tpvl##x2_t, suffix) \
    OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix)

#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(_Tpv, _Tpvl, suffix) \
    OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix)

#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(_Tpv, _Tpvl, suffix) \
    OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix)

OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint8x16, uint8x8,  u8)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int8x16,  int8x8,   s8)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint16x8, uint16x4, u16)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int16x8,  int16x4,  s16)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint32x4, uint32x2, u32)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int32x4,  int32x2,  s32)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(float32x4, float32x2, f32)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(uint64x2, uint64x1, u64)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(int64x2,  int64x1,  s64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(float64x2, float64x1, f64)
#endif

//////////// Types ////////////

struct v_uint8x16
{
    typedef uchar lane_type;
    enum { nlanes = 16 };

    v_uint8x16() {}
    explicit v_uint8x16(uint8x16_t v) : val(v) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = vld1q_u8(v);
    }
    uchar get0() const
    {
        return vgetq_lane_u8(val, 0);
    }

    uint8x16_t val;
};

struct v_int8x16
{
    typedef schar lane_type;
    enum { nlanes = 16 };

    v_int8x16() {}
    explicit v_int8x16(int8x16_t v) : val(v) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = vld1q_s8(v);
    }
    schar get0() const
    {
        return vgetq_lane_s8(val, 0);
    }

    int8x16_t val;
};

struct v_uint16x8
{
    typedef ushort lane_type;
    enum { nlanes = 8 };

    v_uint16x8() {}
    explicit v_uint16x8(uint16x8_t v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = vld1q_u16(v);
    }
    ushort get0() const
    {
        return vgetq_lane_u16(val, 0);
    }

    uint16x8_t val;
};

struct v_int16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };

    v_int16x8() {}
    explicit v_int16x8(int16x8_t v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = vld1q_s16(v);
    }
    short get0() const
    {
        return vgetq_lane_s16(val, 0);
    }

    int16x8_t val;
};

struct v_uint32x4
{
    typedef unsigned lane_type;
    enum { nlanes = 4 };

    v_uint32x4() {}
    explicit v_uint32x4(uint32x4_t v) : val(v) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        unsigned v[] = {v0, v1, v2, v3};
        val = vld1q_u32(v);
    }
    unsigned get0() const
    {
        return vgetq_lane_u32(val, 0);
    }

    uint32x4_t val;
};

struct v_int32x4
{
    typedef int lane_type;
    enum { nlanes = 4 };

    v_int32x4() {}
    explicit v_int32x4(int32x4_t v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        int v[] = {v0, v1, v2, v3};
        val = vld1q_s32(v);
    }
    int get0() const
    {
        return vgetq_lane_s32(val, 0);
    }
    int32x4_t val;
};

struct v_float32x4
{
    typedef float lane_type;
    enum { nlanes = 4 };

    v_float32x4() {}
    explicit v_float32x4(float32x4_t v) : val(v) {}
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        float v[] = {v0, v1, v2, v3};
        val = vld1q_f32(v);
    }
    float get0() const
    {
        return vgetq_lane_f32(val, 0);
    }
    float32x4_t val;
};

struct v_uint64x2
{
    typedef uint64 lane_type;
    enum { nlanes = 2 };

    v_uint64x2() {}
    explicit v_uint64x2(uint64x2_t v) : val(v) {}
    v_uint64x2(uint64 v0, uint64 v1)
    {
        uint64 v[] = {v0, v1};
        val = vld1q_u64(v);
    }
    uint64 get0() const
    {
        return vgetq_lane_u64(val, 0);
    }
    uint64x2_t val;
};

struct v_int64x2
{
    typedef int64 lane_type;
    enum { nlanes = 2 };

    v_int64x2() {}
    explicit v_int64x2(int64x2_t v) : val(v) {}
    v_int64x2(int64 v0, int64 v1)
    {
        int64 v[] = {v0, v1};
        val = vld1q_s64(v);
    }
    int64 get0() const
    {
        return vgetq_lane_s64(val, 0);
    }
    int64x2_t val;
};

#if CV_SIMD128_64F
struct v_float64x2
{
    typedef double lane_type;
    enum { nlanes = 2 };

    v_float64x2() {}
    explicit v_float64x2(float64x2_t v) : val(v) {}
    v_float64x2(double v0, double v1)
    {
        double v[] = {v0, v1};
        val = vld1q_f64(v);
    }
    double get0() const
    {
        return vgetq_lane_f64(val, 0);
    }
    float64x2_t val;
};
#endif
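
// Illustrative sketch (assumed values, not part of the HAL API): each v_* wrapper above
// holds one NEON register in `val`; the lane constructor loads its arguments in order
// and get0() reads lane 0.
//
//   v_float32x4 v(1.f, 2.f, 3.f, 4.f);   // val = vld1q_f32 of {1, 2, 3, 4}
//   float first = v.get0();              // vgetq_lane_f32(v.val, 0) == 1.f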

#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \
inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \
inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \
inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \
inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vreinterpretq_u64_##suffix(v.val)); } \
inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \
inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); }

OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_INIT_64(_Tpv, suffix) \
inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(vreinterpretq_f64_##suffix(v.val)); }
OPENCV_HAL_IMPL_NEON_INIT(float64x2, double, f64)
OPENCV_HAL_IMPL_NEON_INIT_64(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_INIT_64(int8x16, s8)
OPENCV_HAL_IMPL_NEON_INIT_64(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_INIT_64(int16x8, s16)
OPENCV_HAL_IMPL_NEON_INIT_64(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_INIT_64(int32x4, s32)
OPENCV_HAL_IMPL_NEON_INIT_64(uint64x2, u64)
OPENCV_HAL_IMPL_NEON_INIT_64(int64x2, s64)
OPENCV_HAL_IMPL_NEON_INIT_64(float32x4, f32)
OPENCV_HAL_IMPL_NEON_INIT_64(float64x2, f64)
#endif

#define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, pack, mov, rshr) \
inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
    hreg a1 = mov(a.val), b1 = mov(b.val); \
    return _Tpvec(vcombine_##suffix(a1, b1)); \
} \
inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
    hreg a1 = mov(a.val); \
    vst1_##suffix(ptr, a1); \
} \
template<int n> inline \
_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
    hreg a1 = rshr(a.val, n); \
    hreg b1 = rshr(b.val, n); \
    return _Tpvec(vcombine_##suffix(a1, b1)); \
} \
template<int n> inline \
void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
    hreg a1 = rshr(a.val, n); \
    vst1_##suffix(ptr, a1); \
}

OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, pack, vqmovn_u16, vqrshrn_n_u16)
OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, pack, vqmovn_s16, vqrshrn_n_s16)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, pack, vqmovn_u32, vqrshrn_n_u32)
OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, pack, vqmovn_s32, vqrshrn_n_s32)
OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, pack, vmovn_u64, vrshrn_n_u64)
OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, pack, vmovn_s64, vrshrn_n_s64)

OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, pack_u, vqmovun_s16, vqrshrun_n_s16)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, pack_u, vqmovun_s32, vqrshrun_n_s32)
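
// Illustrative sketch (assumed values, not extra API): v_pack narrows two wide vectors
// into one with saturation; v_rshr_pack<n> applies a rounding right shift by n bits
// before the saturating narrow.
//
//   v_int16x8 a = v_setall_s16(300), b = v_setall_s16(-300);
//   v_int8x16 p = v_pack(a, b);          // lanes saturate: 300 -> 127, -300 -> -128
//   v_int8x16 q = v_rshr_pack<2>(a, b);  // rounding shift first: 300 -> 75, -300 -> -75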

// pack boolean
inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
{
    uint8x16_t ab = vcombine_u8(vmovn_u16(a.val), vmovn_u16(b.val));
    return v_uint8x16(ab);
}

inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
                           const v_uint32x4& c, const v_uint32x4& d)
{
    uint16x8_t nab = vcombine_u16(vmovn_u32(a.val), vmovn_u32(b.val));
    uint16x8_t ncd = vcombine_u16(vmovn_u32(c.val), vmovn_u32(d.val));
    return v_uint8x16(vcombine_u8(vmovn_u16(nab), vmovn_u16(ncd)));
}

inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
                           const v_uint64x2& g, const v_uint64x2& h)
{
    uint32x4_t ab = vcombine_u32(vmovn_u64(a.val), vmovn_u64(b.val));
    uint32x4_t cd = vcombine_u32(vmovn_u64(c.val), vmovn_u64(d.val));
    uint32x4_t ef = vcombine_u32(vmovn_u64(e.val), vmovn_u64(f.val));
    uint32x4_t gh = vcombine_u32(vmovn_u64(g.val), vmovn_u64(h.val));

    uint16x8_t abcd = vcombine_u16(vmovn_u32(ab), vmovn_u32(cd));
    uint16x8_t efgh = vcombine_u16(vmovn_u32(ef), vmovn_u32(gh));
    return v_uint8x16(vcombine_u8(vmovn_u16(abcd), vmovn_u16(efgh)));
}

inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
    float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
    res = vmlaq_lane_f32(res, m1.val, vl, 1);
    res = vmlaq_lane_f32(res, m2.val, vh, 0);
    res = vmlaq_lane_f32(res, m3.val, vh, 1);
    return v_float32x4(res);
}

inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
    float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
    res = vmlaq_lane_f32(res, m1.val, vl, 1);
    res = vmlaq_lane_f32(res, m2.val, vh, 0);
    res = vaddq_f32(res, a.val);
    return v_float32x4(res);
}
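
// Worked form of the two routines above (a sketch of the math, no extra API): treating
// m0..m3 as the columns of a 4x4 matrix M, v_matmul computes M * v lane-wise,
//
//   res[i] = v[0]*m0[i] + v[1]*m1[i] + v[2]*m2[i] + v[3]*m3[i],
//
// and v_matmuladd drops the m3 term and adds the bias vector `a` instead:
//
//   res[i] = v[0]*m0[i] + v[1]*m1[i] + v[2]*m2[i] + a[i].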

#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    a.val = intrin(a.val, b.val); \
    return a; \
}

OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint32x4, vaddq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint32x4, vsubq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint32x4, vmulq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float32x4, vdivq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float64x2, vaddq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float64x2, vsubq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float64x2, vmulq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float64x2, vdivq_f64)
#else
inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
{
    float32x4_t reciprocal = vrecpeq_f32(b.val);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    return v_float32x4(vmulq_f32(a.val, reciprocal));
}
inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
{
    float32x4_t reciprocal = vrecpeq_f32(b.val);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    a.val = vmulq_f32(a.val, reciprocal);
    return a;
}
#endif
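
// Note on the A32 fallback above (a sketch of the math, no extra API): without a vector
// divide instruction, a / b is computed as a * (1/b), with 1/b refined by two
// Newton-Raphson steps. vrecpeq_f32 gives an initial estimate x0 ~ 1/b and
// vrecpsq_f32(b, x) returns (2 - b*x), so each refinement line performs
//
//   x_{k+1} = x_k * (2 - b * x_k),
//
// roughly doubling the number of accurate bits per step.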

// saturating multiply 8-bit, 16-bit
#define OPENCV_HAL_IMPL_NEON_MUL_SAT(_Tpvec, _Tpwvec)            \
    inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b)  \
    {                                                            \
        _Tpwvec c, d;                                            \
        v_mul_expand(a, b, c, d);                                \
        return v_pack(c, d);                                     \
    }                                                            \
    inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b)      \
    { a = a * b; return a; }

OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int8x16,  v_int16x8)
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint8x16, v_uint16x8)
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int16x8,  v_int32x4)
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint16x8, v_uint32x4)

//  Multiply and expand
inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
                         v_int16x8& c, v_int16x8& d)
{
    c.val = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
#if CV_NEON_AARCH64
    d.val = vmull_high_s8(a.val, b.val);
#else // #if CV_NEON_AARCH64
    d.val = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
#endif // #if CV_NEON_AARCH64
}

inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
                         v_uint16x8& c, v_uint16x8& d)
{
    c.val = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
#if CV_NEON_AARCH64
    d.val = vmull_high_u8(a.val, b.val);
#else // #if CV_NEON_AARCH64
    d.val = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
#endif // #if CV_NEON_AARCH64
}

inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
    c.val = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
#if CV_NEON_AARCH64
    d.val = vmull_high_s16(a.val, b.val);
#else // #if CV_NEON_AARCH64
    d.val = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
#endif // #if CV_NEON_AARCH64
}

inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                         v_uint32x4& c, v_uint32x4& d)
{
    c.val = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
#if CV_NEON_AARCH64
    d.val = vmull_high_u16(a.val, b.val);
#else // #if CV_NEON_AARCH64
    d.val = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
#endif // #if CV_NEON_AARCH64
}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                         v_uint64x2& c, v_uint64x2& d)
{
    c.val = vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val));
#if CV_NEON_AARCH64
    d.val = vmull_high_u32(a.val, b.val);
#else // #if CV_NEON_AARCH64
    d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
#endif // #if CV_NEON_AARCH64
}

inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{
    return v_int16x8(vcombine_s16(
                                  vshrn_n_s32(vmull_s16( vget_low_s16(a.val),  vget_low_s16(b.val)), 16),
                                  vshrn_n_s32(
#if CV_NEON_AARCH64
                                    vmull_high_s16(a.val, b.val)
#else // #if CV_NEON_AARCH64
                                    vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val))
#endif // #if CV_NEON_AARCH64
                                    , 16)
                                 ));
}
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{
    return v_uint16x8(vcombine_u16(
                                   vshrn_n_u32(vmull_u16( vget_low_u16(a.val),  vget_low_u16(b.val)), 16),
                                   vshrn_n_u32(
#if CV_NEON_AARCH64
                                    vmull_high_u16(a.val, b.val)
#else // #if CV_NEON_AARCH64
                                    vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val))
#endif // #if CV_NEON_AARCH64
                                    , 16)
                                  ));
}
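
// Illustrative sketch (assumed values, not extra API): v_mul_expand returns the full
// widened products (low half of the lanes in c, high half in d), while v_mul_hi keeps
// only the top 16 bits of each 32-bit product, i.e. result[i] = (a[i] * b[i]) >> 16.
//
//   v_uint16x8 a = v_setall_u16(1000), b = v_setall_u16(70);
//   v_uint16x8 hi = v_mul_hi(a, b);   // 1000*70 = 70000 = 0x11170 -> high 16 bits = 1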

//////// Dot Product ////////

// 16 >> 32
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    int16x8_t uzp1, uzp2;
    _v128_unzip(a.val, b.val, uzp1, uzp2);
    int16x4_t a0 = vget_low_s16(uzp1);
    int16x4_t b0 = vget_high_s16(uzp1);
    int16x4_t a1 = vget_low_s16(uzp2);
    int16x4_t b1 = vget_high_s16(uzp2);
    int32x4_t p = vmull_s16(a0, b0);
    return v_int32x4(vmlal_s16(p, a1, b1));
}
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    int16x8_t uzp1, uzp2;
    _v128_unzip(a.val, b.val, uzp1, uzp2);
    int16x4_t a0 = vget_low_s16(uzp1);
    int16x4_t b0 = vget_high_s16(uzp1);
    int16x4_t a1 = vget_low_s16(uzp2);
    int16x4_t b1 = vget_high_s16(uzp2);
    int32x4_t p = vmlal_s16(c.val, a0, b0);
    return v_int32x4(vmlal_s16(p, a1, b1));
}

// 32 >> 64
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{
    int32x4_t uzp1, uzp2;
    _v128_unzip(a.val, b.val, uzp1, uzp2);
    int32x2_t a0 = vget_low_s32(uzp1);
    int32x2_t b0 = vget_high_s32(uzp1);
    int32x2_t a1 = vget_low_s32(uzp2);
    int32x2_t b1 = vget_high_s32(uzp2);
    int64x2_t p = vmull_s32(a0, b0);
    return v_int64x2(vmlal_s32(p, a1, b1));
}
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
    int32x4_t uzp1, uzp2;
    _v128_unzip(a.val, b.val, uzp1, uzp2);
    int32x2_t a0 = vget_low_s32(uzp1);
    int32x2_t b0 = vget_high_s32(uzp1);
    int32x2_t a1 = vget_low_s32(uzp2);
    int32x2_t b1 = vget_high_s32(uzp2);
    int64x2_t p = vmlal_s32(c.val, a0, b0);
    return v_int64x2(vmlal_s32(p, a1, b1));
}
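
// Worked form of the pairwise dot products above (a sketch, no extra API): the unzip
// separates even- and odd-indexed lanes so that each widened output lane is
//
//   dst[i] = a[2*i] * b[2*i] + a[2*i+1] * b[2*i+1]   (+ c[i] for the accumulating overload),
//
// i.e. adjacent input lanes are reduced in order, whereas the *_fast variants further
// below are free to pair lanes differently.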

// 8 >> 32
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
{
#if CV_NEON_DOT
    return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val));
#else
    const uint8x16_t zero   = vreinterpretq_u8_u32(vdupq_n_u32(0));
    const uint8x16_t mask   = vreinterpretq_u8_u32(vdupq_n_u32(0x00FF00FF));
    const uint16x8_t zero32 = vreinterpretq_u16_u32(vdupq_n_u32(0));
    const uint16x8_t mask32 = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF));

    uint16x8_t even = vmulq_u16(vreinterpretq_u16_u8(vbslq_u8(mask, a.val, zero)),
                                vreinterpretq_u16_u8(vbslq_u8(mask, b.val, zero)));
    uint16x8_t odd  = vmulq_u16(vshrq_n_u16(vreinterpretq_u16_u8(a.val), 8),
                                vshrq_n_u16(vreinterpretq_u16_u8(b.val), 8));

    uint32x4_t s0 = vaddq_u32(vreinterpretq_u32_u16(vbslq_u16(mask32, even, zero32)),
                              vreinterpretq_u32_u16(vbslq_u16(mask32, odd,  zero32)));
    uint32x4_t s1 = vaddq_u32(vshrq_n_u32(vreinterpretq_u32_u16(even), 16),
                              vshrq_n_u32(vreinterpretq_u32_u16(odd),  16));
    return v_uint32x4(vaddq_u32(s0, s1));
#endif
}
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
                                   const v_uint32x4& c)
{
#if CV_NEON_DOT
    return v_uint32x4(vdotq_u32(c.val, a.val, b.val));
#else
    return v_dotprod_expand(a, b) + c;
#endif
}

inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
#if CV_NEON_DOT
    return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val));
#else
    int16x8_t p0  = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
    int16x8_t p1  = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
    int16x8_t uzp1, uzp2;
    _v128_unzip(p0, p1, uzp1, uzp2);
    int16x8_t sum = vaddq_s16(uzp1, uzp2);
    int16x4_t uzpl1, uzpl2;
    _v128_unzip(vget_low_s16(sum), vget_high_s16(sum), uzpl1, uzpl2);
    return v_int32x4(vaddl_s16(uzpl1, uzpl2));
#endif
}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
                                  const v_int32x4& c)
{
#if CV_NEON_DOT
    return v_int32x4(vdotq_s32(c.val, a.val, b.val));
#else
    return v_dotprod_expand(a, b) + c;
#endif
}

// 16 >> 64
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
{
    const uint16x8_t zero = vreinterpretq_u16_u32(vdupq_n_u32(0));
    const uint16x8_t mask = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF));

    uint32x4_t even = vmulq_u32(vreinterpretq_u32_u16(vbslq_u16(mask, a.val, zero)),
                                vreinterpretq_u32_u16(vbslq_u16(mask, b.val, zero)));
    uint32x4_t odd  = vmulq_u32(vshrq_n_u32(vreinterpretq_u32_u16(a.val), 16),
                                vshrq_n_u32(vreinterpretq_u32_u16(b.val), 16));
    uint32x4_t uzp1, uzp2;
    _v128_unzip(even, odd, uzp1, uzp2);
    uint64x2_t s0  = vaddl_u32(vget_low_u32(uzp1), vget_high_u32(uzp1));
    uint64x2_t s1  = vaddl_u32(vget_low_u32(uzp2), vget_high_u32(uzp2));
    return v_uint64x2(vaddq_u64(s0, s1));
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand(a, b) + c; }

inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
    int32x4_t p0  = vmull_s16(vget_low_s16(a.val),  vget_low_s16(b.val));
    int32x4_t p1  = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));

    int32x4_t uzp1, uzp2;
    _v128_unzip(p0, p1, uzp1, uzp2);
    int32x4_t sum = vaddq_s32(uzp1, uzp2);

    int32x2_t uzpl1, uzpl2;
    _v128_unzip(vget_low_s32(sum), vget_high_s32(sum), uzpl1, uzpl2);
    return v_int64x2(vaddl_s32(uzpl1, uzpl2));
}
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
                                  const v_int64x2& c)
{ return v_dotprod_expand(a, b) + c; }

// 32 >> 64f
#if CV_SIMD128_64F
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
                                    const v_float64x2& c)
{ return v_dotprod_expand(a, b) + c; }
#endif

//////// Fast Dot Product ////////

// 16 >> 32
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
{
#if CV_NEON_AARCH64
    int32x4_t p = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    return v_int32x4(vmlal_high_s16(p, a.val, b.val));
#else
    int16x4_t a0 = vget_low_s16(a.val);
    int16x4_t a1 = vget_high_s16(a.val);
    int16x4_t b0 = vget_low_s16(b.val);
    int16x4_t b1 = vget_high_s16(b.val);
    int32x4_t p = vmull_s16(a0, b0);
    return v_int32x4(vmlal_s16(p, a1, b1));
#endif
}
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
#if CV_NEON_AARCH64
    int32x4_t p = vmlal_s16(c.val, vget_low_s16(a.val), vget_low_s16(b.val));
    return v_int32x4(vmlal_high_s16(p, a.val, b.val));
#else
    int16x4_t a0 = vget_low_s16(a.val);
    int16x4_t a1 = vget_high_s16(a.val);
    int16x4_t b0 = vget_low_s16(b.val);
    int16x4_t b1 = vget_high_s16(b.val);
    int32x4_t p = vmlal_s16(c.val, a0, b0);
    return v_int32x4(vmlal_s16(p, a1, b1));
#endif
}

// 32 >> 64
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
{
#if CV_NEON_AARCH64
    int64x2_t p = vmull_s32(vget_low_s32(a.val), vget_low_s32(b.val));
    return v_int64x2(vmlal_high_s32(p, a.val, b.val));
#else
    int32x2_t a0 = vget_low_s32(a.val);
    int32x2_t a1 = vget_high_s32(a.val);
    int32x2_t b0 = vget_low_s32(b.val);
    int32x2_t b1 = vget_high_s32(b.val);
    int64x2_t p = vmull_s32(a0, b0);
    return v_int64x2(vmlal_s32(p, a1, b1));
#endif
}
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
#if CV_NEON_AARCH64
    int64x2_t p = vmlal_s32(c.val, vget_low_s32(a.val), vget_low_s32(b.val));
    return v_int64x2(vmlal_high_s32(p, a.val, b.val));
#else
    int32x2_t a0 = vget_low_s32(a.val);
    int32x2_t a1 = vget_high_s32(a.val);
    int32x2_t b0 = vget_low_s32(b.val);
    int32x2_t b1 = vget_high_s32(b.val);
    int64x2_t p = vmlal_s32(c.val, a0, b0);
    return v_int64x2(vmlal_s32(p, a1, b1));
#endif
}

// 8 >> 32
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
{
#if CV_NEON_DOT
    return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val));
#else
    uint16x8_t p0 = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
    uint16x8_t p1 = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
    uint32x4_t s0 = vaddl_u16(vget_low_u16(p0), vget_low_u16(p1));
    uint32x4_t s1 = vaddl_u16(vget_high_u16(p0), vget_high_u16(p1));
    return v_uint32x4(vaddq_u32(s0, s1));
#endif
}
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{
#if CV_NEON_DOT
    return v_uint32x4(vdotq_u32(c.val, a.val, b.val));
#else
    return v_dotprod_expand_fast(a, b) + c;
#endif
}

inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{
#if CV_NEON_DOT
    return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val));
#else
    int16x8_t prod = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
    prod = vmlal_s8(prod, vget_high_s8(a.val), vget_high_s8(b.val));
    return v_int32x4(vaddl_s16(vget_low_s16(prod), vget_high_s16(prod)));
#endif
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{
#if CV_NEON_DOT
    return v_int32x4(vdotq_s32(c.val, a.val, b.val));
#else
    return v_dotprod_expand_fast(a, b) + c;
#endif
}

// 16 >> 64
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
{
    uint32x4_t p0  = vmull_u16(vget_low_u16(a.val),  vget_low_u16(b.val));
    uint32x4_t p1  = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
    uint64x2_t s0  = vaddl_u32(vget_low_u32(p0), vget_high_u32(p0));
    uint64x2_t s1  = vaddl_u32(vget_low_u32(p1), vget_high_u32(p1));
    return v_uint64x2(vaddq_u64(s0, s1));
}
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }

inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{
    int32x4_t prod = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    prod = vmlal_s16(prod, vget_high_s16(a.val), vget_high_s16(b.val));
    return v_int64x2(vaddl_s32(vget_low_s32(prod), vget_high_s32(prod)));
}
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }

// 32 >> 64f
#if CV_SIMD128_64F
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod_fast(a, b)); }
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }
#endif


#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
    OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
    OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
    OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \
    inline _Tpvec operator ~ (const _Tpvec& a) \
    { \
        return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \
    }

OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint8x16, u8)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint16x8, u16)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint32x4, u32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)

#define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
    return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
    a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \
    return a; \
}

OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32)

inline v_float32x4 operator ~ (const v_float32x4& a)
{
    return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));
}

#if CV_SIMD128_64F
inline v_float32x4 v_sqrt(const v_float32x4& x)
{
    return v_float32x4(vsqrtq_f32(x.val));
}

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    v_float32x4 one = v_setall_f32(1.0f);
    return one / v_sqrt(x);
}
#else
inline v_float32x4 v_sqrt(const v_float32x4& x)
{
    float32x4_t x1 = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN));
    float32x4_t e = vrsqrteq_f32(x1);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    return v_float32x4(vmulq_f32(x.val, e));
}

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    float32x4_t e = vrsqrteq_f32(x.val);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
    return v_float32x4(e);
}
#endif
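
// Note on the A32 fallback above (a sketch of the math, no extra API): vrsqrteq_f32
// gives an initial estimate e0 ~ 1/sqrt(x), and vrsqrtsq_f32(x*e, e) returns
// (3 - x*e*e) / 2, so each refinement line is a Newton-Raphson step
//
//   e_{k+1} = e_k * (3 - x * e_k^2) / 2.
//
// v_sqrt then forms sqrt(x) as x * (1/sqrt(x)), clamping the input to FLT_MIN first so
// that x == 0 does not produce 0 * inf = NaN.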

#define OPENCV_HAL_IMPL_NEON_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
inline _Tpuvec v_abs(const _Tpsvec& a) { return v_reinterpret_as_##usuffix(_Tpsvec(vabsq_##ssuffix(a.val))); }

OPENCV_HAL_IMPL_NEON_ABS(v_uint8x16, v_int8x16, u8, s8)
OPENCV_HAL_IMPL_NEON_ABS(v_uint16x8, v_int16x8, u16, s16)
OPENCV_HAL_IMPL_NEON_ABS(v_uint32x4, v_int32x4, u32, s32)

inline v_float32x4 v_abs(v_float32x4 x)
{ return v_float32x4(vabsq_f32(x.val)); }

#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(bin_op, intrin) \
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
{ \
    return v_float64x2(vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val)))); \
} \
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
{ \
    a.val = vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val))); \
    return a; \
}

OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(&, vandq_s64)
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(|, vorrq_s64)
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(^, veorq_s64)

inline v_float64x2 operator ~ (const v_float64x2& a)
{
    return v_float64x2(vreinterpretq_f64_s32(vmvnq_s32(vreinterpretq_s32_f64(a.val))));
}

inline v_float64x2 v_sqrt(const v_float64x2& x)
{
    return v_float64x2(vsqrtq_f64(x.val));
}

inline v_float64x2 v_invsqrt(const v_float64x2& x)
{
    v_float64x2 one = v_setall_f64(1.0f);
    return one / v_sqrt(x);
}

inline v_float64x2 v_abs(v_float64x2 x)
{ return v_float64x2(vabsq_f64(x.val)); }
#endif

// TODO: exp, log, sin, cos

#define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
}

OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_min, vminq_f64)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_max, vmaxq_f64)
#endif

#if CV_SIMD128_64F
inline int64x2_t vmvnq_s64(int64x2_t a)
{
    int64x2_t vx = vreinterpretq_s64_u32(vdupq_n_u32(0xFFFFFFFF));
    return veorq_s64(a, vx);
}
inline uint64x2_t vmvnq_u64(uint64x2_t a)
{
    uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
    return veorq_u64(a, vx);
}
#endif
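// Note (descriptive only): NEON has no 64-bit vector NOT instruction, so the two helpers
// above emulate vmvnq for 64-bit lanes by XOR-ing with an all-ones register; they back
// the operator != instantiations for the 64-bit types below.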
#define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); }

OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint64x2, OPENCV_HAL_NOP, u64, u64)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float64x2, vreinterpretq_f64_u64, f64, u64)
#endif

inline v_float32x4 v_not_nan(const v_float32x4& a)
{ return v_float32x4(vreinterpretq_f32_u32(vceqq_f32(a.val, a.val))); }
#if CV_SIMD128_64F
inline v_float64x2 v_not_nan(const v_float64x2& a)
{ return v_float64x2(vreinterpretq_f64_u64(vceqq_f64(a.val, a.val))); }
#endif

OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_mul_wrap, vmulq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_mul_wrap, vmulq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_mul_wrap, vmulq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mul_wrap, vmulq_s16)

OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_absdiff, vabdq_f64)
#endif

/** Saturating absolute difference **/
inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
{ return v_int8x16(vqabsq_s8(vqsubq_s8(a.val, b.val))); }
inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
{ return v_int16x8(vqabsq_s16(vqsubq_s16(a.val, b.val))); }

#define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec2(cast(intrin(a.val, b.val))); \
}

OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int8x16, v_uint8x16, vreinterpretq_u8_s8, v_absdiff, vabdq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int16x8, v_uint16x8, vreinterpretq_u16_s16, v_absdiff, vabdq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int32x4, v_uint32x4, vreinterpretq_u32_s32, v_absdiff, vabdq_s32)

inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
    return v_sqrt(x);
}

inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
}

inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
#if CV_SIMD128_64F
    // ARMv8, which adds support for 64-bit floating-point (so CV_SIMD128_64F is defined),
    // also adds FMA support both for single- and double-precision floating-point vectors
    return v_float32x4(vfmaq_f32(c.val, a.val, b.val));
#else
    return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
#endif
}

inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_int32x4(vmlaq_s32(c.val, a.val, b.val));
}

inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    return v_fma(a, b, c);
}

inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_fma(a, b, c);
}

#if CV_SIMD128_64F
inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    v_float64x2 x(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
    return v_sqrt(x);
}

inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    return v_float64x2(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
}

inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_float64x2(vfmaq_f64(c.val, a.val, b.val));
}

inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_fma(a, b, c);
}
#endif

// trade efficiency for convenience
#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); }

OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, u16, short, s16)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, s16, short, s16)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, u32, int, s32)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)
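
// Illustrative sketch (assumed values, not extra API): the "convenience" operators take a
// runtime shift count and go through VSHL with a per-lane count vector (a negative count
// shifts right), while v_shl/v_shr/v_rshr take the count as a template immediate and map
// to the single-immediate instruction forms.
//
//   v_uint32x4 v = v_setall_u32(100);
//   v_uint32x4 a = v >> 2;        // runtime count: 100 >> 2 = 25
//   v_uint32x4 b = v_shr<2>(v);   // immediate form, same result
//   v_uint32x4 c = v_rshr<3>(v);  // rounding: (100 + 4) >> 3 = 13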

#define OPENCV_HAL_IMPL_NEON_ROTATE_OP(_Tpvec, suffix) \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
{ return a; } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); } \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
{ CV_UNUSED(b); return a; }

OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint8x16, u8)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int8x16, s8)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint16x8, u16)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int16x8, s16)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint32x4, u32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int32x4, s32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float32x4, f32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int64x2, s64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float64x2, f64)
#endif
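
// Illustrative sketch (assumed values, not extra API): despite the name, these are lane
// shifts built on VEXT; vacated lanes are filled with zeros (single-vector forms) or with
// lanes of the second vector (two-vector forms).
//
//   v_int32x4 v(1, 2, 3, 4);
//   v_rotate_right<1>(v);   // -> {2, 3, 4, 0}
//   v_rotate_left<1>(v);    // -> {0, 1, 2, 3}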

#if defined(__clang__) && defined(__aarch64__)
// avoid LD2 instruction. details: https://github.com/opencv/opencv/issues/14863
#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ \
typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; \
uint64 v = *(unaligned_uint64*)ptr; \
return _Tpvec(v_reinterpret_as_##suffix(v_uint64x2(v, (uint64)123456))); \
}
#else
#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr), vdup_n_##suffix((_Tp)0))); }
#endif

#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_high_##suffix(a.val)); }

OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
#endif
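// Usage sketch (illustrative only; scale4, src, dst and k are hypothetical):
//   inline void scale4(float* dst, const float* src, float k)
//   {
//       v_float32x4 vk = v_setall_f32(k);
//       v_float32x4 x  = v_load(src);   // vld1q_f32, no alignment requirement
//       v_store(dst, x * vk);           // vst1q_f32
//   }
// Since vld1q/vst1q have no alignment requirement, v_load_aligned and
// v_store_aligned compile to exactly the same instructions here.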

inline unsigned v_reduce_sum(const v_uint8x16& a)
{
#if CV_NEON_AARCH64
    uint16_t t0 = vaddlvq_u8(a.val);
    return t0;
#else // #if CV_NEON_AARCH64
    uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(a.val));
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);
#endif // #if CV_NEON_AARCH64
}
inline int v_reduce_sum(const v_int8x16& a)
{
#if CV_NEON_AARCH64
    int16_t t0 = vaddlvq_s8(a.val);
    return t0;
#else // #if CV_NEON_AARCH64
    int32x4_t t0 = vpaddlq_s16(vpaddlq_s8(a.val));
    int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
    return vget_lane_s32(vpadd_s32(t1, t1), 0);
#endif // #if CV_NEON_AARCH64
}
inline unsigned v_reduce_sum(const v_uint16x8& a)
{
#if CV_NEON_AARCH64
    uint32_t t0 = vaddlvq_u16(a.val);
    return t0;
#else // #if CV_NEON_AARCH64
    uint32x4_t t0 = vpaddlq_u16(a.val);
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);
#endif // #if CV_NEON_AARCH64
}
inline int v_reduce_sum(const v_int16x8& a)
{
#if CV_NEON_AARCH64
    int32_t t0 = vaddlvq_s16(a.val);
    return t0;
#else // #if CV_NEON_AARCH64
    int32x4_t t0 = vpaddlq_s16(a.val);
    int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
    return vget_lane_s32(vpadd_s32(t1, t1), 0);
#endif // #if CV_NEON_AARCH64
}
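// The min/max reductions below follow the same scheme as the sums above:
// a single across-vector instruction (vmaxv/vminv) on AArch64, or a tree of
// pairwise vpmax/vpmin operations on 32-bit ARM.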
#if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    return v##vectorfunc##vq_##suffix(a.val); \
}
#else // #if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
    a0 = vp##vectorfunc##_##suffix(a0, a0); \
    a0 = vp##vectorfunc##_##suffix(a0, a0); \
    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
}
#endif // #if CV_NEON_AARCH64

OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_uint8x16, uint8x8, uchar, max, max, u8)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_uint8x16, uint8x8, uchar, min, min, u8)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, max, max, s8)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, min, min, s8)

#if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    return v##vectorfunc##vq_##suffix(a.val); \
}
#else // #if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
    a0 = vp##vectorfunc##_##suffix(a0, a0); \
    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
}
#endif // #if CV_NEON_AARCH64

OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, max, max, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, min, min, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)

#if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    return v##vectorfunc##vq_##suffix(a.val); \
}
#else // #if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, vget_high_##suffix(a.val)),0); \
}
#endif // #if CV_NEON_AARCH64

OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, sum, add, u32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, max, max, u32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, min, min, u32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, sum, add, s32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, max, max, s32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, min, min, s32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)

inline uint64 v_reduce_sum(const v_uint64x2& a)
{
#if CV_NEON_AARCH64
    return vaddvq_u64(a.val);
#else // #if CV_NEON_AARCH64
    return vget_lane_u64(vadd_u64(vget_low_u64(a.val), vget_high_u64(a.val)),0);
#endif // #if CV_NEON_AARCH64
}
inline int64 v_reduce_sum(const v_int64x2& a)
{
#if CV_NEON_AARCH64
    return vaddvq_s64(a.val);
#else // #if CV_NEON_AARCH64
    return vget_lane_s64(vadd_s64(vget_low_s64(a.val), vget_high_s64(a.val)),0);
#endif // #if CV_NEON_AARCH64
}
#if CV_SIMD128_64F
inline double v_reduce_sum(const v_float64x2& a)
{
    return vaddvq_f64(a.val);
}
#endif

inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
#if CV_NEON_AARCH64
    float32x4_t ab = vpaddq_f32(a.val, b.val); // a0+a1 a2+a3 b0+b1 b2+b3
    float32x4_t cd = vpaddq_f32(c.val, d.val); // c0+c1 c2+c3 d0+d1 d2+d3
    return v_float32x4(vpaddq_f32(ab, cd));  // sumA sumB sumC sumD
#else // #if CV_NEON_AARCH64
    float32x4x2_t ab = vtrnq_f32(a.val, b.val);
    float32x4x2_t cd = vtrnq_f32(c.val, d.val);

    float32x4_t u0 = vaddq_f32(ab.val[0], ab.val[1]); // a0+a1 b0+b1 a2+a3 b2+b3
    float32x4_t u1 = vaddq_f32(cd.val[0], cd.val[1]); // c0+c1 d0+d1 c2+c3 d2+d3

    float32x4_t v0 = vcombine_f32(vget_low_f32(u0), vget_low_f32(u1));
    float32x4_t v1 = vcombine_f32(vget_high_f32(u0), vget_high_f32(u1));

    return v_float32x4(vaddq_f32(v0, v1));
#endif // #if CV_NEON_AARCH64
}
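// Sum of absolute differences: |a[i] - b[i]| is computed with vabd and the
// lanes are then accumulated horizontally, exactly like v_reduce_sum above.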
inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
{
#if CV_NEON_AARCH64
    uint8x16_t t0 = vabdq_u8(a.val, b.val);
    uint16_t t1 = vaddlvq_u8(t0);
    return t1;
#else // #if CV_NEON_AARCH64
    uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vabdq_u8(a.val, b.val)));
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);
#endif // #if CV_NEON_AARCH64
}
inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
{
#if CV_NEON_AARCH64
    uint8x16_t t0 = vreinterpretq_u8_s8(vabdq_s8(a.val, b.val));
    uint16_t t1 = vaddlvq_u8(t0);
    return t1;
#else // #if CV_NEON_AARCH64
    uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s8(vabdq_s8(a.val, b.val))));
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);
#endif // #if CV_NEON_AARCH64
}
inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
{
#if CV_NEON_AARCH64
    uint16x8_t t0 = vabdq_u16(a.val, b.val);
    uint32_t t1 = vaddlvq_u16(t0);
    return t1;
#else // #if CV_NEON_AARCH64
    uint32x4_t t0 = vpaddlq_u16(vabdq_u16(a.val, b.val));
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);
#endif // #if CV_NEON_AARCH64
}
inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
{
#if CV_NEON_AARCH64
    uint16x8_t t0 = vreinterpretq_u16_s16(vabdq_s16(a.val, b.val));
    uint32_t t1 = vaddlvq_u16(t0);
    return t1;
#else // #if CV_NEON_AARCH64
    uint32x4_t t0 = vpaddlq_u16(vreinterpretq_u16_s16(vabdq_s16(a.val, b.val)));
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);
#endif // #if CV_NEON_AARCH64
}
inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
{
#if CV_NEON_AARCH64
    uint32x4_t t0 = vabdq_u32(a.val, b.val);
    uint32_t t1 = vaddvq_u32(t0);
    return t1;
#else // #if CV_NEON_AARCH64
    uint32x4_t t0 = vabdq_u32(a.val, b.val);
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);
#endif // #if CV_NEON_AARCH64
}
inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
{
#if CV_NEON_AARCH64
    uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val));
    uint32_t t1 = vaddvq_u32(t0);
    return t1;
#else // #if CV_NEON_AARCH64
    uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val));
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);
#endif // #if CV_NEON_AARCH64
}
inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
{
#if CV_NEON_AARCH64
    float32x4_t t0 = vabdq_f32(a.val, b.val);
    return vaddvq_f32(t0);
#else // #if CV_NEON_AARCH64
    float32x4_t t0 = vabdq_f32(a.val, b.val);
    float32x2_t t1 = vpadd_f32(vget_low_f32(t0), vget_high_f32(t0));
    return vget_lane_f32(vpadd_f32(t1, t1), 0);
#endif // #if CV_NEON_AARCH64
}
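// Per-lane population count: vcnt produces a bit count per byte; for wider
// lanes the byte counts are folded together with vpaddl chains.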
inline v_uint8x16 v_popcount(const v_uint8x16& a)
{ return v_uint8x16(vcntq_u8(a.val)); }
inline v_uint8x16 v_popcount(const v_int8x16& a)
{ return v_uint8x16(vcntq_u8(vreinterpretq_u8_s8(a.val))); }
inline v_uint16x8 v_popcount(const v_uint16x8& a)
{ return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u16(a.val)))); }
inline v_uint16x8 v_popcount(const v_int16x8& a)
{ return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s16(a.val)))); }
inline v_uint32x4 v_popcount(const v_uint32x4& a)
{ return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u32(a.val))))); }
inline v_uint32x4 v_popcount(const v_int32x4& a)
{ return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s32(a.val))))); }
inline v_uint64x2 v_popcount(const v_uint64x2& a)
{ return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(a.val)))))); }
inline v_uint64x2 v_popcount(const v_int64x2& a)
{ return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s64(a.val)))))); }
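// v_signmask packs the most significant bit of every lane into an integer,
// bit i corresponding to lane i. A typical (illustrative) use together with
// v_scan_forward, defined a bit further below:
//   v_uint8x16 m = (x == y);        // comparison: all-ones in each matching lane
//   int mask  = v_signmask(m);      // bit i set iff lane i matched
//   int first = v_scan_forward(m);  // index of the first matching lane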
inline int v_signmask(const v_uint8x16& a)
{
#if CV_NEON_AARCH64
    const int8x16_t signPosition = {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7};
    const uint8x16_t byteOrder = {0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15};
    uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), signPosition);
    uint8x16_t v1 = vqtbl1q_u8(v0, byteOrder);
    uint32_t t0 = vaddlvq_u16(vreinterpretq_u16_u8(v1));
    return t0;
#else // #if CV_NEON_AARCH64
    int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
    uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
#endif // #if CV_NEON_AARCH64
}

inline int v_signmask(const v_int8x16& a)
{ return v_signmask(v_reinterpret_as_u8(a)); }

inline int v_signmask(const v_uint16x8& a)
{
#if CV_NEON_AARCH64
    const int16x8_t signPosition = {0,1,2,3,4,5,6,7};
    uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), signPosition);
    uint32_t t0 = vaddlvq_u16(v0);
    return t0;
#else // #if CV_NEON_AARCH64
    int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
    uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);
#endif // #if CV_NEON_AARCH64
}
inline int v_signmask(const v_int16x8& a)
{ return v_signmask(v_reinterpret_as_u16(a)); }

inline int v_signmask(const v_uint32x4& a)
{
#if CV_NEON_AARCH64
    const int32x4_t signPosition = {0,1,2,3};
    uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), signPosition);
    uint32_t t0 = vaddvq_u32(v0);
    return t0;
#else // #if CV_NEON_AARCH64
    int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
    uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(v0);
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);
#endif // #if CV_NEON_AARCH64
}
inline int v_signmask(const v_int32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }
inline int v_signmask(const v_float32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }
inline int v_signmask(const v_uint64x2& a)
{
#if CV_NEON_AARCH64
    const int64x2_t signPosition = {0,1};
    uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), signPosition);
    uint64_t t0 = vaddvq_u64(v0);
    return t0;
#else // #if CV_NEON_AARCH64
    int64x1_t m0 = vdup_n_s64(0);
    uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), vcombine_s64(m0, m0));
    return (int)vgetq_lane_u64(v0, 0) + ((int)vgetq_lane_u64(v0, 1) << 1);
#endif // #if CV_NEON_AARCH64
}
inline int v_signmask(const v_int64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }
#if CV_SIMD128_64F
inline int v_signmask(const v_float64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }
#endif

inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); }
#if CV_SIMD128_64F
inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); }
#endif

#if CV_NEON_AARCH64
    #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
    inline bool v_check_all(const v_##_Tpvec& a) \
    { \
        return (vminvq_##suffix(a.val) >> shift) != 0; \
    } \
    inline bool v_check_any(const v_##_Tpvec& a) \
    { \
        return (vmaxvq_##suffix(a.val) >> shift) != 0; \
    }
#else // #if CV_NEON_AARCH64
    #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
    inline bool v_check_all(const v_##_Tpvec& a) \
    { \
        _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \
        uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
        return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \
    } \
    inline bool v_check_any(const v_##_Tpvec& a) \
    { \
        _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
        uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
        return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
    }
#endif // #if CV_NEON_AARCH64

OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)

inline bool v_check_all(const v_uint64x2& a)
{
    uint64x2_t v0 = vshrq_n_u64(a.val, 63);
    return (vgetq_lane_u64(v0, 0) & vgetq_lane_u64(v0, 1)) == 1;
}
inline bool v_check_any(const v_uint64x2& a)
{
    uint64x2_t v0 = vshrq_n_u64(a.val, 63);
    return (vgetq_lane_u64(v0, 0) | vgetq_lane_u64(v0, 1)) != 0;
}

inline bool v_check_all(const v_int8x16& a)
{ return v_check_all(v_reinterpret_as_u8(a)); }
inline bool v_check_all(const v_int16x8& a)
{ return v_check_all(v_reinterpret_as_u16(a)); }
inline bool v_check_all(const v_int32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); }
inline bool v_check_all(const v_float32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); }

inline bool v_check_any(const v_int8x16& a)
{ return v_check_any(v_reinterpret_as_u8(a)); }
inline bool v_check_any(const v_int16x8& a)
{ return v_check_any(v_reinterpret_as_u16(a)); }
inline bool v_check_any(const v_int32x4& a)
{ return v_check_any(v_reinterpret_as_u32(a)); }
inline bool v_check_any(const v_float32x4& a)
{ return v_check_any(v_reinterpret_as_u32(a)); }

inline bool v_check_all(const v_int64x2& a)
{ return v_check_all(v_reinterpret_as_u64(a)); }
inline bool v_check_any(const v_int64x2& a)
{ return v_check_any(v_reinterpret_as_u64(a)); }
#if CV_SIMD128_64F
inline bool v_check_all(const v_float64x2& a)
{ return v_check_all(v_reinterpret_as_u64(a)); }
inline bool v_check_any(const v_float64x2& a)
{ return v_check_any(v_reinterpret_as_u64(a)); }
#endif
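// v_select(mask, a, b) is a per-lane "mask ? a : b" implemented with a single
// bitwise select (vbsl). The mask is expected to be all-ones or all-zeros in
// each lane, as produced by the comparison operators.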
#define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); \
}

OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_SELECT(v_float64x2, f64, u64)
#endif
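// Widening: v_expand splits a vector into two vectors with lanes of twice the
// width. AArch64 can widen the upper half directly with vmovl_high, while
// 32-bit ARM first extracts it with vget_high.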
#if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
    b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
    b1.val = vmovl_high_##suffix(a.val); \
} \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
{ \
    return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \
} \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
{ \
    return _Tpwvec(vmovl_high_##suffix(a.val)); \
} \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
    return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
}
#else
#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
    b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
    b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
} \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
{ \
    return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \
} \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
{ \
    return _Tpwvec(vmovl_##suffix(vget_high_##suffix(a.val))); \
} \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
    return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
}
#endif

OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16)
OPENCV_HAL_IMPL_NEON_EXPAND(v_uint32x4, v_uint64x2, uint, u32)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int32x4, v_int64x2, int, s32)

inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{
    typedef unsigned int CV_DECL_ALIGNED(1) unaligned_uint;
    uint8x8_t v0 = vcreate_u8(*(unaligned_uint*)ptr);
    uint16x4_t v1 = vget_low_u16(vmovl_u8(v0));
    return v_uint32x4(vmovl_u16(v1));
}

inline v_int32x4 v_load_expand_q(const schar* ptr)
{
    typedef unsigned int CV_DECL_ALIGNED(1) unaligned_uint;
    int8x8_t v0 = vcreate_s8(*(unaligned_uint*)ptr);
    int16x4_t v1 = vget_low_s16(vmovl_s8(v0));
    return v_int32x4(vmovl_s16(v1));
}

#if defined(__aarch64__) || defined(_M_ARM64)
#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
{ \
    b0.val = vzip1q_##suffix(a0.val, a1.val); \
    b1.val = vzip2q_##suffix(a0.val, a1.val); \
} \
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
} \
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
} \
inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
    d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
}
#else
#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
{ \
    _Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); \
    b0.val = p.val[0]; \
    b1.val = p.val[1]; \
} \
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
} \
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
} \
inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
    d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
}
#endif

OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)
OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_UNPACKS(float64x2, f64)
#endif

inline v_uint8x16 v_reverse(const v_uint8x16 &a)
{
    uint8x16_t vec = vrev64q_u8(a.val);
    return v_uint8x16(vextq_u8(vec, vec, 8));
}

inline v_int8x16 v_reverse(const v_int8x16 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }

inline v_uint16x8 v_reverse(const v_uint16x8 &a)
{
    uint16x8_t vec = vrev64q_u16(a.val);
    return v_uint16x8(vextq_u16(vec, vec, 4));
}

inline v_int16x8 v_reverse(const v_int16x8 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }

inline v_uint32x4 v_reverse(const v_uint32x4 &a)
{
    uint32x4_t vec = vrev64q_u32(a.val);
    return v_uint32x4(vextq_u32(vec, vec, 2));
}

inline v_int32x4 v_reverse(const v_int32x4 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_float32x4 v_reverse(const v_float32x4 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_uint64x2 v_reverse(const v_uint64x2 &a)
{
    uint64x2_t vec = a.val;
    uint64x1_t vec_lo = vget_low_u64(vec);
    uint64x1_t vec_hi = vget_high_u64(vec);
    return v_uint64x2(vcombine_u64(vec_hi, vec_lo));
}

inline v_int64x2 v_reverse(const v_int64x2 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }

#if CV_SIMD128_64F
inline v_float64x2 v_reverse(const v_float64x2 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
#endif

#define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \
template <int s> \
inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vextq_##suffix(a.val, b.val, s)); \
}

OPENCV_HAL_IMPL_NEON_EXTRACT(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_EXTRACT(int8x16, s8)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_EXTRACT(int16x8, s16)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_EXTRACT(int32x4, s32)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint64x2, u64)
OPENCV_HAL_IMPL_NEON_EXTRACT(int64x2, s64)
OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
#endif

#define OPENCV_HAL_IMPL_NEON_EXTRACT_N(_Tpvec, _Tp, suffix) \
template<int i> inline _Tp v_extract_n(_Tpvec v) { return vgetq_lane_##suffix(v.val, i); }

OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint32x4, uint, u32)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float32x4, float, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float64x2, double, f64)
#endif

#define OPENCV_HAL_IMPL_NEON_BROADCAST(_Tpvec, _Tp, suffix) \
template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) { _Tp t = v_extract_n<i>(v); return v_setall_##suffix(t); }

OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint32x4, uint, u32)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_float32x4, float, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_BROADCAST(v_float64x2, double, f64)
#endif
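// Rounding conversions. On AArch64, v_round(v_float32x4) uses FCVTNS
// (round to nearest, ties to even); the 32-bit ARM fallback below emulates
// rounding by adding +/-0.5 (sign taken from the input) and truncating, so
// the two paths may differ on exact .5 ties.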
#if CV_SIMD128_64F
inline v_int32x4 v_round(const v_float32x4& a)
{
    float32x4_t a_ = a.val;
    int32x4_t result;
    __asm__ ("fcvtns %0.4s, %1.4s"
             : "=w"(result)
             : "w"(a_)
             : /* No clobbers */);
    return v_int32x4(result);
}
#else
inline v_int32x4 v_round(const v_float32x4& a)
{
    static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
        v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));

    int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
    return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
}
#endif
inline v_int32x4 v_floor(const v_float32x4& a)
{
    int32x4_t a1 = vcvtq_s32_f32(a.val);
    uint32x4_t mask = vcgtq_f32(vcvtq_f32_s32(a1), a.val);
    return v_int32x4(vaddq_s32(a1, vreinterpretq_s32_u32(mask)));
}

inline v_int32x4 v_ceil(const v_float32x4& a)
{
    int32x4_t a1 = vcvtq_s32_f32(a.val);
    uint32x4_t mask = vcgtq_f32(a.val, vcvtq_f32_s32(a1));
    return v_int32x4(vsubq_s32(a1, vreinterpretq_s32_u32(mask)));
}

inline v_int32x4 v_trunc(const v_float32x4& a)
{ return v_int32x4(vcvtq_s32_f32(a.val)); }

#if CV_SIMD128_64F
inline v_int32x4 v_round(const v_float64x2& a)
{
    static const int32x2_t zero = vdup_n_s32(0);
    return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero));
}

inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
{
    return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), vmovn_s64(vcvtaq_s64_f64(b.val))));
}

inline v_int32x4 v_floor(const v_float64x2& a)
{
    static const int32x2_t zero = vdup_n_s32(0);
    int64x2_t a1 = vcvtq_s64_f64(a.val);
    uint64x2_t mask = vcgtq_f64(vcvtq_f64_s64(a1), a.val);
    a1 = vaddq_s64(a1, vreinterpretq_s64_u64(mask));
    return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));
}

inline v_int32x4 v_ceil(const v_float64x2& a)
{
    static const int32x2_t zero = vdup_n_s32(0);
    int64x2_t a1 = vcvtq_s64_f64(a.val);
    uint64x2_t mask = vcgtq_f64(a.val, vcvtq_f64_s64(a1));
    a1 = vsubq_s64(a1, vreinterpretq_s64_u64(mask));
    return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));
}

inline v_int32x4 v_trunc(const v_float64x2& a)
{
    static const int32x2_t zero = vdup_n_s32(0);
    return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero));
}
#endif

#if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
                         const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
                         v_##_Tpvec& b0, v_##_Tpvec& b1, \
                         v_##_Tpvec& b2, v_##_Tpvec& b3) \
{ \
    /* -- Pass 1: 64b transpose */ \
    _Tpvec##_t t0 = vreinterpretq_##suffix##32_##suffix##64( \
                        vtrn1q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a0.val), \
                                            vreinterpretq_##suffix##64_##suffix##32(a2.val))); \
    _Tpvec##_t t1 = vreinterpretq_##suffix##32_##suffix##64( \
                        vtrn1q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a1.val), \
                                            vreinterpretq_##suffix##64_##suffix##32(a3.val))); \
    _Tpvec##_t t2 = vreinterpretq_##suffix##32_##suffix##64( \
                        vtrn2q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a0.val), \
                                            vreinterpretq_##suffix##64_##suffix##32(a2.val))); \
    _Tpvec##_t t3 = vreinterpretq_##suffix##32_##suffix##64( \
                        vtrn2q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a1.val), \
                                            vreinterpretq_##suffix##64_##suffix##32(a3.val))); \
    /* -- Pass 2: 32b transpose */ \
    b0.val = vtrn1q_##suffix##32(t0, t1); \
    b1.val = vtrn2q_##suffix##32(t0, t1); \
    b2.val = vtrn1q_##suffix##32(t2, t3); \
    b3.val = vtrn2q_##suffix##32(t2, t3); \
}

OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f)
#else // #if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
                         const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
                         v_##_Tpvec& b0, v_##_Tpvec& b1, \
                         v_##_Tpvec& b2, v_##_Tpvec& b3) \
{ \
    /* m00 m01 m02 m03 */ \
    /* m10 m11 m12 m13 */ \
    /* m20 m21 m22 m23 */ \
    /* m30 m31 m32 m33 */ \
    _Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \
    _Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \
    /* m00 m10 m02 m12 */ \
    /* m01 m11 m03 m13 */ \
    /* m20 m30 m22 m32 */ \
    /* m21 m31 m23 m33 */ \
    b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); \
    b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); \
    b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); \
    b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); \
}

OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
#endif // #if CV_NEON_AARCH64
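// Interleaved (multi-channel) memory access: vld2q/vld3q/vld4q deinterleave
// 2-4 channels in one instruction, vst2q/vst3q/vst4q re-interleave on store.
// 64-bit element types have no such instructions, so they are emulated below
// with pairs of 64-bit loads/stores.
//
// Usage sketch (illustrative; bgr is a hypothetical packed 3-channel buffer):
//   v_uint8x16 b, g, r;
//   v_load_deinterleave(bgr, b, g, r);   // one vld3q_u8
//   v_store_interleave(bgr, b, g, r);    // one vst3q_u8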
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
{ \
    _Tpvec##x2_t v = vld2q_##suffix(ptr); \
    a.val = v.val[0]; \
    b.val = v.val[1]; \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
{ \
    _Tpvec##x3_t v = vld3q_##suffix(ptr); \
    a.val = v.val[0]; \
    b.val = v.val[1]; \
    c.val = v.val[2]; \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
                                v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    _Tpvec##x4_t v = vld4q_##suffix(ptr); \
    a.val = v.val[0]; \
    b.val = v.val[1]; \
    c.val = v.val[2]; \
    d.val = v.val[3]; \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    _Tpvec##x2_t v; \
    v.val[0] = a.val; \
    v.val[1] = b.val; \
    vst2q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    _Tpvec##x3_t v; \
    v.val[0] = a.val; \
    v.val[1] = b.val; \
    v.val[2] = c.val; \
    vst3q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, const v_##_Tpvec& d, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
{ \
    _Tpvec##x4_t v; \
    v.val[0] = a.val; \
    v.val[1] = b.val; \
    v.val[2] = c.val; \
    v.val[3] = d.val; \
    vst4q_##suffix(ptr, v); \
}

#define OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(tp, suffix) \
inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b ) \
{ \
    tp##x1_t a0 = vld1_##suffix(ptr); \
    tp##x1_t b0 = vld1_##suffix(ptr + 1); \
    tp##x1_t a1 = vld1_##suffix(ptr + 2); \
    tp##x1_t b1 = vld1_##suffix(ptr + 3); \
    a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
    b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
} \
 \
inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, \
                                 v_##tp##x2& b, v_##tp##x2& c ) \
{ \
    tp##x1_t a0 = vld1_##suffix(ptr); \
    tp##x1_t b0 = vld1_##suffix(ptr + 1); \
    tp##x1_t c0 = vld1_##suffix(ptr + 2); \
    tp##x1_t a1 = vld1_##suffix(ptr + 3); \
    tp##x1_t b1 = vld1_##suffix(ptr + 4); \
    tp##x1_t c1 = vld1_##suffix(ptr + 5); \
    a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
    b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
    c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
} \
 \
inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b, \
                                 v_##tp##x2& c, v_##tp##x2& d ) \
{ \
    tp##x1_t a0 = vld1_##suffix(ptr); \
    tp##x1_t b0 = vld1_##suffix(ptr + 1); \
    tp##x1_t c0 = vld1_##suffix(ptr + 2); \
    tp##x1_t d0 = vld1_##suffix(ptr + 3); \
    tp##x1_t a1 = vld1_##suffix(ptr + 4); \
    tp##x1_t b1 = vld1_##suffix(ptr + 5); \
    tp##x1_t c1 = vld1_##suffix(ptr + 6); \
    tp##x1_t d1 = vld1_##suffix(ptr + 7); \
    a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
    b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
    c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
    d = v_##tp##x2(vcombine_##suffix(d0, d1)); \
} \
 \
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
    vst1_##suffix(ptr + 2, vget_high_##suffix(a.val)); \
    vst1_##suffix(ptr + 3, vget_high_##suffix(b.val)); \
} \
 \
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \
                                const v_##tp##x2& b, const v_##tp##x2& c, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
    vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
    vst1_##suffix(ptr + 3, vget_high_##suffix(a.val)); \
    vst1_##suffix(ptr + 4, vget_high_##suffix(b.val)); \
    vst1_##suffix(ptr + 5, vget_high_##suffix(c.val)); \
} \
 \
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
                                const v_##tp##x2& c, const v_##tp##x2& d, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
    vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
    vst1_##suffix(ptr + 3, vget_low_##suffix(d.val)); \
    vst1_##suffix(ptr + 4, vget_high_##suffix(a.val)); \
    vst1_##suffix(ptr + 5, vget_high_##suffix(b.val)); \
    vst1_##suffix(ptr + 6, vget_high_##suffix(c.val)); \
    vst1_##suffix(ptr + 7, vget_high_##suffix(d.val)); \
}

OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64)
#endif

OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(int64, s64)
OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(uint64, u64)

inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
    return v_float32x4(vcvtq_f32_s32(a.val));
}

#if CV_SIMD128_64F
inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    float32x2_t zero = vdup_n_f32(0.0f);
    return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), zero));
}

inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{
    return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), vcvt_f32_f64(b.val)));
}

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
    return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_low_s32(a.val))));
}

inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{
    return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_high_s32(a.val))));
}

inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{
    return v_float64x2(vcvt_f64_f32(vget_low_f32(a.val)));
}

inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{
    return v_float64x2(vcvt_f64_f32(vget_high_f32(a.val)));
}

inline v_float64x2 v_cvt_f64(const v_int64x2& a)
{  return v_float64x2(vcvtq_f64_s64(a.val)); }

#endif

////////////// Lookup table access ////////////////////

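// v_lut gathers one element per index; v_lut_pairs / v_lut_quads gather 2 / 4
// consecutive elements starting at each index. NEON has no gather instruction,
// so elements are collected through a small aligned temporary array, or with
// direct 64/128-bit loads where the layout allows it.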
v_lut(const schar * tab,const int * idx)2255 inline v_int8x16 v_lut(const schar* tab, const int* idx)
2256 {
2257     schar CV_DECL_ALIGNED(32) elems[16] =
2258     {
2259         tab[idx[ 0]],
2260         tab[idx[ 1]],
2261         tab[idx[ 2]],
2262         tab[idx[ 3]],
2263         tab[idx[ 4]],
2264         tab[idx[ 5]],
2265         tab[idx[ 6]],
2266         tab[idx[ 7]],
2267         tab[idx[ 8]],
2268         tab[idx[ 9]],
2269         tab[idx[10]],
2270         tab[idx[11]],
2271         tab[idx[12]],
2272         tab[idx[13]],
2273         tab[idx[14]],
2274         tab[idx[15]]
2275     };
2276     return v_int8x16(vld1q_s8(elems));
2277 }
v_lut_pairs(const schar * tab,const int * idx)2278 inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
2279 {
2280     schar CV_DECL_ALIGNED(32) elems[16] =
2281     {
2282         tab[idx[0]],
2283         tab[idx[0] + 1],
2284         tab[idx[1]],
2285         tab[idx[1] + 1],
2286         tab[idx[2]],
2287         tab[idx[2] + 1],
2288         tab[idx[3]],
2289         tab[idx[3] + 1],
2290         tab[idx[4]],
2291         tab[idx[4] + 1],
2292         tab[idx[5]],
2293         tab[idx[5] + 1],
2294         tab[idx[6]],
2295         tab[idx[6] + 1],
2296         tab[idx[7]],
2297         tab[idx[7] + 1]
2298     };
2299     return v_int8x16(vld1q_s8(elems));
2300 }
v_lut_quads(const schar * tab,const int * idx)2301 inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
2302 {
2303     schar CV_DECL_ALIGNED(32) elems[16] =
2304     {
2305         tab[idx[0]],
2306         tab[idx[0] + 1],
2307         tab[idx[0] + 2],
2308         tab[idx[0] + 3],
2309         tab[idx[1]],
2310         tab[idx[1] + 1],
2311         tab[idx[1] + 2],
2312         tab[idx[1] + 3],
2313         tab[idx[2]],
2314         tab[idx[2] + 1],
2315         tab[idx[2] + 2],
2316         tab[idx[2] + 3],
2317         tab[idx[3]],
2318         tab[idx[3] + 1],
2319         tab[idx[3] + 2],
2320         tab[idx[3] + 3]
2321     };
2322     return v_int8x16(vld1q_s8(elems));
2323 }
v_lut(const uchar * tab,const int * idx)2324 inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
v_lut_pairs(const uchar * tab,const int * idx)2325 inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
v_lut_quads(const uchar * tab,const int * idx)2326 inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
2327 
v_lut(const short * tab,const int * idx)2328 inline v_int16x8 v_lut(const short* tab, const int* idx)
2329 {
2330     short CV_DECL_ALIGNED(32) elems[8] =
2331     {
2332         tab[idx[0]],
2333         tab[idx[1]],
2334         tab[idx[2]],
2335         tab[idx[3]],
2336         tab[idx[4]],
2337         tab[idx[5]],
2338         tab[idx[6]],
2339         tab[idx[7]]
2340     };
2341     return v_int16x8(vld1q_s16(elems));
2342 }
v_lut_pairs(const short * tab,const int * idx)2343 inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
2344 {
2345     short CV_DECL_ALIGNED(32) elems[8] =
2346     {
2347         tab[idx[0]],
2348         tab[idx[0] + 1],
2349         tab[idx[1]],
2350         tab[idx[1] + 1],
2351         tab[idx[2]],
2352         tab[idx[2] + 1],
2353         tab[idx[3]],
2354         tab[idx[3] + 1]
2355     };
2356     return v_int16x8(vld1q_s16(elems));
2357 }
v_lut_quads(const short * tab,const int * idx)2358 inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
2359 {
2360     return v_int16x8(vcombine_s16(vld1_s16(tab + idx[0]), vld1_s16(tab + idx[1])));
2361 }
v_lut(const ushort * tab,const int * idx)2362 inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
v_lut_pairs(const ushort * tab,const int * idx)2363 inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
v_lut_quads(const ushort * tab,const int * idx)2364 inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
2365 
v_lut(const int * tab,const int * idx)2366 inline v_int32x4 v_lut(const int* tab, const int* idx)
2367 {
2368     int CV_DECL_ALIGNED(32) elems[4] =
2369     {
2370         tab[idx[0]],
2371         tab[idx[1]],
2372         tab[idx[2]],
2373         tab[idx[3]]
2374     };
2375     return v_int32x4(vld1q_s32(elems));
2376 }
v_lut_pairs(const int * tab,const int * idx)2377 inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
2378 {
2379     return v_int32x4(vcombine_s32(vld1_s32(tab + idx[0]), vld1_s32(tab + idx[1])));
2380 }
v_lut_quads(const int * tab,const int * idx)2381 inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
2382 {
2383     return v_int32x4(vld1q_s32(tab + idx[0]));
2384 }
v_lut(const unsigned * tab,const int * idx)2385 inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
v_lut_pairs(const unsigned * tab,const int * idx)2386 inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
v_lut_quads(const unsigned * tab,const int * idx)2387 inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
2388 
v_lut(const int64_t * tab,const int * idx)2389 inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
2390 {
2391     return v_int64x2(vcombine_s64(vcreate_s64(tab[idx[0]]), vcreate_s64(tab[idx[1]])));
2392 }
v_lut_pairs(const int64_t * tab,const int * idx)2393 inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
2394 {
2395     return v_int64x2(vld1q_s64(tab + idx[0]));
2396 }
v_lut(const uint64_t * tab,const int * idx)2397 inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
v_lut_pairs(const uint64_t * tab,const int * idx)2398 inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
2399 
v_lut(const float * tab,const int * idx)2400 inline v_float32x4 v_lut(const float* tab, const int* idx)
2401 {
2402     float CV_DECL_ALIGNED(32) elems[4] =
2403     {
2404         tab[idx[0]],
2405         tab[idx[1]],
2406         tab[idx[2]],
2407         tab[idx[3]]
2408     };
2409     return v_float32x4(vld1q_f32(elems));
2410 }
v_lut_pairs(const float * tab,const int * idx)2411 inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
2412 {
2413     typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64;
2414 
2415     uint64 CV_DECL_ALIGNED(32) elems[2] =
2416     {
2417         *(unaligned_uint64*)(tab + idx[0]),
2418         *(unaligned_uint64*)(tab + idx[1])
2419     };
2420     return v_float32x4(vreinterpretq_f32_u64(vld1q_u64(elems)));
2421 }
v_lut_quads(const float * tab,const int * idx)2422 inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
2423 {
2424     return v_float32x4(vld1q_f32(tab + idx[0]));
2425 }
2426 
v_lut(const int * tab,const v_int32x4 & idxvec)2427 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
2428 {
2429     int CV_DECL_ALIGNED(32) elems[4] =
2430     {
2431         tab[vgetq_lane_s32(idxvec.val, 0)],
2432         tab[vgetq_lane_s32(idxvec.val, 1)],
2433         tab[vgetq_lane_s32(idxvec.val, 2)],
2434         tab[vgetq_lane_s32(idxvec.val, 3)]
2435     };
2436     return v_int32x4(vld1q_s32(elems));
2437 }
2438 
v_lut(const unsigned * tab,const v_int32x4 & idxvec)2439 inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
2440 {
2441     unsigned CV_DECL_ALIGNED(32) elems[4] =
2442     {
2443         tab[vgetq_lane_s32(idxvec.val, 0)],
2444         tab[vgetq_lane_s32(idxvec.val, 1)],
2445         tab[vgetq_lane_s32(idxvec.val, 2)],
2446         tab[vgetq_lane_s32(idxvec.val, 3)]
2447     };
2448     return v_uint32x4(vld1q_u32(elems));
2449 }
2450 
v_lut(const float * tab,const v_int32x4 & idxvec)2451 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
2452 {
2453     float CV_DECL_ALIGNED(32) elems[4] =
2454     {
2455         tab[vgetq_lane_s32(idxvec.val, 0)],
2456         tab[vgetq_lane_s32(idxvec.val, 1)],
2457         tab[vgetq_lane_s32(idxvec.val, 2)],
2458         tab[vgetq_lane_s32(idxvec.val, 3)]
2459     };
2460     return v_float32x4(vld1q_f32(elems));
2461 }
2462 
inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
{
    /*int CV_DECL_ALIGNED(32) idx[4];
    v_store(idx, idxvec);

    float32x4_t xy02 = vcombine_f32(vld1_f32(tab + idx[0]), vld1_f32(tab + idx[2]));
    float32x4_t xy13 = vcombine_f32(vld1_f32(tab + idx[1]), vld1_f32(tab + idx[3]));

    float32x4x2_t xxyy = vuzpq_f32(xy02, xy13);
    x = v_float32x4(xxyy.val[0]);
    y = v_float32x4(xxyy.val[1]);*/
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);

    x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
    y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
}
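// Reorder elements within a vector:
//   v_interleave_pairs: each group of 4 elements {a b c d} becomes {a c b d}
//   v_interleave_quads: each group of 8 elements {a b c d e f g h} becomes {a e b f c g d h}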
inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
{
    return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0705060403010200)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0705060403010200))));
}
inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
{
    return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0703060205010400)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0703060205010400))));
}
inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }

inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
{
    return v_int16x8(vreinterpretq_s16_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)), vtbl1_s8(vget_high_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)))));
}
inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
{
    int16x4x2_t res = vzip_s16(vget_low_s16(vec.val), vget_high_s16(vec.val));
    return v_int16x8(vcombine_s16(res.val[0], res.val[1]));
}
inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }

inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
{
    int32x2x2_t res = vzip_s32(vget_low_s32(vec.val), vget_high_s32(vec.val));
    return v_int32x4(vcombine_s32(res.val[0], res.val[1]));
}
inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
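// Pack triplets: drop every fourth element and shift the remaining ones toward
// the low lanes, e.g. {a b c _ d e f _ ...} -> {a b c d e f ...}; the tail
// lanes are unspecified. For 4-lane vectors this is the identity.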
inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
{
    return v_int8x16(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0605040201000000)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0807060504020100))), vdupq_n_s8(0), 2));
}
inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }

inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
{
    return v_int16x8(vreinterpretq_s16_s8(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0504030201000000)), vget_high_s8(vreinterpretq_s8_s16(vec.val))), vdupq_n_s8(0), 2)));
}
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }

inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
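// Double-precision LUT variants, available only when CV_SIMD128_64F is set
// (AArch64 builds).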
#if CV_SIMD128_64F
inline v_float64x2 v_lut(const double* tab, const int* idx)
{
    double CV_DECL_ALIGNED(32) elems[2] =
    {
        tab[idx[0]],
        tab[idx[1]]
    };
    return v_float64x2(vld1q_f64(elems));
}

inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
{
    return v_float64x2(vld1q_f64(tab + idx[0]));
}

inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
    double CV_DECL_ALIGNED(32) elems[2] =
    {
        tab[vgetq_lane_s32(idxvec.val, 0)],
        tab[vgetq_lane_s32(idxvec.val, 1)],
    };
    return v_float64x2(vld1q_f64(elems));
}

inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);

    x = v_float64x2(tab[idx[0]], tab[idx[1]]);
    y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
}
#endif

////// FP16 support ///////
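// Convert between packed float16 storage and float32 vectors. With hardware
// FP16 support the conversion uses vcvt_f32_f16 / vcvt_f16_f32; otherwise a
// scalar per-element fallback is used.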
#if CV_FP16
inline v_float32x4 v_load_expand(const float16_t* ptr)
{
    float16x4_t v =
    #ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro
        (float16x4_t)vld1_s16((const short*)ptr);
    #else
        vld1_f16((const __fp16*)ptr);
    #endif
    return v_float32x4(vcvt_f32_f16(v));
}

inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
    float16x4_t hv = vcvt_f16_f32(v.val);

    #ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro
        vst1_s16((short*)ptr, (int16x4_t)hv);
    #else
        vst1_f16((__fp16*)ptr, hv);
    #endif
}
#else
inline v_float32x4 v_load_expand(const float16_t* ptr)
{
    const int N = 4;
    float buf[N];
    for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i];
    return v_load(buf);
}

inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
    const int N = 4;
    float buf[N];
    v_store(buf, v);
    for( int i = 0; i < N; i++ ) ptr[i] = float16_t(buf[i]);
}
#endif
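// NEON leaves no SIMD state that needs resetting, so v_cleanup() is a no-op.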
inline void v_cleanup() {}

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

//! @endcond

}

#endif