1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                          License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
17 // Third party copyrights are property of their respective owners.
18 //
19 // Redistribution and use in source and binary forms, with or without modification,
20 // are permitted provided that the following conditions are met:
21 //
22 //   * Redistribution's of source code must retain the above copyright notice,
23 //     this list of conditions and the following disclaimer.
24 //
25 //   * Redistribution's in binary form must reproduce the above copyright notice,
26 //     this list of conditions and the following disclaimer in the documentation
27 //     and/or other materials provided with the distribution.
28 //
29 //   * The name of the copyright holders may not be used to endorse or promote products
30 //     derived from this software without specific prior written permission.
31 //
32 // This software is provided by the copyright holders and contributors "as is" and
33 // any express or implied warranties, including, but not limited to, the implied
34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
35 // In no event shall the Intel Corporation or contributors be liable for any direct,
36 // indirect, incidental, special, exemplary, or consequential damages
37 // (including, but not limited to, procurement of substitute goods or services;
38 // loss of use, data, or profits; or business interruption) however caused
39 // and on any theory of liability, whether in contract, strict liability,
40 // or tort (including negligence or otherwise) arising in any way out of
41 // the use of this software, even if advised of the possibility of such damage.
42 //
43 //M*/
44 
45 #ifndef OPENCV_HAL_SSE_HPP
46 #define OPENCV_HAL_SSE_HPP
47 
48 #include <algorithm>
49 #include "opencv2/core/utility.hpp"
50 
51 #define CV_SIMD128 1
52 #define CV_SIMD128_64F 1
53 #define CV_SIMD128_FP16 0  // no native operations with FP16 type.
54 
55 namespace cv
56 {
57 
58 //! @cond IGNORED
59 
60 //
61 // Compilation troubleshooting:
62 // - MSVC: error C2719: 'a': formal parameter with requested alignment of 16 won't be aligned
63 //   Replace parameter declaration to const reference:
64 //   -v_int32x4 a
65 //   +const v_int32x4& a
66 //
67 
68 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
69 
70 ///////// Types ////////////
71 
72 struct v_uint8x16
73 {
74     typedef uchar lane_type;
75     typedef __m128i vector_type;
76     enum { nlanes = 16 };
77 
78     /* coverity[uninit_ctor]: suppress warning */
v_uint8x16cv::v_uint8x1679     v_uint8x16() {}
v_uint8x16cv::v_uint8x1680     explicit v_uint8x16(__m128i v) : val(v) {}
v_uint8x16cv::v_uint8x1681     v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
82                uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
83     {
84         val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
85                             (char)v4, (char)v5, (char)v6, (char)v7,
86                             (char)v8, (char)v9, (char)v10, (char)v11,
87                             (char)v12, (char)v13, (char)v14, (char)v15);
88     }
89 
get0cv::v_uint8x1690     uchar get0() const
91     {
92         return (uchar)_mm_cvtsi128_si32(val);
93     }
94 
95     __m128i val;
96 };
97 
98 struct v_int8x16
99 {
100     typedef schar lane_type;
101     typedef __m128i vector_type;
102     enum { nlanes = 16 };
103 
104     /* coverity[uninit_ctor]: suppress warning */
v_int8x16cv::v_int8x16105     v_int8x16() {}
v_int8x16cv::v_int8x16106     explicit v_int8x16(__m128i v) : val(v) {}
v_int8x16cv::v_int8x16107     v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
108               schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
109     {
110         val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
111                             (char)v4, (char)v5, (char)v6, (char)v7,
112                             (char)v8, (char)v9, (char)v10, (char)v11,
113                             (char)v12, (char)v13, (char)v14, (char)v15);
114     }
115 
get0cv::v_int8x16116     schar get0() const
117     {
118         return (schar)_mm_cvtsi128_si32(val);
119     }
120 
121     __m128i val;
122 };
123 
124 struct v_uint16x8
125 {
126     typedef ushort lane_type;
127     typedef __m128i vector_type;
128     enum { nlanes = 8 };
129 
130     /* coverity[uninit_ctor]: suppress warning */
v_uint16x8cv::v_uint16x8131     v_uint16x8() {}
v_uint16x8cv::v_uint16x8132     explicit v_uint16x8(__m128i v) : val(v) {}
v_uint16x8cv::v_uint16x8133     v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
134     {
135         val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
136                              (short)v4, (short)v5, (short)v6, (short)v7);
137     }
138 
get0cv::v_uint16x8139     ushort get0() const
140     {
141         return (ushort)_mm_cvtsi128_si32(val);
142     }
143 
144     __m128i val;
145 };
146 
147 struct v_int16x8
148 {
149     typedef short lane_type;
150     typedef __m128i vector_type;
151     enum { nlanes = 8 };
152 
153     /* coverity[uninit_ctor]: suppress warning */
v_int16x8cv::v_int16x8154     v_int16x8() {}
v_int16x8cv::v_int16x8155     explicit v_int16x8(__m128i v) : val(v) {}
v_int16x8cv::v_int16x8156     v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
157     {
158         val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
159                              (short)v4, (short)v5, (short)v6, (short)v7);
160     }
161 
get0cv::v_int16x8162     short get0() const
163     {
164         return (short)_mm_cvtsi128_si32(val);
165     }
166 
167     __m128i val;
168 };
169 
170 struct v_uint32x4
171 {
172     typedef unsigned lane_type;
173     typedef __m128i vector_type;
174     enum { nlanes = 4 };
175 
176     /* coverity[uninit_ctor]: suppress warning */
v_uint32x4cv::v_uint32x4177     v_uint32x4() {}
v_uint32x4cv::v_uint32x4178     explicit v_uint32x4(__m128i v) : val(v) {}
v_uint32x4cv::v_uint32x4179     v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
180     {
181         val = _mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3);
182     }
183 
get0cv::v_uint32x4184     unsigned get0() const
185     {
186         return (unsigned)_mm_cvtsi128_si32(val);
187     }
188 
189     __m128i val;
190 };
191 
192 struct v_int32x4
193 {
194     typedef int lane_type;
195     typedef __m128i vector_type;
196     enum { nlanes = 4 };
197 
198     /* coverity[uninit_ctor]: suppress warning */
v_int32x4cv::v_int32x4199     v_int32x4() {}
v_int32x4cv::v_int32x4200     explicit v_int32x4(__m128i v) : val(v) {}
v_int32x4cv::v_int32x4201     v_int32x4(int v0, int v1, int v2, int v3)
202     {
203         val = _mm_setr_epi32(v0, v1, v2, v3);
204     }
205 
get0cv::v_int32x4206     int get0() const
207     {
208         return _mm_cvtsi128_si32(val);
209     }
210 
211     __m128i val;
212 };
213 
214 struct v_float32x4
215 {
216     typedef float lane_type;
217     typedef __m128 vector_type;
218     enum { nlanes = 4 };
219 
220     /* coverity[uninit_ctor]: suppress warning */
v_float32x4cv::v_float32x4221     v_float32x4() {}
v_float32x4cv::v_float32x4222     explicit v_float32x4(__m128 v) : val(v) {}
v_float32x4cv::v_float32x4223     v_float32x4(float v0, float v1, float v2, float v3)
224     {
225         val = _mm_setr_ps(v0, v1, v2, v3);
226     }
227 
get0cv::v_float32x4228     float get0() const
229     {
230         return _mm_cvtss_f32(val);
231     }
232 
233     __m128 val;
234 };
235 
236 struct v_uint64x2
237 {
238     typedef uint64 lane_type;
239     typedef __m128i vector_type;
240     enum { nlanes = 2 };
241 
242     /* coverity[uninit_ctor]: suppress warning */
v_uint64x2cv::v_uint64x2243     v_uint64x2() {}
v_uint64x2cv::v_uint64x2244     explicit v_uint64x2(__m128i v) : val(v) {}
v_uint64x2cv::v_uint64x2245     v_uint64x2(uint64 v0, uint64 v1)
246     {
247         val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
248     }
249 
get0cv::v_uint64x2250     uint64 get0() const
251     {
252     #if !defined(__x86_64__) && !defined(_M_X64)
253         int a = _mm_cvtsi128_si32(val);
254         int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
255         return (unsigned)a | ((uint64)(unsigned)b << 32);
256     #else
257         return (uint64)_mm_cvtsi128_si64(val);
258     #endif
259     }
260 
261     __m128i val;
262 };
263 
264 struct v_int64x2
265 {
266     typedef int64 lane_type;
267     typedef __m128i vector_type;
268     enum { nlanes = 2 };
269 
270     /* coverity[uninit_ctor]: suppress warning */
v_int64x2cv::v_int64x2271     v_int64x2() {}
v_int64x2cv::v_int64x2272     explicit v_int64x2(__m128i v) : val(v) {}
v_int64x2cv::v_int64x2273     v_int64x2(int64 v0, int64 v1)
274     {
275         val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
276     }
277 
get0cv::v_int64x2278     int64 get0() const
279     {
280     #if !defined(__x86_64__) && !defined(_M_X64)
281         int a = _mm_cvtsi128_si32(val);
282         int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
283         return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
284     #else
285         return _mm_cvtsi128_si64(val);
286     #endif
287     }
288 
289     __m128i val;
290 };
291 
292 struct v_float64x2
293 {
294     typedef double lane_type;
295     typedef __m128d vector_type;
296     enum { nlanes = 2 };
297 
298     /* coverity[uninit_ctor]: suppress warning */
v_float64x2cv::v_float64x2299     v_float64x2() {}
v_float64x2cv::v_float64x2300     explicit v_float64x2(__m128d v) : val(v) {}
v_float64x2cv::v_float64x2301     v_float64x2(double v0, double v1)
302     {
303         val = _mm_setr_pd(v0, v1);
304     }
305 
get0cv::v_float64x2306     double get0() const
307     {
308         return _mm_cvtsd_f64(val);
309     }
310 
311     __m128d val;
312 };
313 
314 namespace hal_sse_internal
315 {
316     template <typename to_sse_type, typename from_sse_type>
317     to_sse_type v_sse_reinterpret_as(const from_sse_type& val);
318 
319 #define OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(to_sse_type, from_sse_type, sse_cast_intrin) \
320     template<> inline \
321     to_sse_type v_sse_reinterpret_as(const from_sse_type& a) \
322     { return sse_cast_intrin(a); }
323 
324     OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128i, OPENCV_HAL_NOP)
325     OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128, _mm_castps_si128)
326     OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128d, _mm_castpd_si128)
327     OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128i, _mm_castsi128_ps)
328     OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128, OPENCV_HAL_NOP)
329     OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128d, _mm_castpd_ps)
330     OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128i, _mm_castsi128_pd)
331     OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128, _mm_castps_pd)
332     OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128d, OPENCV_HAL_NOP)
333 }
334 
335 #define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
336 inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
337 inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
338 template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
339 { return _Tpvec(cast(a.val)); }
340 
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16,uchar,u8,si128,epi8,schar,OPENCV_HAL_NOP)341 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, schar, OPENCV_HAL_NOP)
342 OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, schar, OPENCV_HAL_NOP)
343 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP)
344 OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP)
345 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP)
346 OPENCV_HAL_IMPL_SSE_INITVEC(v_int32x4, int, s32, si128, epi32, int, OPENCV_HAL_NOP)
347 OPENCV_HAL_IMPL_SSE_INITVEC(v_float32x4, float, f32, ps, ps, float, _mm_castsi128_ps)
348 OPENCV_HAL_IMPL_SSE_INITVEC(v_float64x2, double, f64, pd, pd, double, _mm_castsi128_pd)
349 
350 inline v_uint64x2 v_setzero_u64() { return v_uint64x2(_mm_setzero_si128()); }
v_setzero_s64()351 inline v_int64x2 v_setzero_s64() { return v_int64x2(_mm_setzero_si128()); }
v_setall_u64(uint64 val)352 inline v_uint64x2 v_setall_u64(uint64 val) { return v_uint64x2(val, val); }
v_setall_s64(int64 val)353 inline v_int64x2 v_setall_s64(int64 val) { return v_int64x2(val, val); }
354 
355 template<typename _Tpvec> inline
v_reinterpret_as_u64(const _Tpvec & a)356 v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); }
357 template<typename _Tpvec> inline
v_reinterpret_as_s64(const _Tpvec & a)358 v_int64x2 v_reinterpret_as_s64(const _Tpvec& a) { return v_int64x2(a.val); }
v_reinterpret_as_f32(const v_uint64x2 & a)359 inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& a)
360 { return v_float32x4(_mm_castsi128_ps(a.val)); }
v_reinterpret_as_f32(const v_int64x2 & a)361 inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& a)
362 { return v_float32x4(_mm_castsi128_ps(a.val)); }
v_reinterpret_as_f64(const v_uint64x2 & a)363 inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& a)
364 { return v_float64x2(_mm_castsi128_pd(a.val)); }
v_reinterpret_as_f64(const v_int64x2 & a)365 inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& a)
366 { return v_float64x2(_mm_castsi128_pd(a.val)); }
367 
368 #define OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(_Tpvec, suffix) \
369 inline _Tpvec v_reinterpret_as_##suffix(const v_float32x4& a) \
370 { return _Tpvec(_mm_castps_si128(a.val)); } \
371 inline _Tpvec v_reinterpret_as_##suffix(const v_float64x2& a) \
372 { return _Tpvec(_mm_castpd_si128(a.val)); }
373 
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint8x16,u8)374 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint8x16, u8)
375 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int8x16, s8)
376 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint16x8, u16)
377 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int16x8, s16)
378 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint32x4, u32)
379 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32)
380 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64)
381 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64)
382 
383 inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& a) {return a; }
v_reinterpret_as_f64(const v_float64x2 & a)384 inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a) {return a; }
v_reinterpret_as_f32(const v_float64x2 & a)385 inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) {return v_float32x4(_mm_castpd_ps(a.val)); }
v_reinterpret_as_f64(const v_float32x4 & a)386 inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) {return v_float64x2(_mm_castps_pd(a.val)); }
387 
388 //////////////// PACK ///////////////
v_pack(const v_uint16x8 & a,const v_uint16x8 & b)389 inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
390 {
391     __m128i delta = _mm_set1_epi16(255);
392     return v_uint8x16(_mm_packus_epi16(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta)),
393                                        _mm_subs_epu16(b.val, _mm_subs_epu16(b.val, delta))));
394 }
395 
v_pack_store(uchar * ptr,const v_uint16x8 & a)396 inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
397 {
398     __m128i delta = _mm_set1_epi16(255);
399     __m128i a1 = _mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta));
400     _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
401 }
402 
v_pack_u(const v_int16x8 & a,const v_int16x8 & b)403 inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
404 { return v_uint8x16(_mm_packus_epi16(a.val, b.val)); }
405 
v_pack_u_store(uchar * ptr,const v_int16x8 & a)406 inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
407 { _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); }
408 
409 template<int n> inline
v_rshr_pack(const v_uint16x8 & a,const v_uint16x8 & b)410 v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
411 {
412     // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
413     __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
414     return v_uint8x16(_mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(a.val, delta), n),
415                                        _mm_srli_epi16(_mm_adds_epu16(b.val, delta), n)));
416 }
417 
418 template<int n> inline
v_rshr_pack_store(uchar * ptr,const v_uint16x8 & a)419 void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
420 {
421     __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
422     __m128i a1 = _mm_srli_epi16(_mm_adds_epu16(a.val, delta), n);
423     _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
424 }
425 
426 template<int n> inline
v_rshr_pack_u(const v_int16x8 & a,const v_int16x8 & b)427 v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
428 {
429     __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
430     return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
431                                        _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
432 }
433 
434 template<int n> inline
v_rshr_pack_u_store(uchar * ptr,const v_int16x8 & a)435 void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
436 {
437     __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
438     __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
439     _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
440 }
441 
v_pack(const v_int16x8 & a,const v_int16x8 & b)442 inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
443 { return v_int8x16(_mm_packs_epi16(a.val, b.val)); }
444 
v_pack_store(schar * ptr,const v_int16x8 & a)445 inline void v_pack_store(schar* ptr, const v_int16x8& a)
446 { _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); }
447 
448 template<int n> inline
v_rshr_pack(const v_int16x8 & a,const v_int16x8 & b)449 v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
450 {
451     // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
452     __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
453     return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
454                                      _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
455 }
456 template<int n> inline
v_rshr_pack_store(schar * ptr,const v_int16x8 & a)457 void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
458 {
459     // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
460     __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
461     __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
462     _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a1, a1));
463 }
464 
465 
466 // byte-wise "mask ? a : b"
v_select_si128(__m128i mask,__m128i a,__m128i b)467 inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b)
468 {
469 #if CV_SSE4_1
470     return _mm_blendv_epi8(b, a, mask);
471 #else
472     return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask));
473 #endif
474 }
475 
v_pack(const v_uint32x4 & a,const v_uint32x4 & b)476 inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
477 { return v_uint16x8(_v128_packs_epu32(a.val, b.val)); }
478 
v_pack_store(ushort * ptr,const v_uint32x4 & a)479 inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
480 {
481     __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
482     __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
483     __m128i r = _mm_packs_epi32(a1, a1);
484     _mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(r, _mm_set1_epi16(-32768)));
485 }
486 
487 template<int n> inline
v_rshr_pack(const v_uint32x4 & a,const v_uint32x4 & b)488 v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
489 {
490     __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
491     __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
492     __m128i b1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, delta), n), delta32);
493     return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768)));
494 }
495 
496 template<int n> inline
v_rshr_pack_store(ushort * ptr,const v_uint32x4 & a)497 void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
498 {
499     __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
500     __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
501     __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
502     _mm_storel_epi64((__m128i*)ptr, a2);
503 }
504 
v_pack_u(const v_int32x4 & a,const v_int32x4 & b)505 inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
506 {
507 #if CV_SSE4_1
508     return v_uint16x8(_mm_packus_epi32(a.val, b.val));
509 #else
510     __m128i delta32 = _mm_set1_epi32(32768);
511 
512     // preliminary saturate negative values to zero
513     __m128i a1 = _mm_and_si128(a.val, _mm_cmpgt_epi32(a.val, _mm_set1_epi32(0)));
514     __m128i b1 = _mm_and_si128(b.val, _mm_cmpgt_epi32(b.val, _mm_set1_epi32(0)));
515 
516     __m128i r = _mm_packs_epi32(_mm_sub_epi32(a1, delta32), _mm_sub_epi32(b1, delta32));
517     return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
518 #endif
519 }
520 
v_pack_u_store(ushort * ptr,const v_int32x4 & a)521 inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
522 {
523 #if CV_SSE4_1
524     _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi32(a.val, a.val));
525 #else
526     __m128i delta32 = _mm_set1_epi32(32768);
527     __m128i a1 = _mm_sub_epi32(a.val, delta32);
528     __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
529     _mm_storel_epi64((__m128i*)ptr, r);
530 #endif
531 }
532 
533 template<int n> inline
v_rshr_pack_u(const v_int32x4 & a,const v_int32x4 & b)534 v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
535 {
536 #if CV_SSE4_1
537     __m128i delta = _mm_set1_epi32(1 << (n - 1));
538     return v_uint16x8(_mm_packus_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n),
539                                        _mm_srai_epi32(_mm_add_epi32(b.val, delta), n)));
540 #else
541     __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
542     __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
543     __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
544     __m128i b1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(b.val, delta), n), delta32);
545     __m128i b2 = _mm_sub_epi16(_mm_packs_epi32(b1, b1), _mm_set1_epi16(-32768));
546     return v_uint16x8(_mm_unpacklo_epi64(a2, b2));
547 #endif
548 }
549 
550 template<int n> inline
v_rshr_pack_u_store(ushort * ptr,const v_int32x4 & a)551 void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
552 {
553 #if CV_SSE4_1
554     __m128i delta = _mm_set1_epi32(1 << (n - 1));
555     __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
556     _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi32(a1, a1));
557 #else
558     __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
559     __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
560     __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
561     _mm_storel_epi64((__m128i*)ptr, a2);
562 #endif
563 }
564 
v_pack(const v_int32x4 & a,const v_int32x4 & b)565 inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
566 { return v_int16x8(_mm_packs_epi32(a.val, b.val)); }
567 
v_pack_store(short * ptr,const v_int32x4 & a)568 inline void v_pack_store(short* ptr, const v_int32x4& a)
569 {
570     _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a.val, a.val));
571 }
572 
573 template<int n> inline
v_rshr_pack(const v_int32x4 & a,const v_int32x4 & b)574 v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
575 {
576     __m128i delta = _mm_set1_epi32(1 << (n-1));
577     return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n),
578                                      _mm_srai_epi32(_mm_add_epi32(b.val, delta), n)));
579 }
580 
581 template<int n> inline
v_rshr_pack_store(short * ptr,const v_int32x4 & a)582 void v_rshr_pack_store(short* ptr, const v_int32x4& a)
583 {
584     __m128i delta = _mm_set1_epi32(1 << (n-1));
585     __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
586     _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1));
587 }
588 
589 
590 // [a0 0 | b0 0]  [a1 0 | b1 0]
v_pack(const v_uint64x2 & a,const v_uint64x2 & b)591 inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
592 {
593     __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0
594     __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0
595     return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
596 }
597 
v_pack_store(unsigned * ptr,const v_uint64x2 & a)598 inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
599 {
600     __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
601     _mm_storel_epi64((__m128i*)ptr, a1);
602 }
603 
604 // [a0 0 | b0 0]  [a1 0 | b1 0]
v_pack(const v_int64x2 & a,const v_int64x2 & b)605 inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
606 {
607     __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0
608     __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0
609     return v_int32x4(_mm_unpacklo_epi32(v0, v1));
610 }
611 
v_pack_store(int * ptr,const v_int64x2 & a)612 inline void v_pack_store(int* ptr, const v_int64x2& a)
613 {
614     __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
615     _mm_storel_epi64((__m128i*)ptr, a1);
616 }
617 
618 template<int n> inline
v_rshr_pack(const v_uint64x2 & a,const v_uint64x2 & b)619 v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
620 {
621     uint64 delta = (uint64)1 << (n-1);
622     v_uint64x2 delta2(delta, delta);
623     __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
624     __m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n);
625     __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0
626     __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0
627     return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
628 }
629 
630 template<int n> inline
v_rshr_pack_store(unsigned * ptr,const v_uint64x2 & a)631 void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
632 {
633     uint64 delta = (uint64)1 << (n-1);
634     v_uint64x2 delta2(delta, delta);
635     __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
636     __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
637     _mm_storel_epi64((__m128i*)ptr, a2);
638 }
639 
v_sign_epi64(__m128i a)640 inline __m128i v_sign_epi64(__m128i a)
641 {
642     return _mm_shuffle_epi32(_mm_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1)); // x m0 | x m1
643 }
644 
v_srai_epi64(__m128i a,int imm)645 inline __m128i v_srai_epi64(__m128i a, int imm)
646 {
647     __m128i smask = v_sign_epi64(a);
648     return _mm_xor_si128(_mm_srli_epi64(_mm_xor_si128(a, smask), imm), smask);
649 }
650 
651 template<int n> inline
v_rshr_pack(const v_int64x2 & a,const v_int64x2 & b)652 v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
653 {
654     int64 delta = (int64)1 << (n-1);
655     v_int64x2 delta2(delta, delta);
656     __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
657     __m128i b1 = v_srai_epi64(_mm_add_epi64(b.val, delta2.val), n);
658     __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0
659     __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0
660     return v_int32x4(_mm_unpacklo_epi32(v0, v1));
661 }
662 
663 template<int n> inline
v_rshr_pack_store(int * ptr,const v_int64x2 & a)664 void v_rshr_pack_store(int* ptr, const v_int64x2& a)
665 {
666     int64 delta = (int64)1 << (n-1);
667     v_int64x2 delta2(delta, delta);
668     __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
669     __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
670     _mm_storel_epi64((__m128i*)ptr, a2);
671 }
672 
673 // pack boolean
v_pack_b(const v_uint16x8 & a,const v_uint16x8 & b)674 inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
675 {
676     __m128i ab = _mm_packs_epi16(a.val, b.val);
677     return v_uint8x16(ab);
678 }
679 
v_pack_b(const v_uint32x4 & a,const v_uint32x4 & b,const v_uint32x4 & c,const v_uint32x4 & d)680 inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
681                            const v_uint32x4& c, const v_uint32x4& d)
682 {
683     __m128i ab = _mm_packs_epi32(a.val, b.val);
684     __m128i cd = _mm_packs_epi32(c.val, d.val);
685     return v_uint8x16(_mm_packs_epi16(ab, cd));
686 }
687 
v_pack_b(const v_uint64x2 & a,const v_uint64x2 & b,const v_uint64x2 & c,const v_uint64x2 & d,const v_uint64x2 & e,const v_uint64x2 & f,const v_uint64x2 & g,const v_uint64x2 & h)688 inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
689                            const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
690                            const v_uint64x2& g, const v_uint64x2& h)
691 {
692     __m128i ab = _mm_packs_epi32(a.val, b.val);
693     __m128i cd = _mm_packs_epi32(c.val, d.val);
694     __m128i ef = _mm_packs_epi32(e.val, f.val);
695     __m128i gh = _mm_packs_epi32(g.val, h.val);
696 
697     __m128i abcd = _mm_packs_epi32(ab, cd);
698     __m128i efgh = _mm_packs_epi32(ef, gh);
699     return v_uint8x16(_mm_packs_epi16(abcd, efgh));
700 }
701 
v_matmul(const v_float32x4 & v,const v_float32x4 & m0,const v_float32x4 & m1,const v_float32x4 & m2,const v_float32x4 & m3)702 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
703                             const v_float32x4& m1, const v_float32x4& m2,
704                             const v_float32x4& m3)
705 {
706     __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
707     __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
708     __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
709     __m128 v3 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(3, 3, 3, 3)), m3.val);
710 
711     return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, v3)));
712 }
713 
v_matmuladd(const v_float32x4 & v,const v_float32x4 & m0,const v_float32x4 & m1,const v_float32x4 & m2,const v_float32x4 & a)714 inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
715                                const v_float32x4& m1, const v_float32x4& m2,
716                                const v_float32x4& a)
717 {
718     __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
719     __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
720     __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
721 
722     return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, a.val)));
723 }
724 
725 #define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \
726     inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
727     { \
728         return _Tpvec(intrin(a.val, b.val)); \
729     } \
730     inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
731     { \
732         a.val = intrin(a.val, b.val); \
733         return a; \
734     }
735 
736 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8)
737 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8)
738 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
739 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8)
740 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16)
741 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16)
742 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
743 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
744 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32)
745 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32)
746 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint32x4, _v128_mullo_epi32)
747 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
748 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
749 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int32x4, _v128_mullo_epi32)
750 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps)
751 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps)
752 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps)
753 OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps)
754 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd)
755 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd)
756 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd)
757 OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd)
758 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64)
759 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64)
760 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64)
761 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)
762 
763 // saturating multiply 8-bit, 16-bit
764 #define OPENCV_HAL_IMPL_SSE_MUL_SAT(_Tpvec, _Tpwvec)             \
765     inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b)  \
766     {                                                            \
767         _Tpwvec c, d;                                            \
768         v_mul_expand(a, b, c, d);                                \
769         return v_pack(c, d);                                     \
770     }                                                            \
771     inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b)      \
772     { a = a * b; return a; }
773 
OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint8x16,v_uint16x8)774 OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint8x16, v_uint16x8)
775 OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int8x16,  v_int16x8)
776 OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint16x8, v_uint32x4)
777 OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int16x8,  v_int32x4)
778 
779 //  Multiply and expand
780 inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
781                          v_uint16x8& c, v_uint16x8& d)
782 {
783     v_uint16x8 a0, a1, b0, b1;
784     v_expand(a, a0, a1);
785     v_expand(b, b0, b1);
786     c = v_mul_wrap(a0, b0);
787     d = v_mul_wrap(a1, b1);
788 }
789 
v_mul_expand(const v_int8x16 & a,const v_int8x16 & b,v_int16x8 & c,v_int16x8 & d)790 inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
791                          v_int16x8& c, v_int16x8& d)
792 {
793     v_int16x8 a0, a1, b0, b1;
794     v_expand(a, a0, a1);
795     v_expand(b, b0, b1);
796     c = v_mul_wrap(a0, b0);
797     d = v_mul_wrap(a1, b1);
798 }
799 
v_mul_expand(const v_int16x8 & a,const v_int16x8 & b,v_int32x4 & c,v_int32x4 & d)800 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
801                          v_int32x4& c, v_int32x4& d)
802 {
803     __m128i v0 = _mm_mullo_epi16(a.val, b.val);
804     __m128i v1 = _mm_mulhi_epi16(a.val, b.val);
805     c.val = _mm_unpacklo_epi16(v0, v1);
806     d.val = _mm_unpackhi_epi16(v0, v1);
807 }
808 
v_mul_expand(const v_uint16x8 & a,const v_uint16x8 & b,v_uint32x4 & c,v_uint32x4 & d)809 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
810                          v_uint32x4& c, v_uint32x4& d)
811 {
812     __m128i v0 = _mm_mullo_epi16(a.val, b.val);
813     __m128i v1 = _mm_mulhi_epu16(a.val, b.val);
814     c.val = _mm_unpacklo_epi16(v0, v1);
815     d.val = _mm_unpackhi_epi16(v0, v1);
816 }
817 
v_mul_expand(const v_uint32x4 & a,const v_uint32x4 & b,v_uint64x2 & c,v_uint64x2 & d)818 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
819                          v_uint64x2& c, v_uint64x2& d)
820 {
821     __m128i c0 = _mm_mul_epu32(a.val, b.val);
822     __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
823     c.val = _mm_unpacklo_epi64(c0, c1);
824     d.val = _mm_unpackhi_epi64(c0, c1);
825 }
826 
v_mul_hi(const v_int16x8 & a,const v_int16x8 & b)827 inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) { return v_int16x8(_mm_mulhi_epi16(a.val, b.val)); }
v_mul_hi(const v_uint16x8 & a,const v_uint16x8 & b)828 inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) { return v_uint16x8(_mm_mulhi_epu16(a.val, b.val)); }
829 
830 //////// Dot Product ////////
831 
832 // 16 >> 32
v_dotprod(const v_int16x8 & a,const v_int16x8 & b)833 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
834 { return v_int32x4(_mm_madd_epi16(a.val, b.val)); }
v_dotprod(const v_int16x8 & a,const v_int16x8 & b,const v_int32x4 & c)835 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
836 { return v_dotprod(a, b) + c; }
837 
838 // 32 >> 64
v_dotprod(const v_int32x4 & a,const v_int32x4 & b)839 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
840 {
841 #if CV_SSE4_1
842     __m128i even = _mm_mul_epi32(a.val, b.val);
843     __m128i odd = _mm_mul_epi32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
844     return v_int64x2(_mm_add_epi64(even, odd));
845 #else
846     __m128i even_u = _mm_mul_epu32(a.val, b.val);
847     __m128i odd_u = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
848     // convert unsigned to signed high multiplication (from: Agner Fog(veclib) and H S Warren: Hacker's delight, 2003, p. 132)
849     __m128i a_sign = _mm_srai_epi32(a.val, 31);
850     __m128i b_sign = _mm_srai_epi32(b.val, 31);
851     // |x * sign of x
852     __m128i axb  = _mm_and_si128(a.val, b_sign);
853     __m128i bxa  = _mm_and_si128(b.val, a_sign);
854     // sum of sign corrections
855     __m128i ssum = _mm_add_epi32(bxa, axb);
856     __m128i even_ssum = _mm_slli_epi64(ssum, 32);
857     __m128i odd_ssum = _mm_and_si128(ssum, _mm_set_epi32(-1, 0, -1, 0));
858     // convert to signed and prod
859     return v_int64x2(_mm_add_epi64(_mm_sub_epi64(even_u, even_ssum), _mm_sub_epi64(odd_u, odd_ssum)));
860 #endif
861 }
v_dotprod(const v_int32x4 & a,const v_int32x4 & b,const v_int64x2 & c)862 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
863 { return v_dotprod(a, b) + c; }
864 
865 // 8 >> 32
v_dotprod_expand(const v_uint8x16 & a,const v_uint8x16 & b)866 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
867 {
868     __m128i a0 = _mm_srli_epi16(_mm_slli_si128(a.val, 1), 8); // even
869     __m128i a1 = _mm_srli_epi16(a.val, 8); // odd
870     __m128i b0 = _mm_srli_epi16(_mm_slli_si128(b.val, 1), 8);
871     __m128i b1 = _mm_srli_epi16(b.val, 8);
872     __m128i p0 = _mm_madd_epi16(a0, b0);
873     __m128i p1 = _mm_madd_epi16(a1, b1);
874     return v_uint32x4(_mm_add_epi32(p0, p1));
875 }
v_dotprod_expand(const v_uint8x16 & a,const v_uint8x16 & b,const v_uint32x4 & c)876 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
877 { return v_dotprod_expand(a, b) + c; }
878 
v_dotprod_expand(const v_int8x16 & a,const v_int8x16 & b)879 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
880 {
881     __m128i a0 = _mm_srai_epi16(_mm_slli_si128(a.val, 1), 8); // even
882     __m128i a1 = _mm_srai_epi16(a.val, 8); // odd
883     __m128i b0 = _mm_srai_epi16(_mm_slli_si128(b.val, 1), 8);
884     __m128i b1 = _mm_srai_epi16(b.val, 8);
885     __m128i p0 = _mm_madd_epi16(a0, b0);
886     __m128i p1 = _mm_madd_epi16(a1, b1);
887     return v_int32x4(_mm_add_epi32(p0, p1));
888 }
v_dotprod_expand(const v_int8x16 & a,const v_int8x16 & b,const v_int32x4 & c)889 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
890 { return v_dotprod_expand(a, b) + c; }
891 
892 // 16 >> 64
v_dotprod_expand(const v_uint16x8 & a,const v_uint16x8 & b)893 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
894 {
895     v_uint32x4 c, d;
896     v_mul_expand(a, b, c, d);
897 
898     v_uint64x2 c0, c1, d0, d1;
899     v_expand(c, c0, c1);
900     v_expand(d, d0, d1);
901 
902     c0 += c1; d0 += d1;
903     return v_uint64x2(_mm_add_epi64(
904         _mm_unpacklo_epi64(c0.val, d0.val),
905         _mm_unpackhi_epi64(c0.val, d0.val)
906     ));
907 }
v_dotprod_expand(const v_uint16x8 & a,const v_uint16x8 & b,const v_uint64x2 & c)908 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
909 { return v_dotprod_expand(a, b) + c; }
910 
v_dotprod_expand(const v_int16x8 & a,const v_int16x8 & b)911 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
912 {
913     v_int32x4 prod = v_dotprod(a, b);
914     v_int64x2 c, d;
915     v_expand(prod, c, d);
916     return v_int64x2(_mm_add_epi64(
917         _mm_unpacklo_epi64(c.val, d.val),
918         _mm_unpackhi_epi64(c.val, d.val)
919     ));
920 }
v_dotprod_expand(const v_int16x8 & a,const v_int16x8 & b,const v_int64x2 & c)921 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
922 { return v_dotprod_expand(a, b) + c; }
923 
924 // 32 >> 64f
v_dotprod_expand(const v_int32x4 & a,const v_int32x4 & b)925 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
926 {
927 #if CV_SSE4_1
928     return v_cvt_f64(v_dotprod(a, b));
929 #else
930     v_float64x2 c = v_cvt_f64(a) * v_cvt_f64(b);
931     v_float64x2 d = v_cvt_f64_high(a) * v_cvt_f64_high(b);
932 
933     return v_float64x2(_mm_add_pd(
934         _mm_unpacklo_pd(c.val, d.val),
935         _mm_unpackhi_pd(c.val, d.val)
936     ));
937 #endif
938 }
v_dotprod_expand(const v_int32x4 & a,const v_int32x4 & b,const v_float64x2 & c)939 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
940 { return v_dotprod_expand(a, b) + c; }
941 
942 //////// Fast Dot Product ////////
943 
944 // 16 >> 32
v_dotprod_fast(const v_int16x8 & a,const v_int16x8 & b)945 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
946 { return v_dotprod(a, b); }
v_dotprod_fast(const v_int16x8 & a,const v_int16x8 & b,const v_int32x4 & c)947 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
948 { return v_dotprod(a, b) + c; }
949 
950 // 32 >> 64
v_dotprod_fast(const v_int32x4 & a,const v_int32x4 & b)951 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
952 { return v_dotprod(a, b); }
v_dotprod_fast(const v_int32x4 & a,const v_int32x4 & b,const v_int64x2 & c)953 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
954 { return v_dotprod_fast(a, b) + c; }
955 
956 // 8 >> 32
v_dotprod_expand_fast(const v_uint8x16 & a,const v_uint8x16 & b)957 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
958 {
959     __m128i a0 = v_expand_low(a).val;
960     __m128i a1 = v_expand_high(a).val;
961     __m128i b0 = v_expand_low(b).val;
962     __m128i b1 = v_expand_high(b).val;
963     __m128i p0 = _mm_madd_epi16(a0, b0);
964     __m128i p1 = _mm_madd_epi16(a1, b1);
965     return v_uint32x4(_mm_add_epi32(p0, p1));
966 }
v_dotprod_expand_fast(const v_uint8x16 & a,const v_uint8x16 & b,const v_uint32x4 & c)967 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
968 { return v_dotprod_expand_fast(a, b) + c; }
969 
v_dotprod_expand_fast(const v_int8x16 & a,const v_int8x16 & b)970 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
971 {
972 #if CV_SSE4_1
973     __m128i a0 = _mm_cvtepi8_epi16(a.val);
974     __m128i a1 = v_expand_high(a).val;
975     __m128i b0 = _mm_cvtepi8_epi16(b.val);
976     __m128i b1 = v_expand_high(b).val;
977     __m128i p0 = _mm_madd_epi16(a0, b0);
978     __m128i p1 = _mm_madd_epi16(a1, b1);
979     return v_int32x4(_mm_add_epi32(p0, p1));
980 #else
981     return v_dotprod_expand(a, b);
982 #endif
983 }
v_dotprod_expand_fast(const v_int8x16 & a,const v_int8x16 & b,const v_int32x4 & c)984 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
985 { return v_dotprod_expand_fast(a, b) + c; }
986 
987 // 16 >> 64
v_dotprod_expand_fast(const v_uint16x8 & a,const v_uint16x8 & b)988 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
989 {
990     v_uint32x4 c, d;
991     v_mul_expand(a, b, c, d);
992 
993     v_uint64x2 c0, c1, d0, d1;
994     v_expand(c, c0, c1);
995     v_expand(d, d0, d1);
996 
997     c0 += c1; d0 += d1;
998     return c0 + d0;
999 }
v_dotprod_expand_fast(const v_uint16x8 & a,const v_uint16x8 & b,const v_uint64x2 & c)1000 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
1001 { return v_dotprod_expand_fast(a, b) + c; }
1002 
v_dotprod_expand_fast(const v_int16x8 & a,const v_int16x8 & b)1003 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
1004 {
1005     v_int32x4 prod = v_dotprod(a, b);
1006     v_int64x2 c, d;
1007     v_expand(prod, c, d);
1008     return c + d;
1009 }
v_dotprod_expand_fast(const v_int16x8 & a,const v_int16x8 & b,const v_int64x2 & c)1010 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
1011 { return v_dotprod_expand_fast(a, b) + c; }
1012 
1013 // 32 >> 64f
1014 v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c);
v_dotprod_expand_fast(const v_int32x4 & a,const v_int32x4 & b)1015 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
1016 { return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); }
v_dotprod_expand_fast(const v_int32x4 & a,const v_int32x4 & b,const v_float64x2 & c)1017 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a,   const v_int32x4& b, const v_float64x2& c)
1018 { return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); }
1019 
1020 #define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
1021     OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
1022     OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
1023     OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \
1024     inline _Tpvec operator ~ (const _Tpvec& a) \
1025     { \
1026         return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \
1027     }
1028 
1029 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint8x16, si128, _mm_set1_epi32(-1))
1030 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int8x16, si128, _mm_set1_epi32(-1))
1031 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint16x8, si128, _mm_set1_epi32(-1))
1032 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int16x8, si128, _mm_set1_epi32(-1))
1033 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint32x4, si128, _mm_set1_epi32(-1))
1034 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1))
1035 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint64x2, si128, _mm_set1_epi32(-1))
1036 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int64x2, si128, _mm_set1_epi32(-1))
1037 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1)))
1038 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1)))
1039 
v_sqrt(const v_float32x4 & x)1040 inline v_float32x4 v_sqrt(const v_float32x4& x)
1041 { return v_float32x4(_mm_sqrt_ps(x.val)); }
1042 
v_invsqrt(const v_float32x4 & x)1043 inline v_float32x4 v_invsqrt(const v_float32x4& x)
1044 {
1045     const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f);
1046     __m128 t = x.val;
1047     __m128 h = _mm_mul_ps(t, _0_5);
1048     t = _mm_rsqrt_ps(t);
1049     t = _mm_mul_ps(t, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t, t), h)));
1050     return v_float32x4(t);
1051 }
1052 
v_sqrt(const v_float64x2 & x)1053 inline v_float64x2 v_sqrt(const v_float64x2& x)
1054 { return v_float64x2(_mm_sqrt_pd(x.val)); }
1055 
v_invsqrt(const v_float64x2 & x)1056 inline v_float64x2 v_invsqrt(const v_float64x2& x)
1057 {
1058     const __m128d v_1 = _mm_set1_pd(1.);
1059     return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val)));
1060 }
1061 
1062 #define OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(_Tpuvec, _Tpsvec, func, suffix, subWidth) \
1063 inline _Tpuvec v_abs(const _Tpsvec& x) \
1064 { return _Tpuvec(_mm_##func##_ep##suffix(x.val, _mm_sub_ep##subWidth(_mm_setzero_si128(), x.val))); }
1065 
OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint8x16,v_int8x16,min,u8,i8)1066 OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint8x16, v_int8x16, min, u8, i8)
1067 OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint16x8, v_int16x8, max, i16, i16)
1068 inline v_uint32x4 v_abs(const v_int32x4& x)
1069 {
1070     __m128i s = _mm_srli_epi32(x.val, 31);
1071     __m128i f = _mm_srai_epi32(x.val, 31);
1072     return v_uint32x4(_mm_add_epi32(_mm_xor_si128(x.val, f), s));
1073 }
v_abs(const v_float32x4 & x)1074 inline v_float32x4 v_abs(const v_float32x4& x)
1075 { return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); }
v_abs(const v_float64x2 & x)1076 inline v_float64x2 v_abs(const v_float64x2& x)
1077 {
1078     return v_float64x2(_mm_and_pd(x.val,
1079         _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1))));
1080 }
1081 
1082 // TODO: exp, log, sin, cos
1083 
1084 #define OPENCV_HAL_IMPL_SSE_BIN_FUNC(_Tpvec, func, intrin) \
1085 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
1086 { \
1087     return _Tpvec(intrin(a.val, b.val)); \
1088 }
1089 
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16,v_min,_mm_min_epu8)1090 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_min, _mm_min_epu8)
1091 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_max, _mm_max_epu8)
1092 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_min, _mm_min_epi16)
1093 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_max, _mm_max_epi16)
1094 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_min, _mm_min_ps)
1095 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_max, _mm_max_ps)
1096 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_min, _mm_min_pd)
1097 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd)
1098 
1099 inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b)
1100 {
1101 #if CV_SSE4_1
1102     return v_int8x16(_mm_min_epi8(a.val, b.val));
1103 #else
1104     __m128i delta = _mm_set1_epi8((char)-128);
1105     return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta),
1106                                                        _mm_xor_si128(b.val, delta))));
1107 #endif
1108 }
v_max(const v_int8x16 & a,const v_int8x16 & b)1109 inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b)
1110 {
1111 #if CV_SSE4_1
1112     return v_int8x16(_mm_max_epi8(a.val, b.val));
1113 #else
1114     __m128i delta = _mm_set1_epi8((char)-128);
1115     return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta),
1116                                                        _mm_xor_si128(b.val, delta))));
1117 #endif
1118 }
v_min(const v_uint16x8 & a,const v_uint16x8 & b)1119 inline v_uint16x8 v_min(const v_uint16x8& a, const v_uint16x8& b)
1120 {
1121 #if CV_SSE4_1
1122     return v_uint16x8(_mm_min_epu16(a.val, b.val));
1123 #else
1124     return v_uint16x8(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, b.val)));
1125 #endif
1126 }
v_max(const v_uint16x8 & a,const v_uint16x8 & b)1127 inline v_uint16x8 v_max(const v_uint16x8& a, const v_uint16x8& b)
1128 {
1129 #if CV_SSE4_1
1130     return v_uint16x8(_mm_max_epu16(a.val, b.val));
1131 #else
1132     return v_uint16x8(_mm_adds_epu16(_mm_subs_epu16(a.val, b.val), b.val));
1133 #endif
1134 }
v_min(const v_uint32x4 & a,const v_uint32x4 & b)1135 inline v_uint32x4 v_min(const v_uint32x4& a, const v_uint32x4& b)
1136 {
1137 #if CV_SSE4_1
1138     return v_uint32x4(_mm_min_epu32(a.val, b.val));
1139 #else
1140     __m128i delta = _mm_set1_epi32((int)0x80000000);
1141     __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
1142     return v_uint32x4(v_select_si128(mask, b.val, a.val));
1143 #endif
1144 }
v_max(const v_uint32x4 & a,const v_uint32x4 & b)1145 inline v_uint32x4 v_max(const v_uint32x4& a, const v_uint32x4& b)
1146 {
1147 #if CV_SSE4_1
1148     return v_uint32x4(_mm_max_epu32(a.val, b.val));
1149 #else
1150     __m128i delta = _mm_set1_epi32((int)0x80000000);
1151     __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
1152     return v_uint32x4(v_select_si128(mask, a.val, b.val));
1153 #endif
1154 }
v_min(const v_int32x4 & a,const v_int32x4 & b)1155 inline v_int32x4 v_min(const v_int32x4& a, const v_int32x4& b)
1156 {
1157 #if CV_SSE4_1
1158     return v_int32x4(_mm_min_epi32(a.val, b.val));
1159 #else
1160     return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), b.val, a.val));
1161 #endif
1162 }
v_max(const v_int32x4 & a,const v_int32x4 & b)1163 inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b)
1164 {
1165 #if CV_SSE4_1
1166     return v_int32x4(_mm_max_epi32(a.val, b.val));
1167 #else
1168     return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), a.val, b.val));
1169 #endif
1170 }
1171 
1172 #define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \
1173 inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
1174 { return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
1175 inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \
1176 { \
1177     __m128i not_mask = _mm_set1_epi32(-1); \
1178     return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
1179 } \
1180 inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
1181 { return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
1182 inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \
1183 { \
1184     __m128i not_mask = _mm_set1_epi32(-1); \
1185     return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
1186 } \
1187 inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \
1188 { \
1189     __m128i smask = _mm_set1_##suffix(sbit); \
1190     return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \
1191 } \
1192 inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
1193 { \
1194     __m128i smask = _mm_set1_##suffix(sbit); \
1195     return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \
1196 } \
1197 inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \
1198 { \
1199     __m128i smask = _mm_set1_##suffix(sbit); \
1200     __m128i not_mask = _mm_set1_epi32(-1); \
1201     __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \
1202     return _Tpuvec(_mm_xor_si128(res, not_mask)); \
1203 } \
1204 inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \
1205 { \
1206     __m128i smask = _mm_set1_##suffix(sbit); \
1207     __m128i not_mask = _mm_set1_epi32(-1); \
1208     __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \
1209     return _Tpuvec(_mm_xor_si128(res, not_mask)); \
1210 } \
1211 inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \
1212 { \
1213     return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \
1214 } \
1215 inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
1216 { \
1217     return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \
1218 } \
1219 inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \
1220 { \
1221     __m128i not_mask = _mm_set1_epi32(-1); \
1222     return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \
1223 } \
1224 inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \
1225 { \
1226     __m128i not_mask = _mm_set1_epi32(-1); \
1227     return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \
1228 }
1229 
1230 OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)-128)
1231 OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768)
1232 OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000)
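// Illustrative usage sketch (not part of the original header; variable names are examples).
// The unsigned overloads above reuse the signed _mm_cmpgt_* instructions by XOR-ing both
// operands with the sign bit (sbit), which maps unsigned order onto signed order:
//
//     v_uint32x4 a = v_setall_u32(10u), b = v_setall_u32(3000000000u);
//     v_uint32x4 m = a < b;      // all-ones lanes: 10 < 3e9 holds in unsigned order
//     bool any = v_check_any(m); // true if any lane of the mask is set
//
// Each operator returns a per-lane mask (0 or all bits set), suitable for v_select().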
1233 
1234 #define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \
1235 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
1236 { return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
1237 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
1238 { return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \
1239 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
1240 { return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \
1241 inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
1242 { return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \
1243 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
1244 { return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \
1245 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
1246 { return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); }
1247 
1248 OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)
1249 OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)
1250 
1251 #if CV_SSE4_1
1252 #define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \
1253 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
1254 { return _Tpvec(_mm_cmpeq_epi64(a.val, b.val)); } \
1255 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
1256 { return ~(a == b); }
1257 #else
1258 #define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \
1259 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
1260 { __m128i cmp = _mm_cmpeq_epi32(a.val, b.val); \
1261   return _Tpvec(_mm_and_si128(cmp, _mm_shuffle_epi32(cmp, _MM_SHUFFLE(2, 3, 0, 1)))); } \
1262 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
1263 { return ~(a == b); }
1264 #endif
1265 
1266 OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2)
1267 OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2)
1268 
1269 inline v_float32x4 v_not_nan(const v_float32x4& a)
1270 { return v_float32x4(_mm_cmpord_ps(a.val, a.val)); }
1271 inline v_float64x2 v_not_nan(const v_float64x2& a)
1272 { return v_float64x2(_mm_cmpord_pd(a.val, a.val)); }
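// Illustrative usage sketch (example only): v_not_nan() yields an all-ones lane mask where
// the value is ordered (not NaN), so it can be combined with v_select() to zero out NaN lanes:
//
//     v_float32x4 x = v_load(src);                                  // src may contain NaNs
//     v_float32x4 y = v_select(v_not_nan(x), x, v_setzero_f32());   // NaN lanes -> 0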
1273 
1274 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
1275 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
1276 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
1277 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_add_wrap, _mm_add_epi16)
1278 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8)
1279 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8)
1280 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16)
1281 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16)
1282 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_mul_wrap, _mm_mullo_epi16)
1283 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_mul_wrap, _mm_mullo_epi16)
1284 
1285 inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b)
1286 {
1287     __m128i ad = _mm_srai_epi16(a.val, 8);
1288     __m128i bd = _mm_srai_epi16(b.val, 8);
1289     __m128i p0 = _mm_mullo_epi16(a.val, b.val); // even
1290     __m128i p1 = _mm_slli_epi16(_mm_mullo_epi16(ad, bd), 8); // odd
1291     const __m128i b01 = _mm_set1_epi32(0xFF00FF00);
1292     return v_uint8x16(_v128_blendv_epi8(p0, p1, b01));
1293 }
1294 inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
1295 {
1296     return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
1297 }
1298 
1299 /** Absolute difference **/
1300 
1301 inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
1302 { return v_add_wrap(a - b,  b - a); }
1303 inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
1304 { return v_add_wrap(a - b,  b - a); }
1305 inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
1306 { return v_max(a, b) - v_min(a, b); }
1307 
1308 inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
1309 {
1310     v_int8x16 d = v_sub_wrap(a, b);
1311     v_int8x16 m = a < b;
1312     return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
1313 }
1314 inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
1315 {
1316     return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)));
1317 }
1318 inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
1319 {
1320     v_int32x4 d = a - b;
1321     v_int32x4 m = a < b;
1322     return v_reinterpret_as_u32((d ^ m) - m);
1323 }
1324 
1325 /** Saturating absolute difference **/
1326 inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
1327 {
1328     v_int8x16 d = a - b;
1329     v_int8x16 m = a < b;
1330     return (d ^ m) - m;
1331  }
1332 inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
1333 { return v_max(a, b) - v_min(a, b); }
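// Illustrative sketch (example only): v_absdiff() on signed inputs returns the matching
// unsigned type so |a - b| is exact even at the extremes, while v_absdiffs() keeps the
// signed type and saturates instead:
//
//     v_int8x16 a = v_setall_s8(-128), b = v_setall_s8(127);
//     v_uint8x16 d  = v_absdiff(a, b);    // lanes = 255 (exact unsigned difference)
//     v_int8x16  ds = v_absdiffs(a, b);   // lanes = 127 (saturated to the signed range)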
1334 
1335 
1336 inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1337 {
1338     return a * b + c;
1339 }
1340 
1341 inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1342 {
1343     return v_fma(a, b, c);
1344 }
1345 
1346 inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
1347 {
1348 #if CV_FMA3
1349     return v_float32x4(_mm_fmadd_ps(a.val, b.val, c.val));
1350 #else
1351     return v_float32x4(_mm_add_ps(_mm_mul_ps(a.val, b.val), c.val));
1352 #endif
1353 }
1354 
1355 inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
1356 {
1357 #if CV_FMA3
1358     return v_float64x2(_mm_fmadd_pd(a.val, b.val, c.val));
1359 #else
1360     return v_float64x2(_mm_add_pd(_mm_mul_pd(a.val, b.val), c.val));
1361 #endif
1362 }
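// Illustrative sketch (example only): v_fma()/v_muladd() compute a*b + c per lane. They map
// to a fused multiply-add only when CV_FMA3 is available; otherwise the fallback performs a
// separate multiply and add, so results may differ by one rounding step.
//
//     v_float32x4 acc = v_setzero_f32();
//     acc = v_fma(v_load(x), v_load(y), acc);   // acc += x[i] * y[i], lane-wise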
1363 
1364 #define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
1365 inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
1366 { \
1367     _Tpreg absmask = _mm_castsi128_##suffix(absmask_vec); \
1368     return _Tpvec(_mm_and_##suffix(_mm_sub_##suffix(a.val, b.val), absmask)); \
1369 } \
1370 inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
1371 { \
1372     _Tpvec res = v_fma(a, a, b*b); \
1373     return _Tpvec(_mm_sqrt_##suffix(res.val)); \
1374 } \
1375 inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
1376 { \
1377     return v_fma(a, a, b*b); \
1378 } \
1379 inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1380 { \
1381     return v_fma(a, b, c); \
1382 }
1383 
1384 OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff))
1385 OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1))
1386 
1387 #define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
1388 inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
1389 { \
1390     return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
1391 } \
1392 inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
1393 { \
1394     return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
1395 } \
1396 inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
1397 { \
1398     return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
1399 } \
1400 inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
1401 { \
1402     return _Tpsvec(srai(a.val, imm)); \
1403 } \
1404 template<int imm> \
1405 inline _Tpuvec v_shl(const _Tpuvec& a) \
1406 { \
1407     return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
1408 } \
1409 template<int imm> \
1410 inline _Tpsvec v_shl(const _Tpsvec& a) \
1411 { \
1412     return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
1413 } \
1414 template<int imm> \
1415 inline _Tpuvec v_shr(const _Tpuvec& a) \
1416 { \
1417     return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
1418 } \
1419 template<int imm> \
1420 inline _Tpsvec v_shr(const _Tpsvec& a) \
1421 { \
1422     return _Tpsvec(srai(a.val, imm)); \
1423 }
1424 
1425 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16)
1426 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32)
1427 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64)
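// Illustrative sketch (example only): the run-time operators take the shift count as a
// scalar, while v_shl<imm>/v_shr<imm> take it as a template parameter. Right shifts are
// logical for the unsigned types and arithmetic (sign-extending) for the signed ones:
//
//     v_int16x8  v = v_setall_s16(-4);
//     v_int16x8  a = v >> 1;                         // arithmetic shift: lanes = -2
//     v_uint16x8 u = v_reinterpret_as_u16(v) >> 1;   // logical shift: lanes = 0x7FFE
//     v_int16x8  b = v_shl<3>(v);                    // compile-time count: lanes = -32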
1428 
1429 namespace hal_sse_internal
1430 {
1431     template <int imm,
1432         bool is_invalid = ((imm < 0) || (imm > 16)),
1433         bool is_first = (imm == 0),
1434         bool is_half = (imm == 8),
1435         bool is_second = (imm == 16),
1436         bool is_other = (((imm > 0) && (imm < 8)) || ((imm > 8) && (imm < 16)))>
1437     class v_sse_palignr_u8_class;
1438 
1439     template <int imm>
1440     class v_sse_palignr_u8_class<imm, true, false, false, false, false>;
1441 
1442     template <int imm>
1443     class v_sse_palignr_u8_class<imm, false, true, false, false, false>
1444     {
1445     public:
1446         inline __m128i operator()(const __m128i& a, const __m128i&) const
1447         {
1448             return a;
1449         }
1450     };
1451 
1452     template <int imm>
1453     class v_sse_palignr_u8_class<imm, false, false, true, false, false>
1454     {
1455     public:
1456         inline __m128i operator()(const __m128i& a, const __m128i& b) const
1457         {
1458             return _mm_unpacklo_epi64(_mm_unpackhi_epi64(a, a), b);
1459         }
1460     };
1461 
1462     template <int imm>
1463     class v_sse_palignr_u8_class<imm, false, false, false, true, false>
1464     {
1465     public:
1466         inline __m128i operator()(const __m128i&, const __m128i& b) const
1467         {
1468             return b;
1469         }
1470     };
1471 
1472     template <int imm>
1473     class v_sse_palignr_u8_class<imm, false, false, false, false, true>
1474     {
1475 #if CV_SSSE3
1476     public:
1477         inline __m128i operator()(const __m128i& a, const __m128i& b) const
1478         {
1479             return _mm_alignr_epi8(b, a, imm);
1480         }
1481 #else
1482     public:
1483         inline __m128i operator()(const __m128i& a, const __m128i& b) const
1484         {
1485             enum { imm2 = (sizeof(__m128i) - imm) };
1486             return _mm_or_si128(_mm_srli_si128(a, imm), _mm_slli_si128(b, imm2));
1487         }
1488 #endif
1489     };
1490 
1491     template <int imm>
1492     inline __m128i v_sse_palignr_u8(const __m128i& a, const __m128i& b)
1493     {
1494         CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_sse_palignr_u8.");
1495         return v_sse_palignr_u8_class<imm>()(a, b);
1496     }
1497 }
1498 
1499 template<int imm, typename _Tpvec>
1500 inline _Tpvec v_rotate_right(const _Tpvec &a)
1501 {
1502     using namespace hal_sse_internal;
1503     enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
1504     return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
1505         _mm_srli_si128(
1506             v_sse_reinterpret_as<__m128i>(a.val), imm2)));
1507 }
1508 
1509 template<int imm, typename _Tpvec>
1510 inline _Tpvec v_rotate_left(const _Tpvec &a)
1511 {
1512     using namespace hal_sse_internal;
1513     enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
1514     return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
1515         _mm_slli_si128(
1516             v_sse_reinterpret_as<__m128i>(a.val), imm2)));
1517 }
1518 
1519 template<int imm, typename _Tpvec>
1520 inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
1521 {
1522     using namespace hal_sse_internal;
1523     enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
1524     return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
1525         v_sse_palignr_u8<imm2>(
1526             v_sse_reinterpret_as<__m128i>(a.val),
1527             v_sse_reinterpret_as<__m128i>(b.val))));
1528 }
1529 
1530 template<int imm, typename _Tpvec>
1531 inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
1532 {
1533     using namespace hal_sse_internal;
1534     enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
1535     return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
1536         v_sse_palignr_u8<imm2>(
1537             v_sse_reinterpret_as<__m128i>(b.val),
1538             v_sse_reinterpret_as<__m128i>(a.val))));
1539 }
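// Illustrative sketch (example only): v_rotate_right<imm>(a) shifts whole lanes toward
// lane 0 and fills the vacated upper lanes with zeros; the two-argument form pulls the
// replacement lanes from the second vector instead (this is what v_extract<s>() builds on).
//
//     // a = {1,2,3,4}, b = {5,6,7,8}  (v_uint32x4, lane 0 listed first)
//     // v_rotate_right<1>(a)    -> {2,3,4,0}
//     // v_rotate_right<1>(a, b) -> {2,3,4,5}
//     // v_rotate_left<1>(a)     -> {0,1,2,3}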
1540 
1541 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \
1542 inline _Tpvec v_load(const _Tp* ptr) \
1543 { return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \
1544 inline _Tpvec v_load_aligned(const _Tp* ptr) \
1545 { return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \
1546 inline _Tpvec v_load_low(const _Tp* ptr) \
1547 { return _Tpvec(_mm_loadl_epi64((const __m128i*)ptr)); } \
1548 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1549 { \
1550     return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
1551                                      _mm_loadl_epi64((const __m128i*)ptr1))); \
1552 } \
1553 inline void v_store(_Tp* ptr, const _Tpvec& a) \
1554 { _mm_storeu_si128((__m128i*)ptr, a.val); } \
1555 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1556 { _mm_store_si128((__m128i*)ptr, a.val); } \
1557 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1558 { _mm_stream_si128((__m128i*)ptr, a.val); } \
1559 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
1560 { \
1561     if( mode == hal::STORE_UNALIGNED ) \
1562         _mm_storeu_si128((__m128i*)ptr, a.val); \
1563     else if( mode == hal::STORE_ALIGNED_NOCACHE )  \
1564         _mm_stream_si128((__m128i*)ptr, a.val); \
1565     else \
1566         _mm_store_si128((__m128i*)ptr, a.val); \
1567 } \
1568 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1569 { _mm_storel_epi64((__m128i*)ptr, a.val); } \
1570 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1571 { _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a.val, a.val)); }
1572 
1573 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint8x16, uchar)
1574 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int8x16, schar)
1575 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint16x8, ushort)
1576 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int16x8, short)
1577 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint32x4, unsigned)
1578 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int32x4, int)
1579 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint64x2, uint64)
1580 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int64x2, int64)
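// Illustrative sketch (example only, assuming valid src/dst pointers): v_load()/v_store()
// never require alignment, the *_aligned variants assume 16-byte aligned pointers, and
// v_store_aligned_nocache() issues a non-temporal (streaming) store for write-only buffers.
//
//     v_uint16x8 v = v_load(src);    // unaligned load of 8 ushorts
//     v_store(dst, v);               // unaligned store of 8 ushorts
//     v_store_low(dst2, v);          // store only the lower 4 lanes (8 bytes)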
1581 
1582 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(_Tpvec, _Tp, suffix) \
1583 inline _Tpvec v_load(const _Tp* ptr) \
1584 { return _Tpvec(_mm_loadu_##suffix(ptr)); } \
1585 inline _Tpvec v_load_aligned(const _Tp* ptr) \
1586 { return _Tpvec(_mm_load_##suffix(ptr)); } \
1587 inline _Tpvec v_load_low(const _Tp* ptr) \
1588 { return _Tpvec(_mm_castsi128_##suffix(_mm_loadl_epi64((const __m128i*)ptr))); } \
1589 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1590 { \
1591     return _Tpvec(_mm_castsi128_##suffix( \
1592         _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
1593                            _mm_loadl_epi64((const __m128i*)ptr1)))); \
1594 } \
1595 inline void v_store(_Tp* ptr, const _Tpvec& a) \
1596 { _mm_storeu_##suffix(ptr, a.val); } \
1597 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1598 { _mm_store_##suffix(ptr, a.val); } \
1599 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1600 { _mm_stream_##suffix(ptr, a.val); } \
1601 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
1602 { \
1603     if( mode == hal::STORE_UNALIGNED ) \
1604         _mm_storeu_##suffix(ptr, a.val); \
1605     else if( mode == hal::STORE_ALIGNED_NOCACHE )  \
1606         _mm_stream_##suffix(ptr, a.val); \
1607     else \
1608         _mm_store_##suffix(ptr, a.val); \
1609 } \
1610 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1611 { _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \
1612 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1613 { \
1614     __m128i a1 = _mm_cast##suffix##_si128(a.val); \
1615     _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a1, a1)); \
1616 }
1617 
1618 OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
1619 OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
1620 
1621 inline unsigned v_reduce_sum(const v_uint8x16& a)
1622 {
1623     __m128i half = _mm_sad_epu8(a.val, _mm_setzero_si128());
1624     return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
1625 }
1626 inline int v_reduce_sum(const v_int8x16& a)
1627 {
1628     __m128i half = _mm_set1_epi8((schar)-128);
1629     half = _mm_sad_epu8(_mm_xor_si128(a.val, half), _mm_setzero_si128());
1630     return _mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half))) - 2048;
1631 }
1632 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(func) \
1633 inline schar v_reduce_##func(const v_int8x16& a) \
1634 { \
1635     __m128i val = a.val; \
1636     __m128i smask = _mm_set1_epi8((schar)-128); \
1637     val = _mm_xor_si128(val, smask); \
1638     val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
1639     val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
1640     val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
1641     val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
1642     return (schar)_mm_cvtsi128_si32(val) ^ (schar)-128; \
1643 } \
1644 inline uchar v_reduce_##func(const v_uint8x16& a) \
1645 { \
1646     __m128i val = a.val; \
1647     val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
1648     val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
1649     val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
1650     val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
1651     return (uchar)_mm_cvtsi128_si32(val); \
1652 }
1653 OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(max)
1654 OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(min)
1655 
1656 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \
1657 inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
1658 { \
1659     __m128i val = a.val; \
1660     val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
1661     val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
1662     val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
1663     return (scalartype)_mm_cvtsi128_si32(val); \
1664 } \
1665 inline unsigned scalartype v_reduce_##func(const v_u##_Tpvec& a) \
1666 { \
1667     __m128i val = a.val; \
1668     __m128i smask = _mm_set1_epi16(sbit); \
1669     val = _mm_xor_si128(val, smask); \
1670     val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
1671     val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
1672     val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
1673     return (unsigned scalartype)(_mm_cvtsi128_si32(val) ^  sbit); \
1674 }
1675 OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, max, epi16, (short)-32768)
1676 OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, min, epi16, (short)-32768)
1677 
1678 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, cast_from, cast_to, extract) \
1679 inline scalartype v_reduce_sum(const _Tpvec& a) \
1680 { \
1681     regtype val = a.val; \
1682     val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 8))); \
1683     val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 4))); \
1684     return (scalartype)_mm_cvt##extract(val); \
1685 }
1686 
1687 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
1688 inline scalartype v_reduce_##func(const _Tpvec& a) \
1689 { \
1690     scalartype CV_DECL_ALIGNED(16) buf[4]; \
1691     v_store_aligned(buf, a); \
1692     scalartype s0 = scalar_func(buf[0], buf[1]); \
1693     scalartype s1 = scalar_func(buf[2], buf[3]); \
1694     return scalar_func(s0, s1); \
1695 }
1696 
1697 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
1698 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
1699 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)
1700 
1701 inline int v_reduce_sum(const v_int16x8& a)
1702 { return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1703 inline unsigned v_reduce_sum(const v_uint16x8& a)
1704 { return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1705 
1706 inline uint64 v_reduce_sum(const v_uint64x2& a)
1707 {
1708     uint64 CV_DECL_ALIGNED(32) idx[2];
1709     v_store_aligned(idx, a);
1710     return idx[0] + idx[1];
1711 }
1712 inline int64 v_reduce_sum(const v_int64x2& a)
1713 {
1714     int64 CV_DECL_ALIGNED(32) idx[2];
1715     v_store_aligned(idx, a);
1716     return idx[0] + idx[1];
1717 }
1718 inline double v_reduce_sum(const v_float64x2& a)
1719 {
1720     double CV_DECL_ALIGNED(32) idx[2];
1721     v_store_aligned(idx, a);
1722     return idx[0] + idx[1];
1723 }
1724 
1725 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
1726                                  const v_float32x4& c, const v_float32x4& d)
1727 {
1728 #if CV_SSE3
1729     __m128 ab = _mm_hadd_ps(a.val, b.val);
1730     __m128 cd = _mm_hadd_ps(c.val, d.val);
1731     return v_float32x4(_mm_hadd_ps(ab, cd));
1732 #else
1733     __m128 ac = _mm_add_ps(_mm_unpacklo_ps(a.val, c.val), _mm_unpackhi_ps(a.val, c.val));
1734     __m128 bd = _mm_add_ps(_mm_unpacklo_ps(b.val, d.val), _mm_unpackhi_ps(b.val, d.val));
1735     return v_float32x4(_mm_add_ps(_mm_unpacklo_ps(ac, bd), _mm_unpackhi_ps(ac, bd)));
1736 #endif
1737 }
1738 
1739 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
1740 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
1741 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max)
1742 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min)
1743 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max)
1744 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)
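// Illustrative sketch (example only): the reductions collapse one register into a scalar.
// v_reduce_sum() of 8- and 16-bit vectors widens internally, so a single register cannot
// overflow the returned type:
//
//     v_uint8x16 v = v_setall_u8(200);
//     unsigned s = v_reduce_sum(v);   // 16 * 200 = 3200
//     uchar    m = v_reduce_max(v);   // 200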
1745 
1746 inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
1747 {
1748     __m128i half = _mm_sad_epu8(a.val, b.val);
1749     return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
1750 }
1751 inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
1752 {
1753     __m128i half = _mm_set1_epi8(0x7f);
1754     half = _mm_sad_epu8(_mm_add_epi8(a.val, half), _mm_add_epi8(b.val, half));
1755     return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
1756 }
1757 inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
1758 {
1759     v_uint32x4 l, h;
1760     v_expand(v_absdiff(a, b), l, h);
1761     return v_reduce_sum(l + h);
1762 }
1763 inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
1764 {
1765     v_uint32x4 l, h;
1766     v_expand(v_absdiff(a, b), l, h);
1767     return v_reduce_sum(l + h);
1768 }
1769 inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
1770 {
1771     return v_reduce_sum(v_absdiff(a, b));
1772 }
1773 inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
1774 {
1775     return v_reduce_sum(v_absdiff(a, b));
1776 }
1777 inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
1778 {
1779     return v_reduce_sum(v_absdiff(a, b));
1780 }
1781 
1782 inline v_uint8x16 v_popcount(const v_uint8x16& a)
1783 {
1784     __m128i m1 = _mm_set1_epi32(0x55555555);
1785     __m128i m2 = _mm_set1_epi32(0x33333333);
1786     __m128i m4 = _mm_set1_epi32(0x0f0f0f0f);
1787     __m128i p = a.val;
1788     p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1));
1789     p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2));
1790     p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4));
1791     return v_uint8x16(p);
1792 }
1793 inline v_uint16x8 v_popcount(const v_uint16x8& a)
1794 {
1795     v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
1796     p += v_rotate_right<1>(p);
1797     return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff);
1798 }
1799 inline v_uint32x4 v_popcount(const v_uint32x4& a)
1800 {
1801     v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
1802     p += v_rotate_right<1>(p);
1803     p += v_rotate_right<2>(p);
1804     return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff);
1805 }
1806 inline v_uint64x2 v_popcount(const v_uint64x2& a)
1807 {
1808     return v_uint64x2(_mm_sad_epu8(v_popcount(v_reinterpret_as_u8(a)).val, _mm_setzero_si128()));
1809 }
1810 inline v_uint8x16 v_popcount(const v_int8x16& a)
1811 { return v_popcount(v_reinterpret_as_u8(a)); }
1812 inline v_uint16x8 v_popcount(const v_int16x8& a)
1813 { return v_popcount(v_reinterpret_as_u16(a)); }
1814 inline v_uint32x4 v_popcount(const v_int32x4& a)
1815 { return v_popcount(v_reinterpret_as_u32(a)); }
1816 inline v_uint64x2 v_popcount(const v_int64x2& a)
1817 { return v_popcount(v_reinterpret_as_u64(a)); }
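// Illustrative sketch (example only): v_popcount() counts the set bits in every lane and
// always returns the corresponding unsigned vector; the 16/32/64-bit variants accumulate
// the per-byte counts produced by the 8-bit kernel above.
//
//     v_uint32x4 v = v_setall_u32(0x0F0F0F0Fu);
//     v_uint32x4 c = v_popcount(v);   // every lane = 16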
1818 
1819 #define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, cast_op, allmask) \
1820 inline int v_signmask(const _Tpvec& a)   { return _mm_movemask_##suffix(cast_op(a.val)); } \
1821 inline bool v_check_all(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)) == allmask; } \
1822 inline bool v_check_any(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)) != 0; }
1823 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, 65535)
1824 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, 65535)
1825 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, ps, _mm_castsi128_ps, 15)
1826 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, ps, _mm_castsi128_ps, 15)
1827 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint64x2, pd, _mm_castsi128_pd, 3)
1828 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int64x2, pd, _mm_castsi128_pd, 3)
1829 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, 15)
1830 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, 3)
1831 
1832 #define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(_Tpvec) \
1833 inline int v_signmask(const _Tpvec& a) { return _mm_movemask_epi8(_mm_packs_epi16(a.val, a.val)) & 255; } \
1834 inline bool v_check_all(const _Tpvec& a) { return (_mm_movemask_epi8(a.val) & 0xaaaa) == 0xaaaa; } \
1835 inline bool v_check_any(const _Tpvec& a) { return (_mm_movemask_epi8(a.val) & 0xaaaa) != 0; }
1836 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(v_uint16x8)
1837 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(v_int16x8)
1838 
1839 inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1840 inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1841 inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1842 inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1843 inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1844 inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1845 inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1846 inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1847 inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1848 inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
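// Illustrative sketch (example only): v_signmask() packs the per-lane sign bits into an int,
// v_check_all()/v_check_any() test that mask, and v_scan_forward() returns the index of the
// first lane whose mask bit (typically a comparison result) is set.
//
//     v_int32x4 a = v_load(ptr);
//     v_int32x4 m = a < v_setzero_s32();
//     if (v_check_any(m))
//     {
//         int first_negative = v_scan_forward(m);   // lane index in [0, 3]
//     }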
1849 
1850 #if CV_SSE4_1
1851 #define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, cast_ret, cast, suffix) \
1852 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1853 { \
1854     return _Tpvec(cast_ret(_mm_blendv_##suffix(cast(b.val), cast(a.val), cast(mask.val)))); \
1855 }
1856 
1857 OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
1858 OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
1859 OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
1860 OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
1861 OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, _mm_castps_si128, _mm_castsi128_ps, ps)
1862 OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, _mm_castps_si128, _mm_castsi128_ps, ps)
1863 // OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, TBD, TBD, pd)
1864 // OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, TBD, TBD, ps)
1865 OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP, ps)
1866 OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, OPENCV_HAL_NOP, OPENCV_HAL_NOP, pd)
1867 
1868 #else // CV_SSE4_1
1869 
1870 #define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) \
1871 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1872 { \
1873     return _Tpvec(_mm_xor_##suffix(b.val, _mm_and_##suffix(_mm_xor_##suffix(b.val, a.val), mask.val))); \
1874 }
1875 
1876 OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, si128)
1877 OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, si128)
1878 OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128)
1879 OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128)
1880 OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128)
1881 OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128)
1882 // OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128)
1883 // OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128)
1884 OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps)
1885 OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd)
1886 #endif
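// Illustrative sketch (example only): v_select(mask, a, b) returns a where the mask lane is
// all ones and b where it is all zeros; the mask is expected to come from a comparison (or
// otherwise be all-0/all-1 per lane), both for the SSE4.1 blend path and for the xor/and/xor
// fallback above.
//
//     v_float32x4 x = v_load(src);
//     v_float32x4 y = v_select(x > v_setzero_f32(), x, v_setzero_f32());   // clamp negatives to 0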
1887 
1888 /* Expand */
1889 #define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin)    \
1890     inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1891     {                                                               \
1892         b0.val = intrin(a.val);                                     \
1893         b1.val = __CV_CAT(intrin, _high)(a.val);                    \
1894     }                                                               \
1895     inline _Tpwvec v_expand_low(const _Tpvec& a)                    \
1896     { return _Tpwvec(intrin(a.val)); }                              \
1897     inline _Tpwvec v_expand_high(const _Tpvec& a)                   \
1898     { return _Tpwvec(__CV_CAT(intrin, _high)(a.val)); }             \
1899     inline _Tpwvec v_load_expand(const _Tp* ptr)                    \
1900     {                                                               \
1901         __m128i a = _mm_loadl_epi64((const __m128i*)ptr);           \
1902         return _Tpwvec(intrin(a));                                  \
1903     }
1904 
1905 OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8,  uchar,    _v128_cvtepu8_epi16)
1906 OPENCV_HAL_IMPL_SSE_EXPAND(v_int8x16,  v_int16x8,   schar,    _v128_cvtepi8_epi16)
1907 OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4,  ushort,   _v128_cvtepu16_epi32)
1908 OPENCV_HAL_IMPL_SSE_EXPAND(v_int16x8,  v_int32x4,   short,    _v128_cvtepi16_epi32)
1909 OPENCV_HAL_IMPL_SSE_EXPAND(v_uint32x4, v_uint64x2,  unsigned, _v128_cvtepu32_epi64)
1910 OPENCV_HAL_IMPL_SSE_EXPAND(v_int32x4,  v_int64x2,   int,      _v128_cvtepi32_epi64)
1911 
1912 #define OPENCV_HAL_IMPL_SSE_EXPAND_Q(_Tpvec, _Tp, intrin)  \
1913     inline _Tpvec v_load_expand_q(const _Tp* ptr)          \
1914     {                                                      \
1915         __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);   \
1916         return _Tpvec(intrin(a));                          \
1917     }
1918 
1919 OPENCV_HAL_IMPL_SSE_EXPAND_Q(v_uint32x4, uchar, _v128_cvtepu8_epi32)
1920 OPENCV_HAL_IMPL_SSE_EXPAND_Q(v_int32x4,  schar, _v128_cvtepi8_epi32)
1921 
1922 #define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \
1923 inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
1924 { \
1925     b0.val = _mm_unpacklo_##suffix(a0.val, a1.val); \
1926     b1.val = _mm_unpackhi_##suffix(a0.val, a1.val); \
1927 } \
1928 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
1929 { \
1930     __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
1931     return _Tpvec(cast_to(_mm_unpacklo_epi64(a1, b1))); \
1932 } \
1933 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
1934 { \
1935     __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
1936     return _Tpvec(cast_to(_mm_unpackhi_epi64(a1, b1))); \
1937 } \
1938 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
1939 { \
1940     __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
1941     c.val = cast_to(_mm_unpacklo_epi64(a1, b1)); \
1942     d.val = cast_to(_mm_unpackhi_epi64(a1, b1)); \
1943 }
1944 
1945 OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1946 OPENCV_HAL_IMPL_SSE_UNPACKS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1947 OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1948 OPENCV_HAL_IMPL_SSE_UNPACKS(v_int16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1949 OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1950 OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1951 OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
1952 OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)
1953 
1954 inline v_uint8x16 v_reverse(const v_uint8x16 &a)
1955 {
1956 #if CV_SSSE3
1957     static const __m128i perm = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1958     return v_uint8x16(_mm_shuffle_epi8(a.val, perm));
1959 #else
1960     uchar CV_DECL_ALIGNED(32) d[16];
1961     v_store_aligned(d, a);
1962     return v_uint8x16(d[15], d[14], d[13], d[12], d[11], d[10], d[9], d[8], d[7], d[6], d[5], d[4], d[3], d[2], d[1], d[0]);
1963 #endif
1964 }
1965 
1966 inline v_int8x16 v_reverse(const v_int8x16 &a)
1967 { return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
1968 
1969 inline v_uint16x8 v_reverse(const v_uint16x8 &a)
1970 {
1971 #if CV_SSSE3
1972     static const __m128i perm = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
1973     return v_uint16x8(_mm_shuffle_epi8(a.val, perm));
1974 #else
1975     __m128i r = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 1, 2, 3));
1976     r = _mm_shufflelo_epi16(r, _MM_SHUFFLE(2, 3, 0, 1));
1977     r = _mm_shufflehi_epi16(r, _MM_SHUFFLE(2, 3, 0, 1));
1978     return v_uint16x8(r);
1979 #endif
1980 }
1981 
1982 inline v_int16x8 v_reverse(const v_int16x8 &a)
1983 { return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
1984 
1985 inline v_uint32x4 v_reverse(const v_uint32x4 &a)
1986 {
1987     return v_uint32x4(_mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 1, 2, 3)));
1988 }
1989 
1990 inline v_int32x4 v_reverse(const v_int32x4 &a)
1991 { return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
1992 
1993 inline v_float32x4 v_reverse(const v_float32x4 &a)
1994 { return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
1995 
1996 inline v_uint64x2 v_reverse(const v_uint64x2 &a)
1997 {
1998     return v_uint64x2(_mm_shuffle_epi32(a.val, _MM_SHUFFLE(1, 0, 3, 2)));
1999 }
2000 
2001 inline v_int64x2 v_reverse(const v_int64x2 &a)
2002 { return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
2003 
2004 inline v_float64x2 v_reverse(const v_float64x2 &a)
2005 { return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
2006 
2007 template<int s, typename _Tpvec>
2008 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
2009 {
2010     return v_rotate_right<s>(a, b);
2011 }
2012 
2013 inline v_int32x4 v_round(const v_float32x4& a)
2014 { return v_int32x4(_mm_cvtps_epi32(a.val)); }
2015 
2016 inline v_int32x4 v_floor(const v_float32x4& a)
2017 {
2018     __m128i a1 = _mm_cvtps_epi32(a.val);
2019     __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(_mm_cvtepi32_ps(a1), a.val));
2020     return v_int32x4(_mm_add_epi32(a1, mask));
2021 }
2022 
2023 inline v_int32x4 v_ceil(const v_float32x4& a)
2024 {
2025     __m128i a1 = _mm_cvtps_epi32(a.val);
2026     __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(a.val, _mm_cvtepi32_ps(a1)));
2027     return v_int32x4(_mm_sub_epi32(a1, mask));
2028 }
2029 
2030 inline v_int32x4 v_trunc(const v_float32x4& a)
2031 { return v_int32x4(_mm_cvttps_epi32(a.val)); }
2032 
2033 inline v_int32x4 v_round(const v_float64x2& a)
2034 { return v_int32x4(_mm_cvtpd_epi32(a.val)); }
2035 
2036 inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
2037 {
2038     __m128i ai = _mm_cvtpd_epi32(a.val), bi = _mm_cvtpd_epi32(b.val);
2039     return v_int32x4(_mm_unpacklo_epi64(ai, bi));
2040 }
2041 
2042 inline v_int32x4 v_floor(const v_float64x2& a)
2043 {
2044     __m128i a1 = _mm_cvtpd_epi32(a.val);
2045     __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(_mm_cvtepi32_pd(a1), a.val));
2046     mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
2047     return v_int32x4(_mm_add_epi32(a1, mask));
2048 }
2049 
2050 inline v_int32x4 v_ceil(const v_float64x2& a)
2051 {
2052     __m128i a1 = _mm_cvtpd_epi32(a.val);
2053     __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(a.val, _mm_cvtepi32_pd(a1)));
2054     mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
2055     return v_int32x4(_mm_sub_epi32(a1, mask));
2056 }
2057 
2058 inline v_int32x4 v_trunc(const v_float64x2& a)
2059 { return v_int32x4(_mm_cvttpd_epi32(a.val)); }
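// Illustrative sketch (example only): v_round() uses the current SSE rounding mode (normally
// round-to-nearest-even), v_floor()/v_ceil() fix up the rounded value with a comparison mask,
// and v_trunc() truncates toward zero. The float64x2 variants fill the upper two lanes of the
// v_int32x4 result with zeros.
//
//     v_float32x4 x = v_setall_f32(-1.5f);
//     // v_round(x) -> -2,  v_floor(x) -> -2,  v_ceil(x) -> -1,  v_trunc(x) -> -1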
2060 
2061 #define OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
2062 inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
2063                            const _Tpvec& a2, const _Tpvec& a3, \
2064                            _Tpvec& b0, _Tpvec& b1, \
2065                            _Tpvec& b2, _Tpvec& b3) \
2066 { \
2067     __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); \
2068     __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \
2069     __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); \
2070     __m128i t3 = cast_from(_mm_unpackhi_##suffix(a2.val, a3.val)); \
2071 \
2072     b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); \
2073     b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); \
2074     b2.val = cast_to(_mm_unpacklo_epi64(t2, t3)); \
2075     b3.val = cast_to(_mm_unpackhi_epi64(t2, t3)); \
2076 }
2077 
2078 OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
2079 OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
2080 OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
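// Illustrative sketch (example only): v_transpose4x4() treats four registers as the rows of a
// 4x4 matrix and writes out its columns using the unpack-lo/hi pattern above.
//
//     // a0 = {0,1,2,3}, a1 = {4,5,6,7}, a2 = {8,9,10,11}, a3 = {12,13,14,15}
//     v_transpose4x4(a0, a1, a2, a3, b0, b1, b2, b3);
//     // b0 = {0,4,8,12}, b1 = {1,5,9,13}, b2 = {2,6,10,14}, b3 = {3,7,11,15}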
2081 
2082 // load deinterleave
2083 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
2084 {
2085     __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
2086     __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
2087 
2088     __m128i t10 = _mm_unpacklo_epi8(t00, t01);
2089     __m128i t11 = _mm_unpackhi_epi8(t00, t01);
2090 
2091     __m128i t20 = _mm_unpacklo_epi8(t10, t11);
2092     __m128i t21 = _mm_unpackhi_epi8(t10, t11);
2093 
2094     __m128i t30 = _mm_unpacklo_epi8(t20, t21);
2095     __m128i t31 = _mm_unpackhi_epi8(t20, t21);
2096 
2097     a.val = _mm_unpacklo_epi8(t30, t31);
2098     b.val = _mm_unpackhi_epi8(t30, t31);
2099 }
2100 
2101 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
2102 {
2103 #if CV_SSE4_1
2104     const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
2105     const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2106     __m128i s0 = _mm_loadu_si128((const __m128i*)ptr);
2107     __m128i s1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
2108     __m128i s2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
2109     __m128i a0 = _mm_blendv_epi8(_mm_blendv_epi8(s0, s1, m0), s2, m1);
2110     __m128i b0 = _mm_blendv_epi8(_mm_blendv_epi8(s1, s2, m0), s0, m1);
2111     __m128i c0 = _mm_blendv_epi8(_mm_blendv_epi8(s2, s0, m0), s1, m1);
2112     const __m128i sh_b = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);
2113     const __m128i sh_g = _mm_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14);
2114     const __m128i sh_r = _mm_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
2115     a0 = _mm_shuffle_epi8(a0, sh_b);
2116     b0 = _mm_shuffle_epi8(b0, sh_g);
2117     c0 = _mm_shuffle_epi8(c0, sh_r);
2118     a.val = a0;
2119     b.val = b0;
2120     c.val = c0;
2121 #elif CV_SSSE3
2122     const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14);
2123     const __m128i m1 = _mm_alignr_epi8(m0, m0, 11);
2124     const __m128i m2 = _mm_alignr_epi8(m0, m0, 6);
2125 
2126     __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
2127     __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
2128     __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
2129 
2130     __m128i s0 = _mm_shuffle_epi8(t0, m0);
2131     __m128i s1 = _mm_shuffle_epi8(t1, m1);
2132     __m128i s2 = _mm_shuffle_epi8(t2, m2);
2133 
2134     t0 = _mm_alignr_epi8(s1, _mm_slli_si128(s0, 10), 5);
2135     a.val = _mm_alignr_epi8(s2, t0, 5);
2136 
2137     t1 = _mm_alignr_epi8(_mm_srli_si128(s1, 5), _mm_slli_si128(s0, 5), 6);
2138     b.val = _mm_alignr_epi8(_mm_srli_si128(s2, 5), t1, 5);
2139 
2140     t2 = _mm_alignr_epi8(_mm_srli_si128(s2, 10), s1, 11);
2141     c.val = _mm_alignr_epi8(t2, s0, 11);
2142 #else
2143     __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
2144     __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
2145     __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32));
2146 
2147     __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01));
2148     __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02);
2149     __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02));
2150 
2151     __m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11));
2152     __m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12);
2153     __m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12));
2154 
2155     __m128i t30 = _mm_unpacklo_epi8(t20, _mm_unpackhi_epi64(t21, t21));
2156     __m128i t31 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t20, t20), t22);
2157     __m128i t32 = _mm_unpacklo_epi8(t21, _mm_unpackhi_epi64(t22, t22));
2158 
2159     a.val = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31));
2160     b.val = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32);
2161     c.val = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32));
2162 #endif
2163 }
2164 
2165 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
2166 {
2167     __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ...
2168     __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
2169     __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // a8 b8 c8 d8 ...
2170     __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48)); // a12 b12 c12 d12 ...
2171 
2172     __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ...
2173     __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ...
2174     __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ...
2175     __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ...
2176 
2177     u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ...
2178     u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ...
2179     u2 = _mm_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ...
2180     u3 = _mm_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ...
2181 
2182     v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ...
2183     v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ...
2184     v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ...
2185     v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ...
2186 
2187     a.val = _mm_unpacklo_epi8(v0, v1);
2188     b.val = _mm_unpackhi_epi8(v0, v1);
2189     c.val = _mm_unpacklo_epi8(v2, v3);
2190     d.val = _mm_unpackhi_epi8(v2, v3);
2191 }
2192 
2193 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
2194 {
2195     __m128i v0 = _mm_loadu_si128((__m128i*)(ptr));     // a0 b0 a1 b1 a2 b2 a3 b3
2196     __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7
2197 
2198     __m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5
2199     __m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7
2200     __m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6
2201     __m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7
2202 
2203     a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7
2204     b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 b2 b3 b4 b5 b6 b7
2205 }
2206 
2207 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
2208 {
2209 #if CV_SSE4_1
2210     __m128i v0 = _mm_loadu_si128((__m128i*)(ptr));
2211     __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8));
2212     __m128i v2 = _mm_loadu_si128((__m128i*)(ptr + 16));
2213     __m128i a0 = _mm_blend_epi16(_mm_blend_epi16(v0, v1, 0x92), v2, 0x24);
2214     __m128i b0 = _mm_blend_epi16(_mm_blend_epi16(v2, v0, 0x92), v1, 0x24);
2215     __m128i c0 = _mm_blend_epi16(_mm_blend_epi16(v1, v2, 0x92), v0, 0x24);
2216 
2217     const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2218     const __m128i sh_b = _mm_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
2219     const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2220     a0 = _mm_shuffle_epi8(a0, sh_a);
2221     b0 = _mm_shuffle_epi8(b0, sh_b);
2222     c0 = _mm_shuffle_epi8(c0, sh_c);
2223 
2224     a.val = a0;
2225     b.val = b0;
2226     c.val = c0;
2227 #else
2228     __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
2229     __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8));
2230     __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16));
2231 
2232     __m128i t10 = _mm_unpacklo_epi16(t00, _mm_unpackhi_epi64(t01, t01));
2233     __m128i t11 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t00, t00), t02);
2234     __m128i t12 = _mm_unpacklo_epi16(t01, _mm_unpackhi_epi64(t02, t02));
2235 
2236     __m128i t20 = _mm_unpacklo_epi16(t10, _mm_unpackhi_epi64(t11, t11));
2237     __m128i t21 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t10, t10), t12);
2238     __m128i t22 = _mm_unpacklo_epi16(t11, _mm_unpackhi_epi64(t12, t12));
2239 
2240     a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21));
2241     b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22);
2242     c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22));
2243 #endif
2244 }
2245 
2246 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
2247 {
2248     __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1
2249     __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 8)); // a2 b2 c2 d2 ...
2250     __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
2251     __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 24)); // a6 b6 c6 d6 ...
2252 
2253     __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ...
2254     __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ...
2255     __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ...
2256     __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ...
2257 
2258     u0 = _mm_unpacklo_epi16(v0, v2); // a0 a2 a4 a6 ...
2259     u1 = _mm_unpacklo_epi16(v1, v3); // a1 a3 a5 a7 ...
2260     u2 = _mm_unpackhi_epi16(v0, v2); // c0 c2 c4 c6 ...
2261     u3 = _mm_unpackhi_epi16(v1, v3); // c1 c3 c5 c7 ...
2262 
2263     a.val = _mm_unpacklo_epi16(u0, u1);
2264     b.val = _mm_unpackhi_epi16(u0, u1);
2265     c.val = _mm_unpacklo_epi16(u2, u3);
2266     d.val = _mm_unpackhi_epi16(u2, u3);
2267 }
2268 
2269 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
2270 {
2271     __m128i v0 = _mm_loadu_si128((__m128i*)(ptr));     // a0 b0 a1 b1
2272     __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 4)); // a2 b2 a3 b3
2273 
2274     __m128i v2 = _mm_unpacklo_epi32(v0, v1); // a0 a2 b0 b2
2275     __m128i v3 = _mm_unpackhi_epi32(v0, v1); // a1 a3 b1 b3
2276 
2277     a.val = _mm_unpacklo_epi32(v2, v3); // a0 a1 a2 a3
2278     b.val = _mm_unpackhi_epi32(v2, v3); // b0 b1 b2 b3
2279 }
2280 
2281 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
2282 {
2283     __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
2284     __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 4));
2285     __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 8));
2286 
2287     __m128i t10 = _mm_unpacklo_epi32(t00, _mm_unpackhi_epi64(t01, t01));
2288     __m128i t11 = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t00, t00), t02);
2289     __m128i t12 = _mm_unpacklo_epi32(t01, _mm_unpackhi_epi64(t02, t02));
2290 
2291     a.val = _mm_unpacklo_epi32(t10, _mm_unpackhi_epi64(t11, t11));
2292     b.val = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t10, t10), t12);
2293     c.val = _mm_unpacklo_epi32(t11, _mm_unpackhi_epi64(t12, t12));
2294 }
2295 
2296 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
2297 {
2298     v_uint32x4 s0(_mm_loadu_si128((const __m128i*)ptr));        // a0 b0 c0 d0
2299     v_uint32x4 s1(_mm_loadu_si128((const __m128i*)(ptr + 4)));  // a1 b1 c1 d1
2300     v_uint32x4 s2(_mm_loadu_si128((const __m128i*)(ptr + 8)));  // a2 b2 c2 d2
2301     v_uint32x4 s3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3
2302 
2303     v_transpose4x4(s0, s1, s2, s3, a, b, c, d);
2304 }
2305 
2306 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
2307 {
2308     __m128 u0 = _mm_loadu_ps(ptr);       // a0 b0 a1 b1
2309     __m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3
2310 
2311     a.val = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(2, 0, 2, 0)); // a0 a1 a2 a3
2312     b.val = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(3, 1, 3, 1)); // b0 b1 b2 b3
2313 }
2314 
2315 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c)
2316 {
2317     __m128 t0 = _mm_loadu_ps(ptr + 0);
2318     __m128 t1 = _mm_loadu_ps(ptr + 4);
2319     __m128 t2 = _mm_loadu_ps(ptr + 8);
2320 
2321     __m128 at12 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(0, 1, 0, 2));
2322     a.val = _mm_shuffle_ps(t0, at12, _MM_SHUFFLE(2, 0, 3, 0));
2323 
2324     __m128 bt01 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(0, 0, 0, 1));
2325     __m128 bt12 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(0, 2, 0, 3));
2326     b.val = _mm_shuffle_ps(bt01, bt12, _MM_SHUFFLE(2, 0, 2, 0));
2327 
2328     __m128 ct01 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(0, 1, 0, 2));
2329     c.val = _mm_shuffle_ps(ct01, t2, _MM_SHUFFLE(3, 0, 2, 0));
2330 }
2331 
2332 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c, v_float32x4& d)
2333 {
2334     __m128 t0 = _mm_loadu_ps(ptr +  0);
2335     __m128 t1 = _mm_loadu_ps(ptr +  4);
2336     __m128 t2 = _mm_loadu_ps(ptr +  8);
2337     __m128 t3 = _mm_loadu_ps(ptr + 12);
2338     __m128 t02lo = _mm_unpacklo_ps(t0, t2);
2339     __m128 t13lo = _mm_unpacklo_ps(t1, t3);
2340     __m128 t02hi = _mm_unpackhi_ps(t0, t2);
2341     __m128 t13hi = _mm_unpackhi_ps(t1, t3);
2342     a.val = _mm_unpacklo_ps(t02lo, t13lo);
2343     b.val = _mm_unpackhi_ps(t02lo, t13lo);
2344     c.val = _mm_unpacklo_ps(t02hi, t13hi);
2345     d.val = _mm_unpackhi_ps(t02hi, t13hi);
2346 }
2347 
2348 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b)
2349 {
2350     __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
2351     __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2));
2352 
2353     a = v_uint64x2(_mm_unpacklo_epi64(t0, t1));
2354     b = v_uint64x2(_mm_unpackhi_epi64(t0, t1));
2355 }
2356 
2357 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
2358 {
2359     __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0, b0
2360     __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0, a1
2361     __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // b1, c1
2362 
2363     t1 = _mm_shuffle_epi32(t1, 0x4e); // a1, c0
2364 
2365     a = v_uint64x2(_mm_unpacklo_epi64(t0, t1));
2366     b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2));
2367     c = v_uint64x2(_mm_unpackhi_epi64(t1, t2));
2368 }
2369 
2370 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a,
2371                                 v_uint64x2& b, v_uint64x2& c, v_uint64x2& d)
2372 {
2373     __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0
2374     __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0 d0
2375     __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // a1 b1
2376     __m128i t3 = _mm_loadu_si128((const __m128i*)(ptr + 6)); // c1 d1
2377 
2378     a = v_uint64x2(_mm_unpacklo_epi64(t0, t2));
2379     b = v_uint64x2(_mm_unpackhi_epi64(t0, t2));
2380     c = v_uint64x2(_mm_unpacklo_epi64(t1, t3));
2381     d = v_uint64x2(_mm_unpackhi_epi64(t1, t3));
2382 }
2383 
2384 // store interleave
2385 
2386 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2387                                 hal::StoreMode mode = hal::STORE_UNALIGNED)
2388 {
2389     __m128i v0 = _mm_unpacklo_epi8(a.val, b.val);
2390     __m128i v1 = _mm_unpackhi_epi8(a.val, b.val);
2391 
2392     if( mode == hal::STORE_ALIGNED_NOCACHE )
2393     {
2394         _mm_stream_si128((__m128i*)(ptr), v0);
2395         _mm_stream_si128((__m128i*)(ptr + 16), v1);
2396     }
2397     else if( mode == hal::STORE_ALIGNED )
2398     {
2399         _mm_store_si128((__m128i*)(ptr), v0);
2400         _mm_store_si128((__m128i*)(ptr + 16), v1);
2401     }
2402     else
2403     {
2404         _mm_storeu_si128((__m128i*)(ptr), v0);
2405         _mm_storeu_si128((__m128i*)(ptr + 16), v1);
2406     }
2407 }
2408 
2409 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2410                                 const v_uint8x16& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
2411 {
2412 #if CV_SSE4_1
2413     const __m128i sh_a = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
2414     const __m128i sh_b = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
2415     const __m128i sh_c = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
2416     __m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
2417     __m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
2418     __m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
2419 
2420     const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
2421     const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2422     __m128i v0 = _mm_blendv_epi8(_mm_blendv_epi8(a0, b0, m1), c0, m0);
2423     __m128i v1 = _mm_blendv_epi8(_mm_blendv_epi8(b0, c0, m1), a0, m0);
2424     __m128i v2 = _mm_blendv_epi8(_mm_blendv_epi8(c0, a0, m1), b0, m0);
2425 #elif CV_SSSE3
2426     const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5);
2427     const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10);
2428     const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15);
2429 
2430     __m128i t0 = _mm_alignr_epi8(b.val, _mm_slli_si128(a.val, 10), 5);
2431     t0 = _mm_alignr_epi8(c.val, t0, 5);
2432     __m128i v0 = _mm_shuffle_epi8(t0, m0);
2433 
2434     __m128i t1 = _mm_alignr_epi8(_mm_srli_si128(b.val, 5), _mm_slli_si128(a.val, 5), 6);
2435     t1 = _mm_alignr_epi8(_mm_srli_si128(c.val, 5), t1, 5);
2436     __m128i v1 = _mm_shuffle_epi8(t1, m1);
2437 
2438     __m128i t2 = _mm_alignr_epi8(_mm_srli_si128(c.val, 10), b.val, 11);
2439     t2 = _mm_alignr_epi8(t2, a.val, 11);
2440     __m128i v2 = _mm_shuffle_epi8(t2, m2);
2441 #else
2442     __m128i z = _mm_setzero_si128();
2443     __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val);
2444     __m128i ab1 = _mm_unpackhi_epi8(a.val, b.val);
2445     __m128i c0 = _mm_unpacklo_epi8(c.val, z);
2446     __m128i c1 = _mm_unpackhi_epi8(c.val, z);
2447 
2448     __m128i p00 = _mm_unpacklo_epi16(ab0, c0);
2449     __m128i p01 = _mm_unpackhi_epi16(ab0, c0);
2450     __m128i p02 = _mm_unpacklo_epi16(ab1, c1);
2451     __m128i p03 = _mm_unpackhi_epi16(ab1, c1);
2452 
2453     __m128i p10 = _mm_unpacklo_epi32(p00, p01);
2454     __m128i p11 = _mm_unpackhi_epi32(p00, p01);
2455     __m128i p12 = _mm_unpacklo_epi32(p02, p03);
2456     __m128i p13 = _mm_unpackhi_epi32(p02, p03);
2457 
2458     __m128i p20 = _mm_unpacklo_epi64(p10, p11);
2459     __m128i p21 = _mm_unpackhi_epi64(p10, p11);
2460     __m128i p22 = _mm_unpacklo_epi64(p12, p13);
2461     __m128i p23 = _mm_unpackhi_epi64(p12, p13);
2462 
2463     p20 = _mm_slli_si128(p20, 1);
2464     p22 = _mm_slli_si128(p22, 1);
2465 
2466     __m128i p30 = _mm_slli_epi64(_mm_unpacklo_epi32(p20, p21), 8);
2467     __m128i p31 = _mm_srli_epi64(_mm_unpackhi_epi32(p20, p21), 8);
2468     __m128i p32 = _mm_slli_epi64(_mm_unpacklo_epi32(p22, p23), 8);
2469     __m128i p33 = _mm_srli_epi64(_mm_unpackhi_epi32(p22, p23), 8);
2470 
2471     __m128i p40 = _mm_unpacklo_epi64(p30, p31);
2472     __m128i p41 = _mm_unpackhi_epi64(p30, p31);
2473     __m128i p42 = _mm_unpacklo_epi64(p32, p33);
2474     __m128i p43 = _mm_unpackhi_epi64(p32, p33);
2475 
2476     __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10));
2477     __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6));
2478     __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2));
2479 #endif
2480 
2481     if( mode == hal::STORE_ALIGNED_NOCACHE )
2482     {
2483         _mm_stream_si128((__m128i*)(ptr), v0);
2484         _mm_stream_si128((__m128i*)(ptr + 16), v1);
2485         _mm_stream_si128((__m128i*)(ptr + 32), v2);
2486     }
2487     else if( mode == hal::STORE_ALIGNED )
2488     {
2489         _mm_store_si128((__m128i*)(ptr), v0);
2490         _mm_store_si128((__m128i*)(ptr + 16), v1);
2491         _mm_store_si128((__m128i*)(ptr + 32), v2);
2492     }
2493     else
2494     {
2495         _mm_storeu_si128((__m128i*)(ptr), v0);
2496         _mm_storeu_si128((__m128i*)(ptr + 16), v1);
2497         _mm_storeu_si128((__m128i*)(ptr + 32), v2);
2498     }
2499 }
2500 
2501 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2502                                 const v_uint8x16& c, const v_uint8x16& d,
2503                                 hal::StoreMode mode = hal::STORE_UNALIGNED)
2504 {
2505     // a0 a1 a2 a3 ....
2506     // b0 b1 b2 b3 ....
2507     // c0 c1 c2 c3 ....
2508     // d0 d1 d2 d3 ....
2509     __m128i u0 = _mm_unpacklo_epi8(a.val, c.val); // a0 c0 a1 c1 ...
2510     __m128i u1 = _mm_unpackhi_epi8(a.val, c.val); // a8 c8 a9 c9 ...
2511     __m128i u2 = _mm_unpacklo_epi8(b.val, d.val); // b0 d0 b1 d1 ...
2512     __m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ...
2513 
2514     __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ...
2515     __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ...
2516     __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ...
2517     __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ...
2518 
2519     if( mode == hal::STORE_ALIGNED_NOCACHE )
2520     {
2521         _mm_stream_si128((__m128i*)(ptr), v0);
2522         _mm_stream_si128((__m128i*)(ptr + 16), v1);
2523         _mm_stream_si128((__m128i*)(ptr + 32), v2);
2524         _mm_stream_si128((__m128i*)(ptr + 48), v3);
2525     }
2526     else if( mode == hal::STORE_ALIGNED )
2527     {
2528         _mm_store_si128((__m128i*)(ptr), v0);
2529         _mm_store_si128((__m128i*)(ptr + 16), v1);
2530         _mm_store_si128((__m128i*)(ptr + 32), v2);
2531         _mm_store_si128((__m128i*)(ptr + 48), v3);
2532     }
2533     else
2534     {
2535         _mm_storeu_si128((__m128i*)(ptr), v0);
2536         _mm_storeu_si128((__m128i*)(ptr + 16), v1);
2537         _mm_storeu_si128((__m128i*)(ptr + 32), v2);
2538         _mm_storeu_si128((__m128i*)(ptr + 48), v3);
2539     }
2540 }
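// Usage sketch (illustrative only): the mirror operation of the deinterleaving loads,
// packing four 16-lane uchar planes back into interleaved BGRA memory with the
// 4-channel overload above. Pointer names are assumptions; a 16-byte aligned
// destination could pass hal::STORE_ALIGNED instead.
//
//   void merge_bgra_block(const v_uint8x16& b, const v_uint8x16& g,
//                         const v_uint8x16& r, const v_uint8x16& a, uchar* dst)
//   {
//       v_store_interleave(dst, b, g, r, a); // writes 64 bytes: b0 g0 r0 a0 b1 g1 r1 a1 ...
//   }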
2541 
2542 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
2543                                 hal::StoreMode mode = hal::STORE_UNALIGNED)
2544 {
2545     __m128i v0 = _mm_unpacklo_epi16(a.val, b.val);
2546     __m128i v1 = _mm_unpackhi_epi16(a.val, b.val);
2547 
2548     if( mode == hal::STORE_ALIGNED_NOCACHE )
2549     {
2550         _mm_stream_si128((__m128i*)(ptr), v0);
2551         _mm_stream_si128((__m128i*)(ptr + 8), v1);
2552     }
2553     else if( mode == hal::STORE_ALIGNED )
2554     {
2555         _mm_store_si128((__m128i*)(ptr), v0);
2556         _mm_store_si128((__m128i*)(ptr + 8), v1);
2557     }
2558     else
2559     {
2560         _mm_storeu_si128((__m128i*)(ptr), v0);
2561         _mm_storeu_si128((__m128i*)(ptr + 8), v1);
2562     }
2563 }
2564 
2565 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
2566                                 const v_uint16x8& b, const v_uint16x8& c,
2567                                 hal::StoreMode mode = hal::STORE_UNALIGNED)
2568 {
2569 #if CV_SSE4_1
2570     const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2571     const __m128i sh_b = _mm_setr_epi8(10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
2572     const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2573     __m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
2574     __m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
2575     __m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
2576 
2577     __m128i v0 = _mm_blend_epi16(_mm_blend_epi16(a0, b0, 0x92), c0, 0x24);
2578     __m128i v1 = _mm_blend_epi16(_mm_blend_epi16(c0, a0, 0x92), b0, 0x24);
2579     __m128i v2 = _mm_blend_epi16(_mm_blend_epi16(b0, c0, 0x92), a0, 0x24);
2580 #else
2581     __m128i z = _mm_setzero_si128();
2582     __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val);
2583     __m128i ab1 = _mm_unpackhi_epi16(a.val, b.val);
2584     __m128i c0 = _mm_unpacklo_epi16(c.val, z);
2585     __m128i c1 = _mm_unpackhi_epi16(c.val, z);
2586 
2587     __m128i p10 = _mm_unpacklo_epi32(ab0, c0);
2588     __m128i p11 = _mm_unpackhi_epi32(ab0, c0);
2589     __m128i p12 = _mm_unpacklo_epi32(ab1, c1);
2590     __m128i p13 = _mm_unpackhi_epi32(ab1, c1);
2591 
2592     __m128i p20 = _mm_unpacklo_epi64(p10, p11);
2593     __m128i p21 = _mm_unpackhi_epi64(p10, p11);
2594     __m128i p22 = _mm_unpacklo_epi64(p12, p13);
2595     __m128i p23 = _mm_unpackhi_epi64(p12, p13);
2596 
2597     p20 = _mm_slli_si128(p20, 2);
2598     p22 = _mm_slli_si128(p22, 2);
2599 
2600     __m128i p30 = _mm_unpacklo_epi64(p20, p21);
2601     __m128i p31 = _mm_unpackhi_epi64(p20, p21);
2602     __m128i p32 = _mm_unpacklo_epi64(p22, p23);
2603     __m128i p33 = _mm_unpackhi_epi64(p22, p23);
2604 
2605     __m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10));
2606     __m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6));
2607     __m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2));
2608 #endif
2609     if( mode == hal::STORE_ALIGNED_NOCACHE )
2610     {
2611         _mm_stream_si128((__m128i*)(ptr), v0);
2612         _mm_stream_si128((__m128i*)(ptr + 8), v1);
2613         _mm_stream_si128((__m128i*)(ptr + 16), v2);
2614     }
2615     else if( mode == hal::STORE_ALIGNED )
2616     {
2617         _mm_store_si128((__m128i*)(ptr), v0);
2618         _mm_store_si128((__m128i*)(ptr + 8), v1);
2619         _mm_store_si128((__m128i*)(ptr + 16), v2);
2620     }
2621     else
2622     {
2623         _mm_storeu_si128((__m128i*)(ptr), v0);
2624         _mm_storeu_si128((__m128i*)(ptr + 8), v1);
2625         _mm_storeu_si128((__m128i*)(ptr + 16), v2);
2626     }
2627 }
2628 
2629 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
2630                                 const v_uint16x8& c, const v_uint16x8& d,
2631                                 hal::StoreMode mode = hal::STORE_UNALIGNED)
2632 {
2633     // a0 a1 a2 a3 ....
2634     // b0 b1 b2 b3 ....
2635     // c0 c1 c2 c3 ....
2636     // d0 d1 d2 d3 ....
2637     __m128i u0 = _mm_unpacklo_epi16(a.val, c.val); // a0 c0 a1 c1 ...
2638     __m128i u1 = _mm_unpackhi_epi16(a.val, c.val); // a4 c4 a5 c5 ...
2639     __m128i u2 = _mm_unpacklo_epi16(b.val, d.val); // b0 d0 b1 d1 ...
2640     __m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ...
2641 
2642     __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ...
2643     __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ...
2644     __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ...
2645     __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ...
2646 
2647     if( mode == hal::STORE_ALIGNED_NOCACHE )
2648     {
2649         _mm_stream_si128((__m128i*)(ptr), v0);
2650         _mm_stream_si128((__m128i*)(ptr + 8), v1);
2651         _mm_stream_si128((__m128i*)(ptr + 16), v2);
2652         _mm_stream_si128((__m128i*)(ptr + 24), v3);
2653     }
2654     else if( mode == hal::STORE_ALIGNED )
2655     {
2656         _mm_store_si128((__m128i*)(ptr), v0);
2657         _mm_store_si128((__m128i*)(ptr + 8), v1);
2658         _mm_store_si128((__m128i*)(ptr + 16), v2);
2659         _mm_store_si128((__m128i*)(ptr + 24), v3);
2660     }
2661     else
2662     {
2663         _mm_storeu_si128((__m128i*)(ptr), v0);
2664         _mm_storeu_si128((__m128i*)(ptr + 8), v1);
2665         _mm_storeu_si128((__m128i*)(ptr + 16), v2);
2666         _mm_storeu_si128((__m128i*)(ptr + 24), v3);
2667     }
2668 }
2669 
2670 inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2671                                 hal::StoreMode mode = hal::STORE_UNALIGNED)
2672 {
2673     __m128i v0 = _mm_unpacklo_epi32(a.val, b.val);
2674     __m128i v1 = _mm_unpackhi_epi32(a.val, b.val);
2675 
2676     if( mode == hal::STORE_ALIGNED_NOCACHE )
2677     {
2678         _mm_stream_si128((__m128i*)(ptr), v0);
2679         _mm_stream_si128((__m128i*)(ptr + 4), v1);
2680     }
2681     else if( mode == hal::STORE_ALIGNED )
2682     {
2683         _mm_store_si128((__m128i*)(ptr), v0);
2684         _mm_store_si128((__m128i*)(ptr + 4), v1);
2685     }
2686     else
2687     {
2688         _mm_storeu_si128((__m128i*)(ptr), v0);
2689         _mm_storeu_si128((__m128i*)(ptr + 4), v1);
2690     }
2691 }
2692 
2693 inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2694                                 const v_uint32x4& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
2695 {
2696     v_uint32x4 z = v_setzero_u32(), u0, u1, u2, u3;
2697     v_transpose4x4(a, b, c, z, u0, u1, u2, u3);
2698 
2699     __m128i v0 = _mm_or_si128(u0.val, _mm_slli_si128(u1.val, 12));
2700     __m128i v1 = _mm_or_si128(_mm_srli_si128(u1.val, 4), _mm_slli_si128(u2.val, 8));
2701     __m128i v2 = _mm_or_si128(_mm_srli_si128(u2.val, 8), _mm_slli_si128(u3.val, 4));
2702 
2703     if( mode == hal::STORE_ALIGNED_NOCACHE )
2704     {
2705         _mm_stream_si128((__m128i*)(ptr), v0);
2706         _mm_stream_si128((__m128i*)(ptr + 4), v1);
2707         _mm_stream_si128((__m128i*)(ptr + 8), v2);
2708     }
2709     else if( mode == hal::STORE_ALIGNED )
2710     {
2711         _mm_store_si128((__m128i*)(ptr), v0);
2712         _mm_store_si128((__m128i*)(ptr + 4), v1);
2713         _mm_store_si128((__m128i*)(ptr + 8), v2);
2714     }
2715     else
2716     {
2717         _mm_storeu_si128((__m128i*)(ptr), v0);
2718         _mm_storeu_si128((__m128i*)(ptr + 4), v1);
2719         _mm_storeu_si128((__m128i*)(ptr + 8), v2);
2720     }
2721 }
2722 
2723 inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2724                                const v_uint32x4& c, const v_uint32x4& d,
2725                                hal::StoreMode mode = hal::STORE_UNALIGNED)
2726 {
2727     v_uint32x4 v0, v1, v2, v3;
2728     v_transpose4x4(a, b, c, d, v0, v1, v2, v3);
2729 
2730     if( mode == hal::STORE_ALIGNED_NOCACHE )
2731     {
2732         _mm_stream_si128((__m128i*)(ptr), v0.val);
2733         _mm_stream_si128((__m128i*)(ptr + 4), v1.val);
2734         _mm_stream_si128((__m128i*)(ptr + 8), v2.val);
2735         _mm_stream_si128((__m128i*)(ptr + 12), v3.val);
2736     }
2737     else if( mode == hal::STORE_ALIGNED )
2738     {
2739         _mm_store_si128((__m128i*)(ptr), v0.val);
2740         _mm_store_si128((__m128i*)(ptr + 4), v1.val);
2741         _mm_store_si128((__m128i*)(ptr + 8), v2.val);
2742         _mm_store_si128((__m128i*)(ptr + 12), v3.val);
2743     }
2744     else
2745     {
2746         _mm_storeu_si128((__m128i*)(ptr), v0.val);
2747         _mm_storeu_si128((__m128i*)(ptr + 4), v1.val);
2748         _mm_storeu_si128((__m128i*)(ptr + 8), v2.val);
2749         _mm_storeu_si128((__m128i*)(ptr + 12), v3.val);
2750     }
2751 }
2752 
2753 // 2-channel, float only
2754 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2755                                hal::StoreMode mode = hal::STORE_UNALIGNED)
2756 {
2757     __m128 v0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1
2758     __m128 v1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3
2759 
2760     if( mode == hal::STORE_ALIGNED_NOCACHE )
2761     {
2762         _mm_stream_ps(ptr, v0);
2763         _mm_stream_ps(ptr + 4, v1);
2764     }
2765     else if( mode == hal::STORE_ALIGNED )
2766     {
2767         _mm_store_ps(ptr, v0);
2768         _mm_store_ps(ptr + 4, v1);
2769     }
2770     else
2771     {
2772         _mm_storeu_ps(ptr, v0);
2773         _mm_storeu_ps(ptr + 4, v1);
2774     }
2775 }
2776 
2777 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2778                                const v_float32x4& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
2779 {
2780     __m128 u0 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(0, 0, 0, 0));
2781     __m128 u1 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(1, 1, 0, 0));
2782     __m128 v0 = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(2, 0, 2, 0));
2783     __m128 u2 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(1, 1, 1, 1));
2784     __m128 u3 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(2, 2, 2, 2));
2785     __m128 v1 = _mm_shuffle_ps(u2, u3, _MM_SHUFFLE(2, 0, 2, 0));
2786     __m128 u4 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(3, 3, 2, 2));
2787     __m128 u5 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(3, 3, 3, 3));
2788     __m128 v2 = _mm_shuffle_ps(u4, u5, _MM_SHUFFLE(2, 0, 2, 0));
2789 
2790     if( mode == hal::STORE_ALIGNED_NOCACHE )
2791     {
2792         _mm_stream_ps(ptr, v0);
2793         _mm_stream_ps(ptr + 4, v1);
2794         _mm_stream_ps(ptr + 8, v2);
2795     }
2796     else if( mode == hal::STORE_ALIGNED )
2797     {
2798         _mm_store_ps(ptr, v0);
2799         _mm_store_ps(ptr + 4, v1);
2800         _mm_store_ps(ptr + 8, v2);
2801     }
2802     else
2803     {
2804         _mm_storeu_ps(ptr, v0);
2805         _mm_storeu_ps(ptr + 4, v1);
2806         _mm_storeu_ps(ptr + 8, v2);
2807     }
2808 }
2809 
2810 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2811                                const v_float32x4& c, const v_float32x4& d,
2812                                hal::StoreMode mode = hal::STORE_UNALIGNED)
2813 {
2814     __m128 u0 = _mm_unpacklo_ps(a.val, c.val);
2815     __m128 u1 = _mm_unpacklo_ps(b.val, d.val);
2816     __m128 u2 = _mm_unpackhi_ps(a.val, c.val);
2817     __m128 u3 = _mm_unpackhi_ps(b.val, d.val);
2818     __m128 v0 = _mm_unpacklo_ps(u0, u1);
2819     __m128 v2 = _mm_unpacklo_ps(u2, u3);
2820     __m128 v1 = _mm_unpackhi_ps(u0, u1);
2821     __m128 v3 = _mm_unpackhi_ps(u2, u3);
2822 
2823     if( mode == hal::STORE_ALIGNED_NOCACHE )
2824     {
2825         _mm_stream_ps(ptr, v0);
2826         _mm_stream_ps(ptr + 4, v1);
2827         _mm_stream_ps(ptr + 8, v2);
2828         _mm_stream_ps(ptr + 12, v3);
2829     }
2830     else if( mode == hal::STORE_ALIGNED )
2831     {
2832         _mm_store_ps(ptr, v0);
2833         _mm_store_ps(ptr + 4, v1);
2834         _mm_store_ps(ptr + 8, v2);
2835         _mm_store_ps(ptr + 12, v3);
2836     }
2837     else
2838     {
2839         _mm_storeu_ps(ptr, v0);
2840         _mm_storeu_ps(ptr + 4, v1);
2841         _mm_storeu_ps(ptr + 8, v2);
2842         _mm_storeu_ps(ptr + 12, v3);
2843     }
2844 }
2845 
2846 inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2847                                hal::StoreMode mode = hal::STORE_UNALIGNED)
2848 {
2849     __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
2850     __m128i v1 = _mm_unpackhi_epi64(a.val, b.val);
2851 
2852     if( mode == hal::STORE_ALIGNED_NOCACHE )
2853     {
2854         _mm_stream_si128((__m128i*)(ptr), v0);
2855         _mm_stream_si128((__m128i*)(ptr + 2), v1);
2856     }
2857     else if( mode == hal::STORE_ALIGNED )
2858     {
2859         _mm_store_si128((__m128i*)(ptr), v0);
2860         _mm_store_si128((__m128i*)(ptr + 2), v1);
2861     }
2862     else
2863     {
2864         _mm_storeu_si128((__m128i*)(ptr), v0);
2865         _mm_storeu_si128((__m128i*)(ptr + 2), v1);
2866     }
2867 }
2868 
2869 inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2870                                const v_uint64x2& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
2871 {
2872     __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
2873     __m128i v1 = _mm_unpacklo_epi64(c.val, _mm_unpackhi_epi64(a.val, a.val));
2874     __m128i v2 = _mm_unpackhi_epi64(b.val, c.val);
2875 
2876     if( mode == hal::STORE_ALIGNED_NOCACHE )
2877     {
2878         _mm_stream_si128((__m128i*)(ptr), v0);
2879         _mm_stream_si128((__m128i*)(ptr + 2), v1);
2880         _mm_stream_si128((__m128i*)(ptr + 4), v2);
2881     }
2882     else if( mode == hal::STORE_ALIGNED )
2883     {
2884         _mm_store_si128((__m128i*)(ptr), v0);
2885         _mm_store_si128((__m128i*)(ptr + 2), v1);
2886         _mm_store_si128((__m128i*)(ptr + 4), v2);
2887     }
2888     else
2889     {
2890         _mm_storeu_si128((__m128i*)(ptr), v0);
2891         _mm_storeu_si128((__m128i*)(ptr + 2), v1);
2892         _mm_storeu_si128((__m128i*)(ptr + 4), v2);
2893     }
2894 }
2895 
2896 inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2897                                const v_uint64x2& c, const v_uint64x2& d,
2898                                hal::StoreMode mode = hal::STORE_UNALIGNED)
2899 {
2900     __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
2901     __m128i v1 = _mm_unpacklo_epi64(c.val, d.val);
2902     __m128i v2 = _mm_unpackhi_epi64(a.val, b.val);
2903     __m128i v3 = _mm_unpackhi_epi64(c.val, d.val);
2904 
2905     if( mode == hal::STORE_ALIGNED_NOCACHE )
2906     {
2907         _mm_stream_si128((__m128i*)(ptr), v0);
2908         _mm_stream_si128((__m128i*)(ptr + 2), v1);
2909         _mm_stream_si128((__m128i*)(ptr + 4), v2);
2910         _mm_stream_si128((__m128i*)(ptr + 6), v3);
2911     }
2912     else if( mode == hal::STORE_ALIGNED )
2913     {
2914         _mm_store_si128((__m128i*)(ptr), v0);
2915         _mm_store_si128((__m128i*)(ptr + 2), v1);
2916         _mm_store_si128((__m128i*)(ptr + 4), v2);
2917         _mm_store_si128((__m128i*)(ptr + 6), v3);
2918     }
2919     else
2920     {
2921         _mm_storeu_si128((__m128i*)(ptr), v0);
2922         _mm_storeu_si128((__m128i*)(ptr + 2), v1);
2923         _mm_storeu_si128((__m128i*)(ptr + 4), v2);
2924         _mm_storeu_si128((__m128i*)(ptr + 6), v3);
2925     }
2926 }
2927 
2928 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
2929 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
2930 { \
2931     _Tpvec1 a1, b1; \
2932     v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
2933     a0 = v_reinterpret_as_##suffix0(a1); \
2934     b0 = v_reinterpret_as_##suffix0(b1); \
2935 } \
2936 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
2937 { \
2938     _Tpvec1 a1, b1, c1; \
2939     v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
2940     a0 = v_reinterpret_as_##suffix0(a1); \
2941     b0 = v_reinterpret_as_##suffix0(b1); \
2942     c0 = v_reinterpret_as_##suffix0(c1); \
2943 } \
2944 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
2945 { \
2946     _Tpvec1 a1, b1, c1, d1; \
2947     v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
2948     a0 = v_reinterpret_as_##suffix0(a1); \
2949     b0 = v_reinterpret_as_##suffix0(b1); \
2950     c0 = v_reinterpret_as_##suffix0(c1); \
2951     d0 = v_reinterpret_as_##suffix0(d1); \
2952 } \
2953 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2954                                 hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2955 { \
2956     _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2957     _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2958     v_store_interleave((_Tp1*)ptr, a1, b1, mode);      \
2959 } \
2960 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2961                                 const _Tpvec0& c0, hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2962 { \
2963     _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2964     _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2965     _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2966     v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode);  \
2967 } \
2968 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2969                                 const _Tpvec0& c0, const _Tpvec0& d0, \
2970                                 hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2971 { \
2972     _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2973     _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2974     _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2975     _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
2976     v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
2977 }
2978 
2979 OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
2980 OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
2981 OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
2982 OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64)
2983 OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64)
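// The macro above only reinterprets the lane type, so the signed and double overloads
// reuse the unsigned implementations bit-for-bit. Illustrative call (names assumed):
//
//   void split_s16_block(const short* src, v_int16x8& a, v_int16x8& b)
//   {
//       v_load_deinterleave(src, a, b); // forwarded to the ushort implementation above
//   }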
2984 
2985 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
2986 {
2987     return v_float32x4(_mm_cvtepi32_ps(a.val));
2988 }
2989 
2990 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
2991 {
2992     return v_float32x4(_mm_cvtpd_ps(a.val));
2993 }
2994 
2995 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
2996 {
2997     return v_float32x4(_mm_movelh_ps(_mm_cvtpd_ps(a.val), _mm_cvtpd_ps(b.val)));
2998 }
2999 
3000 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
3001 {
3002     return v_float64x2(_mm_cvtepi32_pd(a.val));
3003 }
3004 
3005 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
3006 {
3007     return v_float64x2(_mm_cvtepi32_pd(_mm_srli_si128(a.val,8)));
3008 }
3009 
3010 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
3011 {
3012     return v_float64x2(_mm_cvtps_pd(a.val));
3013 }
3014 
3015 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
3016 {
3017     return v_float64x2(_mm_cvtps_pd(_mm_movehl_ps(a.val, a.val)));
3018 }
3019 
3020 // from (Mysticial and wim) https://stackoverflow.com/q/41144668
3021 inline v_float64x2 v_cvt_f64(const v_int64x2& v)
3022 {
3023     // constants encoded as floating-point
3024     __m128i magic_i_hi32 = _mm_set1_epi64x(0x4530000080000000); // 2^84 + 2^63
3025     __m128i magic_i_all  = _mm_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52
3026     __m128d magic_d_all  = _mm_castsi128_pd(magic_i_all);
3027     // Blend the 32 least significant bits of v with magic_i_lo
3028 #if CV_SSE4_1
3029     __m128i magic_i_lo   = _mm_set1_epi64x(0x4330000000000000); // 2^52
3030     __m128i v_lo         = _mm_blend_epi16(v.val, magic_i_lo, 0xcc);
3031 #else
3032     __m128i magic_i_lo   = _mm_set1_epi32(0x43300000); // 2^52
3033     __m128i v_lo         = _mm_unpacklo_epi32(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(0, 0, 2, 0)), magic_i_lo);
3034 #endif
3035     // Extract the 32 most significant bits of v
3036     __m128i v_hi         = _mm_srli_epi64(v.val, 32);
3037     // Flip the msb of v_hi and blend with 0x45300000
3038             v_hi         = _mm_xor_si128(v_hi, magic_i_hi32);
3039     // Compute in double precision
3040     __m128d v_hi_dbl     = _mm_sub_pd(_mm_castsi128_pd(v_hi), magic_d_all);
3041     // (v_hi - magic_d_all) + v_lo; do not assume associativity of floating-point addition
3042     __m128d result       = _mm_add_pd(v_hi_dbl, _mm_castsi128_pd(v_lo));
3043     return v_float64x2(result);
3044 }
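// Why the magic constants above work (explanatory note, not from the original source):
// write v = hi*2^32 + lo, with hi the signed high dword and lo the unsigned low dword.
//   v_lo as double:  2^52 + lo                (exponent bits 0x433 glued onto lo)
//   v_hi as double:  2^84 + 2^63 + hi*2^32    (exponent bits 0x453; flipping the msb of hi adds 2^63)
//   (v_hi - magic_d_all) + v_lo
//     = (2^84 + 2^63 + hi*2^32) - (2^84 + 2^63 + 2^52) + (2^52 + lo)
//     = hi*2^32 + lo = v, rounded to the nearest double only in the final addition.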
3045 
3046 ////////////// Lookup table access ////////////////////
3047 
3048 inline v_int8x16 v_lut(const schar* tab, const int* idx)
3049 {
3050 #if defined(_MSC_VER)
3051     return v_int8x16(_mm_setr_epi8(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
3052                                    tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]));
3053 #else
3054     return v_int8x16(_mm_setr_epi64(
3055                         _mm_setr_pi8(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]]),
3056                         _mm_setr_pi8(tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]])
3057                     ));
3058 #endif
3059 }
3060 inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
3061 {
3062 #if defined(_MSC_VER)
3063     return v_int8x16(_mm_setr_epi16(*(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]), *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3]),
3064                                     *(const short*)(tab + idx[4]), *(const short*)(tab + idx[5]), *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7])));
3065 #else
3066     return v_int8x16(_mm_setr_epi64(
3067                         _mm_setr_pi16(*(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]), *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3])),
3068                         _mm_setr_pi16(*(const short*)(tab + idx[4]), *(const short*)(tab + idx[5]), *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7]))
3069                     ));
3070 #endif
3071 }
3072 inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
3073 {
3074 #if defined(_MSC_VER)
3075     return v_int8x16(_mm_setr_epi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
3076                                     *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
3077 #else
3078     return v_int8x16(_mm_setr_epi64(
3079                         _mm_setr_pi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1])),
3080                         _mm_setr_pi32(*(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]))
3081                     ));
3082 #endif
3083 }
3084 inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar *)tab, idx)); }
3085 inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar *)tab, idx)); }
3086 inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar *)tab, idx)); }
3087 
3088 inline v_int16x8 v_lut(const short* tab, const int* idx)
3089 {
3090 #if defined(_MSC_VER)
3091     return v_int16x8(_mm_setr_epi16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
3092                                     tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]));
3093 #else
3094     return v_int16x8(_mm_setr_epi64(
3095                         _mm_setr_pi16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]),
3096                         _mm_setr_pi16(tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]])
3097                     ));
3098 #endif
3099 }
3100 inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
3101 {
3102 #if defined(_MSC_VER)
3103     return v_int16x8(_mm_setr_epi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
3104                                     *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
3105 #else
3106     return v_int16x8(_mm_setr_epi64(
3107                         _mm_setr_pi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1])),
3108                         _mm_setr_pi32(*(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]))
3109                     ));
3110 #endif
3111 }
3112 inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
3113 {
3114     return v_int16x8(_mm_set_epi64x(*(const int64_t*)(tab + idx[1]), *(const int64_t*)(tab + idx[0])));
3115 }
3116 inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short *)tab, idx)); }
3117 inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short *)tab, idx)); }
3118 inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short *)tab, idx)); }
3119 
3120 inline v_int32x4 v_lut(const int* tab, const int* idx)
3121 {
3122 #if defined(_MSC_VER)
3123     return v_int32x4(_mm_setr_epi32(tab[idx[0]], tab[idx[1]],
3124                                     tab[idx[2]], tab[idx[3]]));
3125 #else
3126     return v_int32x4(_mm_setr_epi64(
3127                         _mm_setr_pi32(tab[idx[0]], tab[idx[1]]),
3128                         _mm_setr_pi32(tab[idx[2]], tab[idx[3]])
3129                     ));
3130 #endif
3131 }
3132 inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
3133 {
3134     return v_int32x4(_mm_set_epi64x(*(const int64_t*)(tab + idx[1]), *(const int64_t*)(tab + idx[0])));
3135 }
3136 inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
3137 {
3138     return v_int32x4(_mm_loadu_si128((const __m128i*)(tab + idx[0])));
3139 }
3140 inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int *)tab, idx)); }
3141 inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx)); }
3142 inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int *)tab, idx)); }
3143 
3144 inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
3145 {
3146     return v_int64x2(_mm_set_epi64x(tab[idx[1]], tab[idx[0]]));
3147 }
3148 inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
3149 {
3150     return v_int64x2(_mm_loadu_si128((const __m128i*)(tab + idx[0])));
3151 }
3152 inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
3153 inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
3154 
3155 inline v_float32x4 v_lut(const float* tab, const int* idx)
3156 {
3157     return v_float32x4(_mm_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
3158 }
3159 inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs((const int *)tab, idx)); }
3160 inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_quads((const int *)tab, idx)); }
3161 
3162 inline v_float64x2 v_lut(const double* tab, const int* idx)
3163 {
3164     return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]]));
3165 }
3166 inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { return v_float64x2(_mm_castsi128_pd(_mm_loadu_si128((const __m128i*)(tab + idx[0])))); }
3167 
3168 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
3169 {
3170     int CV_DECL_ALIGNED(32) idx[4];
3171     v_store_aligned(idx, idxvec);
3172     return v_int32x4(_mm_setr_epi32(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
3173 }
3174 
3175 inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
3176 {
3177     return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
3178 }
3179 
3180 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
3181 {
3182     int CV_DECL_ALIGNED(32) idx[4];
3183     v_store_aligned(idx, idxvec);
3184     return v_float32x4(_mm_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
3185 }
3186 
3187 inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
3188 {
3189     int idx[2];
3190     v_store_low(idx, idxvec);
3191     return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]]));
3192 }
3193 
3194 // loads pairs from the table and deinterleaves them, e.g. returns:
3195 //   x = (tab[idxvec[0]], tab[idxvec[1]], tab[idxvec[2]], tab[idxvec[3]]),
3196 //   y = (tab[idxvec[0]+1], tab[idxvec[1]+1], tab[idxvec[2]+1], tab[idxvec[3]+1])
3197 // note that the indices are float indices, not float-pair indices.
3198 // in theory, this function can be used to implement bilinear interpolation,
3199 // when idxvec contains the offsets within the image (see the usage sketch after the function below).
3200 inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
3201 {
3202     int CV_DECL_ALIGNED(32) idx[4];
3203     v_store_aligned(idx, idxvec);
3204     __m128 z = _mm_setzero_ps();
3205     __m128 xy01 = _mm_loadl_pi(z, (__m64*)(tab + idx[0]));
3206     __m128 xy23 = _mm_loadl_pi(z, (__m64*)(tab + idx[2]));
3207     xy01 = _mm_loadh_pi(xy01, (__m64*)(tab + idx[1]));
3208     xy23 = _mm_loadh_pi(xy23, (__m64*)(tab + idx[3]));
3209     __m128 xxyy02 = _mm_unpacklo_ps(xy01, xy23);
3210     __m128 xxyy13 = _mm_unpackhi_ps(xy01, xy23);
3211     x = v_float32x4(_mm_unpacklo_ps(xxyy02, xxyy13));
3212     y = v_float32x4(_mm_unpackhi_ps(xxyy02, xxyy13));
3213 }
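// Bilinear interpolation sketch referenced in the comment above (illustrative only;
// function and variable names are assumptions): row0/row1 point at two consecutive
// image rows, ofs holds per-sample offsets of the left pixel, tx/ty are the fractional
// weights. Relies on v_muladd and the arithmetic operators defined earlier in this header.
//
//   v_float32x4 bilinear4(const float* row0, const float* row1, const v_int32x4& ofs,
//                         const v_float32x4& tx, const v_float32x4& ty)
//   {
//       v_float32x4 a0, a1, b0, b1;
//       v_lut_deinterleave(row0, ofs, a0, a1);       // a0 = row0[ofs], a1 = row0[ofs+1]
//       v_lut_deinterleave(row1, ofs, b0, b1);       // b0 = row1[ofs], b1 = row1[ofs+1]
//       v_float32x4 top = v_muladd(a1 - a0, tx, a0); // horizontal blend, upper row
//       v_float32x4 bot = v_muladd(b1 - b0, tx, b0); // horizontal blend, lower row
//       return v_muladd(bot - top, ty, top);         // vertical blend
//   }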
3214 
3215 inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
3216 {
3217     int idx[2];
3218     v_store_low(idx, idxvec);
3219     __m128d xy0 = _mm_loadu_pd(tab + idx[0]);
3220     __m128d xy1 = _mm_loadu_pd(tab + idx[1]);
3221     x = v_float64x2(_mm_unpacklo_pd(xy0, xy1));
3222     y = v_float64x2(_mm_unpackhi_pd(xy0, xy1));
3223 }
3224 
3225 inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
3226 {
3227 #if CV_SSSE3
3228     return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0d0e0c0b090a08, 0x0705060403010200)));
3229 #else
3230     __m128i a = _mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
3231     a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(3, 1, 2, 0));
3232     a = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
3233     return v_int8x16(_mm_unpacklo_epi8(a, _mm_unpackhi_epi64(a, a)));
3234 #endif
3235 }
3236 inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
3237 inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
3238 {
3239 #if CV_SSSE3
3240     return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0b0e0a0d090c08, 0x0703060205010400)));
3241 #else
3242     __m128i a = _mm_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
3243     return v_int8x16(_mm_unpacklo_epi8(a, _mm_unpackhi_epi64(a, a)));
3244 #endif
3245 }
3246 inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
3247 
3248 inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
3249 {
3250 #if CV_SSSE3
3251     return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0e0b0a0d0c0908, 0x0706030205040100)));
3252 #else
3253     __m128i a = _mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
3254     return v_int16x8(_mm_shufflehi_epi16(a, _MM_SHUFFLE(3, 1, 2, 0)));
3255 #endif
3256 }
3257 inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
3258 inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
3259 {
3260 #if CV_SSSE3
3261     return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0e07060d0c0504, 0x0b0a030209080100)));
3262 #else
3263     return v_int16x8(_mm_unpacklo_epi16(vec.val, _mm_unpackhi_epi64(vec.val, vec.val)));
3264 #endif
3265 }
3266 inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
3267 
3268 inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
3269 {
3270     return v_int32x4(_mm_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0)));
3271 }
3272 inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
3273 inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
3274 
3275 inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
3276 {
3277 #if CV_SSSE3
3278     return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0xffffff0f0e0d0c0a, 0x0908060504020100)));
3279 #else
3280     __m128i mask = _mm_set1_epi64x(0x00000000FFFFFFFF);
3281     __m128i a = _mm_srli_si128(_mm_or_si128(_mm_andnot_si128(mask, vec.val), _mm_and_si128(mask, _mm_sll_epi32(vec.val, _mm_set_epi64x(0, 8)))), 1);
3282     return v_int8x16(_mm_srli_si128(_mm_shufflelo_epi16(a, _MM_SHUFFLE(2, 1, 0, 3)), 2));
3283 #endif
3284 }
3285 inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
3286 
3287 inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
3288 {
3289 #if CV_SSSE3
3290     return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0xffff0f0e0d0c0b0a, 0x0908050403020100)));
3291 #else
3292     return v_int16x8(_mm_srli_si128(_mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(2, 1, 0, 3)), 2));
3293 #endif
3294 }
3295 inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
3296 
3297 inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
3298 inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
3299 inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
3300 
3301 template<int i>
3302 inline uchar v_extract_n(const v_uint8x16& v)
3303 {
3304 #if CV_SSE4_1
3305     return (uchar)_mm_extract_epi8(v.val, i);
3306 #else
3307     return v_rotate_right<i>(v).get0();
3308 #endif
3309 }
3310 
3311 template<int i>
3312 inline schar v_extract_n(const v_int8x16& v)
3313 {
3314     return (schar)v_extract_n<i>(v_reinterpret_as_u8(v));
3315 }
3316 
3317 template<int i>
3318 inline ushort v_extract_n(const v_uint16x8& v)
3319 {
3320     return (ushort)_mm_extract_epi16(v.val, i);
3321 }
3322 
3323 template<int i>
3324 inline short v_extract_n(const v_int16x8& v)
3325 {
3326     return (short)v_extract_n<i>(v_reinterpret_as_u16(v));
3327 }
3328 
3329 template<int i>
3330 inline uint v_extract_n(const v_uint32x4& v)
3331 {
3332 #if CV_SSE4_1
3333     return (uint)_mm_extract_epi32(v.val, i);
3334 #else
3335     return v_rotate_right<i>(v).get0();
3336 #endif
3337 }
3338 
3339 template<int i>
3340 inline int v_extract_n(const v_int32x4& v)
3341 {
3342     return (int)v_extract_n<i>(v_reinterpret_as_u32(v));
3343 }
3344 
3345 template<int i>
3346 inline uint64 v_extract_n(const v_uint64x2& v)
3347 {
3348 #ifdef CV__SIMD_NATIVE_mm_extract_epi64
3349     return (uint64)_v128_extract_epi64<i>(v.val);
3350 #else
3351     return v_rotate_right<i>(v).get0();
3352 #endif
3353 }
3354 
3355 template<int i>
3356 inline int64 v_extract_n(const v_int64x2& v)
3357 {
3358     return (int64)v_extract_n<i>(v_reinterpret_as_u64(v));
3359 }
3360 
3361 template<int i>
3362 inline float v_extract_n(const v_float32x4& v)
3363 {
3364     union { uint iv; float fv; } d;
3365     d.iv = v_extract_n<i>(v_reinterpret_as_u32(v));
3366     return d.fv;
3367 }
3368 
3369 template<int i>
3370 inline double v_extract_n(const v_float64x2& v)
3371 {
3372     union { uint64 iv; double dv; } d;
3373     d.iv = v_extract_n<i>(v_reinterpret_as_u64(v));
3374     return d.dv;
3375 }
3376 
3377 template<int i>
3378 inline v_int32x4 v_broadcast_element(const v_int32x4& v)
3379 {
3380     return v_int32x4(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(i,i,i,i)));
3381 }
3382 
3383 template<int i>
3384 inline v_uint32x4 v_broadcast_element(const v_uint32x4& v)
3385 {
3386     return v_uint32x4(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(i,i,i,i)));
3387 }
3388 
3389 template<int i>
3390 inline v_float32x4 v_broadcast_element(const v_float32x4& v)
3391 {
3392     return v_float32x4(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE((char)i,(char)i,(char)i,(char)i)));
3393 }
3394 
3395 ////////////// FP16 support ///////////////////////////
3396 
3397 inline v_float32x4 v_load_expand(const float16_t* ptr)
3398 {
3399 #if CV_FP16
3400     return v_float32x4(_mm_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
3401 #else
3402     const __m128i z = _mm_setzero_si128(), delta = _mm_set1_epi32(0x38000000);
3403     const __m128i signmask = _mm_set1_epi32(0x80000000), maxexp = _mm_set1_epi32(0x7c000000);
3404     const __m128 deltaf = _mm_castsi128_ps(_mm_set1_epi32(0x38800000));
3405     __m128i bits = _mm_unpacklo_epi16(z, _mm_loadl_epi64((const __m128i*)ptr)); // h << 16
3406     __m128i e = _mm_and_si128(bits, maxexp), sign = _mm_and_si128(bits, signmask);
3407     __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_xor_si128(bits, sign), 3), delta); // ((h & 0x7fff) << 13) + delta
3408     __m128i zt = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_add_epi32(t, _mm_set1_epi32(1 << 23))), deltaf));
3409 
3410     t = _mm_add_epi32(t, _mm_and_si128(delta, _mm_cmpeq_epi32(maxexp, e)));
3411     __m128i zmask = _mm_cmpeq_epi32(e, z);
3412     __m128i ft = v_select_si128(zmask, zt, t);
3413     return v_float32x4(_mm_castsi128_ps(_mm_or_si128(ft, sign)));
3414 #endif
3415 }
3416 
3417 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
3418 {
3419 #if CV_FP16
3420     __m128i fp16_value = _mm_cvtps_ph(v.val, 0);
3421     _mm_storel_epi64((__m128i*)ptr, fp16_value);
3422 #else
3423     const __m128i signmask = _mm_set1_epi32(0x80000000);
3424     const __m128i rval = _mm_set1_epi32(0x3f000000);
3425 
3426     __m128i t = _mm_castps_si128(v.val);
3427     __m128i sign = _mm_srai_epi32(_mm_and_si128(t, signmask), 16);
3428     t = _mm_andnot_si128(signmask, t);
3429 
3430     __m128i finitemask = _mm_cmpgt_epi32(_mm_set1_epi32(0x47800000), t);
3431     __m128i isnan = _mm_cmpgt_epi32(t, _mm_set1_epi32(0x7f800000));
3432     __m128i naninf = v_select_si128(isnan, _mm_set1_epi32(0x7e00), _mm_set1_epi32(0x7c00));
3433     __m128i tinymask = _mm_cmpgt_epi32(_mm_set1_epi32(0x38800000), t);
3434     __m128i tt = _mm_castps_si128(_mm_add_ps(_mm_castsi128_ps(t), _mm_castsi128_ps(rval)));
3435     tt = _mm_sub_epi32(tt, rval);
3436     __m128i odd = _mm_and_si128(_mm_srli_epi32(t, 13), _mm_set1_epi32(1));
3437     __m128i nt = _mm_add_epi32(t, _mm_set1_epi32(0xc8000fff));
3438     nt = _mm_srli_epi32(_mm_add_epi32(nt, odd), 13);
3439     t = v_select_si128(tinymask, tt, nt);
3440     t = v_select_si128(finitemask, t, naninf);
3441     t = _mm_or_si128(t, sign);
3442     t = _mm_packs_epi32(t, t);
3443     _mm_storel_epi64((__m128i*)ptr, t);
3444 #endif
3445 }
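// Round-trip sketch (illustrative only; buffer names are assumptions): widen four
// half-precision values to float, process them, and pack the result back to FP16.
// Relies on v_setall_f32 and operator* defined earlier in this header.
//
//   void scale_fp16_quad(const float16_t* src, float16_t* dst, float scale)
//   {
//       v_float32x4 v = v_load_expand(src);  // 4 x fp16 -> 4 x fp32
//       v = v * v_setall_f32(scale);         // arbitrary fp32 processing
//       v_pack_store(dst, v);                // 4 x fp32 -> 4 x fp16
//   }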
3446 
3447 inline void v_cleanup() {}
3448 
3449 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
3450 
3451 //! @endcond
3452 
3453 }
3454 
3455 #endif
3456