1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 // License Agreement
11 // For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
17 // Third party copyrights are property of their respective owners.
18 //
19 // Redistribution and use in source and binary forms, with or without modification,
20 // are permitted provided that the following conditions are met:
21 //
22 // * Redistribution's of source code must retain the above copyright notice,
23 // this list of conditions and the following disclaimer.
24 //
25 // * Redistribution's in binary form must reproduce the above copyright notice,
26 // this list of conditions and the following disclaimer in the documentation
27 // and/or other materials provided with the distribution.
28 //
29 // * The name of the copyright holders may not be used to endorse or promote products
30 // derived from this software without specific prior written permission.
31 //
32 // This software is provided by the copyright holders and contributors "as is" and
33 // any express or implied warranties, including, but not limited to, the implied
34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
35 // In no event shall the Intel Corporation or contributors be liable for any direct,
36 // indirect, incidental, special, exemplary, or consequential damages
37 // (including, but not limited to, procurement of substitute goods or services;
38 // loss of use, data, or profits; or business interruption) however caused
39 // and on any theory of liability, whether in contract, strict liability,
40 // or tort (including negligence or otherwise) arising in any way out of
41 // the use of this software, even if advised of the possibility of such damage.
42 //
43 //M*/
44
45 #ifndef OPENCV_HAL_SSE_HPP
46 #define OPENCV_HAL_SSE_HPP
47
48 #include <algorithm>
49 #include "opencv2/core/utility.hpp"
50
51 #define CV_SIMD128 1
52 #define CV_SIMD128_64F 1
53 #define CV_SIMD128_FP16 0 // no native operations with FP16 type.
54
55 namespace cv
56 {
57
58 //! @cond IGNORED
59
60 //
61 // Compilation troubleshooting:
62 // - MSVC: error C2719: 'a': formal parameter with requested alignment of 16 won't be aligned
63 // Replace parameter declaration to const reference:
64 // -v_int32x4 a
65 // +const v_int32x4& a
66 //
67
68 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
69
70 ///////// Types ////////////
71
72 struct v_uint8x16
73 {
74 typedef uchar lane_type;
75 typedef __m128i vector_type;
76 enum { nlanes = 16 };
77
78 /* coverity[uninit_ctor]: suppress warning */
v_uint8x16cv::v_uint8x1679 v_uint8x16() {}
v_uint8x16cv::v_uint8x1680 explicit v_uint8x16(__m128i v) : val(v) {}
v_uint8x16cv::v_uint8x1681 v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
82 uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
83 {
84 val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
85 (char)v4, (char)v5, (char)v6, (char)v7,
86 (char)v8, (char)v9, (char)v10, (char)v11,
87 (char)v12, (char)v13, (char)v14, (char)v15);
88 }
89
get0cv::v_uint8x1690 uchar get0() const
91 {
92 return (uchar)_mm_cvtsi128_si32(val);
93 }
94
95 __m128i val;
96 };
97
98 struct v_int8x16
99 {
100 typedef schar lane_type;
101 typedef __m128i vector_type;
102 enum { nlanes = 16 };
103
104 /* coverity[uninit_ctor]: suppress warning */
v_int8x16cv::v_int8x16105 v_int8x16() {}
v_int8x16cv::v_int8x16106 explicit v_int8x16(__m128i v) : val(v) {}
v_int8x16cv::v_int8x16107 v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
108 schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
109 {
110 val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
111 (char)v4, (char)v5, (char)v6, (char)v7,
112 (char)v8, (char)v9, (char)v10, (char)v11,
113 (char)v12, (char)v13, (char)v14, (char)v15);
114 }
115
get0cv::v_int8x16116 schar get0() const
117 {
118 return (schar)_mm_cvtsi128_si32(val);
119 }
120
121 __m128i val;
122 };
123
// 128-bit vector of 8 unsigned 16-bit lanes.
struct v_uint16x8
{
    typedef ushort lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 8 };

    /* coverity[uninit_ctor]: suppress warning */
    v_uint16x8() {}
    explicit v_uint16x8(__m128i v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
                             (short)v4, (short)v5, (short)v6, (short)v7);
    }

    // Returns lane 0.
    ushort get0() const
    {
        return (ushort)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};

// 128-bit vector of 8 signed 16-bit lanes.
struct v_int16x8
{
    typedef short lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 8 };

    /* coverity[uninit_ctor]: suppress warning */
    v_int16x8() {}
    explicit v_int16x8(__m128i v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
                             (short)v4, (short)v5, (short)v6, (short)v7);
    }

    // Returns lane 0.
    short get0() const
    {
        return (short)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};

// 128-bit vector of 4 unsigned 32-bit lanes.
struct v_uint32x4
{
    typedef unsigned lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 4 };

    /* coverity[uninit_ctor]: suppress warning */
    v_uint32x4() {}
    explicit v_uint32x4(__m128i v) : val(v) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        val = _mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3);
    }

    // Returns lane 0.
    unsigned get0() const
    {
        return (unsigned)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};

// 128-bit vector of 4 signed 32-bit lanes.
struct v_int32x4
{
    typedef int lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 4 };

    /* coverity[uninit_ctor]: suppress warning */
    v_int32x4() {}
    explicit v_int32x4(__m128i v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        val = _mm_setr_epi32(v0, v1, v2, v3);
    }

    // Returns lane 0.
    int get0() const
    {
        return _mm_cvtsi128_si32(val);
    }

    __m128i val;
};

// 128-bit vector of 4 single-precision float lanes.
struct v_float32x4
{
    typedef float lane_type;
    typedef __m128 vector_type;
    enum { nlanes = 4 };

    /* coverity[uninit_ctor]: suppress warning */
    v_float32x4() {}
    explicit v_float32x4(__m128 v) : val(v) {}
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        val = _mm_setr_ps(v0, v1, v2, v3);
    }

    // Returns lane 0.
    float get0() const
    {
        return _mm_cvtss_f32(val);
    }

    __m128 val;
};

236 struct v_uint64x2
237 {
238 typedef uint64 lane_type;
239 typedef __m128i vector_type;
240 enum { nlanes = 2 };
241
242 /* coverity[uninit_ctor]: suppress warning */
v_uint64x2cv::v_uint64x2243 v_uint64x2() {}
v_uint64x2cv::v_uint64x2244 explicit v_uint64x2(__m128i v) : val(v) {}
v_uint64x2cv::v_uint64x2245 v_uint64x2(uint64 v0, uint64 v1)
246 {
247 val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
248 }
249
get0cv::v_uint64x2250 uint64 get0() const
251 {
252 #if !defined(__x86_64__) && !defined(_M_X64)
253 int a = _mm_cvtsi128_si32(val);
254 int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
255 return (unsigned)a | ((uint64)(unsigned)b << 32);
256 #else
257 return (uint64)_mm_cvtsi128_si64(val);
258 #endif
259 }
260
261 __m128i val;
262 };
263
264 struct v_int64x2
265 {
266 typedef int64 lane_type;
267 typedef __m128i vector_type;
268 enum { nlanes = 2 };
269
270 /* coverity[uninit_ctor]: suppress warning */
v_int64x2cv::v_int64x2271 v_int64x2() {}
v_int64x2cv::v_int64x2272 explicit v_int64x2(__m128i v) : val(v) {}
v_int64x2cv::v_int64x2273 v_int64x2(int64 v0, int64 v1)
274 {
275 val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
276 }
277
get0cv::v_int64x2278 int64 get0() const
279 {
280 #if !defined(__x86_64__) && !defined(_M_X64)
281 int a = _mm_cvtsi128_si32(val);
282 int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
283 return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
284 #else
285 return _mm_cvtsi128_si64(val);
286 #endif
287 }
288
289 __m128i val;
290 };
291
// 128-bit vector of 2 double-precision float lanes.
struct v_float64x2
{
    typedef double lane_type;
    typedef __m128d vector_type;
    enum { nlanes = 2 };

    /* coverity[uninit_ctor]: suppress warning */
    v_float64x2() {}
    explicit v_float64x2(__m128d v) : val(v) {}
    v_float64x2(double v0, double v1)
    {
        val = _mm_setr_pd(v0, v1);
    }

    // Returns lane 0.
    double get0() const
    {
        return _mm_cvtsd_f64(val);
    }

    __m128d val;
};

314 namespace hal_sse_internal
315 {
316 template <typename to_sse_type, typename from_sse_type>
317 to_sse_type v_sse_reinterpret_as(const from_sse_type& val);
318
319 #define OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(to_sse_type, from_sse_type, sse_cast_intrin) \
320 template<> inline \
321 to_sse_type v_sse_reinterpret_as(const from_sse_type& a) \
322 { return sse_cast_intrin(a); }
323
324 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128i, OPENCV_HAL_NOP)
325 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128, _mm_castps_si128)
326 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128d, _mm_castpd_si128)
327 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128i, _mm_castsi128_ps)
328 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128, OPENCV_HAL_NOP)
329 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128d, _mm_castpd_ps)
330 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128i, _mm_castsi128_pd)
331 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128, _mm_castps_pd)
332 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128d, OPENCV_HAL_NOP)
333 }
334
335 #define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
336 inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
337 inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
338 template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
339 { return _Tpvec(cast(a.val)); }
340
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16,uchar,u8,si128,epi8,schar,OPENCV_HAL_NOP)341 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, schar, OPENCV_HAL_NOP)
342 OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, schar, OPENCV_HAL_NOP)
343 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP)
344 OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP)
345 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP)
346 OPENCV_HAL_IMPL_SSE_INITVEC(v_int32x4, int, s32, si128, epi32, int, OPENCV_HAL_NOP)
347 OPENCV_HAL_IMPL_SSE_INITVEC(v_float32x4, float, f32, ps, ps, float, _mm_castsi128_ps)
348 OPENCV_HAL_IMPL_SSE_INITVEC(v_float64x2, double, f64, pd, pd, double, _mm_castsi128_pd)
349
350 inline v_uint64x2 v_setzero_u64() { return v_uint64x2(_mm_setzero_si128()); }
v_setzero_s64()351 inline v_int64x2 v_setzero_s64() { return v_int64x2(_mm_setzero_si128()); }
v_setall_u64(uint64 val)352 inline v_uint64x2 v_setall_u64(uint64 val) { return v_uint64x2(val, val); }
v_setall_s64(int64 val)353 inline v_int64x2 v_setall_s64(int64 val) { return v_int64x2(val, val); }
354
355 template<typename _Tpvec> inline
v_reinterpret_as_u64(const _Tpvec & a)356 v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); }
357 template<typename _Tpvec> inline
v_reinterpret_as_s64(const _Tpvec & a)358 v_int64x2 v_reinterpret_as_s64(const _Tpvec& a) { return v_int64x2(a.val); }
v_reinterpret_as_f32(const v_uint64x2 & a)359 inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& a)
360 { return v_float32x4(_mm_castsi128_ps(a.val)); }
v_reinterpret_as_f32(const v_int64x2 & a)361 inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& a)
362 { return v_float32x4(_mm_castsi128_ps(a.val)); }
v_reinterpret_as_f64(const v_uint64x2 & a)363 inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& a)
364 { return v_float64x2(_mm_castsi128_pd(a.val)); }
v_reinterpret_as_f64(const v_int64x2 & a)365 inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& a)
366 { return v_float64x2(_mm_castsi128_pd(a.val)); }
367
368 #define OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(_Tpvec, suffix) \
369 inline _Tpvec v_reinterpret_as_##suffix(const v_float32x4& a) \
370 { return _Tpvec(_mm_castps_si128(a.val)); } \
371 inline _Tpvec v_reinterpret_as_##suffix(const v_float64x2& a) \
372 { return _Tpvec(_mm_castpd_si128(a.val)); }
373
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint8x16,u8)374 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint8x16, u8)
375 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int8x16, s8)
376 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint16x8, u16)
377 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int16x8, s16)
378 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint32x4, u32)
379 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32)
380 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64)
381 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64)
382
383 inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& a) {return a; }
v_reinterpret_as_f64(const v_float64x2 & a)384 inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a) {return a; }
v_reinterpret_as_f32(const v_float64x2 & a)385 inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) {return v_float32x4(_mm_castpd_ps(a.val)); }
v_reinterpret_as_f64(const v_float32x4 & a)386 inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) {return v_float64x2(_mm_castps_pd(a.val)); }
387
388 //////////////// PACK ///////////////
v_pack(const v_uint16x8 & a,const v_uint16x8 & b)389 inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
390 {
391 __m128i delta = _mm_set1_epi16(255);
392 return v_uint8x16(_mm_packus_epi16(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta)),
393 _mm_subs_epu16(b.val, _mm_subs_epu16(b.val, delta))));
394 }
395
v_pack_store(uchar * ptr,const v_uint16x8 & a)396 inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
397 {
398 __m128i delta = _mm_set1_epi16(255);
399 __m128i a1 = _mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta));
400 _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
401 }
402
v_pack_u(const v_int16x8 & a,const v_int16x8 & b)403 inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
404 { return v_uint8x16(_mm_packus_epi16(a.val, b.val)); }
405
v_pack_u_store(uchar * ptr,const v_int16x8 & a)406 inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
407 { _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); }
408
409 template<int n> inline
v_rshr_pack(const v_uint16x8 & a,const v_uint16x8 & b)410 v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
411 {
412 // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
413 __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
414 return v_uint8x16(_mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(a.val, delta), n),
415 _mm_srli_epi16(_mm_adds_epu16(b.val, delta), n)));
416 }
417
418 template<int n> inline
v_rshr_pack_store(uchar * ptr,const v_uint16x8 & a)419 void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
420 {
421 __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
422 __m128i a1 = _mm_srli_epi16(_mm_adds_epu16(a.val, delta), n);
423 _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
424 }
425
426 template<int n> inline
v_rshr_pack_u(const v_int16x8 & a,const v_int16x8 & b)427 v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
428 {
429 __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
430 return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
431 _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
432 }
433
434 template<int n> inline
v_rshr_pack_u_store(uchar * ptr,const v_int16x8 & a)435 void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
436 {
437 __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
438 __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
439 _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
440 }
441
v_pack(const v_int16x8 & a,const v_int16x8 & b)442 inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
443 { return v_int8x16(_mm_packs_epi16(a.val, b.val)); }
444
v_pack_store(schar * ptr,const v_int16x8 & a)445 inline void v_pack_store(schar* ptr, const v_int16x8& a)
446 { _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); }
447
448 template<int n> inline
v_rshr_pack(const v_int16x8 & a,const v_int16x8 & b)449 v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
450 {
451 // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
452 __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
453 return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
454 _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
455 }
456 template<int n> inline
v_rshr_pack_store(schar * ptr,const v_int16x8 & a)457 void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
458 {
459 // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
460 __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
461 __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
462 _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a1, a1));
463 }
464
465
// byte-wise "mask ? a : b"; mask bytes must be 0x00 or 0xFF
inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b)
{
#if CV_SSE4_1
    return _mm_blendv_epi8(b, a, mask);
#else
    // b ^ ((a ^ b) & mask): selects a where mask bits are set, b elsewhere
    return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask));
#endif
}

v_pack(const v_uint32x4 & a,const v_uint32x4 & b)476 inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
477 { return v_uint16x8(_v128_packs_epu32(a.val, b.val)); }
478
v_pack_store(ushort * ptr,const v_uint32x4 & a)479 inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
480 {
481 __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
482 __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
483 __m128i r = _mm_packs_epi32(a1, a1);
484 _mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(r, _mm_set1_epi16(-32768)));
485 }
486
487 template<int n> inline
v_rshr_pack(const v_uint32x4 & a,const v_uint32x4 & b)488 v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
489 {
490 __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
491 __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
492 __m128i b1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, delta), n), delta32);
493 return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768)));
494 }
495
496 template<int n> inline
v_rshr_pack_store(ushort * ptr,const v_uint32x4 & a)497 void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
498 {
499 __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
500 __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
501 __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
502 _mm_storel_epi64((__m128i*)ptr, a2);
503 }
504
v_pack_u(const v_int32x4 & a,const v_int32x4 & b)505 inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
506 {
507 #if CV_SSE4_1
508 return v_uint16x8(_mm_packus_epi32(a.val, b.val));
509 #else
510 __m128i delta32 = _mm_set1_epi32(32768);
511
512 // preliminary saturate negative values to zero
513 __m128i a1 = _mm_and_si128(a.val, _mm_cmpgt_epi32(a.val, _mm_set1_epi32(0)));
514 __m128i b1 = _mm_and_si128(b.val, _mm_cmpgt_epi32(b.val, _mm_set1_epi32(0)));
515
516 __m128i r = _mm_packs_epi32(_mm_sub_epi32(a1, delta32), _mm_sub_epi32(b1, delta32));
517 return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
518 #endif
519 }
520
v_pack_u_store(ushort * ptr,const v_int32x4 & a)521 inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
522 {
523 #if CV_SSE4_1
524 _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi32(a.val, a.val));
525 #else
526 __m128i delta32 = _mm_set1_epi32(32768);
527 __m128i a1 = _mm_sub_epi32(a.val, delta32);
528 __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
529 _mm_storel_epi64((__m128i*)ptr, r);
530 #endif
531 }
532
533 template<int n> inline
v_rshr_pack_u(const v_int32x4 & a,const v_int32x4 & b)534 v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
535 {
536 #if CV_SSE4_1
537 __m128i delta = _mm_set1_epi32(1 << (n - 1));
538 return v_uint16x8(_mm_packus_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n),
539 _mm_srai_epi32(_mm_add_epi32(b.val, delta), n)));
540 #else
541 __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
542 __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
543 __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
544 __m128i b1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(b.val, delta), n), delta32);
545 __m128i b2 = _mm_sub_epi16(_mm_packs_epi32(b1, b1), _mm_set1_epi16(-32768));
546 return v_uint16x8(_mm_unpacklo_epi64(a2, b2));
547 #endif
548 }
549
550 template<int n> inline
v_rshr_pack_u_store(ushort * ptr,const v_int32x4 & a)551 void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
552 {
553 #if CV_SSE4_1
554 __m128i delta = _mm_set1_epi32(1 << (n - 1));
555 __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
556 _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi32(a1, a1));
557 #else
558 __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
559 __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
560 __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
561 _mm_storel_epi64((__m128i*)ptr, a2);
562 #endif
563 }
564
v_pack(const v_int32x4 & a,const v_int32x4 & b)565 inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
566 { return v_int16x8(_mm_packs_epi32(a.val, b.val)); }
567
v_pack_store(short * ptr,const v_int32x4 & a)568 inline void v_pack_store(short* ptr, const v_int32x4& a)
569 {
570 _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a.val, a.val));
571 }
572
573 template<int n> inline
v_rshr_pack(const v_int32x4 & a,const v_int32x4 & b)574 v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
575 {
576 __m128i delta = _mm_set1_epi32(1 << (n-1));
577 return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n),
578 _mm_srai_epi32(_mm_add_epi32(b.val, delta), n)));
579 }
580
581 template<int n> inline
v_rshr_pack_store(short * ptr,const v_int32x4 & a)582 void v_rshr_pack_store(short* ptr, const v_int32x4& a)
583 {
584 __m128i delta = _mm_set1_epi32(1 << (n-1));
585 __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
586 _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1));
587 }
588
589
590 // [a0 0 | b0 0] [a1 0 | b1 0]
v_pack(const v_uint64x2 & a,const v_uint64x2 & b)591 inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
592 {
593 __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0
594 __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0
595 return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
596 }
597
v_pack_store(unsigned * ptr,const v_uint64x2 & a)598 inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
599 {
600 __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
601 _mm_storel_epi64((__m128i*)ptr, a1);
602 }
603
604 // [a0 0 | b0 0] [a1 0 | b1 0]
v_pack(const v_int64x2 & a,const v_int64x2 & b)605 inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
606 {
607 __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0
608 __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0
609 return v_int32x4(_mm_unpacklo_epi32(v0, v1));
610 }
611
v_pack_store(int * ptr,const v_int64x2 & a)612 inline void v_pack_store(int* ptr, const v_int64x2& a)
613 {
614 __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
615 _mm_storel_epi64((__m128i*)ptr, a1);
616 }
617
618 template<int n> inline
v_rshr_pack(const v_uint64x2 & a,const v_uint64x2 & b)619 v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
620 {
621 uint64 delta = (uint64)1 << (n-1);
622 v_uint64x2 delta2(delta, delta);
623 __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
624 __m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n);
625 __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0
626 __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0
627 return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
628 }
629
630 template<int n> inline
v_rshr_pack_store(unsigned * ptr,const v_uint64x2 & a)631 void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
632 {
633 uint64 delta = (uint64)1 << (n-1);
634 v_uint64x2 delta2(delta, delta);
635 __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
636 __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
637 _mm_storel_epi64((__m128i*)ptr, a2);
638 }
639
// Broadcast the sign of each 64-bit lane to all its bits (0 or ~0).
inline __m128i v_sign_epi64(__m128i a)
{
    return _mm_shuffle_epi32(_mm_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1)); // x m0 | x m1
}

// Arithmetic 64-bit right shift (SSE2 has only a logical one):
// (a ^ sign) >>logical imm, then ^ sign restores the sign-extended bits.
inline __m128i v_srai_epi64(__m128i a, int imm)
{
    __m128i smask = v_sign_epi64(a);
    return _mm_xor_si128(_mm_srli_epi64(_mm_xor_si128(a, smask), imm), smask);
}

651 template<int n> inline
v_rshr_pack(const v_int64x2 & a,const v_int64x2 & b)652 v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
653 {
654 int64 delta = (int64)1 << (n-1);
655 v_int64x2 delta2(delta, delta);
656 __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
657 __m128i b1 = v_srai_epi64(_mm_add_epi64(b.val, delta2.val), n);
658 __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0
659 __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0
660 return v_int32x4(_mm_unpacklo_epi32(v0, v1));
661 }
662
663 template<int n> inline
v_rshr_pack_store(int * ptr,const v_int64x2 & a)664 void v_rshr_pack_store(int* ptr, const v_int64x2& a)
665 {
666 int64 delta = (int64)1 << (n-1);
667 v_int64x2 delta2(delta, delta);
668 __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
669 __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
670 _mm_storel_epi64((__m128i*)ptr, a2);
671 }
672
673 // pack boolean
v_pack_b(const v_uint16x8 & a,const v_uint16x8 & b)674 inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
675 {
676 __m128i ab = _mm_packs_epi16(a.val, b.val);
677 return v_uint8x16(ab);
678 }
679
v_pack_b(const v_uint32x4 & a,const v_uint32x4 & b,const v_uint32x4 & c,const v_uint32x4 & d)680 inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
681 const v_uint32x4& c, const v_uint32x4& d)
682 {
683 __m128i ab = _mm_packs_epi32(a.val, b.val);
684 __m128i cd = _mm_packs_epi32(c.val, d.val);
685 return v_uint8x16(_mm_packs_epi16(ab, cd));
686 }
687
v_pack_b(const v_uint64x2 & a,const v_uint64x2 & b,const v_uint64x2 & c,const v_uint64x2 & d,const v_uint64x2 & e,const v_uint64x2 & f,const v_uint64x2 & g,const v_uint64x2 & h)688 inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
689 const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
690 const v_uint64x2& g, const v_uint64x2& h)
691 {
692 __m128i ab = _mm_packs_epi32(a.val, b.val);
693 __m128i cd = _mm_packs_epi32(c.val, d.val);
694 __m128i ef = _mm_packs_epi32(e.val, f.val);
695 __m128i gh = _mm_packs_epi32(g.val, h.val);
696
697 __m128i abcd = _mm_packs_epi32(ab, cd);
698 __m128i efgh = _mm_packs_epi32(ef, gh);
699 return v_uint8x16(_mm_packs_epi16(abcd, efgh));
700 }
701
// 4x4 matrix * vector: result = v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3,
// where m0..m3 are the matrix columns.
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
    __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
    __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
    __m128 v3 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(3, 3, 3, 3)), m3.val);

    return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, v3)));
}

// Affine transform: result = v[0]*m0 + v[1]*m1 + v[2]*m2 + a
// (like v_matmul with the last column replaced by an additive term).
inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
    __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
    __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);

    return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, a.val)));
}

725 #define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \
726 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
727 { \
728 return _Tpvec(intrin(a.val, b.val)); \
729 } \
730 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
731 { \
732 a.val = intrin(a.val, b.val); \
733 return a; \
734 }
735
736 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8)
737 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8)
738 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
739 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8)
740 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16)
741 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16)
742 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
743 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
744 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32)
745 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32)
746 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint32x4, _v128_mullo_epi32)
747 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
748 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
749 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int32x4, _v128_mullo_epi32)
750 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps)
751 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps)
752 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps)
753 OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps)
754 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd)
755 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd)
756 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd)
757 OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd)
758 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64)
759 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64)
760 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64)
761 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)
762
// saturating multiply 8-bit, 16-bit
// Widen to the next width (exact products), then v_pack narrows with saturation.
#define OPENCV_HAL_IMPL_SSE_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpwvec c, d; \
    v_mul_expand(a, b, c, d); \
    return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }
773
// Saturating * and *= for all 8- and 16-bit element types.
OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint8x16, v_uint16x8)
OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int8x16, v_int16x8)
OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint16x8, v_uint32x4)
OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int16x8, v_int32x4)
778
779 // Multiply and expand
780 inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
781 v_uint16x8& c, v_uint16x8& d)
782 {
783 v_uint16x8 a0, a1, b0, b1;
784 v_expand(a, a0, a1);
785 v_expand(b, b0, b1);
786 c = v_mul_wrap(a0, b0);
787 d = v_mul_wrap(a1, b1);
788 }
789
v_mul_expand(const v_int8x16 & a,const v_int8x16 & b,v_int16x8 & c,v_int16x8 & d)790 inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
791 v_int16x8& c, v_int16x8& d)
792 {
793 v_int16x8 a0, a1, b0, b1;
794 v_expand(a, a0, a1);
795 v_expand(b, b0, b1);
796 c = v_mul_wrap(a0, b0);
797 d = v_mul_wrap(a1, b1);
798 }
799
v_mul_expand(const v_int16x8 & a,const v_int16x8 & b,v_int32x4 & c,v_int32x4 & d)800 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
801 v_int32x4& c, v_int32x4& d)
802 {
803 __m128i v0 = _mm_mullo_epi16(a.val, b.val);
804 __m128i v1 = _mm_mulhi_epi16(a.val, b.val);
805 c.val = _mm_unpacklo_epi16(v0, v1);
806 d.val = _mm_unpackhi_epi16(v0, v1);
807 }
808
v_mul_expand(const v_uint16x8 & a,const v_uint16x8 & b,v_uint32x4 & c,v_uint32x4 & d)809 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
810 v_uint32x4& c, v_uint32x4& d)
811 {
812 __m128i v0 = _mm_mullo_epi16(a.val, b.val);
813 __m128i v1 = _mm_mulhi_epu16(a.val, b.val);
814 c.val = _mm_unpacklo_epi16(v0, v1);
815 d.val = _mm_unpackhi_epi16(v0, v1);
816 }
817
v_mul_expand(const v_uint32x4 & a,const v_uint32x4 & b,v_uint64x2 & c,v_uint64x2 & d)818 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
819 v_uint64x2& c, v_uint64x2& d)
820 {
821 __m128i c0 = _mm_mul_epu32(a.val, b.val);
822 __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
823 c.val = _mm_unpacklo_epi64(c0, c1);
824 d.val = _mm_unpackhi_epi64(c0, c1);
825 }
826
// High 16 bits of each 16x16->32 product (signed / unsigned variants).
inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) { return v_int16x8(_mm_mulhi_epi16(a.val, b.val)); }
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) { return v_uint16x8(_mm_mulhi_epu16(a.val, b.val)); }
829
//////// Dot Product ////////

// 16 >> 32
// Sums of adjacent signed 16-bit products: one _mm_madd_epi16 does the whole job.
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{ return v_int32x4(_mm_madd_epi16(a.val, b.val)); }
// Same, with a 32-bit accumulator added in.
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_dotprod(a, b) + c; }

// 32 >> 64
// Sums of adjacent signed 32-bit products into two 64-bit lanes.
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{
#if CV_SSE4_1
    // SSE4.1 has a true signed 32x32->64 multiply of the even lanes;
    // shifting by 32 brings the odd lanes into even position.
    __m128i even = _mm_mul_epi32(a.val, b.val);
    __m128i odd = _mm_mul_epi32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
    return v_int64x2(_mm_add_epi64(even, odd));
#else
    // SSE2 only has the unsigned multiply; compute unsigned products and
    // subtract a correction term to recover the signed result.
    __m128i even_u = _mm_mul_epu32(a.val, b.val);
    __m128i odd_u = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
    // convert unsigned to signed high multiplication (from: Agner Fog(veclib) and H S Warren: Hacker's delight, 2003, p. 132)
    __m128i a_sign = _mm_srai_epi32(a.val, 31);
    __m128i b_sign = _mm_srai_epi32(b.val, 31);
    // |x * sign of x
    __m128i axb = _mm_and_si128(a.val, b_sign);
    __m128i bxa = _mm_and_si128(b.val, a_sign);
    // sum of sign corrections
    __m128i ssum = _mm_add_epi32(bxa, axb);
    // the correction enters the 64-bit lane shifted by 32 (it corrects the high word)
    __m128i even_ssum = _mm_slli_epi64(ssum, 32);
    __m128i odd_ssum = _mm_and_si128(ssum, _mm_set_epi32(-1, 0, -1, 0));
    // convert to signed and prod
    return v_int64x2(_mm_add_epi64(_mm_sub_epi64(even_u, even_ssum), _mm_sub_epi64(odd_u, odd_ssum)));
#endif
}
// Dot product with a 64-bit accumulator.
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{ return v_dotprod(a, b) + c; }
864
// 8 >> 32
// Sums of 4 adjacent 8-bit products per 32-bit lane.
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
{
    // Split bytes into even/odd 16-bit lanes (zero-extended), so that
    // _mm_madd_epi16 is exact: values <= 255 fit in signed 16-bit lanes.
    __m128i a0 = _mm_srli_epi16(_mm_slli_si128(a.val, 1), 8); // even
    __m128i a1 = _mm_srli_epi16(a.val, 8); // odd
    __m128i b0 = _mm_srli_epi16(_mm_slli_si128(b.val, 1), 8);
    __m128i b1 = _mm_srli_epi16(b.val, 8);
    __m128i p0 = _mm_madd_epi16(a0, b0);
    __m128i p1 = _mm_madd_epi16(a1, b1);
    return v_uint32x4(_mm_add_epi32(p0, p1));
}
// Same, with an accumulator.
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{ return v_dotprod_expand(a, b) + c; }

// Signed-byte variant: identical structure but with sign-extending shifts.
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
    __m128i a0 = _mm_srai_epi16(_mm_slli_si128(a.val, 1), 8); // even
    __m128i a1 = _mm_srai_epi16(a.val, 8); // odd
    __m128i b0 = _mm_srai_epi16(_mm_slli_si128(b.val, 1), 8);
    __m128i b1 = _mm_srai_epi16(b.val, 8);
    __m128i p0 = _mm_madd_epi16(a0, b0);
    __m128i p1 = _mm_madd_epi16(a1, b1);
    return v_int32x4(_mm_add_epi32(p0, p1));
}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{ return v_dotprod_expand(a, b) + c; }
891
// 16 >> 64
// Sums of 4 adjacent 16-bit products per 64-bit lane (unsigned).
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
{
    // Exact 32-bit products first...
    v_uint32x4 c, d;
    v_mul_expand(a, b, c, d);

    // ...then widen to 64 bits and reduce.
    v_uint64x2 c0, c1, d0, d1;
    v_expand(c, c0, c1);
    v_expand(d, d0, d1);

    // Unpack pairs matching quads so each output lane sums 4 products in order.
    c0 += c1; d0 += d1;
    return v_uint64x2(_mm_add_epi64(
        _mm_unpacklo_epi64(c0.val, d0.val),
        _mm_unpackhi_epi64(c0.val, d0.val)
    ));
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand(a, b) + c; }

// Signed variant: madd already gives pairwise sums; expand and reduce to quads.
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
    v_int32x4 prod = v_dotprod(a, b);
    v_int64x2 c, d;
    v_expand(prod, c, d);
    return v_int64x2(_mm_add_epi64(
        _mm_unpacklo_epi64(c.val, d.val),
        _mm_unpackhi_epi64(c.val, d.val)
    ));
}
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_dotprod_expand(a, b) + c; }
923
// 32 >> 64f
// Pairwise products of 32-bit ints accumulated as doubles.
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{
#if CV_SSE4_1
    // Exact integer dot product, then convert the two 64-bit sums to double.
    return v_cvt_f64(v_dotprod(a, b));
#else
    // SSE2 path: convert first, multiply in double, then add matching pairs.
    v_float64x2 c = v_cvt_f64(a) * v_cvt_f64(b);
    v_float64x2 d = v_cvt_f64_high(a) * v_cvt_f64_high(b);

    return v_float64x2(_mm_add_pd(
        _mm_unpacklo_pd(c.val, d.val),
        _mm_unpackhi_pd(c.val, d.val)
    ));
#endif
}
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_dotprod_expand(a, b) + c; }
941
//////// Fast Dot Product ////////
// "fast" variants are allowed to produce sums in a different lane order;
// for these widths the exact implementation is already optimal, so they
// simply forward to v_dotprod.

// 16 >> 32
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
{ return v_dotprod(a, b); }
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_dotprod(a, b) + c; }

// 32 >> 64
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_dotprod(a, b); }
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{ return v_dotprod_fast(a, b) + c; }
955
// 8 >> 32
// Fast form: pairs low-half with high-half lanes rather than even/odd bytes.
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
{
    // Zero-extended bytes fit in signed 16-bit lanes, so madd is exact.
    __m128i a0 = v_expand_low(a).val;
    __m128i a1 = v_expand_high(a).val;
    __m128i b0 = v_expand_low(b).val;
    __m128i b1 = v_expand_high(b).val;
    __m128i p0 = _mm_madd_epi16(a0, b0);
    __m128i p1 = _mm_madd_epi16(a1, b1);
    return v_uint32x4(_mm_add_epi32(p0, p1));
}
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{ return v_dotprod_expand_fast(a, b) + c; }

inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{
#if CV_SSE4_1
    // SSE4.1: cheap sign-extension of the low 8 bytes via cvtepi8_epi16.
    __m128i a0 = _mm_cvtepi8_epi16(a.val);
    __m128i a1 = v_expand_high(a).val;
    __m128i b0 = _mm_cvtepi8_epi16(b.val);
    __m128i b1 = v_expand_high(b).val;
    __m128i p0 = _mm_madd_epi16(a0, b0);
    __m128i p1 = _mm_madd_epi16(a1, b1);
    return v_int32x4(_mm_add_epi32(p0, p1));
#else
    // SSE2: fall back to the exact even/odd implementation.
    return v_dotprod_expand(a, b);
#endif
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{ return v_dotprod_expand_fast(a, b) + c; }
986
// 16 >> 64
// Fast form skips the lane-ordering unpacks of the exact version and just
// adds the widened partial sums together.
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
{
    v_uint32x4 c, d;
    v_mul_expand(a, b, c, d);

    v_uint64x2 c0, c1, d0, d1;
    v_expand(c, c0, c1);
    v_expand(d, d0, d1);

    c0 += c1; d0 += d1;
    return c0 + d0;
}
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }

inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{
    // madd gives pairwise 32-bit sums; widen and fold halves together.
    v_int32x4 prod = v_dotprod(a, b);
    v_int64x2 c, d;
    v_expand(prod, c, d);
    return c + d;
}
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }
1012
// 32 >> 64f
// Forward-declare v_fma (defined below) so the fast forms can fuse the adds.
v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c);
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); }
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); }
1019
// Bitwise &, |, ^ (plus compound forms via OPENCV_HAL_IMPL_SSE_BIN_OP) and
// unary ~, which is implemented as XOR with an all-ones constant of the
// matching register type.
#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ \
    return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \
}

OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint8x16, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int8x16, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint16x8, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int16x8, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint32x4, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint64x2, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int64x2, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1)))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1)))
1039
// Element-wise square root.
inline v_float32x4 v_sqrt(const v_float32x4& x)
{ return v_float32x4(_mm_sqrt_ps(x.val)); }

// Approximate reciprocal square root refined with one Newton-Raphson step:
// t <- t * (1.5 - 0.5 * x * t * t).
inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f);
    __m128 t = x.val;
    __m128 h = _mm_mul_ps(t, _0_5);
    t = _mm_rsqrt_ps(t);
    t = _mm_mul_ps(t, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t, t), h)));
    return v_float32x4(t);
}

inline v_float64x2 v_sqrt(const v_float64x2& x)
{ return v_float64x2(_mm_sqrt_pd(x.val)); }

// No packed-double rsqrt instruction exists: compute 1.0 / sqrt(x) exactly.
inline v_float64x2 v_invsqrt(const v_float64x2& x)
{
    const __m128d v_1 = _mm_set1_pd(1.);
    return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val)));
}
1061
// |x| of a signed vector, returned as the unsigned type of the same width:
// combine x with its negation using min (unsigned domain) or max (signed domain).
#define OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(_Tpuvec, _Tpsvec, func, suffix, subWidth) \
inline _Tpuvec v_abs(const _Tpsvec& x) \
{ return _Tpuvec(_mm_##func##_ep##suffix(x.val, _mm_sub_ep##subWidth(_mm_setzero_si128(), x.val))); }

OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint8x16, v_int8x16, min, u8, i8)
OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint16x8, v_int16x8, max, i16, i16)
// 32-bit: f is an all-ones mask on negative lanes, s is 1 on negative lanes;
// (x ^ f) + s is the two's-complement negation applied only where needed.
inline v_uint32x4 v_abs(const v_int32x4& x)
{
    __m128i s = _mm_srli_epi32(x.val, 31);
    __m128i f = _mm_srai_epi32(x.val, 31);
    return v_uint32x4(_mm_add_epi32(_mm_xor_si128(x.val, f), s));
}
// Float abs: clear the IEEE sign bit.
inline v_float32x4 v_abs(const v_float32x4& x)
{ return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); }
inline v_float64x2 v_abs(const v_float64x2& x)
{
    // All-ones shifted right by one bit = 0x7fff...ff mask for doubles.
    return v_float64x2(_mm_and_pd(x.val,
        _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1))));
}
1081
1082 // TODO: exp, log, sin, cos
1083
// Generates a named binary function wrapping one SSE intrinsic.
#define OPENCV_HAL_IMPL_SSE_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
}

// min/max for the type combinations SSE2 supports natively;
// the remaining combinations are emulated below.
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_min, _mm_min_epu8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_max, _mm_max_epu8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_min, _mm_min_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_max, _mm_max_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_min, _mm_min_ps)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_max, _mm_max_ps)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_min, _mm_min_pd)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd)
1098
// Signed 8-bit min: SSE4.1 has it directly; SSE2 flips the sign bit
// (XOR 0x80) to map signed order onto unsigned order, uses the unsigned
// min, then flips back.
inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b)
{
#if CV_SSE4_1
    return v_int8x16(_mm_min_epi8(a.val, b.val));
#else
    __m128i delta = _mm_set1_epi8((char)-128);
    return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta),
                                                       _mm_xor_si128(b.val, delta))));
#endif
}
// Signed 8-bit max: same sign-bit-flip trick with the unsigned max.
inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b)
{
#if CV_SSE4_1
    return v_int8x16(_mm_max_epi8(a.val, b.val));
#else
    __m128i delta = _mm_set1_epi8((char)-128);
    return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta),
                                                       _mm_xor_si128(b.val, delta))));
#endif
}
// Unsigned 16-bit min: a - sat(a - b) == min(a, b) via saturating subtraction.
inline v_uint16x8 v_min(const v_uint16x8& a, const v_uint16x8& b)
{
#if CV_SSE4_1
    return v_uint16x8(_mm_min_epu16(a.val, b.val));
#else
    return v_uint16x8(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, b.val)));
#endif
}
// Unsigned 16-bit max: sat(a - b) + b == max(a, b).
inline v_uint16x8 v_max(const v_uint16x8& a, const v_uint16x8& b)
{
#if CV_SSE4_1
    return v_uint16x8(_mm_max_epu16(a.val, b.val));
#else
    return v_uint16x8(_mm_adds_epu16(_mm_subs_epu16(a.val, b.val), b.val));
#endif
}
// Unsigned 32-bit min: bias both operands by 0x80000000 so that the signed
// compare orders them as unsigned values, then select per-lane.
inline v_uint32x4 v_min(const v_uint32x4& a, const v_uint32x4& b)
{
#if CV_SSE4_1
    return v_uint32x4(_mm_min_epu32(a.val, b.val));
#else
    __m128i delta = _mm_set1_epi32((int)0x80000000);
    __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
    return v_uint32x4(v_select_si128(mask, b.val, a.val));
#endif
}
inline v_uint32x4 v_max(const v_uint32x4& a, const v_uint32x4& b)
{
#if CV_SSE4_1
    return v_uint32x4(_mm_max_epu32(a.val, b.val));
#else
    __m128i delta = _mm_set1_epi32((int)0x80000000);
    __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
    return v_uint32x4(v_select_si128(mask, a.val, b.val));
#endif
}
// Signed 32-bit min/max: compare then blend (no bias needed).
inline v_int32x4 v_min(const v_int32x4& a, const v_int32x4& b)
{
#if CV_SSE4_1
    return v_int32x4(_mm_min_epi32(a.val, b.val));
#else
    return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), b.val, a.val));
#endif
}
inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b)
{
#if CV_SSE4_1
    return v_int32x4(_mm_max_epi32(a.val, b.val));
#else
    return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), a.val, b.val));
#endif
}
1171
// All six comparison operators for a signed/unsigned type pair.
// Results are per-lane masks (all ones / all zeros). SSE only has signed
// cmpgt, so unsigned comparisons XOR both sides with the sign bit (sbit)
// first; !=, <=, >= are obtained by inverting ==, >, < with an all-ones XOR.
#define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \
inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
{ return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \
} \
inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \
} \
inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    __m128i not_mask = _mm_set1_epi32(-1); \
    __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \
    return _Tpuvec(_mm_xor_si128(res, not_mask)); \
} \
inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    __m128i not_mask = _mm_set1_epi32(-1); \
    __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \
    return _Tpuvec(_mm_xor_si128(res, not_mask)); \
} \
inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \
} \
inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \
} \
inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \
}

OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)-128)
OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768)
OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000)
1233
// Float comparisons map one-to-one onto the SSE cmp intrinsics;
// each lane of the result is an all-ones/all-zeros mask.
#define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); }

OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)
OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)
1250
// 64-bit equality: SSE4.1 has _mm_cmpeq_epi64; the SSE2 fallback compares the
// two 32-bit halves and ANDs each half's result with its partner's (a 64-bit
// lane is equal only if both halves are).
#if CV_SSE4_1
#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpeq_epi64(a.val, b.val)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); }
#else
#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ __m128i cmp = _mm_cmpeq_epi32(a.val, b.val); \
  return _Tpvec(_mm_and_si128(cmp, _mm_shuffle_epi32(cmp, _MM_SHUFFLE(2, 3, 0, 1)))); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); }
#endif

OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2)
OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2)

// Lane mask that is all-ones where the value is not NaN (cmpord: x == x).
inline v_float32x4 v_not_nan(const v_float32x4& a)
{ return v_float32x4(_mm_cmpord_ps(a.val, a.val)); }
inline v_float64x2 v_not_nan(const v_float64x2& a)
{ return v_float64x2(_mm_cmpord_pd(a.val, a.val)); }
1273
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16,v_add_wrap,_mm_add_epi8)1274 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
1275 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
1276 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
1277 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_add_wrap, _mm_add_epi16)
1278 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8)
1279 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8)
1280 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16)
1281 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16)
1282 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_mul_wrap, _mm_mullo_epi16)
1283 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_mul_wrap, _mm_mullo_epi16)
1284
1285 inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b)
1286 {
1287 __m128i ad = _mm_srai_epi16(a.val, 8);
1288 __m128i bd = _mm_srai_epi16(b.val, 8);
1289 __m128i p0 = _mm_mullo_epi16(a.val, b.val); // even
1290 __m128i p1 = _mm_slli_epi16(_mm_mullo_epi16(ad, bd), 8); // odd
1291 const __m128i b01 = _mm_set1_epi32(0xFF00FF00);
1292 return v_uint8x16(_v128_blendv_epi8(p0, p1, b01));
1293 }
v_mul_wrap(const v_int8x16 & a,const v_int8x16 & b)1294 inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
1295 {
1296 return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
1297 }
1298
/** Absolute difference **/

// Unsigned: exactly one of (a-b), (b-a) wraps negative; their wrapped sum
// equals |a-b|.
inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
{ return v_add_wrap(a - b, b - a); }
inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
{ return v_add_wrap(a - b, b - a); }
inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
{ return v_max(a, b) - v_min(a, b); }

// Signed inputs, unsigned result: conditional negation of the wrapped
// difference ((d ^ m) - m negates d where the mask m is all-ones).
inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
{
    v_int8x16 d = v_sub_wrap(a, b);
    v_int8x16 m = a < b;
    return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
}
inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
{
    return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)));
}
inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{
    v_int32x4 d = a - b;
    v_int32x4 m = a < b;
    return v_reinterpret_as_u32((d ^ m) - m);
}

/** Saturating absolute difference **/
// Result stays in the signed type; intermediate subtraction saturates.
inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
{
    v_int8x16 d = a - b; // saturating subtraction (see operator - above)
    v_int8x16 m = a < b;
    return (d ^ m) - m;
}
inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
{ return v_max(a, b) - v_min(a, b); }
1334
1335
// Fused multiply-add a*b + c. Integer version is a plain multiply-add.
inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return a * b + c;
}

inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_fma(a, b, c);
}

// Float versions use the FMA3 instruction when available (single rounding),
// otherwise multiply then add (two roundings).
inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
#if CV_FMA3
    return v_float32x4(_mm_fmadd_ps(a.val, b.val, c.val));
#else
    return v_float32x4(_mm_add_ps(_mm_mul_ps(a.val, b.val), c.val));
#endif
}

inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
#if CV_FMA3
    return v_float64x2(_mm_fmadd_pd(a.val, b.val, c.val));
#else
    return v_float64x2(_mm_add_pd(_mm_mul_pd(a.val, b.val), c.val));
#endif
}
1363
// Float absdiff / magnitude / sqr_magnitude / muladd.
// absdiff clears the IEEE sign bit of (a - b) with absmask_vec.
#define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpreg absmask = _mm_castsi128_##suffix(absmask_vec); \
    return _Tpvec(_mm_and_##suffix(_mm_sub_##suffix(a.val, b.val), absmask)); \
} \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpvec res = v_fma(a, a, b*b); \
    return _Tpvec(_mm_sqrt_##suffix(res.val)); \
} \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
    return v_fma(a, a, b*b); \
} \
inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ \
    return v_fma(a, b, c); \
}

// 0x7fffffff per float lane; for doubles, all-ones >> 1 per 64-bit lane.
OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff))
OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1))
1386
// Lane shifts: << and v_shl are logical for both signednesses; >> and v_shr
// are logical for the unsigned type and arithmetic (via the srai argument)
// for the signed type. Runtime-imm operators and template-imm forms.
#define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
{ \
    return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
} \
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
{ \
    return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
} \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
{ \
    return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
} \
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
{ \
    return _Tpsvec(srai(a.val, imm)); \
} \
template<int imm> \
inline _Tpuvec v_shl(const _Tpuvec& a) \
{ \
    return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
} \
template<int imm> \
inline _Tpsvec v_shl(const _Tpsvec& a) \
{ \
    return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
} \
template<int imm> \
inline _Tpuvec v_shr(const _Tpuvec& a) \
{ \
    return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
} \
template<int imm> \
inline _Tpsvec v_shr(const _Tpsvec& a) \
{ \
    return _Tpsvec(srai(a.val, imm)); \
}

// 64-bit uses v_srai_epi64 because SSE has no arithmetic 64-bit right shift.
OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16)
OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32)
OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64)
1428
namespace hal_sse_internal
{
    // Compile-time dispatch for a byte-wise PALIGNR-style concat-and-shift:
    // the boolean template flags select one specialization per imm range.
    template <int imm,
        bool is_invalid = ((imm < 0) || (imm > 16)),
        bool is_first = (imm == 0),
        bool is_half = (imm == 8),
        bool is_second = (imm == 16),
        bool is_other = (((imm > 0) && (imm < 8)) || ((imm > 8) && (imm < 16)))>
    class v_sse_palignr_u8_class;

    // Invalid imm: declared but never defined, so use is a compile error.
    template <int imm>
    class v_sse_palignr_u8_class<imm, true, false, false, false, false>;

    // imm == 0: the result is simply a.
    template <int imm>
    class v_sse_palignr_u8_class<imm, false, true, false, false, false>
    {
    public:
        inline __m128i operator()(const __m128i& a, const __m128i&) const
        {
            return a;
        }
    };

    // imm == 8: high half of a followed by low half of b.
    template <int imm>
    class v_sse_palignr_u8_class<imm, false, false, true, false, false>
    {
    public:
        inline __m128i operator()(const __m128i& a, const __m128i& b) const
        {
            return _mm_unpacklo_epi64(_mm_unpackhi_epi64(a, a), b);
        }
    };

    // imm == 16: the result is simply b.
    template <int imm>
    class v_sse_palignr_u8_class<imm, false, false, false, true, false>
    {
    public:
        inline __m128i operator()(const __m128i&, const __m128i& b) const
        {
            return b;
        }
    };

    // General imm: SSSE3 palignr, or an SSE2 shift-and-OR fallback.
    template <int imm>
    class v_sse_palignr_u8_class<imm, false, false, false, false, true>
    {
#if CV_SSSE3
    public:
        inline __m128i operator()(const __m128i& a, const __m128i& b) const
        {
            return _mm_alignr_epi8(b, a, imm);
        }
#else
    public:
        inline __m128i operator()(const __m128i& a, const __m128i& b) const
        {
            enum { imm2 = (sizeof(__m128i) - imm) };
            return _mm_or_si128(_mm_srli_si128(a, imm), _mm_slli_si128(b, imm2));
        }
#endif
    };

    // Entry point: bytes imm..15 of a followed by bytes 0..imm-1 of b.
    template <int imm>
    inline __m128i v_sse_palignr_u8(const __m128i& a, const __m128i& b)
    {
        CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_sse_palignr_u8.");
        return v_sse_palignr_u8_class<imm>()(a, b);
    }
}
1498
1499 template<int imm, typename _Tpvec>
v_rotate_right(const _Tpvec & a)1500 inline _Tpvec v_rotate_right(const _Tpvec &a)
1501 {
1502 using namespace hal_sse_internal;
1503 enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
1504 return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
1505 _mm_srli_si128(
1506 v_sse_reinterpret_as<__m128i>(a.val), imm2)));
1507 }
1508
1509 template<int imm, typename _Tpvec>
v_rotate_left(const _Tpvec & a)1510 inline _Tpvec v_rotate_left(const _Tpvec &a)
1511 {
1512 using namespace hal_sse_internal;
1513 enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
1514 return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
1515 _mm_slli_si128(
1516 v_sse_reinterpret_as<__m128i>(a.val), imm2)));
1517 }
1518
1519 template<int imm, typename _Tpvec>
v_rotate_right(const _Tpvec & a,const _Tpvec & b)1520 inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
1521 {
1522 using namespace hal_sse_internal;
1523 enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
1524 return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
1525 v_sse_palignr_u8<imm2>(
1526 v_sse_reinterpret_as<__m128i>(a.val),
1527 v_sse_reinterpret_as<__m128i>(b.val))));
1528 }
1529
1530 template<int imm, typename _Tpvec>
v_rotate_left(const _Tpvec & a,const _Tpvec & b)1531 inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
1532 {
1533 using namespace hal_sse_internal;
1534 enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
1535 return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
1536 v_sse_palignr_u8<imm2>(
1537 v_sse_reinterpret_as<__m128i>(b.val),
1538 v_sse_reinterpret_as<__m128i>(a.val))));
1539 }
1540
// Generates the complete load/store API for one integer vector type:
//   v_load / v_load_aligned / v_load_low (low 64 bits) /
//   v_load_halves (two 64-bit halves from separate pointers) /
//   v_store (plain and StoreMode-dispatched) / v_store_aligned /
//   v_store_aligned_nocache (non-temporal streaming store) /
//   v_store_low / v_store_high.
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(_mm_loadl_epi64((const __m128i*)ptr)); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ \
    return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
                                     _mm_loadl_epi64((const __m128i*)ptr1))); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ _mm_storeu_si128((__m128i*)ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ _mm_store_si128((__m128i*)ptr, a.val); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ _mm_stream_si128((__m128i*)ptr, a.val); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
{ \
    if( mode == hal::STORE_UNALIGNED ) \
        _mm_storeu_si128((__m128i*)ptr, a.val); \
    else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
        _mm_stream_si128((__m128i*)ptr, a.val); \
    else \
        _mm_store_si128((__m128i*)ptr, a.val); \
} \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, a.val); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a.val, a.val)); }

// One instantiation per integer vector type.
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint8x16, uchar)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int8x16, schar)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint16x8, ushort)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int16x8, short)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint32x4, unsigned)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int32x4, int)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint64x2, uint64)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int64x2, int64)
1581
// Same load/store API for the floating-point vector types. Float registers
// have no 64-bit load/store intrinsics of their own, so the half-register
// operations round-trip through __m128i via _mm_cast* (bit-pattern casts,
// no conversion).
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(_mm_loadu_##suffix(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(_mm_load_##suffix(ptr)); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(_mm_castsi128_##suffix(_mm_loadl_epi64((const __m128i*)ptr))); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ \
    return _Tpvec(_mm_castsi128_##suffix( \
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
                           _mm_loadl_epi64((const __m128i*)ptr1)))); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ _mm_storeu_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ _mm_store_##suffix(ptr, a.val); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ _mm_stream_##suffix(ptr, a.val); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
{ \
    if( mode == hal::STORE_UNALIGNED ) \
        _mm_storeu_##suffix(ptr, a.val); \
    else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
        _mm_stream_##suffix(ptr, a.val); \
    else \
        _mm_store_##suffix(ptr, a.val); \
} \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ \
    __m128i a1 = _mm_cast##suffix##_si128(a.val); \
    _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a1, a1)); \
}

OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
1620
// Horizontal sum of all 16 unsigned bytes.
// _mm_sad_epu8 against zero produces two partial sums, one per 64-bit half;
// the halves are added and the low 32 bits extracted.
inline unsigned v_reduce_sum(const v_uint8x16& a)
{
    __m128i half = _mm_sad_epu8(a.val, _mm_setzero_si128());
    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
}
// Horizontal sum of all 16 signed bytes.
// XOR with 0x80 biases each lane into the unsigned range [0,255] so the
// unsigned SAD can be used; the bias is removed afterwards by subtracting
// 16 lanes * 128 = 2048.
inline int v_reduce_sum(const v_int8x16& a)
{
    __m128i half = _mm_set1_epi8((schar)-128);
    half = _mm_sad_epu8(_mm_xor_si128(a.val, half), _mm_setzero_si128());
    return _mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half))) - 2048;
}
// Generates v_reduce_min/v_reduce_max over 16 byte lanes by log2 folding:
// the register is min/max-ed against itself shifted by 8, 4, 2 and 1 bytes,
// leaving the result in lane 0. Only an unsigned byte min/max intrinsic
// exists (SSE2), so the signed variant first maps lanes into the unsigned
// ordering by XOR-ing with 0x80, and maps the scalar result back the same way.
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(func) \
inline schar v_reduce_##func(const v_int8x16& a) \
{ \
    __m128i val = a.val; \
    __m128i smask = _mm_set1_epi8((schar)-128); \
    val = _mm_xor_si128(val, smask); \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
    return (schar)_mm_cvtsi128_si32(val) ^ (schar)-128; \
} \
inline uchar v_reduce_##func(const v_uint8x16& a) \
{ \
    __m128i val = a.val; \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
    return (uchar)_mm_cvtsi128_si32(val); \
}
OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(min)
1655
// Generates v_reduce_min/v_reduce_max over 8 16-bit lanes by log2 folding
// (shift by 8, 4, 2 bytes). Only a *signed* 16-bit min/max intrinsic exists
// in SSE2, so the unsigned variant XORs each lane with 'sbit' (0x8000) to
// map unsigned ordering onto signed ordering, and XORs the scalar result
// back. Note: 'unsigned scalartype' relies on scalartype being the keyword
// 'short' so the tokens form 'unsigned short'.
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \
inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
{ \
    __m128i val = a.val; \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
    return (scalartype)_mm_cvtsi128_si32(val); \
} \
inline unsigned scalartype v_reduce_##func(const v_u##_Tpvec& a) \
{ \
    __m128i val = a.val; \
    __m128i smask = _mm_set1_epi16(sbit); \
    val = _mm_xor_si128(val, smask); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
    return (unsigned scalartype)(_mm_cvtsi128_si32(val) ^ sbit); \
}
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, max, epi16, (short)-32768)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, min, epi16, (short)-32768)
1677
// Generates v_reduce_sum over 4 lanes by folding halves: add the register
// to itself shifted by 8 then 4 bytes, then extract lane 0 with the
// 'extract' move. cast_from/cast_to adapt the byte shift (integer-only)
// to float registers.
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, cast_from, cast_to, extract) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
{ \
    regtype val = a.val; \
    val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 8))); \
    val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 4))); \
    return (scalartype)_mm_cvt##extract(val); \
}
1686
// Generates a 4-lane min/max reduction via a scalar fallback: store the
// vector to an aligned buffer and combine the four lanes pairwise with
// 'scalar_func' (std::min / std::max).
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    scalartype CV_DECL_ALIGNED(16) buf[4]; \
    v_store_aligned(buf, a); \
    scalartype s0 = scalar_func(buf[0], buf[1]); \
    scalartype s1 = scalar_func(buf[2], buf[3]); \
    return scalar_func(s0, s1); \
}
1696
// 4-lane sums: integer lanes extract via si128_si32, float via ss_f32.
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)
1700
// 16-bit sums: widen both halves to 32-bit lanes first (so the sum cannot
// overflow 16 bits), then reduce the 32-bit vector.
inline int v_reduce_sum(const v_int16x8& a)
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
inline unsigned v_reduce_sum(const v_uint16x8& a)
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1705
v_reduce_sum(const v_uint64x2 & a)1706 inline uint64 v_reduce_sum(const v_uint64x2& a)
1707 {
1708 uint64 CV_DECL_ALIGNED(32) idx[2];
1709 v_store_aligned(idx, a);
1710 return idx[0] + idx[1];
1711 }
v_reduce_sum(const v_int64x2 & a)1712 inline int64 v_reduce_sum(const v_int64x2& a)
1713 {
1714 int64 CV_DECL_ALIGNED(32) idx[2];
1715 v_store_aligned(idx, a);
1716 return idx[0] + idx[1];
1717 }
v_reduce_sum(const v_float64x2 & a)1718 inline double v_reduce_sum(const v_float64x2& a)
1719 {
1720 double CV_DECL_ALIGNED(32) idx[2];
1721 v_store_aligned(idx, a);
1722 return idx[0] + idx[1];
1723 }
1724
// Sums each of the four input vectors and packs the four scalar sums into
// one result: { sum(a), sum(b), sum(c), sum(d) }.
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
#if CV_SSE3
    // SSE3 horizontal adds: two levels of hadd produce all four sums.
    __m128 ab = _mm_hadd_ps(a.val, b.val);
    __m128 cd = _mm_hadd_ps(c.val, d.val);
    return v_float32x4(_mm_hadd_ps(ab, cd));
#else
    // SSE2 fallback: unpack-based 4x4 transpose folded into the additions.
    __m128 ac = _mm_add_ps(_mm_unpacklo_ps(a.val, c.val), _mm_unpackhi_ps(a.val, c.val));
    __m128 bd = _mm_add_ps(_mm_unpacklo_ps(b.val, d.val), _mm_unpackhi_ps(b.val, d.val));
    return v_float32x4(_mm_add_ps(_mm_unpacklo_ps(ac, bd), _mm_unpackhi_ps(ac, bd)));
#endif
}
1738
// Scalar-fallback min/max reductions for the 4-lane types.
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)
1745
// Sum of absolute differences over 16 unsigned bytes: _mm_sad_epu8 yields
// one partial sum per 64-bit half; add the halves and take the low 32 bits.
inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
{
    __m128i half = _mm_sad_epu8(a.val, b.val);
    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
}
// Signed-byte SAD: adding the same bias (0x7f, with wraparound) to both
// operands maps them into the unsigned domain without changing any lane's
// absolute difference, so the unsigned SAD applies directly.
inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
{
    __m128i half = _mm_set1_epi8(0x7f);
    half = _mm_sad_epu8(_mm_add_epi8(a.val, half), _mm_add_epi8(b.val, half));
    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
}
// 16-bit SAD: take per-lane absolute differences, widen to 32-bit halves
// so the accumulation cannot overflow, then sum.
inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
{
    v_uint32x4 l, h;
    v_expand(v_absdiff(a, b), l, h);
    return v_reduce_sum(l + h);
}
inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
{
    v_uint32x4 l, h;
    v_expand(v_absdiff(a, b), l, h);
    return v_reduce_sum(l + h);
}
// 32-bit and float SADs: per-lane absolute difference followed by a
// horizontal sum (4 lanes, so no widening is performed here).
inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
{
    return v_reduce_sum(v_absdiff(a, b));
}
inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
{
    return v_reduce_sum(v_absdiff(a, b));
}
inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
{
    return v_reduce_sum(v_absdiff(a, b));
}
1781
// Per-byte population count via the classic SWAR method: sum bits in pairs,
// then nibbles, then bytes. The shifts operate on 32-bit lanes, but each
// mask clears any bit that was shifted in across a byte boundary, so every
// byte of the result holds the popcount of the corresponding input byte.
inline v_uint8x16 v_popcount(const v_uint8x16& a)
{
    __m128i m1 = _mm_set1_epi32(0x55555555);
    __m128i m2 = _mm_set1_epi32(0x33333333);
    __m128i m4 = _mm_set1_epi32(0x0f0f0f0f);
    __m128i p = a.val;
    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1));
    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2));
    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4));
    return v_uint8x16(p);
}
// Per-16-bit-lane popcount: add each byte's count to its neighbour
// (one-byte rotate), then keep only the low byte of each 16-bit lane.
inline v_uint16x8 v_popcount(const v_uint16x8& a)
{
    v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
    p += v_rotate_right<1>(p);
    return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff);
}
// Per-32-bit-lane popcount: fold the four byte counts of each lane into
// its lowest byte with 1- and 2-byte rotates, then mask.
inline v_uint32x4 v_popcount(const v_uint32x4& a)
{
    v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
    p += v_rotate_right<1>(p);
    p += v_rotate_right<2>(p);
    return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff);
}
// Per-64-bit-lane popcount: _mm_sad_epu8 against zero sums the eight byte
// counts of each half directly into that half's 64-bit lane.
inline v_uint64x2 v_popcount(const v_uint64x2& a)
{
    return v_uint64x2(_mm_sad_epu8(v_popcount(v_reinterpret_as_u8(a)).val, _mm_setzero_si128()));
}
// Signed overloads: popcount is sign-agnostic, so simply reinterpret as
// the unsigned type of the same width.
inline v_uint8x16 v_popcount(const v_int8x16& a)
{ return v_popcount(v_reinterpret_as_u8(a)); }
inline v_uint16x8 v_popcount(const v_int16x8& a)
{ return v_popcount(v_reinterpret_as_u16(a)); }
inline v_uint32x4 v_popcount(const v_int32x4& a)
{ return v_popcount(v_reinterpret_as_u32(a)); }
inline v_uint64x2 v_popcount(const v_int64x2& a)
{ return v_popcount(v_reinterpret_as_u64(a)); }
1818
// Generates v_signmask (one bit per lane, from the lane's sign/MSB),
// v_check_all (every lane's mask bit set: movemask equals 'allmask') and
// v_check_any (at least one set). 'cast_op' adapts integer registers to
// the ps/pd movemask variants where the epi8 one has the wrong granularity.
#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, cast_op, allmask) \
inline int v_signmask(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)); } \
inline bool v_check_all(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)) == allmask; } \
inline bool v_check_any(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)) != 0; }
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, 65535)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, 65535)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, ps, _mm_castsi128_ps, 15)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, ps, _mm_castsi128_ps, 15)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint64x2, pd, _mm_castsi128_pd, 3)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int64x2, pd, _mm_castsi128_pd, 3)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, 15)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, 3)
1831
// 16-bit variants: there is no 16-bit movemask, so v_signmask packs lanes
// to saturated bytes first and keeps the low 8 mask bits. v_check_all/any
// instead use the byte movemask with 0xaaaa, which selects only the high
// byte (and therefore the sign bit) of each 16-bit lane.
#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(_Tpvec) \
inline int v_signmask(const _Tpvec& a) { return _mm_movemask_epi8(_mm_packs_epi16(a.val, a.val)) & 255; } \
inline bool v_check_all(const _Tpvec& a) { return (_mm_movemask_epi8(a.val) & 0xaaaa) == 0xaaaa; } \
inline bool v_check_any(const _Tpvec& a) { return (_mm_movemask_epi8(a.val) & 0xaaaa) != 0; }
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(v_uint16x8)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(v_int16x8)
1838
// Index of the first lane whose mask/sign bit is set: count trailing zeros
// of the byte-level sign mask, then divide by the lane size in bytes.
// All types reinterpret to s8 so a single movemask path is used.
inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1849
#if CV_SSE4_1
// v_select(mask, a, b): per-lane a-if-mask-else-b, using the SSE4.1
// variable blend (selects by the MSB of each mask element).
#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, cast_ret, cast, suffix) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(cast_ret(_mm_blendv_##suffix(cast(b.val), cast(a.val), cast(mask.val)))); \
}

OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, _mm_castps_si128, _mm_castsi128_ps, ps)
OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, _mm_castps_si128, _mm_castsi128_ps, ps)
// OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, TBD, TBD, pd)
// OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, TBD, TBD, ps)
OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP, ps)
OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, OPENCV_HAL_NOP, OPENCV_HAL_NOP, pd)

#else // CV_SSE4_1

// Pre-SSE4.1 fallback: branchless bit-select b ^ ((b ^ a) & mask).
// Note this selects by every mask bit, so masks are expected to be
// all-ones/all-zeros per lane (as produced by the comparison operators).
#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(_mm_xor_##suffix(b.val, _mm_and_##suffix(_mm_xor_##suffix(b.val, a.val), mask.val))); \
}

OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128)
// OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128)
// OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps)
OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd)
#endif
1887
/* Expand */
// Generates the widening conversions for one (narrow, wide) type pair:
//   v_expand       - both halves widened to two vectors
//   v_expand_low   - low half widened ('intrin' does the conversion)
//   v_expand_high  - high half widened (the project's '<intrin>_high' helper)
//   v_load_expand  - load 64 bits from memory and widen
#define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
    b0.val = intrin(a.val); \
    b1.val = __CV_CAT(intrin, _high)(a.val); \
} \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
{ return _Tpwvec(intrin(a.val)); } \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
{ return _Tpwvec(__CV_CAT(intrin, _high)(a.val)); } \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
    __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
    return _Tpwvec(intrin(a)); \
}

OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, _v128_cvtepu8_epi16)
OPENCV_HAL_IMPL_SSE_EXPAND(v_int8x16, v_int16x8, schar, _v128_cvtepi8_epi16)
OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, _v128_cvtepu16_epi32)
OPENCV_HAL_IMPL_SSE_EXPAND(v_int16x8, v_int32x4, short, _v128_cvtepi16_epi32)
OPENCV_HAL_IMPL_SSE_EXPAND(v_uint32x4, v_uint64x2, unsigned, _v128_cvtepu32_epi64)
OPENCV_HAL_IMPL_SSE_EXPAND(v_int32x4, v_int64x2, int, _v128_cvtepi32_epi64)
1911
// Generates v_load_expand_q: load 4 bytes (as one 32-bit scalar) and widen
// each byte to a 32-bit lane with the zero/sign-extending helper.
#define OPENCV_HAL_IMPL_SSE_EXPAND_Q(_Tpvec, _Tp, intrin) \
inline _Tpvec v_load_expand_q(const _Tp* ptr) \
{ \
    __m128i a = _mm_cvtsi32_si128(*(const int*)ptr); \
    return _Tpvec(intrin(a)); \
}

OPENCV_HAL_IMPL_SSE_EXPAND_Q(v_uint32x4, uchar, _v128_cvtepu8_epi32)
OPENCV_HAL_IMPL_SSE_EXPAND_Q(v_int32x4, schar, _v128_cvtepi8_epi32)
1921
// Generates lane-shuffling helpers for one vector type:
//   v_zip          - interleave lanes of a0 and a1 (lo half -> b0, hi -> b1)
//   v_combine_low  - low 64-bit halves of a and b side by side
//   v_combine_high - high 64-bit halves of a and b side by side
//   v_recombine    - both of the above at once
// cast_from/cast_to route float registers through __m128i for the
// integer-only 64-bit unpacks.
#define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \
inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
{ \
    b0.val = _mm_unpacklo_##suffix(a0.val, a1.val); \
    b1.val = _mm_unpackhi_##suffix(a0.val, a1.val); \
} \
inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
{ \
    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
    return _Tpvec(cast_to(_mm_unpacklo_epi64(a1, b1))); \
} \
inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
{ \
    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
    return _Tpvec(cast_to(_mm_unpackhi_epi64(a1, b1))); \
} \
inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
{ \
    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
    c.val = cast_to(_mm_unpacklo_epi64(a1, b1)); \
    d.val = cast_to(_mm_unpackhi_epi64(a1, b1)); \
}

OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_int16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)
1953
// Reverse the order of all 16 byte lanes.
inline v_uint8x16 v_reverse(const v_uint8x16 &a)
{
#if CV_SSSE3
    // Single byte shuffle with a reversing permutation table.
    static const __m128i perm = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    return v_uint8x16(_mm_shuffle_epi8(a.val, perm));
#else
    // SSE2 fallback: spill to memory and rebuild lane by lane.
    uchar CV_DECL_ALIGNED(32) d[16];
    v_store_aligned(d, a);
    return v_uint8x16(d[15], d[14], d[13], d[12], d[11], d[10], d[9], d[8], d[7], d[6], d[5], d[4], d[3], d[2], d[1], d[0]);
#endif
}
1965
// Signed 8-bit: reuse the unsigned byte reversal via reinterpretation.
inline v_int8x16 v_reverse(const v_int8x16 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }

// Reverse the order of the eight 16-bit lanes.
inline v_uint16x8 v_reverse(const v_uint16x8 &a)
{
#if CV_SSSE3
    static const __m128i perm = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
    return v_uint16x8(_mm_shuffle_epi8(a.val, perm));
#else
    // SSE2: reverse the four 32-bit groups, then swap the 16-bit pair
    // inside each group (low and high 64-bit halves separately).
    __m128i r = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 1, 2, 3));
    r = _mm_shufflelo_epi16(r, _MM_SHUFFLE(2, 3, 0, 1));
    r = _mm_shufflehi_epi16(r, _MM_SHUFFLE(2, 3, 0, 1));
    return v_uint16x8(r);
#endif
}

inline v_int16x8 v_reverse(const v_int16x8 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }

// Reverse the order of the four 32-bit lanes with one dword shuffle.
inline v_uint32x4 v_reverse(const v_uint32x4 &a)
{
    return v_uint32x4(_mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 1, 2, 3)));
}

inline v_int32x4 v_reverse(const v_int32x4 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_float32x4 v_reverse(const v_float32x4 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }

// Reverse the two 64-bit lanes by swapping the register halves.
inline v_uint64x2 v_reverse(const v_uint64x2 &a)
{
    return v_uint64x2(_mm_shuffle_epi32(a.val, _MM_SHUFFLE(1, 0, 3, 2)));
}

inline v_int64x2 v_reverse(const v_int64x2 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }

inline v_float64x2 v_reverse(const v_float64x2 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
2006
// Extract nlanes consecutive lanes starting at index 's' from the
// concatenation (b:a) — an alias for the two-register right rotate.
template<int s, typename _Tpvec>
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
{
    return v_rotate_right<s>(a, b);
}
2012
// Round to nearest (current MXCSR rounding mode, nearest-even by default).
inline v_int32x4 v_round(const v_float32x4& a)
{ return v_int32x4(_mm_cvtps_epi32(a.val)); }

// Floor: round to nearest, then subtract 1 from lanes that rounded up.
// The comparison mask is all-ones (-1) where rounded > original, so a
// plain add applies the correction.
inline v_int32x4 v_floor(const v_float32x4& a)
{
    __m128i a1 = _mm_cvtps_epi32(a.val);
    __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(_mm_cvtepi32_ps(a1), a.val));
    return v_int32x4(_mm_add_epi32(a1, mask));
}

// Ceil: symmetric to floor — add 1 (subtract -1) where rounding went down.
inline v_int32x4 v_ceil(const v_float32x4& a)
{
    __m128i a1 = _mm_cvtps_epi32(a.val);
    __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(a.val, _mm_cvtepi32_ps(a1)));
    return v_int32x4(_mm_sub_epi32(a1, mask));
}

// Truncate toward zero (cvtt ignores the rounding mode).
inline v_int32x4 v_trunc(const v_float32x4& a)
{ return v_int32x4(_mm_cvttps_epi32(a.val)); }
2032
// Double versions: the two results land in the low two int32 lanes, the
// high two lanes are zero.
inline v_int32x4 v_round(const v_float64x2& a)
{ return v_int32x4(_mm_cvtpd_epi32(a.val)); }

// Two-register round: each conversion fills lanes 0..1; pack them into
// one register with a 64-bit unpack.
inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
{
    __m128i ai = _mm_cvtpd_epi32(a.val), bi = _mm_cvtpd_epi32(b.val);
    return v_int32x4(_mm_unpacklo_epi64(ai, bi));
}

// Floor: as the f32 version, but the 64-bit comparison mask must be
// compacted to match the 32-bit result lanes before the correction.
inline v_int32x4 v_floor(const v_float64x2& a)
{
    __m128i a1 = _mm_cvtpd_epi32(a.val);
    __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(_mm_cvtepi32_pd(a1), a.val));
    mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
    return v_int32x4(_mm_add_epi32(a1, mask));
}

// Ceil: symmetric to floor.
inline v_int32x4 v_ceil(const v_float64x2& a)
{
    __m128i a1 = _mm_cvtpd_epi32(a.val);
    __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(a.val, _mm_cvtepi32_pd(a1)));
    mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
    return v_int32x4(_mm_sub_epi32(a1, mask));
}

// Truncate toward zero.
inline v_int32x4 v_trunc(const v_float64x2& a)
{ return v_int32x4(_mm_cvttpd_epi32(a.val)); }
2060
// Generates a 4x4 matrix transpose for 32-bit lane types: interleave rows
// pairwise at 32-bit granularity, then recombine the 64-bit halves.
// cast_from/cast_to route float registers through __m128i for the
// integer-only 64-bit unpacks.
#define OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
                           const _Tpvec& a2, const _Tpvec& a3, \
                           _Tpvec& b0, _Tpvec& b1, \
                           _Tpvec& b2, _Tpvec& b3) \
{ \
    __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); \
    __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \
    __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); \
    __m128i t3 = cast_from(_mm_unpackhi_##suffix(a2.val, a3.val)); \
\
    b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); \
    b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); \
    b2.val = cast_to(_mm_unpacklo_epi64(t2, t3)); \
    b3.val = cast_to(_mm_unpackhi_epi64(t2, t3)); \
}

OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
2081
2082 // load deinterleave
// Deinterleave 32 bytes of 2-channel data (a0 b0 a1 b1 ...) into
// a = {a0..a15}, b = {b0..b15}. Four rounds of byte interleaving between
// the two registers progressively sort even-indexed bytes into 'a' and
// odd-indexed bytes into 'b' (a perfect-shuffle network, SSE2-only).
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
{
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));

    __m128i t10 = _mm_unpacklo_epi8(t00, t01);
    __m128i t11 = _mm_unpackhi_epi8(t00, t01);

    __m128i t20 = _mm_unpacklo_epi8(t10, t11);
    __m128i t21 = _mm_unpackhi_epi8(t10, t11);

    __m128i t30 = _mm_unpacklo_epi8(t20, t21);
    __m128i t31 = _mm_unpackhi_epi8(t20, t21);

    a.val = _mm_unpacklo_epi8(t30, t31);
    b.val = _mm_unpackhi_epi8(t30, t31);
}
2100
// Deinterleave 48 bytes of 3-channel data (a0 b0 c0 a1 b1 c1 ...) into
// the three per-channel registers. Three implementations, best first.
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
{
#if CV_SSE4_1
    // SSE4.1: byte blends gather each channel's bytes (spread across the
    // three loads) into one register, then a single pshufb per channel
    // puts them in order.
    const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
    const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
    __m128i s0 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i s1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
    __m128i s2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
    __m128i a0 = _mm_blendv_epi8(_mm_blendv_epi8(s0, s1, m0), s2, m1);
    __m128i b0 = _mm_blendv_epi8(_mm_blendv_epi8(s1, s2, m0), s0, m1);
    __m128i c0 = _mm_blendv_epi8(_mm_blendv_epi8(s2, s0, m0), s1, m1);
    const __m128i sh_b = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);
    const __m128i sh_g = _mm_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14);
    const __m128i sh_r = _mm_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
    a0 = _mm_shuffle_epi8(a0, sh_b);
    b0 = _mm_shuffle_epi8(b0, sh_g);
    c0 = _mm_shuffle_epi8(c0, sh_r);
    a.val = a0;
    b.val = b0;
    c.val = c0;
#elif CV_SSSE3
    // SSSE3: pshufb each load with rotated copies of one every-3rd-byte
    // pattern, then stitch the channel fragments together with alignr
    // and byte shifts.
    const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14);
    const __m128i m1 = _mm_alignr_epi8(m0, m0, 11);
    const __m128i m2 = _mm_alignr_epi8(m0, m0, 6);

    __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
    __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 32));

    __m128i s0 = _mm_shuffle_epi8(t0, m0);
    __m128i s1 = _mm_shuffle_epi8(t1, m1);
    __m128i s2 = _mm_shuffle_epi8(t2, m2);

    t0 = _mm_alignr_epi8(s1, _mm_slli_si128(s0, 10), 5);
    a.val = _mm_alignr_epi8(s2, t0, 5);

    t1 = _mm_alignr_epi8(_mm_srli_si128(s1, 5), _mm_slli_si128(s0, 5), 6);
    b.val = _mm_alignr_epi8(_mm_srli_si128(s2, 5), t1, 5);

    t2 = _mm_alignr_epi8(_mm_srli_si128(s2, 10), s1, 11);
    c.val = _mm_alignr_epi8(t2, s0, 11);
#else
    // SSE2: repeated interleave of each register with the high half of the
    // next — four rounds of this perfect-shuffle step separate the three
    // channels without any byte-shuffle instruction.
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32));

    __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01));
    __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02);
    __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02));

    __m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11));
    __m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12);
    __m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12));

    __m128i t30 = _mm_unpacklo_epi8(t20, _mm_unpackhi_epi64(t21, t21));
    __m128i t31 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t20, t20), t22);
    __m128i t32 = _mm_unpacklo_epi8(t21, _mm_unpackhi_epi64(t22, t22));

    a.val = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31));
    b.val = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32);
    c.val = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32));
#endif
}
2164
v_load_deinterleave(const uchar * ptr,v_uint8x16 & a,v_uint8x16 & b,v_uint8x16 & c,v_uint8x16 & d)2165 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
2166 {
2167 __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ...
2168 __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
2169 __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // a8 b8 c8 d8 ...
2170 __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48)); // a12 b12 c12 d12 ...
2171
2172 __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ...
2173 __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ...
2174 __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ...
2175 __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ...
2176
2177 u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ...
2178 u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ...
2179 u2 = _mm_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ...
2180 u3 = _mm_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ...
2181
2182 v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ...
2183 v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ...
2184 v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ...
2185 v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ...
2186
2187 a.val = _mm_unpacklo_epi8(v0, v1);
2188 b.val = _mm_unpackhi_epi8(v0, v1);
2189 c.val = _mm_unpacklo_epi8(v2, v3);
2190 d.val = _mm_unpackhi_epi8(v2, v3);
2191 }
2192
v_load_deinterleave(const ushort * ptr,v_uint16x8 & a,v_uint16x8 & b)2193 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
2194 {
2195 __m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1 a2 b2 a3 b3
2196 __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7
2197
2198 __m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5
2199 __m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7
2200 __m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6
2201 __m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7
2202
2203 a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7
2204 b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 ab b3 b4 b5 b6 b7
2205 }
2206
// Load 24 interleaved ushorts (a0 b0 c0 a1 b1 c1 ...) and de-interleave them
// into three 8-lane planes a, b, c.
inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
{
#if CV_SSE4_1
    // Word blends: masks 0x92 / 0x24 select every third 16-bit lane, so each
    // of a0/b0/c0 gathers the 8 words of one channel (lanes still scrambled).
    __m128i v0 = _mm_loadu_si128((__m128i*)(ptr));
    __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8));
    __m128i v2 = _mm_loadu_si128((__m128i*)(ptr + 16));
    __m128i a0 = _mm_blend_epi16(_mm_blend_epi16(v0, v1, 0x92), v2, 0x24);
    __m128i b0 = _mm_blend_epi16(_mm_blend_epi16(v2, v0, 0x92), v1, 0x24);
    __m128i c0 = _mm_blend_epi16(_mm_blend_epi16(v1, v2, 0x92), v0, 0x24);

    // pshufb (byte indices, moving 16-bit pairs) restores ascending lane order.
    const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
    const __m128i sh_b = _mm_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
    const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
    a0 = _mm_shuffle_epi8(a0, sh_a);
    b0 = _mm_shuffle_epi8(b0, sh_b);
    c0 = _mm_shuffle_epi8(c0, sh_c);

    a.val = a0;
    b.val = b0;
    c.val = c0;
#else
    // SSE2 fallback: three rounds of unpacklo with the swapped-halves partner
    // register sort the 3-periodic word stream into channels (log2(8) = 3).
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16));

    __m128i t10 = _mm_unpacklo_epi16(t00, _mm_unpackhi_epi64(t01, t01));
    __m128i t11 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t00, t00), t02);
    __m128i t12 = _mm_unpacklo_epi16(t01, _mm_unpackhi_epi64(t02, t02));

    __m128i t20 = _mm_unpacklo_epi16(t10, _mm_unpackhi_epi64(t11, t11));
    __m128i t21 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t10, t10), t12);
    __m128i t22 = _mm_unpacklo_epi16(t11, _mm_unpackhi_epi64(t12, t12));

    a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21));
    b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22);
    c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22));
#endif
}
2245
v_load_deinterleave(const ushort * ptr,v_uint16x8 & a,v_uint16x8 & b,v_uint16x8 & c,v_uint16x8 & d)2246 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
2247 {
2248 __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1
2249 __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 8)); // a2 b2 c2 d2 ...
2250 __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
2251 __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 24)); // a6 b6 c6 d6 ...
2252
2253 __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ...
2254 __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ...
2255 __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ...
2256 __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ...
2257
2258 u0 = _mm_unpacklo_epi16(v0, v2); // a0 a2 a4 a6 ...
2259 u1 = _mm_unpacklo_epi16(v1, v3); // a1 a3 a5 a7 ...
2260 u2 = _mm_unpackhi_epi16(v0, v2); // c0 c2 c4 c6 ...
2261 u3 = _mm_unpackhi_epi16(v1, v3); // c1 c3 c5 c7 ...
2262
2263 a.val = _mm_unpacklo_epi16(u0, u1);
2264 b.val = _mm_unpackhi_epi16(u0, u1);
2265 c.val = _mm_unpacklo_epi16(u2, u3);
2266 d.val = _mm_unpackhi_epi16(u2, u3);
2267 }
2268
v_load_deinterleave(const unsigned * ptr,v_uint32x4 & a,v_uint32x4 & b)2269 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
2270 {
2271 __m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1
2272 __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 4)); // a2 b2 a3 b3
2273
2274 __m128i v2 = _mm_unpacklo_epi32(v0, v1); // a0 a2 b0 b2
2275 __m128i v3 = _mm_unpackhi_epi32(v0, v1); // a1 a3 b1 b3
2276
2277 a.val = _mm_unpacklo_epi32(v2, v3); // a0 a1 a2 a3
2278 b.val = _mm_unpackhi_epi32(v2, v3); // b0 b1 ab b3
2279 }
2280
v_load_deinterleave(const unsigned * ptr,v_uint32x4 & a,v_uint32x4 & b,v_uint32x4 & c)2281 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
2282 {
2283 __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
2284 __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 4));
2285 __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 8));
2286
2287 __m128i t10 = _mm_unpacklo_epi32(t00, _mm_unpackhi_epi64(t01, t01));
2288 __m128i t11 = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t00, t00), t02);
2289 __m128i t12 = _mm_unpacklo_epi32(t01, _mm_unpackhi_epi64(t02, t02));
2290
2291 a.val = _mm_unpacklo_epi32(t10, _mm_unpackhi_epi64(t11, t11));
2292 b.val = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t10, t10), t12);
2293 c.val = _mm_unpacklo_epi32(t11, _mm_unpackhi_epi64(t12, t12));
2294 }
2295
v_load_deinterleave(const unsigned * ptr,v_uint32x4 & a,v_uint32x4 & b,v_uint32x4 & c,v_uint32x4 & d)2296 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
2297 {
2298 v_uint32x4 s0(_mm_loadu_si128((const __m128i*)ptr)); // a0 b0 c0 d0
2299 v_uint32x4 s1(_mm_loadu_si128((const __m128i*)(ptr + 4))); // a1 b1 c1 d1
2300 v_uint32x4 s2(_mm_loadu_si128((const __m128i*)(ptr + 8))); // a2 b2 c2 d2
2301 v_uint32x4 s3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3
2302
2303 v_transpose4x4(s0, s1, s2, s3, a, b, c, d);
2304 }
2305
v_load_deinterleave(const float * ptr,v_float32x4 & a,v_float32x4 & b)2306 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
2307 {
2308 __m128 u0 = _mm_loadu_ps(ptr); // a0 b0 a1 b1
2309 __m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3
2310
2311 a.val = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(2, 0, 2, 0)); // a0 a1 a2 a3
2312 b.val = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(3, 1, 3, 1)); // b0 b1 ab b3
2313 }
2314
v_load_deinterleave(const float * ptr,v_float32x4 & a,v_float32x4 & b,v_float32x4 & c)2315 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c)
2316 {
2317 __m128 t0 = _mm_loadu_ps(ptr + 0);
2318 __m128 t1 = _mm_loadu_ps(ptr + 4);
2319 __m128 t2 = _mm_loadu_ps(ptr + 8);
2320
2321 __m128 at12 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(0, 1, 0, 2));
2322 a.val = _mm_shuffle_ps(t0, at12, _MM_SHUFFLE(2, 0, 3, 0));
2323
2324 __m128 bt01 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(0, 0, 0, 1));
2325 __m128 bt12 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(0, 2, 0, 3));
2326 b.val = _mm_shuffle_ps(bt01, bt12, _MM_SHUFFLE(2, 0, 2, 0));
2327
2328 __m128 ct01 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(0, 1, 0, 2));
2329 c.val = _mm_shuffle_ps(ct01, t2, _MM_SHUFFLE(3, 0, 2, 0));
2330 }
2331
v_load_deinterleave(const float * ptr,v_float32x4 & a,v_float32x4 & b,v_float32x4 & c,v_float32x4 & d)2332 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c, v_float32x4& d)
2333 {
2334 __m128 t0 = _mm_loadu_ps(ptr + 0);
2335 __m128 t1 = _mm_loadu_ps(ptr + 4);
2336 __m128 t2 = _mm_loadu_ps(ptr + 8);
2337 __m128 t3 = _mm_loadu_ps(ptr + 12);
2338 __m128 t02lo = _mm_unpacklo_ps(t0, t2);
2339 __m128 t13lo = _mm_unpacklo_ps(t1, t3);
2340 __m128 t02hi = _mm_unpackhi_ps(t0, t2);
2341 __m128 t13hi = _mm_unpackhi_ps(t1, t3);
2342 a.val = _mm_unpacklo_ps(t02lo, t13lo);
2343 b.val = _mm_unpackhi_ps(t02lo, t13lo);
2344 c.val = _mm_unpacklo_ps(t02hi, t13hi);
2345 d.val = _mm_unpackhi_ps(t02hi, t13hi);
2346 }
2347
v_load_deinterleave(const uint64 * ptr,v_uint64x2 & a,v_uint64x2 & b)2348 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b)
2349 {
2350 __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
2351 __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2));
2352
2353 a = v_uint64x2(_mm_unpacklo_epi64(t0, t1));
2354 b = v_uint64x2(_mm_unpackhi_epi64(t0, t1));
2355 }
2356
v_load_deinterleave(const uint64 * ptr,v_uint64x2 & a,v_uint64x2 & b,v_uint64x2 & c)2357 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
2358 {
2359 __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0, b0
2360 __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0, a1
2361 __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // b1, c1
2362
2363 t1 = _mm_shuffle_epi32(t1, 0x4e); // a1, c0
2364
2365 a = v_uint64x2(_mm_unpacklo_epi64(t0, t1));
2366 b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2));
2367 c = v_uint64x2(_mm_unpackhi_epi64(t1, t2));
2368 }
2369
v_load_deinterleave(const uint64 * ptr,v_uint64x2 & a,v_uint64x2 & b,v_uint64x2 & c,v_uint64x2 & d)2370 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a,
2371 v_uint64x2& b, v_uint64x2& c, v_uint64x2& d)
2372 {
2373 __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0
2374 __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0 d0
2375 __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // a1 b1
2376 __m128i t3 = _mm_loadu_si128((const __m128i*)(ptr + 6)); // c1 d1
2377
2378 a = v_uint64x2(_mm_unpacklo_epi64(t0, t2));
2379 b = v_uint64x2(_mm_unpackhi_epi64(t0, t2));
2380 c = v_uint64x2(_mm_unpacklo_epi64(t1, t3));
2381 d = v_uint64x2(_mm_unpackhi_epi64(t1, t3));
2382 }
2383
2384 // store interleave
2385
v_store_interleave(uchar * ptr,const v_uint8x16 & a,const v_uint8x16 & b,hal::StoreMode mode=hal::STORE_UNALIGNED)2386 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2387 hal::StoreMode mode = hal::STORE_UNALIGNED)
2388 {
2389 __m128i v0 = _mm_unpacklo_epi8(a.val, b.val);
2390 __m128i v1 = _mm_unpackhi_epi8(a.val, b.val);
2391
2392 if( mode == hal::STORE_ALIGNED_NOCACHE )
2393 {
2394 _mm_stream_si128((__m128i*)(ptr), v0);
2395 _mm_stream_si128((__m128i*)(ptr + 16), v1);
2396 }
2397 else if( mode == hal::STORE_ALIGNED )
2398 {
2399 _mm_store_si128((__m128i*)(ptr), v0);
2400 _mm_store_si128((__m128i*)(ptr + 16), v1);
2401 }
2402 else
2403 {
2404 _mm_storeu_si128((__m128i*)(ptr), v0);
2405 _mm_storeu_si128((__m128i*)(ptr + 16), v1);
2406 }
2407 }
2408
// Interleave three byte planes into a0 b0 c0 a1 b1 c1 ... and store 48 bytes
// at ptr. Three compile-time paths mirror the 3-channel load above.
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                const v_uint8x16& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
{
#if CV_SSE4_1
    // Pre-scramble each channel with pshufb so that a byte-granular blend
    // (masks marking every third byte) produces the interleaved output.
    const __m128i sh_a = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
    const __m128i sh_b = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
    const __m128i sh_c = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
    __m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
    __m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
    __m128i c0 = _mm_shuffle_epi8(c.val, sh_c);

    const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
    const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
    __m128i v0 = _mm_blendv_epi8(_mm_blendv_epi8(a0, b0, m1), c0, m0);
    __m128i v1 = _mm_blendv_epi8(_mm_blendv_epi8(b0, c0, m1), a0, m0);
    __m128i v2 = _mm_blendv_epi8(_mm_blendv_epi8(c0, a0, m1), b0, m0);
#elif CV_SSSE3
    // Stitch 16-byte runs of the three channels together with palignr and
    // byte shifts, then pshufb each result into interleaved order.
    const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5);
    const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10);
    const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15);

    __m128i t0 = _mm_alignr_epi8(b.val, _mm_slli_si128(a.val, 10), 5);
    t0 = _mm_alignr_epi8(c.val, t0, 5);
    __m128i v0 = _mm_shuffle_epi8(t0, m0);

    __m128i t1 = _mm_alignr_epi8(_mm_srli_si128(b.val, 5), _mm_slli_si128(a.val, 5), 6);
    t1 = _mm_alignr_epi8(_mm_srli_si128(c.val, 5), t1, 5);
    __m128i v1 = _mm_shuffle_epi8(t1, m1);

    __m128i t2 = _mm_alignr_epi8(_mm_srli_si128(c.val, 10), b.val, 11);
    t2 = _mm_alignr_epi8(t2, a.val, 11);
    __m128i v2 = _mm_shuffle_epi8(t2, m2);
#else
    // SSE2 fallback: widen (a,b) pairs and (c,0) pairs with unpacks, transpose
    // through 16/32/64-bit unpack rounds, then squeeze the zero padding out
    // with 8-bit shifts and recombine the 48 payload bytes with shifts + ORs.
    __m128i z = _mm_setzero_si128();
    __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val);
    __m128i ab1 = _mm_unpackhi_epi8(a.val, b.val);
    __m128i c0 = _mm_unpacklo_epi8(c.val, z);
    __m128i c1 = _mm_unpackhi_epi8(c.val, z);

    __m128i p00 = _mm_unpacklo_epi16(ab0, c0);
    __m128i p01 = _mm_unpackhi_epi16(ab0, c0);
    __m128i p02 = _mm_unpacklo_epi16(ab1, c1);
    __m128i p03 = _mm_unpackhi_epi16(ab1, c1);

    __m128i p10 = _mm_unpacklo_epi32(p00, p01);
    __m128i p11 = _mm_unpackhi_epi32(p00, p01);
    __m128i p12 = _mm_unpacklo_epi32(p02, p03);
    __m128i p13 = _mm_unpackhi_epi32(p02, p03);

    __m128i p20 = _mm_unpacklo_epi64(p10, p11);
    __m128i p21 = _mm_unpackhi_epi64(p10, p11);
    __m128i p22 = _mm_unpacklo_epi64(p12, p13);
    __m128i p23 = _mm_unpackhi_epi64(p12, p13);

    p20 = _mm_slli_si128(p20, 1);
    p22 = _mm_slli_si128(p22, 1);

    __m128i p30 = _mm_slli_epi64(_mm_unpacklo_epi32(p20, p21), 8);
    __m128i p31 = _mm_srli_epi64(_mm_unpackhi_epi32(p20, p21), 8);
    __m128i p32 = _mm_slli_epi64(_mm_unpacklo_epi32(p22, p23), 8);
    __m128i p33 = _mm_srli_epi64(_mm_unpackhi_epi32(p22, p23), 8);

    __m128i p40 = _mm_unpacklo_epi64(p30, p31);
    __m128i p41 = _mm_unpackhi_epi64(p30, p31);
    __m128i p42 = _mm_unpacklo_epi64(p32, p33);
    __m128i p43 = _mm_unpackhi_epi64(p32, p33);

    __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2));
#endif

    // Emit the three interleaved vectors with the requested store flavor.
    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_si128((__m128i*)(ptr), v0);
        _mm_stream_si128((__m128i*)(ptr + 16), v1);
        _mm_stream_si128((__m128i*)(ptr + 32), v2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_si128((__m128i*)(ptr), v0);
        _mm_store_si128((__m128i*)(ptr + 16), v1);
        _mm_store_si128((__m128i*)(ptr + 32), v2);
    }
    else
    {
        _mm_storeu_si128((__m128i*)(ptr), v0);
        _mm_storeu_si128((__m128i*)(ptr + 16), v1);
        _mm_storeu_si128((__m128i*)(ptr + 32), v2);
    }
}
2500
v_store_interleave(uchar * ptr,const v_uint8x16 & a,const v_uint8x16 & b,const v_uint8x16 & c,const v_uint8x16 & d,hal::StoreMode mode=hal::STORE_UNALIGNED)2501 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2502 const v_uint8x16& c, const v_uint8x16& d,
2503 hal::StoreMode mode = hal::STORE_UNALIGNED)
2504 {
2505 // a0 a1 a2 a3 ....
2506 // b0 b1 b2 b3 ....
2507 // c0 c1 c2 c3 ....
2508 // d0 d1 d2 d3 ....
2509 __m128i u0 = _mm_unpacklo_epi8(a.val, c.val); // a0 c0 a1 c1 ...
2510 __m128i u1 = _mm_unpackhi_epi8(a.val, c.val); // a8 c8 a9 c9 ...
2511 __m128i u2 = _mm_unpacklo_epi8(b.val, d.val); // b0 d0 b1 d1 ...
2512 __m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ...
2513
2514 __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ...
2515 __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ...
2516 __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ...
2517 __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ...
2518
2519 if( mode == hal::STORE_ALIGNED_NOCACHE )
2520 {
2521 _mm_stream_si128((__m128i*)(ptr), v0);
2522 _mm_stream_si128((__m128i*)(ptr + 16), v1);
2523 _mm_stream_si128((__m128i*)(ptr + 32), v2);
2524 _mm_stream_si128((__m128i*)(ptr + 48), v3);
2525 }
2526 else if( mode == hal::STORE_ALIGNED )
2527 {
2528 _mm_store_si128((__m128i*)(ptr), v0);
2529 _mm_store_si128((__m128i*)(ptr + 16), v1);
2530 _mm_store_si128((__m128i*)(ptr + 32), v2);
2531 _mm_store_si128((__m128i*)(ptr + 48), v3);
2532 }
2533 else
2534 {
2535 _mm_storeu_si128((__m128i*)(ptr), v0);
2536 _mm_storeu_si128((__m128i*)(ptr + 16), v1);
2537 _mm_storeu_si128((__m128i*)(ptr + 32), v2);
2538 _mm_storeu_si128((__m128i*)(ptr + 48), v3);
2539 }
2540 }
2541
v_store_interleave(ushort * ptr,const v_uint16x8 & a,const v_uint16x8 & b,hal::StoreMode mode=hal::STORE_UNALIGNED)2542 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
2543 hal::StoreMode mode = hal::STORE_UNALIGNED)
2544 {
2545 __m128i v0 = _mm_unpacklo_epi16(a.val, b.val);
2546 __m128i v1 = _mm_unpackhi_epi16(a.val, b.val);
2547
2548 if( mode == hal::STORE_ALIGNED_NOCACHE )
2549 {
2550 _mm_stream_si128((__m128i*)(ptr), v0);
2551 _mm_stream_si128((__m128i*)(ptr + 8), v1);
2552 }
2553 else if( mode == hal::STORE_ALIGNED )
2554 {
2555 _mm_store_si128((__m128i*)(ptr), v0);
2556 _mm_store_si128((__m128i*)(ptr + 8), v1);
2557 }
2558 else
2559 {
2560 _mm_storeu_si128((__m128i*)(ptr), v0);
2561 _mm_storeu_si128((__m128i*)(ptr + 8), v1);
2562 }
2563 }
2564
// Interleave three ushort planes into a0 b0 c0 a1 b1 c1 ... and store 24
// ushorts at ptr. Mirrors the 3-channel ushort load above.
inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
                                const v_uint16x8& b, const v_uint16x8& c,
                                hal::StoreMode mode = hal::STORE_UNALIGNED)
{
#if CV_SSE4_1
    // Pre-scramble each channel with pshufb, then word blends (masks 0x92 /
    // 0x24 pick every third 16-bit lane) assemble the interleaved vectors.
    const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
    const __m128i sh_b = _mm_setr_epi8(10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
    const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
    __m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
    __m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
    __m128i c0 = _mm_shuffle_epi8(c.val, sh_c);

    __m128i v0 = _mm_blend_epi16(_mm_blend_epi16(a0, b0, 0x92), c0, 0x24);
    __m128i v1 = _mm_blend_epi16(_mm_blend_epi16(c0, a0, 0x92), b0, 0x24);
    __m128i v2 = _mm_blend_epi16(_mm_blend_epi16(b0, c0, 0x92), a0, 0x24);
#else
    // SSE2 fallback: widen (a,b) pairs and (c,0) pairs, transpose via 32/64-bit
    // unpacks, then remove the zero padding with shifts and recombine the 48
    // payload bytes with shifts + ORs.
    __m128i z = _mm_setzero_si128();
    __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val);
    __m128i ab1 = _mm_unpackhi_epi16(a.val, b.val);
    __m128i c0 = _mm_unpacklo_epi16(c.val, z);
    __m128i c1 = _mm_unpackhi_epi16(c.val, z);

    __m128i p10 = _mm_unpacklo_epi32(ab0, c0);
    __m128i p11 = _mm_unpackhi_epi32(ab0, c0);
    __m128i p12 = _mm_unpacklo_epi32(ab1, c1);
    __m128i p13 = _mm_unpackhi_epi32(ab1, c1);

    __m128i p20 = _mm_unpacklo_epi64(p10, p11);
    __m128i p21 = _mm_unpackhi_epi64(p10, p11);
    __m128i p22 = _mm_unpacklo_epi64(p12, p13);
    __m128i p23 = _mm_unpackhi_epi64(p12, p13);

    p20 = _mm_slli_si128(p20, 2);
    p22 = _mm_slli_si128(p22, 2);

    __m128i p30 = _mm_unpacklo_epi64(p20, p21);
    __m128i p31 = _mm_unpackhi_epi64(p20, p21);
    __m128i p32 = _mm_unpacklo_epi64(p22, p23);
    __m128i p33 = _mm_unpackhi_epi64(p22, p23);

    __m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2));
#endif
    // Emit the three interleaved vectors with the requested store flavor.
    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_si128((__m128i*)(ptr), v0);
        _mm_stream_si128((__m128i*)(ptr + 8), v1);
        _mm_stream_si128((__m128i*)(ptr + 16), v2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_si128((__m128i*)(ptr), v0);
        _mm_store_si128((__m128i*)(ptr + 8), v1);
        _mm_store_si128((__m128i*)(ptr + 16), v2);
    }
    else
    {
        _mm_storeu_si128((__m128i*)(ptr), v0);
        _mm_storeu_si128((__m128i*)(ptr + 8), v1);
        _mm_storeu_si128((__m128i*)(ptr + 16), v2);
    }
}
2628
v_store_interleave(ushort * ptr,const v_uint16x8 & a,const v_uint16x8 & b,const v_uint16x8 & c,const v_uint16x8 & d,hal::StoreMode mode=hal::STORE_UNALIGNED)2629 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
2630 const v_uint16x8& c, const v_uint16x8& d,
2631 hal::StoreMode mode = hal::STORE_UNALIGNED)
2632 {
2633 // a0 a1 a2 a3 ....
2634 // b0 b1 b2 b3 ....
2635 // c0 c1 c2 c3 ....
2636 // d0 d1 d2 d3 ....
2637 __m128i u0 = _mm_unpacklo_epi16(a.val, c.val); // a0 c0 a1 c1 ...
2638 __m128i u1 = _mm_unpackhi_epi16(a.val, c.val); // a4 c4 a5 c5 ...
2639 __m128i u2 = _mm_unpacklo_epi16(b.val, d.val); // b0 d0 b1 d1 ...
2640 __m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ...
2641
2642 __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ...
2643 __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ...
2644 __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ...
2645 __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ...
2646
2647 if( mode == hal::STORE_ALIGNED_NOCACHE )
2648 {
2649 _mm_stream_si128((__m128i*)(ptr), v0);
2650 _mm_stream_si128((__m128i*)(ptr + 8), v1);
2651 _mm_stream_si128((__m128i*)(ptr + 16), v2);
2652 _mm_stream_si128((__m128i*)(ptr + 24), v3);
2653 }
2654 else if( mode == hal::STORE_ALIGNED )
2655 {
2656 _mm_store_si128((__m128i*)(ptr), v0);
2657 _mm_store_si128((__m128i*)(ptr + 8), v1);
2658 _mm_store_si128((__m128i*)(ptr + 16), v2);
2659 _mm_store_si128((__m128i*)(ptr + 24), v3);
2660 }
2661 else
2662 {
2663 _mm_storeu_si128((__m128i*)(ptr), v0);
2664 _mm_storeu_si128((__m128i*)(ptr + 8), v1);
2665 _mm_storeu_si128((__m128i*)(ptr + 16), v2);
2666 _mm_storeu_si128((__m128i*)(ptr + 24), v3);
2667 }
2668 }
2669
v_store_interleave(unsigned * ptr,const v_uint32x4 & a,const v_uint32x4 & b,hal::StoreMode mode=hal::STORE_UNALIGNED)2670 inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2671 hal::StoreMode mode = hal::STORE_UNALIGNED)
2672 {
2673 __m128i v0 = _mm_unpacklo_epi32(a.val, b.val);
2674 __m128i v1 = _mm_unpackhi_epi32(a.val, b.val);
2675
2676 if( mode == hal::STORE_ALIGNED_NOCACHE )
2677 {
2678 _mm_stream_si128((__m128i*)(ptr), v0);
2679 _mm_stream_si128((__m128i*)(ptr + 4), v1);
2680 }
2681 else if( mode == hal::STORE_ALIGNED )
2682 {
2683 _mm_store_si128((__m128i*)(ptr), v0);
2684 _mm_store_si128((__m128i*)(ptr + 4), v1);
2685 }
2686 else
2687 {
2688 _mm_storeu_si128((__m128i*)(ptr), v0);
2689 _mm_storeu_si128((__m128i*)(ptr + 4), v1);
2690 }
2691 }
2692
v_store_interleave(unsigned * ptr,const v_uint32x4 & a,const v_uint32x4 & b,const v_uint32x4 & c,hal::StoreMode mode=hal::STORE_UNALIGNED)2693 inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2694 const v_uint32x4& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
2695 {
2696 v_uint32x4 z = v_setzero_u32(), u0, u1, u2, u3;
2697 v_transpose4x4(a, b, c, z, u0, u1, u2, u3);
2698
2699 __m128i v0 = _mm_or_si128(u0.val, _mm_slli_si128(u1.val, 12));
2700 __m128i v1 = _mm_or_si128(_mm_srli_si128(u1.val, 4), _mm_slli_si128(u2.val, 8));
2701 __m128i v2 = _mm_or_si128(_mm_srli_si128(u2.val, 8), _mm_slli_si128(u3.val, 4));
2702
2703 if( mode == hal::STORE_ALIGNED_NOCACHE )
2704 {
2705 _mm_stream_si128((__m128i*)(ptr), v0);
2706 _mm_stream_si128((__m128i*)(ptr + 4), v1);
2707 _mm_stream_si128((__m128i*)(ptr + 8), v2);
2708 }
2709 else if( mode == hal::STORE_ALIGNED )
2710 {
2711 _mm_store_si128((__m128i*)(ptr), v0);
2712 _mm_store_si128((__m128i*)(ptr + 4), v1);
2713 _mm_store_si128((__m128i*)(ptr + 8), v2);
2714 }
2715 else
2716 {
2717 _mm_storeu_si128((__m128i*)(ptr), v0);
2718 _mm_storeu_si128((__m128i*)(ptr + 4), v1);
2719 _mm_storeu_si128((__m128i*)(ptr + 8), v2);
2720 }
2721 }
2722
v_store_interleave(unsigned * ptr,const v_uint32x4 & a,const v_uint32x4 & b,const v_uint32x4 & c,const v_uint32x4 & d,hal::StoreMode mode=hal::STORE_UNALIGNED)2723 inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2724 const v_uint32x4& c, const v_uint32x4& d,
2725 hal::StoreMode mode = hal::STORE_UNALIGNED)
2726 {
2727 v_uint32x4 v0, v1, v2, v3;
2728 v_transpose4x4(a, b, c, d, v0, v1, v2, v3);
2729
2730 if( mode == hal::STORE_ALIGNED_NOCACHE )
2731 {
2732 _mm_stream_si128((__m128i*)(ptr), v0.val);
2733 _mm_stream_si128((__m128i*)(ptr + 4), v1.val);
2734 _mm_stream_si128((__m128i*)(ptr + 8), v2.val);
2735 _mm_stream_si128((__m128i*)(ptr + 12), v3.val);
2736 }
2737 else if( mode == hal::STORE_ALIGNED )
2738 {
2739 _mm_store_si128((__m128i*)(ptr), v0.val);
2740 _mm_store_si128((__m128i*)(ptr + 4), v1.val);
2741 _mm_store_si128((__m128i*)(ptr + 8), v2.val);
2742 _mm_store_si128((__m128i*)(ptr + 12), v3.val);
2743 }
2744 else
2745 {
2746 _mm_storeu_si128((__m128i*)(ptr), v0.val);
2747 _mm_storeu_si128((__m128i*)(ptr + 4), v1.val);
2748 _mm_storeu_si128((__m128i*)(ptr + 8), v2.val);
2749 _mm_storeu_si128((__m128i*)(ptr + 12), v3.val);
2750 }
2751 }
2752
2753 // 2-channel, float only
v_store_interleave(float * ptr,const v_float32x4 & a,const v_float32x4 & b,hal::StoreMode mode=hal::STORE_UNALIGNED)2754 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2755 hal::StoreMode mode = hal::STORE_UNALIGNED)
2756 {
2757 __m128 v0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1
2758 __m128 v1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3
2759
2760 if( mode == hal::STORE_ALIGNED_NOCACHE )
2761 {
2762 _mm_stream_ps(ptr, v0);
2763 _mm_stream_ps(ptr + 4, v1);
2764 }
2765 else if( mode == hal::STORE_ALIGNED )
2766 {
2767 _mm_store_ps(ptr, v0);
2768 _mm_store_ps(ptr + 4, v1);
2769 }
2770 else
2771 {
2772 _mm_storeu_ps(ptr, v0);
2773 _mm_storeu_ps(ptr + 4, v1);
2774 }
2775 }
2776
v_store_interleave(float * ptr,const v_float32x4 & a,const v_float32x4 & b,const v_float32x4 & c,hal::StoreMode mode=hal::STORE_UNALIGNED)2777 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2778 const v_float32x4& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
2779 {
2780 __m128 u0 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(0, 0, 0, 0));
2781 __m128 u1 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(1, 1, 0, 0));
2782 __m128 v0 = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(2, 0, 2, 0));
2783 __m128 u2 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(1, 1, 1, 1));
2784 __m128 u3 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(2, 2, 2, 2));
2785 __m128 v1 = _mm_shuffle_ps(u2, u3, _MM_SHUFFLE(2, 0, 2, 0));
2786 __m128 u4 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(3, 3, 2, 2));
2787 __m128 u5 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(3, 3, 3, 3));
2788 __m128 v2 = _mm_shuffle_ps(u4, u5, _MM_SHUFFLE(2, 0, 2, 0));
2789
2790 if( mode == hal::STORE_ALIGNED_NOCACHE )
2791 {
2792 _mm_stream_ps(ptr, v0);
2793 _mm_stream_ps(ptr + 4, v1);
2794 _mm_stream_ps(ptr + 8, v2);
2795 }
2796 else if( mode == hal::STORE_ALIGNED )
2797 {
2798 _mm_store_ps(ptr, v0);
2799 _mm_store_ps(ptr + 4, v1);
2800 _mm_store_ps(ptr + 8, v2);
2801 }
2802 else
2803 {
2804 _mm_storeu_ps(ptr, v0);
2805 _mm_storeu_ps(ptr + 4, v1);
2806 _mm_storeu_ps(ptr + 8, v2);
2807 }
2808 }
2809
v_store_interleave(float * ptr,const v_float32x4 & a,const v_float32x4 & b,const v_float32x4 & c,const v_float32x4 & d,hal::StoreMode mode=hal::STORE_UNALIGNED)2810 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2811 const v_float32x4& c, const v_float32x4& d,
2812 hal::StoreMode mode = hal::STORE_UNALIGNED)
2813 {
2814 __m128 u0 = _mm_unpacklo_ps(a.val, c.val);
2815 __m128 u1 = _mm_unpacklo_ps(b.val, d.val);
2816 __m128 u2 = _mm_unpackhi_ps(a.val, c.val);
2817 __m128 u3 = _mm_unpackhi_ps(b.val, d.val);
2818 __m128 v0 = _mm_unpacklo_ps(u0, u1);
2819 __m128 v2 = _mm_unpacklo_ps(u2, u3);
2820 __m128 v1 = _mm_unpackhi_ps(u0, u1);
2821 __m128 v3 = _mm_unpackhi_ps(u2, u3);
2822
2823 if( mode == hal::STORE_ALIGNED_NOCACHE )
2824 {
2825 _mm_stream_ps(ptr, v0);
2826 _mm_stream_ps(ptr + 4, v1);
2827 _mm_stream_ps(ptr + 8, v2);
2828 _mm_stream_ps(ptr + 12, v3);
2829 }
2830 else if( mode == hal::STORE_ALIGNED )
2831 {
2832 _mm_store_ps(ptr, v0);
2833 _mm_store_ps(ptr + 4, v1);
2834 _mm_store_ps(ptr + 8, v2);
2835 _mm_store_ps(ptr + 12, v3);
2836 }
2837 else
2838 {
2839 _mm_storeu_ps(ptr, v0);
2840 _mm_storeu_ps(ptr + 4, v1);
2841 _mm_storeu_ps(ptr + 8, v2);
2842 _mm_storeu_ps(ptr + 12, v3);
2843 }
2844 }
2845
v_store_interleave(uint64 * ptr,const v_uint64x2 & a,const v_uint64x2 & b,hal::StoreMode mode=hal::STORE_UNALIGNED)2846 inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2847 hal::StoreMode mode = hal::STORE_UNALIGNED)
2848 {
2849 __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
2850 __m128i v1 = _mm_unpackhi_epi64(a.val, b.val);
2851
2852 if( mode == hal::STORE_ALIGNED_NOCACHE )
2853 {
2854 _mm_stream_si128((__m128i*)(ptr), v0);
2855 _mm_stream_si128((__m128i*)(ptr + 2), v1);
2856 }
2857 else if( mode == hal::STORE_ALIGNED )
2858 {
2859 _mm_store_si128((__m128i*)(ptr), v0);
2860 _mm_store_si128((__m128i*)(ptr + 2), v1);
2861 }
2862 else
2863 {
2864 _mm_storeu_si128((__m128i*)(ptr), v0);
2865 _mm_storeu_si128((__m128i*)(ptr + 2), v1);
2866 }
2867 }
2868
v_store_interleave(uint64 * ptr,const v_uint64x2 & a,const v_uint64x2 & b,const v_uint64x2 & c,hal::StoreMode mode=hal::STORE_UNALIGNED)2869 inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2870 const v_uint64x2& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
2871 {
2872 __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
2873 __m128i v1 = _mm_unpacklo_epi64(c.val, _mm_unpackhi_epi64(a.val, a.val));
2874 __m128i v2 = _mm_unpackhi_epi64(b.val, c.val);
2875
2876 if( mode == hal::STORE_ALIGNED_NOCACHE )
2877 {
2878 _mm_stream_si128((__m128i*)(ptr), v0);
2879 _mm_stream_si128((__m128i*)(ptr + 2), v1);
2880 _mm_stream_si128((__m128i*)(ptr + 4), v2);
2881 }
2882 else if( mode == hal::STORE_ALIGNED )
2883 {
2884 _mm_store_si128((__m128i*)(ptr), v0);
2885 _mm_store_si128((__m128i*)(ptr + 2), v1);
2886 _mm_store_si128((__m128i*)(ptr + 4), v2);
2887 }
2888 else
2889 {
2890 _mm_storeu_si128((__m128i*)(ptr), v0);
2891 _mm_storeu_si128((__m128i*)(ptr + 2), v1);
2892 _mm_storeu_si128((__m128i*)(ptr + 4), v2);
2893 }
2894 }
2895
v_store_interleave(uint64 * ptr,const v_uint64x2 & a,const v_uint64x2 & b,const v_uint64x2 & c,const v_uint64x2 & d,hal::StoreMode mode=hal::STORE_UNALIGNED)2896 inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2897 const v_uint64x2& c, const v_uint64x2& d,
2898 hal::StoreMode mode = hal::STORE_UNALIGNED)
2899 {
2900 __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
2901 __m128i v1 = _mm_unpacklo_epi64(c.val, d.val);
2902 __m128i v2 = _mm_unpackhi_epi64(a.val, b.val);
2903 __m128i v3 = _mm_unpackhi_epi64(c.val, d.val);
2904
2905 if( mode == hal::STORE_ALIGNED_NOCACHE )
2906 {
2907 _mm_stream_si128((__m128i*)(ptr), v0);
2908 _mm_stream_si128((__m128i*)(ptr + 2), v1);
2909 _mm_stream_si128((__m128i*)(ptr + 4), v2);
2910 _mm_stream_si128((__m128i*)(ptr + 6), v3);
2911 }
2912 else if( mode == hal::STORE_ALIGNED )
2913 {
2914 _mm_store_si128((__m128i*)(ptr), v0);
2915 _mm_store_si128((__m128i*)(ptr + 2), v1);
2916 _mm_store_si128((__m128i*)(ptr + 4), v2);
2917 _mm_store_si128((__m128i*)(ptr + 6), v3);
2918 }
2919 else
2920 {
2921 _mm_storeu_si128((__m128i*)(ptr), v0);
2922 _mm_storeu_si128((__m128i*)(ptr + 2), v1);
2923 _mm_storeu_si128((__m128i*)(ptr + 4), v2);
2924 _mm_storeu_si128((__m128i*)(ptr + 6), v3);
2925 }
2926 }
2927
// Generates the 2/3/4-channel v_load_deinterleave / v_store_interleave
// overloads for element type _Tp0 / vector type _Tpvec0 by reinterpret-casting
// to an equally-sized "base" type _Tp1 / _Tpvec1 whose overloads are already
// implemented above (e.g. signed types forward to the unsigned versions).
// suffix0/suffix1 select the v_reinterpret_as_* conversion helpers.
// NOTE: no comments inside the macro body — they would break the \ continuations.
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
{ \
    _Tpvec1 a1, b1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
} \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
{ \
    _Tpvec1 a1, b1, c1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
    c0 = v_reinterpret_as_##suffix0(c1); \
} \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
{ \
    _Tpvec1 a1, b1, c1, d1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
    c0 = v_reinterpret_as_##suffix0(c1); \
    d0 = v_reinterpret_as_##suffix0(d1); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                hal::StoreMode mode = hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                const _Tpvec0& c0, hal::StoreMode mode = hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                const _Tpvec0& c0, const _Tpvec0& d0, \
                                hal::StoreMode mode = hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
}
2978
// Instantiate the reinterpret-based wrappers for types that lack a dedicated
// SSE implementation; each forwards to the listed same-width base type.
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64)
2984
// Convert 4 int32 lanes to float32 (rounded per the current SSE rounding mode).
inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
    return v_float32x4(_mm_cvtepi32_ps(a.val));
}

// Convert 2 float64 lanes to float32; results land in the low two lanes,
// the upper two lanes are zeroed by _mm_cvtpd_ps.
inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    return v_float32x4(_mm_cvtpd_ps(a.val));
}

// Convert two float64 vectors into one float32 vector: lanes = (a0, a1, b0, b1).
inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{
    return v_float32x4(_mm_movelh_ps(_mm_cvtpd_ps(a.val), _mm_cvtpd_ps(b.val)));
}

// Convert the LOW two int32 lanes to float64 (exact conversion).
inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
    return v_float64x2(_mm_cvtepi32_pd(a.val));
}

// Convert the HIGH two int32 lanes to float64: shift them into the low half first.
inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{
    return v_float64x2(_mm_cvtepi32_pd(_mm_srli_si128(a.val,8)));
}

// Convert the LOW two float32 lanes to float64 (widening, exact).
inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{
    return v_float64x2(_mm_cvtps_pd(a.val));
}

// Convert the HIGH two float32 lanes to float64: movehl moves them to the low half.
inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{
    return v_float64x2(_mm_cvtps_pd(_mm_movehl_ps(a.val, a.val)));
}
3019
// from (Mysticial and wim) https://stackoverflow.com/q/41144668
// Convert 2 int64 lanes to float64. SSE2 has no native int64->double conversion,
// so each 64-bit integer is split into 32-bit halves that are re-biased into
// double bit patterns via "magic" exponent constants and recombined with FP math.
inline v_float64x2 v_cvt_f64(const v_int64x2& v)
{
    // constants encoded as floating-point
    __m128i magic_i_hi32 = _mm_set1_epi64x(0x4530000080000000); // 2^84 + 2^63
    __m128i magic_i_all = _mm_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52
    __m128d magic_d_all = _mm_castsi128_pd(magic_i_all);
    // Blend the 32 lowest significant bits of v with magic_int_lo
    #if CV_SSE4_1
    __m128i magic_i_lo = _mm_set1_epi64x(0x4330000000000000); // 2^52
    __m128i v_lo = _mm_blend_epi16(v.val, magic_i_lo, 0xcc);
    #else
    __m128i magic_i_lo = _mm_set1_epi32(0x43300000); // 2^52
    __m128i v_lo = _mm_unpacklo_epi32(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(0, 0, 2, 0)), magic_i_lo);
    #endif
    // Extract the 32 most significant bits of v
    __m128i v_hi = _mm_srli_epi64(v.val, 32);
    // Flip the msb of v_hi and blend with 0x45300000
    v_hi = _mm_xor_si128(v_hi, magic_i_hi32);
    // Compute in double precision
    __m128d v_hi_dbl = _mm_sub_pd(_mm_castsi128_pd(v_hi), magic_d_all);
    // (v_hi - magic_d_all) + v_lo Do not assume associativity of floating point addition
    __m128d result = _mm_add_pd(v_hi_dbl, _mm_castsi128_pd(v_lo));
    return v_float64x2(result);
}
3045
////////////// Lookup table access ////////////////////

// Gather 16 signed bytes from arbitrary positions: result lane k = tab[idx[k]].
inline v_int8x16 v_lut(const schar* tab, const int* idx)
{
#if defined(_MSC_VER)
    // MSVC build avoids the __m64/MMX helper path used below.
    return v_int8x16(_mm_setr_epi8(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
                                   tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]));
#else
    return v_int8x16(_mm_setr_epi64(
        _mm_setr_pi8(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]]),
        _mm_setr_pi8(tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]])
    ));
#endif
}
// Gather 8 byte-pairs: lanes (2k,2k+1) = tab[idx[k]], tab[idx[k]+1].
// idx values are byte offsets into tab; each pair is read as one 16-bit unit.
inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
{
#if defined(_MSC_VER)
    return v_int8x16(_mm_setr_epi16(*(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]), *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3]),
                                    *(const short*)(tab + idx[4]), *(const short*)(tab + idx[5]), *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7])));
#else
    return v_int8x16(_mm_setr_epi64(
        _mm_setr_pi16(*(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]), *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3])),
        _mm_setr_pi16(*(const short*)(tab + idx[4]), *(const short*)(tab + idx[5]), *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7]))
    ));
#endif
}
// Gather 4 byte-quads: lanes (4k..4k+3) = tab[idx[k]..idx[k]+3],
// each quad read as one 32-bit unit.
inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
{
#if defined(_MSC_VER)
    return v_int8x16(_mm_setr_epi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
                                    *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
#else
    return v_int8x16(_mm_setr_epi64(
        _mm_setr_pi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1])),
        _mm_setr_pi32(*(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]))
    ));
#endif
}
// Unsigned wrappers: same gathers, reinterpreted lane type.
inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar *)tab, idx)); }
inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar *)tab, idx)); }
inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar *)tab, idx)); }
3087
// Gather 8 shorts: result lane k = tab[idx[k]].
inline v_int16x8 v_lut(const short* tab, const int* idx)
{
#if defined(_MSC_VER)
    // MSVC build avoids the __m64/MMX helper path used below.
    return v_int16x8(_mm_setr_epi16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
                                    tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]));
#else
    return v_int16x8(_mm_setr_epi64(
        _mm_setr_pi16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]),
        _mm_setr_pi16(tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]])
    ));
#endif
}
// Gather 4 short-pairs: lanes (2k,2k+1) = tab[idx[k]], tab[idx[k]+1];
// each pair is read as one 32-bit unit (idx values are element offsets).
inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
{
#if defined(_MSC_VER)
    return v_int16x8(_mm_setr_epi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
                                    *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
#else
    return v_int16x8(_mm_setr_epi64(
        _mm_setr_pi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1])),
        _mm_setr_pi32(*(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]))
    ));
#endif
}
// Gather 2 short-quads: 4 consecutive shorts read as one 64-bit unit each.
inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
{
    return v_int16x8(_mm_set_epi64x(*(const int64_t*)(tab + idx[1]), *(const int64_t*)(tab + idx[0])));
}
// Unsigned wrappers: same gathers, reinterpreted lane type.
inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short *)tab, idx)); }
inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short *)tab, idx)); }
inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short *)tab, idx)); }
3119
// Gather 4 ints: result lane k = tab[idx[k]].
inline v_int32x4 v_lut(const int* tab, const int* idx)
{
#if defined(_MSC_VER)
    // MSVC build avoids the __m64/MMX helper path used below.
    return v_int32x4(_mm_setr_epi32(tab[idx[0]], tab[idx[1]],
                                    tab[idx[2]], tab[idx[3]]));
#else
    return v_int32x4(_mm_setr_epi64(
        _mm_setr_pi32(tab[idx[0]], tab[idx[1]]),
        _mm_setr_pi32(tab[idx[2]], tab[idx[3]])
    ));
#endif
}
// Gather 2 int-pairs: consecutive ints read as one 64-bit unit each.
inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
{
    return v_int32x4(_mm_set_epi64x(*(const int64_t*)(tab + idx[1]), *(const int64_t*)(tab + idx[0])));
}
// "Quad" = the whole vector: one unaligned 128-bit load at tab + idx[0].
inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
{
    return v_int32x4(_mm_loadu_si128((const __m128i*)(tab + idx[0])));
}
// Unsigned wrappers: same gathers, reinterpreted lane type.
inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int *)tab, idx)); }
inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx)); }
inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int *)tab, idx)); }
3143
// Gather 2 int64 elements: result lanes = tab[idx[0]], tab[idx[1]].
inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
{
    return v_int64x2(_mm_set_epi64x(tab[idx[1]], tab[idx[0]]));
}
// "Pair" = the whole vector: one unaligned 128-bit load at tab + idx[0].
inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
{
    return v_int64x2(_mm_loadu_si128((const __m128i*)(tab + idx[0])));
}
// Unsigned wrappers: same gathers, reinterpreted lane type.
inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }

// Gather 4 floats: result lane k = tab[idx[k]].
inline v_float32x4 v_lut(const float* tab, const int* idx)
{
    return v_float32x4(_mm_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
}
// Pair/quad float gathers reuse the integer gathers via bit reinterpretation.
inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs((const int *)tab, idx)); }
inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_quads((const int *)tab, idx)); }

// Gather 2 doubles: result lanes = tab[idx[0]], tab[idx[1]].
inline v_float64x2 v_lut(const double* tab, const int* idx)
{
    return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]]));
}
// "Pair" = the whole vector: one unaligned 128-bit load at tab + idx[0].
inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { return v_float64x2(_mm_castsi128_pd(_mm_loadu_si128((const __m128i*)(tab + idx[0])))); }
3167
// Gather with indices supplied in a vector register: the indices are spilled
// to an aligned scalar buffer first, then the scalar-index gathers apply.
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);
    return v_int32x4(_mm_setr_epi32(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
}

// Unsigned variant: same gather, reinterpreted lane type.
inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
{
    return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
}

// Float variant of the vector-index gather.
inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);
    return v_float32x4(_mm_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
}

// Double variant: only the two LOW indices of idxvec are used.
inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
    int idx[2];
    v_store_low(idx, idxvec);
    return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]]));
}
3193
// loads pairs from the table and deinterleaves them, e.g. returns:
//   x = (tab[idxvec[0]], tab[idxvec[1]], tab[idxvec[2]], tab[idxvec[3]]),
//   y = (tab[idxvec[0]+1], tab[idxvec[1]+1], tab[idxvec[2]+1], tab[idxvec[3]+1])
// note that the indices are float's indices, not the float-pair indices.
// in theory, this function can be used to implement bilinear interpolation,
// when idxvec are the offsets within the image.
inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec); // spill indices to an aligned scalar buffer
    __m128 z = _mm_setzero_ps();
    // Load the four (x,y) pairs: low 64 bits from idx[0]/idx[2],
    // high 64 bits from idx[1]/idx[3].
    __m128 xy01 = _mm_loadl_pi(z, (__m64*)(tab + idx[0]));
    __m128 xy23 = _mm_loadl_pi(z, (__m64*)(tab + idx[2]));
    xy01 = _mm_loadh_pi(xy01, (__m64*)(tab + idx[1]));
    xy23 = _mm_loadh_pi(xy23, (__m64*)(tab + idx[3]));
    // Two unpack passes transpose the pair data into separate x and y vectors.
    __m128 xxyy02 = _mm_unpacklo_ps(xy01, xy23);
    __m128 xxyy13 = _mm_unpackhi_ps(xy01, xy23);
    x = v_float32x4(_mm_unpacklo_ps(xxyy02, xxyy13));
    y = v_float32x4(_mm_unpackhi_ps(xxyy02, xxyy13));
}

// Double variant: gathers (tab[i], tab[i+1]) for the two LOW indices of idxvec.
inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
    int idx[2];
    v_store_low(idx, idxvec); // only the two low indices are used
    __m128d xy0 = _mm_loadu_pd(tab + idx[0]);
    __m128d xy1 = _mm_loadu_pd(tab + idx[1]);
    x = v_float64x2(_mm_unpacklo_pd(xy0, xy1));
    y = v_float64x2(_mm_unpackhi_pd(xy0, xy1));
}
3224
// Within every group of 4 byte lanes, swap the two middle lanes:
// (b0 b1 b2 b3) -> (b0 b2 b1 b3), repeated across the vector.
inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
{
#if CV_SSSE3
    // Single byte-shuffle with a precomputed control mask.
    return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0d0e0c0b090a08, 0x0705060403010200)));
#else
    // SSE2 fallback: word/dword shuffles gather the even and odd bytes,
    // then an unpack re-pairs them.
    __m128i a = _mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
    a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(3, 1, 2, 0));
    a = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
    return v_int8x16(_mm_unpacklo_epi8(a, _mm_unpackhi_epi64(a, a)));
#endif
}
inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
// Within every group of 8 byte lanes, interleave the two 4-lane halves:
// (b0..b3 b4..b7) -> (b0 b4 b1 b5 b2 b6 b3 b7).
inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
{
#if CV_SSSE3
    return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0b0e0a0d090c08, 0x0703060205010400)));
#else
    __m128i a = _mm_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
    return v_int8x16(_mm_unpacklo_epi8(a, _mm_unpackhi_epi64(a, a)));
#endif
}
inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
3247
// Within every group of 4 word lanes, swap the two middle lanes:
// (w0 w1 w2 w3) -> (w0 w2 w1 w3), repeated across the vector.
inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
{
#if CV_SSSE3
    return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0e0b0a0d0c0908, 0x0706030205040100)));
#else
    // SSE2 fallback: the same permutation expressed as two word shuffles.
    __m128i a = _mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
    return v_int16x8(_mm_shufflehi_epi16(a, _MM_SHUFFLE(3, 1, 2, 0)));
#endif
}
inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
// Interleave the low and high halves of the vector:
// (w0..w3 w4..w7) -> (w0 w4 w1 w5 w2 w6 w3 w7).
inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
{
#if CV_SSSE3
    return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0e07060d0c0504, 0x0b0a030209080100)));
#else
    return v_int16x8(_mm_unpacklo_epi16(vec.val, _mm_unpackhi_epi64(vec.val, vec.val)));
#endif
}
inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
3267
// Swap the two middle 32-bit lanes: (d0 d1 d2 d3) -> (d0 d2 d1 d3).
inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
{
    return v_int32x4(_mm_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0)));
}
inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
3274
// Compact byte quadruplets into triplets: drop lane 3 of every group of 4
// lanes and shift the remainder down; the top lanes are zero-filled
// (0xff mask bytes produce zeros in pshufb).
inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
{
#if CV_SSSE3
    return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0xffffff0f0e0d0c0a, 0x0908060504020100)));
#else
    // SSE2 fallback: masked shifts emulate the byte permutation above.
    __m128i mask = _mm_set1_epi64x(0x00000000FFFFFFFF);
    __m128i a = _mm_srli_si128(_mm_or_si128(_mm_andnot_si128(mask, vec.val), _mm_and_si128(mask, _mm_sll_epi32(vec.val, _mm_set_epi64x(0, 8)))), 1);
    return v_int8x16(_mm_srli_si128(_mm_shufflelo_epi16(a, _MM_SHUFFLE(2, 1, 0, 3)), 2));
#endif
}
inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }

// 16-bit variant: removes word lane 3, shifts lanes 4..7 down one position,
// and zero-fills the top lane.
inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
{
#if CV_SSSE3
    return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0xffff0f0e0d0c0b0a, 0x0908050403020100)));
#else
    return v_int16x8(_mm_srli_si128(_mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(2, 1, 0, 3)), 2));
#endif
}
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }

// 32-bit vectors hold exactly one triplet plus a don't-care lane: identity.
inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
3300
// Extract lane i (a compile-time constant) of a uint8 vector as a scalar.
template<int i>
inline uchar v_extract_n(const v_uint8x16& v)
{
#if CV_SSE4_1
    return (uchar)_mm_extract_epi8(v.val, i);
#else
    // SSE2 fallback: rotate lane i down to position 0, then read it.
    return v_rotate_right<i>(v).get0();
#endif
}

// Signed 8-bit extract: reuses the unsigned path bit-for-bit.
template<int i>
inline schar v_extract_n(const v_int8x16& v)
{
    return (schar)v_extract_n<i>(v_reinterpret_as_u8(v));
}

// 16-bit lane extract; _mm_extract_epi16 is available from SSE2.
template<int i>
inline ushort v_extract_n(const v_uint16x8& v)
{
    return (ushort)_mm_extract_epi16(v.val, i);
}

template<int i>
inline short v_extract_n(const v_int16x8& v)
{
    return (short)v_extract_n<i>(v_reinterpret_as_u16(v));
}

// 32-bit lane extract: SSE4.1 pextrd, or rotate+get0 on plain SSE2.
template<int i>
inline uint v_extract_n(const v_uint32x4& v)
{
#if CV_SSE4_1
    return (uint)_mm_extract_epi32(v.val, i);
#else
    return v_rotate_right<i>(v).get0();
#endif
}

template<int i>
inline int v_extract_n(const v_int32x4& v)
{
    return (int)v_extract_n<i>(v_reinterpret_as_u32(v));
}

// 64-bit lane extract: native pextrq wrapper when available, else rotate+get0.
template<int i>
inline uint64 v_extract_n(const v_uint64x2& v)
{
#ifdef CV__SIMD_NATIVE_mm_extract_epi64
    return (uint64)_v128_extract_epi64<i>(v.val);
#else
    return v_rotate_right<i>(v).get0();
#endif
}

template<int i>
inline int64 v_extract_n(const v_int64x2& v)
{
    return (int64)v_extract_n<i>(v_reinterpret_as_u64(v));
}
3360
// Float lane extract: extract the raw 32-bit pattern, then type-pun through a
// union to avoid strict-aliasing issues.
template<int i>
inline float v_extract_n(const v_float32x4& v)
{
    union { uint iv; float fv; } d;
    d.iv = v_extract_n<i>(v_reinterpret_as_u32(v));
    return d.fv;
}

// Double lane extract: same union type-punning scheme on the 64-bit pattern.
template<int i>
inline double v_extract_n(const v_float64x2& v)
{
    union { uint64 iv; double dv; } d;
    d.iv = v_extract_n<i>(v_reinterpret_as_u64(v));
    return d.dv;
}
3376
// Replicate lane i (compile-time constant) into all four lanes.
template<int i>
inline v_int32x4 v_broadcast_element(const v_int32x4& v)
{
    return v_int32x4(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(i,i,i,i)));
}

template<int i>
inline v_uint32x4 v_broadcast_element(const v_uint32x4& v)
{
    return v_uint32x4(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(i,i,i,i)));
}

template<int i>
inline v_float32x4 v_broadcast_element(const v_float32x4& v)
{
    // (char) casts keep the _MM_SHUFFLE operands in the immediate's range.
    return v_float32x4(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE((char)i,(char)i,(char)i,(char)i)));
}
3394
3395 ////////////// FP16 support ///////////////////////////
3396
v_load_expand(const float16_t * ptr)3397 inline v_float32x4 v_load_expand(const float16_t* ptr)
3398 {
3399 #if CV_FP16
3400 return v_float32x4(_mm_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
3401 #else
3402 const __m128i z = _mm_setzero_si128(), delta = _mm_set1_epi32(0x38000000);
3403 const __m128i signmask = _mm_set1_epi32(0x80000000), maxexp = _mm_set1_epi32(0x7c000000);
3404 const __m128 deltaf = _mm_castsi128_ps(_mm_set1_epi32(0x38800000));
3405 __m128i bits = _mm_unpacklo_epi16(z, _mm_loadl_epi64((const __m128i*)ptr)); // h << 16
3406 __m128i e = _mm_and_si128(bits, maxexp), sign = _mm_and_si128(bits, signmask);
3407 __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_xor_si128(bits, sign), 3), delta); // ((h & 0x7fff) << 13) + delta
3408 __m128i zt = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_add_epi32(t, _mm_set1_epi32(1 << 23))), deltaf));
3409
3410 t = _mm_add_epi32(t, _mm_and_si128(delta, _mm_cmpeq_epi32(maxexp, e)));
3411 __m128i zmask = _mm_cmpeq_epi32(e, z);
3412 __m128i ft = v_select_si128(zmask, zt, t);
3413 return v_float32x4(_mm_castsi128_ps(_mm_or_si128(ft, sign)));
3414 #endif
3415 }
3416
// Narrow 4 float32 lanes to 4 half-precision floats and store them (8 bytes).
inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
#if CV_FP16
    // Hardware F16C path; rounding-mode immediate 0 selects round-to-nearest-even.
    __m128i fp16_value = _mm_cvtps_ph(v.val, 0);
    _mm_storel_epi64((__m128i*)ptr, fp16_value);
#else
    // Software emulation with round-to-nearest-even (the `odd` bit below).
    const __m128i signmask = _mm_set1_epi32(0x80000000);
    const __m128i rval = _mm_set1_epi32(0x3f000000);

    __m128i t = _mm_castps_si128(v.val);
    // Sign bit, arithmetically shifted so the later signed pack keeps it at bit 15.
    __m128i sign = _mm_srai_epi32(_mm_and_si128(t, signmask), 16);
    t = _mm_andnot_si128(signmask, t); // |x|

    // 0x47800000 = 65536.0f: values at or above it are not finite in half precision.
    __m128i finitemask = _mm_cmpgt_epi32(_mm_set1_epi32(0x47800000), t);
    __m128i isnan = _mm_cmpgt_epi32(t, _mm_set1_epi32(0x7f800000));
    // NaN -> 0x7e00 (quiet NaN), Inf/overflow -> 0x7c00 (infinity).
    __m128i naninf = v_select_si128(isnan, _mm_set1_epi32(0x7e00), _mm_set1_epi32(0x7c00));
    // Below 0x38800000 the result falls into the half subnormal/zero range.
    __m128i tinymask = _mm_cmpgt_epi32(_mm_set1_epi32(0x38800000), t);
    // tt: subnormal path — FP add/sub against rval performs the denormalization.
    __m128i tt = _mm_castps_si128(_mm_add_ps(_mm_castsi128_ps(t), _mm_castsi128_ps(rval)));
    tt = _mm_sub_epi32(tt, rval);
    // nt: normal path — rebias exponent and shift mantissa, rounding to nearest even.
    __m128i odd = _mm_and_si128(_mm_srli_epi32(t, 13), _mm_set1_epi32(1));
    __m128i nt = _mm_add_epi32(t, _mm_set1_epi32(0xc8000fff));
    nt = _mm_srli_epi32(_mm_add_epi32(nt, odd), 13);
    t = v_select_si128(tinymask, tt, nt);
    t = v_select_si128(finitemask, t, naninf);
    t = _mm_or_si128(t, sign);
    t = _mm_packs_epi32(t, t); // narrow each 32-bit result to 16 bits
    _mm_storel_epi64((__m128i*)ptr, t);
#endif
}
3446
// No-op on the SSE backend; present for API symmetry with backends that need
// per-use state cleanup.
inline void v_cleanup() {}
3448
3449 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
3450
3451 //! @endcond
3452
3453 }
3454
3455 #endif
3456