1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html
4 
5 #ifndef OPENCV_HAL_INTRIN_SSE_EM_HPP
6 #define OPENCV_HAL_INTRIN_SSE_EM_HPP
7 
8 namespace cv
9 {
10 
11 //! @cond IGNORED
12 
13 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
14 
// Generates `_v128_<name>(a)`: an inline wrapper that forwards its single
// argument of type `vtype` to `_mm_<name>` and returns the result.
#define OPENCV_HAL_SSE_WRAP_1(name, vtype)      \
    inline vtype _v128_##name(const vtype& a)   \
    {                                           \
        return _mm_##name(a);                   \
    }
18 
// Generates `_v128_<name>(a, b)`: an inline wrapper that forwards both
// arguments of type `vtype`, in order, to `_mm_<name>` and returns the result.
#define OPENCV_HAL_SSE_WRAP_2(name, vtype)                      \
    inline vtype _v128_##name(const vtype& a, const vtype& b)   \
    {                                                           \
        return _mm_##name(a, b);                                \
    }
22 
// Generates `_v128_<name>(a, b, c)`: an inline wrapper that forwards its three
// arguments of type `vtype`, in order, to `_mm_<name>` and returns the result.
#define OPENCV_HAL_SSE_WRAP_3(name, vtype)                                      \
    inline vtype _v128_##name(const vtype& a, const vtype& b, const vtype& c)   \
    {                                                                           \
        return _mm_##name(a, b, c);                                             \
    }
26 
27 ///////////////////////////// XOP /////////////////////////////
28 
29 // [todo] define CV_XOP
// CV_XOP is not defined yet (see the todo above), so the SSE2 emulation is
// always selected for now.
#if 1 // !CV_XOP
/**
 * Lane-wise unsigned 32-bit comparison `a > b`.
 *
 * @param a left-hand operand, four unsigned 32-bit lanes
 * @param b right-hand operand, four unsigned 32-bit lanes
 * @return all-ones in every 32-bit lane where a > b, zero in the others
 */
inline __m128i _v128_comgt_epu32(const __m128i& a, const __m128i& b)
{
    // SSE2 only has a signed 32-bit compare. Flipping the sign bit of both
    // operands maps unsigned ordering onto signed ordering.
    const __m128i delta = _mm_set1_epi32((int)0x80000000);
    return _mm_cmpgt_epi32(_mm_xor_si128(a, delta), _mm_xor_si128(b, delta));
}
// wrapping XOP
#else
// The wrap macro adds the `_v128_` / `_mm_` prefixes itself, so only the bare
// intrinsic name is passed here.
OPENCV_HAL_SSE_WRAP_2(comgt_epu32, __m128i)
#endif // !CV_XOP
40 
41 ///////////////////////////// SSE4.1 /////////////////////////////
42 
43 #if !CV_SSE4_1
44 
45 /** Swizzle **/
/**
 * SSE2 emulation of _mm_blendv_epi8.
 *
 * For every byte, the result takes the byte of `b` when the most significant
 * bit of the corresponding `mask` byte is set, and the byte of `a` otherwise.
 *
 * @param a    bytes selected where the mask byte's top bit is clear
 * @param b    bytes selected where the mask byte's top bit is set
 * @param mask per-byte selector; only bit 7 of each byte is significant
 */
inline __m128i _v128_blendv_epi8(const __m128i& a, const __m128i& b, const __m128i& mask)
{
    // Expand bit 7 of each mask byte to a full 0xFF / 0x00 byte so that masks
    // other than all-ones / all-zeros select whole bytes, as the native
    // instruction does. Comparison-style masks pass through unchanged.
    const __m128i byte_mask = _mm_cmpgt_epi8(_mm_setzero_si128(), mask);
    // a ^ ((a ^ b) & m) yields b where m is set and a where it is clear.
    return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(b, a), byte_mask));
}
48 
49 /** Convert **/
50 // 8 >> 16
/** Zero-extends the eight low unsigned bytes of `a` into eight 16-bit lanes. */
inline __m128i _v128_cvtepu8_epi16(const __m128i& a)
{
    // Interleaving with zero bytes puts 0x00 in the upper half of every lane.
    return _mm_unpacklo_epi8(a, _mm_setzero_si128());
}
/** Sign-extends the eight low signed bytes of `a` into eight 16-bit lanes. */
inline __m128i _v128_cvtepi8_epi16(const __m128i& a)
{
    // Duplicate each byte into both halves of a 16-bit lane, then shift
    // arithmetically so the byte's sign fills the upper half.
    const __m128i doubled = _mm_unpacklo_epi8(a, a);
    return _mm_srai_epi16(doubled, 8);
}
58 // 8 >> 32
/** Zero-extends the four low unsigned bytes of `a` into four 32-bit lanes. */
inline __m128i _v128_cvtepu8_epi32(const __m128i& a)
{
    const __m128i zero = _mm_setzero_si128();
    // Widen 8 -> 16 bits, then 16 -> 32 bits, padding with zeros each time.
    const __m128i widened16 = _mm_unpacklo_epi8(a, zero);
    return _mm_unpacklo_epi16(widened16, zero);
}
/** Sign-extends the four low signed bytes of `a` into four 32-bit lanes. */
inline __m128i _v128_cvtepi8_epi32(const __m128i& a)
{
    // Two rounds of self-interleaving replicate each source byte four times,
    // so every 32-bit lane holds its byte in the top position.
    const __m128i pairs = _mm_unpacklo_epi8(a, a);
    const __m128i quads = _mm_unpacklo_epi8(pairs, pairs);
    // The arithmetic shift brings the byte down and fills with its sign.
    return _mm_srai_epi32(quads, 24);
}
70 // 16 >> 32
/** Zero-extends the four low unsigned 16-bit lanes of `a` into four 32-bit lanes. */
inline __m128i _v128_cvtepu16_epi32(const __m128i& a)
{
    // Interleaving with zero words clears the upper half of every 32-bit lane.
    return _mm_unpacklo_epi16(a, _mm_setzero_si128());
}
/** Sign-extends the four low signed 16-bit lanes of `a` into four 32-bit lanes. */
inline __m128i _v128_cvtepi16_epi32(const __m128i& a)
{
    // Duplicate each word into both halves of a 32-bit lane, then shift
    // arithmetically so the word's sign fills the upper half.
    const __m128i doubled = _mm_unpacklo_epi16(a, a);
    return _mm_srai_epi32(doubled, 16);
}
78 // 32 >> 64
/** Zero-extends the two low unsigned 32-bit lanes of `a` into two 64-bit lanes. */
inline __m128i _v128_cvtepu32_epi64(const __m128i& a)
{
    // Interleaving with zero dwords clears the upper half of every 64-bit lane.
    return _mm_unpacklo_epi32(a, _mm_setzero_si128());
}
/** Sign-extends the two low signed 32-bit lanes of `a` into two 64-bit lanes. */
inline __m128i _v128_cvtepi32_epi64(const __m128i& a)
{
    // Shifting by 31 yields all-ones for negative lanes and zero otherwise,
    // which is exactly the upper half of the sign-extended value.
    const __m128i sign_fill = _mm_srai_epi32(a, 31);
    return _mm_unpacklo_epi32(a, sign_fill);
}
86 
87 /** Arithmetic **/
/**
 * Multiplies the four 32-bit lanes of `a` and `b` and keeps the low 32 bits
 * of each product (SSE2 emulation of _mm_mullo_epi32).
 */
inline __m128i _v128_mullo_epi32(const __m128i& a, const __m128i& b)
{
    // _mm_mul_epu32 multiplies lanes 0 and 2 only, producing 64-bit products.
    const __m128i even_products = _mm_mul_epu32(a, b);
    // Shift lanes 1 and 3 down into the even positions and multiply them too.
    const __m128i a_odd = _mm_srli_epi64(a, 32);
    const __m128i b_odd = _mm_srli_epi64(b, 32);
    const __m128i odd_products = _mm_mul_epu32(a_odd, b_odd);
    // Interleave so the low dwords of the products end up in the low 64 bits
    // of each intermediate, then join those halves in lane order 0,1,2,3.
    const __m128i lanes01 = _mm_unpacklo_epi32(even_products, odd_products);
    const __m128i lanes23 = _mm_unpackhi_epi32(even_products, odd_products);
    return _mm_unpacklo_epi64(lanes01, lanes23);
}
96 
97 /** Math **/
_v128_min_epu32(const __m128i & a,const __m128i & b)98 inline __m128i _v128_min_epu32(const __m128i& a, const __m128i& b)
99 { return _v128_blendv_epi8(a, b, _v128_comgt_epu32(a, b)); }
100 
101 // wrapping SSE4.1
102 #else
// With SSE4.1 available, every helper emulated in the branch above is a direct
// forward to the native intrinsic of the same name.

/** Swizzle **/
OPENCV_HAL_SSE_WRAP_3(blendv_epi8, __m128i)

/** Convert **/
// 8 >> 16
OPENCV_HAL_SSE_WRAP_1(cvtepu8_epi16, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepi8_epi16, __m128i)
// 8 >> 32
OPENCV_HAL_SSE_WRAP_1(cvtepu8_epi32, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepi8_epi32, __m128i)
// 16 >> 32
OPENCV_HAL_SSE_WRAP_1(cvtepu16_epi32, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepi16_epi32, __m128i)
// 32 >> 64
OPENCV_HAL_SSE_WRAP_1(cvtepu32_epi64, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepi32_epi64, __m128i)

/** Arithmetic **/
OPENCV_HAL_SSE_WRAP_2(mullo_epi32, __m128i)

/** Math **/
OPENCV_HAL_SSE_WRAP_2(min_epu32, __m128i)
114 #endif // !CV_SSE4_1
115 
116 ///////////////////////////// Revolutionary /////////////////////////////
117 
118 /** Convert **/
119 // 16 << 8
/** Zero-extends the eight high unsigned bytes of `a` into eight 16-bit lanes. */
inline __m128i _v128_cvtepu8_epi16_high(const __m128i& a)
{
    // Interleaving the upper bytes with zeros clears the top half of each lane.
    return _mm_unpackhi_epi8(a, _mm_setzero_si128());
}
/** Sign-extends the eight high signed bytes of `a` into eight 16-bit lanes. */
inline __m128i _v128_cvtepi8_epi16_high(const __m128i& a)
{
    // Duplicate each upper byte into both halves of a 16-bit lane, then shift
    // arithmetically so the byte's sign fills the upper half.
    const __m128i doubled = _mm_unpackhi_epi8(a, a);
    return _mm_srai_epi16(doubled, 8);
}
127 // 32 << 16
/** Zero-extends the four high unsigned 16-bit lanes of `a` into four 32-bit lanes. */
inline __m128i _v128_cvtepu16_epi32_high(const __m128i& a)
{
    // Interleaving the upper words with zeros clears the top half of each lane.
    return _mm_unpackhi_epi16(a, _mm_setzero_si128());
}
/** Sign-extends the four high signed 16-bit lanes of `a` into four 32-bit lanes. */
inline __m128i _v128_cvtepi16_epi32_high(const __m128i& a)
{
    // Duplicate each upper word into both halves of a 32-bit lane, then shift
    // arithmetically so the word's sign fills the upper half.
    const __m128i doubled = _mm_unpackhi_epi16(a, a);
    return _mm_srai_epi32(doubled, 16);
}
135 // 64 << 32
/** Zero-extends the two high unsigned 32-bit lanes of `a` into two 64-bit lanes. */
inline __m128i _v128_cvtepu32_epi64_high(const __m128i& a)
{
    // Interleaving the upper dwords with zeros clears the top half of each lane.
    return _mm_unpackhi_epi32(a, _mm_setzero_si128());
}
/** Sign-extends the two high signed 32-bit lanes of `a` into two 64-bit lanes. */
inline __m128i _v128_cvtepi32_epi64_high(const __m128i& a)
{
    // Shifting by 31 yields all-ones for negative lanes and zero otherwise,
    // which is exactly the upper half of the sign-extended value.
    const __m128i sign_fill = _mm_srai_epi32(a, 31);
    return _mm_unpackhi_epi32(a, sign_fill);
}
143 
144 /** Miscellaneous **/
_v128_packs_epu32(const __m128i & a,const __m128i & b)145 inline __m128i _v128_packs_epu32(const __m128i& a, const __m128i& b)
146 {
147     const __m128i m = _mm_set1_epi32(65535);
148     __m128i am = _v128_min_epu32(a, m);
149     __m128i bm = _v128_min_epu32(b, m);
150 #if CV_SSE4_1
151     return _mm_packus_epi32(am, bm);
152 #else
153     const __m128i d = _mm_set1_epi32(32768), nd = _mm_set1_epi16(-32768);
154     am = _mm_sub_epi32(am, d);
155     bm = _mm_sub_epi32(bm, d);
156     am = _mm_packs_epi32(am, bm);
157     return _mm_sub_epi16(am, nd);
158 #endif
159 }
160 
161 template<int i>
_v128_extract_epi64(const __m128i & a)162 inline int64 _v128_extract_epi64(const __m128i& a)
163 {
164 #if defined(CV__SIMD_HAVE_mm_extract_epi64) || (CV_SSE4_1 && (defined(__x86_64__)/*GCC*/ || defined(_M_X64)/*MSVC*/))
165 #define CV__SIMD_NATIVE_mm_extract_epi64 1
166     return _mm_extract_epi64(a, i);
167 #else
168     CV_DECL_ALIGNED(16) int64 tmp[2];
169     _mm_store_si128((__m128i*)tmp, a);
170     return tmp[i];
171 #endif
172 }
173 
174 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
175 
176 //! @endcond
177 
178 } // cv::
179 
180 #endif // OPENCV_HAL_INTRIN_SSE_EM_HPP
181