/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_SSE_INTRINSICS_H_
#define VC_SSE_INTRINSICS_H_

#ifdef Vc_MSVC
#include <intrin.h>
#else
#include <x86intrin.h>
#endif

#include "../common/storage.h"
#include "const_data.h"
#include <cstdlib>
#include "types.h"
#include "debug.h"

#if defined(Vc_GCC) && !defined(__OPTIMIZE__)
// GCC uses lots of old-style casts in macros that masquerade as intrinsics
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wold-style-cast"
#endif

namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
using SSE::c_general;

constexpr std::size_t VectorAlignment = 16;

#if defined(Vc_GCC) && Vc_GCC < 0x40600 && !defined(Vc_DONT_FIX_SSE_SHIFT)
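// Old GCC (< 4.6) does not reliably emit the intended instructions for the
// variable-count vector shift intrinsics, hence these inline-assembly
// replacements (define Vc_DONT_FIX_SSE_SHIFT to keep the builtin versions).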
static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi16(__m128i a, __m128i count) { __asm__("psllw %1,%0" : "+x"(a) : "x"(count)); return a; }
static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi32(__m128i a, __m128i count) { __asm__("pslld %1,%0" : "+x"(a) : "x"(count)); return a; }
static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi64(__m128i a, __m128i count) { __asm__("psllq %1,%0" : "+x"(a) : "x"(count)); return a; }
static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi16(__m128i a, __m128i count) { __asm__("psrlw %1,%0" : "+x"(a) : "x"(count)); return a; }
static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi32(__m128i a, __m128i count) { __asm__("psrld %1,%0" : "+x"(a) : "x"(count)); return a; }
static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi64(__m128i a, __m128i count) { __asm__("psrlq %1,%0" : "+x"(a) : "x"(count)); return a; }
#endif

#ifdef Vc_GCC
// Redefine the mul/add/sub intrinsics to use GCC-specific operators instead of builtin
// functions. This way the fp-contraction optimization step kicks in and creates FMAs! :)
static Vc_INTRINSIC Vc_CONST __m128d _mm_mul_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) * static_cast<__v2df>(b)); }
static Vc_INTRINSIC Vc_CONST __m128d _mm_add_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) + static_cast<__v2df>(b)); }
static Vc_INTRINSIC Vc_CONST __m128d _mm_sub_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) - static_cast<__v2df>(b)); }
static Vc_INTRINSIC Vc_CONST __m128  _mm_mul_ps(__m128  a, __m128  b) { return static_cast<__m128 >(static_cast<__v4sf>(a) * static_cast<__v4sf>(b)); }
static Vc_INTRINSIC Vc_CONST __m128  _mm_add_ps(__m128  a, __m128  b) { return static_cast<__m128 >(static_cast<__v4sf>(a) + static_cast<__v4sf>(b)); }
static Vc_INTRINSIC Vc_CONST __m128  _mm_sub_ps(__m128  a, __m128  b) { return static_cast<__m128 >(static_cast<__v4sf>(a) - static_cast<__v4sf>(b)); }
#endif
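
// Sketch (illustration only): with the overloads above, a call such as
//   __m128d r = _mm_add_pd(_mm_mul_pd(x, y), z);
// compiles to the plain expression x * y + z, which GCC's fp-contraction pass
// may then fuse into a single FMA instruction on targets that support it.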

static Vc_INTRINSIC Vc_CONST __m128i _mm_setallone_si128() { return _mm_load_si128(reinterpret_cast<const __m128i *>(Common::AllBitsSet)); }
static Vc_INTRINSIC Vc_CONST __m128d _mm_setallone_pd() { return _mm_load_pd(reinterpret_cast<const double *>(Common::AllBitsSet)); }
static Vc_INTRINSIC Vc_CONST __m128  _mm_setallone_ps() { return _mm_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet)); }

static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi16() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one16)); }
static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu16() { return _mm_setone_epi16(); }
static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi32() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one32)); }
static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu32() { return _mm_setone_epi32(); }

static Vc_INTRINSIC __m128  Vc_CONST _mm_setone_ps() { return _mm_load_ps(c_general::oneFloat); }
static Vc_INTRINSIC __m128d Vc_CONST _mm_setone_pd() { return _mm_load_pd(c_general::oneDouble); }

static Vc_INTRINSIC __m128d Vc_CONST _mm_setabsmask_pd() { return _mm_load_pd(reinterpret_cast<const double *>(c_general::absMaskDouble)); }
static Vc_INTRINSIC __m128  Vc_CONST _mm_setabsmask_ps() { return _mm_load_ps(reinterpret_cast<const float *>(c_general::absMaskFloat)); }
static Vc_INTRINSIC __m128d Vc_CONST _mm_setsignmask_pd() { return _mm_load_pd(reinterpret_cast<const double *>(c_general::signMaskDouble)); }
static Vc_INTRINSIC __m128  Vc_CONST _mm_setsignmask_ps() { return _mm_load_ps(reinterpret_cast<const float *>(c_general::signMaskFloat)); }

static Vc_INTRINSIC __m128i Vc_CONST setmin_epi8 () { return _mm_set1_epi8(-0x80); }
static Vc_INTRINSIC __m128i Vc_CONST setmin_epi16() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::minShort)); }
// 0x80000000 is both the smallest signed 32-bit value and the float sign mask
static Vc_INTRINSIC __m128i Vc_CONST setmin_epi32() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::signMaskFloat)); }

#if defined(Vc_IMPL_XOP)
static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu8 (__m128i a, __m128i b) { return _mm_comgt_epu8 (a, b); }
static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu16(__m128i a, __m128i b) { return _mm_comlt_epu16(a, b); }
static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu16(__m128i a, __m128i b) { return _mm_comgt_epu16(a, b); }
static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu32(__m128i a, __m128i b) { return _mm_comlt_epu32(a, b); }
static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu32(__m128i a, __m128i b) { return _mm_comgt_epu32(a, b); }
static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu64(__m128i a, __m128i b) { return _mm_comlt_epu64(a, b); }
#else
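// Without XOP there are no unsigned SIMD compare instructions. Emulate them
// with the usual bias trick: xor-ing both operands with the minimum signed
// value flips the sign bit, which maps unsigned order onto signed order, so
// the signed compare intrinsics then produce the unsigned result.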
static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu8(__m128i a, __m128i b)
{
    return _mm_cmpgt_epi8(_mm_xor_si128(a, setmin_epi8()),
                          _mm_xor_si128(b, setmin_epi8()));
}
static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu16(__m128i a, __m128i b)
{
    return _mm_cmplt_epi16(_mm_xor_si128(a, setmin_epi16()),
                           _mm_xor_si128(b, setmin_epi16()));
}
static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu16(__m128i a, __m128i b)
{
    return _mm_cmpgt_epi16(_mm_xor_si128(a, setmin_epi16()),
                           _mm_xor_si128(b, setmin_epi16()));
}
static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu32(__m128i a, __m128i b)
{
    return _mm_cmplt_epi32(_mm_xor_si128(a, setmin_epi32()),
                           _mm_xor_si128(b, setmin_epi32()));
}
static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu32(__m128i a, __m128i b)
{
    return _mm_cmpgt_epi32(_mm_xor_si128(a, setmin_epi32()),
                           _mm_xor_si128(b, setmin_epi32()));
}
Vc_INTRINSIC __m128i Vc_CONST cmpgt_epi64(__m128i a, __m128i b)
{
#ifdef Vc_IMPL_SSE4_2
    return _mm_cmpgt_epi64(a, b);
#else
    const auto aa = _mm_xor_si128(a, _mm_srli_epi64(setmin_epi32(), 32));
    const auto bb = _mm_xor_si128(b, _mm_srli_epi64(setmin_epi32(), 32));
    const auto gt = _mm_cmpgt_epi32(aa, bb);
    const auto eq = _mm_cmpeq_epi32(aa, bb);
    // Algorithm:
    // 1. if the high 32 bits of gt are true, make the full 64 bits true
    // 2. if the high 32 bits of gt are false and the high 32 bits of eq are true,
    //    duplicate the low 32 bits of gt to the high 32 bits (note that this requires
    //    an unsigned compare on the lower 32 bits, which is the reason for the xors
    //    above)
    // 3. else make the full 64 bits false

    const auto gt2 =
        _mm_shuffle_epi32(gt, 0xf5);  // dup the high 32 bits of each qword to its low 32 bits
    const auto lo =
        _mm_shuffle_epi32(_mm_and_si128(_mm_srli_epi64(eq, 32), gt), 0xa0);  // dup the low dwords
    return _mm_or_si128(gt2, lo);
#endif
}
#endif
} // namespace SseIntrinsics
} // namespace Vc

// SSSE3
#ifdef Vc_IMPL_SSSE3
namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
// not overriding _mm_set1_epi8 because this one should only be used for non-constants
Vc_INTRINSIC Vc_CONST __m128i abs_epi8 (__m128i a) { return _mm_abs_epi8 (a); }
Vc_INTRINSIC Vc_CONST __m128i abs_epi16(__m128i a) { return _mm_abs_epi16(a); }
Vc_INTRINSIC Vc_CONST __m128i abs_epi32(__m128i a) { return _mm_abs_epi32(a); }
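
// Usage sketch (illustration only): for a byte count s in [1, 15],
//   alignr_epi8<s>(a, b)
// yields bytes s..15 of b followed by bytes 0..s-1 of a, i.e. the 16 bytes at
// offset s of the 32-byte concatenation [b (low), a (high)].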
template <int s> Vc_INTRINSIC Vc_CONST __m128i alignr_epi8(__m128i a, __m128i b)
{
    return _mm_alignr_epi8(a, b, s & 0x1fu);
}
} // namespace SseIntrinsics
} // namespace Vc

#else

namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
Vc_INTRINSIC Vc_CONST __m128i abs_epi8 (__m128i a) {
    __m128i negative = _mm_cmplt_epi8 (a, _mm_setzero_si128());
    return _mm_add_epi8 (_mm_xor_si128(a, negative), _mm_and_si128(negative, _mm_set1_epi8(1)));
}
// The xor/add sequence above (and in abs_epi16/abs_epi32 below) computes |a|
// from the all-ones/all-zeros compare mask:
// positive value:
//   negative == 0
//   a unchanged after xor
//   a + 0 -> a
// negative value:
//   negative == -1
//   a xor -1 -> -a - 1
//   -a - 1 + 1 -> -a
// The +1 is obtained via "negative & 1" for epi8 (there is no 8-bit shift) and
// via "negative >> 15" resp. "negative >> 31" for epi16/epi32.
Vc_INTRINSIC Vc_CONST __m128i abs_epi16(__m128i a) {
    __m128i negative = _mm_cmplt_epi16(a, _mm_setzero_si128());
    return _mm_add_epi16(_mm_xor_si128(a, negative), _mm_srli_epi16(negative, 15));
}
Vc_INTRINSIC Vc_CONST __m128i abs_epi32(__m128i a) {
    __m128i negative = _mm_cmplt_epi32(a, _mm_setzero_si128());
    return _mm_add_epi32(_mm_xor_si128(a, negative), _mm_srli_epi32(negative, 31));
}
template <int s> Vc_INTRINSIC Vc_CONST __m128i alignr_epi8(__m128i a, __m128i b)
{
    switch (s & 0x1fu) {
    case 0: return b;
    case 1: return _mm_or_si128(_mm_slli_si128(a, 15), _mm_srli_si128(b, 1));
    case 2: return _mm_or_si128(_mm_slli_si128(a, 14), _mm_srli_si128(b, 2));
    case 3: return _mm_or_si128(_mm_slli_si128(a, 13), _mm_srli_si128(b, 3));
    case 4: return _mm_or_si128(_mm_slli_si128(a, 12), _mm_srli_si128(b, 4));
    case 5: return _mm_or_si128(_mm_slli_si128(a, 11), _mm_srli_si128(b, 5));
    case 6: return _mm_or_si128(_mm_slli_si128(a, 10), _mm_srli_si128(b, 6));
    case 7: return _mm_or_si128(_mm_slli_si128(a, 9), _mm_srli_si128(b, 7));
    case 8: return _mm_or_si128(_mm_slli_si128(a, 8), _mm_srli_si128(b, 8));
    case 9: return _mm_or_si128(_mm_slli_si128(a, 7), _mm_srli_si128(b, 9));
    case 10: return _mm_or_si128(_mm_slli_si128(a, 6), _mm_srli_si128(b, 10));
    case 11: return _mm_or_si128(_mm_slli_si128(a, 5), _mm_srli_si128(b, 11));
    case 12: return _mm_or_si128(_mm_slli_si128(a, 4), _mm_srli_si128(b, 12));
    case 13: return _mm_or_si128(_mm_slli_si128(a, 3), _mm_srli_si128(b, 13));
    case 14: return _mm_or_si128(_mm_slli_si128(a, 2), _mm_srli_si128(b, 14));
    case 15: return _mm_or_si128(_mm_slli_si128(a, 1), _mm_srli_si128(b, 15));
    case 16: return a;
    case 17: return _mm_srli_si128(a, 1);
    case 18: return _mm_srli_si128(a, 2);
    case 19: return _mm_srli_si128(a, 3);
    case 20: return _mm_srli_si128(a, 4);
    case 21: return _mm_srli_si128(a, 5);
    case 22: return _mm_srli_si128(a, 6);
    case 23: return _mm_srli_si128(a, 7);
    case 24: return _mm_srli_si128(a, 8);
    case 25: return _mm_srli_si128(a, 9);
    case 26: return _mm_srli_si128(a, 10);
    case 27: return _mm_srli_si128(a, 11);
    case 28: return _mm_srli_si128(a, 12);
    case 29: return _mm_srli_si128(a, 13);
    case 30: return _mm_srli_si128(a, 14);
    case 31: return _mm_srli_si128(a, 15);
    }
    return _mm_setzero_si128();
}
} // namespace SseIntrinsics
} // namespace Vc
#endif

// SSE4.1
#ifdef Vc_IMPL_SSE4_1
namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
Vc_INTRINSIC Vc_CONST __m128i cmpeq_epi64(__m128i a, __m128i b)
{
    return _mm_cmpeq_epi64(a, b);
}
template <int index> Vc_INTRINSIC Vc_CONST int extract_epi32(__m128i v)
{
    return _mm_extract_epi32(v, index);
}
Vc_INTRINSIC Vc_CONST __m128d blendv_pd(__m128d a, __m128d b, __m128d c)
{
    return _mm_blendv_pd(a, b, c);
}
Vc_INTRINSIC Vc_CONST __m128 blendv_ps(__m128 a, __m128 b, __m128 c)
{
    return _mm_blendv_ps(a, b, c);
}
Vc_INTRINSIC Vc_CONST __m128i blendv_epi8(__m128i a, __m128i b, __m128i c)
{
    return _mm_blendv_epi8(a, b, c);
}
template <int mask> Vc_INTRINSIC Vc_CONST __m128d blend_pd(__m128d a, __m128d b)
{
    return _mm_blend_pd(a, b, mask);
}
template <int mask> Vc_INTRINSIC Vc_CONST __m128 blend_ps(__m128 a, __m128 b)
{
    return _mm_blend_ps(a, b, mask);
}
template <int mask> Vc_INTRINSIC Vc_CONST __m128i blend_epi16(__m128i a, __m128i b)
{
    return _mm_blend_epi16(a, b, mask);
}
Vc_INTRINSIC Vc_CONST __m128i max_epi8(__m128i a, __m128i b)
{
    return _mm_max_epi8(a, b);
}
Vc_INTRINSIC Vc_CONST __m128i max_epi32(__m128i a, __m128i b)
{
    return _mm_max_epi32(a, b);
}
Vc_INTRINSIC Vc_CONST __m128i max_epu16(__m128i a, __m128i b)
{
    return _mm_max_epu16(a, b);
}
Vc_INTRINSIC Vc_CONST __m128i max_epu32(__m128i a, __m128i b)
{
    return _mm_max_epu32(a, b);
}
Vc_INTRINSIC Vc_CONST __m128i min_epu16(__m128i a, __m128i b)
{
    return _mm_min_epu16(a, b);
}
Vc_INTRINSIC Vc_CONST __m128i min_epu32(__m128i a, __m128i b)
{
    return _mm_min_epu32(a, b);
}
Vc_INTRINSIC Vc_CONST __m128i min_epi8(__m128i a, __m128i b)
{
    return _mm_min_epi8(a, b);
}
Vc_INTRINSIC Vc_CONST __m128i min_epi32(__m128i a, __m128i b)
{
    return _mm_min_epi32(a, b);
}
Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi16(__m128i epu8)
{
    return _mm_cvtepu8_epi16(epu8);
}
Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi16(__m128i epi8)
{
    return _mm_cvtepi8_epi16(epi8);
}
Vc_INTRINSIC Vc_CONST __m128i cvtepu16_epi32(__m128i epu16)
{
    return _mm_cvtepu16_epi32(epu16);
}
Vc_INTRINSIC Vc_CONST __m128i cvtepi16_epi32(__m128i epi16)
{
    return _mm_cvtepi16_epi32(epi16);
}
Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi32(__m128i epu8)
{
    return _mm_cvtepu8_epi32(epu8);
}
Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi32(__m128i epi8)
{
    return _mm_cvtepi8_epi32(epi8);
}
} // namespace SseIntrinsics
} // namespace Vc
#else

namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
Vc_INTRINSIC Vc_CONST __m128i cmpeq_epi64(__m128i a, __m128i b) {
    auto tmp = _mm_cmpeq_epi32(a, b);
    // 0xb1 swaps the two dwords within each qword; a qword compares equal iff
    // both of its dwords compared equal
    return _mm_and_si128(tmp, _mm_shuffle_epi32(tmp, 1*1 + 0*4 + 3*16 + 2*64));
}
template <int index> Vc_INTRINSIC Vc_CONST int extract_epi32(__m128i v)
{
#ifdef Vc_USE_BUILTIN_VECTOR_TYPES
    typedef int int32v4 __attribute__((__vector_size__(16)));
    return aliasing_cast<int32v4>(v)[index];
#else
    return _mm_cvtsi128_si32(_mm_srli_si128(v, index * 4));
#endif
}
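
// The blendv fallbacks below use the classic select idiom (a & ~c) | (b & c).
// Unlike the SSE4.1 blendv instructions, which inspect only the sign bit of
// each element, these variants blend bitwise, so c must be a full-width mask
// (every bit of a selected element set or cleared).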
Vc_INTRINSIC Vc_CONST __m128d blendv_pd(__m128d a, __m128d b, __m128d c) {
#ifdef Vc_GCC
    return reinterpret_cast<__m128d>(
        (~reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(a)) |
        (reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(b)));
#else
    return _mm_or_pd(_mm_andnot_pd(c, a), _mm_and_pd(c, b));
#endif
}
Vc_INTRINSIC Vc_CONST __m128 blendv_ps(__m128 a, __m128 b, __m128 c) {
#ifdef Vc_GCC
    return reinterpret_cast<__m128>(
        (~reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(a)) |
        (reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(b)));
#else
    return _mm_or_ps(_mm_andnot_ps(c, a), _mm_and_ps(c, b));
#endif
}
Vc_INTRINSIC Vc_CONST __m128i blendv_epi8(__m128i a, __m128i b, __m128i c) {
#ifdef Vc_GCC
    return (~c & a) | (c & b);
#else
    return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b));
#endif
}

// Use the following blend functions only with compile-time-constant masks and,
// of course, with optimization enabled.
template <int mask> Vc_INTRINSIC Vc_CONST __m128d blend_pd(__m128d a, __m128d b)
{
    switch (mask) {
    case 0x0:
        return a;
    case 0x1:
        return _mm_shuffle_pd(b, a, 2);
    case 0x2:
        return _mm_shuffle_pd(a, b, 2);
    case 0x3:
        return b;
    default:
        abort();
        return a; // never reached, but without it MSVC warns 'not all control paths return a value'
    }
}
template <int mask> Vc_INTRINSIC Vc_CONST __m128 blend_ps(__m128 a, __m128 b)
{
    __m128i c;
    switch (mask) {
    case 0x0:
        return a;
    case 0x1:
        c = _mm_srli_si128(_mm_setallone_si128(), 12);
        break;
    case 0x2:
        c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 4);
        break;
    case 0x3:
        c = _mm_srli_si128(_mm_setallone_si128(), 8);
        break;
    case 0x4:
        c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 8);
        break;
    case 0x5:
        c = _mm_set_epi32(0, -1, 0, -1);
        break;
    case 0x6:
        c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 8), 4);
        break;
    case 0x7:
        c = _mm_srli_si128(_mm_setallone_si128(), 4);
        break;
    case 0x8:
        c = _mm_slli_si128(_mm_setallone_si128(), 12);
        break;
    case 0x9:
        c = _mm_set_epi32(-1, 0, 0, -1);
        break;
    case 0xa:
        c = _mm_set_epi32(-1, 0, -1, 0);
        break;
    case 0xb:
        c = _mm_set_epi32(-1, 0, -1, -1);
        break;
    case 0xc:
        c = _mm_slli_si128(_mm_setallone_si128(), 8);
        break;
    case 0xd:
        c = _mm_set_epi32(-1, -1, 0, -1);
        break;
    case 0xe:
        c = _mm_slli_si128(_mm_setallone_si128(), 4);
        break;
    case 0xf:
        return b;
    default: // may not happen
        abort();
        c = _mm_setzero_si128();
        break;
    }
    __m128 _c = _mm_castsi128_ps(c);
    return _mm_or_ps(_mm_andnot_ps(_c, a), _mm_and_ps(_c, b));
}
template <int mask> Vc_INTRINSIC Vc_CONST __m128i blend_epi16(__m128i a, __m128i b)
{
    __m128i c;
    switch (mask) {
    case 0x00:
        return a;
    case 0x01:
        c = _mm_srli_si128(_mm_setallone_si128(), 14);
        break;
    case 0x03:
        c = _mm_srli_si128(_mm_setallone_si128(), 12);
        break;
    case 0x07:
        c = _mm_srli_si128(_mm_setallone_si128(), 10);
        break;
    case 0x0f:
        return _mm_unpackhi_epi64(_mm_slli_si128(b, 8), a);
    case 0x1f:
        c = _mm_srli_si128(_mm_setallone_si128(), 6);
        break;
    case 0x3f:
        c = _mm_srli_si128(_mm_setallone_si128(), 4);
        break;
    case 0x7f:
        c = _mm_srli_si128(_mm_setallone_si128(), 2);
        break;
    case 0x80:
        c = _mm_slli_si128(_mm_setallone_si128(), 14);
        break;
    case 0xc0:
        c = _mm_slli_si128(_mm_setallone_si128(), 12);
        break;
    case 0xe0:
        c = _mm_slli_si128(_mm_setallone_si128(), 10);
        break;
    case 0xf0:
        c = _mm_slli_si128(_mm_setallone_si128(), 8);
        break;
    case 0xf8:
        c = _mm_slli_si128(_mm_setallone_si128(), 6);
        break;
    case 0xfc:
        c = _mm_slli_si128(_mm_setallone_si128(), 4);
        break;
    case 0xfe:
        c = _mm_slli_si128(_mm_setallone_si128(), 2);
        break;
    case 0xff:
        return b;
    case 0xcc:
        return _mm_unpacklo_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 3, 1)));
    case 0x33:
        return _mm_unpacklo_epi32(_mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 3, 1)));
    default:
        // generic case: the multiply moves bit i of the mask into the sign bit of
        // 16-bit lane i (lane 0 multiplies by 0x8001 because +0x8000 does not fit
        // into a signed short); the arithmetic shift then broadcasts that sign bit
        // across the whole lane
        const __m128i shift = _mm_set_epi16(0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, -0x7fff);
        c = _mm_srai_epi16(_mm_mullo_epi16(_mm_set1_epi16(mask), shift), 15);
        break;
    }
    return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b));
}

Vc_INTRINSIC Vc_CONST __m128i max_epi8 (__m128i a, __m128i b) {
    return blendv_epi8(b, a, _mm_cmpgt_epi8 (a, b));
}
Vc_INTRINSIC Vc_CONST __m128i max_epi32(__m128i a, __m128i b) {
    return blendv_epi8(b, a, _mm_cmpgt_epi32(a, b));
}
Vc_INTRINSIC Vc_CONST __m128i max_epu16(__m128i a, __m128i b) {
    return blendv_epi8(b, a, cmpgt_epu16(a, b));
}
Vc_INTRINSIC Vc_CONST __m128i max_epu32(__m128i a, __m128i b) {
    return blendv_epi8(b, a, cmpgt_epu32(a, b));
}
Vc_INTRINSIC Vc_CONST __m128i min_epu16(__m128i a, __m128i b) {
    return blendv_epi8(a, b, cmpgt_epu16(a, b));
}
Vc_INTRINSIC Vc_CONST __m128i min_epu32(__m128i a, __m128i b) {
    return blendv_epi8(a, b, cmpgt_epu32(a, b));
}
Vc_INTRINSIC Vc_CONST __m128i min_epi8 (__m128i a, __m128i b) {
    return blendv_epi8(a, b, _mm_cmpgt_epi8 (a, b));
}
Vc_INTRINSIC Vc_CONST __m128i min_epi32(__m128i a, __m128i b) {
    return blendv_epi8(a, b, _mm_cmpgt_epi32(a, b));
}
Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi16(__m128i epu8) {
    return _mm_unpacklo_epi8(epu8, _mm_setzero_si128());
}
Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi16(__m128i epi8) {
    return _mm_unpacklo_epi8(epi8, _mm_cmplt_epi8(epi8, _mm_setzero_si128()));
}
Vc_INTRINSIC Vc_CONST __m128i cvtepu16_epi32(__m128i epu16) {
    return _mm_unpacklo_epi16(epu16, _mm_setzero_si128());
}
Vc_INTRINSIC Vc_CONST __m128i cvtepi16_epi32(__m128i epi16) {
    return _mm_unpacklo_epi16(epi16, _mm_cmplt_epi16(epi16, _mm_setzero_si128()));
}
Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi32(__m128i epu8) {
    return cvtepu16_epi32(cvtepu8_epi16(epu8));
}
Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi32(__m128i epi8) {
    const __m128i neg = _mm_cmplt_epi8(epi8, _mm_setzero_si128());
    const __m128i epi16 = _mm_unpacklo_epi8(epi8, neg);
    return _mm_unpacklo_epi16(epi16, _mm_unpacklo_epi8(neg, neg));
}
} // namespace SseIntrinsics
} // namespace Vc
#endif

// SSE4.2
namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
static Vc_INTRINSIC Vc_PURE __m128 _mm_stream_load(const float *mem) {
#ifdef Vc_IMPL_SSE4_1
    return _mm_castsi128_ps(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<float *>(mem))));
#else
    return _mm_load_ps(mem);
#endif
}
static Vc_INTRINSIC Vc_PURE __m128d _mm_stream_load(const double *mem) {
#ifdef Vc_IMPL_SSE4_1
    return _mm_castsi128_pd(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<double *>(mem))));
#else
    return _mm_load_pd(mem);
#endif
}
static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const int *mem) {
#ifdef Vc_IMPL_SSE4_1
    return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<int *>(mem)));
#else
    return _mm_load_si128(reinterpret_cast<const __m128i *>(mem));
#endif
}
static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned int *mem) {
    return _mm_stream_load(reinterpret_cast<const int *>(mem));
}
static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const short *mem) {
    return _mm_stream_load(reinterpret_cast<const int *>(mem));
}
static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned short *mem) {
    return _mm_stream_load(reinterpret_cast<const int *>(mem));
}
static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const signed char *mem) {
    return _mm_stream_load(reinterpret_cast<const int *>(mem));
}
static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned char *mem) {
    return _mm_stream_load(reinterpret_cast<const int *>(mem));
}
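
// Note: _mm_stream_load_si128 (movntdqa) requires a 16-byte-aligned address and
// acts only as a non-temporal hint; without SSE4.1 these overloads degrade to
// ordinary aligned loads.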

#ifndef __x86_64__
// 32-bit x86 has no _mm_cvtsi64_si128; emulate it by loading the 64-bit value
// through the low half of an xmm register
Vc_INTRINSIC Vc_PURE __m128i _mm_cvtsi64_si128(int64_t x) {
    return _mm_castpd_si128(_mm_load_sd(reinterpret_cast<const double *>(&x)));
}
#endif

#ifdef Vc_IMPL_AVX2
template <int Scale> __m128 gather(const float *addr, __m128i idx)
{
    return _mm_i32gather_ps(addr, idx, Scale);
}
template <int Scale> __m128d gather(const double *addr, __m128i idx)
{
    return _mm_i32gather_pd(addr, idx, Scale);
}
template <int Scale> __m128i gather(const int *addr, __m128i idx)
{
    return _mm_i32gather_epi32(addr, idx, Scale);
}
template <int Scale> __m128i gather(const unsigned *addr, __m128i idx)
{
    return _mm_i32gather_epi32(aliasing_cast<int>(addr), idx, Scale);
}

template <int Scale> __m128 gather(__m128 src, __m128 k, const float *addr, __m128i idx)
{
    return _mm_mask_i32gather_ps(src, addr, idx, k, Scale);
}
template <int Scale>
__m128d gather(__m128d src, __m128d k, const double *addr, __m128i idx)
{
    return _mm_mask_i32gather_pd(src, addr, idx, k, Scale);
}
template <int Scale> __m128i gather(__m128i src, __m128i k, const int *addr, __m128i idx)
{
    return _mm_mask_i32gather_epi32(src, addr, idx, k, Scale);
}
template <int Scale>
__m128i gather(__m128i src, __m128i k, const unsigned *addr, __m128i idx)
{
    return _mm_mask_i32gather_epi32(src, aliasing_cast<int>(addr), idx, k, Scale);
}
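
// Semantics sketch: Scale is the byte stride applied to each 32-bit index, so
// gather<sizeof(float)>(addr, idx) returns { addr[idx[0]], ..., addr[idx[3]] }.
// In the masked overloads, lanes whose mask element has the sign bit cleared
// keep the corresponding lane of src instead of loading from memory.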
#endif

} // namespace SseIntrinsics
} // namespace Vc

namespace Vc_VERSIONED_NAMESPACE
{
namespace SSE
{
using namespace SseIntrinsics;

template <typename T> struct ParameterHelper
{
    typedef T ByValue;
    typedef T &Reference;
    typedef const T &ConstRef;
};

template <typename T> struct VectorHelper
{
};

template <typename T> struct VectorTypeHelper
{
    typedef __m128i Type;
};
template <> struct VectorTypeHelper<double>
{
    typedef __m128d Type;
};
template <> struct VectorTypeHelper<float>
{
    typedef __m128 Type;
};

template <typename T> struct DetermineGatherMask
{
    typedef T Type;
};

template <typename T> struct VectorTraits
{
    typedef typename VectorTypeHelper<T>::Type VectorType;
    using EntryType = T;
    static constexpr size_t Size = sizeof(VectorType) / sizeof(EntryType);
    typedef Mask<T> MaskType;
    typedef typename DetermineGatherMask<MaskType>::Type GatherMaskType;
    typedef Common::VectorMemoryUnion<VectorType, EntryType> StorageType;
};
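
// For instance, VectorTraits<float>::VectorType is __m128 with Size == 4, while
// VectorTraits<short> uses the __m128i default and gets Size == 8.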

template <typename T> struct VectorHelperSize;
} // namespace SSE
} // namespace Vc

#if defined(Vc_GCC) && !defined(__OPTIMIZE__)
#pragma GCC diagnostic pop
#endif

#include "shuffle.h"

#endif // VC_SSE_INTRINSICS_H_