/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_SSE_INTRINSICS_H_
#define VC_SSE_INTRINSICS_H_

#ifdef Vc_MSVC
#include <intrin.h>
#else
#include <x86intrin.h>
#endif

#include "../common/storage.h"
#include "const_data.h"
#include <cstdlib>
#include "types.h"
#include "debug.h"

#if defined(Vc_GCC) && !defined(__OPTIMIZE__)
// GCC uses lots of old-style casts in macros that masquerade as intrinsics
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wold-style-cast"
#endif

namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
    using SSE::c_general;

    constexpr std::size_t VectorAlignment = 16;

#if defined(Vc_GCC) && Vc_GCC < 0x40600 && !defined(Vc_DONT_FIX_SSE_SHIFT)
    // For GCC releases before 4.6, override the 128-bit shift intrinsics with inline-asm
    // implementations (define Vc_DONT_FIX_SSE_SHIFT to keep the builtins).
    static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi16(__m128i a, __m128i count) { __asm__("psllw %1,%0" : "+x"(a) : "x"(count)); return a; }
    static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi32(__m128i a, __m128i count) { __asm__("pslld %1,%0" : "+x"(a) : "x"(count)); return a; }
    static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi64(__m128i a, __m128i count) { __asm__("psllq %1,%0" : "+x"(a) : "x"(count)); return a; }
    static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi16(__m128i a, __m128i count) { __asm__("psrlw %1,%0" : "+x"(a) : "x"(count)); return a; }
    static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi32(__m128i a, __m128i count) { __asm__("psrld %1,%0" : "+x"(a) : "x"(count)); return a; }
    static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi64(__m128i a, __m128i count) { __asm__("psrlq %1,%0" : "+x"(a) : "x"(count)); return a; }
#endif

#ifdef Vc_GCC
    // Redefine the mul/add/sub intrinsics to use GCC-specific operators instead of builtin
    // functions. This way the fp-contraction optimization step kicks in and creates FMAs! :)
    static Vc_INTRINSIC Vc_CONST __m128d _mm_mul_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) * static_cast<__v2df>(b)); }
    static Vc_INTRINSIC Vc_CONST __m128d _mm_add_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) + static_cast<__v2df>(b)); }
    static Vc_INTRINSIC Vc_CONST __m128d _mm_sub_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) - static_cast<__v2df>(b)); }
    static Vc_INTRINSIC Vc_CONST __m128  _mm_mul_ps(__m128  a, __m128  b) { return static_cast<__m128 >(static_cast<__v4sf>(a) * static_cast<__v4sf>(b)); }
    static Vc_INTRINSIC Vc_CONST __m128  _mm_add_ps(__m128  a, __m128  b) { return static_cast<__m128 >(static_cast<__v4sf>(a) + static_cast<__v4sf>(b)); }
    static Vc_INTRINSIC Vc_CONST __m128  _mm_sub_ps(__m128  a, __m128  b) { return static_cast<__m128 >(static_cast<__v4sf>(a) - static_cast<__v4sf>(b)); }
#endif
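    // Illustration (a sketch, not part of the Vc API; `muladd` is a hypothetical caller):
    // with the operator-based overloads above, GCC's fp-contraction can fuse the multiply
    // and the add below into a single FMA instruction when the target supports it (e.g.
    // compiled with -mfma), whereas the builtin intrinsics would stay separate:
    //
    //     __m128d muladd(__m128d x, __m128d y, __m128d z)
    //     {
    //         return _mm_add_pd(_mm_mul_pd(x, y), z);  // may contract to a vfmadd instruction
    //     }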

    static Vc_INTRINSIC Vc_CONST __m128i _mm_setallone_si128() { return _mm_load_si128(reinterpret_cast<const __m128i *>(Common::AllBitsSet)); }
    static Vc_INTRINSIC Vc_CONST __m128d _mm_setallone_pd() { return _mm_load_pd(reinterpret_cast<const double *>(Common::AllBitsSet)); }
    static Vc_INTRINSIC Vc_CONST __m128  _mm_setallone_ps() { return _mm_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet)); }

    static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi16()  { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one16)); }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu16()  { return _mm_setone_epi16(); }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi32()  { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one32)); }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu32()  { return _mm_setone_epi32(); }

    static Vc_INTRINSIC __m128  Vc_CONST _mm_setone_ps()     { return _mm_load_ps(c_general::oneFloat); }
    static Vc_INTRINSIC __m128d Vc_CONST _mm_setone_pd()     { return _mm_load_pd(c_general::oneDouble); }

    static Vc_INTRINSIC __m128d Vc_CONST _mm_setabsmask_pd() { return _mm_load_pd(reinterpret_cast<const double *>(c_general::absMaskDouble)); }
    static Vc_INTRINSIC __m128  Vc_CONST _mm_setabsmask_ps() { return _mm_load_ps(reinterpret_cast<const float *>(c_general::absMaskFloat)); }
    static Vc_INTRINSIC __m128d Vc_CONST _mm_setsignmask_pd(){ return _mm_load_pd(reinterpret_cast<const double *>(c_general::signMaskDouble)); }
    static Vc_INTRINSIC __m128  Vc_CONST _mm_setsignmask_ps(){ return _mm_load_ps(reinterpret_cast<const float *>(c_general::signMaskFloat)); }

    static Vc_INTRINSIC __m128i Vc_CONST setmin_epi8 () { return _mm_set1_epi8(-0x80); }
    static Vc_INTRINSIC __m128i Vc_CONST setmin_epi16() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::minShort)); }
    static Vc_INTRINSIC __m128i Vc_CONST setmin_epi32() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::signMaskFloat)); }

#if defined(Vc_IMPL_XOP)
    static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu8(__m128i a, __m128i b) { return _mm_comgt_epu8(a, b); }
    static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu16(__m128i a, __m128i b) { return _mm_comlt_epu16(a, b); }
    static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu16(__m128i a, __m128i b) { return _mm_comgt_epu16(a, b); }
    static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu32(__m128i a, __m128i b) { return _mm_comlt_epu32(a, b); }
    static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu32(__m128i a, __m128i b) { return _mm_comgt_epu32(a, b); }
    static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu64(__m128i a, __m128i b) { return _mm_comlt_epu64(a, b); }
#else
    // Without XOP, emulate the unsigned compares with signed compares: flipping the sign
    // bit of both operands maps the unsigned ordering onto the signed ordering.
    static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu8(__m128i a, __m128i b)
    {
        return _mm_cmpgt_epi8(_mm_xor_si128(a, setmin_epi8()),
                              _mm_xor_si128(b, setmin_epi8()));
    }
    static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu16(__m128i a, __m128i b)
    {
        return _mm_cmplt_epi16(_mm_xor_si128(a, setmin_epi16()),
                               _mm_xor_si128(b, setmin_epi16()));
    }
    static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu16(__m128i a, __m128i b)
    {
        return _mm_cmpgt_epi16(_mm_xor_si128(a, setmin_epi16()),
                               _mm_xor_si128(b, setmin_epi16()));
    }
    static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu32(__m128i a, __m128i b)
    {
        return _mm_cmplt_epi32(_mm_xor_si128(a, setmin_epi32()),
                               _mm_xor_si128(b, setmin_epi32()));
    }
    static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu32(__m128i a, __m128i b)
    {
        return _mm_cmpgt_epi32(_mm_xor_si128(a, setmin_epi32()),
                               _mm_xor_si128(b, setmin_epi32()));
    }
    Vc_INTRINSIC __m128i Vc_CONST cmpgt_epi64(__m128i a, __m128i b)
    {
#ifdef Vc_IMPL_SSE4_2
        return _mm_cmpgt_epi64(a, b);
#else
        const auto aa = _mm_xor_si128(a, _mm_srli_epi64(setmin_epi32(), 32));
        const auto bb = _mm_xor_si128(b, _mm_srli_epi64(setmin_epi32(), 32));
        const auto gt = _mm_cmpgt_epi32(aa, bb);
        const auto eq = _mm_cmpeq_epi32(aa, bb);
        // Algorithm:
        // 1. if the high 32 bits of gt are true, make the full 64 bits true
        // 2. if the high 32 bits of gt are false and the high 32 bits of eq are true,
        //    duplicate the low 32 bits of gt to the high 32 bits (note that this requires
        //    unsigned compare on the lower 32 bits, which is the reason for the xors
        //    above)
        // 3. else make the full 64 bits false

        const auto gt2 =
            _mm_shuffle_epi32(gt, 0xf5);  // dup the high 32 bits to the low 32 bits
        const auto lo =
            _mm_shuffle_epi32(_mm_and_si128(_mm_srli_epi64(eq, 32), gt), 0xa0);
        return _mm_or_si128(gt2, lo);
#endif
    }
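    // Worked example for the SSE2 path above (values chosen purely for illustration): for
    // the 64-bit lane a = 0x0000000080000000, b = 0x0000000000000001 the high 32-bit words
    // are equal, so step 2 applies. Without the xor, the signed 32-bit compare of the low
    // words (0x80000000 is negative) would report a < b; with the sign bits flipped it
    // correctly reports a > b, and that result is broadcast to the full 64 bits.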
#endif
}  // namespace SseIntrinsics
}  // namespace Vc

// SSSE3
#ifdef Vc_IMPL_SSSE3
namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
    // not overriding _mm_set1_epi8 because this one should only be used for non-constants
    Vc_INTRINSIC Vc_CONST __m128i abs_epi8(__m128i a) { return _mm_abs_epi8(a); }
    Vc_INTRINSIC Vc_CONST __m128i abs_epi16(__m128i a) { return _mm_abs_epi16(a); }
    Vc_INTRINSIC Vc_CONST __m128i abs_epi32(__m128i a) { return _mm_abs_epi32(a); }
    template <int s> Vc_INTRINSIC Vc_CONST __m128i alignr_epi8(__m128i a, __m128i b)
    {
        return _mm_alignr_epi8(a, b, s & 0x1fu);
    }
}  // namespace SseIntrinsics
}  // namespace Vc

#else

namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
    Vc_INTRINSIC Vc_CONST __m128i abs_epi8 (__m128i a) {
        __m128i negative = _mm_cmplt_epi8 (a, _mm_setzero_si128());
        return _mm_add_epi8 (_mm_xor_si128(a, negative), _mm_and_si128(negative,  _mm_set1_epi8(1)));
    }
    // How the emulated abs works (the shift count below is written for the 32-bit case;
    // abs_epi16 shifts by 15, and abs_epi8 uses `negative & 1` since there is no 8-bit shift):
    // positive value:
    //   negative == 0
    //   a unchanged after xor
    //   0 >> 31 -> 0
    //   a + 0 -> a
    // negative value:
    //   negative == -1
    //   a xor -1 -> -a - 1
    //   -1 >> 31 -> 1
    //   -a - 1 + 1 -> -a
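    // Numeric illustration (not from the original source): for a = -5 in abs_epi8,
    // negative = -1, a ^ -1 = 4, and adding (negative & 1) = 1 yields 5; for a = 7,
    // negative = 0, so both the xor and the add are no-ops and 7 is returned unchanged.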
    Vc_INTRINSIC Vc_CONST __m128i abs_epi16(__m128i a) {
        __m128i negative = _mm_cmplt_epi16(a, _mm_setzero_si128());
        return _mm_add_epi16(_mm_xor_si128(a, negative), _mm_srli_epi16(negative, 15));
    }
    Vc_INTRINSIC Vc_CONST __m128i abs_epi32(__m128i a) {
        __m128i negative = _mm_cmplt_epi32(a, _mm_setzero_si128());
        return _mm_add_epi32(_mm_xor_si128(a, negative), _mm_srli_epi32(negative, 31));
    }
    template <int s> Vc_INTRINSIC Vc_CONST __m128i alignr_epi8(__m128i a, __m128i b)
    {
        switch (s & 0x1fu) {
            case  0: return b;
            case  1: return _mm_or_si128(_mm_slli_si128(a, 15), _mm_srli_si128(b,  1));
            case  2: return _mm_or_si128(_mm_slli_si128(a, 14), _mm_srli_si128(b,  2));
            case  3: return _mm_or_si128(_mm_slli_si128(a, 13), _mm_srli_si128(b,  3));
            case  4: return _mm_or_si128(_mm_slli_si128(a, 12), _mm_srli_si128(b,  4));
            case  5: return _mm_or_si128(_mm_slli_si128(a, 11), _mm_srli_si128(b,  5));
            case  6: return _mm_or_si128(_mm_slli_si128(a, 10), _mm_srli_si128(b,  6));
            case  7: return _mm_or_si128(_mm_slli_si128(a,  9), _mm_srli_si128(b,  7));
            case  8: return _mm_or_si128(_mm_slli_si128(a,  8), _mm_srli_si128(b,  8));
            case  9: return _mm_or_si128(_mm_slli_si128(a,  7), _mm_srli_si128(b,  9));
            case 10: return _mm_or_si128(_mm_slli_si128(a,  6), _mm_srli_si128(b, 10));
            case 11: return _mm_or_si128(_mm_slli_si128(a,  5), _mm_srli_si128(b, 11));
            case 12: return _mm_or_si128(_mm_slli_si128(a,  4), _mm_srli_si128(b, 12));
            case 13: return _mm_or_si128(_mm_slli_si128(a,  3), _mm_srli_si128(b, 13));
            case 14: return _mm_or_si128(_mm_slli_si128(a,  2), _mm_srli_si128(b, 14));
            case 15: return _mm_or_si128(_mm_slli_si128(a,  1), _mm_srli_si128(b, 15));
            case 16: return a;
            case 17: return _mm_srli_si128(a,  1);
            case 18: return _mm_srli_si128(a,  2);
            case 19: return _mm_srli_si128(a,  3);
            case 20: return _mm_srli_si128(a,  4);
            case 21: return _mm_srli_si128(a,  5);
            case 22: return _mm_srli_si128(a,  6);
            case 23: return _mm_srli_si128(a,  7);
            case 24: return _mm_srli_si128(a,  8);
            case 25: return _mm_srli_si128(a,  9);
            case 26: return _mm_srli_si128(a, 10);
            case 27: return _mm_srli_si128(a, 11);
            case 28: return _mm_srli_si128(a, 12);
            case 29: return _mm_srli_si128(a, 13);
            case 30: return _mm_srli_si128(a, 14);
            case 31: return _mm_srli_si128(a, 15);
        }
        return _mm_setzero_si128();
    }
}  // namespace SseIntrinsics
}  // namespace Vc
#endif

// SSE4.1
#ifdef Vc_IMPL_SSE4_1
namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
Vc_INTRINSIC Vc_CONST __m128i cmpeq_epi64(__m128i a, __m128i b)
{
    return _mm_cmpeq_epi64(a, b);
}
template <int index> Vc_INTRINSIC Vc_CONST int extract_epi32(__m128i v)
{
    return _mm_extract_epi32(v, index);
}
Vc_INTRINSIC Vc_CONST __m128d blendv_pd(__m128d a, __m128d b, __m128d c)
{
    return _mm_blendv_pd(a, b, c);
}
Vc_INTRINSIC Vc_CONST __m128 blendv_ps(__m128 a, __m128 b, __m128 c)
{
    return _mm_blendv_ps(a, b, c);
}
Vc_INTRINSIC Vc_CONST __m128i blendv_epi8(__m128i a, __m128i b, __m128i c)
{
    return _mm_blendv_epi8(a, b, c);
}
template <int mask> Vc_INTRINSIC Vc_CONST __m128d blend_pd(__m128d a, __m128d b)
{
    return _mm_blend_pd(a, b, mask);
}
template <int mask> Vc_INTRINSIC Vc_CONST __m128 blend_ps(__m128 a, __m128 b)
{
    return _mm_blend_ps(a, b, mask);
}
template <int mask> Vc_INTRINSIC Vc_CONST __m128i blend_epi16(__m128i a, __m128i b)
{
    return _mm_blend_epi16(a, b, mask);
}
Vc_INTRINSIC Vc_CONST __m128i max_epi8(__m128i a, __m128i b)
{
    return _mm_max_epi8(a, b);
}
Vc_INTRINSIC Vc_CONST __m128i max_epi32(__m128i a, __m128i b)
{
    return _mm_max_epi32(a, b);
}
Vc_INTRINSIC Vc_CONST __m128i max_epu16(__m128i a, __m128i b)
{
    return _mm_max_epu16(a, b);
}
Vc_INTRINSIC Vc_CONST __m128i max_epu32(__m128i a, __m128i b)
{
    return _mm_max_epu32(a, b);
}
Vc_INTRINSIC Vc_CONST __m128i min_epu16(__m128i a, __m128i b)
{
    return _mm_min_epu16(a, b);
}
Vc_INTRINSIC Vc_CONST __m128i min_epu32(__m128i a, __m128i b)
{
    return _mm_min_epu32(a, b);
}
Vc_INTRINSIC Vc_CONST __m128i min_epi8(__m128i a, __m128i b)
{
    return _mm_min_epi8(a, b);
}
Vc_INTRINSIC Vc_CONST __m128i min_epi32(__m128i a, __m128i b)
{
    return _mm_min_epi32(a, b);
}
Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi16(__m128i epu8)
{
    return _mm_cvtepu8_epi16(epu8);
}
Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi16(__m128i epi8)
{
    return _mm_cvtepi8_epi16(epi8);
}
Vc_INTRINSIC Vc_CONST __m128i cvtepu16_epi32(__m128i epu16)
{
    return _mm_cvtepu16_epi32(epu16);
}
Vc_INTRINSIC Vc_CONST __m128i cvtepi16_epi32(__m128i epu16)
{
    return _mm_cvtepi16_epi32(epu16);
}
Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi32(__m128i epu8)
{
    return _mm_cvtepu8_epi32(epu8);
}
Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi32(__m128i epi8)
{
    return _mm_cvtepi8_epi32(epi8);
}
}  // namespace SseIntrinsics
}  // namespace Vc
#else

namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
    Vc_INTRINSIC Vc_CONST __m128i cmpeq_epi64(__m128i a, __m128i b) {
        auto tmp = _mm_cmpeq_epi32(a, b);
        // A 64-bit lane is equal iff both of its 32-bit halves compare equal; the shuffle
        // swaps the two halves within each 64-bit lane so the AND combines both results.
        return _mm_and_si128(tmp, _mm_shuffle_epi32(tmp, 1*1 + 0*4 + 3*16 + 2*64));
    }
    template <int index> Vc_INTRINSIC Vc_CONST int extract_epi32(__m128i v)
    {
#ifdef Vc_USE_BUILTIN_VECTOR_TYPES
        typedef int int32v4 __attribute__((__vector_size__(16)));
        return aliasing_cast<int32v4>(v)[index];
#else
        // Shift the requested element down to position 0 and read it from the low 32 bits.
        return _mm_cvtsi128_si32(_mm_srli_si128(v, index * 4));
#endif
    }
    Vc_INTRINSIC Vc_CONST __m128d blendv_pd(__m128d a, __m128d b, __m128d c) {
#ifdef Vc_GCC
        return reinterpret_cast<__m128d>(
            (~reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(a)) |
            (reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(b)));
#else
        return _mm_or_pd(_mm_andnot_pd(c, a), _mm_and_pd(c, b));
#endif
    }
    Vc_INTRINSIC Vc_CONST __m128  blendv_ps(__m128  a, __m128  b, __m128  c) {
#ifdef Vc_GCC
        return reinterpret_cast<__m128>(
            (~reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(a)) |
            (reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(b)));
#else
        return _mm_or_ps(_mm_andnot_ps(c, a), _mm_and_ps(c, b));
#endif
    }
    Vc_INTRINSIC Vc_CONST __m128i blendv_epi8(__m128i a, __m128i b, __m128i c) {
#ifdef Vc_GCC
        return (~c & a) | (c & b);
#else
        return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b));
#endif
    }

    // Only use the following blend functions with an immediate (compile-time constant)
    // mask and, of course, with optimization enabled, so the switch is folded away.
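    // Usage sketch (illustration only, the variable names are hypothetical): bit i of the
    // mask selects element i from b, a cleared bit selects the element from a, e.g.
    //     __m128d lo_a_hi_b = blend_pd<0x2>(a, b);  // lane 0 from a, lane 1 from b
    //     __m128  mixed     = blend_ps<0x5>(a, b);  // lanes 0 and 2 from b, 1 and 3 from a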
    template <int mask> Vc_INTRINSIC Vc_CONST __m128d blend_pd(__m128d a, __m128d b)
    {
        switch (mask) {
        case 0x0:
            return a;
        case 0x1:
            return _mm_shuffle_pd(b, a, 2);
        case 0x2:
            return _mm_shuffle_pd(a, b, 2);
        case 0x3:
            return b;
        default:
            abort();
            return a; // unreachable, but without it MSVC warns that not all control paths return a value
        }
    }
    template <int mask> Vc_INTRINSIC Vc_CONST __m128 blend_ps(__m128 a, __m128 b)
    {
        __m128i c;
        switch (mask) {
        case 0x0:
            return a;
        case 0x1:
            c = _mm_srli_si128(_mm_setallone_si128(), 12);
            break;
        case 0x2:
            c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 4);
            break;
        case 0x3:
            c = _mm_srli_si128(_mm_setallone_si128(), 8);
            break;
        case 0x4:
            c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 8);
            break;
        case 0x5:
            c = _mm_set_epi32(0, -1, 0, -1);
            break;
        case 0x6:
            c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 8), 4);
            break;
        case 0x7:
            c = _mm_srli_si128(_mm_setallone_si128(), 4);
            break;
        case 0x8:
            c = _mm_slli_si128(_mm_setallone_si128(), 12);
            break;
        case 0x9:
            c = _mm_set_epi32(-1, 0, 0, -1);
            break;
        case 0xa:
            c = _mm_set_epi32(-1, 0, -1, 0);
            break;
        case 0xb:
            c = _mm_set_epi32(-1, 0, -1, -1);
            break;
        case 0xc:
            c = _mm_slli_si128(_mm_setallone_si128(), 8);
            break;
        case 0xd:
            c = _mm_set_epi32(-1, -1, 0, -1);
            break;
        case 0xe:
            c = _mm_slli_si128(_mm_setallone_si128(), 4);
            break;
        case 0xf:
            return b;
        default: // unreachable for valid masks
            abort();
            c = _mm_setzero_si128();
            break;
        }
        __m128 _c = _mm_castsi128_ps(c);
        return _mm_or_ps(_mm_andnot_ps(_c, a), _mm_and_ps(_c, b));
    }
    template <int mask> Vc_INTRINSIC Vc_CONST __m128i blend_epi16(__m128i a, __m128i b)
    {
        __m128i c;
        switch (mask) {
        case 0x00:
            return a;
        case 0x01:
            c = _mm_srli_si128(_mm_setallone_si128(), 14);
            break;
        case 0x03:
            c = _mm_srli_si128(_mm_setallone_si128(), 12);
            break;
        case 0x07:
            c = _mm_srli_si128(_mm_setallone_si128(), 10);
            break;
        case 0x0f:
            return _mm_unpackhi_epi64(_mm_slli_si128(b, 8), a);
        case 0x1f:
            c = _mm_srli_si128(_mm_setallone_si128(), 6);
            break;
        case 0x3f:
            c = _mm_srli_si128(_mm_setallone_si128(), 4);
            break;
        case 0x7f:
            c = _mm_srli_si128(_mm_setallone_si128(), 2);
            break;
        case 0x80:
            c = _mm_slli_si128(_mm_setallone_si128(), 14);
            break;
        case 0xc0:
            c = _mm_slli_si128(_mm_setallone_si128(), 12);
            break;
        case 0xe0:
            c = _mm_slli_si128(_mm_setallone_si128(), 10);
            break;
        case 0xf0:
            c = _mm_slli_si128(_mm_setallone_si128(), 8);
            break;
        case 0xf8:
            c = _mm_slli_si128(_mm_setallone_si128(), 6);
            break;
        case 0xfc:
            c = _mm_slli_si128(_mm_setallone_si128(), 4);
            break;
        case 0xfe:
            c = _mm_slli_si128(_mm_setallone_si128(), 2);
            break;
        case 0xff:
            return b;
        case 0xcc:
            return _mm_unpacklo_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 3, 1)));
        case 0x33:
            return _mm_unpacklo_epi32(_mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 3, 1)));
        default:
            // Generic case: multiply the mask so that bit i of it lands in the sign bit of
            // 16-bit element i, then turn that sign bit into a full 0/all-ones element with
            // an arithmetic right shift by 15.
            const __m128i shift = _mm_set_epi16(0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, -0x7fff);
            c = _mm_srai_epi16(_mm_mullo_epi16(_mm_set1_epi16(mask), shift), 15);
            break;
        }
        return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b));
    }

    Vc_INTRINSIC Vc_CONST __m128i max_epi8 (__m128i a, __m128i b) {
        return blendv_epi8(b, a, _mm_cmpgt_epi8 (a, b));
    }
    Vc_INTRINSIC Vc_CONST __m128i max_epi32(__m128i a, __m128i b) {
        return blendv_epi8(b, a, _mm_cmpgt_epi32(a, b));
    }
    Vc_INTRINSIC Vc_CONST __m128i max_epu16(__m128i a, __m128i b) {
        return blendv_epi8(b, a, cmpgt_epu16(a, b));
    }
    Vc_INTRINSIC Vc_CONST __m128i max_epu32(__m128i a, __m128i b) {
        return blendv_epi8(b, a, cmpgt_epu32(a, b));
    }
    Vc_INTRINSIC Vc_CONST __m128i min_epu16(__m128i a, __m128i b) {
        return blendv_epi8(a, b, cmpgt_epu16(a, b));
    }
    Vc_INTRINSIC Vc_CONST __m128i min_epu32(__m128i a, __m128i b) {
        return blendv_epi8(a, b, cmpgt_epu32(a, b));
    }
    Vc_INTRINSIC Vc_CONST __m128i min_epi8 (__m128i a, __m128i b) {
        return blendv_epi8(a, b, _mm_cmpgt_epi8 (a, b));
    }
    Vc_INTRINSIC Vc_CONST __m128i min_epi32(__m128i a, __m128i b) {
        return blendv_epi8(a, b, _mm_cmpgt_epi32(a, b));
    }
    Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi16(__m128i epu8) {
        return _mm_unpacklo_epi8(epu8, _mm_setzero_si128());
    }
    Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi16(__m128i epi8) {
        return _mm_unpacklo_epi8(epi8, _mm_cmplt_epi8(epi8, _mm_setzero_si128()));
    }
    Vc_INTRINSIC Vc_CONST __m128i cvtepu16_epi32(__m128i epu16) {
        return _mm_unpacklo_epi16(epu16, _mm_setzero_si128());
    }
    Vc_INTRINSIC Vc_CONST __m128i cvtepi16_epi32(__m128i epu16) {
        return _mm_unpacklo_epi16(epu16, _mm_cmplt_epi16(epu16, _mm_setzero_si128()));
    }
    Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi32(__m128i epu8) {
        return cvtepu16_epi32(cvtepu8_epi16(epu8));
    }
    Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi32(__m128i epi8) {
        const __m128i neg = _mm_cmplt_epi8(epi8, _mm_setzero_si128());
        const __m128i epi16 = _mm_unpacklo_epi8(epi8, neg);
        return _mm_unpacklo_epi16(epi16, _mm_unpacklo_epi8(neg, neg));
    }
}  // namespace SseIntrinsics
}  // namespace Vc
#endif

// SSE4.2
namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
    static Vc_INTRINSIC Vc_PURE __m128  _mm_stream_load(const float *mem) {
#ifdef Vc_IMPL_SSE4_1
        return _mm_castsi128_ps(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<float *>(mem))));
#else
        return _mm_load_ps(mem);
#endif
    }
    static Vc_INTRINSIC Vc_PURE __m128d _mm_stream_load(const double *mem) {
#ifdef Vc_IMPL_SSE4_1
        return _mm_castsi128_pd(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<double *>(mem))));
#else
        return _mm_load_pd(mem);
#endif
    }
    static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const int *mem) {
#ifdef Vc_IMPL_SSE4_1
        return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<int *>(mem)));
#else
        return _mm_load_si128(reinterpret_cast<const __m128i *>(mem));
#endif
    }
    static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned int *mem) {
        return _mm_stream_load(reinterpret_cast<const int *>(mem));
    }
    static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const short *mem) {
        return _mm_stream_load(reinterpret_cast<const int *>(mem));
    }
    static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned short *mem) {
        return _mm_stream_load(reinterpret_cast<const int *>(mem));
    }
    static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const signed char *mem) {
        return _mm_stream_load(reinterpret_cast<const int *>(mem));
    }
    static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned char *mem) {
        return _mm_stream_load(reinterpret_cast<const int *>(mem));
    }

#ifndef __x86_64__
    Vc_INTRINSIC Vc_PURE __m128i _mm_cvtsi64_si128(int64_t x) {
        return _mm_castpd_si128(_mm_load_sd(reinterpret_cast<const double *>(&x)));
    }
#endif

#ifdef Vc_IMPL_AVX2
template <int Scale> __m128 gather(const float *addr, __m128i idx)
{
    return _mm_i32gather_ps(addr, idx, Scale);
}
template <int Scale> __m128d gather(const double *addr, __m128i idx)
{
    return _mm_i32gather_pd(addr, idx, Scale);
}
template <int Scale> __m128i gather(const int *addr, __m128i idx)
{
    return _mm_i32gather_epi32(addr, idx, Scale);
}
template <int Scale> __m128i gather(const unsigned *addr, __m128i idx)
{
    return _mm_i32gather_epi32(aliasing_cast<int>(addr), idx, Scale);
}

template <int Scale> __m128 gather(__m128 src, __m128 k, const float *addr, __m128i idx)
{
    return _mm_mask_i32gather_ps(src, addr, idx, k, Scale);
}
template <int Scale>
__m128d gather(__m128d src, __m128d k, const double *addr, __m128i idx)
{
    return _mm_mask_i32gather_pd(src, addr, idx, k, Scale);
}
template <int Scale> __m128i gather(__m128i src, __m128i k, const int *addr, __m128i idx)
{
    return _mm_mask_i32gather_epi32(src, addr, idx, k, Scale);
}
template <int Scale>
__m128i gather(__m128i src, __m128i k, const unsigned *addr, __m128i idx)
{
    return _mm_mask_i32gather_epi32(src, aliasing_cast<int>(addr), idx, k, Scale);
}
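
// Usage note (not from the original source): Scale is the byte stride applied to each
// 32-bit index, so gathering addr[idx[i]] uses Scale = sizeof(*addr), e.g.
//     __m128 vals = gather<4>(addr, idx);  // vals[i] = addr[idx[i]] for float addr
// In the masked overloads, lanes whose mask element has the sign bit cleared keep the
// corresponding value from src.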
#endif

}  // namespace SseIntrinsics
}  // namespace Vc

namespace Vc_VERSIONED_NAMESPACE
{
namespace SSE
{
using namespace SseIntrinsics;

template <typename T> struct ParameterHelper
{
    typedef T ByValue;
    typedef T &Reference;
    typedef const T &ConstRef;
};

template <typename T> struct VectorHelper
{
};

template <typename T> struct VectorTypeHelper
{
    typedef __m128i Type;
};
template <> struct VectorTypeHelper<double>
{
    typedef __m128d Type;
};
template <> struct VectorTypeHelper<float>
{
    typedef __m128 Type;
};

template <typename T> struct DetermineGatherMask
{
    typedef T Type;
};

template <typename T> struct VectorTraits
{
    typedef typename VectorTypeHelper<T>::Type VectorType;
    using EntryType = T;
    static constexpr size_t Size = sizeof(VectorType) / sizeof(EntryType);
    typedef Mask<T> MaskType;
    typedef typename DetermineGatherMask<MaskType>::Type GatherMaskType;
    typedef Common::VectorMemoryUnion<VectorType, EntryType> StorageType;
};

template <typename T> struct VectorHelperSize;
}  // namespace SSE
}  // namespace Vc

#if defined(Vc_GCC) && !defined(__OPTIMIZE__)
#pragma GCC diagnostic pop
#endif

#include "shuffle.h"

#endif // VC_SSE_INTRINSICS_H_