/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_SSE_VECTORHELPER_H_
#define VC_SSE_VECTORHELPER_H_

#include "types.h"
#include "../common/loadstoreflags.h"
#include <limits>
#include "const_data.h"
#include "macros.h"

namespace Vc_VERSIONED_NAMESPACE
{
namespace SSE
{
#define Vc_OP0(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name() { return code; }
#define Vc_OP1(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a) { return code; }
#define Vc_OP2(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a, const VectorType b) { return code; }
#define Vc_OP3(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a, const VectorType b, const VectorType c) { return code; }
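// For illustration: within a VectorHelper specialization, e.g.
//   Vc_OP0(zero, _mm_setzero_ps())
// expands to
//   static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return _mm_setzero_ps(); }
// and Vc_OP3(blend, blendv_ps(a, b, c)) likewise defines a three-argument
// blend() that forwards its arguments to blendv_ps.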

        template<> struct VectorHelper<__m128>
        {
            typedef __m128 VectorType;

            template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfAligned  = nullptr) { return _mm_load_ps(x); }
            template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfUnaligned = nullptr) { return _mm_loadu_ps(x); }
            template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfStreaming = nullptr) { return _mm_stream_load(x); }

            template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfAligned               = nullptr) { _mm_store_ps(mem, x); }
            template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm_storeu_ps(mem, x); }
            template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfStreaming             = nullptr) { _mm_stream_ps(mem, x); }
            template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { _mm_maskmoveu_si128(_mm_castps_si128(x), _mm_setallone_si128(), reinterpret_cast<char *>(mem)); }
            // Before AVX there was only one maskstore instruction. A load -> blend -> store sequence would break the C++ memory model (it reads and writes memory that this thread does not actually touch).
            template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, VectorType m) { _mm_maskmoveu_si128(_mm_castps_si128(x), _mm_castps_si128(m), reinterpret_cast<char *>(mem)); }
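            // Example: with a mask that deselects element 1, a load/blend/store
            // sequence would still write the old value back to mem[1]; a
            // concurrent write to mem[1] by another thread would then be a data
            // race, even though this thread logically never touches that element.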

            Vc_OP0(allone, _mm_setallone_ps())
            Vc_OP0(zero, _mm_setzero_ps())
            Vc_OP3(blend, blendv_ps(a, b, c))
        };


        template<> struct VectorHelper<__m128d>
        {
            typedef __m128d VectorType;

            template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfAligned   = nullptr) { return _mm_load_pd(x); }
            template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfUnaligned = nullptr) { return _mm_loadu_pd(x); }
            template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfStreaming = nullptr) { return _mm_stream_load(x); }

            template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfAligned               = nullptr) { _mm_store_pd(mem, x); }
            template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm_storeu_pd(mem, x); }
            template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfStreaming             = nullptr) { _mm_stream_pd(mem, x); }
            template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { _mm_maskmoveu_si128(_mm_castpd_si128(x), _mm_setallone_si128(), reinterpret_cast<char *>(mem)); }
            // Before AVX there was only one maskstore instruction; see the note in VectorHelper<__m128> above.
            template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, VectorType m) { _mm_maskmoveu_si128(_mm_castpd_si128(x), _mm_castpd_si128(m), reinterpret_cast<char *>(mem)); }

            Vc_OP0(allone, _mm_setallone_pd())
            Vc_OP0(zero, _mm_setzero_pd())
            Vc_OP3(blend, blendv_pd(a, b, c))
        };

        template<> struct VectorHelper<__m128i>
        {
            typedef __m128i VectorType;

            template<typename Flags, typename T> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfAligned   = nullptr) { return _mm_load_si128(reinterpret_cast<const VectorType *>(x)); }
            template<typename Flags, typename T> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfUnaligned = nullptr) { return _mm_loadu_si128(reinterpret_cast<const VectorType *>(x)); }
            template<typename Flags, typename T> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfStreaming = nullptr) { return _mm_stream_load(x); }

            template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfAligned               = nullptr) { _mm_store_si128(reinterpret_cast<VectorType *>(mem), x); }
            template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm_storeu_si128(reinterpret_cast<VectorType *>(mem), x); }
            template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfStreaming             = nullptr) { _mm_stream_si128(reinterpret_cast<VectorType *>(mem), x); }
            template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { _mm_maskmoveu_si128(x, _mm_setallone_si128(), reinterpret_cast<char *>(mem)); }
            // Before AVX there was only one maskstore instruction; see the note in VectorHelper<__m128> above.
            template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, VectorType m) { _mm_maskmoveu_si128(x, m, reinterpret_cast<char *>(mem)); }

            Vc_OP0(allone, _mm_setallone_si128())
            Vc_OP0(zero, _mm_setzero_si128())
            Vc_OP3(blend, blendv_epi8(a, b, c))
        };

#undef Vc_OP1
#undef Vc_OP2
#undef Vc_OP3

#define Vc_OP1(op) \
        static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a) { return Vc_CAT2(_mm_##op##_, Vc_SUFFIX)(a); }
#define Vc_OP(op) \
        static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op##_ , Vc_SUFFIX)(a, b); }
#define Vc_OP_(op) \
        static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op    , Vc_SUFFIX)(a, b); }
#define Vc_OPx(op, op2) \
        static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op2##_, Vc_SUFFIX)(a, b); }
#define Vc_OP_CAST_(op) \
        static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_castps_, Vc_SUFFIX)( \
            _mm_##op##ps(Vc_CAT2(Vc_CAT2(_mm_cast, Vc_SUFFIX), _ps)(a), \
              Vc_CAT2(Vc_CAT2(_mm_cast, Vc_SUFFIX), _ps)(b))); \
        }
#define Vc_MINMAX \
        static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return Vc_CAT2(_mm_min_, Vc_SUFFIX)(a, b); } \
        static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return Vc_CAT2(_mm_max_, Vc_SUFFIX)(a, b); }
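// For illustration: with Vc_SUFFIX defined as pd, Vc_OP(add) expands to
//   static Vc_ALWAYS_INLINE Vc_CONST VectorType add(const VectorType a, const VectorType b) { return _mm_add_pd(a, b); }
// i.e. the operation name is spliced into the matching _mm_*_pd intrinsic.
// Vc_OP_CAST_ additionally moves the operands into the ps domain (e.g. via
// _mm_castsi128_ps), applies the ps intrinsic, and casts the result back.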

        template<> struct VectorHelper<double> {
            typedef __m128d VectorType;
            typedef double EntryType;
#define Vc_SUFFIX pd

            Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
            static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_pd(mask), a); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const double a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const double a, const double b) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType one()  { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }// set(1.); }

#ifdef Vc_IMPL_FMA4
            static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {
                v1 = _mm_macc_pd(v1, v2, v3);
            }
#else
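            // Software FMA fallback. The idea: split each factor into a "high"
            // part (the upper mantissa bits selected by highMaskDouble) and the
            // remaining "low" part, so that the partial products hh, lh, and ll
            // can be computed (essentially) exactly. Summing them from smallest
            // to largest magnitude then loses far less precision than a plain
            // mul followed by add.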
            static inline void fma(VectorType &v1, VectorType v2, VectorType v3) {
                // h1/h2: the upper mantissa halves of v1/v2; l1/l2: the remainders
                VectorType h1 = _mm_and_pd(v1, _mm_load_pd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
                VectorType h2 = _mm_and_pd(v2, _mm_load_pd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
#if defined(Vc_GCC) && Vc_GCC < 0x40703
                // GCC before 4.7.3 applies an incorrect optimization that replaces the subtraction with an andnot
                // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=54703
                asm("":"+x"(h1), "+x"(h2));
#endif
                const VectorType l1 = _mm_sub_pd(v1, h1);
                const VectorType l2 = _mm_sub_pd(v2, h2);
                const VectorType ll = mul(l1, l2);
                const VectorType lh = add(mul(l1, h2), mul(h1, l2));
                const VectorType hh = mul(h1, h2);
                // |ll| < |lh| < |hh| is certain for every entry
                const VectorType lh_lt_v3 = _mm_cmplt_pd(abs(lh), abs(v3)); // |lh| < |v3|
                const VectorType b = blendv_pd(v3, lh, lh_lt_v3);
                const VectorType c = blendv_pd(lh, v3, lh_lt_v3);
                v1 = add(add(ll, b), add(c, hh));
            }
#endif

            Vc_OP(add) Vc_OP(sub) Vc_OP(mul)

            Vc_OP1(sqrt)
            static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VectorType x) {
                return _mm_div_pd(one(), sqrt(x));
            }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VectorType x) {
                return _mm_div_pd(one(), x);
            }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VectorType x) {
                return _mm_cmpunord_pd(x, x);
            }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VectorType x) {
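                // 0 * x is NaN iff x is ±inf or NaN, and cmpord is true iff
                // neither operand is NaN -- so this flags exactly the finite entries.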
                return _mm_cmpord_pd(x, _mm_mul_pd(zero(), x));
            }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType isInfinite(VectorType x) {
                return _mm_castsi128_pd(cmpeq_epi64(_mm_castpd_si128(abs(x)), _mm_castpd_si128(_mm_load_pd(c_log<double>::d(1)))));
            }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) {
                return Vc_CAT2(_mm_and_, Vc_SUFFIX)(a, _mm_setabsmask_pd());
            }

            Vc_MINMAX
            static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
                a = _mm_min_sd(a, _mm_unpackhi_pd(a, a));
                return _mm_cvtsd_f64(a);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
                a = _mm_max_sd(a, _mm_unpackhi_pd(a, a));
                return _mm_cvtsd_f64(a);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
                a = _mm_mul_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1)));
                return _mm_cvtsd_f64(a);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
                a = _mm_add_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1)));
                return _mm_cvtsd_f64(a);
            }
#undef Vc_SUFFIX
            static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) {
#ifdef Vc_IMPL_SSE4_1
                return _mm_round_pd(a, _MM_FROUND_NINT);
#else
                //XXX: slow: _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
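                // Note: the int32 round-trip below honors the current MXCSR
                // rounding mode (round-to-nearest-even by default) and is only
                // valid for values representable as int32, i.e. |a| < 2^31.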
                return _mm_cvtepi32_pd(_mm_cvtpd_epi32(a));
#endif
            }
        };

        template<> struct VectorHelper<float> {
            typedef float EntryType;
            typedef __m128 VectorType;
#define Vc_SUFFIX ps

            Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
            static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(mask, a); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType one()  { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }// set(1.f); }
            static Vc_ALWAYS_INLINE Vc_CONST __m128 concat(__m128d a, __m128d b) { return _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b)); }

#ifdef Vc_IMPL_FMA4
            static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {
                v1 = _mm_macc_ps(v1, v2, v3);
            }
#else
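            // Software FMA fallback for float: widen each half of the vectors
            // to double. The product of two 24-bit float mantissas fits exactly
            // into double's 53-bit mantissa, so only the final addition and the
            // narrowing conversion back to float introduce rounding.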
            static inline void fma(VectorType &v1, VectorType v2, VectorType v3) {
                __m128d v1_0 = _mm_cvtps_pd(v1);
                __m128d v1_1 = _mm_cvtps_pd(_mm_movehl_ps(v1, v1));
                __m128d v2_0 = _mm_cvtps_pd(v2);
                __m128d v2_1 = _mm_cvtps_pd(_mm_movehl_ps(v2, v2));
                __m128d v3_0 = _mm_cvtps_pd(v3);
                __m128d v3_1 = _mm_cvtps_pd(_mm_movehl_ps(v3, v3));
                v1 = _mm_movelh_ps(
                        _mm_cvtpd_ps(_mm_add_pd(_mm_mul_pd(v1_0, v2_0), v3_0)),
                        _mm_cvtpd_ps(_mm_add_pd(_mm_mul_pd(v1_1, v2_1), v3_1)));
            }
#endif

            Vc_OP(add) Vc_OP(sub) Vc_OP(mul)

            Vc_OP1(sqrt) Vc_OP1(rsqrt)
            static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VectorType x) {
                return _mm_cmpunord_ps(x, x);
            }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VectorType x) {
                return _mm_cmpord_ps(x, _mm_mul_ps(zero(), x));
            }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType isInfinite(VectorType x) {
                return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(abs(x)), _mm_castps_si128(_mm_load_ps(c_log<float>::d(1)))));
            }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VectorType x) {
                return _mm_rcp_ps(x);
            }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) {
                return Vc_CAT2(_mm_and_, Vc_SUFFIX)(a, _mm_setabsmask_ps());
            }

            Vc_MINMAX
            static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
                a = _mm_min_ps(a, _mm_movehl_ps(a, a));   // a = min(a0, a2), min(a1, a3), min(a2, a2), min(a3, a3)
                a = _mm_min_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); // a = min(a0, a1), a1, a2, a3
                return _mm_cvtss_f32(a);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
                a = _mm_max_ps(a, _mm_movehl_ps(a, a));   // a = max(a0, a2), max(a1, a3), max(a2, a2), max(a3, a3)
                a = _mm_max_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); // a = max(a0, a1), a1, a2, a3
                return _mm_cvtss_f32(a);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
                a = _mm_mul_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3)));
                a = _mm_mul_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1)));
                return _mm_cvtss_f32(a);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
                a = _mm_add_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3)));
                a = _mm_add_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1)));
                return _mm_cvtss_f32(a);
            }
#undef Vc_SUFFIX
            static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) {
#ifdef Vc_IMPL_SSE4_1
                return _mm_round_ps(a, _MM_FROUND_NINT);
#else
                //XXX slow: _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
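                // Note: same caveat as the double version above -- honors the
                // current MXCSR rounding mode and requires |a| < 2^31.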
                return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
#endif
            }
        };

        template<> struct VectorHelper<int> {
            typedef int EntryType;
            typedef __m128i VectorType;
#define Vc_SUFFIX si128

            Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
            static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
#undef Vc_SUFFIX
#define Vc_SUFFIX epi32
            static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }

            static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const int a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const int a, const int b, const int c, const int d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); }

            static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }

            static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
                return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
            }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
                return Vc_CAT2(_mm_srai_, Vc_SUFFIX)(a, shift);
            }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) { return abs_epi32(a); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epi32(a, b); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epi32(a, b); }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
                a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                // shufflelo_epi16 swaps the two low 32-bit elements and is used here for speed; only element 0 is extracted below
                a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
                return _mm_cvtsi128_si32(a);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
                a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                // using lo_epi16 for speed here
                a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
                return _mm_cvtsi128_si32(a);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
                a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
                return _mm_cvtsi128_si32(a);
            }
#ifdef Vc_IMPL_SSE4_1
            static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(VectorType a, VectorType b) { return _mm_mullo_epi32(a, b); }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
                a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
                return _mm_cvtsi128_si32(a);
            }
#else
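            // SSE2 has no 32-bit low-half multiply (_mm_mullo_epi32 is SSE4.1).
            // _mm_mul_epu32 multiplies the even-indexed 32-bit elements into
            // 64-bit products; shifting both operands right by 4 bytes exposes
            // the odd-indexed elements to the same instruction. The low 32 bits
            // of each product (which are the same for signed and unsigned
            // inputs) are then re-interleaved into result order.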
            static inline Vc_CONST VectorType mul(const VectorType a, const VectorType b) {
                const VectorType aShift = _mm_srli_si128(a, 4);
                const VectorType ab02 = _mm_mul_epu32(a, b); // [a0 * b0, a2 * b2]
                const VectorType bShift = _mm_srli_si128(b, 4);
                const VectorType ab13 = _mm_mul_epu32(aShift, bShift); // [a1 * b1, a3 * b3]
                return _mm_unpacklo_epi32(_mm_shuffle_epi32(ab02, 8), _mm_shuffle_epi32(ab13, 8));
            }
#endif

            Vc_OP(add) Vc_OP(sub)
#undef Vc_SUFFIX
            static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
        };

        template<> struct VectorHelper<unsigned int> {
            typedef unsigned int EntryType;
            typedef __m128i VectorType;
#define Vc_SUFFIX si128
            Vc_OP_CAST_(or_) Vc_OP_CAST_(and_) Vc_OP_CAST_(xor_)
            static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }

#undef Vc_SUFFIX
#define Vc_SUFFIX epu32
            static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }

            static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epu32(a, b); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epu32(a, b); }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
                a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                // using lo_epi16 for speed here
                a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
                return _mm_cvtsi128_si32(a);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
                a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                // using lo_epi16 for speed here
                a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
                return _mm_cvtsi128_si32(a);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
                a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                // using lo_epi16 for speed here
                a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
                return _mm_cvtsi128_si32(a);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
                a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                // using lo_epi16 for speed here
                a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
                return _mm_cvtsi128_si32(a);
            }

            static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }

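            // The low 32 bits of a 32x32 product are identical for signed and
            // unsigned operands, so the signed implementation can be reused here.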
            static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a, const VectorType b) {
                return VectorHelper<int>::mul(a, b);
            }
//X             template<unsigned int b> static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a) {
//X                 switch (b) {
//X                     case    0: return zero();
//X                     case    1: return a;
//X                     case    2: return _mm_slli_epi32(a,  1);
//X                     case    4: return _mm_slli_epi32(a,  2);
//X                     case    8: return _mm_slli_epi32(a,  3);
//X                     case   16: return _mm_slli_epi32(a,  4);
//X                     case   32: return _mm_slli_epi32(a,  5);
//X                     case   64: return _mm_slli_epi32(a,  6);
//X                     case  128: return _mm_slli_epi32(a,  7);
//X                     case  256: return _mm_slli_epi32(a,  8);
//X                     case  512: return _mm_slli_epi32(a,  9);
//X                     case 1024: return _mm_slli_epi32(a, 10);
//X                     case 2048: return _mm_slli_epi32(a, 11);
//X                 }
//X                 return mul(a, set(b));
//X             }

#undef Vc_SUFFIX
#define Vc_SUFFIX epi32
            static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
                return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
            }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
                return Vc_CAT2(_mm_srli_, Vc_SUFFIX)(a, shift);
            }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const unsigned int a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const unsigned int a, const unsigned int b, const unsigned int c, const unsigned int d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); }

            Vc_OP(add) Vc_OP(sub)
#undef Vc_SUFFIX
            static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
        };

        template<> struct VectorHelper<signed short> {
            typedef __m128i VectorType;
            typedef signed short EntryType;
#define Vc_SUFFIX si128

            Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
            static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
            static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b) { return _mm_packs_epi32(a, b); }
            static Vc_ALWAYS_INLINE Vc_CONST __m128i expand0(__m128i x) { return _mm_srai_epi32(_mm_unpacklo_epi16(x, x), 16); }
            static Vc_ALWAYS_INLINE Vc_CONST __m128i expand1(__m128i x) { return _mm_srai_epi32(_mm_unpackhi_epi16(x, x), 16); }

#undef Vc_SUFFIX
#define Vc_SUFFIX epi16
            static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }

            static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
                return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
            }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
                return Vc_CAT2(_mm_srai_, Vc_SUFFIX)(a, shift);
            }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a, const EntryType b, const EntryType c, const EntryType d,
                    const EntryType e, const EntryType f, const EntryType g, const EntryType h) {
                return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h);
            }

            static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }

            static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) { return abs_epi16(a); }

            Vc_OPx(mul, mullo)
            Vc_OP(min) Vc_OP(max)
            static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
                // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
                a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
                return _mm_cvtsi128_si32(a); // & 0xffff is implicit
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
                // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
                a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
                return _mm_cvtsi128_si32(a); // & 0xffff is implicit
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
                a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
                return _mm_cvtsi128_si32(a); // & 0xffff is implicit
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
                a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
                return _mm_cvtsi128_si32(a); // & 0xffff is implicit
            }

            Vc_OP(add) Vc_OP(sub)
#undef Vc_SUFFIX
            static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
        };

        template<> struct VectorHelper<unsigned short> {
            typedef __m128i VectorType;
            typedef unsigned short EntryType;
#define Vc_SUFFIX si128
            Vc_OP_CAST_(or_) Vc_OP_CAST_(and_) Vc_OP_CAST_(xor_)
            static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
#ifdef Vc_IMPL_SSE4_1
            static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b) { return _mm_packus_epi32(a, b); }
#else
            // FIXME: without SSE4.1 there is no unsigned saturating 32->16 pack; this fallback simply keeps the low 16 bits of each element (truncation instead of saturation)
            static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b) {
                auto tmp0 = _mm_unpacklo_epi16(a, b); // 0 4 X X 1 5 X X
                auto tmp1 = _mm_unpackhi_epi16(a, b); // 2 6 X X 3 7 X X
                auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); // 0 2 4 6 X X X X
                auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); // 1 3 5 7 X X X X
                return _mm_unpacklo_epi16(tmp2, tmp3); // 0 1 2 3 4 5 6 7
            }
#endif
            static Vc_ALWAYS_INLINE Vc_CONST __m128i expand0(__m128i x) { return _mm_unpacklo_epi16(x, _mm_setzero_si128()); }
            static Vc_ALWAYS_INLINE Vc_CONST __m128i expand1(__m128i x) { return _mm_unpackhi_epi16(x, _mm_setzero_si128()); }

#undef Vc_SUFFIX
#define Vc_SUFFIX epu16
            static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }

//X             template<unsigned int b> static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a) {
//X                 switch (b) {
//X                     case    0: return zero();
//X                     case    1: return a;
//X                     case    2: return _mm_slli_epi16(a,  1);
//X                     case    4: return _mm_slli_epi16(a,  2);
//X                     case    8: return _mm_slli_epi16(a,  3);
//X                     case   16: return _mm_slli_epi16(a,  4);
//X                     case   32: return _mm_slli_epi16(a,  5);
//X                     case   64: return _mm_slli_epi16(a,  6);
//X                     case  128: return _mm_slli_epi16(a,  7);
//X                     case  256: return _mm_slli_epi16(a,  8);
//X                     case  512: return _mm_slli_epi16(a,  9);
//X                     case 1024: return _mm_slli_epi16(a, 10);
//X                     case 2048: return _mm_slli_epi16(a, 11);
//X                 }
//X                 return mul(a, set(b));
//X             }
#if !defined(USE_INCORRECT_UNSIGNED_COMPARE) || Vc_IMPL_SSE4_1
            static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epu16(a, b); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epu16(a, b); }
#endif
#undef Vc_SUFFIX
#define Vc_SUFFIX epi16
            static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
                return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
            }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
                return Vc_CAT2(_mm_srli_, Vc_SUFFIX)(a, shift);
            }

            static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }

            Vc_OPx(mul, mullo) // should work correctly for all values
#if defined(USE_INCORRECT_UNSIGNED_COMPARE) && !defined(Vc_IMPL_SSE4_1)
            Vc_OP(min) Vc_OP(max) // XXX breaks for values with MSB set
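            // Example of the breakage: max(set(0x8000), set(1)) yields 1,
            // because _mm_max_epi16 treats 0x8000 as -32768.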
#endif
            static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
                // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
                a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
                return _mm_cvtsi128_si32(a); // & 0xffff is implicit
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
                // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
                a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
                return _mm_cvtsi128_si32(a); // & 0xffff is implicit
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
                // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
                a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
                return _mm_cvtsi128_si32(a); // & 0xffff is implicit
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
                // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
                a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
                a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
                return _mm_cvtsi128_si32(a); // & 0xffff is implicit
            }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a, const EntryType b, const EntryType c,
                    const EntryType d, const EntryType e, const EntryType f,
                    const EntryType g, const EntryType h) {
                return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h);
            }

            Vc_OP(add) Vc_OP(sub)
#undef Vc_SUFFIX
            static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
        };
#undef Vc_OP1
#undef Vc_OP
#undef Vc_OP_
#undef Vc_OPx
#undef Vc_OP_CAST_
#undef Vc_MINMAX

}  // namespace SSE
}  // namespace Vc

#include "vectorhelper.tcc"

#endif // VC_SSE_VECTORHELPER_H_