/*  This file is part of the Vc library. {{{
Copyright © 2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_SSE_DETAIL_H_
#define VC_SSE_DETAIL_H_

#include "casts.h"
#ifdef Vc_IMPL_AVX
#include "../avx/intrinsics.h"
#endif
#include "vectorhelper.h"

#include "macros.h"

namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
// (converting) load functions {{{1
template <typename V, typename DstT> struct LoadTag
{
};

// when_(un)aligned{{{2
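// Tag types for overload selection on the load flags: a Flags object converts
// to when_aligned, when_unaligned, or when_streaming only if it provides the
// corresponding EnableIf* member type, so the matching load16 overload below is
// chosen at compile time.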
class when_aligned
{
public:
    template <typename F> constexpr when_aligned(F, typename F::EnableIfAligned = nullptr)
    {
    }
};

class when_unaligned
{
public:
    template <typename F>
    constexpr when_unaligned(F, typename F::EnableIfUnaligned = nullptr)
    {
    }
};

class when_streaming
{
public:
    template <typename F>
    constexpr when_streaming(F, typename F::EnableIfStreaming = nullptr)
    {
    }
};

// load16{{{2
Vc_INTRINSIC __m128 load16(const float *mem, when_aligned)
{
    return _mm_load_ps(mem);
}
Vc_INTRINSIC __m128 load16(const float *mem, when_unaligned)
{
    return _mm_loadu_ps(mem);
}
Vc_INTRINSIC __m128 load16(const float *mem, when_streaming)
{
    return SseIntrinsics::_mm_stream_load(mem);
}
Vc_INTRINSIC __m128d load16(const double *mem, when_aligned)
{
    return _mm_load_pd(mem);
}
Vc_INTRINSIC __m128d load16(const double *mem, when_unaligned)
{
    return _mm_loadu_pd(mem);
}
Vc_INTRINSIC __m128d load16(const double *mem, when_streaming)
{
    return SseIntrinsics::_mm_stream_load(mem);
}
template <class T> Vc_INTRINSIC __m128i load16(const T *mem, when_aligned)
{
    static_assert(std::is_integral<T>::value, "load16<T> is only intended for integral T");
    return _mm_load_si128(reinterpret_cast<const __m128i *>(mem));
}
template <class T> Vc_INTRINSIC __m128i load16(const T *mem, when_unaligned)
{
    static_assert(std::is_integral<T>::value, "load16<T> is only intended for integral T");
    return _mm_loadu_si128(reinterpret_cast<const __m128i *>(mem));
}
template <class T> Vc_INTRINSIC __m128i load16(const T *mem, when_streaming)
{
    static_assert(std::is_integral<T>::value, "load16<T> is only intended for integral T");
    return SseIntrinsics::_mm_stream_load(mem);
}

// MSVC workarounds{{{2
#ifdef Vc_MSVC
// work around: "fatal error C1001: An internal error has occurred in the compiler."
template <typename V, typename DstT, typename F>
Vc_INTRINSIC __m128d load(const double *mem, F f,
                          enable_if<(std::is_same<DstT, double>::value &&
                                     std::is_same<V, __m128d>::value)> = nullarg)
{
    return load16(mem, f);
}

template <typename V, typename DstT, typename F>
Vc_INTRINSIC __m128 load(const float *mem, F f,
                         enable_if<(std::is_same<DstT, float>::value &&
                                    std::is_same<V, __m128>::value)> = nullarg)
{
    return load16(mem, f);
}

template <typename V, typename DstT, typename F>
Vc_INTRINSIC __m128i load(const uint *mem, F f,
                          enable_if<(std::is_same<DstT, uint>::value &&
                                     std::is_same<V, __m128i>::value)> = nullarg)
{
    return load16(mem, f);
}

template <typename V, typename DstT, typename F>
Vc_INTRINSIC __m128i load(const int *mem, F f,
                          enable_if<(std::is_same<DstT, int>::value &&
                                     std::is_same<V, __m128i>::value)> = nullarg)
{
    return load16(mem, f);
}

template <typename V, typename DstT, typename F>
Vc_INTRINSIC __m128i load(const short *mem, F f,
                          enable_if<(std::is_same<DstT, short>::value &&
                                     std::is_same<V, __m128i>::value)> = nullarg)
{
    return load16(mem, f);
}

template <typename V, typename DstT, typename F>
Vc_INTRINSIC __m128i load(const ushort *mem, F f,
                          enable_if<(std::is_same<DstT, ushort>::value &&
                                     std::is_same<V, __m128i>::value)> = nullarg)
{
    return load16(mem, f);
}
#endif  // Vc_MSVC

// generic load{{{2
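// Entry point for (possibly converting) loads: dispatches on LoadTag<V, DstT> to
// the overload that converts from SrcT to the destination element type. The
// enable_if excludes integral-to-smaller-integral conversions from this overload.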
template <typename V, typename DstT, typename SrcT, typename Flags,
          typename = enable_if<
#ifdef Vc_MSVC
              !std::is_same<DstT, SrcT>::value &&
#endif
              (!std::is_integral<DstT>::value || !std::is_integral<SrcT>::value ||
               sizeof(DstT) >= sizeof(SrcT))>>
Vc_INTRINSIC V load(const SrcT *mem, Flags flags)
{
    return load(mem, flags, LoadTag<V, DstT>());
}

// no conversion load from any T {{{2
template <typename V, typename T, typename Flags>
Vc_INTRINSIC V
    load(const T *mem, Flags, LoadTag<V, T>, enable_if<sizeof(V) == 16> = nullarg)
{
    return SSE::VectorHelper<V>::template load<Flags>(mem);
}

// short {{{2
template <typename Flags>
Vc_INTRINSIC __m128i load(const ushort *mem, Flags, LoadTag<__m128i, short>)
{
    return SSE::VectorHelper<__m128i>::load<Flags>(mem);
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const uchar *mem, Flags, LoadTag<__m128i, short>)
{
    // the only available streaming load loads 16 bytes - twice as much as we need =>
    // can't use it, or we risk an out-of-bounds read and an unaligned load exception
    return SSE::cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const schar *mem, Flags, LoadTag<__m128i, short>)
{
    // the only available streaming load loads 16 bytes - twice as much as we need =>
    // can't use it, or we risk an out-of-bounds read and an unaligned load exception
    return SSE::cvtepi8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}

// ushort {{{2
template <typename Flags>
Vc_INTRINSIC __m128i load(const uchar *mem, Flags, LoadTag<__m128i, ushort>)
{
    // the only available streaming load loads 16 bytes - twice as much as we need =>
    // can't use it, or we risk an out-of-bounds read and an unaligned load exception
    return SSE::cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}

// int {{{2
template <typename Flags>
Vc_INTRINSIC __m128i load(const uint *mem, Flags, LoadTag<__m128i, int>)
{
    return SSE::VectorHelper<__m128i>::load<Flags>(mem);
}
// no difference between streaming and alignment, because the
// 32/64 bit loads are not available as streaming loads, and can always be unaligned
template <typename Flags>
Vc_INTRINSIC __m128i load(const ushort *mem, Flags, LoadTag<__m128i, int>)
{
    return SSE::cvtepu16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const short *mem, Flags, LoadTag<__m128i, int>)
{
    return SSE::cvtepi16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const uchar *mem, Flags, LoadTag<__m128i, int>)
{
    return SSE::cvtepu8_epi32(_mm_cvtsi32_si128(*aliasing_cast<int>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const schar *mem, Flags, LoadTag<__m128i, int>)
{
    return SSE::cvtepi8_epi32(_mm_cvtsi32_si128(*aliasing_cast<int>(mem)));
}

// uint {{{2
template <typename Flags>
Vc_INTRINSIC __m128i load(const ushort *mem, Flags, LoadTag<__m128i, uint>)
{
    return SSE::cvtepu16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const uchar *mem, Flags, LoadTag<__m128i, uint>)
{
    return SSE::cvtepu8_epi32(_mm_cvtsi32_si128(*aliasing_cast<int>(mem)));
}

// double {{{2
template <typename Flags>
Vc_INTRINSIC __m128d load(const float *mem, Flags, LoadTag<__m128d, double>)
{
    return SSE::convert<float, double>(
        _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64 *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128d load(const uint *mem, Flags, LoadTag<__m128d, double>)
{
    return SSE::convert<uint, double>(
        _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128d load(const int *mem, Flags, LoadTag<__m128d, double>)
{
    return SSE::convert<int, double>(
        _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128d load(const ushort *mem, Flags, LoadTag<__m128d, double>)
{
    return SSE::convert<ushort, double>(
        _mm_cvtsi32_si128(*aliasing_cast<int>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128d load(const short *mem, Flags, LoadTag<__m128d, double>)
{
    return SSE::convert<short, double>(
        _mm_cvtsi32_si128(*aliasing_cast<int>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128d load(const uchar *mem, Flags, LoadTag<__m128d, double>)
{
    return SSE::convert<uchar, double>(
        _mm_set1_epi16(*aliasing_cast<short>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128d load(const schar *mem, Flags, LoadTag<__m128d, double>)
{
    return SSE::convert<char, double>(
        _mm_set1_epi16(*aliasing_cast<short>(mem)));
}

// float {{{2
template <typename Flags>
Vc_INTRINSIC __m128 load(const double *mem, Flags, LoadTag<__m128, float>)
{
#ifdef Vc_IMPL_AVX
    if (Flags::IsUnaligned) {
        return _mm256_cvtpd_ps(_mm256_loadu_pd(mem));
    } else if (Flags::IsStreaming) {
        return _mm256_cvtpd_ps(AvxIntrinsics::stream_load<__m256d>(mem));
    } else {  // Flags::IsAligned
        return _mm256_cvtpd_ps(_mm256_load_pd(mem));
    }
#else
    return _mm_movelh_ps(_mm_cvtpd_ps(SSE::VectorHelper<__m128d>::load<Flags>(&mem[0])),
                         _mm_cvtpd_ps(SSE::VectorHelper<__m128d>::load<Flags>(&mem[2])));
#endif
}
template <typename Flags>
Vc_INTRINSIC __m128 load(const uint *mem, Flags f, LoadTag<__m128, float>)
{
    return SSE::convert<uint, float>(load<__m128i, uint>(mem, f));
}
template <typename T, typename Flags,
          typename = enable_if<!std::is_same<T, float>::value>>
Vc_INTRINSIC __m128 load(const T *mem, Flags f, LoadTag<__m128, float>)
{
    return _mm_cvtepi32_ps(load<__m128i, int>(mem, f));
}

// shifted{{{1
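// shifted<amount>(k) shifts a 16-byte value by `amount` bytes: positive amounts
// shift towards lower byte positions (_mm_srli_si128), negative amounts towards
// higher byte positions (_mm_slli_si128), and amount == 0 is the identity.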
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<amount == 0, T> shifted(T k)
{
    return k;
}
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 16 && amount > 0), T> shifted(T k)
{
    return _mm_srli_si128(k, amount);
}
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 16 && amount < 0), T> shifted(T k)
{
    return _mm_slli_si128(k, -amount);
}

// IndexesFromZero{{{1
template <typename T, int Size> Vc_INTRINSIC Vc_CONST const T *IndexesFromZero()
{
    if (Size == 4) {
        return reinterpret_cast<const T *>(SSE::_IndexesFromZero4);
    } else if (Size == 8) {
        return reinterpret_cast<const T *>(SSE::_IndexesFromZero8);
    } else if (Size == 16) {
        return reinterpret_cast<const T *>(SSE::_IndexesFromZero16);
    }
    return 0;
}

// popcnt{{{1
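// Bit-count helpers for 4-, 8-, 16- and 32-bit mask values. With hardware POPCNT
// they all map to _mm_popcnt_u32; otherwise they fall back to the usual SWAR
// reduction, summing adjacent 1-, 2-, 4-, ... bit fields in parallel.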
Vc_INTRINSIC Vc_CONST unsigned int popcnt4(unsigned int n)
{
#ifdef Vc_IMPL_POPCNT
    return _mm_popcnt_u32(n);
#else
    n = (n & 0x5U) + ((n >> 1) & 0x5U);
    n = (n & 0x3U) + ((n >> 2) & 0x3U);
    return n;
#endif
}
Vc_INTRINSIC Vc_CONST unsigned int popcnt8(unsigned int n)
{
#ifdef Vc_IMPL_POPCNT
    return _mm_popcnt_u32(n);
#else
    n = (n & 0x55U) + ((n >> 1) & 0x55U);
    n = (n & 0x33U) + ((n >> 2) & 0x33U);
    n = (n & 0x0fU) + ((n >> 4) & 0x0fU);
    return n;
#endif
}
Vc_INTRINSIC Vc_CONST unsigned int popcnt16(unsigned int n)
{
#ifdef Vc_IMPL_POPCNT
    return _mm_popcnt_u32(n);
#else
    n = (n & 0x5555U) + ((n >> 1) & 0x5555U);
    n = (n & 0x3333U) + ((n >> 2) & 0x3333U);
    n = (n & 0x0f0fU) + ((n >> 4) & 0x0f0fU);
    n = (n & 0x00ffU) + ((n >> 8) & 0x00ffU);
    return n;
#endif
}
Vc_INTRINSIC Vc_CONST unsigned int popcnt32(unsigned int n)
{
#ifdef Vc_IMPL_POPCNT
    return _mm_popcnt_u32(n);
#else
    n = (n & 0x55555555U) + ((n >> 1) & 0x55555555U);
    n = (n & 0x33333333U) + ((n >> 2) & 0x33333333U);
    n = (n & 0x0f0f0f0fU) + ((n >> 4) & 0x0f0f0f0fU);
    n = (n & 0x00ff00ffU) + ((n >> 8) & 0x00ff00ffU);
    n = (n & 0x0000ffffU) + ((n >>16) & 0x0000ffffU);
    return n;
#endif
}

// mask_cast{{{1
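// mask_cast<From, To, R> reinterprets a mask with From entries as a mask with To
// entries. Casting to more (narrower) entries packs the input; casting to fewer
// (wider) entries duplicates entries via unpack. The generic overload only
// handles the trivial From == To case; all real conversions are specializations.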
template<size_t From, size_t To, typename R> Vc_INTRINSIC Vc_CONST R mask_cast(__m128i k)
{
    static_assert(From == To, "Incorrect mask cast.");
    static_assert(std::is_same<R, __m128>::value, "Incorrect mask cast.");
    return SSE::sse_cast<__m128>(k);
}

template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<2, 4, __m128>(__m128i k)
{
    return SSE::sse_cast<__m128>(_mm_packs_epi16(k, _mm_setzero_si128()));
}
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<2, 8, __m128>(__m128i k)
{
    return SSE::sse_cast<__m128>(
        _mm_packs_epi16(_mm_packs_epi16(k, _mm_setzero_si128()), _mm_setzero_si128()));
}

template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 2, __m128>(__m128i k)
{
    return SSE::sse_cast<__m128>(_mm_unpacklo_epi32(k, k));
}
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 8, __m128>(__m128i k)
{
    return SSE::sse_cast<__m128>(_mm_packs_epi16(k, _mm_setzero_si128()));
}

template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 2, __m128>(__m128i k)
{
    const auto tmp = _mm_unpacklo_epi16(k, k);
    return SSE::sse_cast<__m128>(_mm_unpacklo_epi32(tmp, tmp));
}
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 4, __m128>(__m128i k)
{
    return SSE::sse_cast<__m128>(_mm_unpacklo_epi16(k, k));
}

template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<16, 8, __m128>(__m128i k)
{
    return SSE::sse_cast<__m128>(_mm_unpacklo_epi8(k, k));
}
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<16, 4, __m128>(__m128i k)
{
    const auto tmp = SSE::sse_cast<__m128i>(mask_cast<16, 8, __m128>(k));
    return SSE::sse_cast<__m128>(_mm_unpacklo_epi16(tmp, tmp));
}
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<16, 2, __m128>(__m128i k)
{
    const auto tmp = SSE::sse_cast<__m128i>(mask_cast<16, 4, __m128>(k));
    return SSE::sse_cast<__m128>(_mm_unpacklo_epi32(tmp, tmp));
}

// allone{{{1
template <typename V> Vc_INTRINSIC_L Vc_CONST_L V allone() Vc_INTRINSIC_R Vc_CONST_R;
template<> Vc_INTRINSIC Vc_CONST __m128  allone<__m128 >() { return SSE::_mm_setallone_ps(); }
template<> Vc_INTRINSIC Vc_CONST __m128i allone<__m128i>() { return SSE::_mm_setallone_si128(); }
template<> Vc_INTRINSIC Vc_CONST __m128d allone<__m128d>() { return SSE::_mm_setallone_pd(); }

// zero{{{1
template <typename V> inline V zero();
template<> Vc_INTRINSIC Vc_CONST __m128  zero<__m128 >() { return _mm_setzero_ps(); }
template<> Vc_INTRINSIC Vc_CONST __m128i zero<__m128i>() { return _mm_setzero_si128(); }
template<> Vc_INTRINSIC Vc_CONST __m128d zero<__m128d>() { return _mm_setzero_pd(); }

// negate{{{1
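// The std::integral_constant argument carries sizeof(EntryType), selecting the
// element width. Floating-point negation just toggles the sign bit; for the
// integer variants SSSE3 uses _mm_sign_*, otherwise the result is 0 - v.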
Vc_ALWAYS_INLINE Vc_CONST __m128 negate(__m128 v, std::integral_constant<std::size_t, 4>)
{
    return _mm_xor_ps(v, SSE::_mm_setsignmask_ps());
}
Vc_ALWAYS_INLINE Vc_CONST __m128d negate(__m128d v, std::integral_constant<std::size_t, 8>)
{
    return _mm_xor_pd(v, SSE::_mm_setsignmask_pd());
}
Vc_ALWAYS_INLINE Vc_CONST __m128i negate(__m128i v, std::integral_constant<std::size_t, 4>)
{
#ifdef Vc_IMPL_SSSE3
    return _mm_sign_epi32(v, allone<__m128i>());
#else
    return _mm_sub_epi32(_mm_setzero_si128(), v);
#endif
}
Vc_ALWAYS_INLINE Vc_CONST __m128i negate(__m128i v, std::integral_constant<std::size_t, 2>)
{
#ifdef Vc_IMPL_SSSE3
    return _mm_sign_epi16(v, allone<__m128i>());
#else
    return _mm_sub_epi16(_mm_setzero_si128(), v);
#endif
}

// xor_{{{1
Vc_INTRINSIC __m128 xor_(__m128 a, __m128 b) { return _mm_xor_ps(a, b); }
Vc_INTRINSIC __m128d xor_(__m128d a, __m128d b) { return _mm_xor_pd(a, b); }
Vc_INTRINSIC __m128i xor_(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }

// or_{{{1
Vc_INTRINSIC __m128 or_(__m128 a, __m128 b) { return _mm_or_ps(a, b); }
Vc_INTRINSIC __m128d or_(__m128d a, __m128d b) { return _mm_or_pd(a, b); }
Vc_INTRINSIC __m128i or_(__m128i a, __m128i b) { return _mm_or_si128(a, b); }

// and_{{{1
Vc_INTRINSIC __m128 and_(__m128 a, __m128 b) { return _mm_and_ps(a, b); }
Vc_INTRINSIC __m128d and_(__m128d a, __m128d b) { return _mm_and_pd(a, b); }
Vc_INTRINSIC __m128i and_(__m128i a, __m128i b) { return _mm_and_si128(a, b); }

// andnot_{{{1
Vc_INTRINSIC __m128 andnot_(__m128 a, __m128 b) { return _mm_andnot_ps(a, b); }
Vc_INTRINSIC __m128d andnot_(__m128d a, __m128d b) { return _mm_andnot_pd(a, b); }
Vc_INTRINSIC __m128i andnot_(__m128i a, __m128i b) { return _mm_andnot_si128(a, b); }

// not_{{{1
Vc_INTRINSIC __m128  not_(__m128  a) { return andnot_(a, allone<__m128 >()); }
Vc_INTRINSIC __m128d not_(__m128d a) { return andnot_(a, allone<__m128d>()); }
Vc_INTRINSIC __m128i not_(__m128i a) { return andnot_(a, allone<__m128i>()); }

// add{{{1
Vc_INTRINSIC __m128  add(__m128  a, __m128  b,  float) { return _mm_add_ps(a, b); }
Vc_INTRINSIC __m128d add(__m128d a, __m128d b, double) { return _mm_add_pd(a, b); }
Vc_INTRINSIC __m128i add(__m128i a, __m128i b,    int) { return _mm_add_epi32(a, b); }
Vc_INTRINSIC __m128i add(__m128i a, __m128i b,   uint) { return _mm_add_epi32(a, b); }
Vc_INTRINSIC __m128i add(__m128i a, __m128i b,  short) { return _mm_add_epi16(a, b); }
Vc_INTRINSIC __m128i add(__m128i a, __m128i b, ushort) { return _mm_add_epi16(a, b); }
Vc_INTRINSIC __m128i add(__m128i a, __m128i b,  schar) { return _mm_add_epi8 (a, b); }
Vc_INTRINSIC __m128i add(__m128i a, __m128i b,  uchar) { return _mm_add_epi8 (a, b); }

// sub{{{1
Vc_INTRINSIC __m128  sub(__m128  a, __m128  b,  float) { return _mm_sub_ps(a, b); }
Vc_INTRINSIC __m128d sub(__m128d a, __m128d b, double) { return _mm_sub_pd(a, b); }
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b,    int) { return _mm_sub_epi32(a, b); }
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b,   uint) { return _mm_sub_epi32(a, b); }
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b,  short) { return _mm_sub_epi16(a, b); }
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, ushort) { return _mm_sub_epi16(a, b); }
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b,  schar) { return _mm_sub_epi8 (a, b); }
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b,  uchar) { return _mm_sub_epi8 (a, b); }

// mul{{{1
Vc_INTRINSIC __m128  mul(__m128  a, __m128  b,  float) { return _mm_mul_ps(a, b); }
Vc_INTRINSIC __m128d mul(__m128d a, __m128d b, double) { return _mm_mul_pd(a, b); }
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b,    int) {
#ifdef Vc_IMPL_SSE4_1
    return _mm_mullo_epi32(a, b);
#else
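    // SSE2 fallback for 32-bit mullo: _mm_mul_epu32 multiplies the even-indexed
    // elements (0 and 2) into 64-bit products; shifting a and b right by 4 bytes
    // does the same for elements 1 and 3. The low 32 bits of each product are
    // gathered (shuffle mask 8 == _MM_SHUFFLE(0, 0, 2, 0)) and interleaved back
    // into the original element order.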
    const __m128i aShift = _mm_srli_si128(a, 4);
    const __m128i ab02 = _mm_mul_epu32(a, b);  // [a0 * b0, a2 * b2]
    const __m128i bShift = _mm_srli_si128(b, 4);
    const __m128i ab13 = _mm_mul_epu32(aShift, bShift);  // [a1 * b1, a3 * b3]
    return _mm_unpacklo_epi32(_mm_shuffle_epi32(ab02, 8), _mm_shuffle_epi32(ab13, 8));
#endif
}
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b,   uint) { return mul(a, b, int()); }
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b,  short) { return _mm_mullo_epi16(a, b); }
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, ushort) { return _mm_mullo_epi16(a, b); }
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b,  schar) {
#ifdef Vc_USE_BUILTIN_VECTOR_TYPES
    using B = Common::BuiltinType<schar, 16>;
    const auto x = aliasing_cast<B>(a) * aliasing_cast<B>(b);
    return reinterpret_cast<const __m128i &>(x);
#else
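    // Fallback for byte-wise multiplication: the 16-bit multiply leaves the
    // products of the even-indexed bytes in the low byte of each 16-bit lane
    // (kept by the 0x00ff mask); shifting both operands right by one byte and
    // multiplying again yields the odd-indexed byte products, which are shifted
    // back into the high bytes before both halves are combined.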
    return or_(
        and_(_mm_mullo_epi16(a, b), _mm_srli_epi16(allone<__m128i>(), 8)),
        _mm_slli_epi16(_mm_mullo_epi16(_mm_srli_si128(a, 1), _mm_srli_si128(b, 1)), 8));
#endif
}
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b,  uchar) {
#ifdef Vc_USE_BUILTIN_VECTOR_TYPES
    using B = Common::BuiltinType<uchar, 16>;
    const auto x = aliasing_cast<B>(a) * aliasing_cast<B>(b);
    return reinterpret_cast<const __m128i &>(x);
#else
    return or_(
        and_(_mm_mullo_epi16(a, b), _mm_srli_epi16(allone<__m128i>(), 8)),
        _mm_slli_epi16(_mm_mullo_epi16(_mm_srli_si128(a, 1), _mm_srli_si128(b, 1)), 8));
#endif
}

// div{{{1
Vc_INTRINSIC __m128  div(__m128  a, __m128  b,  float) { return _mm_div_ps(a, b); }
Vc_INTRINSIC __m128d div(__m128d a, __m128d b, double) { return _mm_div_pd(a, b); }

// TODO: fma{{{1
//Vc_INTRINSIC __m128  fma(__m128  a, __m128  b, __m128  c,  float) { return _mm_mul_ps(a, b); }
//Vc_INTRINSIC __m128d fma(__m128d a, __m128d b, __m128d c, double) { return _mm_mul_pd(a, b); }
//Vc_INTRINSIC __m128i fma(__m128i a, __m128i b, __m128i c,    int) { }
//Vc_INTRINSIC __m128i fma(__m128i a, __m128i b, __m128i c,   uint) { return fma(a, b, int()); }
//Vc_INTRINSIC __m128i fma(__m128i a, __m128i b, __m128i c,  short) { return _mm_mullo_epi16(a, b); }
//Vc_INTRINSIC __m128i fma(__m128i a, __m128i b, __m128i c, ushort) { return _mm_mullo_epi16(a, b); }
//Vc_INTRINSIC __m128i fma(__m128i a, __m128i b, __m128i c,  schar) { }
//Vc_INTRINSIC __m128i fma(__m128i a, __m128i b, __m128i c,  uchar) { }

// min{{{1
Vc_INTRINSIC __m128  min(__m128  a, __m128  b,  float) { return _mm_min_ps(a, b); }
Vc_INTRINSIC __m128d min(__m128d a, __m128d b, double) { return _mm_min_pd(a, b); }
Vc_INTRINSIC __m128i min(__m128i a, __m128i b,    int) { return SSE::min_epi32(a, b); }
Vc_INTRINSIC __m128i min(__m128i a, __m128i b,   uint) { return SSE::min_epu32(a, b); }
Vc_INTRINSIC __m128i min(__m128i a, __m128i b,  short) { return _mm_min_epi16(a, b); }
Vc_INTRINSIC __m128i min(__m128i a, __m128i b, ushort) { return SSE::min_epu16(a, b); }
Vc_INTRINSIC __m128i min(__m128i a, __m128i b,  schar) { return SSE::min_epi8 (a, b); }
Vc_INTRINSIC __m128i min(__m128i a, __m128i b,  uchar) { return _mm_min_epu8 (a, b); }

// max{{{1
Vc_INTRINSIC __m128  max(__m128  a, __m128  b,  float) { return _mm_max_ps(a, b); }
Vc_INTRINSIC __m128d max(__m128d a, __m128d b, double) { return _mm_max_pd(a, b); }
Vc_INTRINSIC __m128i max(__m128i a, __m128i b,    int) { return SSE::max_epi32(a, b); }
Vc_INTRINSIC __m128i max(__m128i a, __m128i b,   uint) { return SSE::max_epu32(a, b); }
Vc_INTRINSIC __m128i max(__m128i a, __m128i b,  short) { return _mm_max_epi16(a, b); }
Vc_INTRINSIC __m128i max(__m128i a, __m128i b, ushort) { return SSE::max_epu16(a, b); }
Vc_INTRINSIC __m128i max(__m128i a, __m128i b,  schar) { return SSE::max_epi8 (a, b); }
Vc_INTRINSIC __m128i max(__m128i a, __m128i b,  uchar) { return _mm_max_epu8 (a, b); }

// horizontal add{{{1
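// Horizontal reductions: the trailing value argument only selects the element
// type. Each step folds the upper half of the remaining elements onto the lower
// half until the result sits in lane 0, which is then extracted.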
Vc_INTRINSIC  float add(__m128  a,  float) {
    a = _mm_add_ps(a, _mm_movehl_ps(a, a));
    a = _mm_add_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
    return _mm_cvtss_f32(a);
}
Vc_INTRINSIC double add(__m128d a, double) {
    a = _mm_add_sd(a, _mm_unpackhi_pd(a, a));
    return _mm_cvtsd_f64(a);
}
Vc_INTRINSIC    int add(__m128i a,    int) {
    a = add(a, _mm_srli_si128(a, 8), int());
    a = add(a, _mm_srli_si128(a, 4), int());
    return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC   uint add(__m128i a,   uint) { return add(a, int()); }
Vc_INTRINSIC  short add(__m128i a,  short) {
    a = add(a, _mm_srli_si128(a, 8), short());
    a = add(a, _mm_srli_si128(a, 4), short());
    a = add(a, _mm_srli_si128(a, 2), short());
    return _mm_cvtsi128_si32(a);  // & 0xffff is implicit
}
Vc_INTRINSIC ushort add(__m128i a, ushort) { return add(a, short()); }
Vc_INTRINSIC  schar add(__m128i a,  schar) {
    a = add(a, _mm_srli_si128(a, 8), schar());
    a = add(a, _mm_srli_si128(a, 4), schar());
    a = add(a, _mm_srli_si128(a, 2), schar());
    a = add(a, _mm_srli_si128(a, 1), schar());
    return _mm_cvtsi128_si32(a);  // & 0xff is implicit
}
Vc_INTRINSIC  uchar add(__m128i a,  uchar) { return add(a, schar()); }

// horizontal mul{{{1
Vc_INTRINSIC  float mul(__m128  a,  float) {
    a = _mm_mul_ps(a, _mm_movehl_ps(a, a));
    a = _mm_mul_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
    return _mm_cvtss_f32(a);
}
Vc_INTRINSIC double mul(__m128d a, double) {
    a = _mm_mul_sd(a, _mm_unpackhi_pd(a, a));
    return _mm_cvtsd_f64(a);
}
Vc_INTRINSIC    int mul(__m128i a,    int) {
    a = mul(a, _mm_srli_si128(a, 8), int());
    a = mul(a, _mm_srli_si128(a, 4), int());
    return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC   uint mul(__m128i a,   uint) { return mul(a, int()); }
Vc_INTRINSIC  short mul(__m128i a,  short) {
    a = mul(a, _mm_srli_si128(a, 8), short());
    a = mul(a, _mm_srli_si128(a, 4), short());
    a = mul(a, _mm_srli_si128(a, 2), short());
    return _mm_cvtsi128_si32(a);  // & 0xffff is implicit
}
Vc_INTRINSIC ushort mul(__m128i a, ushort) { return mul(a, short()); }
Vc_INTRINSIC  schar mul(__m128i a,  schar) {
    // convert to two short vectors, multiply them and then do horizontal reduction
    const __m128i s0 = _mm_srai_epi16(a, 1);
    const __m128i s1 = Detail::and_(a, _mm_set1_epi32(0x0f0f0f0f));
    return mul(mul(s0, s1, short()), short());
}
Vc_INTRINSIC  uchar mul(__m128i a,  uchar) { return mul(a, schar()); }

// horizontal min{{{1
Vc_INTRINSIC  float min(__m128  a,  float) {
    a = _mm_min_ps(a, _mm_movehl_ps(a, a));
    a = _mm_min_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
    return _mm_cvtss_f32(a);
}
Vc_INTRINSIC double min(__m128d a, double) {
    a = _mm_min_sd(a, _mm_unpackhi_pd(a, a));
    return _mm_cvtsd_f64(a);
}
Vc_INTRINSIC    int min(__m128i a,    int) {
    a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), int());
    a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), int());
    return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC   uint min(__m128i a,   uint) {
    a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), uint());
    a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), uint());
    return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC  short min(__m128i a,  short) {
    a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), short());
    a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), short());
    a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), short());
    return _mm_cvtsi128_si32(a);  // & 0xffff is implicit
}
Vc_INTRINSIC ushort min(__m128i a, ushort) {
    a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), ushort());
    a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), ushort());
    a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), ushort());
    return _mm_cvtsi128_si32(a);  // & 0xffff is implicit
}
Vc_INTRINSIC  schar min(__m128i a,  schar) {
    a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
    a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
    a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), schar());
    return std::min(schar(_mm_cvtsi128_si32(a) >> 8), schar(_mm_cvtsi128_si32(a)));
}
Vc_INTRINSIC  uchar min(__m128i a,  uchar) {
    a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
    a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
    a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), schar());
    return std::min((_mm_cvtsi128_si32(a) >> 8) & 0xff, _mm_cvtsi128_si32(a) & 0xff);
}

// horizontal max{{{1
Vc_INTRINSIC  float max(__m128  a,  float) {
    a = _mm_max_ps(a, _mm_movehl_ps(a, a));
    a = _mm_max_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
    return _mm_cvtss_f32(a);
}
Vc_INTRINSIC double max(__m128d a, double) {
    a = _mm_max_sd(a, _mm_unpackhi_pd(a, a));
    return _mm_cvtsd_f64(a);
}
Vc_INTRINSIC    int max(__m128i a,    int) {
    a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), int());
    a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), int());
    return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC   uint max(__m128i a,   uint) {
    a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), uint());
    a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), uint());
    return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC  short max(__m128i a,  short) {
    a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), short());
    a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), short());
    a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), short());
    return _mm_cvtsi128_si32(a);  // & 0xffff is implicit
}
Vc_INTRINSIC ushort max(__m128i a, ushort) {
    a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), ushort());
    a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), ushort());
    a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), ushort());
    return _mm_cvtsi128_si32(a);  // & 0xffff is implicit
}
Vc_INTRINSIC  schar max(__m128i a,  schar) {
    a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
    a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
    a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), schar());
    return std::max(schar(_mm_cvtsi128_si32(a) >> 8), schar(_mm_cvtsi128_si32(a)));
}
Vc_INTRINSIC  uchar max(__m128i a,  uchar) {
    a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
    a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
    a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), schar());
    return std::max((_mm_cvtsi128_si32(a) >> 8) & 0xff, _mm_cvtsi128_si32(a) & 0xff);
}

// sorted{{{1
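// sorted() forwards to the Implementation-specific overload declared below,
// collapsing the SSE2...SSSE3 targets onto the SSE2 code path and the
// SSE4.1/SSE4.2 targets onto the SSE4.1 code path.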
template <Vc::Implementation, typename T>
Vc_CONST_L SSE::Vector<T> sorted(SSE::Vector<T> x) Vc_CONST_R;
template <typename T> Vc_INTRINSIC Vc_CONST SSE::Vector<T> sorted(SSE::Vector<T> x)
{
    static_assert(!CurrentImplementation::is(ScalarImpl),
                  "Detail::sorted can only be instantiated if a non-Scalar "
                  "implementation is selected.");
    return sorted < CurrentImplementation::is_between(SSE2Impl, SSSE3Impl)
               ? SSE2Impl
               : CurrentImplementation::is_between(SSE41Impl, SSE42Impl)
                     ? SSE41Impl
                     : CurrentImplementation::current() > (x);
}

// sanitize{{{1
template <typename V> constexpr int sanitize(int n)
{
    return (n >= int(sizeof(V)) || n <= -int(sizeof(V))) ? 0 : n;
}

// rotated{{{1
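// rotated<T, N>(v, amount) rotates the N elements of a 16-byte vector towards
// lower indices by `amount` positions via _mm_alignr_epi8 (byte offset
// amount * sizeof(T)); sanitize clamps out-of-range byte offsets to 0 so the
// immediate stays valid.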
template <typename T, size_t N, typename V>
static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 16), V> rotated(V v, int amount)
{
    using namespace SSE;
    switch (static_cast<unsigned int>(amount) % N) {
    case 0:
        return v;
    case 1:
        return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(1 * sizeof(T))));
    case 2:
        return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(2 * sizeof(T))));
    case 3:
        return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(3 * sizeof(T))));
    case 4:
        return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(4 * sizeof(T))));
    case 5:
        return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(5 * sizeof(T))));
    case 6:
        return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(6 * sizeof(T))));
    case 7:
        return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(7 * sizeof(T))));
    }
    return sse_cast<V>(_mm_setzero_si128());
}

//InterleaveImpl{{{1
template<typename V, size_t Size, size_t VSize> struct InterleaveImpl;
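// InterleaveImpl<V, 8, 16> handles vectors with 8 16-bit entries in a 16-byte
// register. interleave() writes the k-th entry of each passed vector to
// consecutive memory locations starting at data[i[k]]; deinterleave() performs
// the inverse gather.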
template<typename V> struct InterleaveImpl<V, 8, 16> {
    template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/
            const typename V::AsArg v0, const typename V::AsArg v1)
    {
        const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v1.data());
        const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v1.data());
#ifdef __x86_64__
        const long long tmp00 = _mm_cvtsi128_si64(tmp0);
        const long long tmp01 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(tmp0, tmp0));
        const long long tmp10 = _mm_cvtsi128_si64(tmp1);
        const long long tmp11 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(tmp1, tmp1));
        aliasing_cast<int>(data[i[0]]) = tmp00;
        aliasing_cast<int>(data[i[1]]) = tmp00 >> 32;
        aliasing_cast<int>(data[i[2]]) = tmp01;
        aliasing_cast<int>(data[i[3]]) = tmp01 >> 32;
        aliasing_cast<int>(data[i[4]]) = tmp10;
        aliasing_cast<int>(data[i[5]]) = tmp10 >> 32;
        aliasing_cast<int>(data[i[6]]) = tmp11;
        aliasing_cast<int>(data[i[7]]) = tmp11 >> 32;
#elif defined(Vc_IMPL_SSE4_1)
        using namespace SseIntrinsics;
        aliasing_cast<int>(data[i[0]]) = _mm_cvtsi128_si32(tmp0);
        aliasing_cast<int>(data[i[1]]) = extract_epi32<1>(tmp0);
        aliasing_cast<int>(data[i[2]]) = extract_epi32<2>(tmp0);
        aliasing_cast<int>(data[i[3]]) = extract_epi32<3>(tmp0);
        aliasing_cast<int>(data[i[4]]) = _mm_cvtsi128_si32(tmp1);
        aliasing_cast<int>(data[i[5]]) = extract_epi32<1>(tmp1);
        aliasing_cast<int>(data[i[6]]) = extract_epi32<2>(tmp1);
        aliasing_cast<int>(data[i[7]]) = extract_epi32<3>(tmp1);
#else
        aliasing_cast<int>(data[i[0]]) = _mm_cvtsi128_si32(tmp0);
        aliasing_cast<int>(data[i[1]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 4));
        aliasing_cast<int>(data[i[2]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 8));
        aliasing_cast<int>(data[i[3]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 12));
        aliasing_cast<int>(data[i[4]]) = _mm_cvtsi128_si32(tmp1);
        aliasing_cast<int>(data[i[5]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 4));
        aliasing_cast<int>(data[i[6]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 8));
        aliasing_cast<int>(data[i[7]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 12));
#endif
    }/*}}}*/
    static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,/*{{{*/
            const typename V::AsArg v0, const typename V::AsArg v1)
    {
        const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v1.data());
        const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v1.data());
        V(tmp0).store(&data[i[0]], Vc::Unaligned);
        V(tmp1).store(&data[i[4]], Vc::Unaligned);
    }/*}}}*/
    template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/
            const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2)
    {
#if defined Vc_USE_MASKMOV_SCATTER && !defined Vc_MSVC
        // MSVC fails to compile the MMX intrinsics
        const __m64 mask = _mm_set_pi16(0, -1, -1, -1);
        const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data());
        const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data());
        const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v1.data());
        const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v1.data());

        const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2);
        const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2);
        const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3);
        const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3);

        _mm_maskmove_si64(_mm_movepi64_pi64(tmp4), mask, reinterpret_cast<char *>(&data[i[0]]));
        _mm_maskmove_si64(_mm_movepi64_pi64(_mm_srli_si128(tmp4, 8)), mask, reinterpret_cast<char *>(&data[i[1]]));
        _mm_maskmove_si64(_mm_movepi64_pi64(tmp5), mask, reinterpret_cast<char *>(&data[i[2]]));
        _mm_maskmove_si64(_mm_movepi64_pi64(_mm_srli_si128(tmp5, 8)), mask, reinterpret_cast<char *>(&data[i[3]]));
        _mm_maskmove_si64(_mm_movepi64_pi64(tmp6), mask, reinterpret_cast<char *>(&data[i[4]]));
        _mm_maskmove_si64(_mm_movepi64_pi64(_mm_srli_si128(tmp6, 8)), mask, reinterpret_cast<char *>(&data[i[5]]));
        _mm_maskmove_si64(_mm_movepi64_pi64(tmp7), mask, reinterpret_cast<char *>(&data[i[6]]));
        _mm_maskmove_si64(_mm_movepi64_pi64(_mm_srli_si128(tmp7, 8)), mask, reinterpret_cast<char *>(&data[i[7]]));
        _mm_empty();
#else
        interleave(data, i, v0, v1);
        v2.scatter(data + 2, i);
#endif
    }/*}}}*/
    template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/
            const typename V::AsArg v0, const typename V::AsArg v1,
            const typename V::AsArg v2, const typename V::AsArg v3)
    {
        const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data());
        const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data());
        const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v3.data());
        const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v3.data());

        const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2);
        const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2);
        const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3);
        const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3);

        _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[0]]), tmp4);
        _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[2]]), tmp5);
        _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[4]]), tmp6);
        _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[6]]), tmp7);
        _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), _mm_castsi128_ps(tmp4));
        _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), _mm_castsi128_ps(tmp5));
        _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), _mm_castsi128_ps(tmp6));
        _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), _mm_castsi128_ps(tmp7));
    }/*}}}*/
    static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<4> &i,/*{{{*/
            const typename V::AsArg v0, const typename V::AsArg v1,
            const typename V::AsArg v2, const typename V::AsArg v3)
    {
        const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data());
        const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data());
        const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v3.data());
        const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v3.data());

        const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2);
        const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2);
        const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3);
        const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3);

        V(tmp4).store(&data[i[0]], ::Vc::Unaligned);
        V(tmp5).store(&data[i[2]], ::Vc::Unaligned);
        V(tmp6).store(&data[i[4]], ::Vc::Unaligned);
        V(tmp7).store(&data[i[6]], ::Vc::Unaligned);
    }/*}}}*/
    template <typename I>  // interleave 5 args{{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4)
    {
        interleave(data, i, v0, v1, v2, v3);
        v4.scatter(data + 4, i);
    }
    template <typename I>  // interleave 6 args{{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5);
    }
    template <typename I>  // interleave 7 args{{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5,
                                  const typename V::AsArg v6)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5, v6);
    }
    template <typename I>  // interleave 8 args{{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5,
                                  const typename V::AsArg v6, const typename V::AsArg v7)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5, v6, v7);
    }
    //}}}2
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data, /*{{{*/
            const I &i, V &v0, V &v1)
    {
        const __m128i a = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[0]]));
        const __m128i b = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[1]]));
        const __m128i c = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[2]]));
        const __m128i d = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[3]]));
        const __m128i e = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[4]]));
        const __m128i f = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[5]]));
        const __m128i g = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[6]]));
        const __m128i h = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[7]]));

        const __m128i tmp2  = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
        const __m128i tmp3  = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
        const __m128i tmp4  = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
        const __m128i tmp5  = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7

        const __m128i tmp0  = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
        const __m128i tmp1  = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7

        v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
        v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2)
    {
        const __m128i a = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[0]]));
        const __m128i b = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[1]]));
        const __m128i c = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[2]]));
        const __m128i d = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[3]]));
        const __m128i e = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[4]]));
        const __m128i f = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[5]]));
        const __m128i g = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[6]]));
        const __m128i h = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[7]]));

        const __m128i tmp2  = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
        const __m128i tmp4  = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
        const __m128i tmp3  = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
        const __m128i tmp5  = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7

        const __m128i tmp0  = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
        const __m128i tmp1  = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
        const __m128i tmp6  = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
        const __m128i tmp7  = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7

        v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
        v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
        v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3)
    {
        const __m128i a = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[0]]));
        const __m128i b = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[1]]));
        const __m128i c = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[2]]));
        const __m128i d = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[3]]));
        const __m128i e = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[4]]));
        const __m128i f = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[5]]));
        const __m128i g = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[6]]));
        const __m128i h = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[7]]));

        const __m128i tmp2  = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
        const __m128i tmp4  = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
        const __m128i tmp3  = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
        const __m128i tmp5  = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7

        const __m128i tmp0  = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
        const __m128i tmp1  = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
        const __m128i tmp6  = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
        const __m128i tmp7  = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7

        v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
        v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
        v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
        v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
    }/*}}}*/
1057     template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
1058             const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
1059     {
1060         const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]]));
1061         const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]]));
1062         const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]]));
1063         const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]]));
1064         const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]]));
1065         const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]]));
1066         const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]]));
1067         const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]]));
1068 
1069         const __m128i tmp2  = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
1070         const __m128i tmp4  = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
1071         const __m128i tmp3  = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
1072         const __m128i tmp5  = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
1073         const __m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4
1074         const __m128i tmp11 = _mm_unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5
1075         const __m128i tmp12 = _mm_unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6
        const __m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7

        const __m128i tmp0  = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
        const __m128i tmp1  = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
        const __m128i tmp6  = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
        const __m128i tmp7  = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
        const __m128i tmp8  = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6
        const __m128i tmp9  = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7

        v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
        v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
        v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
        v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
        v4.data() = _mm_unpacklo_epi16(tmp8, tmp9);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
    {
        const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]]));
        const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]]));
        const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]]));
        const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]]));
        const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]]));
        const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]]));
        const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]]));
        const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]]));

        const __m128i tmp2  = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
        const __m128i tmp4  = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
        const __m128i tmp3  = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
        const __m128i tmp5  = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
        const __m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4
        const __m128i tmp11 = _mm_unpackhi_epi16(c, g); // e2 e6 f2 f6 g2 g6 h2 h6
        const __m128i tmp12 = _mm_unpackhi_epi16(b, f); // e1 e5 f1 f5 g1 g5 h1 h5
        const __m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7

        const __m128i tmp0  = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
        const __m128i tmp1  = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
        const __m128i tmp6  = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
        const __m128i tmp7  = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
        const __m128i tmp8  = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6
        const __m128i tmp9  = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7

        v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
        v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
        v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
        v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
        v4.data() = _mm_unpacklo_epi16(tmp8, tmp9);
        v5.data() = _mm_unpackhi_epi16(tmp8, tmp9);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
    {
        const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]]));
        const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]]));
        const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]]));
        const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]]));
        const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]]));
        const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]]));
        const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]]));
        const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]]));

        const __m128i tmp2  = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
        const __m128i tmp4  = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
        const __m128i tmp3  = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
        const __m128i tmp5  = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
        const __m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4
        const __m128i tmp11 = _mm_unpackhi_epi16(c, g); // e2 e6 f2 f6 g2 g6 h2 h6
        const __m128i tmp12 = _mm_unpackhi_epi16(b, f); // e1 e5 f1 f5 g1 g5 h1 h5
        const __m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7

        const __m128i tmp0  = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
        const __m128i tmp1  = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
        const __m128i tmp6  = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
        const __m128i tmp7  = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
        const __m128i tmp8  = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6
        const __m128i tmp9  = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7
        const __m128i tmp14 = _mm_unpackhi_epi16(tmp10, tmp11); // g0 g2 g4 g6 h0 h2 h4 h6
        const __m128i tmp15 = _mm_unpackhi_epi16(tmp12, tmp13); // g1 g3 g5 g7 h1 h3 h5 h7

        v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
        v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
        v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
        v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
        v4.data() = _mm_unpacklo_epi16(tmp8, tmp9);
        v5.data() = _mm_unpackhi_epi16(tmp8, tmp9);
        v6.data() = _mm_unpacklo_epi16(tmp14, tmp15);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
    {
        const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]]));
        const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]]));
        const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]]));
        const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]]));
        const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]]));
        const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]]));
        const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]]));
        const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]]));

        const __m128i tmp2  = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
        const __m128i tmp4  = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
        const __m128i tmp3  = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
        const __m128i tmp5  = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
        const __m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4
        const __m128i tmp11 = _mm_unpackhi_epi16(c, g); // e2 e6 f2 f6 g2 g6 h2 h6
        const __m128i tmp12 = _mm_unpackhi_epi16(b, f); // e1 e5 f1 f5 g1 g5 h1 h5
        const __m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7

        const __m128i tmp0  = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
        const __m128i tmp1  = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
        const __m128i tmp6  = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
        const __m128i tmp7  = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
        const __m128i tmp8  = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6
        const __m128i tmp9  = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7
        const __m128i tmp14 = _mm_unpackhi_epi16(tmp10, tmp11); // g0 g2 g4 g6 h0 h2 h4 h6
        const __m128i tmp15 = _mm_unpackhi_epi16(tmp12, tmp13); // g1 g3 g5 g7 h1 h3 h5 h7

        v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
        v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
        v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
        v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
        v4.data() = _mm_unpacklo_epi16(tmp8, tmp9);
        v5.data() = _mm_unpackhi_epi16(tmp8, tmp9);
        v6.data() = _mm_unpacklo_epi16(tmp14, tmp15);
        v7.data() = _mm_unpackhi_epi16(tmp14, tmp15);
    }/*}}}*/
};
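// InterleaveImpl for vectors with 4 entries of 4 bytes each in a 16-byte register
// (i.e. the SSE float and 32-bit integer vectors). Interleaving and deinterleaving
// are 4x4 transposes built from _mm_unpack{lo,hi}_ps and _mm_move{lh,hl}_ps on the
// casted register contents.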
template<typename V> struct InterleaveImpl<V, 4, 16> {
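    // Fast path for Common::SuccessiveEntries<2>: the four two-member records lie
    // back to back in memory, so the interleaved result can be written with two
    // unaligned 16-byte stores instead of four 8-byte half-register stores.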
    static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,/*{{{*/
            const typename V::AsArg v0, const typename V::AsArg v1)
    {
        const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
        const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
        _mm_storeu_ps(aliasing_cast<float>(&data[i[0]]), tmp0);
        _mm_storeu_ps(aliasing_cast<float>(&data[i[2]]), tmp1);
    }/*}}}*/
    template <typename I>  // interleave 2 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1)
    {
        const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
        const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
        _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), tmp0);
        _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), tmp0);
        _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), tmp1);
        _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), tmp1);
    }
    template <typename I>  // interleave 3 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2)
    {
#ifdef Vc_USE_MASKMOV_SCATTER
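        // Store three interleaved members per record with a byte-granular masked
        // store: the mask (0, -1, -1, -1) limits each _mm_maskmoveu_si128 to the low
        // 12 bytes, so the fourth element of the destination record stays untouched.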
        const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data()));
        const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data()));
        const __m128 tmp2 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v2.data()), SSE::sse_cast<__m128>(v2.data()));
        const __m128 tmp3 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v2.data()), SSE::sse_cast<__m128>(v2.data()));
        const __m128i mask = _mm_set_epi32(0, -1, -1, -1);
        _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp0, tmp2)), mask, reinterpret_cast<char *>(&data[i[0]]));
        _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp2, tmp0)), mask, reinterpret_cast<char *>(&data[i[1]]));
        _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp1, tmp3)), mask, reinterpret_cast<char *>(&data[i[2]]));
        _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp3, tmp1)), mask, reinterpret_cast<char *>(&data[i[3]]));
#else
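        // Fallback without masked stores: interleave v0/v1 via 64-bit half-register
        // stores and scatter v2 into the third member slot of each record.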
        const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
        const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
        _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), tmp0);
        _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), tmp0);
        _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), tmp1);
        _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), tmp1);
        v2.scatter(data + 2, i);
#endif
    }
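    // The 4-argument overload is a full 4x4 transpose (essentially the shuffle
    // sequence of _MM_TRANSPOSE4_PS) followed by four unaligned 16-byte stores.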
    template <typename I>  // interleave 4 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3)
    {
        const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
        const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
        const __m128 tmp2 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v2.data()),SSE::sse_cast<__m128>(v3.data()));
        const __m128 tmp3 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v2.data()),SSE::sse_cast<__m128>(v3.data()));
        _mm_storeu_ps(aliasing_cast<float>(&data[i[0]]), _mm_movelh_ps(tmp0, tmp2));
        _mm_storeu_ps(aliasing_cast<float>(&data[i[1]]), _mm_movehl_ps(tmp2, tmp0));
        _mm_storeu_ps(aliasing_cast<float>(&data[i[2]]), _mm_movelh_ps(tmp1, tmp3));
        _mm_storeu_ps(aliasing_cast<float>(&data[i[3]]), _mm_movehl_ps(tmp3, tmp1));
    }
    template <typename I>  // interleave 5 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4)
    {
        interleave(data, i, v0, v1, v2, v3);
        v4.scatter(data + 4, i);
    }
    template <typename I>  // interleave 6 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5);
    }
    template <typename I>  // interleave 7 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5,
                                  const typename V::AsArg v6)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5, v6);
    }
    template <typename I>  // interleave 8 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5,
                                  const typename V::AsArg v6, const typename V::AsArg v7)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5, v6, v7);
    }
    //}}}2
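    // Deinterleaving mirrors the stores above: two outputs need only an 8-byte load
    // per record, three and four outputs load the full 16-byte record and transpose
    // it back, and five or more outputs reuse the 4-output transpose plus a gather
    // or a further deinterleave for the remaining members.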
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1)
    {
        const __m128 a = _mm_castpd_ps(_mm_load_sd(aliasing_cast<double>(&data[i[0]])));
        const __m128 b = _mm_castpd_ps(_mm_load_sd(aliasing_cast<double>(&data[i[1]])));
        const __m128 c = _mm_castpd_ps(_mm_load_sd(aliasing_cast<double>(&data[i[2]])));
        const __m128 d = _mm_castpd_ps(_mm_load_sd(aliasing_cast<double>(&data[i[3]])));

        const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1]
        const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3]

        v0.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp0, tmp1));
        v1.data() = SSE::sse_cast<typename V::VectorType>(_mm_movehl_ps(tmp1, tmp0));
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2)
    {
        const __m128 a = _mm_loadu_ps(aliasing_cast<float>(&data[i[0]]));
        const __m128 b = _mm_loadu_ps(aliasing_cast<float>(&data[i[1]]));
        const __m128 c = _mm_loadu_ps(aliasing_cast<float>(&data[i[2]]));
        const __m128 d = _mm_loadu_ps(aliasing_cast<float>(&data[i[3]]));

        const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1]
        const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3]
        const __m128 tmp2 = _mm_unpackhi_ps(a, b); // [c0 c1 XX XX]
        const __m128 tmp3 = _mm_unpackhi_ps(c, d); // [c2 c3 XX XX]

        v0.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp0, tmp1));
        v1.data() = SSE::sse_cast<typename V::VectorType>(_mm_movehl_ps(tmp1, tmp0));
        v2.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp2, tmp3));
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3)
    {
        const __m128 a = _mm_loadu_ps(aliasing_cast<float>(&data[i[0]]));
        const __m128 b = _mm_loadu_ps(aliasing_cast<float>(&data[i[1]]));
        const __m128 c = _mm_loadu_ps(aliasing_cast<float>(&data[i[2]]));
        const __m128 d = _mm_loadu_ps(aliasing_cast<float>(&data[i[3]]));

        const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1]
        const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3]
        const __m128 tmp2 = _mm_unpackhi_ps(a, b); // [c0 c1 d0 d1]
        const __m128 tmp3 = _mm_unpackhi_ps(c, d); // [c2 c3 d2 d3]

        v0.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp0, tmp1));
        v1.data() = SSE::sse_cast<typename V::VectorType>(_mm_movehl_ps(tmp1, tmp0));
        v2.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp2, tmp3));
        v3.data() = SSE::sse_cast<typename V::VectorType>(_mm_movehl_ps(tmp3, tmp2));
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
    {
        deinterleave(data, i, v0, v1, v2, v3);
        v4.gather(data + 4, i);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
    {
        deinterleave(data, i, v0, v1, v2, v3);
        deinterleave(data + 4, i, v4, v5);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
    {
        deinterleave(data, i, v0, v1, v2, v3);
        deinterleave(data + 4, i, v4, v5, v6);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
    {
        deinterleave(data, i, v0, v1, v2, v3);
        deinterleave(data + 4, i, v4, v5, v6, v7);
    }/*}}}*/
};
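// InterleaveImpl for vectors with 2 entries of 8 bytes each in a 16-byte register
// (i.e. the SSE double vector). Two-member records are handled directly with
// _mm_unpack{lo,hi}_pd; larger records decompose into successive pairs, with a
// gather/scatter taking care of an odd remainder.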
template<typename V> struct InterleaveImpl<V, 2, 16> {
    template <typename I>  // interleave 2 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1)
    {
        const __m128d tmp0 = _mm_unpacklo_pd(v0.data(), v1.data());
        const __m128d tmp1 = _mm_unpackhi_pd(v0.data(), v1.data());
        _mm_storeu_pd(&data[i[0]], tmp0);
        _mm_storeu_pd(&data[i[1]], tmp1);
    }
    template <typename I>  // interleave 3 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2)
    {
        interleave(data, i, v0, v1);
        v2.scatter(data + 2, i);
    }
    template <typename I>  // interleave 4 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3)
    {
        interleave(data, i, v0, v1);
        interleave(data + 2, i, v2, v3);
    }
    template <typename I>  // interleave 5 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4)
    {
        interleave(data, i, v0, v1, v2, v3);
        v4.scatter(data + 4, i);
    }
    template <typename I>  // interleave 6 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5);
    }
    template <typename I>  // interleave 7 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5,
                                  const typename V::AsArg v6)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5, v6);
    }
    template <typename I>  // interleave 8 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5,
                                  const typename V::AsArg v6, const typename V::AsArg v7)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5, v6, v7);
    }
    //}}}2
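    // Deinterleave counterparts: each pair of outputs is reconstructed from two
    // unaligned 16-byte loads, and odd output counts finish with a gather.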
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1)
    {
        const __m128d a = _mm_loadu_pd(&data[i[0]]);
        const __m128d b = _mm_loadu_pd(&data[i[1]]);

        v0.data() = _mm_unpacklo_pd(a, b);
        v1.data() = _mm_unpackhi_pd(a, b);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2)
    {
        v2.gather(data + 2, i);
        deinterleave(data, i, v0, v1);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3)
    {
        deinterleave(data, i, v0, v1);
        deinterleave(data + 2, i, v2, v3);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
    {
        deinterleave(data, i, v0, v1);
        deinterleave(data + 2, i, v2, v3);
        v4.gather(data + 4, i);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
    {
        deinterleave(data, i, v0, v1);
        deinterleave(data + 2, i, v2, v3);
        deinterleave(data + 4, i, v4, v5);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
    {
        deinterleave(data, i, v0, v1);
        deinterleave(data + 2, i, v2, v3);
        deinterleave(data + 4, i, v4, v5);
        v6.gather(data + 6, i);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
    {
        deinterleave(data, i, v0, v1);
        deinterleave(data + 2, i, v2, v3);
        deinterleave(data + 4, i, v4, v5);
        deinterleave(data + 6, i, v6, v7);
    }/*}}}*/
};
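// For illustration only: these InterleaveImpl specializations are internal helpers.
// User code normally reaches them through Vc's interleaved-memory interface, roughly
// along these lines (a sketch, assuming the public Vc::InterleavedMemoryWrapper and
// Vc::tie facilities; the struct and array names below are made up):
//
//     struct Particle { float x, y, z, w; };
//     Particle particles[1024];
//     Vc::InterleavedMemoryWrapper<Particle, Vc::float_v> wrapper(&particles[0]);
//     for (std::size_t j = 0; j < 1024; j += Vc::float_v::Size) {
//         Vc::float_v x, y, z, w;
//         Vc::tie(x, y, z, w) = wrapper[j];   // deinterleaving load
//         // ... compute on x, y, z, w ...
//         wrapper[j] = Vc::tie(x, y, z, w);   // interleaving store
//     }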

//}}}1
}  // namespace Detail
}  // namespace Vc

#endif  // VC_SSE_DETAIL_H_

// vim: foldmethod=marker