/*  This file is part of the Vc library. {{{
Copyright © 2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_SSE_DETAIL_H_
#define VC_SSE_DETAIL_H_

#include "casts.h"
#ifdef Vc_IMPL_AVX
#include "../avx/intrinsics.h"
#endif
#include "vectorhelper.h"

#include "macros.h"

namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
// (converting) load functions {{{1
template <typename V, typename DstT> struct LoadTag
{
};

// when_(un)aligned{{{2
class when_aligned
{
public:
    template <typename F> constexpr when_aligned(F, typename F::EnableIfAligned = nullptr)
    {
    }
};

class when_unaligned
{
public:
    template <typename F>
    constexpr when_unaligned(F, typename F::EnableIfUnaligned = nullptr)
    {
    }
};

class when_streaming
{
public:
    template <typename F>
    constexpr when_streaming(F, typename F::EnableIfStreaming = nullptr)
    {
    }
};

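// Hedged note (illustrative, not from the original sources): the three tag
// classes above are implicitly constructible only from load/store flag types
// that provide the matching EnableIf* member typedef. Passing a flags object
// to an overload set taking when_aligned / when_unaligned / when_streaming
// therefore lets ordinary overload resolution select the matching load
// variant, with no explicit branching on the flag type.
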
// load16{{{2
Vc_INTRINSIC __m128 load16(const float *mem, when_aligned)
{
    return _mm_load_ps(mem);
}
Vc_INTRINSIC __m128 load16(const float *mem, when_unaligned)
{
    return _mm_loadu_ps(mem);
}
Vc_INTRINSIC __m128 load16(const float *mem, when_streaming)
{
    return SseIntrinsics::_mm_stream_load(mem);
}
Vc_INTRINSIC __m128d load16(const double *mem, when_aligned)
{
    return _mm_load_pd(mem);
}
Vc_INTRINSIC __m128d load16(const double *mem, when_unaligned)
{
    return _mm_loadu_pd(mem);
}
Vc_INTRINSIC __m128d load16(const double *mem, when_streaming)
{
    return SseIntrinsics::_mm_stream_load(mem);
}
template <class T> Vc_INTRINSIC __m128i load16(const T *mem, when_aligned)
{
    static_assert(std::is_integral<T>::value, "load16<T> is only intended for integral T");
    return _mm_load_si128(reinterpret_cast<const __m128i *>(mem));
}
template <class T> Vc_INTRINSIC __m128i load16(const T *mem, when_unaligned)
{
    static_assert(std::is_integral<T>::value, "load16<T> is only intended for integral T");
    return _mm_loadu_si128(reinterpret_cast<const __m128i *>(mem));
}
template <class T> Vc_INTRINSIC __m128i load16(const T *mem, when_streaming)
{
    static_assert(std::is_integral<T>::value, "load16<T> is only intended for integral T");
    return SseIntrinsics::_mm_stream_load(mem);
}

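// A minimal usage sketch for the overloads above (illustrative only; assumes
// the Vc::Aligned / Vc::Streaming flag objects from Vc's load/store flags):
//
//   alignas(16) float buf[4] = {1.f, 2.f, 3.f, 4.f};
//   __m128 v = load16(buf, Vc::Aligned);    // -> _mm_load_ps
//   __m128 w = load16(buf, Vc::Streaming);  // -> non-temporal stream load
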
115
116 // MSVC workarounds{{{2
117 #ifdef Vc_MSVC
118 // work around: "fatal error C1001: An internal error has occurred in the compiler."
119 template <typename V, typename DstT, typename F>
120 Vc_INTRINSIC __m128d load(const double *mem, F f,
121 enable_if<(std::is_same<DstT, double>::value &&
122 std::is_same<V, __m128d>::value)> = nullarg)
123 {
124 return load16(mem, f);
125 }
126
127 template <typename V, typename DstT, typename F>
128 Vc_INTRINSIC __m128 load(const float *mem, F f,
129 enable_if<(std::is_same<DstT, float>::value &&
130 std::is_same<V, __m128>::value)> = nullarg)
131 {
132 return load16(mem, f);
133 }
134
135 template <typename V, typename DstT, typename F>
136 Vc_INTRINSIC __m128i load(const uint *mem, F f,
137 enable_if<(std::is_same<DstT, uint>::value &&
138 std::is_same<V, __m128i>::value)> = nullarg)
139 {
140 return load16(mem, f);
141 }
142
143 template <typename V, typename DstT, typename F>
144 Vc_INTRINSIC __m128i load(const int *mem, F f,
145 enable_if<(std::is_same<DstT, int>::value &&
146 std::is_same<V, __m128i>::value)> = nullarg)
147 {
148 return load16(mem, f);
149 }
150
151 template <typename V, typename DstT, typename F>
152 Vc_INTRINSIC __m128i load(const short *mem, F f,
153 enable_if<(std::is_same<DstT, short>::value &&
154 std::is_same<V, __m128i>::value)> = nullarg)
155 {
156 return load16(mem, f);
157 }
158
159 template <typename V, typename DstT, typename F>
160 Vc_INTRINSIC __m128i load(const ushort *mem, F f,
161 enable_if<(std::is_same<DstT, ushort>::value &&
162 std::is_same<V, __m128i>::value)> = nullarg)
163 {
164 return load16(mem, f);
165 }
166 #endif // Vc_MSVC
167
168 // generic load{{{2
169 template <typename V, typename DstT, typename SrcT, typename Flags,
170 typename = enable_if<
171 #ifdef Vc_MSVC
172 !std::is_same<DstT, SrcT>::value &&
173 #endif
174 (!std::is_integral<DstT>::value || !std::is_integral<SrcT>::value ||
175 sizeof(DstT) >= sizeof(SrcT))>>
load(const SrcT * mem,Flags flags)176 Vc_INTRINSIC V load(const SrcT *mem, Flags flags)
177 {
178 return load(mem, flags, LoadTag<V, DstT>());
179 }
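
// Hedged example of the dispatch above (not part of the library): requesting
// an int vector from ushort memory forwards to the converting overload that
// is selected via the LoadTag<__m128i, int> tag parameter:
//
//   const ushort src[4] = {1, 2, 3, 4};
//   __m128i v = load<__m128i, int>(src, Vc::Unaligned);
//   // -> load(src, flags, LoadTag<__m128i, int>()) -> SSE::cvtepu16_epi32(...)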

// no conversion load from any T {{{2
template <typename V, typename T, typename Flags>
Vc_INTRINSIC V
load(const T *mem, Flags, LoadTag<V, T>, enable_if<sizeof(V) == 16> = nullarg)
{
    return SSE::VectorHelper<V>::template load<Flags>(mem);
}

// short {{{2
template <typename Flags>
Vc_INTRINSIC __m128i load(const ushort *mem, Flags, LoadTag<__m128i, short>)
{
    return SSE::VectorHelper<__m128i>::load<Flags>(mem);
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const uchar *mem, Flags, LoadTag<__m128i, short>)
{
    // the only available streaming load loads 16 bytes - twice as much as we need =>
    // can't use it, or we risk an out-of-bounds read and an unaligned load exception
    return SSE::cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const schar *mem, Flags, LoadTag<__m128i, short>)
{
    // the only available streaming load loads 16 bytes - twice as much as we need =>
    // can't use it, or we risk an out-of-bounds read and an unaligned load exception
    return SSE::cvtepi8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}

// ushort {{{2
template <typename Flags>
Vc_INTRINSIC __m128i load(const uchar *mem, Flags, LoadTag<__m128i, ushort>)
{
    // the only available streaming load loads 16 bytes - twice as much as we need =>
    // can't use it, or we risk an out-of-bounds read and an unaligned load exception
    return SSE::cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}

// int {{{2
template <typename Flags>
Vc_INTRINSIC __m128i load(const uint *mem, Flags, LoadTag<__m128i, int>)
{
    return SSE::VectorHelper<__m128i>::load<Flags>(mem);
}
// no difference between streaming and alignment, because the
// 32/64 bit loads are not available as streaming loads, and can always be unaligned
template <typename Flags>
Vc_INTRINSIC __m128i load(const ushort *mem, Flags, LoadTag<__m128i, int>)
{
    return SSE::cvtepu16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const short *mem, Flags, LoadTag<__m128i, int>)
{
    return SSE::cvtepi16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const uchar *mem, Flags, LoadTag<__m128i, int>)
{
    return SSE::cvtepu8_epi32(_mm_cvtsi32_si128(*aliasing_cast<int>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const schar *mem, Flags, LoadTag<__m128i, int>)
{
    return SSE::cvtepi8_epi32(_mm_cvtsi32_si128(*aliasing_cast<int>(mem)));
}

// uint {{{2
template <typename Flags>
Vc_INTRINSIC __m128i load(const ushort *mem, Flags, LoadTag<__m128i, uint>)
{
    return SSE::cvtepu16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const uchar *mem, Flags, LoadTag<__m128i, uint>)
{
    return SSE::cvtepu8_epi32(_mm_cvtsi32_si128(*aliasing_cast<int>(mem)));
}

// double {{{2
template <typename Flags>
Vc_INTRINSIC __m128d load(const float *mem, Flags, LoadTag<__m128d, double>)
{
    return SSE::convert<float, double>(
        _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64 *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128d load(const uint *mem, Flags, LoadTag<__m128d, double>)
{
    return SSE::convert<uint, double>(
        _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128d load(const int *mem, Flags, LoadTag<__m128d, double>)
{
    return SSE::convert<int, double>(
        _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128d load(const ushort *mem, Flags, LoadTag<__m128d, double>)
{
    return SSE::convert<ushort, double>(
        _mm_cvtsi32_si128(*aliasing_cast<int>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128d load(const short *mem, Flags, LoadTag<__m128d, double>)
{
    return SSE::convert<short, double>(
        _mm_cvtsi32_si128(*aliasing_cast<int>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128d load(const uchar *mem, Flags, LoadTag<__m128d, double>)
{
    return SSE::convert<uchar, double>(
        _mm_set1_epi16(*aliasing_cast<short>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128d load(const schar *mem, Flags, LoadTag<__m128d, double>)
{
    return SSE::convert<char, double>(
        _mm_set1_epi16(*aliasing_cast<short>(mem)));
}

// float {{{2
template <typename Flags>
Vc_INTRINSIC __m128 load(const double *mem, Flags, LoadTag<__m128, float>)
{
#ifdef Vc_IMPL_AVX
    if (Flags::IsUnaligned) {
        return _mm256_cvtpd_ps(_mm256_loadu_pd(mem));
    } else if (Flags::IsStreaming) {
        return _mm256_cvtpd_ps(AvxIntrinsics::stream_load<__m256d>(mem));
    } else {  // Flags::IsAligned
        return _mm256_cvtpd_ps(_mm256_load_pd(mem));
    }
#else
    return _mm_movelh_ps(_mm_cvtpd_ps(SSE::VectorHelper<__m128d>::load<Flags>(&mem[0])),
                         _mm_cvtpd_ps(SSE::VectorHelper<__m128d>::load<Flags>(&mem[2])));
#endif
}
template <typename Flags>
Vc_INTRINSIC __m128 load(const uint *mem, Flags f, LoadTag<__m128, float>)
{
    return SSE::convert<uint, float>(load<__m128i, uint>(mem, f));
}
template <typename T, typename Flags,
          typename = enable_if<!std::is_same<T, float>::value>>
Vc_INTRINSIC __m128 load(const T *mem, Flags f, LoadTag<__m128, float>)
{
    return _mm_cvtepi32_ps(load<__m128i, int>(mem, f));
}

// shifted{{{1
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<amount == 0, T> shifted(T k)
{
    return k;
}
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 16 && amount > 0), T> shifted(T k)
{
    return _mm_srli_si128(k, amount);
}
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 16 && amount < 0), T> shifted(T k)
{
    return _mm_slli_si128(k, -amount);
}
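
// Illustrative sketch (not from the original sources): `amount` is a byte
// count; positive amounts shift register contents toward lane 0, negative
// amounts shift the other way, and vacated bytes are zero-filled:
//
//   __m128i k = _mm_setr_epi32(1, 2, 3, 4);
//   shifted< 4>(k);  // {2, 3, 4, 0}
//   shifted<-4>(k);  // {0, 1, 2, 3}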

// IndexesFromZero{{{1
template <typename T, int Size> Vc_INTRINSIC Vc_CONST const T *IndexesFromZero()
{
    if (Size == 4) {
        return reinterpret_cast<const T *>(SSE::_IndexesFromZero4);
    } else if (Size == 8) {
        return reinterpret_cast<const T *>(SSE::_IndexesFromZero8);
    } else if (Size == 16) {
        return reinterpret_cast<const T *>(SSE::_IndexesFromZero16);
    }
    return nullptr;
}

// popcnt{{{1
Vc_INTRINSIC Vc_CONST unsigned int popcnt4(unsigned int n)
{
#ifdef Vc_IMPL_POPCNT
    return _mm_popcnt_u32(n);
#else
    n = (n & 0x5U) + ((n >> 1) & 0x5U);
    n = (n & 0x3U) + ((n >> 2) & 0x3U);
    return n;
#endif
}
Vc_INTRINSIC Vc_CONST unsigned int popcnt8(unsigned int n)
{
#ifdef Vc_IMPL_POPCNT
    return _mm_popcnt_u32(n);
#else
    n = (n & 0x55U) + ((n >> 1) & 0x55U);
    n = (n & 0x33U) + ((n >> 2) & 0x33U);
    n = (n & 0x0fU) + ((n >> 4) & 0x0fU);
    return n;
#endif
}
Vc_INTRINSIC Vc_CONST unsigned int popcnt16(unsigned int n)
{
#ifdef Vc_IMPL_POPCNT
    return _mm_popcnt_u32(n);
#else
    n = (n & 0x5555U) + ((n >> 1) & 0x5555U);
    n = (n & 0x3333U) + ((n >> 2) & 0x3333U);
    n = (n & 0x0f0fU) + ((n >> 4) & 0x0f0fU);
    n = (n & 0x00ffU) + ((n >> 8) & 0x00ffU);
    return n;
#endif
}
Vc_INTRINSIC Vc_CONST unsigned int popcnt32(unsigned int n)
{
#ifdef Vc_IMPL_POPCNT
    return _mm_popcnt_u32(n);
#else
    n = (n & 0x55555555U) + ((n >> 1) & 0x55555555U);
    n = (n & 0x33333333U) + ((n >> 2) & 0x33333333U);
    n = (n & 0x0f0f0f0fU) + ((n >> 4) & 0x0f0f0f0fU);
    n = (n & 0x00ff00ffU) + ((n >> 8) & 0x00ff00ffU);
    n = (n & 0x0000ffffU) + ((n >> 16) & 0x0000ffffU);
    return n;
#endif
}
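
// The fallbacks above are SWAR ("SIMD within a register") bit counts: each
// line adds adjacent bit groups in parallel, doubling the group width until a
// single count remains. Hedged worked example for popcnt8:
//
//   n = 0xB4                                  // 0b1011'0100, four set bits
//   (n & 0x55) + ((n >> 1) & 0x55) == 0x64    // 2-bit sums {1, 2, 1, 0}
//   (n & 0x33) + ((n >> 2) & 0x33) == 0x31    // 4-bit sums {3, 1}
//   (n & 0x0f) + ((n >> 4) & 0x0f) == 4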

// mask_cast{{{1
template<size_t From, size_t To, typename R> Vc_INTRINSIC Vc_CONST R mask_cast(__m128i k)
{
    static_assert(From == To, "Incorrect mask cast.");
    static_assert(std::is_same<R, __m128>::value, "Incorrect mask cast.");
    return SSE::sse_cast<__m128>(k);
}

template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<2, 4, __m128>(__m128i k)
{
    return SSE::sse_cast<__m128>(_mm_packs_epi16(k, _mm_setzero_si128()));
}
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<2, 8, __m128>(__m128i k)
{
    return SSE::sse_cast<__m128>(
        _mm_packs_epi16(_mm_packs_epi16(k, _mm_setzero_si128()), _mm_setzero_si128()));
}

template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 2, __m128>(__m128i k)
{
    return SSE::sse_cast<__m128>(_mm_unpacklo_epi32(k, k));
}
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 8, __m128>(__m128i k)
{
    return SSE::sse_cast<__m128>(_mm_packs_epi16(k, _mm_setzero_si128()));
}

template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 2, __m128>(__m128i k)
{
    const auto tmp = _mm_unpacklo_epi16(k, k);
    return SSE::sse_cast<__m128>(_mm_unpacklo_epi32(tmp, tmp));
}
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 4, __m128>(__m128i k)
{
    return SSE::sse_cast<__m128>(_mm_unpacklo_epi16(k, k));
}

template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<16, 8, __m128>(__m128i k)
{
    return SSE::sse_cast<__m128>(_mm_unpacklo_epi8(k, k));
}
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<16, 4, __m128>(__m128i k)
{
    const auto tmp = SSE::sse_cast<__m128i>(mask_cast<16, 8, __m128>(k));
    return SSE::sse_cast<__m128>(_mm_unpacklo_epi16(tmp, tmp));
}
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<16, 2, __m128>(__m128i k)
{
    const auto tmp = SSE::sse_cast<__m128i>(mask_cast<16, 4, __m128>(k));
    return SSE::sse_cast<__m128>(_mm_unpacklo_epi32(tmp, tmp));
}
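
// Hedged reading of the specializations above (illustrative): From and To are
// the entry counts of the source and destination masks. Increasing the entry
// count (From < To) packs wide boolean lanes into narrower ones; decreasing
// it (From > To) duplicates lanes. E.g. the 4-entry int mask {-1, 0, -1, 0}
// becomes the 8-entry short mask {-1, 0, -1, 0, 0, 0, 0, 0} in
// mask_cast<4, 8, __m128> via _mm_packs_epi16.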

// allone{{{1
template <typename V> Vc_INTRINSIC_L Vc_CONST_L V allone() Vc_INTRINSIC_R Vc_CONST_R;
template<> Vc_INTRINSIC Vc_CONST __m128 allone<__m128 >() { return SSE::_mm_setallone_ps(); }
template<> Vc_INTRINSIC Vc_CONST __m128i allone<__m128i>() { return SSE::_mm_setallone_si128(); }
template<> Vc_INTRINSIC Vc_CONST __m128d allone<__m128d>() { return SSE::_mm_setallone_pd(); }

// zero{{{1
template <typename V> inline V zero();
template<> Vc_INTRINSIC Vc_CONST __m128 zero<__m128 >() { return _mm_setzero_ps(); }
template<> Vc_INTRINSIC Vc_CONST __m128i zero<__m128i>() { return _mm_setzero_si128(); }
template<> Vc_INTRINSIC Vc_CONST __m128d zero<__m128d>() { return _mm_setzero_pd(); }

// negate{{{1
Vc_ALWAYS_INLINE Vc_CONST __m128 negate(__m128 v, std::integral_constant<std::size_t, 4>)
{
    return _mm_xor_ps(v, SSE::_mm_setsignmask_ps());
}
Vc_ALWAYS_INLINE Vc_CONST __m128d negate(__m128d v, std::integral_constant<std::size_t, 8>)
{
    return _mm_xor_pd(v, SSE::_mm_setsignmask_pd());
}
Vc_ALWAYS_INLINE Vc_CONST __m128i negate(__m128i v, std::integral_constant<std::size_t, 4>)
{
#ifdef Vc_IMPL_SSSE3
    return _mm_sign_epi32(v, allone<__m128i>());
#else
    return _mm_sub_epi32(_mm_setzero_si128(), v);
#endif
}
Vc_ALWAYS_INLINE Vc_CONST __m128i negate(__m128i v, std::integral_constant<std::size_t, 2>)
{
#ifdef Vc_IMPL_SSSE3
    return _mm_sign_epi16(v, allone<__m128i>());
#else
    return _mm_sub_epi16(_mm_setzero_si128(), v);
#endif
}
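
// Note on the SSSE3 branches above (illustrative): _mm_sign_epi32/16(v, m)
// negates each lane of v where the corresponding lane of m is negative; with
// m = allone<__m128i>() (every lane -1) it negates every lane, matching the
// 0 - v fallback used on plain SSE2.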

// xor_{{{1
Vc_INTRINSIC __m128 xor_(__m128 a, __m128 b) { return _mm_xor_ps(a, b); }
Vc_INTRINSIC __m128d xor_(__m128d a, __m128d b) { return _mm_xor_pd(a, b); }
Vc_INTRINSIC __m128i xor_(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }

// or_{{{1
Vc_INTRINSIC __m128 or_(__m128 a, __m128 b) { return _mm_or_ps(a, b); }
Vc_INTRINSIC __m128d or_(__m128d a, __m128d b) { return _mm_or_pd(a, b); }
Vc_INTRINSIC __m128i or_(__m128i a, __m128i b) { return _mm_or_si128(a, b); }

// and_{{{1
Vc_INTRINSIC __m128 and_(__m128 a, __m128 b) { return _mm_and_ps(a, b); }
Vc_INTRINSIC __m128d and_(__m128d a, __m128d b) { return _mm_and_pd(a, b); }
Vc_INTRINSIC __m128i and_(__m128i a, __m128i b) { return _mm_and_si128(a, b); }

// andnot_{{{1
Vc_INTRINSIC __m128 andnot_(__m128 a, __m128 b) { return _mm_andnot_ps(a, b); }
Vc_INTRINSIC __m128d andnot_(__m128d a, __m128d b) { return _mm_andnot_pd(a, b); }
Vc_INTRINSIC __m128i andnot_(__m128i a, __m128i b) { return _mm_andnot_si128(a, b); }

// not_{{{1
Vc_INTRINSIC __m128 not_(__m128 a) { return andnot_(a, allone<__m128 >()); }
Vc_INTRINSIC __m128d not_(__m128d a) { return andnot_(a, allone<__m128d>()); }
Vc_INTRINSIC __m128i not_(__m128i a) { return andnot_(a, allone<__m128i>()); }

// add{{{1
Vc_INTRINSIC __m128 add(__m128 a, __m128 b, float) { return _mm_add_ps(a, b); }
Vc_INTRINSIC __m128d add(__m128d a, __m128d b, double) { return _mm_add_pd(a, b); }
Vc_INTRINSIC __m128i add(__m128i a, __m128i b, int) { return _mm_add_epi32(a, b); }
Vc_INTRINSIC __m128i add(__m128i a, __m128i b, uint) { return _mm_add_epi32(a, b); }
Vc_INTRINSIC __m128i add(__m128i a, __m128i b, short) { return _mm_add_epi16(a, b); }
Vc_INTRINSIC __m128i add(__m128i a, __m128i b, ushort) { return _mm_add_epi16(a, b); }
Vc_INTRINSIC __m128i add(__m128i a, __m128i b, schar) { return _mm_add_epi8 (a, b); }
Vc_INTRINSIC __m128i add(__m128i a, __m128i b, uchar) { return _mm_add_epi8 (a, b); }

// sub{{{1
Vc_INTRINSIC __m128 sub(__m128 a, __m128 b, float) { return _mm_sub_ps(a, b); }
Vc_INTRINSIC __m128d sub(__m128d a, __m128d b, double) { return _mm_sub_pd(a, b); }
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, int) { return _mm_sub_epi32(a, b); }
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, uint) { return _mm_sub_epi32(a, b); }
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, short) { return _mm_sub_epi16(a, b); }
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, ushort) { return _mm_sub_epi16(a, b); }
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, schar) { return _mm_sub_epi8 (a, b); }
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, uchar) { return _mm_sub_epi8 (a, b); }

// mul{{{1
Vc_INTRINSIC __m128 mul(__m128 a, __m128 b, float) { return _mm_mul_ps(a, b); }
Vc_INTRINSIC __m128d mul(__m128d a, __m128d b, double) { return _mm_mul_pd(a, b); }
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, int) {
#ifdef Vc_IMPL_SSE4_1
    return _mm_mullo_epi32(a, b);
#else
    const __m128i aShift = _mm_srli_si128(a, 4);
    const __m128i ab02 = _mm_mul_epu32(a, b);  // [a0 * b0, a2 * b2]
    const __m128i bShift = _mm_srli_si128(b, 4);
    const __m128i ab13 = _mm_mul_epu32(aShift, bShift);  // [a1 * b1, a3 * b3]
    return _mm_unpacklo_epi32(_mm_shuffle_epi32(ab02, 8), _mm_shuffle_epi32(ab13, 8));
#endif
}
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, uint) { return mul(a, b, int()); }
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, short) { return _mm_mullo_epi16(a, b); }
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, ushort) { return _mm_mullo_epi16(a, b); }
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, schar) {
#ifdef Vc_USE_BUILTIN_VECTOR_TYPES
    using B = Common::BuiltinType<schar, 16>;
    const auto x = aliasing_cast<B>(a) * aliasing_cast<B>(b);
    return reinterpret_cast<const __m128i &>(x);
#else
    // keep the even-byte products from the 16-bit multiply (their low bytes
    // are exact) and merge in the odd-byte products, shifted into place
    return or_(
        and_(_mm_mullo_epi16(a, b), _mm_srli_epi16(allone<__m128i>(), 8)),
        _mm_slli_epi16(_mm_mullo_epi16(_mm_srli_si128(a, 1), _mm_srli_si128(b, 1)), 8));
#endif
}
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, uchar) {
#ifdef Vc_USE_BUILTIN_VECTOR_TYPES
    using B = Common::BuiltinType<uchar, 16>;
    const auto x = aliasing_cast<B>(a) * aliasing_cast<B>(b);
    return reinterpret_cast<const __m128i &>(x);
#else
    return or_(
        and_(_mm_mullo_epi16(a, b), _mm_srli_epi16(allone<__m128i>(), 8)),
        _mm_slli_epi16(_mm_mullo_epi16(_mm_srli_si128(a, 1), _mm_srli_si128(b, 1)), 8));
#endif
}
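
// Hedged notes on the fallbacks above (illustrative): SSE2 has no 32-bit low
// multiply, so mul(..., int) forms the even-lane products (a0*b0, a2*b2) and
// the odd-lane products (a1*b1, a3*b3) with the widening _mm_mul_epu32 and
// re-interleaves their low halves. The low 32 bits are correct even for
// signed inputs because truncated products agree modulo 2^32, e.g.
//
//   uint32_t(0xFFFFFFFFu * 4u) == 0xFFFFFFFCu == uint32_t(int32_t(-1) * 4)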

// div{{{1
Vc_INTRINSIC __m128 div(__m128 a, __m128 b, float) { return _mm_div_ps(a, b); }
Vc_INTRINSIC __m128d div(__m128d a, __m128d b, double) { return _mm_div_pd(a, b); }

// TODO: fma{{{1
//Vc_INTRINSIC __m128 fma(__m128 a, __m128 b, __m128 c, float) { return _mm_mul_ps(a, b); }
//Vc_INTRINSIC __m128d fma(__m128d a, __m128d b, __m128d c, double) { return _mm_mul_pd(a, b); }
//Vc_INTRINSIC __m128i fma(__m128i a, __m128i b, __m128i c, int) { }
//Vc_INTRINSIC __m128i fma(__m128i a, __m128i b, __m128i c, uint) { return fma(a, b, int()); }
//Vc_INTRINSIC __m128i fma(__m128i a, __m128i b, __m128i c, short) { return _mm_mullo_epi16(a, b); }
//Vc_INTRINSIC __m128i fma(__m128i a, __m128i b, __m128i c, ushort) { return _mm_mullo_epi16(a, b); }
//Vc_INTRINSIC __m128i fma(__m128i a, __m128i b, __m128i c, schar) { }
//Vc_INTRINSIC __m128i fma(__m128i a, __m128i b, __m128i c, uchar) { }

// min{{{1
Vc_INTRINSIC __m128 min(__m128 a, __m128 b, float) { return _mm_min_ps(a, b); }
Vc_INTRINSIC __m128d min(__m128d a, __m128d b, double) { return _mm_min_pd(a, b); }
Vc_INTRINSIC __m128i min(__m128i a, __m128i b, int) { return SSE::min_epi32(a, b); }
Vc_INTRINSIC __m128i min(__m128i a, __m128i b, uint) { return SSE::min_epu32(a, b); }
Vc_INTRINSIC __m128i min(__m128i a, __m128i b, short) { return _mm_min_epi16(a, b); }
Vc_INTRINSIC __m128i min(__m128i a, __m128i b, ushort) { return SSE::min_epu16(a, b); }
Vc_INTRINSIC __m128i min(__m128i a, __m128i b, schar) { return SSE::min_epi8 (a, b); }
Vc_INTRINSIC __m128i min(__m128i a, __m128i b, uchar) { return _mm_min_epu8 (a, b); }

// max{{{1
Vc_INTRINSIC __m128 max(__m128 a, __m128 b, float) { return _mm_max_ps(a, b); }
Vc_INTRINSIC __m128d max(__m128d a, __m128d b, double) { return _mm_max_pd(a, b); }
Vc_INTRINSIC __m128i max(__m128i a, __m128i b, int) { return SSE::max_epi32(a, b); }
Vc_INTRINSIC __m128i max(__m128i a, __m128i b, uint) { return SSE::max_epu32(a, b); }
Vc_INTRINSIC __m128i max(__m128i a, __m128i b, short) { return _mm_max_epi16(a, b); }
Vc_INTRINSIC __m128i max(__m128i a, __m128i b, ushort) { return SSE::max_epu16(a, b); }
Vc_INTRINSIC __m128i max(__m128i a, __m128i b, schar) { return SSE::max_epi8 (a, b); }
Vc_INTRINSIC __m128i max(__m128i a, __m128i b, uchar) { return _mm_max_epu8 (a, b); }

// horizontal add{{{1
Vc_INTRINSIC float add(__m128 a, float) {
    a = _mm_add_ps(a, _mm_movehl_ps(a, a));
    a = _mm_add_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
    return _mm_cvtss_f32(a);
}
Vc_INTRINSIC double add(__m128d a, double) {
    a = _mm_add_sd(a, _mm_unpackhi_pd(a, a));
    return _mm_cvtsd_f64(a);
}
Vc_INTRINSIC int add(__m128i a, int) {
    a = add(a, _mm_srli_si128(a, 8), int());
    a = add(a, _mm_srli_si128(a, 4), int());
    return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC uint add(__m128i a, uint) { return add(a, int()); }
Vc_INTRINSIC short add(__m128i a, short) {
    a = add(a, _mm_srli_si128(a, 8), short());
    a = add(a, _mm_srli_si128(a, 4), short());
    a = add(a, _mm_srli_si128(a, 2), short());
    return _mm_cvtsi128_si32(a);  // & 0xffff is implicit
}
Vc_INTRINSIC ushort add(__m128i a, ushort) { return add(a, short()); }
Vc_INTRINSIC schar add(__m128i a, schar) {
    a = add(a, _mm_srli_si128(a, 8), schar());
    a = add(a, _mm_srli_si128(a, 4), schar());
    a = add(a, _mm_srli_si128(a, 2), schar());
    a = add(a, _mm_srli_si128(a, 1), schar());
    return _mm_cvtsi128_si32(a);  // & 0xff is implicit
}
Vc_INTRINSIC uchar add(__m128i a, uchar) { return add(a, schar()); }
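
// Illustrative note: the integer reductions above halve the distance between
// summands at each step, so lane 0 accumulates the full sum after log2(N)
// vector adds. A hedged usage sketch:
//
//   __m128i v = _mm_setr_epi32(1, 2, 3, 4);
//   int sum = add(v, int());  // 10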

// horizontal mul{{{1
Vc_INTRINSIC float mul(__m128 a, float) {
    a = _mm_mul_ps(a, _mm_movehl_ps(a, a));
    a = _mm_mul_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
    return _mm_cvtss_f32(a);
}
Vc_INTRINSIC double mul(__m128d a, double) {
    a = _mm_mul_sd(a, _mm_unpackhi_pd(a, a));
    return _mm_cvtsd_f64(a);
}
Vc_INTRINSIC int mul(__m128i a, int) {
    a = mul(a, _mm_srli_si128(a, 8), int());
    a = mul(a, _mm_srli_si128(a, 4), int());
    return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC uint mul(__m128i a, uint) { return mul(a, int()); }
Vc_INTRINSIC short mul(__m128i a, short) {
    a = mul(a, _mm_srli_si128(a, 8), short());
    a = mul(a, _mm_srli_si128(a, 4), short());
    a = mul(a, _mm_srli_si128(a, 2), short());
    return _mm_cvtsi128_si32(a);  // & 0xffff is implicit
}
Vc_INTRINSIC ushort mul(__m128i a, ushort) { return mul(a, short()); }
Vc_INTRINSIC schar mul(__m128i a, schar) {
    // split into two short vectors (the sign-extended even and odd byte
    // lanes), multiply them, and then do a horizontal short reduction; only
    // the low 8 bits of the product are needed for the schar result
    const __m128i s0 = _mm_srai_epi16(a, 8);
    const __m128i s1 = _mm_srai_epi16(_mm_slli_epi16(a, 8), 8);
    return mul(mul(s0, s1, short()), short());
}
Vc_INTRINSIC uchar mul(__m128i a, uchar) { return mul(a, schar()); }

// horizontal min{{{1
Vc_INTRINSIC float min(__m128 a, float) {
    a = _mm_min_ps(a, _mm_movehl_ps(a, a));
    a = _mm_min_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
    return _mm_cvtss_f32(a);
}
Vc_INTRINSIC double min(__m128d a, double) {
    a = _mm_min_sd(a, _mm_unpackhi_pd(a, a));
    return _mm_cvtsd_f64(a);
}
Vc_INTRINSIC int min(__m128i a, int) {
    a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), int());
    a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), int());
    return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC uint min(__m128i a, uint) {
    a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), uint());
    a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), uint());
    return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC short min(__m128i a, short) {
    a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), short());
    a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), short());
    a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), short());
    return _mm_cvtsi128_si32(a);  // & 0xffff is implicit
}
Vc_INTRINSIC ushort min(__m128i a, ushort) {
    a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), ushort());
    a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), ushort());
    a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), ushort());
    return _mm_cvtsi128_si32(a);  // & 0xffff is implicit
}
Vc_INTRINSIC schar min(__m128i a, schar) {
    a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
    a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
    a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), schar());
    return std::min(schar(_mm_cvtsi128_si32(a) >> 8), schar(_mm_cvtsi128_si32(a)));
}
Vc_INTRINSIC uchar min(__m128i a, uchar) {
    // use the unsigned comparison in every step; the signed one would order
    // values >= 128 before smaller ones
    a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), uchar());
    a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), uchar());
    a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), uchar());
    return std::min((_mm_cvtsi128_si32(a) >> 8) & 0xff, _mm_cvtsi128_si32(a) & 0xff);
}

// horizontal max{{{1
Vc_INTRINSIC float max(__m128 a, float) {
    a = _mm_max_ps(a, _mm_movehl_ps(a, a));
    a = _mm_max_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
    return _mm_cvtss_f32(a);
}
Vc_INTRINSIC double max(__m128d a, double) {
    a = _mm_max_sd(a, _mm_unpackhi_pd(a, a));
    return _mm_cvtsd_f64(a);
}
Vc_INTRINSIC int max(__m128i a, int) {
    a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), int());
    a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), int());
    return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC uint max(__m128i a, uint) {
    a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), uint());
    a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), uint());
    return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC short max(__m128i a, short) {
    a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), short());
    a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), short());
    a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), short());
    return _mm_cvtsi128_si32(a);  // & 0xffff is implicit
}
Vc_INTRINSIC ushort max(__m128i a, ushort) {
    a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), ushort());
    a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), ushort());
    a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), ushort());
    return _mm_cvtsi128_si32(a);  // & 0xffff is implicit
}
Vc_INTRINSIC schar max(__m128i a, schar) {
    a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
    a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
    a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), schar());
    return std::max(schar(_mm_cvtsi128_si32(a) >> 8), schar(_mm_cvtsi128_si32(a)));
}
Vc_INTRINSIC uchar max(__m128i a, uchar) {
    // use the unsigned comparison in every step (see min above)
    a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), uchar());
    a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), uchar());
    a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), uchar());
    return std::max((_mm_cvtsi128_si32(a) >> 8) & 0xff, _mm_cvtsi128_si32(a) & 0xff);
}

// sorted{{{1
template <Vc::Implementation, typename T>
Vc_CONST_L SSE::Vector<T> sorted(SSE::Vector<T> x) Vc_CONST_R;
template <typename T> Vc_INTRINSIC Vc_CONST SSE::Vector<T> sorted(SSE::Vector<T> x)
{
    static_assert(!CurrentImplementation::is(ScalarImpl),
                  "Detail::sorted can only be instantiated if a non-Scalar "
                  "implementation is selected.");
    return sorted<CurrentImplementation::is_between(SSE2Impl, SSSE3Impl)
                      ? SSE2Impl
                      : CurrentImplementation::is_between(SSE41Impl, SSE42Impl)
                            ? SSE41Impl
                            : CurrentImplementation::current()>(x);
}

// sanitize{{{1
template <typename V> constexpr int sanitize(int n)
{
    return (n >= int(sizeof(V)) || n <= -int(sizeof(V))) ? 0 : n;
}

// rotated{{{1
template <typename T, size_t N, typename V>
static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 16), V> rotated(V v, int amount)
{
    using namespace SSE;
    switch (static_cast<unsigned int>(amount) % N) {
    case 0:
        return v;
    case 1:
        return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(1 * sizeof(T))));
    case 2:
        return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(2 * sizeof(T))));
    case 3:
        return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(3 * sizeof(T))));
    case 4:
        return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(4 * sizeof(T))));
    case 5:
        return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(5 * sizeof(T))));
    case 6:
        return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(6 * sizeof(T))));
    case 7:
        return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(7 * sizeof(T))));
    }
    return sse_cast<V>(_mm_setzero_si128());
}
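
// Hedged sketch (not from the original sources): rotated<T, N> rotates the N
// elements of v downward by `amount` positions via a byte-wise
// _mm_alignr_epi8 of v with itself; sanitize() clamps out-of-range byte
// offsets to 0 so the intrinsic's immediate operand stays valid. With
// T = int, N = 4:
//
//   rotated<int, 4>(v, 1) turns {a, b, c, d} into {b, c, d, a}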

//InterleaveImpl{{{1
template<typename V, size_t Size, size_t VSize> struct InterleaveImpl;
template<typename V> struct InterleaveImpl<V, 8, 16> {
    template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/
                                                       const typename V::AsArg v0, const typename V::AsArg v1)
    {
        const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v1.data());
        const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v1.data());
#ifdef __x86_64__
        const long long tmp00 = _mm_cvtsi128_si64(tmp0);
        const long long tmp01 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(tmp0, tmp0));
        const long long tmp10 = _mm_cvtsi128_si64(tmp1);
        const long long tmp11 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(tmp1, tmp1));
        aliasing_cast<int>(data[i[0]]) = tmp00;
        aliasing_cast<int>(data[i[1]]) = tmp00 >> 32;
        aliasing_cast<int>(data[i[2]]) = tmp01;
        aliasing_cast<int>(data[i[3]]) = tmp01 >> 32;
        aliasing_cast<int>(data[i[4]]) = tmp10;
        aliasing_cast<int>(data[i[5]]) = tmp10 >> 32;
        aliasing_cast<int>(data[i[6]]) = tmp11;
        aliasing_cast<int>(data[i[7]]) = tmp11 >> 32;
#elif defined(Vc_IMPL_SSE4_1)
        using namespace SseIntrinsics;
        aliasing_cast<int>(data[i[0]]) = _mm_cvtsi128_si32(tmp0);
        aliasing_cast<int>(data[i[1]]) = extract_epi32<1>(tmp0);
        aliasing_cast<int>(data[i[2]]) = extract_epi32<2>(tmp0);
        aliasing_cast<int>(data[i[3]]) = extract_epi32<3>(tmp0);
        aliasing_cast<int>(data[i[4]]) = _mm_cvtsi128_si32(tmp1);
        aliasing_cast<int>(data[i[5]]) = extract_epi32<1>(tmp1);
        aliasing_cast<int>(data[i[6]]) = extract_epi32<2>(tmp1);
        aliasing_cast<int>(data[i[7]]) = extract_epi32<3>(tmp1);
#else
        aliasing_cast<int>(data[i[0]]) = _mm_cvtsi128_si32(tmp0);
        aliasing_cast<int>(data[i[1]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 4));
        aliasing_cast<int>(data[i[2]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 8));
        aliasing_cast<int>(data[i[3]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 12));
        aliasing_cast<int>(data[i[4]]) = _mm_cvtsi128_si32(tmp1);
        aliasing_cast<int>(data[i[5]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 4));
        aliasing_cast<int>(data[i[6]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 8));
        aliasing_cast<int>(data[i[7]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 12));
#endif
    }/*}}}*/
    static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,/*{{{*/
                                  const typename V::AsArg v0, const typename V::AsArg v1)
    {
        const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v1.data());
        const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v1.data());
        V(tmp0).store(&data[i[0]], Vc::Unaligned);
        V(tmp1).store(&data[i[4]], Vc::Unaligned);
    }/*}}}*/
    template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/
                                                       const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2)
    {
#if defined Vc_USE_MASKMOV_SCATTER && !defined Vc_MSVC
        // MSVC fails to compile the MMX intrinsics
        const __m64 mask = _mm_set_pi16(0, -1, -1, -1);
        const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data());
        const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data());
        const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v1.data());
        const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v1.data());

        const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2);
        const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2);
        const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3);
        const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3);

        _mm_maskmove_si64(_mm_movepi64_pi64(tmp4), mask, reinterpret_cast<char *>(&data[i[0]]));
        _mm_maskmove_si64(_mm_movepi64_pi64(_mm_srli_si128(tmp4, 8)), mask, reinterpret_cast<char *>(&data[i[1]]));
        _mm_maskmove_si64(_mm_movepi64_pi64(tmp5), mask, reinterpret_cast<char *>(&data[i[2]]));
        _mm_maskmove_si64(_mm_movepi64_pi64(_mm_srli_si128(tmp5, 8)), mask, reinterpret_cast<char *>(&data[i[3]]));
        _mm_maskmove_si64(_mm_movepi64_pi64(tmp6), mask, reinterpret_cast<char *>(&data[i[4]]));
        _mm_maskmove_si64(_mm_movepi64_pi64(_mm_srli_si128(tmp6, 8)), mask, reinterpret_cast<char *>(&data[i[5]]));
        _mm_maskmove_si64(_mm_movepi64_pi64(tmp7), mask, reinterpret_cast<char *>(&data[i[6]]));
        _mm_maskmove_si64(_mm_movepi64_pi64(_mm_srli_si128(tmp7, 8)), mask, reinterpret_cast<char *>(&data[i[7]]));
        _mm_empty();
#else
        interleave(data, i, v0, v1);
        v2.scatter(data + 2, i);
#endif
    }/*}}}*/
    template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/
                                                       const typename V::AsArg v0, const typename V::AsArg v1,
                                                       const typename V::AsArg v2, const typename V::AsArg v3)
    {
        const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data());
        const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data());
        const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v3.data());
        const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v3.data());

        const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2);
        const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2);
        const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3);
        const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3);

        _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[0]]), tmp4);
        _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[2]]), tmp5);
        _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[4]]), tmp6);
        _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[6]]), tmp7);
        _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), _mm_castsi128_ps(tmp4));
        _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), _mm_castsi128_ps(tmp5));
        _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), _mm_castsi128_ps(tmp6));
        _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), _mm_castsi128_ps(tmp7));
    }/*}}}*/
    static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<4> &i,/*{{{*/
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3)
    {
        const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data());
        const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data());
        const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v3.data());
        const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v3.data());

        const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2);
        const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2);
        const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3);
        const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3);

        V(tmp4).store(&data[i[0]], ::Vc::Unaligned);
        V(tmp5).store(&data[i[2]], ::Vc::Unaligned);
        V(tmp6).store(&data[i[4]], ::Vc::Unaligned);
        V(tmp7).store(&data[i[6]], ::Vc::Unaligned);
    }/*}}}*/
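
    // Illustrative picture for the 4-vector interleaves above: after the two
    // unpack rounds, tmp4..tmp7 hold the AoS groups {v0[k], v1[k], v2[k],
    // v3[k]} for k = 0..7 (two groups per register), so each store writes
    //
    //   &data[i[k]]: v0[k], v1[k], v2[k], v3[k]   // four 16-bit values
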
    template <typename I>  // interleave 5 args{{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4)
    {
        interleave(data, i, v0, v1, v2, v3);
        v4.scatter(data + 4, i);
    }
    template <typename I>  // interleave 6 args{{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5);
    }
    template <typename I>  // interleave 7 args{{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5,
                                  const typename V::AsArg v6)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5, v6);
    }
    template <typename I>  // interleave 8 args{{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5,
                                  const typename V::AsArg v6, const typename V::AsArg v7)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5, v6, v7);
    }
    //}}}2
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data, /*{{{*/
                                                         const I &i, V &v0, V &v1)
    {
        const __m128i a = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[0]]));
        const __m128i b = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[1]]));
        const __m128i c = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[2]]));
        const __m128i d = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[3]]));
        const __m128i e = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[4]]));
        const __m128i f = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[5]]));
        const __m128i g = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[6]]));
        const __m128i h = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[7]]));

        const __m128i tmp2 = _mm_unpacklo_epi16(a, e);  // a0 a4 b0 b4 c0 c4 d0 d4
        const __m128i tmp3 = _mm_unpacklo_epi16(c, g);  // a2 a6 b2 b6 c2 c6 d2 d6
        const __m128i tmp4 = _mm_unpacklo_epi16(b, f);  // a1 a5 b1 b5 c1 c5 d1 d5
        const __m128i tmp5 = _mm_unpacklo_epi16(d, h);  // a3 a7 b3 b7 c3 c7 d3 d7

        const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3);  // a0 a2 a4 a6 b0 b2 b4 b6
        const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5);  // a1 a3 a5 a7 b1 b3 b5 b7

        v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
        v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
    }/*}}}*/
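
    // Hedged sketch of the inverse transform above: each 32-bit gather pulls
    // one interleaved pair {v0[k], v1[k]} from &data[i[k]], and the unpack
    // tree sorts the pairs back into two contiguous vectors:
    //
    //   data: x0 y0 | x1 y1 | ... | x7 y7  ->  v0 = {x0..x7}, v1 = {y0..y7}
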
1004 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
1005 const I &i, V &v0, V &v1, V &v2)
1006 {
1007 const __m128i a = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[0]]));
1008 const __m128i b = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[1]]));
1009 const __m128i c = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[2]]));
1010 const __m128i d = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[3]]));
1011 const __m128i e = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[4]]));
1012 const __m128i f = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[5]]));
1013 const __m128i g = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[6]]));
1014 const __m128i h = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[7]]));
1015
1016 const __m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
1017 const __m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
1018 const __m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
1019 const __m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
1020
1021 const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
1022 const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
1023 const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
1024 const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
1025
1026 v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
1027 v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
1028 v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
1029 }/*}}}*/
1030 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
1031 const I &i, V &v0, V &v1, V &v2, V &v3)
1032 {
1033 const __m128i a = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[0]]));
1034 const __m128i b = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[1]]));
1035 const __m128i c = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[2]]));
1036 const __m128i d = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[3]]));
1037 const __m128i e = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[4]]));
1038 const __m128i f = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[5]]));
1039 const __m128i g = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[6]]));
1040 const __m128i h = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[7]]));
1041
1042 const __m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
1043 const __m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
1044 const __m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
1045 const __m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
1046
1047 const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
1048 const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
1049 const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
1050 const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
1051
1052 v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
1053 v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
1054 v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
1055 v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
1056 }/*}}}*/
1057 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
1058 const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
1059 {
1060 const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]]));
1061 const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]]));
1062 const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]]));
1063 const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]]));
1064 const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]]));
1065 const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]]));
1066 const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]]));
1067 const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]]));
1068
1069 const __m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
1070 const __m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
1071 const __m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
1072 const __m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
1073 const __m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4
1074 const __m128i tmp11 = _mm_unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5
1075 const __m128i tmp12 = _mm_unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6
1076 const __m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7
1077
1078 const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
1079 const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
1080 const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
1081 const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
1082 const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6
1083 const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7
1084
1085 v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
1086 v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
1087 v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
1088 v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
1089 v4.data() = _mm_unpacklo_epi16(tmp8, tmp9);
1090 }/*}}}*/
1091 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
1092 const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
1093 {
1094 const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]]));
1095 const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]]));
1096 const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]]));
1097 const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]]));
1098 const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]]));
1099 const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]]));
1100 const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]]));
1101 const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]]));
1102
1103 const __m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
1104 const __m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
1105 const __m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
1106 const __m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
1107 const __m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4
1108 const __m128i tmp11 = _mm_unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5
1109 const __m128i tmp12 = _mm_unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6
1110 const __m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7
1111
1112 const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
1113 const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
1114 const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
1115 const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
1116 const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6
1117 const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7
1118
1119 v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
1120 v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
1121 v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
1122 v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
1123 v4.data() = _mm_unpacklo_epi16(tmp8, tmp9);
1124 v5.data() = _mm_unpackhi_epi16(tmp8, tmp9);
1125 }/*}}}*/
1126 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
1127 const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
1128 {
1129 const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]]));
1130 const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]]));
1131 const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]]));
1132 const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]]));
1133 const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]]));
1134 const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]]));
1135 const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]]));
1136 const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]]));
1137
1138 const __m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
1139 const __m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
1140 const __m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
1141 const __m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
1142 const __m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4
1143 const __m128i tmp11 = _mm_unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5
1144 const __m128i tmp12 = _mm_unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6
        const __m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7

        const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
        const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
        const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
        const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
        const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6
        const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7
        const __m128i tmp14 = _mm_unpackhi_epi16(tmp10, tmp11); // g0 g2 g4 g6 h0 h2 h4 h6
        const __m128i tmp15 = _mm_unpackhi_epi16(tmp12, tmp13); // g1 g3 g5 g7 h1 h3 h5 h7

        v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
        v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
        v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
        v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
        v4.data() = _mm_unpacklo_epi16(tmp8, tmp9);
        v5.data() = _mm_unpackhi_epi16(tmp8, tmp9);
        v6.data() = _mm_unpacklo_epi16(tmp14, tmp15);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
    {
        const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]]));
        const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]]));
        const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]]));
        const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]]));
        const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]]));
        const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]]));
        const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]]));
        const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]]));

        const __m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
        const __m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
        const __m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
        const __m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
        const __m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4
        const __m128i tmp11 = _mm_unpackhi_epi16(c, g); // e2 e6 f2 f6 g2 g6 h2 h6
        const __m128i tmp12 = _mm_unpackhi_epi16(b, f); // e1 e5 f1 f5 g1 g5 h1 h5
        const __m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7

        const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
        const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
        const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
        const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
        const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6
        const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7
        const __m128i tmp14 = _mm_unpackhi_epi16(tmp10, tmp11); // g0 g2 g4 g6 h0 h2 h4 h6
        const __m128i tmp15 = _mm_unpackhi_epi16(tmp12, tmp13); // g1 g3 g5 g7 h1 h3 h5 h7

        v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
        v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
        v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
        v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
        v4.data() = _mm_unpacklo_epi16(tmp8, tmp9);
        v5.data() = _mm_unpackhi_epi16(tmp8, tmp9);
        v6.data() = _mm_unpacklo_epi16(tmp14, tmp15);
        v7.data() = _mm_unpackhi_epi16(tmp14, tmp15);
    }/*}}}*/
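    // Note on the data-flow comments above: letters name the output streams
    // (a = v0 ... h = v7) and digits name the SIMD lane, so "a0 a4 b0 b4 ..."
    // reads "v0 lane 0, v0 lane 4, v1 lane 0, v1 lane 4, ...". The deinterleave
    // overloads above are all instances of one 8x8 transpose of 16-bit elements
    // built from three rounds of _mm_unpack(lo|hi)_epi16; the 6- and 7-vector
    // variants simply omit the final unpacks for the outputs they do not need.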
};
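// InterleaveImpl for vectors of four 4-byte entries in a 16-byte register
// (e.g. float_v / int_v). Interleave and deinterleave are 4x4 transposes built
// from _mm_unpack(lo|hi)_ps plus _mm_move(lh|hl)_ps. For illustration (the
// lane names x/y are ad hoc, not taken from the code below), with
//   v0 = [x0 x1 x2 x3] and v1 = [y0 y1 y2 y3]:
//   _mm_unpacklo_ps(v0, v1) == [x0 y0 x1 y1]
//   _mm_unpackhi_ps(v0, v1) == [x2 y2 x3 y3]
// so the two-vector interleave below stores x0 y0 x1 y1 x2 y2 x3 y3.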
template<typename V> struct InterleaveImpl<V, 4, 16> {
    static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,/*{{{*/
                                  const typename V::AsArg v0, const typename V::AsArg v1)
    {
        const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data()));
        const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data()));
        _mm_storeu_ps(aliasing_cast<float>(&data[i[0]]), tmp0);
        _mm_storeu_ps(aliasing_cast<float>(&data[i[2]]), tmp1);
    }/*}}}*/
    template <typename I> // interleave 2 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1)
    {
        const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data()));
        const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data()));
        _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), tmp0);
        _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), tmp0);
        _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), tmp1);
        _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), tmp1);
    }
    template <typename I> // interleave 3 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2)
    {
#ifdef Vc_USE_MASKMOV_SCATTER
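        // _mm_maskmoveu_si128 writes only those bytes whose mask byte has the
        // most significant bit set. The mask below (three -1 lanes, one 0 lane)
        // therefore stores the low three 32-bit elements of each shuffled
        // vector and leaves the fourth destination element untouched. Note that
        // MASKMOVDQU is an implicitly non-temporal (cache-bypassing) store.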
        const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data()));
        const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data()));
        const __m128 tmp2 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v2.data()), SSE::sse_cast<__m128>(v2.data()));
        const __m128 tmp3 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v2.data()), SSE::sse_cast<__m128>(v2.data()));
        const __m128i mask = _mm_set_epi32(0, -1, -1, -1);
        _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp0, tmp2)), mask, reinterpret_cast<char *>(&data[i[0]]));
        _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp2, tmp0)), mask, reinterpret_cast<char *>(&data[i[1]]));
        _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp1, tmp3)), mask, reinterpret_cast<char *>(&data[i[2]]));
        _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp3, tmp1)), mask, reinterpret_cast<char *>(&data[i[3]]));
#else
        const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data()));
        const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data()));
        _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), tmp0);
        _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), tmp0);
        _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), tmp1);
        _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), tmp1);
        v2.scatter(data + 2, i);
#endif
    }
    template <typename I> // interleave 4 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3)
    {
        const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data()));
        const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data()));
        const __m128 tmp2 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v2.data()), SSE::sse_cast<__m128>(v3.data()));
        const __m128 tmp3 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v2.data()), SSE::sse_cast<__m128>(v3.data()));
        _mm_storeu_ps(aliasing_cast<float>(&data[i[0]]), _mm_movelh_ps(tmp0, tmp2));
        _mm_storeu_ps(aliasing_cast<float>(&data[i[1]]), _mm_movehl_ps(tmp2, tmp0));
        _mm_storeu_ps(aliasing_cast<float>(&data[i[2]]), _mm_movelh_ps(tmp1, tmp3));
        _mm_storeu_ps(aliasing_cast<float>(&data[i[3]]), _mm_movehl_ps(tmp3, tmp1));
    }
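    // The 5- to 8-argument overloads below reuse the full 4x4 transpose for
    // the first four members and handle the remaining members via a scatter or
    // a narrower interleave at data + 4.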
    template <typename I> // interleave 5 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4)
    {
        interleave(data, i, v0, v1, v2, v3);
        v4.scatter(data + 4, i);
    }
    template <typename I> // interleave 6 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5);
    }
    template <typename I> // interleave 7 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5,
                                  const typename V::AsArg v6)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5, v6);
    }
    template <typename I> // interleave 8 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5,
                                  const typename V::AsArg v6, const typename V::AsArg v7)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5, v6, v7);
    }
    //}}}2
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1)
    {
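        // _mm_load_sd reads exactly 64 bits, i.e. the two floats belonging to
        // one index; _mm_castpd_ps merely reinterprets the register bits.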
        const __m128 a = _mm_castpd_ps(_mm_load_sd(aliasing_cast<double>(&data[i[0]])));
        const __m128 b = _mm_castpd_ps(_mm_load_sd(aliasing_cast<double>(&data[i[1]])));
        const __m128 c = _mm_castpd_ps(_mm_load_sd(aliasing_cast<double>(&data[i[2]])));
        const __m128 d = _mm_castpd_ps(_mm_load_sd(aliasing_cast<double>(&data[i[3]])));

        const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1]
        const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3]

        v0.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp0, tmp1));
        v1.data() = SSE::sse_cast<typename V::VectorType>(_mm_movehl_ps(tmp1, tmp0));
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2)
    {
        const __m128 a = _mm_loadu_ps(aliasing_cast<float>(&data[i[0]]));
        const __m128 b = _mm_loadu_ps(aliasing_cast<float>(&data[i[1]]));
        const __m128 c = _mm_loadu_ps(aliasing_cast<float>(&data[i[2]]));
        const __m128 d = _mm_loadu_ps(aliasing_cast<float>(&data[i[3]]));

        const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1]
        const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3]
        const __m128 tmp2 = _mm_unpackhi_ps(a, b); // [c0 c1 XX XX]
        const __m128 tmp3 = _mm_unpackhi_ps(c, d); // [c2 c3 XX XX]

        v0.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp0, tmp1));
        v1.data() = SSE::sse_cast<typename V::VectorType>(_mm_movehl_ps(tmp1, tmp0));
        v2.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp2, tmp3));
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3)
    {
        const __m128 a = _mm_loadu_ps(aliasing_cast<float>(&data[i[0]]));
        const __m128 b = _mm_loadu_ps(aliasing_cast<float>(&data[i[1]]));
        const __m128 c = _mm_loadu_ps(aliasing_cast<float>(&data[i[2]]));
        const __m128 d = _mm_loadu_ps(aliasing_cast<float>(&data[i[3]]));

        const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1]
        const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3]
        const __m128 tmp2 = _mm_unpackhi_ps(a, b); // [c0 c1 d0 d1]
        const __m128 tmp3 = _mm_unpackhi_ps(c, d); // [c2 c3 d2 d3]

        v0.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp0, tmp1));
        v1.data() = SSE::sse_cast<typename V::VectorType>(_mm_movehl_ps(tmp1, tmp0));
        v2.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp2, tmp3));
        v3.data() = SSE::sse_cast<typename V::VectorType>(_mm_movehl_ps(tmp3, tmp2));
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
    {
        deinterleave(data, i, v0, v1, v2, v3);
        v4.gather(data + 4, i);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
    {
        deinterleave(data, i, v0, v1, v2, v3);
        deinterleave(data + 4, i, v4, v5);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
    {
        deinterleave(data, i, v0, v1, v2, v3);
        deinterleave(data + 4, i, v4, v5, v6);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
    {
        deinterleave(data, i, v0, v1, v2, v3);
        deinterleave(data + 4, i, v4, v5, v6, v7);
    }/*}}}*/
};
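// InterleaveImpl for vectors of two 8-byte entries in a 16-byte register
// (e.g. double_v): a 2x2 transpose. With v0 = [x0 x1] and v1 = [y0 y1]
// (illustrative lane names):
//   _mm_unpacklo_pd(v0, v1) == [x0 y0]
//   _mm_unpackhi_pd(v0, v1) == [x1 y1]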
template<typename V> struct InterleaveImpl<V, 2, 16> {
    template <typename I> // interleave 2 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1)
    {
        const __m128d tmp0 = _mm_unpacklo_pd(v0.data(), v1.data());
        const __m128d tmp1 = _mm_unpackhi_pd(v0.data(), v1.data());
        _mm_storeu_pd(&data[i[0]], tmp0);
        _mm_storeu_pd(&data[i[1]], tmp1);
    }
    template <typename I> // interleave 3 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2)
    {
        interleave(data, i, v0, v1);
        v2.scatter(data + 2, i);
    }
    template <typename I> // interleave 4 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3)
    {
        interleave(data, i, v0, v1);
        interleave(data + 2, i, v2, v3);
    }
    template <typename I> // interleave 5 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4)
    {
        interleave(data, i, v0, v1, v2, v3);
        v4.scatter(data + 4, i);
    }
    template <typename I> // interleave 6 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5);
    }
    template <typename I> // interleave 7 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5,
                                  const typename V::AsArg v6)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5, v6);
    }
    template <typename I> // interleave 8 args {{{2
    static inline void interleave(typename V::EntryType *const data, const I &i,
                                  const typename V::AsArg v0, const typename V::AsArg v1,
                                  const typename V::AsArg v2, const typename V::AsArg v3,
                                  const typename V::AsArg v4, const typename V::AsArg v5,
                                  const typename V::AsArg v6, const typename V::AsArg v7)
    {
        interleave(data, i, v0, v1, v2, v3);
        interleave(data + 4, i, v4, v5, v6, v7);
    }
    //}}}2
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1)
    {
        const __m128d a = _mm_loadu_pd(&data[i[0]]);
        const __m128d b = _mm_loadu_pd(&data[i[1]]);

        v0.data() = _mm_unpacklo_pd(a, b);
        v1.data() = _mm_unpackhi_pd(a, b);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2)
    {
        v2.gather(data + 2, i);
        deinterleave(data, i, v0, v1);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3)
    {
        deinterleave(data, i, v0, v1);
        deinterleave(data + 2, i, v2, v3);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
    {
        deinterleave(data, i, v0, v1);
        deinterleave(data + 2, i, v2, v3);
        v4.gather(data + 4, i);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
    {
        deinterleave(data, i, v0, v1);
        deinterleave(data + 2, i, v2, v3);
        deinterleave(data + 4, i, v4, v5);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
    {
        deinterleave(data, i, v0, v1);
        deinterleave(data + 2, i, v2, v3);
        deinterleave(data + 4, i, v4, v5);
        v6.gather(data + 6, i);
    }/*}}}*/
    template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
            const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
    {
        deinterleave(data, i, v0, v1);
        deinterleave(data + 2, i, v2, v3);
        deinterleave(data + 4, i, v4, v5);
        deinterleave(data + 6, i, v6, v7);
    }/*}}}*/
};
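// Illustrative usage sketch (a minimal, hypothetical example, not part of the
// public API; these Impl structs are internal helpers normally reached through
// Vc's interleaved-memory machinery). Assuming SSE::double_v matches the
// <V, 2, 16> specialization above, deinterleaving an array of x/y pairs could
// look like:
//
//     const double xy[] = {1., 2., 3., 4.};  // x0 y0 x1 y1
//     const std::size_t idx[] = {0, 2};      // start of each pair in xy
//     SSE::double_v vx, vy;
//     InterleaveImpl<SSE::double_v, 2, 16>::deinterleave(xy, idx, vx, vy);
//     // now vx == [1. 3.] (all x) and vy == [2. 4.] (all y)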

//}}}1
} // namespace Detail
} // namespace Vc

#endif // VC_SSE_DETAIL_H_

// vim: foldmethod=marker