/*  Copyright (C) 2013-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_MEM_UNPACK_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_MEM_UNPACK_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/width.h>
#include <simdpp/detail/insn/shuffle128.h>
#include <simdpp/detail/insn/zip128.h>
#include <simdpp/core/align.h>
#include <simdpp/core/splat_n.h>
#include <simdpp/core/make_shuffle_bytes_mask.h>
#include <simdpp/core/shuffle1.h>
#include <simdpp/core/shuffle2.h>
#include <simdpp/core/transpose.h>
#include <simdpp/core/unzip_hi.h>
#include <simdpp/core/unzip_lo.h>
#include <simdpp/core/zip_hi.h>
#include <simdpp/core/zip_lo.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {
/** Concatenates @a a and @a b and stores the elements of the resulting array
    as follows:
     * every (2n)-th element is stored to @a a
     * every (2n+1)-th element is stored to @a b

    n = [0, <number of elements in vector> - 1]
*/
template<class V> SIMDPP_INL
void mem_unpack2(any_vec<16,V>& qa, any_vec<16,V>& qb)
{
    V a = qa.wrapped();
    V b = qb.wrapped();

    qa.wrapped() = unzip128_lo(a, b);
    qb.wrapped() = unzip128_hi(a, b);
}

template<class V> SIMDPP_INL
void mem_unpack2(any_vec<32,V>& qa, any_vec<32,V>& qb)
{
    V a = qa.wrapped();
    V b = qb.wrapped();

    V c1 = shuffle1_128<0,0>(a, b);
    V c2 = shuffle1_128<1,1>(a, b);
    qa.wrapped() = unzip128_lo(c1, c2);
    qb.wrapped() = unzip128_hi(c1, c2);
}

#if SIMDPP_USE_AVX512F
template<class V> SIMDPP_INL
void mem_unpack2(any_vec<64,V>& qa, any_vec<64,V>& qb)
{
    V a = qa.wrapped();
    V b = qb.wrapped();

    V c1 = shuffle2_128<0,2,0,2>(a, b);
    V c2 = shuffle2_128<1,3,1,3>(a, b);
    qa.wrapped() = unzip128_lo(c1, c2);
    qb.wrapped() = unzip128_hi(c1, c2);
}
#endif
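
/*  Illustrative usage (a sketch; `buf` and the surrounding code are
    assumptions, not part of this header). Deinterleaving 32 bytes of
    interleaved x,y pairs:

        const uint8_t* buf = ...;     // x0,y0,x1,y1, ..., x15,y15
        uint8<16> a = load(buf);      // [x0,y0, ..., x7,y7]
        uint8<16> b = load(buf + 16); // [x8,y8, ..., x15,y15]
        mem_unpack2(a, b);            // a = [x0..x15], b = [y0..y15]
*/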

/** Generic implementation of mem_unpack3. The 128-bit lanes are processed
    independently.
*/
template<class T> SIMDPP_INL
void v_mem_unpack3_impl8_128(T& a, T& b, T& c)
{
#if SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // [a0, b0, c0, a1, b1, c1, a2, b2, c2, a3, b3, c3, a4, b4, c4, a5 ]
    // [b5, c5, a6, b6, c6, a7, b7, c7, a8, b8, c8, a9, b9, c9, a10,b10]
    // [c10,a11,b11,c11,a12,b12,c12,a13,b13,c13,a14,b14,c14,a15,b15,c15]
    T mask1 = make_shuffle_bytes16_mask<   1,    4,    7, 10, 13,16+0,16+3,16+6,
                                        16+9,16+12,16+15,  2,  5,   8,  11,  14>(mask1);
    T a1, b1, c1;
    a1 = shuffle_bytes16(c, a, mask1);
    b1 = shuffle_bytes16(a, b, mask1);
    c1 = shuffle_bytes16(b, c, mask1);
    // [a11,a12,a13,a14,a15,a0, a1, a2, a3, a4, a5, b11,b12,b13,b14,b15]
    // [b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10,c0, c1, c2, c3, c4 ]
    // [c5, c6, c7, c8, c9, c10,c11,c12,c13,c14,c15,a6, a7, a8, a9, a10]
    T a2, b2, c2;
    T mask2 = make_uint(0xff);
    mask2 = move16_l<5>(mask2);

    a2 = blend(a1, c1, mask2);
    b2 = blend(b1, a1, mask2);
    c2 = blend(c1, b1, mask2);
    // [a11..a15,a0..a10]
    // [b0..b15]
    // [c5..c15,c0..c4]
    a = align16<5>(a2, a2);
    b = b2;
    c = align16<11>(c2, c2);
#else
    typename same_width<T>::u8 t0, t1, t2, t3;
    t0 = a;
    t1 = align16<12>(a, b);
    t2 = align16<8>(b, c);
    t3 = move16_l<4>(c);
    // [a0, b0, c0, a1, b1, c1, a2, b2, c2, a3, b3, c3, ...]
    // [a4, b4, c4, a5, b5, c5, a6, b6, c6, a7, b7, c7, ...]
    // [a8, b8, c8, a9, b9, c9, a10,b10,c10,a11,b11,c11, ...]
    // [a12,b12,c12,a13,b13,c13,a14,b14,c14,a15,b15,c15, ...]
    typename same_width<T>::u16 b0, b1, b2, b3;
    b0 = zip16_lo(t0, t1);
    b1 = zip16_lo(t2, t3);
    b2 = zip16_hi(t0, t1);
    b3 = zip16_hi(t2, t3);
    // [a0, a4, b0, b4, c0, c4, a1, a5, b1, b5, c1, c5, a2, a6, b2, b6 ]
    // [a8, a12,b8, b12,c8, c12,a9, a13,b9, b13,c9, c13,a10,a14,b10,b14]
    // [c2, c6, a3, a7, b3, b7, c3, c7, ... ]
    // [c10,c14,a11,a15,b11,b15,c11,c15,... ]
    typename same_width<T>::u8 u0, u1, u2;
    u0 = zip8_lo(b0, b1);
    u1 = zip8_hi(b0, b1);
    u2 = zip8_lo(b2, b3);
    // [a0, a4, a8, a12,b0, b4, b8, b12, c0, c4, c8, c12, a1, a5, a9, a13 ]
    // [b1, b5, b9, b13,c1, c5, c9, c13, a2, a6, a10,a14, b2, b6, b10,b14 ]
    // [c2, c6, c10,c14,a3, a7, a11,a15, b3, b7, b11,b15, c3, c7, c11,c15 ]
    t0 = u0;
    t1 = align16<12>(u0, u1);
    t2 = align16<8>(u1, u2);
    t3 = move16_l<4>(u2);
    // [a0, a4, a8, a12,b0, b4, b8, b12, c0, c4, c8, c12, ...]
    // [a1, a5, a9, a13,b1, b5, b9, b13, c1, c5, c9, c13, ...]
    // [a2, a6, a10,a14,b2, b6, b10,b14, c2, c6, c10,c14, ...]
    // [a3, a7, a11,a15,b3, b7, b11,b15, c3, c7, c11,c15, ...]
    b0 = zip16_lo(t0, t1);
    b1 = zip16_lo(t2, t3);
    b2 = zip16_hi(t0, t1);
    b3 = zip16_hi(t2, t3);
    // [a0, a1, a4, a5, a8, a9, a12,a13,b0, b1, b4, b5, b8, b9, b12,b13 ]
    // [a2, a3, a6, a7, a10,a11,a14,a15,b2, b3, b6, b7, b10,b11,b14,b15 ]
    // [c0, c1, c4, c5, c8, c9, c12,c13, ... ]
    // [c2, c3, c6, c7, c10,c11,c14,c15, ... ]
    a = zip8_lo(b0, b1);
    b = zip8_hi(b0, b1);
    c = zip8_lo(b2, b3);
#endif
}

template<class T> SIMDPP_INL
void v_mem_unpack3_impl16_128(T& a, T& b, T& c)
{
#if SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // [a0,b0,c0,a1,b1,c1,a2,b2]
    // [c2,a3,b3,c3,a4,b4,c4,a5]
    // [b5,c5,a6,b6,c6,a7,b7,c7]
    T mask1 = make_shuffle_bytes16_mask<0,3,6,8+1,8+4,8+7,8+2,8+5>(mask1);
    T a1, b1, c1;
    a1 = shuffle_bytes16(a, b, mask1);
    c1 = shuffle_bytes16(b, c, mask1);
    b1 = shuffle_bytes16(c, a, mask1);
    // [a0,a1,a2,a3,a4,a5,b3,b4]
    // [c2,c3,c4,c5,c6,c7,a6,a7]
    // [b5,b6,b7,b0,b1,b2,c0,c1]
    T a2, b2, c2;
    T mask2 = make_uint(0xffff);
    mask2 = move8_l<2>(mask2);

    a2 = blend(a1, c1, mask2);
    b2 = blend(b1, a1, mask2);
    c2 = blend(c1, b1, mask2);
    // [a0..a7]
    // [b5..b7,b0..b4]
    // [c2..c7,c0,c1]
    a = a2;
    b = align8<3>(b2, b2);
    c = align8<6>(c2, c2);
#else
    T t0, t1, t2, t3;
    t0 = a;
    t1 = align8<6>(a, b);
    t2 = align8<4>(b, c);
    t3 = move8_l<2>(c);
    // [a0,b0,c0,a1,b1,c1, ... ]
    // [a2,b2,c2,a3,b3,c3, ... ]
    // [a4,b4,c4,a5,b5,c5, ... ]
    // [a6,b6,c6,a7,b7,c7, ... ]
    typename same_width<T>::u32 b0, b1, b2, b3;
    b0 = zip8_lo(t0, t1);
    b1 = zip8_lo(t2, t3);
    b2 = zip8_hi(t0, t1);
    b3 = zip8_hi(t2, t3);
    // [a0,a2,b0,b2,c0,c2,a1,a3]
    // [a4,a6,b4,b6,c4,c6,a5,a7]
    // [b1,b3,c1,c3, ... ]
    // [b5,b7,c5,c7, ... ]
    typename same_width<T>::u64 c0, c1, c2;
    c0 = zip4_lo(b0, b1);
    c1 = zip4_hi(b0, b1);
    c2 = zip4_lo(b2, b3);
    // [a0,a2,a4,a6,b0,b2,b4,b6]
    // [c0,c2,c4,c6,a1,a3,a5,a7]
    // [b1,b3,b5,b7,c1,c3,c5,c7]
    t0 = c0;
    t1 = shuffle1<1,0>(c0, c1);
    t2 = splat2<1>(c1);
    t3 = c2;
    // [a0,a2,a4,a6,b0,b2,b4,b6]
    // [b0,b2,b4,b6,c0,c2,c4,c6]
    // [a1,a3,a5,a7,a1,a3,a5,a7]
    // [b1,b3,b5,b7,c1,c3,c5,c7]
    a = zip8_lo(t0, t2);
    b = zip8_lo(t1, t3);
    c = zip8_hi(t1, t3);
#endif
}

template<class T> SIMDPP_INL
void v_mem_unpack3_impl32_128(T& a, T& b, T& c)
{
#if SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    using U = typename T::uint_vector_type;

    // [a0,b0,c0,a1]
    // [b1,c1,a2,b2]
    // [c2,a3,b3,c3]
    U mask1 = make_shuffle_bytes16_mask<0,3,4+2,4+1>(mask1);
    T a1, b1, c1;
    a1 = shuffle_bytes16(a, b, mask1);
    b1 = shuffle_bytes16(b, c, mask1);
    c1 = shuffle_bytes16(c, a, mask1);
    // [a0,a1,a2,c1]
    // [b1,b2,b3,a3]
    // [c2,c3,c0,b0]
    T a2, b2, c2;
    U mask2 = make_uint(0xffffffff);
    mask2 = move4_l<1>(mask2);

    a2 = blend(a1, b1, mask2);
    b2 = blend(b1, c1, mask2);
    c2 = blend(c1, a1, mask2);
    // [a0,a1,a2,a3]
    // [b1,b2,b3,b0]
    // [c2,c3,c0,c1]
    a = a2;
    b = align4<3>(b2, b2);
    c = align4<2>(c2, c2);
#else
    T t11, t12, t21, t22, t31, t32;
    // [a0,b0,c0,a1]
    // [b1,c1,a2,b2]
    // [c2,a3,b3,c3]
    t11 = a;
    t12 = shuffle2<0,1,2,3>(c, b);
    t21 = shuffle2<0,1,0,1>(a, b);
    t22 = shuffle2<2,3,2,3>(b, c);
    t31 = shuffle2<2,3,0,1>(a, b);
    t32 = c;
    // [a0,b0,c0,a1]
    // [c2,a3,a2,b2]
    // [a0,b0,b1,c1]
    // [a2,b2,b3,c3]
    // [c0,a1,b1,c1]
    // [c2,a3,b3,c3]
    a = shuffle2<0,3,2,1>(t11, t12);
    b = shuffle2<1,2,1,2>(t21, t22);
    c = shuffle2<0,3,0,3>(t31, t32);
#endif
}

template<class T> SIMDPP_INL
void v_mem_unpack3_impl64_128(T& a, T& b, T& c)
{
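    // Input:           Result:
    // [a0,b0]          [a0,a1]
    // [c0,a1]          [b0,b1]
    // [b1,c1]          [c0,c1]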
    T d0, d1, d2;
    d0 = shuffle1<0,1>(a, b);
    d1 = shuffle1<1,0>(a, c);
    d2 = shuffle1<0,1>(b, c);
    a = d0; b = d1; c = d2;
}

template<class V> SIMDPP_INL
void v_mem_unpack3_shuffle128(any_vec<16,V>& qa, any_vec<16,V>& qb, any_vec<16,V>& qc)
{
    (void) qa; (void) qb; (void) qc;
}

template<class V> SIMDPP_INL
void v_mem_unpack3_shuffle128(any_vec<32,V>& qa, any_vec<32,V>& qb, any_vec<32,V>& qc)
{
    // Shuffle the vectors so that the lower halves contain the first three
    // 128-bit chunks (vector a and the lower half of b) and the upper halves
    // contain the rest.

    V a0, b0, c0, a1, b1, c1;

    a0 = qa.wrapped();  b0 = qb.wrapped();  c0 = qc.wrapped();

    a1 = shuffle1_128<0,1>(a0, b0);
    b1 = shuffle1_128<1,0>(a0, c0);
    c1 = shuffle1_128<0,1>(b0, c0);

    qa.wrapped() = a1;  qb.wrapped() = b1;  qc.wrapped() = c1;
}

#if SIMDPP_USE_AVX512F
template<class V> SIMDPP_INL
void v_mem_unpack3_shuffle128(any_vec<64,V>& qa, any_vec<64,V>& qb, any_vec<64,V>& qc)
{
    V a, b, c; // TODO: optimize. Using full-vector shuffle may be faster
    a = qa.wrapped();  b = qb.wrapped();  c = qc.wrapped();

    V t11, t12, t21, t22, t31, t32;
    // [a0,b0,c0,a1]
    // [b1,c1,a2,b2]
    // [c2,a3,b3,c3]
    t11 = a;
    t12 = shuffle2_128<0,1,2,3>(c, b);
    t21 = shuffle2_128<0,1,0,1>(a, b);
    t22 = shuffle2_128<2,3,2,3>(b, c);
    t31 = shuffle2_128<2,3,0,1>(a, b);
    t32 = c;
    // [a0,b0,c0,a1]
    // [c2,a3,a2,b2]
    // [a0,b0,b1,c1]
    // [a2,b2,b3,c3]
    // [c0,a1,b1,c1]
    // [c2,a3,b3,c3]
    a = shuffle2_128<0,3,2,1>(t11, t12);
    b = shuffle2_128<1,2,1,2>(t21, t22);
    c = shuffle2_128<0,3,0,3>(t31, t32);

    qa.wrapped() = a;  qb.wrapped() = b;  qc.wrapped() = c;
}
#endif

/** Concatenates @a a, @a b and @a c and stores the elements of the resulting
    array as follows:
     * every (3n)-th element is stored to @a a
     * every (3n+1)-th element is stored to @a b
     * every (3n+2)-th element is stored to @a c

    n = [0, <number of elements in vector> - 1]
*/
template<unsigned N> SIMDPP_INL
void mem_unpack3(uint8<N>& a, uint8<N>& b, uint8<N>& c)
{
    v_mem_unpack3_shuffle128(a, b, c);
    v_mem_unpack3_impl8_128(a, b, c);
}

template<unsigned N> SIMDPP_INL
void mem_unpack3(uint16<N>& a, uint16<N>& b, uint16<N>& c)
{
    v_mem_unpack3_shuffle128(a, b, c);
    v_mem_unpack3_impl16_128(a, b, c);
}

template<unsigned N> SIMDPP_INL
void mem_unpack3(uint32<N>& a, uint32<N>& b, uint32<N>& c)
{
    v_mem_unpack3_shuffle128(a, b, c);
    v_mem_unpack3_impl32_128(a, b, c);
}

template<unsigned N> SIMDPP_INL
void mem_unpack3(uint64<N>& a, uint64<N>& b, uint64<N>& c)
{
    v_mem_unpack3_shuffle128(a, b, c);
    v_mem_unpack3_impl64_128(a, b, c);
}

template<unsigned N> SIMDPP_INL
void mem_unpack3(float32<N>& a, float32<N>& b, float32<N>& c)
{
    v_mem_unpack3_shuffle128(a, b, c);
    v_mem_unpack3_impl32_128(a, b, c);
}

template<unsigned N> SIMDPP_INL
void mem_unpack3(float64<N>& a, float64<N>& b, float64<N>& c)
{
    v_mem_unpack3_shuffle128(a, b, c);
    v_mem_unpack3_impl64_128(a, b, c);
}
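
/*  Illustrative usage (a sketch; `rgb` and the surrounding code are
    assumptions, not part of this header). Deinterleaving 48 bytes of
    packed RGB data into one vector per channel:

        const uint8_t* rgb = ...;    // r0,g0,b0,r1,g1,b1, ...
        uint8<16> r = load(rgb);
        uint8<16> g = load(rgb + 16);
        uint8<16> b = load(rgb + 32);
        mem_unpack3(r, g, b);        // r = [r0..r15], g = [g0..g15], b = [b0..b15]
*/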

/** Generic implementation of mem_unpack4. The 256-bit version applies 128-bit
    operations to each half of each vector separately.
*/
template<class T> SIMDPP_INL
void v_mem_unpack4_impl8_128(T& a, T& b, T& c, T& d)
{
#if SIMDPP_USE_SSSE3 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // TODO: optimize for Altivec and MSA
    typename same_width<T>::u32 b0, b1, b2, b3;
    b0 = transpose_inplace(a);
    b1 = transpose_inplace(b);
    b2 = transpose_inplace(c);
    b3 = transpose_inplace(d);

    transpose4(b0, b1, b2, b3);
    a = b0;  b = b1;  c = b2;  d = b3;
#else
    // [a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3 ]
    // [a4, b4, c4, d4, a5, b5, c5, d5, a6, b6, c6, d6, a7, b7, c7, d7 ]
    // [a8, b8, c8, d8, a9, b9, c9, d9, a10,b10,c10,d10,a11,b11,c11,d11]
    // [a12,b12,c12,d12,a13,b13,c13,d13,a14,b14,c14,d14,a15,b15,c15,d15]
    T b0, b1, b2, b3, c0, c1, c2, c3;
    b0 = zip16_lo(a, b);
    b1 = zip16_hi(a, b);
    b2 = zip16_lo(c, d);
    b3 = zip16_hi(c, d);
    // [a0, a4, b0, b4, c0, c4, d0, d4, a1, a5, b1, b5, c1, c5, d1, d5 ]
    // [a2, a6, b2, b6, c2, c6, d2, d6, a3, a7, b3, b7, c3, c7, d3, d7 ]
    // [a8, a12,b8, b12,c8, c12,d8, d12,a9, a13,b9, b13,c9, c13,d9, d13]
    // [a10,a14,b10,b14,c10,c14,d10,d14,a11,a15,b11,b15,c11,c15,d11,d15]
    c0 = zip16_lo(b0, b1);
    c1 = zip16_hi(b0, b1);
    c2 = zip16_lo(b2, b3);
    c3 = zip16_hi(b2, b3);
    // [a0, a2, a4, a6, b0, b2, b4, b6, c0, c2, c4, c6, d0, d2, d4, d6 ]
    // [a1, a3, a5, a7, b1, b3, b5, b7, c1, c3, c5, c7, d1, d3, d5, d7 ]
    // [a8, a10,a12,a14,b8, b10,b12,b14,c8, c10,c12,c14,d8, d10,d12,d14]
    // [a9, a11,a13,a15,b9, b11,b13,b15,c9, c11,c13,c15,d9, d11,d13,d15]
    typename same_width<T>::u64 d0, d1, d2, d3;
    d0 = zip16_lo(c0, c1);
    d1 = zip16_hi(c0, c1);
    d2 = zip16_lo(c2, c3);
    d3 = zip16_hi(c2, c3);
    // [a0 .. a7, b0 .. b7 ]
    // [c0 .. c7, d0 .. d7 ]
    // [a8 .. a15, b8 .. b15 ]
    // [c8 .. c15, d8 .. d15 ]
    a = zip2_lo(d0, d2);
    b = zip2_hi(d0, d2);
    c = zip2_lo(d1, d3);
    d = zip2_hi(d1, d3);
#endif
}

template<class T> SIMDPP_INL
void v_mem_unpack4_impl16_128(T& a, T& b, T& c, T& d)
{
    // [a0,b0,c0,d0,a1,b1,c1,d1]
    // [a2,b2,c2,d2,a3,b3,c3,d3]
    // [a4,b4,c4,d4,a5,b5,c5,d5]
    // [a6,b6,c6,d6,a7,b7,c7,d7]
    typename same_width<T>::u16 t0, t1, t2, t3;
    t0 = zip8_lo(a, b);
    t1 = zip8_hi(a, b);
    t2 = zip8_lo(c, d);
    t3 = zip8_hi(c, d);
    // [a0,a2,b0,b2,c0,c2,d0,d2]
    // [a1,a3,b1,b3,c1,c3,d1,d3]
    // [a4,a6,b4,b6,c4,c6,d4,d6]
    // [a5,a7,b5,b7,c5,c7,d5,d7]
    typename same_width<T>::u64 u0, u1, u2, u3;
    u0 = zip8_lo(t0, t1);
    u1 = zip8_hi(t0, t1);
    u2 = zip8_lo(t2, t3);
    u3 = zip8_hi(t2, t3);
    // [a0,a1,a2,a3,b0,b1,b2,b3]
    // [c0,c1,c2,c3,d0,d1,d2,d3]
    // [a4,a5,a6,a7,b4,b5,b6,b7]
    // [c4,c5,c6,c7,d4,d5,d6,d7]
    a = zip2_lo(u0, u2);
    b = zip2_hi(u0, u2);
    c = zip2_lo(u1, u3);
    d = zip2_hi(u1, u3);
}

template<class T> SIMDPP_INL
void v_mem_unpack4_impl32_128(T& a, T& b, T& c, T& d)
{
    transpose4(a, b, c, d);
}

template<class T> SIMDPP_INL
void v_mem_unpack4_impl64_128(T& a, T& b, T& c, T& d)
{
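    // Input:           Result:
    // [a0,b0]          [a0,a1]
    // [c0,d0]          [b0,b1]
    // [a1,b1]          [c0,c1]
    // [c1,d1]          [d0,d1]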
    transpose2(a, c);
    transpose2(b, d);
    T t;
    t = b;
    b = c;
    c = t;
}

template<class V> SIMDPP_INL
void v_mem_unpack4_shuffle128(any_vec<16,V>& qa, any_vec<16,V>& qb,
                              any_vec<16,V>& qc, any_vec<16,V>& qd)
{
    (void) qa; (void) qb; (void) qc; (void) qd;
}

template<class V> SIMDPP_INL
void v_mem_unpack4_shuffle128(any_vec<32,V>& qa, any_vec<32,V>& qb,
                              any_vec<32,V>& qc, any_vec<32,V>& qd)
{
    V a0, b0, c0, d0, a1, b1, c1, d1;

    a0 = qa.wrapped();  b0 = qb.wrapped();  c0 = qc.wrapped();  d0 = qd.wrapped();

    a1 = shuffle1_128<0,0>(a0, c0);
    b1 = shuffle1_128<1,1>(a0, c0);
    c1 = shuffle1_128<0,0>(b0, d0);
    d1 = shuffle1_128<1,1>(b0, d0);

    qa.wrapped() = a1;  qb.wrapped() = b1;  qc.wrapped() = c1;  qd.wrapped() = d1;
}

#if SIMDPP_USE_AVX512F
template<class V> SIMDPP_INL
void v_mem_unpack4_shuffle128(any_vec<64,V>& qa, any_vec<64,V>& qb,
                              any_vec<64,V>& qc, any_vec<64,V>& qd)
{
    V a, b, c, d; // TODO: optimize. Using full-vector shuffle/permute will be faster

    a = qa.wrapped();  b = qb.wrapped();  c = qc.wrapped();  d = qd.wrapped();

    V t1, t2, t3, t4;
    // [a0,a1,a2,a3]
    // [b0,b1,b2,b3]
    // [c0,c1,c2,c3]
    // [d0,d1,d2,d3]
    t1 = shuffle2_128<0,2,0,2>(a, b);
    t2 = shuffle2_128<1,3,1,3>(a, b);
    t3 = shuffle2_128<0,2,0,2>(c, d);
    t4 = shuffle2_128<1,3,1,3>(c, d);
    // [a0,a2,b0,b2]
    // [a1,a3,b1,b3]
    // [c0,c2,d0,d2]
    // [c1,c3,d1,d3]
    a = shuffle2_128<0,2,0,2>(t1, t3);
    b = shuffle2_128<0,2,0,2>(t2, t4);
    c = shuffle2_128<1,3,1,3>(t1, t3);
    d = shuffle2_128<1,3,1,3>(t2, t4);
    // [a0,b0,c0,d0]
    // [a1,b1,c1,d1]
    // [a2,b2,c2,d2]
    // [a3,b3,c3,d3]

    qa.wrapped() = a;  qb.wrapped() = b;  qc.wrapped() = c;  qd.wrapped() = d;
}
#endif

/** Concatenates @a a, @a b, @a c and @a d and stores the elements of the
    resulting array as follows:
     * every (4n)-th element is stored to @a a
     * every (4n+1)-th element is stored to @a b
     * every (4n+2)-th element is stored to @a c
     * every (4n+3)-th element is stored to @a d

    n = [0, <number of elements in vector> - 1]
*/
// @icost{SSE2, SSE3, 16}
// @icost{SSSE3, SSE4.1, 12}
template<unsigned N> SIMDPP_INL
void mem_unpack4(uint8<N>& a, uint8<N>& b, uint8<N>& c, uint8<N>& d)
{
    v_mem_unpack4_shuffle128(a, b, c, d);
    v_mem_unpack4_impl8_128(a, b, c, d);
}

template<unsigned N> SIMDPP_INL
void mem_unpack4(uint16<N>& a, uint16<N>& b, uint16<N>& c, uint16<N>& d)
{
    v_mem_unpack4_shuffle128(a, b, c, d);
    v_mem_unpack4_impl16_128(a, b, c, d);
}

template<unsigned N> SIMDPP_INL
void mem_unpack4(uint32<N>& a, uint32<N>& b, uint32<N>& c, uint32<N>& d)
{
    v_mem_unpack4_shuffle128(a, b, c, d);
    v_mem_unpack4_impl32_128(a, b, c, d);
}

template<unsigned N> SIMDPP_INL
void mem_unpack4(uint64<N>& a, uint64<N>& b, uint64<N>& c, uint64<N>& d)
{
    v_mem_unpack4_shuffle128(a, b, c, d);
    v_mem_unpack4_impl64_128(a, b, c, d);
}

template<unsigned N> SIMDPP_INL
void mem_unpack4(float32<N>& a, float32<N>& b, float32<N>& c, float32<N>& d)
{
    v_mem_unpack4_shuffle128(a, b, c, d);
    v_mem_unpack4_impl32_128(a, b, c, d);
}

template<unsigned N> SIMDPP_INL
void mem_unpack4(float64<N>& a, float64<N>& b, float64<N>& c, float64<N>& d)
{
    v_mem_unpack4_shuffle128(a, b, c, d);
    v_mem_unpack4_impl64_128(a, b, c, d);
}
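
/*  Illustrative usage (a sketch; `px` and the surrounding code are
    assumptions, not part of this header). Deinterleaving 64 bytes of
    packed RGBA pixels into one vector per channel:

        const uint8_t* px = ...;     // r0,g0,b0,a0,r1,g1,b1,a1, ...
        uint8<16> r = load(px);
        uint8<16> g = load(px + 16);
        uint8<16> b = load(px + 32);
        uint8<16> a = load(px + 48);
        mem_unpack4(r, g, b, a);     // r = [r0..r15], ..., a = [a0..a15]
*/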

/** Concatenates the given vectors and stores the elements of the resulting
    array as follows:
     * every (3n)-th element of the first 48 elements is stored to @a a
     * every (3n+1)-th element of the first 48 elements is stored to @a b
     * every (3n+2)-th element of the first 48 elements is stored to @a c
     * every (3n)-th element of the last 48 elements is stored to @a d
     * every (3n+1)-th element of the last 48 elements is stored to @a e
     * every (3n+2)-th element of the last 48 elements is stored to @a f

    n = [0, <number of elements in vector> - 1]
*/
static SIMDPP_INL
void mem_unpack6(uint8x16& a, uint8x16& b, uint8x16& c,
                 uint8x16& d, uint8x16& e, uint8x16& f)
{
    uint8x16 t0, t1, t2, t3, t4, t5;
    t0 = zip16_lo(a, d);
    t1 = zip16_hi(a, d);
    t2 = zip16_lo(b, e);
    t3 = zip16_hi(b, e);
    t4 = zip16_lo(c, f);
    t5 = zip16_hi(c, f);

    uint8x16 u0, u1, u2, u3, u4, u5;
    u0 = zip16_lo(t0, t3);
    u1 = zip16_hi(t0, t3);
    u2 = zip16_lo(t1, t4);
    u3 = zip16_hi(t1, t4);
    u4 = zip16_lo(t2, t5);
    u5 = zip16_hi(t2, t5);

    t0 = zip16_lo(u0, u3);
    t1 = zip16_hi(u0, u3);
    t2 = zip16_lo(u1, u4);
    t3 = zip16_hi(u1, u4);
    t4 = zip16_lo(u2, u5);
    t5 = zip16_hi(u2, u5);

    u0 = zip16_lo(t0, t3);
    u1 = zip16_hi(t0, t3);
    u2 = zip16_lo(t1, t4);
    u3 = zip16_hi(t1, t4);
    u4 = zip16_lo(t2, t5);
    u5 = zip16_hi(t2, t5);

    t0 = zip16_lo(u0, u3);
    t1 = zip16_hi(u0, u3);
    t2 = zip16_lo(u1, u4);
    t3 = zip16_hi(u1, u4);
    t4 = zip16_lo(u2, u5);
    t5 = zip16_hi(u2, u5);

    a = zip16_lo(t0, t3);
    b = zip16_hi(t0, t3);
    c = zip16_lo(t1, t4);
    d = zip16_hi(t1, t4);
    e = zip16_lo(t2, t5);
    f = zip16_hi(t2, t5);
}

static SIMDPP_INL
void mem_unpack6(uint16x8& a, uint16x8& b, uint16x8& c,
                 uint16x8& d, uint16x8& e, uint16x8& f)
{
    uint16x8 t0, t1, t2, t3, t4, t5;
    t0 = zip8_lo(a, d);
    t1 = zip8_hi(a, d);
    t2 = zip8_lo(b, e);
    t3 = zip8_hi(b, e);
    t4 = zip8_lo(c, f);
    t5 = zip8_hi(c, f);

    uint16x8 u0, u1, u2, u3, u4, u5;
    u0 = zip8_lo(t0, t3);
    u1 = zip8_hi(t0, t3);
    u2 = zip8_lo(t1, t4);
    u3 = zip8_hi(t1, t4);
    u4 = zip8_lo(t2, t5);
    u5 = zip8_hi(t2, t5);

    a = zip8_lo(u0, u3);
    b = zip8_hi(u0, u3);
    c = zip8_lo(u1, u4);
    d = zip8_hi(u1, u4);
    e = zip8_lo(u2, u5);
    f = zip8_hi(u2, u5);
}
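
/*  Illustrative usage (a sketch; `rgb` and the surrounding code are
    assumptions, not part of this header). Deinterleaving 96 bytes of packed
    RGB data, two vectors per channel:

        const uint8_t* rgb = ...;    // r0,g0,b0,r1,g1,b1, ...
        uint8x16 a = load(rgb),      b = load(rgb + 16), c = load(rgb + 32);
        uint8x16 d = load(rgb + 48), e = load(rgb + 64), f = load(rgb + 80);
        mem_unpack6(a, b, c, d, e, f);
        // a = [r0..r15],  b = [g0..g15],  c = [b0..b15]
        // d = [r16..r31], e = [g16..g31], f = [b16..b31]
*/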

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif