/*  Copyright (C) 2011-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_ZIP_LO_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_ZIP_LO_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/shuffle_bytes16.h>
#include <simdpp/detail/null/shuffle.h>
#include <simdpp/detail/neon/shuffle.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {
static SIMDPP_INL
uint8x16 i_zip16_lo(const uint8x16& a, const uint8x16& b)
{
#if SIMDPP_USE_NULL
    return detail::null::zip16_lo(a, b);
#elif SIMDPP_USE_SSE2
    return _mm_unpacklo_epi8(a.native(), b.native());
#elif SIMDPP_USE_NEON
    // if both zip_lo and zip_hi are used on the same arguments, the
    // compiler will merge the two vzip calls into a single instruction
    return vzipq_u8(a.native(), b.native()).val[0];
#elif SIMDPP_USE_ALTIVEC
    return vec_mergeh(a.native(), b.native());
#elif SIMDPP_USE_MSA
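    // __msa_ilvr_* interleaves the right (low) halves of its operands; the
    // operands are swapped so that the elements of a land in the even
    // positions of the result. The same pattern is used for all widths below.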
    return (v16u8) __msa_ilvr_b((v16i8)b.native(), (v16i8)a.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint8x32 i_zip16_lo(const uint8x32& a, const uint8x32& b)
{
    return _mm256_unpacklo_epi8(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint8<64> i_zip16_lo(const uint8<64>& a, const uint8<64>& b)
{
    return _mm512_unpacklo_epi8(a.native(), b.native());
}
#endif

template<unsigned N> SIMDPP_INL
uint8<N> i_zip16_lo(const uint8<N>& a, const uint8<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(uint8<N>, i_zip16_lo, a, b)
}

// -----------------------------------------------------------------------------

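// Interleaves the lower halves of two vectors of 16-bit elements:
// { a0, b0, a1, b1, a2, b2, a3, b3 } for 128-bit vectors.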
static SIMDPP_INL
uint16x8 i_zip8_lo(const uint16x8& a, const uint16x8& b)
{
#if SIMDPP_USE_NULL
    return detail::null::zip8_lo(a, b);
#elif SIMDPP_USE_SSE2
    return _mm_unpacklo_epi16(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vzipq_u16(a.native(), b.native()).val[0];
#elif SIMDPP_USE_ALTIVEC
    return vec_mergeh(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_ilvr_h((v8i16)b.native(), (v8i16)a.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint16x16 i_zip8_lo(const uint16x16& a, const uint16x16& b)
{
    return _mm256_unpacklo_epi16(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint16<32> i_zip8_lo(const uint16<32>& a, const uint16<32>& b)
{
    return _mm512_unpacklo_epi16(a.native(), b.native());
}
#endif

template<unsigned N> SIMDPP_INL
uint16<N> i_zip8_lo(const uint16<N>& a, const uint16<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(uint16<N>, i_zip8_lo, a, b)
}

// -----------------------------------------------------------------------------

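// Interleaves the lower halves of two vectors of 32-bit elements:
// { a0, b0, a1, b1 } for 128-bit vectors.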
static SIMDPP_INL
uint32x4 i_zip4_lo(const uint32x4& a, const uint32x4& b)
{
#if SIMDPP_USE_NULL
    return detail::null::zip4_lo(a, b);
#elif SIMDPP_USE_SSE2
    return _mm_unpacklo_epi32(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vzipq_u32(a.native(), b.native()).val[0];
#elif SIMDPP_USE_ALTIVEC
    return vec_mergeh(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_ilvr_w((v4i32)b.native(), (v4i32)a.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32x8 i_zip4_lo(const uint32x8& a, const uint32x8& b)
{
    return _mm256_unpacklo_epi32(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint32<16> i_zip4_lo(const uint32<16>& a, const uint32<16>& b)
{
    return _mm512_unpacklo_epi32(a.native(), b.native());
}
#endif

template<unsigned N> SIMDPP_INL
uint32<N> i_zip4_lo(const uint32<N>& a, const uint32<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(uint32<N>, i_zip4_lo, a, b)
}

// -----------------------------------------------------------------------------

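// Interleaves the lower (first) elements of two vectors of 64-bit elements:
// { a0, b0 } for 128-bit vectors.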
static SIMDPP_INL
uint64x2 i_zip2_lo(const uint64x2& a, const uint64x2& b)
{
#if SIMDPP_USE_SSE2
    return _mm_unpacklo_epi64(a.native(), b.native());
#elif SIMDPP_USE_NEON
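    // NEON has no dedicated 64-bit zip instruction; the helper in
    // detail/neon/shuffle.h combines the low halves of a and b manually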
    return neon::zip2_lo(a, b);
#elif SIMDPP_USE_VSX_207
    return vec_mergeh(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v2u64) __msa_ilvr_d((v2i64) b.native(), (v2i64) a.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    return detail::null::zip2_lo(a, b);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint64x4 i_zip2_lo(const uint64x4& a, const uint64x4& b)
{
    return _mm256_unpacklo_epi64(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint64<8> i_zip2_lo(const uint64<8>& a, const uint64<8>& b)
{
    return _mm512_unpacklo_epi64(a.native(), b.native());
}
#endif

template<unsigned N> SIMDPP_INL
uint64<N> i_zip2_lo(const uint64<N>& a, const uint64<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(uint64<N>, i_zip2_lo, a, b)
}

// -----------------------------------------------------------------------------

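// Same as the 32-bit integer variant, but for single-precision floats:
// { a0, b0, a1, b1 } for 128-bit vectors.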
static SIMDPP_INL
float32x4 i_zip4_lo(const float32x4& a, const float32x4& b)
{
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    return detail::null::zip4_lo(a, b);
#elif SIMDPP_USE_SSE2
    return _mm_unpacklo_ps(a.native(), b.native());
#elif SIMDPP_USE_NEON
    return vzipq_f32(a.native(), b.native()).val[0];
#elif SIMDPP_USE_ALTIVEC
    return vec_mergeh(a.native(), b.native());
#elif SIMDPP_USE_MSA
    return (v4f32) __msa_ilvr_w((v4i32) b.native(), (v4i32) a.native());
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
float32x8 i_zip4_lo(const float32x8& a, const float32x8& b)
{
    return _mm256_unpacklo_ps(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
float32<16> i_zip4_lo(const float32<16>& a, const float32<16>& b)
{
    return _mm512_unpacklo_ps(a.native(), b.native());
}
#endif

template<unsigned N> SIMDPP_INL
float32<N> i_zip4_lo(const float32<N>& a, const float32<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(float32<N>, i_zip4_lo, a, b)
}

// -----------------------------------------------------------------------------

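// Same as the 64-bit integer variant, but for double-precision floats:
// { a0, b0 } for 128-bit vectors.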
static SIMDPP_INL
float64x2 i_zip2_lo(const float64x2& a, const float64x2& b)
{
#if SIMDPP_USE_SSE2
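    // _mm_movelh_ps concatenates the low 64-bit halves of the operands;
    // the casts are bitwise-only, so this is zip2_lo on the double lanes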
    return _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(a.native()),
                                       _mm_castpd_ps(b.native())));
#elif SIMDPP_USE_NEON64
    return vtrn1q_f64(a.native(), b.native());
#elif SIMDPP_USE_VSX_206
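    // the merge is performed on bit-identical integer vectors; the casts
    // do not change any bits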
    return (__vector double) vec_mergeh((__vector uint64_t)a.native(),
                                        (__vector uint64_t)b.native());
#elif SIMDPP_USE_MSA
    return (v2f64) __msa_ilvr_d((v2i64) b.native(), (v2i64) a.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC || SIMDPP_USE_NEON
    return detail::null::zip2_lo(a, b);
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
float64x4 i_zip2_lo(const float64x4& a, const float64x4& b)
{
    return _mm256_unpacklo_pd(a.native(), b.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
float64<8> i_zip2_lo(const float64<8>& a, const float64<8>& b)
{
    return _mm512_unpacklo_pd(a.native(), b.native());
}
#endif

template<unsigned N> SIMDPP_INL
float64<N> i_zip2_lo(const float64<N>& a, const float64<N>& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(float64<N>, i_zip2_lo, a, b)
}


} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif