/*  Copyright (C) 2011-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_MOVE_L_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_MOVE_L_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/permute4.h>
#include <simdpp/detail/null/shuffle.h>
#include <simdpp/detail/shuffle/shuffle_mask.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

#if SIMDPP_USE_ALTIVEC
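// Concatenates `lower` and `upper` and extracts the 16 bytes starting at byte
// `shift` of the concatenation, independently of target endianness. A sketch
// of the intended semantics, for shift == 3:
//   lower  = [ l0 l1 l2 l3 ... l15 ],  upper = [ u0 u1 ... u15 ]
//   result = [ l3 l4 ... l15 u0 u1 u2 ]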
template<unsigned shift> SIMDPP_INL
uint8<16> vec_sld_biendian(const uint8<16>& lower, const uint8<16>& upper)
{
#if SIMDPP_BIG_ENDIAN
    return vec_sld(lower.native(), upper.native(), shift);
#else
    // by default GCC adjusts vec_sld element order to match endianness of the target
    return vec_sld(upper.native(), lower.native(), 16 - shift);
#endif
}
#endif

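// Moves the 8-bit elements of `a` towards element 0 by `shift` positions,
// shifting in zeros at the top. A sketch of the semantics:
//   i_move16_l<2>([ a0 a1 a2 ... a15 ]) == [ a2 a3 ... a15 0 0 ]
// For vectors wider than 16 bytes the operation applies to each 128-bit
// segment separately.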
template<unsigned shift> SIMDPP_INL
uint8x16 i_move16_l(const uint8x16& a)
{
    static_assert(shift <= 16, "Selector out of range");
#if SIMDPP_USE_NULL
    return detail::null::move_n_l<shift>(a);
#elif SIMDPP_USE_SSE2
    return _mm_srli_si128(a.native(), shift);
#elif SIMDPP_USE_NEON
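    // vextq_u8(a, z, shift) yields bytes a[shift..15] followed by
    // z[0..shift-1], i.e. `a` moved towards element 0 with zeros shifted in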
    uint8x16 z = make_zero();
    return vextq_u8(a.native(), z.native(), shift);
#elif SIMDPP_USE_ALTIVEC
    // return align<shift>(a, (uint8x16) make_zero());
    return vec_sld_biendian<shift>((uint8<16>)a, (uint8<16>)make_zero());
#elif SIMDPP_USE_MSA
    uint8x16 zero = make_zero();
    return (v16u8) __msa_sldi_b((v16i8)zero.native(), (v16i8)a.native(), shift);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned shift> SIMDPP_INL
uint8x32 i_move16_l(const uint8x32& a)
{
    static_assert(shift <= 16, "Selector out of range");
    return _mm256_srli_si256(a.native(), shift);
}
#endif

#if SIMDPP_USE_AVX512BW
template<unsigned shift> SIMDPP_INL
uint8<64> i_move16_l(const uint8<64>& a)
{
    static_assert(shift <= 16, "Selector out of range");
    return _mm512_bsrli_epi128(a.native(), shift);
}
#endif

template<unsigned shift, unsigned N> SIMDPP_INL
uint8<N> i_move16_l(const uint8<N>& a)
{
    static_assert(shift <= 16, "Selector out of range");
    SIMDPP_VEC_ARRAY_IMPL1(uint8<N>, i_move16_l<shift>, a);
}

// -----------------------------------------------------------------------------

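// For wider element types the element shift is delegated to the byte-wise
// move above: a shift of `shift` elements is a byte shift of shift times the
// element size (shift*2 for 16-bit, shift*4 for 32-bit, shift*8 for 64-bit).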
template<unsigned shift> SIMDPP_INL
uint16<8> i_move8_l(const uint16<8>& a)
{
#if SIMDPP_USE_NULL
    return detail::null::move_n_l<shift>(a);
#else
    return (uint16<8>) i_move16_l<shift*2>(uint8<16>(a));
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned shift> SIMDPP_INL
uint16<16> i_move8_l(const uint16<16>& a)
{
    static_assert(shift <= 8, "Selector out of range");
    return _mm256_srli_si256(a.native(), shift*2);
}
#endif

#if SIMDPP_USE_AVX512BW
template<unsigned shift> SIMDPP_INL
uint16<32> i_move8_l(const uint16<32>& a)
{
    static_assert(shift <= 8, "Selector out of range");
    return _mm512_bsrli_epi128(a.native(), shift*2);
}
#endif

template<unsigned shift, unsigned N> SIMDPP_INL
uint16<N> i_move8_l(const uint16<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL1(uint16<N>, i_move8_l<shift>, a);
}

// -----------------------------------------------------------------------------

template<unsigned shift> SIMDPP_INL
uint32<4> i_move4_l(const uint32<4>& a)
{
#if SIMDPP_USE_NULL
    return detail::null::move_n_l<shift>(a);
#else
    return (uint32<4>) i_move16_l<shift*4>(uint8<16>(a));
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned shift> SIMDPP_INL
uint32<8> i_move4_l(const uint32<8>& a)
{
    static_assert(shift <= 4, "Selector out of range");
    return _mm256_srli_si256(a.native(), shift*4);
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned shift> SIMDPP_INL
uint32<16> i_move4_l(const uint32<16>& a)
{
    static_assert(shift <= 4, "Selector out of range");
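    // AVX512F lacks a 512-bit byte shift (_mm512_bsrli_epi128 needs
    // AVX512BW), so shuffle the 32-bit elements within each 128-bit lane and
    // let the zeroing mask clear the top `shift` elements of every lane
    // (e.g. 0x7777 keeps elements 0..2 of each of the four lanes)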
    switch (shift) {
    default:
    case 0: return a;
    case 1: return _mm512_maskz_shuffle_epi32(0x7777, a.native(),
                                              _MM_PERM_ENUM(_MM_SHUFFLE(3, 3, 2, 1)));
    case 2: return _mm512_maskz_shuffle_epi32(0x3333, a.native(),
                                              _MM_PERM_ENUM(_MM_SHUFFLE(3, 3, 3, 2)));
    case 3: return _mm512_maskz_shuffle_epi32(0x1111, a.native(),
                                              _MM_PERM_ENUM(_MM_SHUFFLE(3, 3, 3, 3)));
    case 4: return make_zero();
    }
}
#endif

template<unsigned shift, unsigned N> SIMDPP_INL
uint32<N> i_move4_l(const uint32<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL1(uint32<N>, i_move4_l<shift>, a);
}

// -----------------------------------------------------------------------------

template<unsigned shift> SIMDPP_INL
uint64<2> i_move2_l(const uint64<2>& a)
{
#if SIMDPP_USE_NULL || (SIMDPP_USE_ALTIVEC && !SIMDPP_USE_VSX_207)
    return detail::null::move_n_l<shift>(a);
#else
    return (uint64<2>) i_move16_l<shift*8>(uint8<16>(a));
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned shift> SIMDPP_INL
uint64<4> i_move2_l(const uint64<4>& a)
{
    static_assert(shift <= 2, "Selector out of range");
    return _mm256_srli_si256(a.native(), shift*8);
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned shift> SIMDPP_INL
uint64<8> i_move2_l(const uint64<8>& a)
{
    static_assert(shift <= 2, "Selector out of range");
    return (uint64<8>) i_move4_l<shift*2>(uint32<16>(a));
}
#endif

template<unsigned shift, unsigned N> SIMDPP_INL
uint64<N> i_move2_l(const uint64<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL1(uint64<N>, i_move2_l<shift>, a);
}

// -----------------------------------------------------------------------------

template<unsigned shift> SIMDPP_INL
float32<4> i_move4_l(const float32<4>& a)
{
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    return detail::null::move_n_l<shift>(a);
#else
    return (float32<4>) i_move16_l<shift*4>(uint8<16>(a));
#endif
}

#if SIMDPP_USE_AVX
template<unsigned shift> SIMDPP_INL
float32<8> i_move4_l(const float32<8>& a)
{
    static_assert(shift <= 4, "Selector out of range");
    return (float32<8>) i_move16_l<shift*4>(uint8<32>(a));
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned shift> SIMDPP_INL
float32<16> i_move4_l(const float32<16>& a)
{
    static_assert(shift <= 4, "Selector out of range");
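    // same per-lane shuffle + zero-mask approach as the 32-bit integer case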
    switch (shift) {
    default:
    case 0: return a;
    case 1: return _mm512_maskz_shuffle_ps(0x7777, a.native(), a.native(),
                                           _MM_SHUFFLE(3, 3, 2, 1));
    case 2: return _mm512_maskz_shuffle_ps(0x3333, a.native(), a.native(),
                                           _MM_SHUFFLE(3, 3, 3, 2));
    case 3: return _mm512_maskz_shuffle_ps(0x1111, a.native(), a.native(),
                                           _MM_SHUFFLE(3, 3, 3, 3));
    case 4: return make_zero();
    }
}
#endif

template<unsigned shift, unsigned N> SIMDPP_INL
float32<N> i_move4_l(const float32<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL1(float32<N>, i_move4_l<shift>, a);
}

// -----------------------------------------------------------------------------

template<unsigned shift> SIMDPP_INL
float64<2> i_move2_l(const float64<2>& a)
{
#if SIMDPP_USE_SSE2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
    return (float64<2>) i_move16_l<shift*8>(uint8<16>(a));
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
    return detail::null::move_n_l<shift>(a);
#endif
}

#if SIMDPP_USE_AVX
template<unsigned shift> SIMDPP_INL
float64<4> i_move2_l(const float64<4>& a)
{
    static_assert(shift <= 2, "Selector out of range");
    return (float64<4>) i_move16_l<shift*8>(uint8<32>(a));
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned shift> SIMDPP_INL
float64<8> i_move2_l(const float64<8>& a)
{
    static_assert(shift <= 2, "Selector out of range");
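    // per lane: move the upper double into the lower position and zero the
    // upper one (mask 0x55 keeps only the even-indexed elements)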
    switch (shift) {
    default:
    case 0: return a;
    case 1: return _mm512_maskz_shuffle_pd(0x55, a.native(), a.native(),
                                           SIMDPP_SHUFFLE_MASK_2x2_4(1, 1));
    case 2: return make_zero();
    }
}
#endif

template<unsigned shift, unsigned N> SIMDPP_INL
float64<N> i_move2_l(const float64<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL1(float64<N>, i_move2_l<shift>, a);
}

// -----------------------------------------------------------------------------
// Certain compilers reject moves by zero positions or by the full vector
// width. The templates below offer a workaround.
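// Usage sketch (hypothetical caller): dispatching through the wrapper lets
// the boundary cases compile everywhere:
//   uint32<4> r = i_move4_l_wrapper<count>::run(v); // count may be 0..4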

template<unsigned count>
struct i_move2_l_wrapper {
    template<class V>
    static SIMDPP_INL V run(const V& arg) { return i_move2_l<count>(arg); }
};
template<>
struct i_move2_l_wrapper<0> {
    template<class V>
    static SIMDPP_INL V run(const V& arg) { return arg; }
};
template<>
struct i_move2_l_wrapper<2> {
    template<class V>
    static SIMDPP_INL V run(const V&) { return (V) make_zero(); }
};

template<unsigned count>
struct i_move4_l_wrapper {
    template<class V>
    static SIMDPP_INL V run(const V& arg) { return i_move4_l<count>(arg); }
};
template<>
struct i_move4_l_wrapper<0> {
    template<class V>
    static SIMDPP_INL V run(const V& arg) { return arg; }
};
template<>
struct i_move4_l_wrapper<4> {
    template<class V>
    static SIMDPP_INL V run(const V&) { return (V) make_zero(); }
};

template<unsigned count>
struct i_move8_l_wrapper {
    template<class V>
    static SIMDPP_INL V run(const V& arg) { return i_move8_l<count>(arg); }
};
template<>
struct i_move8_l_wrapper<0> {
    template<class V>
    static SIMDPP_INL V run(const V& arg) { return arg; }
};
template<>
struct i_move8_l_wrapper<8> {
    template<class V>
    static SIMDPP_INL V run(const V&) { return (V) make_zero(); }
};

template<unsigned count>
struct i_move16_l_wrapper {
    template<class V>
    static SIMDPP_INL V run(const V& arg) { return i_move16_l<count>(arg); }
};
template<>
struct i_move16_l_wrapper<0> {
    template<class V>
    static SIMDPP_INL V run(const V& arg) { return arg; }
};
template<>
struct i_move16_l_wrapper<16> {
    template<class V>
    static SIMDPP_INL V run(const V&) { return (V) make_zero(); }
};
} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif