/*  Copyright (C) 2011-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_SHIFT_L_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_SHIFT_L_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/not_implemented.h>
#include <simdpp/core/bit_and.h>
#include <simdpp/core/bit_andnot.h>
#include <simdpp/core/set_splat.h>
#include <simdpp/detail/insn/i_shift.h>
#include <simdpp/detail/null/math.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

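// Runtime-count left shift for 8-bit elements. SSE2/AVX2 have no native
// 8-bit shift, so the vector is treated as 16-bit lanes: the lanes are
// shifted, and the bits that spill from each low byte into the adjacent
// high byte are cleared with a mask. E.g. for count == 3 the mask selects
// bits 8..10 of every 16-bit lane, exactly the bits that crossed the byte
// boundary.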
static SIMDPP_INL
uint8x16 i_shift_l(const uint8x16& a, unsigned count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_l(a, count);
#elif SIMDPP_USE_AVX2
    uint16x8 mask, a16;
    uint16_t mask1 = (0x00ff >> (8-count)) << 8;

    a16 = a;
    mask = splat(mask1);
    a16 = shift_l(a16, count);
    a16 = bit_andnot(a16, mask);
    return uint8x16(a16);
#elif SIMDPP_USE_SSE2
    uint16x8 mask, a16;
    mask = make_ones();
    mask = shift_r(mask, 16-count);
    mask = shift_l(mask, 8);

    a16 = a;
    a16 = shift_l(a16, count);
    a16 = bit_andnot(a16, mask);
    return uint8x16(a16);
#elif SIMDPP_USE_NEON
    int8x16 shift = splat(count);
    return vshlq_u8(a.native(), shift.native());
#elif SIMDPP_USE_ALTIVEC
    uint8x16 shift = splat(count);
    return vec_sl(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    int8x16 shift = splat(count);
    return (v16u8) __msa_sll_b((v16i8)a.native(), shift.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint8x32 i_shift_l(const uint8x32& a, unsigned count)
{
    uint16x16 mask, a16;
    uint16_t mask1 = (0x00ff >> (8-count)) << 8;

    a16 = a;
    mask = splat(mask1);
    a16 = shift_l(a16, count);
    a16 = bit_andnot(a16, mask);
    return uint8<32>(a16);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint8<64> i_shift_l(const uint8<64>& a, unsigned count)
{
    uint16<32> mask, a16;
    uint16_t mask1 = (0x00ff >> (8-count)) << 8;

    a16 = a;
    mask = splat(mask1);
    a16 = shift_l(a16, count);
    a16 = bit_andnot(a16, mask);
    return uint8<64>(a16);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint16x8 i_shift_l(const uint16x8& a, unsigned count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_l(a, count);
#elif SIMDPP_USE_SSE2
    return _mm_sll_epi16(a.native(), _mm_cvtsi32_si128(count));
#elif SIMDPP_USE_NEON
    int16x8 shift = splat(count);
    return vshlq_u16(a.native(), shift.native());
#elif SIMDPP_USE_ALTIVEC
    uint16x8 shift = splat(count);
    return vec_sl(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    int16x8 shift = splat(count);
    return (v8u16) __msa_sll_h((v8i16) a.native(), shift.native());
#endif
}

#if SIMDPP_USE_AVX2
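// When SIMDPP_WORKAROUND_AVX2_SHIFT_INTRINSICS is defined, the shift is
// issued through inline assembly instead of the _mm256_sll_* intrinsics;
// this is presumably a workaround for compilers that mishandle these
// intrinsics when the count comes from a 128-bit register.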
static SIMDPP_INL
uint16x16 i_shift_l(const uint16x16& a, unsigned count)
{
#if SIMDPP_WORKAROUND_AVX2_SHIFT_INTRINSICS
    __m256i r = a.native();
    __m128i x = _mm_cvtsi32_si128(count);
    __asm("vpsllw %1, %2, %0" : "=x"(r) : "x"(x), "x"(r));
    return r;
#else
    return _mm256_sll_epi16(a.native(), _mm_cvtsi32_si128(count));
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint16<32> i_shift_l(const uint16<32>& a, unsigned count)
{
    return _mm512_sll_epi16(a.native(), _mm_cvtsi32_si128(count));
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint32x4 i_shift_l(const uint32x4& a, unsigned count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_l(a, count);
#elif SIMDPP_USE_SSE2
    return _mm_sll_epi32(a.native(), _mm_cvtsi32_si128(count));
#elif SIMDPP_USE_NEON
    int32x4 shift = splat(count);
    return vshlq_u32(a.native(), shift.native());
#elif SIMDPP_USE_ALTIVEC
    uint32x4 shift = splat(count);
    return vec_sl(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    int32x4 shift = splat(count);
    return (v4u32) __msa_sll_w((v4i32) a.native(), shift.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32x8 i_shift_l(const uint32x8& a, unsigned count)
{
#if SIMDPP_WORKAROUND_AVX2_SHIFT_INTRINSICS
    __m256i r = a.native();
    __m128i x = _mm_cvtsi32_si128(count);
    __asm("vpslld %1, %2, %0" : "=x"(r) : "x"(x), "x"(r));
    return r;
#else
    return _mm256_sll_epi32(a.native(), _mm_cvtsi32_si128(count));
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint32<16> i_shift_l(const uint32<16>& a, unsigned count)
{
    return _mm512_sll_epi32(a.native(), _mm_cvtsi32_si128(count));
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint64x2 i_shift_l(const uint64x2& a, unsigned count)
{
#if SIMDPP_USE_SSE2
    return _mm_sll_epi64(a.native(), _mm_cvtsi32_si128(count));
#elif SIMDPP_USE_NEON
    int64x2 shift = splat(count);
    return vshlq_u64(a.native(), shift.native());
#elif SIMDPP_USE_VSX_207
    uint64x2 shift = splat(count);
    return vec_sl(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    int32x4 shift = splat(count);
    return (v2u64) __msa_sll_d((v2i64) a.native(), (v2i64) shift.native());
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    return detail::null::shift_l(a, count);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint64x4 i_shift_l(const uint64x4& a, unsigned count)
{
#if SIMDPP_WORKAROUND_AVX2_SHIFT_INTRINSICS
    __m256i r = a.native();
    __m128i x = _mm_cvtsi32_si128(count);
    __asm("vpsllq %1, %2, %0" : "=x"(r) : "x"(x), "x"(r));
    return r;
#else
    return _mm256_sll_epi64(a.native(), _mm_cvtsi32_si128(count));
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint64<8> i_shift_l(const uint64<8>& a, unsigned count)
{
    return _mm512_sll_epi64(a.native(), _mm_cvtsi32_si128(count));
}
#endif

// -----------------------------------------------------------------------------

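// Vectors wider than the native width: apply the shift to each native
// sub-vector in turn.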
template<class V> SIMDPP_INL
V i_shift_l(const V& a, unsigned count)
{
    SIMDPP_VEC_ARRAY_IMPL2S(V, i_shift_l, a, count);
}

// -----------------------------------------------------------------------------

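// Compile-time-count left shift for 8-bit elements on SSE-family targets.
// The top 'count' bits of every byte are cleared first, so the following
// 16-bit shift cannot carry bits across byte boundaries.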
template<unsigned count, unsigned N> SIMDPP_INL
uint8<N> sse_shift_l_8(const uint8<N>& a)
{
    uint8_t mask1 = 0xff >> count;
    uint8<N> mask = make_uint(mask1);

    uint16<N/2> a16 = (uint16<N/2>) bit_and(a, mask);
    a16 = shift_l<count>(a16);

    return uint8<N>(a16);
}

template<unsigned count> SIMDPP_INL
uint8x16 i_shift_l(const uint8x16& a)
{
    static_assert(count < 8, "Shift out of bounds");
#if SIMDPP_USE_NULL
    return i_shift_l(a, count);
#elif SIMDPP_USE_SSE2
    return sse_shift_l_8<count>(a);
#elif SIMDPP_USE_NEON
    return vshlq_n_u8(a.native(), count);
#elif SIMDPP_USE_ALTIVEC
    uint8x16 shift = make_uint(count);
    return vec_sl(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_slli_b((v16i8) a.native(), count);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned count> SIMDPP_INL
uint8<32> i_shift_l(const uint8<32>& a)
{
    static_assert(count < 8, "Shift out of bounds");
    return sse_shift_l_8<count>(a);
}
#endif

#if SIMDPP_USE_AVX512BW
template<unsigned count> SIMDPP_INL
uint8<64> i_shift_l(const uint8<64>& a)
{
    static_assert(count < 8, "Shift out of bounds");
    return sse_shift_l_8<count>(a);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned count> SIMDPP_INL
uint16x8 i_shift_l(const uint16x8& a)
{
    static_assert(count < 16, "Shift out of bounds");
#if SIMDPP_USE_NULL
    return i_shift_l(a, count);
#elif SIMDPP_USE_SSE2
    return _mm_slli_epi16(a.native(), count);
#elif SIMDPP_USE_NEON
    return vshlq_n_u16(a.native(), count);
#elif SIMDPP_USE_ALTIVEC
    uint16x8 shift = make_uint(count);
    return vec_sl(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_slli_h((v8i16) a.native(), count);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned count> SIMDPP_INL
uint16x16 i_shift_l(const uint16x16& a)
{
    static_assert(count < 16, "Shift out of bounds");
    return _mm256_slli_epi16(a.native(), count);
}
#endif

#if SIMDPP_USE_AVX512BW
template<unsigned count> SIMDPP_INL
uint16<32> i_shift_l(const uint16<32>& a)
{
    static_assert(count < 16, "Shift out of bounds");
    return _mm512_slli_epi16(a.native(), count);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned count> SIMDPP_INL
uint32x4 i_shift_l(const uint32x4& a)
{
    static_assert(count < 32, "Shift out of bounds");
#if SIMDPP_USE_NULL
    return i_shift_l(a, count);
#elif SIMDPP_USE_SSE2
    return _mm_slli_epi32(a.native(), count);
#elif SIMDPP_USE_NEON
    return vshlq_n_u32(a.native(), count);
#elif SIMDPP_USE_ALTIVEC
    uint32x4 shift = make_uint(count);
    return vec_sl(a.native(), shift.native());
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_slli_w((v4i32) a.native(), count);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned count> SIMDPP_INL
uint32x8 i_shift_l(const uint32x8& a)
{
    static_assert(count < 32, "Shift out of bounds");
    return _mm256_slli_epi32(a.native(), count);
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned count> SIMDPP_INL
uint32<16> i_shift_l(const uint32<16>& a)
{
    static_assert(count < 32, "Shift out of bounds");
    return _mm512_slli_epi32(a.native(), count);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned count> SIMDPP_INL
uint64x2 i_shift_l(const uint64x2& a)
{
    static_assert(count < 64, "Shift out of bounds");
#if SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    return i_shift_l(a, count);
#elif SIMDPP_USE_SSE2
    return _mm_slli_epi64(a.native(), count);
#elif SIMDPP_USE_NEON
    return vshlq_n_u64(a.native(), count);
#elif SIMDPP_USE_MSA
    return (v2u64) __msa_slli_d((v2i64) a.native(), count);
#else
    return SIMDPP_NOT_IMPLEMENTED1(a);
#endif
}

#if SIMDPP_USE_AVX2
template<unsigned count> SIMDPP_INL
uint64x4 i_shift_l(const uint64x4& a)
{
    static_assert(count < 64, "Shift out of bounds");
    return _mm256_slli_epi64(a.native(), count);
}
#endif

#if SIMDPP_USE_AVX512F
template<unsigned count> SIMDPP_INL
uint64<8> i_shift_l(const uint64<8>& a)
{
    static_assert(count < 64, "Shift out of bounds");
    return _mm512_slli_epi64(a.native(), count);
}
#endif

// -----------------------------------------------------------------------------

template<unsigned count, class V> SIMDPP_INL
V i_shift_l(const V& a)
{
    SIMDPP_VEC_ARRAY_IMPL1(V, i_shift_l<count>, a);
}

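// Dispatch helper: when no_shift is true (the shift count is zero) the
// argument is returned unchanged, so the shift implementation is never
// instantiated with a zero count.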
template<bool no_shift>
struct i_shift_l_wrapper {
    template<unsigned count, class V>
    static SIMDPP_INL V run(const V& arg) { return i_shift_l<count>(arg); }
};
template<>
struct i_shift_l_wrapper<true> {
    template<unsigned count, class V>
    static SIMDPP_INL V run(const V& arg) { return arg; }
};

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif
