1 /* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6 */
7
8 #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_SHIFT_L_H
9 #define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_SHIFT_L_H
10
11 #ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13 #endif
14
15 #include <simdpp/types.h>
16 #include <simdpp/detail/not_implemented.h>
17 #include <simdpp/core/bit_and.h>
18 #include <simdpp/core/bit_andnot.h>
19 #include <simdpp/core/set_splat.h>
20 #include <simdpp/detail/insn/i_shift.h>
21 #include <simdpp/detail/null/math.h>
22 #include <simdpp/detail/vector_array_macros.h>
23
24 namespace simdpp {
25 namespace SIMDPP_ARCH_NAMESPACE {
26 namespace detail {
27 namespace insn {
28
29
30 static SIMDPP_INL
i_shift_l(const uint8x16 & a,unsigned count)31 uint8x16 i_shift_l(const uint8x16& a, unsigned count)
32 {
33 #if SIMDPP_USE_NULL
34 return detail::null::shift_l(a, count);
35 #elif SIMDPP_USE_AVX2
36 uint16x8 mask, a16;
37 uint16_t mask1 = (0x00ff >> (8-count)) << 8;
38
39 a16 = a;
40 mask = splat(mask1);
41 a16 = shift_l(a16, count);
42 a16 = bit_andnot(a16, mask);
43 return uint8x16(a16);
44 #elif SIMDPP_USE_SSE2
45 uint16x8 mask, a16;
46 mask = make_ones();
47 mask = shift_r(mask, 16-count);
48 mask = shift_l(mask, 8);
49
50 a16 = a;
51 a16 = shift_l(a16, count);
52 a16 = bit_andnot(a16, mask);
53 return uint8x16(a16);
54 #elif SIMDPP_USE_NEON
55 int8x16 shift = splat(count);
56 return vshlq_u8(a.native(), shift.native());
57 #elif SIMDPP_USE_ALTIVEC
58 uint8x16 shift = splat(count);
59 return vec_sl(a.native(), shift.native());
60 #elif SIMDPP_USE_MSA
61 int8x16 shift = splat(count);
62 return (v16u8) __msa_sll_b((v16i8)a.native(), shift.native());
63 #endif
64 }
65
#if SIMDPP_USE_AVX2
// Shifts each 8-bit element of the 256-bit vector @a a left by @a count bits;
// @a count is a run-time value.
//
// The data is shifted as 16-bit lanes and the bits that moved from the low
// byte of each lane into its high byte are cleared afterwards.
static SIMDPP_INL
uint8x32 i_shift_l(const uint8x32& a, unsigned count)
{
    // Number of high-byte bits polluted by the low byte. Clamped to 8 so that
    // `8 - spill_bits` cannot wrap around (count is unsigned), which would
    // make the scalar shift below undefined for count > 8. Counts in 0..8
    // produce exactly the same mask as before.
    const unsigned spill_bits = count < 8 ? count : 8;
    const uint16_t spill_mask =
        static_cast<uint16_t>((0x00ff >> (8 - spill_bits)) << 8);

    uint16x16 mask, wide;
    wide = a;
    mask = splat(spill_mask);
    wide = shift_l(wide, count);
    wide = bit_andnot(wide, mask);  // remove the bits selected by the mask
    return uint8<32>(wide);
}
#endif
80
#if SIMDPP_USE_AVX512BW
// Shifts each 8-bit element of the 512-bit vector @a a left by @a count bits;
// @a count is a run-time value.
//
// The data is shifted as 16-bit lanes and the bits that moved from the low
// byte of each lane into its high byte are cleared afterwards.
SIMDPP_INL uint8<64> i_shift_l(const uint8<64>& a, unsigned count)
{
    // Number of high-byte bits polluted by the low byte. Clamped to 8 so that
    // `8 - spill_bits` cannot wrap around (count is unsigned), which would
    // make the scalar shift below undefined for count > 8. Counts in 0..8
    // produce exactly the same mask as before.
    const unsigned spill_bits = count < 8 ? count : 8;
    const uint16_t spill_mask =
        static_cast<uint16_t>((0x00ff >> (8 - spill_bits)) << 8);

    uint16<32> mask, wide;
    wide = a;
    mask = splat(spill_mask);
    wide = shift_l(wide, count);
    wide = bit_andnot(wide, mask);  // remove the bits selected by the mask
    return uint8<64>(wide);
}
#endif
94
95 // -----------------------------------------------------------------------------
96
97 static SIMDPP_INL
i_shift_l(const uint16x8 & a,unsigned count)98 uint16x8 i_shift_l(const uint16x8& a, unsigned count)
99 {
100 #if SIMDPP_USE_NULL
101 return detail::null::shift_l(a, count);
102 #elif SIMDPP_USE_SSE2
103 return _mm_sll_epi16(a.native(), _mm_cvtsi32_si128(count));
104 #elif SIMDPP_USE_NEON
105 int16x8 shift = splat(count);
106 return vshlq_u16(a.native(), shift.native());
107 #elif SIMDPP_USE_ALTIVEC
108 uint16x8 shift = splat(count);
109 return vec_sl(a.native(), shift.native());
110 #elif SIMDPP_USE_MSA
111 int16x8 shift = splat(count);
112 return (v8u16) __msa_sll_h((v8i16) a.native(), shift.native());
113 #endif
114 }
115
#if SIMDPP_USE_AVX2
// Shifts each 16-bit element of the 256-bit vector @a a left by @a count
// bits; @a count is a run-time value.
static SIMDPP_INL
uint16x16 i_shift_l(const uint16x16& a, unsigned count)
{
    // The shift amount is handed to the instruction in a 128-bit register.
    const __m128i amount = _mm_cvtsi32_si128(count);
#if SIMDPP_WORKAROUND_AVX2_SHIFT_INTRINSICS
    // Emit the instruction directly when the intrinsic must be avoided.
    __m256i result = a.native();
    __asm("vpsllw %1, %2, %0" : "=x"(result) : "x"(amount), "x"(result));
    return result;
#else
    return _mm256_sll_epi16(a.native(), amount);
#endif
}
#endif
130
#if SIMDPP_USE_AVX512BW
// Shifts each 16-bit element of the 512-bit vector @a a left by @a count
// bits; @a count is a run-time value.
SIMDPP_INL uint16<32> i_shift_l(const uint16<32>& a, unsigned count)
{
    // The shift amount is handed to the instruction in a 128-bit register.
    const __m128i amount = _mm_cvtsi32_si128(count);
    return _mm512_sll_epi16(a.native(), amount);
}
#endif
137
138 // -----------------------------------------------------------------------------
139
140 static SIMDPP_INL
i_shift_l(const uint32x4 & a,unsigned count)141 uint32x4 i_shift_l(const uint32x4& a, unsigned count)
142 {
143 #if SIMDPP_USE_NULL
144 return detail::null::shift_l(a, count);
145 #elif SIMDPP_USE_SSE2
146 return _mm_sll_epi32(a.native(), _mm_cvtsi32_si128(count));
147 #elif SIMDPP_USE_NEON
148 int32x4 shift = splat(count);
149 return vshlq_u32(a.native(), shift.native());
150 #elif SIMDPP_USE_ALTIVEC
151 uint32x4 shift = splat(count);
152 return vec_sl(a.native(), shift.native());
153 #elif SIMDPP_USE_MSA
154 int32x4 shift = splat(count);
155 return (v4u32) __msa_sll_w((v4i32) a.native(), shift.native());
156 #endif
157 }
158
#if SIMDPP_USE_AVX2
// Shifts each 32-bit element of the 256-bit vector @a a left by @a count
// bits; @a count is a run-time value.
static SIMDPP_INL
uint32x8 i_shift_l(const uint32x8& a, unsigned count)
{
    // The shift amount is handed to the instruction in a 128-bit register.
    const __m128i amount = _mm_cvtsi32_si128(count);
#if SIMDPP_WORKAROUND_AVX2_SHIFT_INTRINSICS
    // Emit the instruction directly when the intrinsic must be avoided.
    __m256i result = a.native();
    __asm("vpslld %1, %2, %0" : "=x"(result) : "x"(amount), "x"(result));
    return result;
#else
    return _mm256_sll_epi32(a.native(), amount);
#endif
}
#endif
173
#if SIMDPP_USE_AVX512F
// Shifts each 32-bit element of the 512-bit vector @a a left by @a count
// bits; @a count is a run-time value.
static SIMDPP_INL
uint32<16> i_shift_l(const uint32<16>& a, unsigned count)
{
    // The shift amount is handed to the instruction in a 128-bit register.
    const __m128i amount = _mm_cvtsi32_si128(count);
    return _mm512_sll_epi32(a.native(), amount);
}
#endif
181
182 // -----------------------------------------------------------------------------
183
184 static SIMDPP_INL
i_shift_l(const uint64x2 & a,unsigned count)185 uint64x2 i_shift_l(const uint64x2& a, unsigned count)
186 {
187 #if SIMDPP_USE_SSE2
188 return _mm_sll_epi64(a.native(), _mm_cvtsi32_si128(count));
189 #elif SIMDPP_USE_NEON
190 int64x2 shift = splat(count);
191 return vshlq_u64(a.native(), shift.native());
192 #elif SIMDPP_USE_VSX_207
193 uint64x2 shift = splat(count);
194 return vec_sl(a.native(), shift.native());
195 #elif SIMDPP_USE_MSA
196 int32x4 shift = splat(count);
197 return (v2u64) __msa_sll_d((v2i64) a.native(), (v2i64) shift.native());
198 #elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
199 return detail::null::shift_l(a, count);
200 #endif
201 }
202
#if SIMDPP_USE_AVX2
// Shifts each 64-bit element of the 256-bit vector @a a left by @a count
// bits; @a count is a run-time value.
static SIMDPP_INL
uint64x4 i_shift_l(const uint64x4& a, unsigned count)
{
    // The shift amount is handed to the instruction in a 128-bit register.
    const __m128i amount = _mm_cvtsi32_si128(count);
#if SIMDPP_WORKAROUND_AVX2_SHIFT_INTRINSICS
    // Emit the instruction directly when the intrinsic must be avoided.
    __m256i result = a.native();
    __asm("vpsllq %1, %2, %0" : "=x"(result) : "x"(amount), "x"(result));
    return result;
#else
    return _mm256_sll_epi64(a.native(), amount);
#endif
}
#endif
217
#if SIMDPP_USE_AVX512F
// Shifts each 64-bit element of the 512-bit vector @a a left by @a count
// bits; @a count is a run-time value.
static SIMDPP_INL
uint64<8> i_shift_l(const uint64<8>& a, unsigned count)
{
    // The shift amount is handed to the instruction in a 128-bit register.
    const __m128i amount = _mm_cvtsi32_si128(count);
    return _mm512_sll_epi64(a.native(), amount);
}
#endif
225
226 // -----------------------------------------------------------------------------
227
228 template<class V> SIMDPP_INL
i_shift_l(const V & a,unsigned count)229 V i_shift_l(const V& a, unsigned count)
230 {
231 SIMDPP_VEC_ARRAY_IMPL2S(V, i_shift_l, a, count);
232 }
233
234 // -----------------------------------------------------------------------------
235
236 template<unsigned count, unsigned N> SIMDPP_INL
sse_shift_l_8(const uint8<N> & a)237 uint8<N> sse_shift_l_8(const uint8<N>& a)
238 {
239 uint8_t mask1 = 0xff >> count;
240 uint8<N> mask = make_uint(mask1);
241
242 uint16<N/2> a16 = (uint16<N/2>) bit_and(a, mask);
243 a16 = shift_l<count>(a16);
244
245 return uint8<N>(a16);
246 }
247
248 template<unsigned count> SIMDPP_INL
i_shift_l(const uint8x16 & a)249 uint8x16 i_shift_l(const uint8x16& a)
250 {
251 static_assert(count < 8, "Shift out of bounds");
252 #if SIMDPP_USE_NULL
253 return i_shift_l(a, count);
254 #elif SIMDPP_USE_SSE2
255 return sse_shift_l_8<count>(a);
256 #elif SIMDPP_USE_NEON
257 return vshlq_n_u8(a.native(), count);
258 #elif SIMDPP_USE_ALTIVEC
259 uint8x16 shift = make_uint(count);
260 return vec_sl(a.native(), shift.native());
261 #elif SIMDPP_USE_MSA
262 return (v16u8) __msa_slli_b((v16i8) a.native(), count);
263 #endif
264 }
265
#if SIMDPP_USE_AVX2
// Shifts each 8-bit element of the 256-bit vector @a a left by the
// compile-time constant @a count, which must be less than 8.
template<unsigned count> SIMDPP_INL
uint8<32> i_shift_l(const uint8<32>& a)
{
    static_assert(count < 8, "Shift out of bounds");
    // Delegates to the helper that builds the shift from 16-bit lanes.
    return sse_shift_l_8<count, 32>(a);
}
#endif
274
#if SIMDPP_USE_AVX512BW
// Shifts each 8-bit element of the 512-bit vector @a a left by the
// compile-time constant @a count, which must be less than 8.
template<unsigned count> SIMDPP_INL
uint8<64> i_shift_l(const uint8<64>& a)
{
    static_assert(count < 8, "Shift out of bounds");
    // Delegates to the helper that builds the shift from 16-bit lanes.
    return sse_shift_l_8<count, 64>(a);
}
#endif
283
284 // -----------------------------------------------------------------------------
285
286 template<unsigned count> SIMDPP_INL
i_shift_l(const uint16x8 & a)287 uint16x8 i_shift_l(const uint16x8& a)
288 {
289 static_assert(count < 16, "Shift out of bounds");
290 #if SIMDPP_USE_NULL
291 return i_shift_l(a, count);
292 #elif SIMDPP_USE_SSE2
293 return _mm_slli_epi16(a.native(), count);
294 #elif SIMDPP_USE_NEON
295 return vshlq_n_u16(a.native(), count);
296 #elif SIMDPP_USE_ALTIVEC
297 uint16x8 shift = make_uint(count);
298 return vec_sl(a.native(), shift.native());
299 #elif SIMDPP_USE_MSA
300 return (v8u16) __msa_slli_h((v8i16) a.native(), count);
301 #endif
302 }
303
#if SIMDPP_USE_AVX2
// Shifts each 16-bit element of the 256-bit vector @a a left by the
// compile-time constant @a count, which must be less than 16.
template<unsigned count> SIMDPP_INL
uint16x16 i_shift_l(const uint16x16& a)
{
    static_assert(count < 16, "Shift out of bounds");
    const __m256i shifted = _mm256_slli_epi16(a.native(), count);
    return shifted;
}
#endif
312
#if SIMDPP_USE_AVX512BW
// Shifts each 16-bit element of the 512-bit vector @a a left by the
// compile-time constant @a count, which must be less than 16.
template<unsigned count> SIMDPP_INL
uint16<32> i_shift_l(const uint16<32>& a)
{
    static_assert(count < 16, "Shift out of bounds");
    const __m512i shifted = _mm512_slli_epi16(a.native(), count);
    return shifted;
}
#endif
321
322 // -----------------------------------------------------------------------------
323
324 template<unsigned count> SIMDPP_INL
i_shift_l(const uint32x4 & a)325 uint32x4 i_shift_l(const uint32x4& a)
326 {
327 static_assert(count < 32, "Shift out of bounds");
328 #if SIMDPP_USE_NULL
329 return i_shift_l(a, count);
330 #elif SIMDPP_USE_SSE2
331 return _mm_slli_epi32(a.native(), count);
332 #elif SIMDPP_USE_NEON
333 return vshlq_n_u32(a.native(), count);
334 #elif SIMDPP_USE_ALTIVEC
335 uint32x4 shift = make_uint(count);
336 return vec_sl(a.native(), shift.native());
337 #elif SIMDPP_USE_MSA
338 return (v4u32) __msa_slli_w((v4i32) a.native(), count);
339 #endif
340 }
341
#if SIMDPP_USE_AVX2
// Shifts each 32-bit element of the 256-bit vector @a a left by the
// compile-time constant @a count, which must be less than 32.
template<unsigned count> SIMDPP_INL
uint32x8 i_shift_l(const uint32x8& a)
{
    static_assert(count < 32, "Shift out of bounds");
    const __m256i shifted = _mm256_slli_epi32(a.native(), count);
    return shifted;
}
#endif
350
#if SIMDPP_USE_AVX512F
// Shifts each 32-bit element of the 512-bit vector @a a left by the
// compile-time constant @a count, which must be less than 32.
template<unsigned count> SIMDPP_INL
uint32<16> i_shift_l(const uint32<16>& a)
{
    static_assert(count < 32, "Shift out of bounds");
    const __m512i shifted = _mm512_slli_epi32(a.native(), count);
    return shifted;
}
#endif
359
360 // -----------------------------------------------------------------------------
361
362 template<unsigned count> SIMDPP_INL
i_shift_l(const uint64x2 & a)363 uint64x2 i_shift_l(const uint64x2& a)
364 {
365 static_assert(count < 64, "Shift out of bounds");
366 #if SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
367 return i_shift_l(a, count);
368 #elif SIMDPP_USE_SSE2
369 return _mm_slli_epi64(a.native(), count);
370 #elif SIMDPP_USE_NEON
371 return vshlq_n_u64(a.native(), count);
372 #elif SIMDPP_USE_MSA
373 return (v2u64) __msa_slli_d((v2i64) a.native(), count);
374 #else
375 return SIMDPP_NOT_IMPLEMENTED1(a);
376 #endif
377 }
378
#if SIMDPP_USE_AVX2
// Shifts each 64-bit element of the 256-bit vector @a a left by the
// compile-time constant @a count, which must be less than 64.
template<unsigned count> SIMDPP_INL
uint64x4 i_shift_l(const uint64x4& a)
{
    static_assert(count < 64, "Shift out of bounds");
    const __m256i shifted = _mm256_slli_epi64(a.native(), count);
    return shifted;
}
#endif
387
#if SIMDPP_USE_AVX512F
// Shifts each 64-bit element of the 512-bit vector @a a left by the
// compile-time constant @a count, which must be less than 64.
template<unsigned count> SIMDPP_INL
uint64<8> i_shift_l(const uint64<8>& a)
{
    static_assert(count < 64, "Shift out of bounds");
    const __m512i shifted = _mm512_slli_epi64(a.native(), count);
    return shifted;
}
#endif
396
397 // -----------------------------------------------------------------------------
398
399 template<unsigned count, class V> SIMDPP_INL
i_shift_l(const V & a)400 V i_shift_l(const V& a)
401 {
402 SIMDPP_VEC_ARRAY_IMPL1(V, i_shift_l<count>, a);
403 }
404
405 template<bool no_shift>
406 struct i_shift_l_wrapper {
407 template<unsigned count, class V>
runi_shift_l_wrapper408 static SIMDPP_INL V run(const V& arg) { return i_shift_l<count>(arg); }
409 };
410 template<>
411 struct i_shift_l_wrapper<true> {
412 template<unsigned count, class V>
413 static SIMDPP_INL V run(const V& arg) { return arg; }
414 };
415
416 } // namespace insn
417 } // namespace detail
418 } // namespace SIMDPP_ARCH_NAMESPACE
419 } // namespace simdpp
420
421 #endif
422
423