/*  Copyright (C) 2017  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_CORE_I_SHIFT_L_V_H
#define LIBSIMDPP_SIMDPP_CORE_I_SHIFT_L_V_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/null/math.h>
#include <simdpp/detail/insn/i_shift.h>
#include <simdpp/detail/shuffle/shuffle_mask.h>
#include <simdpp/core/i_mul.h>
#include <simdpp/core/permute_bytes16.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

// emulates 8-bit variable shift using 16-bit variable shift
template<class U8> SIMDPP_INL
U8 v_emul_shift_l_v8_using_v16(const U8& a, const U8& count)
{
    using U16 = typename same_width<U8>::u16;

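    // Reinterpret the data as 16-bit lanes; each lane of a16 packs a pair of
    // adjacent 8-bit elements and each lane of c16 packs their shift counts.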
    U16 a16; a16 = a;
    U16 c16; c16 = count;

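    // select_mask keeps the high byte of each 16-bit lane. The two bytes are
    // shifted separately: bits spilling out of the low byte of a_lo land in
    // the high byte, which is masked off below, while bits spilling out of
    // the high byte of a_hi leave the lane entirely.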
    U16 select_mask = make_uint(0xff00);
    U16 a_lo = a16;
    U16 a_hi = bit_and(a16, select_mask);
    U16 c_lo = bit_andnot(c16, select_mask);
    U16 c_hi = shift_r<8>(c16);
    a_lo = shift_l(a_lo, c_lo);
    a_hi = shift_l(a_hi, c_hi);
    a_lo = bit_andnot(a_lo, select_mask);

    a16 = bit_or(a_lo, a_hi);
    return (U8) a16;
}

// emulates 8-bit variable shift using permute_bytes16 and 16-bit multiplication
template<class U8> SIMDPP_INL
U8 v_emul_shift_l_v8_using_mul(const U8& a, const U8& count)
{
    using U16 = typename same_width<U8>::u16;

    // Variable shift is implemented by obtaining 1 << countN for each element
    // of count and then multiplying each element of a by that number. The
    // implementation is complicated by the fact that only 16-bit
    // multiplication is available.
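    // For example, an element with shift count 3 selects 0x08 from the mask
    // below, and multiplying by 8 is equivalent to shifting left by 3.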
    U8 mulshift_mask = make_uint(0x01, 0x02, 0x04, 0x08,
                                 0x10, 0x20, 0x40, 0x80);
    U16 mulshift = (U16) permute_bytes16(mulshift_mask, count);

    U16 a16; a16 = a;
    U16 a16_lo, a16_hi, mulshift_lo, mulshift_hi;
    U16 select_mask = make_uint(0x00ff);

    // Move the element values to the high byte of the 16-bit elements and the
    // shift values to the low byte. The products then have the low byte
    // clear, which helps compose the result back into a single vector.
    a16_lo = shift_l<8>(a16);
    mulshift_lo = bit_and(mulshift, select_mask);
    a16_hi = bit_andnot(a16, select_mask);
    mulshift_hi = shift_r<8>(mulshift);

    a16_lo = mul_lo(a16_lo, mulshift_lo);
    a16_hi = mul_lo(a16_hi, mulshift_hi);

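    // Both products place (element << count) mod 0x100 into the high byte of
    // the lane and leave the low byte clear; a16_lo is shifted back down and
    // the two halves are recombined with bit_or.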
    a16_lo = shift_r<8>(a16_lo);
    a16 = bit_or(a16_lo, a16_hi);
    return (U8) a16;
}

static SIMDPP_INL
uint8<16> i_shift_l_v(const uint8<16>& a, const uint8<16>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_l_v(a, count);
#elif SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return v_emul_shift_l_v8_using_v16(a, count);
#elif SIMDPP_USE_SSSE3
    return v_emul_shift_l_v8_using_mul(a, count);
#elif SIMDPP_USE_NEON
    return vshlq_u8(a.native(), vreinterpretq_s8_u8(count.native()));
#elif SIMDPP_USE_ALTIVEC
    return vec_sl(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_sll_b((v16i8)a.native(), (v16i8)count.native());
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, count);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint8<32> i_shift_l_v(const uint8<32>& a, const uint8<32>& count)
{
#if SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return v_emul_shift_l_v8_using_v16(a, count);
#else
    return v_emul_shift_l_v8_using_mul(a, count);
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
uint8<64> i_shift_l_v(const uint8<64>& a, const uint8<64>& count)
{
    return v_emul_shift_l_v8_using_v16(a, count);
}
#endif

// -----------------------------------------------------------------------------

// emulates 16-bit variable shift using permute_bytes16 and 16-bit multiplication
template<class U16>
U16 v_emul_shift_l_v16_using_mul(const U16& a, const U16& count)
{
    using U8 = typename same_width<U16>::u8;

    // Variable shift is implemented by obtaining 1 << countN for each element
    // of count and then multiplying each element of a by that number. The
    // implementation is complicated by the fact that permute_bytes16 permutes
    // 8-bit elements instead of 16-bit ones, which would be optimal in this
    // case.
    U8 mulshift_mask = make_uint(0x01, 0x02, 0x04, 0x08,
                                 0x10, 0x20, 0x40, 0x80,
                                 0x00, 0x00, 0x00, 0x00,
                                 0x00, 0x00, 0x00, 0x00);
    U16 qcount = bit_or(count, shift_l<8>(count));

    // Toggle bit 3 of the high byte of each lane (mask 0x0800) so that the
    // high byte reads zeros from the mulshift mask when the shift count is
    // less than 8, and 1 << (count - 8) otherwise.
    qcount = bit_xor(qcount, 0x0800);
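    // For example, count = 10 gives a lane value of 0x0a0a ^ 0x0800 = 0x020a,
    // which selects mask bytes (0x04, 0x00), i.e. mulshift = 0x0400 = 1 << 10.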
    U16 mulshift = (U16) permute_bytes16(mulshift_mask, (U8) qcount);
    return mul_lo(a, mulshift);
}

static SIMDPP_INL
uint16<8> i_shift_l_v(const uint16<8>& a, const uint16<8>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_l_v(a, count);
#elif SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return _mm_sllv_epi16(a.native(), count.native());
#elif SIMDPP_USE_SSSE3
    return v_emul_shift_l_v16_using_mul(a, count);
#elif SIMDPP_USE_NEON
    return vshlq_u16(a.native(), vreinterpretq_s16_u16(count.native()));
#elif SIMDPP_USE_ALTIVEC
    return vec_sl(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_sll_h((v8i16)a.native(), (v8i16)count.native());
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, count);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint16<16> i_shift_l_v(const uint16<16>& a, const uint16<16>& count)
{
#if SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return _mm256_sllv_epi16(a.native(), count.native());
#else
    return v_emul_shift_l_v16_using_mul(a, count);
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint16<32> i_shift_l_v(const uint16<32>& a, const uint16<32>& count)
{
    return _mm512_sllv_epi16(a.native(), count.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint32<4> i_shift_l_v(const uint32<4>& a, const uint32<4>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_l_v(a, count);
#elif SIMDPP_USE_AVX2
    return _mm_sllv_epi32(a.native(), count.native());
#elif SIMDPP_USE_SSE2
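    // _mm_sll_epi32 shifts all four lanes by the count held in the low 64
    // bits of its second operand, so each lane's count is isolated in that
    // position, four whole-vector shifts are performed and the matching lane
    // is blended out of each result.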
    uint32<4> count0 = count;
#if SIMDPP_USE_SSE4_1
    uint32<4> zero = make_zero();
    count0 = _mm_blend_epi16(count0.native(), zero.native(), 0xcc);
#else
    uint32<4> mask = make_uint(0xffffffff, 0, 0xffffffff, 0);
    count0 = bit_and(count0, mask);
#endif
    uint32<4> count1 = _mm_srli_epi64(count.native(), 32);
    uint32<4> count2 = _mm_srli_si128(count0.native(), 8);
    uint32<4> count3 = _mm_srli_si128(count.native(), 12);

    __m128i a0 = _mm_sll_epi32(a.native(), count0.native());
    __m128i a1 = _mm_sll_epi32(a.native(), count1.native());
    __m128i a2 = _mm_sll_epi32(a.native(), count2.native());
    __m128i a3 = _mm_sll_epi32(a.native(), count3.native());
#if SIMDPP_USE_SSE4_1
    a0 = _mm_blend_epi16(a0, a1, 0x0c);
    a2 = _mm_blend_epi16(a2, a3, 0xc0);
    a0 = _mm_blend_epi16(a0, a2, 0xf0);
#else
    __m128 f0 = _mm_shuffle_ps(_mm_castsi128_ps(a0),
                               _mm_castsi128_ps(a1),
                               SIMDPP_SHUFFLE_MASK_4x4(0, 0, 1, 1));
    __m128 f1 = _mm_shuffle_ps(_mm_castsi128_ps(a2),
                               _mm_castsi128_ps(a3),
                               SIMDPP_SHUFFLE_MASK_4x4(2, 2, 3, 3));
    f0 = _mm_shuffle_ps(f0, f1, SIMDPP_SHUFFLE_MASK_4x4(0, 2, 0, 2));
    a0 = _mm_castps_si128(f0);
#endif
    return a0;
#elif SIMDPP_USE_NEON
    return vshlq_u32(a.native(), vreinterpretq_s32_u32(count.native()));
#elif SIMDPP_USE_ALTIVEC
    return vec_sl(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_sll_w((v4i32)a.native(), (v4i32)count.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32<8> i_shift_l_v(const uint32<8>& a, const uint32<8>& count)
{
    return _mm256_sllv_epi32(a.native(), count.native());
}
#endif

#if SIMDPP_USE_AVX512F
SIMDPP_INL uint32<16> i_shift_l_v(const uint32<16>& a, const uint32<16>& count)
{
    return _mm512_sllv_epi32(a.native(), count.native());
}
#endif

// -----------------------------------------------------------------------------

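// Generic fallback: processes wider vector types by applying i_shift_l_v to
// each of their sub-vectors.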
template<class V, class U> SIMDPP_INL
V i_shift_l_v(const V& a, const U& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(V, i_shift_l_v, a, b);
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif