/*  Copyright (C) 2017  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_CORE_I_SHIFT_L_V_H
#define LIBSIMDPP_SIMDPP_CORE_I_SHIFT_L_V_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/null/math.h>
#include <simdpp/detail/insn/i_shift.h>
#include <simdpp/detail/shuffle/shuffle_mask.h>
#include <simdpp/core/i_mul.h>
#include <simdpp/core/permute_bytes16.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

// emulates 8-bit variable shift using 16-bit variable shift
template<class U8> SIMDPP_INL
U8 v_emul_shift_l_v8_using_v16(const U8& a, const U8& count)
{
    using U16 = typename same_width<U8>::u16;

    U16 a16; a16 = a;
    U16 c16; c16 = count;

    U16 select_mask = make_uint(0xff00);
    U16 a_lo = a16;
    U16 a_hi = bit_and(a16, select_mask);
    U16 c_lo = bit_andnot(c16, select_mask);
    U16 c_hi = shift_r<8>(c16);
    a_lo = shift_l(a_lo, c_lo);
    a_hi = shift_l(a_hi, c_hi);
    a_lo = bit_andnot(a_lo, select_mask);

    a16 = bit_or(a_lo, a_hi);
    return (U8) a16;
}

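// A scalar model of the lane arithmetic above (an illustrative sketch,
// assuming shift counts in the range 0..7; the helper is hypothetical and
// not part of the library). Each 16-bit lane holds two 8-bit elements: the
// low element is shifted first and masked afterwards, while the high
// element is masked first so that bits shifted out of the low element
// cannot leak into it.
static SIMDPP_INL unsigned ref_shift_l_v8_lane(unsigned a16, unsigned c16)
{
    unsigned lo = (a16 << (c16 & 0x00ff)) & 0x00ff;        // low element
    unsigned hi = ((a16 & 0xff00) << (c16 >> 8)) & 0xff00; // high element
    return lo | hi;
}
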
// emulates 8-bit variable shift using permute_bytes16 and 16-bit multiplication
template<class U8> SIMDPP_INL
U8 v_emul_shift_l_v8_using_mul(const U8& a, const U8& count)
{
    using U16 = typename same_width<U8>::u16;

    // Variable shift is implemented by obtaining 1 << countN for each element
    // and then multiplying each element of a by that number. The
    // implementation is complicated by the fact that only 16-bit
    // multiplication is available.
    U8 mulshift_mask = make_uint(0x01, 0x02, 0x04, 0x08,
                                 0x10, 0x20, 0x40, 0x80);
    U16 mulshift = (U16) permute_bytes16(mulshift_mask, count);

    U16 a16; a16 = a;
    U16 a16_lo, a16_hi, mulshift_lo, mulshift_hi;
    U16 select_mask = make_uint(0x00ff);

    // Move the element values to the high byte of the 16-bit elements and
    // the shift values to the low byte. The results will have the low byte
    // clear, which helps to compose the result back into a single vector.
    a16_lo = shift_l<8>(a16);
    mulshift_lo = bit_and(mulshift, select_mask);
    a16_hi = bit_andnot(a16, select_mask);
    mulshift_hi = shift_r<8>(mulshift);

    a16_lo = mul_lo(a16_lo, mulshift_lo);
    a16_hi = mul_lo(a16_hi, mulshift_hi);

    a16_lo = shift_r<8>(a16_lo);
    a16 = bit_or(a16_lo, a16_hi);
    return (U8) a16;
}

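// A scalar model of the multiply-based emulation above (an illustrative
// sketch, assuming shift counts in the range 0..7; the helper is
// hypothetical). permute_bytes16 performs the 1 << count table lookup, and
// because mul_lo keeps only the low 16 bits of the product, pre-shifting
// the element into the high byte makes that byte carry exactly the
// truncated 8-bit result.
static SIMDPP_INL unsigned ref_shift_l_v8_mul(unsigned a8, unsigned c8)
{
    unsigned mulshift = 1u << c8;                       // the table lookup
    unsigned product = ((a8 << 8) * mulshift) & 0xffff; // mul_lo, 16-bit lane
    return product >> 8;                                // result in low byte
}
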
static SIMDPP_INL
uint8<16> i_shift_l_v(const uint8<16>& a, const uint8<16>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_l_v(a, count);
#elif SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return v_emul_shift_l_v8_using_v16(a, count);
#elif SIMDPP_USE_SSSE3
    return v_emul_shift_l_v8_using_mul(a, count);
#elif SIMDPP_USE_NEON
    return vshlq_u8(a.native(), vreinterpretq_s8_u8(count.native()));
#elif SIMDPP_USE_ALTIVEC
    return vec_sl(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_sll_b((v16i8)a.native(), (v16i8)count.native());
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, count);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint8<32> i_shift_l_v(const uint8<32>& a, const uint8<32>& count)
{
#if SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return v_emul_shift_l_v8_using_v16(a, count);
#else
    return v_emul_shift_l_v8_using_mul(a, count);
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
uint8<64> i_shift_l_v(const uint8<64>& a, const uint8<64>& count)
{
    return v_emul_shift_l_v8_using_v16(a, count);
}
#endif

// -----------------------------------------------------------------------------

// emulates 16-bit variable shift using permute_bytes16 and 16-bit multiplication
template<class U16> SIMDPP_INL
U16 v_emul_shift_l_v16_using_mul(const U16& a, const U16& count)
{
    using U8 = typename same_width<U16>::u8;

    // Variable shift is implemented by obtaining 1 << countN for each element
    // and then multiplying each element of a by that number. The
    // implementation is complicated by the fact that permute_bytes16 permutes
    // 8-bit elements instead of the 16-bit elements that would be optimal in
    // this case.
    U8 mulshift_mask = make_uint(0x01, 0x02, 0x04, 0x08,
                                 0x10, 0x20, 0x40, 0x80,
                                 0x00, 0x00, 0x00, 0x00,
                                 0x00, 0x00, 0x00, 0x00);
    U16 qcount = bit_or(count, shift_l<8>(count));

    // Toggle bit 3 of the count copy in the high byte, so that the high byte
    // selects zeros from the mulshift mask when the shift count is less than
    // 8, and a valid 1 << (count - 8) entry when it is 8 or larger. The low
    // byte naturally selects zeros for counts of 8 or larger.
    qcount = bit_xor(qcount, 0x0800);
    U16 mulshift = (U16) permute_bytes16(mulshift_mask, (U8) qcount);
    return mul_lo(a, mulshift);
}

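// A scalar model of the count decomposition above (an illustrative sketch,
// assuming shift counts in the range 0..15; the helper is hypothetical).
// The low byte of the count indexes the low byte of 1 << count and the
// toggled high copy indexes its high byte; for any valid count exactly one
// of the two lookups hits a non-zero table entry.
static SIMDPP_INL unsigned ref_shift_l_v16(unsigned a16, unsigned c16)
{
    static const unsigned char table[16] = {
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
    };
    unsigned lo = table[c16 & 0x0f];          // non-zero for counts 0..7
    unsigned hi = table[(c16 ^ 0x08) & 0x0f]; // non-zero for counts 8..15
    unsigned mulshift = lo | (hi << 8);       // equals 1 << c16
    return (a16 * mulshift) & 0xffff;         // mul_lo on a 16-bit lane
}
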
static SIMDPP_INL
uint16<8> i_shift_l_v(const uint16<8>& a, const uint16<8>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_l_v(a, count);
#elif SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return _mm_sllv_epi16(a.native(), count.native());
#elif SIMDPP_USE_SSSE3
    return v_emul_shift_l_v16_using_mul(a, count);
#elif SIMDPP_USE_NEON
    return vshlq_u16(a.native(), vreinterpretq_s16_u16(count.native()));
#elif SIMDPP_USE_ALTIVEC
    return vec_sl(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_sll_h((v8i16)a.native(), (v8i16)count.native());
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, count);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint16<16> i_shift_l_v(const uint16<16>& a, const uint16<16>& count)
{
#if SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return _mm256_sllv_epi16(a.native(), count.native());
#else
    return v_emul_shift_l_v16_using_mul(a, count);
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint16<32> i_shift_l_v(const uint16<32>& a, const uint16<32>& count)
{
    return _mm512_sllv_epi16(a.native(), count.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint32<4> i_shift_l_v(const uint32<4>& a, const uint32<4>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_l_v(a, count);
#elif SIMDPP_USE_AVX2
    return _mm_sllv_epi32(a.native(), count.native());
#elif SIMDPP_USE_SSE2
    // There is no per-element variable shift before AVX2. _mm_sll_epi32
    // shifts all elements by the count stored in the low 64 bits of its
    // second operand, so isolate each element's count there, perform four
    // shifts and blend the results together.
    uint32<4> count0 = count;
#if SIMDPP_USE_SSE4_1
    uint32<4> zero = make_zero();
    count0 = _mm_blend_epi16(count0.native(), zero.native(), 0xcc);
#else
    uint32<4> mask = make_uint(0xffffffff, 0, 0xffffffff, 0);
    count0 = bit_and(count0, mask);
#endif
    // count0 has elements 1 and 3 zeroed, so its low 64 bits hold the count
    // of element 0. The shifts below move the counts of elements 1, 2 and 3
    // into the low 64 bits of count1, count2 and count3 respectively.
    uint32<4> count1 = _mm_srli_epi64(count.native(), 32);
    uint32<4> count2 = _mm_srli_si128(count0.native(), 8);
    uint32<4> count3 = _mm_srli_si128(count.native(), 12);

    __m128i a0 = _mm_sll_epi32(a.native(), count0.native());
    __m128i a1 = _mm_sll_epi32(a.native(), count1.native());
    __m128i a2 = _mm_sll_epi32(a.native(), count2.native());
    __m128i a3 = _mm_sll_epi32(a.native(), count3.native());
#if SIMDPP_USE_SSE4_1
    // select element 0 from a0, 1 from a1, 2 from a2 and 3 from a3
    a0 = _mm_blend_epi16(a0, a1, 0x0c);
    a2 = _mm_blend_epi16(a2, a3, 0xc0);
    a0 = _mm_blend_epi16(a0, a2, 0xf0);
#else
    __m128 f0 = _mm_shuffle_ps(_mm_castsi128_ps(a0),
                               _mm_castsi128_ps(a1),
                               SIMDPP_SHUFFLE_MASK_4x4(0, 0, 1, 1));
    __m128 f1 = _mm_shuffle_ps(_mm_castsi128_ps(a2),
                               _mm_castsi128_ps(a3),
                               SIMDPP_SHUFFLE_MASK_4x4(2, 2, 3, 3));
    f0 = _mm_shuffle_ps(f0, f1, SIMDPP_SHUFFLE_MASK_4x4(0, 2, 0, 2));
    a0 = _mm_castps_si128(f0);
#endif
    return a0;
#elif SIMDPP_USE_NEON
    return vshlq_u32(a.native(), vreinterpretq_s32_u32(count.native()));
#elif SIMDPP_USE_ALTIVEC
    return vec_sl(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_sll_w((v4i32)a.native(), (v4i32)count.native());
#endif
}

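// For reference, a scalar model of the per-element operation that each
// backend above implements (an illustrative sketch; shift counts are
// assumed to be smaller than the element width, and the helper is
// hypothetical).
static SIMDPP_INL uint32_t ref_shift_l_u32(uint32_t a, uint32_t count)
{
    return a << count; // each element is shifted by its own count
}
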
#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32<8> i_shift_l_v(const uint32<8>& a, const uint32<8>& count)
{
    return _mm256_sllv_epi32(a.native(), count.native());
}
#endif

#if SIMDPP_USE_AVX512F
SIMDPP_INL uint32<16> i_shift_l_v(const uint32<16>& a, const uint32<16>& count)
{
    return _mm512_sllv_epi32(a.native(), count.native());
}
#endif

// -----------------------------------------------------------------------------

template<class V, class U> SIMDPP_INL
V i_shift_l_v(const V& a, const U& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(V, i_shift_l_v, a, b);
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif