/* SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy,
 * modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Copyright:
 *   2020 Evan Nemerson <evan@nemerson.com>
 *   2020 Christopher Moore <moore@free.fr>
 */

#if !defined(SIMDE_ARM_NEON_SHL_H)
#define SIMDE_ARM_NEON_SHL_H

#include "types.h"

/* Notes from the implementer (Christopher Moore aka rosbif)
 *
 * I have tried to exactly reproduce the documented behaviour of the
 * ARM NEON shl and shlq intrinsics.
 * This is complicated for the following reasons:
 *
 * a) Negative shift counts shift right.
 *
 * b) Only the low byte of each shift count is used, even though the
 * count elements themselves are not limited to 8-bit values
 * (-128 to 127).
 *
 * c) Intel SIMD is not nearly as complete as NEON and AltiVec.
 * There were no intrinsics with a vector shift count before AVX2,
 * which only has 32 and 64-bit logical shifts and a 32-bit arithmetic
 * one. The others need AVX512. There are no 8-bit shift intrinsics at
 * all, even with a scalar shift count. It is surprising to have to use
 * AVX2, and even AVX512, to implement a 64-bit vector operation.
 *
 * d) Many shift implementations, and the C standard, do not treat a
 * shift count >= the object's size in bits as one would expect.
 * (Personally I feel that > is silly but == can be useful.)
 *
 * Maybe it would be useful for SIMDe to have a flag enabling a fast
 * implementation where the result is only guaranteed for shift counts
 * conforming to the C standard.
 *
 * Note that even the C17/18 standard does not define the behaviour of
 * a right shift of a negative value.
 * However Evan and I agree that all compilers likely to be used
 * implement this as an arithmetic right shift with sign extension.
 * If this is not the case it could be replaced by a logical right shift
 * if negative values are complemented before and after the shift.
 *
 * Some of the SIMD translations may be slower than the portable code,
 * particularly those for vectors with only one or two elements.
 * But I had fun writing them ;-)
 */
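
/* A minimal usage sketch (illustrative only): per lane, a positive count
 * in the second argument shifts left and a negative count shifts right,
 * as on real NEON hardware. This assumes simde_vdup_n_s8 from the dup_n
 * family is also visible to the caller.
 *
 *   simde_int8x8_t a = simde_vdup_n_s8(64);   // all lanes 64
 *   simde_int8x8_t n = simde_vdup_n_s8(-3);   // shift right by 3
 *   simde_int8x8_t r = simde_vshl_s8(a, n);   // all lanes 64 >> 3 == 8
 */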

HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_

SIMDE_FUNCTION_ATTRIBUTES
simde_int8x8_t
simde_vshl_s8 (const simde_int8x8_t a, const simde_int8x8_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshl_s8(a, b);
  #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
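    /* Widen both operands to 16-bit lanes so variable-count shift
     * intrinsics are available, pick a left shift or an arithmetic right
     * shift per lane from the sign of b, then narrow back to 8 bits.
     * Out-of-range counts (8 or more) come out right once the wide
     * result is narrowed. */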
    __m128i a128 = _mm_cvtepi8_epi16(_mm_movpi64_epi64(a));
    __m128i b128 = _mm_cvtepi8_epi16(_mm_movpi64_epi64(b));
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi16(a128, b128),
                                   _mm_srav_epi16(a128, _mm_abs_epi16(b128)),
                                   _mm_cmpgt_epi16(_mm_setzero_si128(), b128));
    return _mm_movepi64_pi64(_mm_cvtepi16_epi8(r128));
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
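    /* Without AVX512 the narrowest variable shifts are 32-bit, so widen
     * to 32-bit lanes, shift, then gather the low byte of each lane back
     * into a 64-bit MMX result with a byte shuffle and two extracts. */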
    __m256i a256 = _mm256_cvtepi8_epi32(_mm_movpi64_epi64(a));
    __m256i b256 = _mm256_cvtepi8_epi32(_mm_movpi64_epi64(b));
    __m256i r256 = _mm256_blendv_epi8(_mm256_sllv_epi32(a256, b256),
                                      _mm256_srav_epi32(a256, _mm256_abs_epi32(b256)),
                                      _mm256_cmpgt_epi32(_mm256_setzero_si256(), b256));
    r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi32(0x0C080400));
    return _mm_set_pi32(_mm256_extract_epi32(r256, 4), _mm256_extract_epi32(r256, 0));
  #else
    simde_int8x8_private
      r_,
      a_ = simde_int8x8_to_private(a),
      b_ = simde_int8x8_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      r_.values[i] = HEDLEY_STATIC_CAST(int8_t,
        (b_.values[i] >= 0) ?
        (b_.values[i] >= 8) ? 0 : (a_.values[i] << b_.values[i]) :
        (b_.values[i] <= -8) ? (a_.values[i] >> 7) : (a_.values[i] >> -b_.values[i]));
    }

    return simde_int8x8_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshl_s8
  #define vshl_s8(a, b) simde_vshl_s8((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_int16x4_t
simde_vshl_s16 (const simde_int16x4_t a, const simde_int16x4_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshl_s16(a, b);
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
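    /* Widen to 32-bit lanes, then sign-extend the low byte of each count
     * in place (shift left 24, arithmetic shift right 24) because NEON
     * only honours the least significant byte of the count. */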
    __m128i a128 = _mm_cvtepi16_epi32(_mm_movpi64_epi64(a));
    __m128i b128 = _mm_cvtepi16_epi32(_mm_movpi64_epi64(b));
    b128 = _mm_srai_epi32(_mm_slli_epi32(b128, 24), 24);
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi32(a128, b128),
                                   _mm_srav_epi32(a128, _mm_abs_epi32(b128)),
                                   _mm_cmpgt_epi32(_mm_setzero_si128(), b128));
    return _mm_movepi64_pi64(_mm_shuffle_epi8(r128, _mm_set1_epi64x(0x0D0C090805040100)));
  #else
    simde_int16x4_private
      r_,
      a_ = simde_int16x4_to_private(a),
      b_ = simde_int16x4_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] = HEDLEY_STATIC_CAST(int16_t,
        (b_.values[i] >= 0) ?
        (b_.values[i] >= 16) ? 0 : (a_.values[i] << b_.values[i]) :
        (b_.values[i] <= -16) ? (a_.values[i] >> 15) : (a_.values[i] >> -b_.values[i]));
    }

    return simde_int16x4_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshl_s16
  #define vshl_s16(a, b) simde_vshl_s16((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_int32x2_t
simde_vshl_s32 (const simde_int32x2_t a, const simde_int32x2_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshl_s32(a, b);
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    __m128i a128 = _mm_movpi64_epi64(a);
    __m128i b128 = _mm_movpi64_epi64(b);
    b128 = _mm_srai_epi32(_mm_slli_epi32(b128, 24), 24);
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi32(a128, b128),
                                   _mm_srav_epi32(a128, _mm_abs_epi32(b128)),
                                   _mm_cmpgt_epi32(_mm_setzero_si128(), b128));
    return _mm_movepi64_pi64(r128);
  #else
    simde_int32x2_private
      r_,
      a_ = simde_int32x2_to_private(a),
      b_ = simde_int32x2_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] =
        (b_.values[i] >= 0) ?
        (b_.values[i] >= 32) ? 0 : (a_.values[i] << b_.values[i]) :
        (b_.values[i] <= -32) ? (a_.values[i] >> 31) : (a_.values[i] >> -b_.values[i]);
    }

    return simde_int32x2_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshl_s32
  #define vshl_s32(a, b) simde_vshl_s32((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_int64x1_t
simde_vshl_s64 (const simde_int64x1_t a, const simde_int64x1_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshl_s64(a, b);
  #elif defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
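    /* AVX512VL provides the 64-bit arithmetic variable shift directly;
     * sign-extend the low byte of each count (shift by 56 both ways)
     * and select between the left and right shift on the sign of b. */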
    __m128i zero = _mm_setzero_si128();
    __m128i a128 = _mm_movpi64_epi64(a);
    __m128i b128 = _mm_movpi64_epi64(b);
    b128 = _mm_srai_epi64(_mm_slli_epi64(b128, 56), 56);
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi64(a128, b128),
                                   _mm_srav_epi64(a128, _mm_sub_epi64(zero, b128)),
                                   _mm_cmpgt_epi64(zero, b128));
    return _mm_movepi64_pi64(r128);
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
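    /* AVX2 has no 64-bit arithmetic right shift, so emulate it: XOR
     * negative lanes of a with all-ones before and after a logical right
     * shift (~((~a) >> n) == a >> n in two's complement). The count is
     * the absolute value of the low byte of b; its sign, moved to the
     * top bit with a shift by 56, selects between the two results. */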
    __m128i zero = _mm_setzero_si128();
    __m128i a128 = _mm_movpi64_epi64(a);
    __m128i b128 = _mm_movpi64_epi64(b);
    __m128i maska = _mm_cmpgt_epi64(zero, a128);
    __m128i b_abs = _mm_and_si128(_mm_abs_epi8(b128), _mm_set1_epi64x(0xFF));
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi64(a128, b_abs),
                                   _mm_xor_si128(_mm_srlv_epi64(_mm_xor_si128(a128, maska), b_abs), maska),
                                   _mm_cmpgt_epi64(zero, _mm_slli_epi64(b128, 56)));
    return _mm_movepi64_pi64(r128);
  #else
    simde_int64x1_private
      r_,
      a_ = simde_int64x1_to_private(a),
      b_ = simde_int64x1_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] =
        (b_.values[i] >= 0) ?
        (b_.values[i] >= 64) ? 0 : (a_.values[i] << b_.values[i]) :
        (b_.values[i] <= -64) ? (a_.values[i] >> 63) : (a_.values[i] >> -b_.values[i]);
    }

    return simde_int64x1_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshl_s64
  #define vshl_s64(a, b) simde_vshl_s64((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint8x8_t
simde_vshl_u8 (const simde_uint8x8_t a, const simde_int8x8_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshl_u8(a, b);
  #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
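    /* Same widening scheme as the signed variant, but a is zero-extended
     * and negative counts use a logical (not arithmetic) right shift. */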
    __m128i a128 = _mm_cvtepu8_epi16(_mm_movpi64_epi64(a));
    __m128i b128 = _mm_cvtepi8_epi16(_mm_movpi64_epi64(b));
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi16(a128, b128),
                                   _mm_srlv_epi16(a128, _mm_abs_epi16(b128)),
                                   _mm_cmpgt_epi16(_mm_setzero_si128(), b128));
    return _mm_movepi64_pi64(_mm_cvtepi16_epi8(r128));
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    __m256i a256 = _mm256_cvtepu8_epi32(_mm_movpi64_epi64(a));
    __m256i b256 = _mm256_cvtepi8_epi32(_mm_movpi64_epi64(b));
    __m256i r256 = _mm256_blendv_epi8(_mm256_sllv_epi32(a256, b256),
                                      _mm256_srlv_epi32(a256, _mm256_abs_epi32(b256)),
                                      _mm256_cmpgt_epi32(_mm256_setzero_si256(), b256));
    r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi32(0x0C080400));
    return _mm_set_pi32(_mm256_extract_epi32(r256, 4), _mm256_extract_epi32(r256, 0));
  #else
    simde_uint8x8_private
      r_,
      a_ = simde_uint8x8_to_private(a);
    simde_int8x8_private b_ = simde_int8x8_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      r_.values[i] = HEDLEY_STATIC_CAST(uint8_t,
        (abs(b_.values[i]) >= 8) ? 0 :
        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
        (a_.values[i] >> -b_.values[i]));
    }

    return simde_uint8x8_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshl_u8
  #define vshl_u8(a, b) simde_vshl_u8((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint16x4_t
simde_vshl_u16 (const simde_uint16x4_t a, const simde_int16x4_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshl_u16(a, b);
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    __m128i a128 = _mm_cvtepu16_epi32(_mm_movpi64_epi64(a));
    __m128i b128 = _mm_cvtepi16_epi32(_mm_movpi64_epi64(b));
    b128 = _mm_srai_epi32(_mm_slli_epi32(b128, 24), 24);
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi32(a128, b128),
                                   _mm_srlv_epi32(a128, _mm_abs_epi32(b128)),
                                   _mm_cmpgt_epi32(_mm_setzero_si128(), b128));
    return _mm_movepi64_pi64(_mm_shuffle_epi8(r128, _mm_set1_epi64x(0x0D0C090805040100)));
  #else
    simde_uint16x4_private
      r_,
      a_ = simde_uint16x4_to_private(a);
    simde_int16x4_private b_ = simde_int16x4_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] = HEDLEY_STATIC_CAST(uint16_t,
        (abs(b_.values[i]) >= 16) ? 0 :
        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
        (a_.values[i] >> -b_.values[i]));
    }

    return simde_uint16x4_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshl_u16
  #define vshl_u16(a, b) simde_vshl_u16((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x2_t
simde_vshl_u32 (const simde_uint32x2_t a, const simde_int32x2_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshl_u32(a, b);
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    __m128i a128 = _mm_movpi64_epi64(a);
    __m128i b128 = _mm_movpi64_epi64(b);
    b128 = _mm_srai_epi32(_mm_slli_epi32(b128, 24), 24);
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi32(a128, b128),
                                   _mm_srlv_epi32(a128, _mm_abs_epi32(b128)),
                                   _mm_cmpgt_epi32(_mm_setzero_si128(), b128));
    return _mm_movepi64_pi64(r128);
  #else
    simde_uint32x2_private
      r_,
      a_ = simde_uint32x2_to_private(a);
    simde_int32x2_private b_ = simde_int32x2_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] =
        (abs(b_.values[i]) >= 32) ? 0 :
        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
        (a_.values[i] >> -b_.values[i]);
    }

    return simde_uint32x2_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshl_u32
  #define vshl_u32(a, b) simde_vshl_u32((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint64x1_t
simde_vshl_u64 (const simde_uint64x1_t a, const simde_int64x1_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshl_u64(a, b);
  #elif defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    __m128i zero = _mm_setzero_si128();
    __m128i a128 = _mm_movpi64_epi64(a);
    __m128i b128 = _mm_movpi64_epi64(b);
    b128 = _mm_srai_epi64(_mm_slli_epi64(b128, 56), 56);
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi64(a128, b128),
                                   _mm_srlv_epi64(a128, _mm_sub_epi64(zero, b128)),
                                   _mm_cmpgt_epi64(zero, b128));
    return _mm_movepi64_pi64(r128);
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    __m128i a128 = _mm_movpi64_epi64(a);
    __m128i b128 = _mm_movpi64_epi64(b);
    __m128i b_abs = _mm_and_si128(_mm_abs_epi8(b128), _mm_set1_epi64x(0xFF));
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi64(a128, b_abs),
                                   _mm_srlv_epi64(a128, b_abs),
                                   _mm_cmpgt_epi64(_mm_setzero_si128(), _mm_slli_epi64(b128, 56)));
    return _mm_movepi64_pi64(r128);
  #else
    simde_uint64x1_private
      r_,
      a_ = simde_uint64x1_to_private(a);
    simde_int64x1_private b_ = simde_int64x1_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] =
        (llabs(b_.values[i]) >= 64) ? 0 :
        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
        (a_.values[i] >> -b_.values[i]);
    }

    return simde_uint64x1_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshl_u64
  #define vshl_u64(a, b) simde_vshl_u64((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_int8x16_t
simde_vshlq_s8 (const simde_int8x16_t a, const simde_int8x16_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshlq_s8(a, b);
  #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    __m256i a256 = _mm256_cvtepi8_epi16(a);
    __m256i b256 = _mm256_cvtepi8_epi16(b);
    __m256i r256 = _mm256_blendv_epi8(_mm256_sllv_epi16(a256, b256),
                                      _mm256_srav_epi16(a256, _mm256_abs_epi16(b256)),
                                      _mm256_cmpgt_epi16(_mm256_setzero_si256(), b256));
    return _mm256_cvtepi16_epi8(r256);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
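    /* AltiVec vec_sl and vec_sra use the count modulo the element width,
     * so counts of 8 or more must be handled explicitly: zero the left
     * shift result with a mask and clamp the arithmetic right shift
     * count to 7, then select by the sign of b. */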
    vector signed char a_shl, a_shr;
    vector unsigned char b_abs, b_max;
    vector bool char b_mask;
    b_abs = HEDLEY_REINTERPRET_CAST(vector unsigned char, vec_abs(b));
    b_max = vec_splat_u8(7);
    #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
      a_shl = vec_and(vec_sl(a, b_abs), vec_cmple(b_abs, b_max));
    #else
      a_shl = vec_and(vec_sl(a, b_abs), vec_cmplt(b_abs, vec_splat_u8(8)));
    #endif
    a_shr = vec_sra(a, vec_min(b_abs, b_max));
    b_mask = vec_cmplt(b, vec_splat_s8(0));
    return vec_sel(a_shl, a_shr, b_mask);
  #else
    simde_int8x16_private
      r_,
      a_ = simde_int8x16_to_private(a),
      b_ = simde_int8x16_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      r_.values[i] = HEDLEY_STATIC_CAST(int8_t,
        (b_.values[i] >= 0) ?
        (b_.values[i] >= 8) ? 0 : (a_.values[i] << b_.values[i]) :
        (b_.values[i] <= -8) ? (a_.values[i] >> 7) : (a_.values[i] >> -b_.values[i]));
    }

    return simde_int8x16_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshlq_s8
  #define vshlq_s8(a, b) simde_vshlq_s8((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_int16x8_t
simde_vshlq_s16 (const simde_int16x8_t a, const simde_int16x8_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshlq_s16(a, b);
  #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    __m128i b_ = _mm_srai_epi16(_mm_slli_epi16(b, 8), 8);
    return _mm_blendv_epi8(_mm_sllv_epi16(a, b_),
                           _mm_srav_epi16(a, _mm_abs_epi16(b_)),
                           _mm_cmpgt_epi16(_mm_setzero_si128(), b_));
  #elif defined(SIMDE_X86_AVX2_NATIVE)
    __m256i a256 = _mm256_cvtepi16_epi32(a);
    __m256i b256 = _mm256_cvtepi16_epi32(b);
    b256 = _mm256_srai_epi32(_mm256_slli_epi32(b256, 24), 24);
    __m256i r256 = _mm256_blendv_epi8(_mm256_sllv_epi32(a256, b256),
                                      _mm256_srav_epi32(a256, _mm256_abs_epi32(b256)),
                                      _mm256_cmpgt_epi32(_mm256_setzero_si256(), b256));
    r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi64x(0x0D0C090805040100));
    return _mm_set_epi64x(_mm256_extract_epi64(r256, 2), _mm256_extract_epi64(r256, 0));
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
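    /* The per-byte absolute value of b is masked down to the low byte of
     * each halfword to get the count; the sign of that byte is tested by
     * shifting it into the high half of the lane before comparing. */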
    vector signed short a_shl, a_shr;
    vector unsigned short b_abs, b_max;
    vector bool short b_mask;
    b_abs = vec_and(HEDLEY_REINTERPRET_CAST(vector unsigned short,
                                            vec_abs(HEDLEY_REINTERPRET_CAST(vector signed char, b))),
                    vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0xFF)));
    b_max = vec_splat_u16(15);
    #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
      a_shl = vec_and(vec_sl(a, b_abs), vec_cmple(b_abs, b_max));
    #else
      a_shl = vec_and(vec_sl(a, b_abs), vec_cmplt(b_abs, vec_splats(HEDLEY_STATIC_CAST(unsigned short, 16))));
    #endif
    a_shr = vec_sra(a, vec_min(b_abs, b_max));
    b_mask = vec_cmplt(vec_sl(b, vec_splat_u16(8)), vec_splat_s16(0));
    return vec_sel(a_shl, a_shr, b_mask);
  #else
    simde_int16x8_private
      r_,
      a_ = simde_int16x8_to_private(a),
      b_ = simde_int16x8_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] = HEDLEY_STATIC_CAST(int16_t,
        (b_.values[i] >= 0) ?
        (b_.values[i] >= 16) ? 0 : (a_.values[i] << b_.values[i]) :
        (b_.values[i] <= -16) ? (a_.values[i] >> 15) : (a_.values[i] >> -b_.values[i]));
    }

    return simde_int16x8_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshlq_s16
  #define vshlq_s16(a, b) simde_vshlq_s16((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_int32x4_t
simde_vshlq_s32 (const simde_int32x4_t a, const simde_int32x4_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshlq_s32(a, b);
  #elif defined(SIMDE_X86_AVX2_NATIVE)
    __m128i b_ = _mm_srai_epi32(_mm_slli_epi32(b, 24), 24);
    return _mm_blendv_epi8(_mm_sllv_epi32(a, b_),
                           _mm_srav_epi32(a, _mm_abs_epi32(b_)),
                           _mm_cmpgt_epi32(_mm_setzero_si128(), b_));
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    vector signed int a_shl, a_shr;
    vector unsigned int b_abs, b_max;
    vector bool int b_mask;
    b_abs = vec_and(HEDLEY_REINTERPRET_CAST(vector unsigned int,
                                            vec_abs(HEDLEY_REINTERPRET_CAST(vector signed char, b))),
                    vec_splats(HEDLEY_STATIC_CAST(unsigned int, 0xFF)));
    b_max = vec_splats(HEDLEY_STATIC_CAST(unsigned int, 31));
    #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
      a_shl = vec_and(vec_sl(a, b_abs), vec_cmple(b_abs, b_max));
    #else
      a_shl = vec_and(vec_sl(a, b_abs), vec_cmplt(b_abs, vec_splats(HEDLEY_STATIC_CAST(unsigned int, 32))));
    #endif
    a_shr = vec_sra(a, vec_min(b_abs, b_max));
    b_mask = vec_cmplt(vec_sl(b, vec_splats(HEDLEY_STATIC_CAST(unsigned int, 24))),
                       vec_splat_s32(0));
    return vec_sel(a_shl, a_shr, b_mask);
  #else
    simde_int32x4_private
      r_,
      a_ = simde_int32x4_to_private(a),
      b_ = simde_int32x4_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] =
        (b_.values[i] >= 0) ?
        (b_.values[i] >= 32) ? 0 : (a_.values[i] << b_.values[i]) :
        (b_.values[i] <= -32) ? (a_.values[i] >> 31) : (a_.values[i] >> -b_.values[i]);
    }

    return simde_int32x4_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshlq_s32
  #define vshlq_s32(a, b) simde_vshlq_s32((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_int64x2_t
simde_vshlq_s64 (const simde_int64x2_t a, const simde_int64x2_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshlq_s64(a, b);
  #elif defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    __m128i zero = _mm_setzero_si128();
    __m128i b_ = _mm_srai_epi64(_mm_slli_epi64(b, 56), 56);
    return _mm_blendv_epi8(_mm_sllv_epi64(a, b_),
                           _mm_srav_epi64(a, _mm_sub_epi64(zero, b_)),
                           _mm_cmpgt_epi64(zero, b_));
  #elif defined(SIMDE_X86_AVX2_NATIVE)
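    /* Same trick as simde_vshl_s64: emulate the missing 64-bit arithmetic
     * right shift with an XOR sign mask around a logical right shift. */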
    __m128i zero = _mm_setzero_si128();
    __m128i maska = _mm_cmpgt_epi64(zero, a);
    __m128i b_abs = _mm_and_si128(_mm_abs_epi8(b), _mm_set1_epi64x(0xFF));
    return _mm_blendv_epi8(_mm_sllv_epi64(a, b_abs),
                           _mm_xor_si128(_mm_srlv_epi64(_mm_xor_si128(a, maska), b_abs), maska),
                           _mm_cmpgt_epi64(zero, _mm_slli_epi64(b, 56)));
  #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    vector signed long long a_shl, a_shr;
    vector unsigned long long b_abs, b_max;
    vector bool long long b_mask;
    b_abs = vec_and(HEDLEY_REINTERPRET_CAST(vector unsigned long long,
                                            vec_abs(HEDLEY_REINTERPRET_CAST(vector signed char, b))),
                    vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 0xFF)));
    b_max = vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 63));
    a_shl = vec_and(vec_sl(a, b_abs), vec_cmple(b_abs, b_max));
    a_shr = vec_sra(a, vec_min(b_abs, b_max));
    b_mask = vec_cmplt(vec_sl(b, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 56))),
                       vec_splats(HEDLEY_STATIC_CAST(signed long long, 0)));
    return vec_sel(a_shl, a_shr, b_mask);
  #else
    simde_int64x2_private
      r_,
      a_ = simde_int64x2_to_private(a),
      b_ = simde_int64x2_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] =
        (b_.values[i] >= 0) ?
        (b_.values[i] >= 64) ? 0 : (a_.values[i] << b_.values[i]) :
        (b_.values[i] <= -64) ? (a_.values[i] >> 63) : (a_.values[i] >> -b_.values[i]);
    }

    return simde_int64x2_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshlq_s64
  #define vshlq_s64(a, b) simde_vshlq_s64((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint8x16_t
simde_vshlq_u8 (const simde_uint8x16_t a, const simde_int8x16_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshlq_u8(a, b);
  #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    __m256i a256 = _mm256_cvtepu8_epi16(a);
    __m256i b256 = _mm256_cvtepi8_epi16(b);
    __m256i r256 = _mm256_blendv_epi8(_mm256_sllv_epi16(a256, b256),
                                      _mm256_srlv_epi16(a256, _mm256_abs_epi16(b256)),
                                      _mm256_cmpgt_epi16(_mm256_setzero_si256(), b256));
    return _mm256_cvtepi16_epi8(r256);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
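    /* For the unsigned variant both directions are logical shifts, so a
     * single mask zeroing every lane whose |count| is 8 or more covers
     * the out-of-range cases for the left and right shifts at once. */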
    vector unsigned char b_abs;
    vector bool char b_mask;
    b_abs = HEDLEY_REINTERPRET_CAST(vector unsigned char, vec_abs(b));
    b_mask = vec_cmplt(b, vec_splat_s8(0));
    return vec_and(vec_sel(vec_sl(a, b_abs), vec_sr(a, b_abs), b_mask),
                   vec_cmplt(b_abs, vec_splat_u8(8)));
  #else
    simde_uint8x16_private
      r_,
      a_ = simde_uint8x16_to_private(a);
    simde_int8x16_private b_ = simde_int8x16_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      r_.values[i] = HEDLEY_STATIC_CAST(uint8_t,
        (abs(b_.values[i]) >= 8) ? 0 :
        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
        (a_.values[i] >> -b_.values[i]));
    }

    return simde_uint8x16_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshlq_u8
  #define vshlq_u8(a, b) simde_vshlq_u8((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint16x8_t
simde_vshlq_u16 (const simde_uint16x8_t a, const simde_int16x8_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshlq_u16(a, b);
  #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    __m128i b_ = _mm_srai_epi16(_mm_slli_epi16(b, 8), 8);
    return _mm_blendv_epi8(_mm_sllv_epi16(a, b_),
                           _mm_srlv_epi16(a, _mm_abs_epi16(b_)),
                           _mm_cmpgt_epi16(_mm_setzero_si128(), b_));
  #elif defined(SIMDE_X86_AVX2_NATIVE)
    __m256i a256 = _mm256_cvtepu16_epi32(a);
    __m256i b256 = _mm256_cvtepi16_epi32(b);
    b256 = _mm256_srai_epi32(_mm256_slli_epi32(b256, 24), 24);
    __m256i r256 = _mm256_blendv_epi8(_mm256_sllv_epi32(a256, b256),
                                      _mm256_srlv_epi32(a256, _mm256_abs_epi32(b256)),
                                      _mm256_cmpgt_epi32(_mm256_setzero_si256(), b256));
    r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi64x(0x0D0C090805040100));
    return _mm_set_epi64x(_mm256_extract_epi64(r256, 2), _mm256_extract_epi64(r256, 0));
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    vector unsigned short b_abs;
    vector bool short b_mask;
    b_abs = vec_and(HEDLEY_REINTERPRET_CAST(vector unsigned short,
                                            vec_abs(HEDLEY_REINTERPRET_CAST(vector signed char, b))),
                    vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0xFF)));
    b_mask = vec_cmplt(vec_sl(b, vec_splat_u16(8)), vec_splat_s16(0));
    #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
      return vec_and(vec_sel(vec_sl(a, b_abs), vec_sr(a, b_abs), b_mask),
                     vec_cmple(b_abs, vec_splat_u16(15)));
    #else
      return vec_and(vec_sel(vec_sl(a, b_abs), vec_sr(a, b_abs), b_mask),
                     vec_cmplt(b_abs, vec_splats(HEDLEY_STATIC_CAST(unsigned short, 16))));
    #endif
  #else
    simde_uint16x8_private
      r_,
      a_ = simde_uint16x8_to_private(a);
    simde_int16x8_private b_ = simde_int16x8_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] = HEDLEY_STATIC_CAST(uint16_t,
        (abs(b_.values[i]) >= 16) ? 0 :
        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
        (a_.values[i] >> -b_.values[i]));
    }

    return simde_uint16x8_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshlq_u16
  #define vshlq_u16(a, b) simde_vshlq_u16((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x4_t
simde_vshlq_u32 (const simde_uint32x4_t a, const simde_int32x4_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshlq_u32(a, b);
  #elif defined(SIMDE_X86_AVX2_NATIVE)
    __m128i b_ = _mm_srai_epi32(_mm_slli_epi32(b, 24), 24);
    return _mm_blendv_epi8(_mm_sllv_epi32(a, b_),
                           _mm_srlv_epi32(a, _mm_abs_epi32(b_)),
                           _mm_cmpgt_epi32(_mm_setzero_si128(), b_));
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    vector unsigned int b_abs;
    vector bool int b_mask;
    b_abs = vec_and(HEDLEY_REINTERPRET_CAST(vector unsigned int,
                                            vec_abs(HEDLEY_REINTERPRET_CAST(vector signed char, b))),
                    vec_splats(HEDLEY_STATIC_CAST(unsigned int, 0xFF)));
    b_mask = vec_cmplt(vec_sl(b, vec_splats(HEDLEY_STATIC_CAST(unsigned int, 24))), vec_splat_s32(0));
    return vec_and(vec_sel(vec_sl(a, b_abs), vec_sr(a, b_abs), b_mask),
                   vec_cmplt(b_abs, vec_splats(HEDLEY_STATIC_CAST(unsigned int, 32))));
  #else
    simde_uint32x4_private
      r_,
      a_ = simde_uint32x4_to_private(a);
    simde_int32x4_private b_ = simde_int32x4_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] = (abs(b_.values[i]) >= 32) ? 0 :
        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
        (a_.values[i] >> -b_.values[i]);
    }

    return simde_uint32x4_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshlq_u32
  #define vshlq_u32(a, b) simde_vshlq_u32((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint64x2_t
simde_vshlq_u64 (const simde_uint64x2_t a, const simde_int64x2_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshlq_u64(a, b);
  #elif defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    __m128i zero = _mm_setzero_si128();
    __m128i b_ = _mm_srai_epi64(_mm_slli_epi64(b, 56), 56);
    return _mm_blendv_epi8(_mm_sllv_epi64(a, b_),
                           _mm_srlv_epi64(a, _mm_sub_epi64(zero, b_)),
                           _mm_cmpgt_epi64(zero, b_));
  #elif defined(SIMDE_X86_AVX2_NATIVE)
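    /* Both directions are logical shifts; AVX2 variable shifts already
     * return zero for counts of 64 or more, so only the sign of the low
     * byte of b (moved to the top with a shift by 56) needs testing. */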
    __m128i b_abs = _mm_and_si128(_mm_abs_epi8(b), _mm_set1_epi64x(0xFF));
    return _mm_blendv_epi8(_mm_sllv_epi64(a, b_abs),
                           _mm_srlv_epi64(a, b_abs),
                           _mm_cmpgt_epi64(_mm_setzero_si128(), _mm_slli_epi64(b, 56)));
  #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    vector unsigned long long b_abs;
    vector bool long long b_mask;
    b_abs = vec_and(HEDLEY_REINTERPRET_CAST(vector unsigned long long,
                                            vec_abs(HEDLEY_REINTERPRET_CAST(vector signed char, b))),
                    vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 0xFF)));
    b_mask = vec_cmplt(vec_sl(b, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 56))),
                       vec_splats(HEDLEY_STATIC_CAST(signed long long, 0)));
    return vec_and(vec_sel(vec_sl(a, b_abs), vec_sr(a, b_abs), b_mask),
                   vec_cmplt(b_abs, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 64))));
  #else
    simde_uint64x2_private
      r_,
      a_ = simde_uint64x2_to_private(a);
    simde_int64x2_private b_ = simde_int64x2_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] = (llabs(b_.values[i]) >= 64) ? 0 :
        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
        (a_.values[i] >> -b_.values[i]);
    }

    return simde_uint64x2_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshlq_u64
  #define vshlq_u64(a, b) simde_vshlq_u64((a), (b))
#endif

SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP

#endif /* !defined(SIMDE_ARM_NEON_SHL_H) */