1 /* SPDX-License-Identifier: MIT
2 *
3 * Permission is hereby granted, free of charge, to any person
4 * obtaining a copy of this software and associated documentation
5 * files (the "Software"), to deal in the Software without
6 * restriction, including without limitation the rights to use, copy,
7 * modify, merge, publish, distribute, sublicense, and/or sell copies
8 * of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be
12 * included in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Copyright:
24 * 2020 Evan Nemerson <evan@nemerson.com>
25 * 2020 Christopher Moore <moore@free.fr>
26 */
27
28 #if !defined(SIMDE_ARM_NEON_SHL_H)
29 #define SIMDE_ARM_NEON_SHL_H
30
31 #include "types.h"
32
33 /* Notes from the implementer (Christopher Moore aka rosbif)
34 *
35 * I have tried to exactly reproduce the documented behaviour of the
36 * ARM NEON shl and shlq intrinsics.
37 * This is complicated for the following reasons:-
38 *
39 * a) Negative shift counts shift right.
40 *
41 * b) Only the low byte of the shift count is used but the shift count
42 * is not limited to 8-bit values (-128 to 127).
43 *
44 * c) Intel SIMD is not nearly as complete as NEON and AltiVec.
 * There were no intrinsics with a vector shift count before AVX2 which
46 * only has 32 and 64-bit logical ones and only a 32-bit arithmetic
47 * one. The others need AVX512. There are no 8-bit shift intrinsics at
48 * all, even with a scalar shift count. It is surprising to use AVX2
49 * and even AVX512 to implement a 64-bit vector operation.
50 *
51 * d) Many shift implementations, and the C standard, do not treat a
52 * shift count >= the object's size in bits as one would expect.
53 * (Personally I feel that > is silly but == can be useful.)
54 *
55 * Maybe it would be useful for SIMDe to have a flag enabling a fast
56 * implementation where the result is only guaranteed for shift counts
57 * conforming to the C standard.
58 *
59 * Note that even the C17/18 standard does not define the behaviour of
60 * a right shift of a negative value.
61 * However Evan and I agree that all compilers likely to be used
62 * implement this as an arithmetic right shift with sign extension.
63 * If this is not the case it could be replaced by a logical right shift
64 * if negative values are complemented before and after the shift.
65 *
66 * Some of the SIMD translations may be slower than the portable code,
67 * particularly those for vectors with only one or two elements.
68 * But I had fun writing them ;-)
69 *
70 */
71
72 HEDLEY_DIAGNOSTIC_PUSH
73 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
74 SIMDE_BEGIN_DECLS_
75
/* Element-wise variable shift of signed 8-bit lanes: a positive count in b
 * shifts left, a negative count shifts arithmetically right.  Counts >= 8
 * yield 0; counts <= -8 yield the sign (a >> 7), matching NEON vshl_s8. */
SIMDE_FUNCTION_ATTRIBUTES
simde_int8x8_t
simde_vshl_s8 (const simde_int8x8_t a, const simde_int8x8_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshl_s8(a, b);
  #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    /* x86 has no 8-bit variable shifts; widen to 16-bit lanes where the
     * AVX-512BW per-lane shifts exist and naturally produce 0 (sllv) or the
     * sign (srav clamps) for out-of-range counts. */
    __m128i a128 = _mm_cvtepi8_epi16(_mm_movpi64_epi64(a));
    __m128i b128 = _mm_cvtepi8_epi16(_mm_movpi64_epi64(b));
    /* Pick the arithmetic-right-shift result wherever the count is negative. */
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi16(a128, b128),
                                   _mm_srav_epi16(a128, _mm_abs_epi16(b128)),
                                   _mm_cmpgt_epi16(_mm_setzero_si128(), b128));
    return _mm_movepi64_pi64(_mm_cvtepi16_epi8(r128));
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    /* AVX2 only has 32/64-bit variable shifts, so widen all the way to
     * 32-bit lanes. */
    __m256i a256 = _mm256_cvtepi8_epi32(_mm_movpi64_epi64(a));
    __m256i b256 = _mm256_cvtepi8_epi32(_mm_movpi64_epi64(b));
    __m256i r256 = _mm256_blendv_epi8(_mm256_sllv_epi32(a256, b256),
                                      _mm256_srav_epi32(a256, _mm256_abs_epi32(b256)),
                                      _mm256_cmpgt_epi32(_mm256_setzero_si256(), b256));
    /* Gather the low byte of each 32-bit lane (per 128-bit half), then pull
     * the two 32-bit groups of result bytes back into an MMX register. */
    r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi32(0x0C080400));
    return _mm_set_pi32(_mm256_extract_epi32(r256, 4), _mm256_extract_epi32(r256, 0));
  #else
    simde_int8x8_private
      r_,
      a_ = simde_int8x8_to_private(a),
      b_ = simde_int8x8_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      /* b is already 8-bit here, so no low-byte truncation is needed. */
      r_.values[i] = HEDLEY_STATIC_CAST(int8_t,
                                        (b_.values[i] >= 0) ?
                                        (b_.values[i] >= 8) ? 0 : (a_.values[i] << b_.values[i]) :
                                        (b_.values[i] <= -8) ? (a_.values[i] >> 7) : (a_.values[i] >> -b_.values[i]));
    }

    return simde_int8x8_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshl_s8
  #define vshl_s8(a, b) simde_vshl_s8((a), (b))
#endif
117
/* Element-wise variable shift of signed 16-bit lanes; only the low byte of
 * each count is honoured (NEON semantics).  Counts >= 16 yield 0, counts
 * <= -16 yield the sign (a >> 15). */
SIMDE_FUNCTION_ATTRIBUTES
simde_int16x4_t
simde_vshl_s16 (const simde_int16x4_t a, const simde_int16x4_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshl_s16(a, b);
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    __m128i a128 = _mm_cvtepi16_epi32(_mm_movpi64_epi64(a));
    __m128i b128 = _mm_cvtepi16_epi32(_mm_movpi64_epi64(b));
    /* Sign-extend the low byte of each count within its 32-bit lane. */
    b128 = _mm_srai_epi32(_mm_slli_epi32(b128, 24), 24);
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi32(a128, b128),
                                   _mm_srav_epi32(a128, _mm_abs_epi32(b128)),
                                   _mm_cmpgt_epi32(_mm_setzero_si128(), b128));
    /* Repack the low 16 bits of each 32-bit lane into the low 64 bits. */
    return _mm_movepi64_pi64(_mm_shuffle_epi8(r128, _mm_set1_epi64x(0x0D0C090805040100)));
  #else
    simde_int16x4_private
      r_,
      a_ = simde_int16x4_to_private(a),
      b_ = simde_int16x4_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      /* NEON uses only the low byte of the shift count. */
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] = HEDLEY_STATIC_CAST(int16_t,
                                        (b_.values[i] >= 0) ?
                                        (b_.values[i] >= 16) ? 0 : (a_.values[i] << b_.values[i]) :
                                        (b_.values[i] <= -16) ? (a_.values[i] >> 15) : (a_.values[i] >> -b_.values[i]));
    }

    return simde_int16x4_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshl_s16
  #define vshl_s16(a, b) simde_vshl_s16((a), (b))
#endif
153
/* Element-wise variable shift of signed 32-bit lanes; only the low byte of
 * each count is honoured.  Counts >= 32 yield 0, counts <= -32 yield the
 * sign (a >> 31). */
SIMDE_FUNCTION_ATTRIBUTES
simde_int32x2_t
simde_vshl_s32 (const simde_int32x2_t a, const simde_int32x2_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshl_s32(a, b);
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    __m128i a128 = _mm_movpi64_epi64(a);
    __m128i b128 = _mm_movpi64_epi64(b);
    /* Sign-extend the low byte of each count within its 32-bit lane. */
    b128 = _mm_srai_epi32(_mm_slli_epi32(b128, 24), 24);
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi32(a128, b128),
                                   _mm_srav_epi32(a128, _mm_abs_epi32(b128)),
                                   _mm_cmpgt_epi32(_mm_setzero_si128(), b128));
    return _mm_movepi64_pi64(r128);
  #else
    simde_int32x2_private
      r_,
      a_ = simde_int32x2_to_private(a),
      b_ = simde_int32x2_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      /* NEON uses only the low byte of the shift count. */
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] =
        (b_.values[i] >= 0) ?
          (b_.values[i] >= 32) ? 0 : (a_.values[i] << b_.values[i]) :
          (b_.values[i] <= -32) ? (a_.values[i] >> 31) : (a_.values[i] >> -b_.values[i]);
    }

    return simde_int32x2_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshl_s32
  #define vshl_s32(a, b) simde_vshl_s32((a), (b))
#endif
189
/* Element-wise variable shift of the signed 64-bit lane; only the low byte
 * of the count is honoured.  Counts >= 64 yield 0, counts <= -64 yield the
 * sign (a >> 63). */
SIMDE_FUNCTION_ATTRIBUTES
simde_int64x1_t
simde_vshl_s64 (const simde_int64x1_t a, const simde_int64x1_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshl_s64(a, b);
  #elif defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    __m128i zero = _mm_setzero_si128();
    __m128i a128 = _mm_movpi64_epi64(a);
    __m128i b128 = _mm_movpi64_epi64(b);
    /* Sign-extend the low byte of the count within the 64-bit lane
     * (64-bit srai requires AVX-512). */
    b128 = _mm_srai_epi64(_mm_slli_epi64(b128, 56), 56);
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi64(a128, b128),
                                   _mm_srav_epi64(a128, _mm_sub_epi64(zero, b128)),
                                   _mm_cmpgt_epi64(zero, b128));
    return _mm_movepi64_pi64(r128);
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    __m128i zero = _mm_setzero_si128();
    __m128i a128 = _mm_movpi64_epi64(a);
    __m128i b128 = _mm_movpi64_epi64(b);
    /* maska: all-ones where a is negative. */
    __m128i maska = _mm_cmpgt_epi64(zero, a128);
    /* |low byte of count|, remaining bytes of the lane cleared. */
    __m128i b_abs = _mm_and_si128(_mm_abs_epi8(b128), _mm_set1_epi64x(0xFF));
    /* AVX2 has no 64-bit arithmetic right shift: emulate it as a logical
     * shift of the sign-complemented value, (a ^ m) >> n ^ m.  For counts
     * >= 64 srlv yields 0, so the xor leaves maska, i.e. the sign — exactly
     * the NEON saturation behaviour. */
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi64(a128, b_abs),
                                   _mm_xor_si128(_mm_srlv_epi64(_mm_xor_si128(a128, maska), b_abs), maska),
                                   _mm_cmpgt_epi64(zero, _mm_slli_epi64(b128, 56)));
    return _mm_movepi64_pi64(r128);
  #else
    simde_int64x1_private
      r_,
      a_ = simde_int64x1_to_private(a),
      b_ = simde_int64x1_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      /* NEON uses only the low byte of the shift count. */
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] =
        (b_.values[i] >= 0) ?
          (b_.values[i] >= 64) ? 0 : (a_.values[i] << b_.values[i]) :
          (b_.values[i] <= -64) ? (a_.values[i] >> 63) : (a_.values[i] >> -b_.values[i]);
    }

    return simde_int64x1_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshl_s64
  #define vshl_s64(a, b) simde_vshl_s64((a), (b))
#endif
236
/* Element-wise variable shift of unsigned 8-bit lanes: positive counts
 * shift left, negative counts shift logically right, so any |count| >= 8
 * yields 0. */
SIMDE_FUNCTION_ATTRIBUTES
simde_uint8x8_t
simde_vshl_u8 (const simde_uint8x8_t a, const simde_int8x8_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshl_u8(a, b);
  #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    /* Zero-extend a (unsigned) but sign-extend b (signed counts). */
    __m128i a128 = _mm_cvtepu8_epi16(_mm_movpi64_epi64(a));
    __m128i b128 = _mm_cvtepi8_epi16(_mm_movpi64_epi64(b));
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi16(a128, b128),
                                   _mm_srlv_epi16(a128, _mm_abs_epi16(b128)),
                                   _mm_cmpgt_epi16(_mm_setzero_si128(), b128));
    return _mm_movepi64_pi64(_mm_cvtepi16_epi8(r128));
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    /* AVX2 only has 32/64-bit variable shifts; widen to 32-bit lanes. */
    __m256i a256 = _mm256_cvtepu8_epi32(_mm_movpi64_epi64(a));
    __m256i b256 = _mm256_cvtepi8_epi32(_mm_movpi64_epi64(b));
    __m256i r256 = _mm256_blendv_epi8(_mm256_sllv_epi32(a256, b256),
                                      _mm256_srlv_epi32(a256, _mm256_abs_epi32(b256)),
                                      _mm256_cmpgt_epi32(_mm256_setzero_si256(), b256));
    /* Gather the low byte of each 32-bit lane back into 8 result bytes. */
    r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi32(0x0C080400));
    return _mm_set_pi32(_mm256_extract_epi32(r256, 4), _mm256_extract_epi32(r256, 0));
  #else
    simde_uint8x8_private
      r_,
      a_ = simde_uint8x8_to_private(a);
    simde_int8x8_private b_ = simde_int8x8_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      /* Logical shift: either direction saturates to 0 at |count| >= 8. */
      r_.values[i] = HEDLEY_STATIC_CAST(uint8_t,
                                        (abs(b_.values[i]) >= 8) ? 0 :
                                        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
                                        (a_.values[i] >> -b_.values[i]));
    }

    return simde_uint8x8_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshl_u8
  #define vshl_u8(a, b) simde_vshl_u8((a), (b))
#endif
278
/* Element-wise variable shift of unsigned 16-bit lanes; only the low byte
 * of each count is honoured.  Any |count| >= 16 yields 0 (logical shift). */
SIMDE_FUNCTION_ATTRIBUTES
simde_uint16x4_t
simde_vshl_u16 (const simde_uint16x4_t a, const simde_int16x4_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshl_u16(a, b);
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    /* Zero-extend a (unsigned), sign-extend b (signed counts). */
    __m128i a128 = _mm_cvtepu16_epi32(_mm_movpi64_epi64(a));
    __m128i b128 = _mm_cvtepi16_epi32(_mm_movpi64_epi64(b));
    /* Sign-extend the low byte of each count within its 32-bit lane. */
    b128 = _mm_srai_epi32(_mm_slli_epi32(b128, 24), 24);
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi32(a128, b128),
                                   _mm_srlv_epi32(a128, _mm_abs_epi32(b128)),
                                   _mm_cmpgt_epi32(_mm_setzero_si128(), b128));
    /* Repack the low 16 bits of each 32-bit lane into the low 64 bits. */
    return _mm_movepi64_pi64(_mm_shuffle_epi8(r128, _mm_set1_epi64x(0x0D0C090805040100)));
  #else
    simde_uint16x4_private
      r_,
      a_ = simde_uint16x4_to_private(a);
    simde_int16x4_private b_ = simde_int16x4_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      /* NEON uses only the low byte of the shift count. */
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] = HEDLEY_STATIC_CAST(uint16_t,
                                        (abs(b_.values[i]) >= 16) ? 0 :
                                        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
                                        (a_.values[i] >> -b_.values[i]));
    }

    return simde_uint16x4_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshl_u16
  #define vshl_u16(a, b) simde_vshl_u16((a), (b))
#endif
314
/* Element-wise variable shift of unsigned 32-bit lanes; only the low byte
 * of each count is honoured.  Any |count| >= 32 yields 0 (logical shift). */
SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x2_t
simde_vshl_u32 (const simde_uint32x2_t a, const simde_int32x2_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshl_u32(a, b);
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    __m128i a128 = _mm_movpi64_epi64(a);
    __m128i b128 = _mm_movpi64_epi64(b);
    /* Sign-extend the low byte of each count within its 32-bit lane. */
    b128 = _mm_srai_epi32(_mm_slli_epi32(b128, 24), 24);
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi32(a128, b128),
                                   _mm_srlv_epi32(a128, _mm_abs_epi32(b128)),
                                   _mm_cmpgt_epi32(_mm_setzero_si128(), b128));
    return _mm_movepi64_pi64(r128);
  #else
    simde_uint32x2_private
      r_,
      a_ = simde_uint32x2_to_private(a);
    simde_int32x2_private b_ = simde_int32x2_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      /* NEON uses only the low byte of the shift count. */
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] =
        (abs(b_.values[i]) >= 32) ? 0 :
        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
        (a_.values[i] >> -b_.values[i]);
    }

    return simde_uint32x2_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshl_u32
  #define vshl_u32(a, b) simde_vshl_u32((a), (b))
#endif
350
/* Element-wise variable shift of the unsigned 64-bit lane; only the low
 * byte of the count is honoured.  Any |count| >= 64 yields 0 (logical
 * shift). */
SIMDE_FUNCTION_ATTRIBUTES
simde_uint64x1_t
simde_vshl_u64 (const simde_uint64x1_t a, const simde_int64x1_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshl_u64(a, b);
  #elif defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    __m128i zero = _mm_setzero_si128();
    __m128i a128 = _mm_movpi64_epi64(a);
    __m128i b128 = _mm_movpi64_epi64(b);
    /* Sign-extend the low byte of the count (64-bit srai needs AVX-512). */
    b128 = _mm_srai_epi64(_mm_slli_epi64(b128, 56), 56);
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi64(a128, b128),
                                   _mm_srlv_epi64(a128, _mm_sub_epi64(zero, b128)),
                                   _mm_cmpgt_epi64(zero, b128));
    return _mm_movepi64_pi64(r128);
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    __m128i a128 = _mm_movpi64_epi64(a);
    __m128i b128 = _mm_movpi64_epi64(b);
    /* |low byte of count|, remaining bytes of the lane cleared; sllv/srlv
     * already yield 0 for counts >= 64, which is what we want here. */
    __m128i b_abs = _mm_and_si128(_mm_abs_epi8(b128), _mm_set1_epi64x(0xFF));
    /* The blend mask tests the sign of the low byte of the count. */
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi64(a128, b_abs),
                                   _mm_srlv_epi64(a128, b_abs),
                                   _mm_cmpgt_epi64(_mm_setzero_si128(), _mm_slli_epi64(b128, 56)));
    return _mm_movepi64_pi64(r128);
  #else
    simde_uint64x1_private
      r_,
      a_ = simde_uint64x1_to_private(a);
    simde_int64x1_private b_ = simde_int64x1_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      /* NEON uses only the low byte of the shift count. */
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] =
        (llabs(b_.values[i]) >= 64) ? 0 :
        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
        (a_.values[i] >> -b_.values[i]);
    }

    return simde_uint64x1_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshl_u64
  #define vshl_u64(a, b) simde_vshl_u64((a), (b))
#endif
395
/* 128-bit variant: element-wise variable shift of 16 signed 8-bit lanes.
 * Positive counts shift left (>= 8 -> 0), negative counts shift
 * arithmetically right (<= -8 -> sign). */
SIMDE_FUNCTION_ATTRIBUTES
simde_int8x16_t
simde_vshlq_s8 (const simde_int8x16_t a, const simde_int8x16_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshlq_s8(a, b);
  #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    /* No 8-bit variable shifts on x86: widen to 16-bit lanes, shift, then
     * narrow back. */
    __m256i a256 = _mm256_cvtepi8_epi16(a);
    __m256i b256 = _mm256_cvtepi8_epi16(b);
    __m256i r256 = _mm256_blendv_epi8(_mm256_sllv_epi16(a256, b256),
                                      _mm256_srav_epi16(a256, _mm256_abs_epi16(b256)),
                                      _mm256_cmpgt_epi16(_mm256_setzero_si256(), b256));
    return _mm256_cvtepi16_epi8(r256);
  #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
    SIMDE_POWER_ALTIVEC_VECTOR(signed char) a_shl, a_shr;
    SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) b_abs, b_max;
    SIMDE_POWER_ALTIVEC_VECTOR(SIMDE_POWER_ALTIVEC_BOOL char) b_mask;
    b_abs = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), vec_abs(b));
    b_max = vec_splat_u8(7);
    /* vec_sl shifts modulo the element width, so explicitly zero lanes
     * whose count is >= 8. */
    #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
      a_shl = vec_and(vec_sl(a, b_abs), vec_cmple(b_abs, b_max));
    #else
      a_shl = vec_and(vec_sl(a, b_abs), vec_cmplt(b_abs, vec_splat_u8(8)));
    #endif
    /* Clamp the right-shift count to 7 so large counts produce the sign. */
    a_shr = vec_sra(a, vec_min(b_abs, b_max));
    b_mask = vec_cmplt(b, vec_splat_s8(0));
    return vec_sel(a_shl, a_shr, b_mask);
  #else
    simde_int8x16_private
      r_,
      a_ = simde_int8x16_to_private(a),
      b_ = simde_int8x16_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      r_.values[i] = HEDLEY_STATIC_CAST(int8_t,
                                        (b_.values[i] >= 0) ?
                                        (b_.values[i] >= 8) ? 0 : (a_.values[i] << b_.values[i]) :
                                        (b_.values[i] <= -8) ? (a_.values[i] >> 7) : (a_.values[i] >> -b_.values[i]));
    }

    return simde_int8x16_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshlq_s8
  #define vshlq_s8(a, b) simde_vshlq_s8((a), (b))
#endif
443
/* 128-bit variant: element-wise variable shift of 8 signed 16-bit lanes;
 * only the low byte of each count is honoured. */
SIMDE_FUNCTION_ATTRIBUTES
simde_int16x8_t
simde_vshlq_s16 (const simde_int16x8_t a, const simde_int16x8_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshlq_s16(a, b);
  #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    /* Sign-extend the low byte of each count within its 16-bit lane. */
    __m128i b_ = _mm_srai_epi16(_mm_slli_epi16(b, 8), 8);
    return _mm_blendv_epi8(_mm_sllv_epi16(a, b_),
                           _mm_srav_epi16(a, _mm_abs_epi16(b_)),
                           _mm_cmpgt_epi16(_mm_setzero_si128(), b_));
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_ARCH_AMD64)
    /* Widen to 32-bit lanes (AVX2 lacks 16-bit variable shifts). */
    __m256i a256 = _mm256_cvtepi16_epi32(a);
    __m256i b256 = _mm256_cvtepi16_epi32(b);
    b256 = _mm256_srai_epi32(_mm256_slli_epi32(b256, 24), 24);
    __m256i r256 = _mm256_blendv_epi8(_mm256_sllv_epi32(a256, b256),
                                      _mm256_srav_epi32(a256, _mm256_abs_epi32(b256)),
                                      _mm256_cmpgt_epi32(_mm256_setzero_si256(), b256));
    /* Pack the low 16 bits of each 32-bit lane, then merge the halves. */
    r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi64x(0x0D0C090805040100));
    return _mm_set_epi64x(_mm256_extract_epi64(r256, 2), _mm256_extract_epi64(r256, 0));
  #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
    SIMDE_POWER_ALTIVEC_VECTOR(signed short) a_shl, a_shr;
    SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) b_abs, b_max;
    SIMDE_POWER_ALTIVEC_VECTOR(SIMDE_POWER_ALTIVEC_BOOL short) b_mask;
    /* |count| limited to its low byte. */
    b_abs = vec_and(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned short),
                                            vec_abs(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), b))),
                    vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0xFF)));
    b_max = vec_splat_u16(15);
    /* vec_sl shifts modulo the width: zero lanes whose count is >= 16. */
    #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
      a_shl = vec_and(vec_sl(a, b_abs), vec_cmple(b_abs, b_max));
    #else
      a_shl = vec_and(vec_sl(a, b_abs), vec_cmplt(b_abs, vec_splats(HEDLEY_STATIC_CAST(unsigned short, 16))));
    #endif
    /* Clamp the right-shift count to 15 so large counts give the sign. */
    a_shr = vec_sra(a, vec_min(b_abs, b_max));
    /* Negative low byte of the count selects the right-shift result. */
    b_mask = vec_cmplt(vec_sl(b, vec_splat_u16(8)), vec_splat_s16(0));
    return vec_sel(a_shl, a_shr, b_mask);
  #else
    simde_int16x8_private
      r_,
      a_ = simde_int16x8_to_private(a),
      b_ = simde_int16x8_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      /* NEON uses only the low byte of the shift count. */
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] = HEDLEY_STATIC_CAST(int16_t,
                                        (b_.values[i] >= 0) ?
                                        (b_.values[i] >= 16) ? 0 : (a_.values[i] << b_.values[i]) :
                                        (b_.values[i] <= -16) ? (a_.values[i] >> 15) : (a_.values[i] >> -b_.values[i]));
    }

    return simde_int16x8_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshlq_s16
  #define vshlq_s16(a, b) simde_vshlq_s16((a), (b))
#endif
501
/* 128-bit variant: element-wise variable shift of 4 signed 32-bit lanes;
 * only the low byte of each count is honoured. */
SIMDE_FUNCTION_ATTRIBUTES
simde_int32x4_t
simde_vshlq_s32 (const simde_int32x4_t a, const simde_int32x4_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshlq_s32(a, b);
  #elif defined(SIMDE_X86_AVX2_NATIVE)
    /* Sign-extend the low byte of each count within its 32-bit lane. */
    __m128i b_ = _mm_srai_epi32(_mm_slli_epi32(b, 24), 24);
    return _mm_blendv_epi8(_mm_sllv_epi32(a, b_),
                           _mm_srav_epi32(a, _mm_abs_epi32(b_)),
                           _mm_cmpgt_epi32(_mm_setzero_si128(), b_));
  #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
    SIMDE_POWER_ALTIVEC_VECTOR(signed int) a_shl, a_shr;
    SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) b_abs, b_max;
    SIMDE_POWER_ALTIVEC_VECTOR(SIMDE_POWER_ALTIVEC_BOOL int) b_mask;
    /* |count| limited to its low byte. */
    b_abs = vec_and(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int),
                                            vec_abs(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), b))),
                    vec_splats(HEDLEY_STATIC_CAST(unsigned int, 0xFF)));
    b_max = vec_splats(HEDLEY_STATIC_CAST(unsigned int, 31));
    /* vec_sl shifts modulo the width: zero lanes whose count is >= 32. */
    #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
      a_shl = vec_and(vec_sl(a, b_abs), vec_cmple(b_abs, b_max));
    #else
      a_shl = vec_and(vec_sl(a, b_abs), vec_cmplt(b_abs, vec_splats(HEDLEY_STATIC_CAST(unsigned int, 32))));
    #endif
    /* Clamp the right-shift count to 31 so large counts give the sign. */
    a_shr = vec_sra(a, vec_min(b_abs, b_max));
    /* Negative low byte of the count selects the right-shift result. */
    b_mask = vec_cmplt(vec_sl(b, vec_splats(HEDLEY_STATIC_CAST(unsigned int, 24))),
                       vec_splat_s32(0));
    return vec_sel(a_shl, a_shr, b_mask);
  #else
    simde_int32x4_private
      r_,
      a_ = simde_int32x4_to_private(a),
      b_ = simde_int32x4_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      /* NEON uses only the low byte of the shift count. */
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] =
        (b_.values[i] >= 0) ?
          (b_.values[i] >= 32) ? 0 : (a_.values[i] << b_.values[i]) :
          (b_.values[i] <= -32) ? (a_.values[i] >> 31) : (a_.values[i] >> -b_.values[i]);
    }

    return simde_int32x4_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshlq_s32
  #define vshlq_s32(a, b) simde_vshlq_s32((a), (b))
#endif
551
/* 128-bit variant: element-wise variable shift of 2 signed 64-bit lanes;
 * only the low byte of each count is honoured. */
SIMDE_FUNCTION_ATTRIBUTES
simde_int64x2_t
simde_vshlq_s64 (const simde_int64x2_t a, const simde_int64x2_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshlq_s64(a, b);
  #elif defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    __m128i zero = _mm_setzero_si128();
    /* Sign-extend the low byte of each count (64-bit srai needs AVX-512). */
    __m128i b_ = _mm_srai_epi64(_mm_slli_epi64(b, 56), 56);
    return _mm_blendv_epi8(_mm_sllv_epi64(a, b_),
                           _mm_srav_epi64(a, _mm_sub_epi64(zero, b_)),
                           _mm_cmpgt_epi64(zero, b_));
  #elif defined(SIMDE_X86_AVX2_NATIVE)
    __m128i zero = _mm_setzero_si128();
    /* maska: all-ones where a is negative. */
    __m128i maska = _mm_cmpgt_epi64(zero, a);
    /* |low byte of count|, remaining bytes of the lane cleared. */
    __m128i b_abs = _mm_and_si128(_mm_abs_epi8(b), _mm_set1_epi64x(0xFF));
    /* AVX2 has no 64-bit arithmetic right shift: emulate as a logical
     * shift of the sign-complemented value, (a ^ m) >> n ^ m.  srlv yields
     * 0 for counts >= 64, so the xor leaves the sign — NEON saturation. */
    return _mm_blendv_epi8(_mm_sllv_epi64(a, b_abs),
                           _mm_xor_si128(_mm_srlv_epi64(_mm_xor_si128(a, maska), b_abs), maska),
                           _mm_cmpgt_epi64(zero, _mm_slli_epi64(b, 56)));
  #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    SIMDE_POWER_ALTIVEC_VECTOR(signed long long) a_shl, a_shr;
    SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) b_abs, b_max;
    SIMDE_POWER_ALTIVEC_VECTOR(SIMDE_POWER_ALTIVEC_BOOL long long) b_mask;
    /* |count| limited to its low byte. */
    b_abs = vec_and(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long),
                                            vec_abs(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), b))),
                    vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 0xFF)));
    b_max = vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 63));
    /* vec_sl shifts modulo the width: zero lanes whose count is >= 64. */
    a_shl = vec_and(vec_sl(a, b_abs), vec_cmple(b_abs, b_max));
    /* Clamp the right-shift count to 63 so large counts give the sign. */
    a_shr = vec_sra(a, vec_min(b_abs, b_max));
    /* Negative low byte of the count selects the right-shift result. */
    b_mask = vec_cmplt(vec_sl(b, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 56))),
                       vec_splats(HEDLEY_STATIC_CAST(signed long long, 0)));
    HEDLEY_DIAGNOSTIC_PUSH
    #if defined(SIMDE_BUG_CLANG_46770)
      SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_
    #endif
    return vec_sel(a_shl, a_shr, b_mask);
    HEDLEY_DIAGNOSTIC_POP
  #else
    simde_int64x2_private
      r_,
      a_ = simde_int64x2_to_private(a),
      b_ = simde_int64x2_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      /* NEON uses only the low byte of the shift count. */
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] =
        (b_.values[i] >= 0) ?
          (b_.values[i] >= 64) ? 0 : (a_.values[i] << b_.values[i]) :
          (b_.values[i] <= -64) ? (a_.values[i] >> 63) : (a_.values[i] >> -b_.values[i]);
    }

    return simde_int64x2_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshlq_s64
  #define vshlq_s64(a, b) simde_vshlq_s64((a), (b))
#endif
610
/* 128-bit variant: element-wise variable shift of 16 unsigned 8-bit lanes.
 * Logical shift, so any |count| >= 8 yields 0. */
SIMDE_FUNCTION_ATTRIBUTES
simde_uint8x16_t
simde_vshlq_u8 (const simde_uint8x16_t a, const simde_int8x16_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshlq_u8(a, b);
  #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    /* Zero-extend a (unsigned), sign-extend b (signed counts); 16-bit
     * variable shifts already give 0 for out-of-range counts. */
    __m256i a256 = _mm256_cvtepu8_epi16(a);
    __m256i b256 = _mm256_cvtepi8_epi16(b);
    __m256i r256 = _mm256_blendv_epi8(_mm256_sllv_epi16(a256, b256),
                                      _mm256_srlv_epi16(a256, _mm256_abs_epi16(b256)),
                                      _mm256_cmpgt_epi16(_mm256_setzero_si256(), b256));
    return _mm256_cvtepi16_epi8(r256);
  #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
    SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) b_abs;
    SIMDE_POWER_ALTIVEC_VECTOR(SIMDE_POWER_ALTIVEC_BOOL char) b_mask;
    b_abs = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), vec_abs(b));
    b_mask = vec_cmplt(b, vec_splat_s8(0));
    /* vec_sl/vec_sr shift modulo the width, so zero every lane whose
     * |count| is >= 8 after selecting the shift direction. */
    return vec_and(vec_sel(vec_sl(a, b_abs), vec_sr(a, b_abs), b_mask),
                   vec_cmplt(b_abs, vec_splat_u8(8)));
  #else
    simde_uint8x16_private
      r_,
      a_ = simde_uint8x16_to_private(a);
    simde_int8x16_private b_ = simde_int8x16_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      r_.values[i] = HEDLEY_STATIC_CAST(uint8_t,
                                        (abs(b_.values[i]) >= 8) ? 0 :
                                        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
                                        (a_.values[i] >> -b_.values[i]));
    }

    return simde_uint8x16_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshlq_u8
  #define vshlq_u8(a, b) simde_vshlq_u8((a), (b))
#endif
651
/* 128-bit variant: element-wise variable shift of 8 unsigned 16-bit lanes;
 * only the low byte of each count is honoured.  Any |count| >= 16 yields
 * 0 (logical shift). */
SIMDE_FUNCTION_ATTRIBUTES
simde_uint16x8_t
simde_vshlq_u16 (const simde_uint16x8_t a, const simde_int16x8_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshlq_u16(a, b);
  #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    /* Sign-extend the low byte of each count within its 16-bit lane. */
    __m128i b_ = _mm_srai_epi16(_mm_slli_epi16(b, 8), 8);
    return _mm_blendv_epi8(_mm_sllv_epi16(a, b_),
                           _mm_srlv_epi16(a, _mm_abs_epi16(b_)),
                           _mm_cmpgt_epi16(_mm_setzero_si128(), b_));
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_ARCH_AMD64)
    /* Widen to 32-bit lanes (AVX2 lacks 16-bit variable shifts). */
    __m256i a256 = _mm256_cvtepu16_epi32(a);
    __m256i b256 = _mm256_cvtepi16_epi32(b);
    b256 = _mm256_srai_epi32(_mm256_slli_epi32(b256, 24), 24);
    __m256i r256 = _mm256_blendv_epi8(_mm256_sllv_epi32(a256, b256),
                                      _mm256_srlv_epi32(a256, _mm256_abs_epi32(b256)),
                                      _mm256_cmpgt_epi32(_mm256_setzero_si256(), b256));
    /* Pack the low 16 bits of each 32-bit lane, then merge the halves. */
    r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi64x(0x0D0C090805040100));
    return _mm_set_epi64x(_mm256_extract_epi64(r256, 2), _mm256_extract_epi64(r256, 0));
  #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
    SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) b_abs;
    SIMDE_POWER_ALTIVEC_VECTOR(SIMDE_POWER_ALTIVEC_BOOL short) b_mask;
    /* |count| limited to its low byte. */
    b_abs = vec_and(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned short),
                                            vec_abs(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), b))),
                    vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0xFF)));
    /* Negative low byte of the count selects the right shift. */
    b_mask = vec_cmplt(vec_sl(b, vec_splat_u16(8)), vec_splat_s16(0));
    /* vec_sl/vec_sr shift modulo the width: zero lanes with |count| >= 16. */
    #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
      return vec_and(vec_sel(vec_sl(a, b_abs), vec_sr(a, b_abs), b_mask),
                     vec_cmple(b_abs, vec_splat_u16(15)));
    #else
      return vec_and(vec_sel(vec_sl(a, b_abs), vec_sr(a, b_abs), b_mask),
                     vec_cmplt(b_abs, vec_splats(HEDLEY_STATIC_CAST(unsigned short, 16))));
    #endif
  #else
    simde_uint16x8_private
      r_,
      a_ = simde_uint16x8_to_private(a);
    simde_int16x8_private b_ = simde_int16x8_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      /* NEON uses only the low byte of the shift count. */
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] = HEDLEY_STATIC_CAST(uint16_t,
                                        (abs(b_.values[i]) >= 16) ? 0 :
                                        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
                                        (a_.values[i] >> -b_.values[i]));
    }

    return simde_uint16x8_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshlq_u16
  #define vshlq_u16(a, b) simde_vshlq_u16((a), (b))
#endif
707
/* 128-bit variant: element-wise variable shift of 4 unsigned 32-bit lanes;
 * only the low byte of each count is honoured.  Any |count| >= 32 yields
 * 0 (logical shift). */
SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x4_t
simde_vshlq_u32 (const simde_uint32x4_t a, const simde_int32x4_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshlq_u32(a, b);
  #elif defined(SIMDE_X86_AVX2_NATIVE)
    /* Sign-extend the low byte of each count within its 32-bit lane. */
    __m128i b_ = _mm_srai_epi32(_mm_slli_epi32(b, 24), 24);
    return _mm_blendv_epi8(_mm_sllv_epi32(a, b_),
                           _mm_srlv_epi32(a, _mm_abs_epi32(b_)),
                           _mm_cmpgt_epi32(_mm_setzero_si128(), b_));
  #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
    SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) b_abs;
    SIMDE_POWER_ALTIVEC_VECTOR(SIMDE_POWER_ALTIVEC_BOOL int) b_mask;
    /* |count| limited to its low byte. */
    b_abs = vec_and(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int),
                                            vec_abs(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), b))),
                    vec_splats(HEDLEY_STATIC_CAST(unsigned int, 0xFF)));
    /* Negative low byte of the count selects the right shift. */
    b_mask = vec_cmplt(vec_sl(b, vec_splats(HEDLEY_STATIC_CAST(unsigned int, 24))), vec_splat_s32(0));
    /* vec_sl/vec_sr shift modulo the width: zero lanes with |count| >= 32. */
    return vec_and(vec_sel(vec_sl(a, b_abs), vec_sr(a, b_abs), b_mask),
                   vec_cmplt(b_abs, vec_splats(HEDLEY_STATIC_CAST(unsigned int, 32))));
  #else
    simde_uint32x4_private
      r_,
      a_ = simde_uint32x4_to_private(a);
    simde_int32x4_private b_ = simde_int32x4_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      /* NEON uses only the low byte of the shift count. */
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] = (abs(b_.values[i]) >= 32) ? 0 :
                     (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
                     (a_.values[i] >> -b_.values[i]);
    }

    return simde_uint32x4_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshlq_u32
  #define vshlq_u32(a, b) simde_vshlq_u32((a), (b))
#endif
748
/* 128-bit variant: element-wise variable shift of 2 unsigned 64-bit lanes;
 * only the low byte of each count is honoured.  Any |count| >= 64 yields
 * 0 (logical shift). */
SIMDE_FUNCTION_ATTRIBUTES
simde_uint64x2_t
simde_vshlq_u64 (const simde_uint64x2_t a, const simde_int64x2_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshlq_u64(a, b);
  #elif defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    __m128i zero = _mm_setzero_si128();
    /* Sign-extend the low byte of each count (64-bit srai needs AVX-512). */
    __m128i b_ = _mm_srai_epi64(_mm_slli_epi64(b, 56), 56);
    return _mm_blendv_epi8(_mm_sllv_epi64(a, b_),
                           _mm_srlv_epi64(a, _mm_sub_epi64(zero, b_)),
                           _mm_cmpgt_epi64(zero, b_));
  #elif defined(SIMDE_X86_AVX2_NATIVE)
    /* |low byte of count|, remaining bytes cleared; sllv/srlv already give
     * 0 for counts >= 64, which matches the unsigned NEON behaviour. */
    __m128i b_abs = _mm_and_si128(_mm_abs_epi8(b), _mm_set1_epi64x(0xFF));
    /* The blend mask tests the sign of the low byte of the count. */
    return _mm_blendv_epi8(_mm_sllv_epi64(a, b_abs),
                           _mm_srlv_epi64(a, b_abs),
                           _mm_cmpgt_epi64(_mm_setzero_si128(), _mm_slli_epi64(b, 56)));
  #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) b_abs;
    SIMDE_POWER_ALTIVEC_VECTOR(SIMDE_POWER_ALTIVEC_BOOL long long) b_mask;
    /* |count| limited to its low byte. */
    b_abs = vec_and(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long),
                                            vec_abs(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), b))),
                    vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 0xFF)));
    /* Negative low byte of the count selects the right shift. */
    b_mask = vec_cmplt(vec_sl(b, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 56))),
                       vec_splats(HEDLEY_STATIC_CAST(signed long long, 0)));
    HEDLEY_DIAGNOSTIC_PUSH
    #if defined(SIMDE_BUG_CLANG_46770)
      SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_
    #endif
    /* vec_sl/vec_sr shift modulo the width: zero lanes with |count| >= 64. */
    return vec_and(vec_sel(vec_sl(a, b_abs), vec_sr(a, b_abs), b_mask),
                   vec_cmplt(b_abs, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 64))));
    HEDLEY_DIAGNOSTIC_POP
  #else
    simde_uint64x2_private
      r_,
      a_ = simde_uint64x2_to_private(a);
    simde_int64x2_private b_ = simde_int64x2_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      /* NEON uses only the low byte of the shift count. */
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] = (llabs(b_.values[i]) >= 64) ? 0 :
                     (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
                     (a_.values[i] >> -b_.values[i]);
    }

    return simde_uint64x2_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshlq_u64
  #define vshlq_u64(a, b) simde_vshlq_u64((a), (b))
#endif
801
802 SIMDE_END_DECLS_
803 HEDLEY_DIAGNOSTIC_POP
804
805 #endif /* !defined(SIMDE_ARM_NEON_SHL_H) */
806