/* SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy,
 * modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Copyright:
 *   2020      Evan Nemerson <evan@nemerson.com>
 *   2020      Christopher Moore <moore@free.fr>
 */

#if !defined(SIMDE_ARM_NEON_SHL_H)
#define SIMDE_ARM_NEON_SHL_H

#include "types.h"

/* Notes from the implementer (Christopher Moore aka rosbif)
 *
 * I have tried to exactly reproduce the documented behaviour of the
 * ARM NEON shl and shlq intrinsics.
 * This is complicated for the following reasons:
 *
 * a) Negative shift counts shift right.
 *
 * b) Only the low byte of the shift count is used, but the shift count
 * is not limited to 8-bit values (-128 to 127).
 *
 * c) Intel SIMD is not nearly as complete as NEON and AltiVec.
 * There were no intrinsics with a vector shift count before AVX2, which
 * only has 32- and 64-bit logical shifts and only a 32-bit arithmetic
 * one. The others need AVX512. There are no 8-bit shift intrinsics at
 * all, even with a scalar shift count. It is surprising to need AVX2
 * and even AVX512 to implement a 64-bit vector operation.
 *
 * d) Many shift implementations, and the C standard, do not treat a
 * shift count >= the object's size in bits as one would expect.
 * (Personally I feel that > is silly but == can be useful.)
 *
 * Maybe it would be useful for SIMDe to have a flag enabling a fast
 * implementation where the result is only guaranteed for shift counts
 * conforming to the C standard.
 *
 * Note that even the C17/18 standard does not define the behaviour of
 * a right shift of a negative value.
 * However, Evan and I agree that all compilers likely to be used
 * implement this as an arithmetic right shift with sign extension.
 * If this is not the case, it could be replaced by a logical right shift
 * combined with complementing negative values before and after the shift.
 *
 * Some of the SIMD translations may be slower than the portable code,
 * particularly those for vectors with only one or two elements.
 * But I had fun writing them ;-)
 *
 */
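/* A minimal sketch of the per-lane behaviour described above (not part of
 * the implementation; the helper name is hypothetical). It mirrors the
 * portable int16_t path below and assumes an arithmetic right shift for
 * negative values:
 *
 *   static int16_t example_vshl_s16_lane(int16_t a, int16_t b) {
 *     int8_t count = (int8_t) b;                    // only the low byte of b is used
 *     if (count >= 0)
 *       return (count >= 16) ? 0 : (int16_t) (a << count);
 *     return (count <= -16) ? (int16_t) (a >> 15)   // shifting right by >= 16 leaves only the sign
 *                           : (int16_t) (a >> -count);
 *   }
 */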

HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_

SIMDE_FUNCTION_ATTRIBUTES
simde_int8x8_t
simde_vshl_s8 (const simde_int8x8_t a, const simde_int8x8_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshl_s8(a, b);
  #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    __m128i a128 = _mm_cvtepi8_epi16(_mm_movpi64_epi64(a));
    __m128i b128 = _mm_cvtepi8_epi16(_mm_movpi64_epi64(b));
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi16(a128, b128),
                                   _mm_srav_epi16(a128, _mm_abs_epi16(b128)),
                                   _mm_cmpgt_epi16(_mm_setzero_si128(), b128));
    return _mm_movepi64_pi64(_mm_cvtepi16_epi8(r128));
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    __m256i a256 = _mm256_cvtepi8_epi32(_mm_movpi64_epi64(a));
    __m256i b256 = _mm256_cvtepi8_epi32(_mm_movpi64_epi64(b));
    __m256i r256 = _mm256_blendv_epi8(_mm256_sllv_epi32(a256, b256),
                                      _mm256_srav_epi32(a256, _mm256_abs_epi32(b256)),
                                      _mm256_cmpgt_epi32(_mm256_setzero_si256(), b256));
    r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi32(0x0C080400));
    return _mm_set_pi32(_mm256_extract_epi32(r256, 4), _mm256_extract_epi32(r256, 0));
  #else
    simde_int8x8_private
      r_,
      a_ = simde_int8x8_to_private(a),
      b_ = simde_int8x8_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      r_.values[i] = HEDLEY_STATIC_CAST(int8_t,
        (b_.values[i] >=  0) ?
        (b_.values[i] >=  8) ?                   0 : (a_.values[i] <<  b_.values[i]) :
        (b_.values[i] <= -8) ? (a_.values[i] >> 7) : (a_.values[i] >> -b_.values[i]));
    }

    return simde_int8x8_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshl_s8
  #define vshl_s8(a, b) simde_vshl_s8((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_int16x4_t
simde_vshl_s16 (const simde_int16x4_t a, const simde_int16x4_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshl_s16(a, b);
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    __m128i a128 = _mm_cvtepi16_epi32(_mm_movpi64_epi64(a));
    __m128i b128 = _mm_cvtepi16_epi32(_mm_movpi64_epi64(b));
    b128 = _mm_srai_epi32(_mm_slli_epi32(b128, 24), 24);
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi32(a128, b128),
                                   _mm_srav_epi32(a128, _mm_abs_epi32(b128)),
                                   _mm_cmpgt_epi32(_mm_setzero_si128(), b128));
    return _mm_movepi64_pi64(_mm_shuffle_epi8(r128, _mm_set1_epi64x(0x0D0C090805040100)));
  #else
    simde_int16x4_private
      r_,
      a_ = simde_int16x4_to_private(a),
      b_ = simde_int16x4_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] = HEDLEY_STATIC_CAST(int16_t,
        (b_.values[i] >=   0) ?
        (b_.values[i] >=  16) ?                    0 : (a_.values[i] <<  b_.values[i]) :
        (b_.values[i] <= -16) ? (a_.values[i] >> 15) : (a_.values[i] >> -b_.values[i]));
    }

    return simde_int16x4_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshl_s16
  #define vshl_s16(a, b) simde_vshl_s16((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_int32x2_t
simde_vshl_s32 (const simde_int32x2_t a, const simde_int32x2_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshl_s32(a, b);
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    __m128i a128 = _mm_movpi64_epi64(a);
    __m128i b128 = _mm_movpi64_epi64(b);
    b128 = _mm_srai_epi32(_mm_slli_epi32(b128, 24), 24);
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi32(a128, b128),
                                   _mm_srav_epi32(a128, _mm_abs_epi32(b128)),
                                   _mm_cmpgt_epi32(_mm_setzero_si128(), b128));
    return _mm_movepi64_pi64(r128);
  #else
    simde_int32x2_private
      r_,
      a_ = simde_int32x2_to_private(a),
      b_ = simde_int32x2_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] =
        (b_.values[i] >=   0) ?
        (b_.values[i] >=  32) ?                    0 : (a_.values[i] <<  b_.values[i]) :
        (b_.values[i] <= -32) ? (a_.values[i] >> 31) : (a_.values[i] >> -b_.values[i]);
    }

    return simde_int32x2_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshl_s32
  #define vshl_s32(a, b) simde_vshl_s32((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_int64x1_t
simde_vshl_s64 (const simde_int64x1_t a, const simde_int64x1_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshl_s64(a, b);
  #elif defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    __m128i zero = _mm_setzero_si128();
    __m128i a128 = _mm_movpi64_epi64(a);
    __m128i b128 = _mm_movpi64_epi64(b);
    b128 = _mm_srai_epi64(_mm_slli_epi64(b128, 56), 56);
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi64(a128, b128),
                                   _mm_srav_epi64(a128, _mm_sub_epi64(zero, b128)),
                                   _mm_cmpgt_epi64(zero, b128));
    return _mm_movepi64_pi64(r128);
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
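    /* AVX2 has no 64-bit arithmetic right shift, so for negative elements of a
     * it is emulated as ~(~a >> n): XOR with the sign mask (maska), logical
     * shift, then XOR again. b_abs is the magnitude of b's low byte, and
     * shifting b left by 56 moves that byte's sign bit into the 64-bit sign
     * position for the blend mask. */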
    __m128i zero = _mm_setzero_si128();
    __m128i a128 = _mm_movpi64_epi64(a);
    __m128i b128 = _mm_movpi64_epi64(b);
    __m128i maska = _mm_cmpgt_epi64(zero, a128);
    __m128i b_abs = _mm_and_si128(_mm_abs_epi8(b128), _mm_set1_epi64x(0xFF));
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi64(a128, b_abs),
                                   _mm_xor_si128(_mm_srlv_epi64(_mm_xor_si128(a128, maska), b_abs), maska),
                                   _mm_cmpgt_epi64(zero, _mm_slli_epi64(b128, 56)));
    return _mm_movepi64_pi64(r128);
  #else
    simde_int64x1_private
      r_,
      a_ = simde_int64x1_to_private(a),
      b_ = simde_int64x1_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] =
        (b_.values[i] >=   0) ?
        (b_.values[i] >=  64) ?                    0 : (a_.values[i] <<  b_.values[i]) :
        (b_.values[i] <= -64) ? (a_.values[i] >> 63) : (a_.values[i] >> -b_.values[i]);
    }

    return simde_int64x1_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshl_s64
  #define vshl_s64(a, b) simde_vshl_s64((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint8x8_t
simde_vshl_u8 (const simde_uint8x8_t a, const simde_int8x8_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshl_u8(a, b);
  #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    __m128i a128 = _mm_cvtepu8_epi16(_mm_movpi64_epi64(a));
    __m128i b128 = _mm_cvtepi8_epi16(_mm_movpi64_epi64(b));
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi16(a128, b128),
                                   _mm_srlv_epi16(a128, _mm_abs_epi16(b128)),
                                   _mm_cmpgt_epi16(_mm_setzero_si128(), b128));
    return _mm_movepi64_pi64(_mm_cvtepi16_epi8(r128));
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    __m256i a256 = _mm256_cvtepu8_epi32(_mm_movpi64_epi64(a));
    __m256i b256 = _mm256_cvtepi8_epi32(_mm_movpi64_epi64(b));
    __m256i r256 = _mm256_blendv_epi8(_mm256_sllv_epi32(a256, b256),
                                      _mm256_srlv_epi32(a256, _mm256_abs_epi32(b256)),
                                      _mm256_cmpgt_epi32(_mm256_setzero_si256(), b256));
    r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi32(0x0C080400));
    return _mm_set_pi32(_mm256_extract_epi32(r256, 4), _mm256_extract_epi32(r256, 0));
  #else
    simde_uint8x8_private
      r_,
      a_ = simde_uint8x8_to_private(a);
    simde_int8x8_private b_ = simde_int8x8_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      r_.values[i] = HEDLEY_STATIC_CAST(uint8_t,
        (abs(b_.values[i]) >= 8) ? 0 :
            (b_.values[i]  >= 0) ? (a_.values[i] <<  b_.values[i]) :
                                   (a_.values[i] >> -b_.values[i]));
    }

    return simde_uint8x8_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshl_u8
  #define vshl_u8(a, b) simde_vshl_u8((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint16x4_t
simde_vshl_u16 (const simde_uint16x4_t a, const simde_int16x4_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshl_u16(a, b);
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    __m128i a128 = _mm_cvtepu16_epi32(_mm_movpi64_epi64(a));
    __m128i b128 = _mm_cvtepi16_epi32(_mm_movpi64_epi64(b));
    b128 = _mm_srai_epi32(_mm_slli_epi32(b128, 24), 24);
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi32(a128, b128),
                                   _mm_srlv_epi32(a128, _mm_abs_epi32(b128)),
                                   _mm_cmpgt_epi32(_mm_setzero_si128(), b128));
    return _mm_movepi64_pi64(_mm_shuffle_epi8(r128, _mm_set1_epi64x(0x0D0C090805040100)));
  #else
    simde_uint16x4_private
      r_,
      a_ = simde_uint16x4_to_private(a);
    simde_int16x4_private b_ = simde_int16x4_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] = HEDLEY_STATIC_CAST(uint16_t,
        (abs(b_.values[i]) >= 16) ? 0 :
            (b_.values[i]  >=  0) ? (a_.values[i] <<  b_.values[i]) :
                                    (a_.values[i] >> -b_.values[i]));
    }

    return simde_uint16x4_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshl_u16
  #define vshl_u16(a, b) simde_vshl_u16((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x2_t
simde_vshl_u32 (const simde_uint32x2_t a, const simde_int32x2_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshl_u32(a, b);
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    __m128i a128 = _mm_movpi64_epi64(a);
    __m128i b128 = _mm_movpi64_epi64(b);
    b128 = _mm_srai_epi32(_mm_slli_epi32(b128, 24), 24);
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi32(a128, b128),
                                   _mm_srlv_epi32(a128, _mm_abs_epi32(b128)),
                                   _mm_cmpgt_epi32(_mm_setzero_si128(), b128));
    return _mm_movepi64_pi64(r128);
  #else
    simde_uint32x2_private
      r_,
      a_ = simde_uint32x2_to_private(a);
    simde_int32x2_private b_ = simde_int32x2_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] =
        (abs(b_.values[i]) >= 32) ? 0 :
            (b_.values[i]  >=  0) ? (a_.values[i] <<  b_.values[i]) :
                                    (a_.values[i] >> -b_.values[i]);
    }

    return simde_uint32x2_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshl_u32
  #define vshl_u32(a, b) simde_vshl_u32((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint64x1_t
simde_vshl_u64 (const simde_uint64x1_t a, const simde_int64x1_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshl_u64(a, b);
  #elif defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    __m128i zero = _mm_setzero_si128();
    __m128i a128 = _mm_movpi64_epi64(a);
    __m128i b128 = _mm_movpi64_epi64(b);
    b128 = _mm_srai_epi64(_mm_slli_epi64(b128, 56), 56);
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi64(a128, b128),
                                   _mm_srlv_epi64(a128, _mm_sub_epi64(zero, b128)),
                                   _mm_cmpgt_epi64(zero, b128));
    return _mm_movepi64_pi64(r128);
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    __m128i a128 = _mm_movpi64_epi64(a);
    __m128i b128 = _mm_movpi64_epi64(b);
    __m128i b_abs = _mm_and_si128(_mm_abs_epi8(b128), _mm_set1_epi64x(0xFF));
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi64(a128, b_abs),
                                   _mm_srlv_epi64(a128, b_abs),
                                   _mm_cmpgt_epi64(_mm_setzero_si128(), _mm_slli_epi64(b128, 56)));
    return _mm_movepi64_pi64(r128);
  #else
    simde_uint64x1_private
      r_,
      a_ = simde_uint64x1_to_private(a);
    simde_int64x1_private b_ = simde_int64x1_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] =
        (llabs(b_.values[i]) >= 64) ? 0 :
              (b_.values[i]  >=  0) ? (a_.values[i] <<  b_.values[i]) :
                                      (a_.values[i] >> -b_.values[i]);
    }

    return simde_uint64x1_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshl_u64
  #define vshl_u64(a, b) simde_vshl_u64((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_int8x16_t
simde_vshlq_s8 (const simde_int8x16_t a, const simde_int8x16_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshlq_s8(a, b);
  #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    __m256i a256 = _mm256_cvtepi8_epi16(a);
    __m256i b256 = _mm256_cvtepi8_epi16(b);
    __m256i r256 = _mm256_blendv_epi8(_mm256_sllv_epi16(a256, b256),
                                      _mm256_srav_epi16(a256, _mm256_abs_epi16(b256)),
                                      _mm256_cmpgt_epi16(_mm256_setzero_si256(), b256));
    return _mm256_cvtepi16_epi8(r256);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    vector signed char a_shl, a_shr;
    vector unsigned char b_abs, b_max;
    vector bool char b_mask;
    b_abs = HEDLEY_REINTERPRET_CAST(vector unsigned char, vec_abs(b));
    b_max = vec_splat_u8(7);
    #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
      a_shl = vec_and(vec_sl(a, b_abs), vec_cmple(b_abs, b_max));
    #else
      a_shl = vec_and(vec_sl(a, b_abs), vec_cmplt(b_abs, vec_splat_u8(8)));
    #endif
    a_shr = vec_sra(a, vec_min(b_abs, b_max));
    b_mask = vec_cmplt(b, vec_splat_s8(0));
    return vec_sel(a_shl, a_shr, b_mask);
  #else
    simde_int8x16_private
      r_,
      a_ = simde_int8x16_to_private(a),
      b_ = simde_int8x16_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      r_.values[i] = HEDLEY_STATIC_CAST(int8_t,
        (b_.values[i] >=  0) ?
        (b_.values[i] >=  8) ?                   0 : (a_.values[i] <<  b_.values[i]) :
        (b_.values[i] <= -8) ? (a_.values[i] >> 7) : (a_.values[i] >> -b_.values[i]));
    }

    return simde_int8x16_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshlq_s8
  #define vshlq_s8(a, b) simde_vshlq_s8((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_int16x8_t
simde_vshlq_s16 (const simde_int16x8_t a, const simde_int16x8_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshlq_s16(a, b);
  #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    __m128i b_ = _mm_srai_epi16(_mm_slli_epi16(b, 8), 8);
    return _mm_blendv_epi8(_mm_sllv_epi16(a, b_),
                           _mm_srav_epi16(a, _mm_abs_epi16(b_)),
                           _mm_cmpgt_epi16(_mm_setzero_si128(), b_));
  #elif defined(SIMDE_X86_AVX2_NATIVE)
    __m256i a256 = _mm256_cvtepi16_epi32(a);
    __m256i b256 = _mm256_cvtepi16_epi32(b);
    b256 = _mm256_srai_epi32(_mm256_slli_epi32(b256, 24), 24);
    __m256i r256 = _mm256_blendv_epi8(_mm256_sllv_epi32(a256, b256),
                                      _mm256_srav_epi32(a256, _mm256_abs_epi32(b256)),
                                      _mm256_cmpgt_epi32(_mm256_setzero_si256(), b256));
    r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi64x(0x0D0C090805040100));
    return _mm_set_epi64x(_mm256_extract_epi64(r256, 2), _mm256_extract_epi64(r256, 0));
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    vector signed short a_shl, a_shr;
    vector unsigned short b_abs, b_max;
    vector bool short b_mask;
    b_abs = vec_and(HEDLEY_REINTERPRET_CAST(vector unsigned short,
                                            vec_abs(HEDLEY_REINTERPRET_CAST(vector signed char, b))),
                    vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0xFF)));
    b_max = vec_splat_u16(15);
    #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
      a_shl = vec_and(vec_sl(a, b_abs), vec_cmple(b_abs, b_max));
    #else
      a_shl = vec_and(vec_sl(a, b_abs), vec_cmplt(b_abs, vec_splats(HEDLEY_STATIC_CAST(unsigned short, 16))));
    #endif
    a_shr = vec_sra(a, vec_min(b_abs, b_max));
    b_mask = vec_cmplt(vec_sl(b, vec_splat_u16(8)), vec_splat_s16(0));
    return vec_sel(a_shl, a_shr, b_mask);
  #else
    simde_int16x8_private
      r_,
      a_ = simde_int16x8_to_private(a),
      b_ = simde_int16x8_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] = HEDLEY_STATIC_CAST(int16_t,
        (b_.values[i] >=   0) ?
        (b_.values[i] >=  16) ?                    0 : (a_.values[i] <<  b_.values[i]) :
        (b_.values[i] <= -16) ? (a_.values[i] >> 15) : (a_.values[i] >> -b_.values[i]));
    }

    return simde_int16x8_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshlq_s16
  #define vshlq_s16(a, b) simde_vshlq_s16((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_int32x4_t
simde_vshlq_s32 (const simde_int32x4_t a, const simde_int32x4_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshlq_s32(a, b);
  #elif defined(SIMDE_X86_AVX2_NATIVE)
    __m128i b_ = _mm_srai_epi32(_mm_slli_epi32(b, 24), 24);
    return _mm_blendv_epi8(_mm_sllv_epi32(a, b_),
                           _mm_srav_epi32(a, _mm_abs_epi32(b_)),
                           _mm_cmpgt_epi32(_mm_setzero_si128(), b_));
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    vector signed int a_shl, a_shr;
    vector unsigned int b_abs, b_max;
    vector bool int b_mask;
    b_abs = vec_and(HEDLEY_REINTERPRET_CAST(vector unsigned int,
                                            vec_abs(HEDLEY_REINTERPRET_CAST(vector signed char, b))),
                    vec_splats(HEDLEY_STATIC_CAST(unsigned int, 0xFF)));
    b_max = vec_splats(HEDLEY_STATIC_CAST(unsigned int, 31));
    #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
      a_shl = vec_and(vec_sl(a, b_abs), vec_cmple(b_abs, b_max));
    #else
      a_shl = vec_and(vec_sl(a, b_abs), vec_cmplt(b_abs, vec_splats(HEDLEY_STATIC_CAST(unsigned int, 32))));
    #endif
    a_shr = vec_sra(a, vec_min(b_abs, b_max));
    b_mask = vec_cmplt(vec_sl(b, vec_splats(HEDLEY_STATIC_CAST(unsigned int, 24))),
                       vec_splat_s32(0));
    return vec_sel(a_shl, a_shr, b_mask);
  #else
    simde_int32x4_private
      r_,
      a_ = simde_int32x4_to_private(a),
      b_ = simde_int32x4_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] =
        (b_.values[i] >=   0) ?
        (b_.values[i] >=  32) ?                    0 : (a_.values[i] <<  b_.values[i]) :
        (b_.values[i] <= -32) ? (a_.values[i] >> 31) : (a_.values[i] >> -b_.values[i]);
    }

    return simde_int32x4_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshlq_s32
  #define vshlq_s32(a, b) simde_vshlq_s32((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_int64x2_t
simde_vshlq_s64 (const simde_int64x2_t a, const simde_int64x2_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshlq_s64(a, b);
  #elif defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    __m128i zero = _mm_setzero_si128();
    __m128i b_ = _mm_srai_epi64(_mm_slli_epi64(b, 56), 56);
    return _mm_blendv_epi8(_mm_sllv_epi64(a, b_),
                           _mm_srav_epi64(a, _mm_sub_epi64(zero, b_)),
                           _mm_cmpgt_epi64(zero, b_));
  #elif defined(SIMDE_X86_AVX2_NATIVE)
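    /* Same trick as in simde_vshl_s64 above: the missing 64-bit arithmetic
     * right shift is emulated as ~(~a >> n) for negative elements of a. */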
    __m128i zero = _mm_setzero_si128();
    __m128i maska = _mm_cmpgt_epi64(zero, a);
    __m128i b_abs = _mm_and_si128(_mm_abs_epi8(b), _mm_set1_epi64x(0xFF));
    return _mm_blendv_epi8(_mm_sllv_epi64(a, b_abs),
                           _mm_xor_si128(_mm_srlv_epi64(_mm_xor_si128(a, maska), b_abs), maska),
                           _mm_cmpgt_epi64(zero, _mm_slli_epi64(b, 56)));
  #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    vector signed long long a_shl, a_shr;
    vector unsigned long long b_abs, b_max;
    vector bool long long b_mask;
    b_abs = vec_and(HEDLEY_REINTERPRET_CAST(vector unsigned long long,
                                            vec_abs(HEDLEY_REINTERPRET_CAST(vector signed char, b))),
                    vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 0xFF)));
    b_max = vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 63));
    a_shl = vec_and(vec_sl(a, b_abs), vec_cmple(b_abs, b_max));
    a_shr = vec_sra(a, vec_min(b_abs, b_max));
    b_mask = vec_cmplt(vec_sl(b, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 56))),
                       vec_splats(HEDLEY_STATIC_CAST(signed long long, 0)));
    return vec_sel(a_shl, a_shr, b_mask);
  #else
    simde_int64x2_private
      r_,
      a_ = simde_int64x2_to_private(a),
      b_ = simde_int64x2_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] =
        (b_.values[i] >=   0) ?
        (b_.values[i] >=  64) ?                    0 : (a_.values[i] <<  b_.values[i]) :
        (b_.values[i] <= -64) ? (a_.values[i] >> 63) : (a_.values[i] >> -b_.values[i]);
    }

    return simde_int64x2_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshlq_s64
  #define vshlq_s64(a, b) simde_vshlq_s64((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint8x16_t
simde_vshlq_u8 (const simde_uint8x16_t a, const simde_int8x16_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshlq_u8(a, b);
  #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    __m256i a256 = _mm256_cvtepu8_epi16(a);
    __m256i b256 = _mm256_cvtepi8_epi16(b);
    __m256i r256 = _mm256_blendv_epi8(_mm256_sllv_epi16(a256, b256),
                                      _mm256_srlv_epi16(a256, _mm256_abs_epi16(b256)),
                                      _mm256_cmpgt_epi16(_mm256_setzero_si256(), b256));
    return _mm256_cvtepi16_epi8(r256);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    vector unsigned char b_abs;
    vector bool char b_mask;
    b_abs = HEDLEY_REINTERPRET_CAST(vector unsigned char, vec_abs(b));
    b_mask = vec_cmplt(b, vec_splat_s8(0));
    return vec_and(vec_sel(vec_sl(a, b_abs), vec_sr(a, b_abs), b_mask),
                   vec_cmplt(b_abs, vec_splat_u8(8)));
  #else
    simde_uint8x16_private
      r_,
      a_ = simde_uint8x16_to_private(a);
    simde_int8x16_private b_ = simde_int8x16_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      r_.values[i] = HEDLEY_STATIC_CAST(uint8_t,
        (abs(b_.values[i]) >= 8) ? 0 :
            (b_.values[i]  >= 0) ? (a_.values[i] <<  b_.values[i]) :
                                   (a_.values[i] >> -b_.values[i]));
    }

    return simde_uint8x16_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshlq_u8
  #define vshlq_u8(a, b) simde_vshlq_u8((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint16x8_t
simde_vshlq_u16 (const simde_uint16x8_t a, const simde_int16x8_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshlq_u16(a, b);
  #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    __m128i b_ = _mm_srai_epi16(_mm_slli_epi16(b, 8), 8);
    return _mm_blendv_epi8(_mm_sllv_epi16(a, b_),
                           _mm_srlv_epi16(a, _mm_abs_epi16(b_)),
                           _mm_cmpgt_epi16(_mm_setzero_si128(), b_));
  #elif defined(SIMDE_X86_AVX2_NATIVE)
    __m256i a256 = _mm256_cvtepu16_epi32(a);
    __m256i b256 = _mm256_cvtepi16_epi32(b);
    b256 = _mm256_srai_epi32(_mm256_slli_epi32(b256, 24), 24);
    __m256i r256 = _mm256_blendv_epi8(_mm256_sllv_epi32(a256, b256),
                                      _mm256_srlv_epi32(a256, _mm256_abs_epi32(b256)),
                                      _mm256_cmpgt_epi32(_mm256_setzero_si256(), b256));
    r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi64x(0x0D0C090805040100));
    return _mm_set_epi64x(_mm256_extract_epi64(r256, 2), _mm256_extract_epi64(r256, 0));
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    vector unsigned short b_abs;
    vector bool short b_mask;
    b_abs = vec_and(HEDLEY_REINTERPRET_CAST(vector unsigned short,
                                            vec_abs(HEDLEY_REINTERPRET_CAST(vector signed char, b))),
                    vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0xFF)));
    b_mask = vec_cmplt(vec_sl(b, vec_splat_u16(8)), vec_splat_s16(0));
    #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
      return vec_and(vec_sel(vec_sl(a, b_abs), vec_sr(a, b_abs), b_mask),
                     vec_cmple(b_abs, vec_splat_u16(15)));
    #else
      return vec_and(vec_sel(vec_sl(a, b_abs), vec_sr(a, b_abs), b_mask),
                     vec_cmplt(b_abs, vec_splats(HEDLEY_STATIC_CAST(unsigned short, 16))));
    #endif
  #else
    simde_uint16x8_private
      r_,
      a_ = simde_uint16x8_to_private(a);
    simde_int16x8_private b_ = simde_int16x8_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] = HEDLEY_STATIC_CAST(uint16_t,
        (abs(b_.values[i]) >= 16) ? 0 :
            (b_.values[i]  >=  0) ? (a_.values[i] <<  b_.values[i]) :
                                    (a_.values[i] >> -b_.values[i]));
    }

    return simde_uint16x8_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshlq_u16
  #define vshlq_u16(a, b) simde_vshlq_u16((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x4_t
simde_vshlq_u32 (const simde_uint32x4_t a, const simde_int32x4_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshlq_u32(a, b);
  #elif defined(SIMDE_X86_AVX2_NATIVE)
    __m128i b_ = _mm_srai_epi32(_mm_slli_epi32(b, 24), 24);
    return _mm_blendv_epi8(_mm_sllv_epi32(a, b_),
                           _mm_srlv_epi32(a, _mm_abs_epi32(b_)),
                           _mm_cmpgt_epi32(_mm_setzero_si128(), b_));
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    vector unsigned int b_abs;
    vector bool int b_mask;
    b_abs = vec_and(HEDLEY_REINTERPRET_CAST(vector unsigned int,
                                            vec_abs(HEDLEY_REINTERPRET_CAST(vector signed char, b))),
                    vec_splats(HEDLEY_STATIC_CAST(unsigned int, 0xFF)));
    b_mask = vec_cmplt(vec_sl(b, vec_splats(HEDLEY_STATIC_CAST(unsigned int, 24))), vec_splat_s32(0));
    return vec_and(vec_sel(vec_sl(a, b_abs), vec_sr(a, b_abs), b_mask),
                   vec_cmplt(b_abs, vec_splats(HEDLEY_STATIC_CAST(unsigned int, 32))));
  #else
    simde_uint32x4_private
      r_,
      a_ = simde_uint32x4_to_private(a);
    simde_int32x4_private b_ = simde_int32x4_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] = (abs(b_.values[i]) >= 32) ? 0 :
                         (b_.values[i]  >=  0) ? (a_.values[i] <<  b_.values[i]) :
                                                 (a_.values[i] >> -b_.values[i]);
    }

    return simde_uint32x4_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshlq_u32
  #define vshlq_u32(a, b) simde_vshlq_u32((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint64x2_t
simde_vshlq_u64 (const simde_uint64x2_t a, const simde_int64x2_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vshlq_u64(a, b);
  #elif defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    __m128i zero = _mm_setzero_si128();
    __m128i b_ = _mm_srai_epi64(_mm_slli_epi64(b, 56), 56);
    return _mm_blendv_epi8(_mm_sllv_epi64(a, b_),
                           _mm_srlv_epi64(a, _mm_sub_epi64(zero, b_)),
                           _mm_cmpgt_epi64(zero, b_));
  #elif defined(SIMDE_X86_AVX2_NATIVE)
    __m128i b_abs = _mm_and_si128(_mm_abs_epi8(b), _mm_set1_epi64x(0xFF));
    return _mm_blendv_epi8(_mm_sllv_epi64(a, b_abs),
                           _mm_srlv_epi64(a, b_abs),
                           _mm_cmpgt_epi64(_mm_setzero_si128(), _mm_slli_epi64(b, 56)));
  #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    vector unsigned long long b_abs;
    vector bool long long b_mask;
    b_abs = vec_and(HEDLEY_REINTERPRET_CAST(vector unsigned long long,
                                            vec_abs(HEDLEY_REINTERPRET_CAST(vector signed char, b))),
                    vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 0xFF)));
    b_mask = vec_cmplt(vec_sl(b, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 56))),
                       vec_splats(HEDLEY_STATIC_CAST(signed long long, 0)));
    return vec_and(vec_sel(vec_sl(a, b_abs), vec_sr(a, b_abs), b_mask),
                   vec_cmplt(b_abs, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 64))));
  #else
    simde_uint64x2_private
      r_,
      a_ = simde_uint64x2_to_private(a);
    simde_int64x2_private b_ = simde_int64x2_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] = (llabs(b_.values[i]) >= 64) ? 0 :
                           (b_.values[i]  >=  0) ? (a_.values[i] <<  b_.values[i]) :
                                                   (a_.values[i] >> -b_.values[i]);
    }

    return simde_uint64x2_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vshlq_u64
  #define vshlq_u64(a, b) simde_vshlq_u64((a), (b))
#endif

SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP

#endif /* !defined(SIMDE_ARM_NEON_SHL_H) */