/* SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy,
 * modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Copyright:
 *   2020 Evan Nemerson <evan@nemerson.com>
 *   2020 Christopher Moore <moore@free.fr>
 */

#if !defined(SIMDE_ARM_NEON_RSHL_H)
#define SIMDE_ARM_NEON_RSHL_H

#include "types.h"
/* Notes from the implementer (Christopher Moore aka rosbif)
 *
 * I have tried to exactly reproduce the documented behaviour of the
 * ARM NEON rshl and rshlq intrinsics.
 * This is complicated for the following reasons:
 *
 * a) Negative shift counts shift right.
 *
 * b) Only the low byte of the shift count is used but the shift count
 * is not limited to 8-bit values (-128 to 127).
 *
 * c) Overflow must be avoided when rounding, together with sign change
 * warnings/errors in the C versions.
 *
 * d) Intel SIMD is not nearly as complete as NEON and AltiVec.
 * There were no intrinsics with a vector shift count before AVX2, which
 * only has 32 and 64-bit logical ones and only a 32-bit arithmetic
 * one. The others need AVX512. There are no 8-bit shift intrinsics at
 * all, even with a scalar shift count. It is surprising to need AVX2
 * and even AVX512 to implement a 64-bit vector operation.
 *
 * e) Many shift implementations, and the C standard, do not treat a
 * shift count >= the object's size in bits as one would expect.
 * (Personally I feel that > is silly but == can be useful.)
 *
 * Note that even the C17/18 standard does not define the behaviour of
 * a right shift of a negative value.
 * However Evan and I agree that all compilers likely to be used
 * implement this as an arithmetic right shift with sign extension.
 * If this is not the case it could be replaced by a logical right shift
 * if negative values are complemented before and after the shift.
 *
 * Some of the SIMD translations may be slower than the portable code,
 * particularly those for vectors with only one or two elements.
 * But I had fun writing them ;-)
 */
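
/* A small worked example of the rounding behaviour described above
 * (informal sketch, assuming an arithmetic right shift of negative
 * values as discussed in the notes): for an element a and a negative
 * (i.e. right) shift count b, the rounded result is
 *
 *   (a + (1 << (-b - 1))) >> -b
 *
 * e.g. a =  5, b = -1 gives ( 5 + 1) >> 1 =  3  ( 2.5 rounds to  3)
 *      a = -5, b = -1 gives (-5 + 1) >> 1 = -2  (-2.5 rounds to -2)
 *
 * i.e. round to nearest, with ties rounding towards positive infinity. */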

HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_

SIMDE_FUNCTION_ATTRIBUTES
simde_int8x8_t
simde_vrshl_s8 (const simde_int8x8_t a, const simde_int8x8_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vrshl_s8(a, b);
  #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    const __m128i zero = _mm_setzero_si128();
    const __m128i ff = _mm_cmpeq_epi16(zero, zero);
    __m128i a128 = _mm_cvtepi8_epi16(_mm_movpi64_epi64(a));
    __m128i b128 = _mm_cvtepi8_epi16(_mm_movpi64_epi64(b));
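    /* ff is all-ones, i.e. -1, so (b128 ^ ff) is ~b128 = -b128 - 1.  For a
     * negative count this arithmetic-shifts a right by one bit less than
     * |b|; subtracting ff (i.e. adding 1) injects the rounding bit, and the
     * final shift by 1 in the blend below completes the rounded right shift
     * without the usual (a + (1 << (-b - 1))) addition being able to
     * overflow.  The unsigned variants use the same construction with
     * logical shifts. */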
    __m128i a128_shr = _mm_srav_epi16(a128, _mm_xor_si128(b128, ff));
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi16(a128, b128),
                                   _mm_srai_epi16(_mm_sub_epi16(a128_shr, ff), 1),
                                   _mm_cmpgt_epi16(zero, b128));
    return _mm_movepi64_pi64(_mm_cvtepi16_epi8(r128));
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    const __m256i zero = _mm256_setzero_si256();
    const __m256i ff = _mm256_cmpeq_epi32(zero, zero);
    __m256i a256 = _mm256_cvtepi8_epi32(_mm_movpi64_epi64(a));
    __m256i b256 = _mm256_cvtepi8_epi32(_mm_movpi64_epi64(b));
    __m256i a256_shr = _mm256_srav_epi32(a256, _mm256_xor_si256(b256, ff));
    __m256i r256 = _mm256_blendv_epi8(_mm256_sllv_epi32(a256, b256),
                                      _mm256_srai_epi32(_mm256_sub_epi32(a256_shr, ff), 1),
                                      _mm256_cmpgt_epi32(zero, b256));
    r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi32(0x0C080400));
    return _mm_set_pi32(_mm256_extract_epi32(r256, 4), _mm256_extract_epi32(r256, 0));
  #else
    simde_int8x8_private
      r_,
      a_ = simde_int8x8_to_private(a),
      b_ = simde_int8x8_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      r_.values[i] = HEDLEY_STATIC_CAST(int8_t,
        (abs(b_.values[i]) >= 8) ? 0 :
        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
        ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i]));
    }

    return simde_int8x8_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vrshl_s8
  #define vrshl_s8(a, b) simde_vrshl_s8((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_int16x4_t
simde_vrshl_s16 (const simde_int16x4_t a, const simde_int16x4_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vrshl_s16(a, b);
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    const __m128i zero = _mm_setzero_si128();
    const __m128i ff = _mm_cmpeq_epi32(zero, zero);
    __m128i a128 = _mm_cvtepi16_epi32(_mm_movpi64_epi64(a));
    __m128i b128 = _mm_cvtepi16_epi32(_mm_movpi64_epi64(b));
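    /* Only the low byte of each shift count is significant; sign-extend it
     * from bit 7 so that, for example, a count of 0x1FF behaves as -1
     * (a rounding right shift by one) rather than as a huge left shift. */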
    b128 = _mm_srai_epi32(_mm_slli_epi32(b128, 24), 24);
    __m128i a128_shr = _mm_srav_epi32(a128, _mm_xor_si128(b128, ff));
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi32(a128, b128),
                                   _mm_srai_epi32(_mm_sub_epi32(a128_shr, ff), 1),
                                   _mm_cmpgt_epi32(zero, b128));
    return _mm_movepi64_pi64(_mm_shuffle_epi8(r128, _mm_set1_epi64x(0x0D0C090805040100)));
  #else
    simde_int16x4_private
      r_,
      a_ = simde_int16x4_to_private(a),
      b_ = simde_int16x4_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] = HEDLEY_STATIC_CAST(int16_t,
        (abs(b_.values[i]) >= 16) ? 0 :
        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
        ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i]));
    }

    return simde_int16x4_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vrshl_s16
  #define vrshl_s16(a, b) simde_vrshl_s16((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_int32x2_t
simde_vrshl_s32 (const simde_int32x2_t a, const simde_int32x2_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vrshl_s32(a, b);
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    const __m128i zero = _mm_setzero_si128();
    const __m128i ff = _mm_cmpeq_epi32(zero, zero);
    __m128i a128 = _mm_movpi64_epi64(a);
    __m128i b128 = _mm_movpi64_epi64(b);
    b128 = _mm_srai_epi32(_mm_slli_epi32(b128, 24), 24);
    __m128i a128_shr = _mm_srav_epi32(a128, _mm_xor_si128(b128, ff));
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi32(a128, b128),
                                   _mm_srai_epi32(_mm_sub_epi32(a128_shr, ff), 1),
                                   _mm_cmpgt_epi32(zero, b128));
    return _mm_movepi64_pi64(r128);
  #else
    simde_int32x2_private
      r_,
      a_ = simde_int32x2_to_private(a),
      b_ = simde_int32x2_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] = HEDLEY_STATIC_CAST(int32_t,
        (abs(b_.values[i]) >= 32) ? 0 :
        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
        ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i]));
    }

    return simde_int32x2_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vrshl_s32
  #define vrshl_s32(a, b) simde_vrshl_s32((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_int64x1_t
simde_vrshl_s64 (const simde_int64x1_t a, const simde_int64x1_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vrshl_s64(a, b);
  #elif defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    const __m128i zero = _mm_setzero_si128();
    const __m128i ff = _mm_cmpeq_epi64(zero, zero);
    __m128i a128 = _mm_movpi64_epi64(a);
    __m128i b128 = _mm_movpi64_epi64(b);
    b128 = _mm_srai_epi64(_mm_slli_epi64(b128, 56), 56);
    __m128i a128_shr = _mm_srav_epi64(a128, _mm_xor_si128(b128, ff));
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi64(a128, b128),
                                   _mm_srai_epi64(_mm_sub_epi64(a128_shr, ff), 1),
                                   _mm_cmpgt_epi64(zero, b128));
    return _mm_movepi64_pi64(r128);
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    const __m128i zero = _mm_setzero_si128();
    const __m128i ones = _mm_set1_epi64x(1);
    __m128i a128 = _mm_movpi64_epi64(a);
    __m128i b128 = _mm_movpi64_epi64(b);
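    /* AVX2 has no 64-bit arithmetic variable shift, so emulate one: maska
     * holds the sign of each element, and XORing with it before and after a
     * logical right shift behaves like an arithmetic shift (the
     * "complement before and after" trick from the header comment).
     * a128_rnd captures bit (|b| - 1) of a, i.e. the rounding bit, which is
     * added back after the shift. */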
    __m128i maska = _mm_cmpgt_epi64(zero, a128);
    __m128i b128_abs = _mm_and_si128(_mm_abs_epi8(b128), _mm_set1_epi64x(0xFF));
    __m128i a128_rnd = _mm_and_si128(_mm_srlv_epi64(a128, _mm_sub_epi64(b128_abs, ones)), ones);
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi64(a128, b128_abs),
                                   _mm_add_epi64(_mm_xor_si128(_mm_srlv_epi64(_mm_xor_si128(a128, maska), b128_abs), maska), a128_rnd),
                                   _mm_cmpgt_epi64(zero, _mm_slli_epi64(b128, 56)));
    return _mm_movepi64_pi64(r128);
  #else
    simde_int64x1_private
      r_,
      a_ = simde_int64x1_to_private(a),
      b_ = simde_int64x1_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] = HEDLEY_STATIC_CAST(int64_t,
        (llabs(b_.values[i]) >= 64) ? 0 :
        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
        ((a_.values[i] + (INT64_C(1) << (-b_.values[i] - 1))) >> -b_.values[i]));
    }

    return simde_int64x1_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vrshl_s64
  #define vrshl_s64(a, b) simde_vrshl_s64((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint8x8_t
simde_vrshl_u8 (const simde_uint8x8_t a, const simde_int8x8_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vrshl_u8(a, b);
  #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    const __m128i zero = _mm_setzero_si128();
    const __m128i ff = _mm_cmpeq_epi16(zero, zero);
    __m128i a128 = _mm_cvtepu8_epi16(_mm_movpi64_epi64(a));
    __m128i b128 = _mm_cvtepi8_epi16(_mm_movpi64_epi64(b));
    __m128i a128_shr = _mm_srlv_epi16(a128, _mm_xor_si128(b128, ff));
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi16(a128, b128),
                                   _mm_srli_epi16(_mm_sub_epi16(a128_shr, ff), 1),
                                   _mm_cmpgt_epi16(zero, b128));
    return _mm_movepi64_pi64(_mm_cvtepi16_epi8(r128));
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    const __m256i zero = _mm256_setzero_si256();
    const __m256i ff = _mm256_cmpeq_epi32(zero, zero);
    __m256i a256 = _mm256_cvtepu8_epi32(_mm_movpi64_epi64(a));
    __m256i b256 = _mm256_cvtepi8_epi32(_mm_movpi64_epi64(b));
    __m256i a256_shr = _mm256_srlv_epi32(a256, _mm256_xor_si256(b256, ff));
    __m256i r256 = _mm256_blendv_epi8(_mm256_sllv_epi32(a256, b256),
                                      _mm256_srli_epi32(_mm256_sub_epi32(a256_shr, ff), 1),
                                      _mm256_cmpgt_epi32(zero, b256));
    r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi32(0x0C080400));
    return _mm_set_pi32(_mm256_extract_epi32(r256, 4), _mm256_extract_epi32(r256, 0));
  #else
    simde_uint8x8_private
      r_,
      a_ = simde_uint8x8_to_private(a);
    simde_int8x8_private b_ = simde_int8x8_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      r_.values[i] = HEDLEY_STATIC_CAST(uint8_t,
        (b_.values[i] >= 8) ? 0 :
        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
        (b_.values[i] >= -8) ? (((b_.values[i] == -8) ? 0 : (a_.values[i] >> -b_.values[i])) + ((a_.values[i] >> (-b_.values[i] - 1)) & 1)) :
        0);
    }

    return simde_uint8x8_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vrshl_u8
  #define vrshl_u8(a, b) simde_vrshl_u8((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint16x4_t
simde_vrshl_u16 (const simde_uint16x4_t a, const simde_int16x4_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vrshl_u16(a, b);
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    const __m128i zero = _mm_setzero_si128();
    const __m128i ff = _mm_cmpeq_epi32(zero, zero);
    __m128i a128 = _mm_cvtepu16_epi32(_mm_movpi64_epi64(a));
    __m128i b128 = _mm_cvtepi16_epi32(_mm_movpi64_epi64(b));
    b128 = _mm_srai_epi32(_mm_slli_epi32(b128, 24), 24);
    __m128i a128_shr = _mm_srlv_epi32(a128, _mm_xor_si128(b128, ff));
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi32(a128, b128),
                                   _mm_srli_epi32(_mm_sub_epi32(a128_shr, ff), 1),
                                   _mm_cmpgt_epi32(zero, b128));
    return _mm_movepi64_pi64(_mm_shuffle_epi8(r128, _mm_set1_epi64x(0x0D0C090805040100)));
  #else
    simde_uint16x4_private
      r_,
      a_ = simde_uint16x4_to_private(a);
    simde_int16x4_private b_ = simde_int16x4_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] = HEDLEY_STATIC_CAST(uint16_t,
        (b_.values[i] >= 16) ? 0 :
        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
        (b_.values[i] >= -16) ? (((b_.values[i] == -16) ? 0 : (a_.values[i] >> -b_.values[i])) + ((a_.values[i] >> (-b_.values[i] - 1)) & 1)) :
        0);
    }

    return simde_uint16x4_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vrshl_u16
  #define vrshl_u16(a, b) simde_vrshl_u16((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x2_t
simde_vrshl_u32 (const simde_uint32x2_t a, const simde_int32x2_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vrshl_u32(a, b);
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    const __m128i zero = _mm_setzero_si128();
    const __m128i ff = _mm_cmpeq_epi32(zero, zero);
    __m128i a128 = _mm_movpi64_epi64(a);
    __m128i b128 = _mm_movpi64_epi64(b);
    b128 = _mm_srai_epi32(_mm_slli_epi32(b128, 24), 24);
    __m128i a128_shr = _mm_srlv_epi32(a128, _mm_xor_si128(b128, ff));
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi32(a128, b128),
                                   _mm_srli_epi32(_mm_sub_epi32(a128_shr, ff), 1),
                                   _mm_cmpgt_epi32(zero, b128));
    return _mm_movepi64_pi64(r128);
  #else
    simde_uint32x2_private
      r_,
      a_ = simde_uint32x2_to_private(a);
    simde_int32x2_private b_ = simde_int32x2_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] =
        (b_.values[i] >= 32) ? 0 :
        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
        (b_.values[i] >= -32) ? (((b_.values[i] == -32) ? 0 : (a_.values[i] >> -b_.values[i])) + ((a_.values[i] >> (-b_.values[i] - 1)) & 1)) :
        0;
    }

    return simde_uint32x2_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vrshl_u32
  #define vrshl_u32(a, b) simde_vrshl_u32((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint64x1_t
simde_vrshl_u64 (const simde_uint64x1_t a, const simde_int64x1_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vrshl_u64(a, b);
  #elif defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    const __m128i zero = _mm_setzero_si128();
    const __m128i ff = _mm_cmpeq_epi64(zero, zero);
    __m128i a128 = _mm_movpi64_epi64(a);
    __m128i b128 = _mm_movpi64_epi64(b);
    b128 = _mm_srai_epi64(_mm_slli_epi64(b128, 56), 56);
    __m128i a128_shr = _mm_srlv_epi64(a128, _mm_xor_si128(b128, ff));
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi64(a128, b128),
                                   _mm_srli_epi64(_mm_sub_epi64(a128_shr, ff), 1),
                                   _mm_cmpgt_epi64(zero, b128));
    return _mm_movepi64_pi64(r128);
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    const __m128i ones = _mm_set1_epi64x(1);
    const __m128i a128 = _mm_movpi64_epi64(a);
    __m128i b128 = _mm_movpi64_epi64(b);
    __m128i b128_abs = _mm_and_si128(_mm_abs_epi8(b128), _mm_set1_epi64x(0xFF));
    __m128i a128_shr = _mm_srlv_epi64(a128, _mm_sub_epi64(b128_abs, ones));
    __m128i r128 = _mm_blendv_epi8(_mm_sllv_epi64(a128, b128_abs),
                                   _mm_srli_epi64(_mm_add_epi64(a128_shr, ones), 1),
                                   _mm_cmpgt_epi64(_mm_setzero_si128(), _mm_slli_epi64(b128, 56)));
    return _mm_movepi64_pi64(r128);
  #else
    simde_uint64x1_private
      r_,
      a_ = simde_uint64x1_to_private(a);
    simde_int64x1_private b_ = simde_int64x1_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] =
        (b_.values[i] >= 64) ? 0 :
        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
        (b_.values[i] >= -64) ? (((b_.values[i] == -64) ? 0 : (a_.values[i] >> -b_.values[i])) + ((a_.values[i] >> (-b_.values[i] - 1)) & 1)) :
        0;
    }

    return simde_uint64x1_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vrshl_u64
  #define vrshl_u64(a, b) simde_vrshl_u64((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_int8x16_t
simde_vrshlq_s8 (const simde_int8x16_t a, const simde_int8x16_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vrshlq_s8(a, b);
  #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    const __m256i zero = _mm256_setzero_si256();
    const __m256i ff = _mm256_cmpeq_epi16(zero, zero);
    __m256i a256 = _mm256_cvtepi8_epi16(a);
    __m256i b256 = _mm256_cvtepi8_epi16(b);
    __m256i a256_shr = _mm256_srav_epi16(a256, _mm256_xor_si256(b256, ff));
    __m256i r256 = _mm256_blendv_epi8(_mm256_sllv_epi16(a256, b256),
                                      _mm256_srai_epi16(_mm256_sub_epi16(a256_shr, ff), 1),
                                      _mm256_cmpgt_epi16(zero, b256));
    return _mm256_cvtepi16_epi8(r256);
  #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
    const SIMDE_POWER_ALTIVEC_VECTOR( signed char) zero = vec_splats(HEDLEY_STATIC_CAST( signed char, 0));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) ones = vec_splats(HEDLEY_STATIC_CAST(unsigned char, 1));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) max = vec_splats(HEDLEY_STATIC_CAST(unsigned char, 8));
    SIMDE_POWER_ALTIVEC_VECTOR(signed char) a_shr;
    SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) b_abs;

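    /* AltiVec shift counts are taken modulo the element width, so compute
     * |b| and mask the final result with (|b| < 8) to zero the lanes whose
     * count is out of range.  Rounding shifts right by (|b| - 1) first and
     * then adds the low bit back to the half-shifted value: (x >> 1) + (x & 1)
     * equals (x + 1) >> 1 but cannot overflow. */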
    b_abs = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), vec_abs(b));
    a_shr = vec_sra(a, vec_sub(b_abs, ones));
    return vec_and(vec_sel(vec_sl(a, b_abs),
                           vec_add(vec_sra(a_shr, ones), vec_and(a_shr, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), ones))),
                           vec_cmplt(b, zero)),
                   vec_cmplt(b_abs, max));
  #else
    simde_int8x16_private
      r_,
      a_ = simde_int8x16_to_private(a),
      b_ = simde_int8x16_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      r_.values[i] = HEDLEY_STATIC_CAST(int8_t,
        (abs(b_.values[i]) >= 8) ? 0 :
        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
        ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i]));
    }

    return simde_int8x16_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vrshlq_s8
  #define vrshlq_s8(a, b) simde_vrshlq_s8((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_int16x8_t
simde_vrshlq_s16 (const simde_int16x8_t a, const simde_int16x8_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vrshlq_s16(a, b);
  #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    const __m128i zero = _mm_setzero_si128();
    const __m128i ff = _mm_cmpeq_epi16(zero, zero);
    __m128i b_ = _mm_srai_epi16(_mm_slli_epi16(b, 8), 8);
    __m128i a_shr = _mm_srav_epi16(a, _mm_xor_si128(b_, ff));
    return _mm_blendv_epi8(_mm_sllv_epi16(a, b_),
                           _mm_srai_epi16(_mm_sub_epi16(a_shr, ff), 1),
                           _mm_cmpgt_epi16(zero, b_));
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_ARCH_AMD64)
    const __m256i zero = _mm256_setzero_si256();
    const __m256i ff = _mm256_cmpeq_epi32(zero, zero);
    __m256i a256 = _mm256_cvtepi16_epi32(a);
    __m256i b256 = _mm256_cvtepi16_epi32(b);
    b256 = _mm256_srai_epi32(_mm256_slli_epi32(b256, 24), 24);
    __m256i a256_shr = _mm256_srav_epi32(a256, _mm256_xor_si256(b256, ff));
    __m256i r256 = _mm256_blendv_epi8(_mm256_sllv_epi32(a256, b256),
                                      _mm256_srai_epi32(_mm256_sub_epi32(a256_shr, ff), 1),
                                      _mm256_cmpgt_epi32(zero, b256));
    r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi64x(0x0D0C090805040100));
    return _mm_set_epi64x(_mm256_extract_epi64(r256, 2), _mm256_extract_epi64(r256, 0));
  #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
    const SIMDE_POWER_ALTIVEC_VECTOR( signed short) zero = vec_splats(HEDLEY_STATIC_CAST( signed short, 0));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) ones = vec_splats(HEDLEY_STATIC_CAST(unsigned short, 1));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) shift = vec_splats(HEDLEY_STATIC_CAST(unsigned short, 16 - 8));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) max = vec_splats(HEDLEY_STATIC_CAST(unsigned short, 16));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) ff = vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0xFF));
    SIMDE_POWER_ALTIVEC_VECTOR(signed short) a_shr;
    SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) b_abs;

    b_abs = vec_and(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned short),
                                            vec_abs(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), b))),
                    ff);
    a_shr = vec_sra(a, vec_sub(b_abs, ones));
    return vec_and(vec_sel(vec_sl(a, b_abs),
                           vec_add(vec_sra(a_shr, ones), vec_and(a_shr, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), ones))),
                           vec_cmplt(vec_sl(b, shift), zero)),
                   vec_cmplt(b_abs, max));
  #else
    simde_int16x8_private
      r_,
      a_ = simde_int16x8_to_private(a),
      b_ = simde_int16x8_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] = HEDLEY_STATIC_CAST(int16_t,
        (abs(b_.values[i]) >= 16) ? 0 :
        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
        ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i]));
    }

    return simde_int16x8_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vrshlq_s16
  #define vrshlq_s16(a, b) simde_vrshlq_s16((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_int32x4_t
simde_vrshlq_s32 (const simde_int32x4_t a, const simde_int32x4_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vrshlq_s32(a, b);
  #elif defined(SIMDE_X86_AVX2_NATIVE)
    const __m128i zero = _mm_setzero_si128();
    const __m128i ff = _mm_cmpeq_epi32(zero, zero);
    __m128i b_ = _mm_srai_epi32(_mm_slli_epi32(b, 24), 24);
    __m128i a_shr = _mm_srav_epi32(a, _mm_xor_si128(b_, ff));
    return _mm_blendv_epi8(_mm_sllv_epi32(a, b_),
                           _mm_srai_epi32(_mm_sub_epi32(a_shr, ff), 1),
                           _mm_cmpgt_epi32(zero, b_));
  #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
    const SIMDE_POWER_ALTIVEC_VECTOR( signed int) zero = vec_splats(HEDLEY_STATIC_CAST( signed int, 0));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) ones = vec_splats(HEDLEY_STATIC_CAST(unsigned int, 1));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) shift = vec_splats(HEDLEY_STATIC_CAST(unsigned int, 32 - 8));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) max = vec_splats(HEDLEY_STATIC_CAST(unsigned int, 32));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) ff = vec_splats(HEDLEY_STATIC_CAST(unsigned int, 0xFF));
    SIMDE_POWER_ALTIVEC_VECTOR(signed int) a_shr;
    SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) b_abs;

    b_abs = vec_and(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int),
                                            vec_abs(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), b))),
                    ff);
    a_shr = vec_sra(a, vec_sub(b_abs, ones));
    return vec_and(vec_sel(vec_sl(a, b_abs),
                           vec_add(vec_sra(a_shr, ones), vec_and(a_shr, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), ones))),
                           vec_cmplt(vec_sl(b, shift), zero)),
                   vec_cmplt(b_abs, max));
  #else
    simde_int32x4_private
      r_,
      a_ = simde_int32x4_to_private(a),
      b_ = simde_int32x4_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] = HEDLEY_STATIC_CAST(int32_t,
        (abs(b_.values[i]) >= 32) ? 0 :
        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
        ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i]));
    }

    return simde_int32x4_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vrshlq_s32
  #define vrshlq_s32(a, b) simde_vrshlq_s32((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_int64x2_t
simde_vrshlq_s64 (const simde_int64x2_t a, const simde_int64x2_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vrshlq_s64(a, b);
  #elif defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    const __m128i zero = _mm_setzero_si128();
    const __m128i ff = _mm_cmpeq_epi32(zero, zero);
    __m128i b_ = _mm_srai_epi64(_mm_slli_epi64(b, 56), 56);
    __m128i a_shr = _mm_srav_epi64(a, _mm_xor_si128(b_, ff));
    return _mm_blendv_epi8(_mm_sllv_epi64(a, b_),
                           _mm_srai_epi64(_mm_sub_epi64(a_shr, ff), 1),
                           _mm_cmpgt_epi64(zero, b_));
  #elif defined(SIMDE_X86_AVX2_NATIVE)
    const __m128i zero = _mm_setzero_si128();
    const __m128i ones = _mm_set1_epi64x(1);
    __m128i maska = _mm_cmpgt_epi64(zero, a);
    __m128i b_abs = _mm_and_si128(_mm_abs_epi8(b), _mm_set1_epi64x(0xFF));
    __m128i a_rnd = _mm_and_si128(_mm_srlv_epi64(a, _mm_sub_epi64(b_abs, ones)), ones);
    return _mm_blendv_epi8(_mm_sllv_epi64(a, b_abs),
                           _mm_add_epi64(_mm_xor_si128(_mm_srlv_epi64(_mm_xor_si128(a, maska), b_abs), maska), a_rnd),
                           _mm_cmpgt_epi64(zero, _mm_slli_epi64(b, 56)));
  #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    const SIMDE_POWER_ALTIVEC_VECTOR( signed long long) zero = vec_splats(HEDLEY_STATIC_CAST( signed long long, 0));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) ones = vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 1));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) shift = vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 64 - 8));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) max = vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 64));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) ff = vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 0xFF));
    SIMDE_POWER_ALTIVEC_VECTOR(signed long long) a_shr;
    SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) b_abs;

    b_abs = vec_and(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long),
                                            vec_abs(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), b))),
                    ff);
    a_shr = vec_sra(a, vec_sub(b_abs, ones));

    HEDLEY_DIAGNOSTIC_PUSH
    #if defined(SIMDE_BUG_CLANG_46770)
      SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_
    #endif
    return vec_and(vec_sel(vec_sl(a, b_abs),
                           vec_add(vec_sra(a_shr, ones), vec_and(a_shr, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed long long), ones))),
                           vec_cmplt(vec_sl(b, shift), zero)),
                   vec_cmplt(b_abs, max));
    HEDLEY_DIAGNOSTIC_POP
  #else
    simde_int64x2_private
      r_,
      a_ = simde_int64x2_to_private(a),
      b_ = simde_int64x2_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] = HEDLEY_STATIC_CAST(int64_t,
        (llabs(b_.values[i]) >= 64) ? 0 :
        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
        ((a_.values[i] + (INT64_C(1) << (-b_.values[i] - 1))) >> -b_.values[i]));
    }

    return simde_int64x2_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vrshlq_s64
  #define vrshlq_s64(a, b) simde_vrshlq_s64((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint8x16_t
simde_vrshlq_u8 (const simde_uint8x16_t a, const simde_int8x16_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vrshlq_u8(a, b);
  #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    const __m256i zero = _mm256_setzero_si256();
    const __m256i ff = _mm256_cmpeq_epi32(zero, zero);
    __m256i a256 = _mm256_cvtepu8_epi16(a);
    __m256i b256 = _mm256_cvtepi8_epi16(b);
    __m256i a256_shr = _mm256_srlv_epi16(a256, _mm256_xor_si256(b256, ff));
    __m256i r256 = _mm256_blendv_epi8(_mm256_sllv_epi16(a256, b256),
                                      _mm256_srli_epi16(_mm256_sub_epi16(a256_shr, ff), 1),
                                      _mm256_cmpgt_epi16(zero, b256));
    return _mm256_cvtepi16_epi8(r256);
  #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
    const SIMDE_POWER_ALTIVEC_VECTOR( signed char) zero = vec_splats(HEDLEY_STATIC_CAST( signed char, 0));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) ones = vec_splats(HEDLEY_STATIC_CAST(unsigned char, 1));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) max = vec_splats(HEDLEY_STATIC_CAST(unsigned char, 8));
    SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) b_abs, b_abs_dec, a_shr;

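    /* Unsigned variant of the AltiVec trick above: pre-shift by (|b| - 1),
     * but mask that with (|b| - 1 < 8) so a count of exactly -8 still
     * contributes its rounding bit while larger counts yield zero; the
     * left-shift result is likewise masked with (|b| < 8). */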
    b_abs = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), vec_abs(b));
    b_abs_dec = vec_sub(b_abs, ones);
    a_shr = vec_and(vec_sr(a, b_abs_dec), vec_cmplt(b_abs_dec, max));
    return vec_sel(vec_and(vec_sl(a, b_abs), vec_cmplt(b_abs, max)),
                   vec_sr(vec_add(a_shr, ones), ones),
                   vec_cmplt(b, zero));
  #else
    simde_uint8x16_private
      r_,
      a_ = simde_uint8x16_to_private(a);
    simde_int8x16_private b_ = simde_int8x16_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      r_.values[i] = HEDLEY_STATIC_CAST(uint8_t,
        (b_.values[i] >= 8) ? 0 :
        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
        (b_.values[i] >= -8) ? (((b_.values[i] == -8) ? 0 : (a_.values[i] >> -b_.values[i])) + ((a_.values[i] >> (-b_.values[i] - 1)) & 1)) :
        0);
    }

    return simde_uint8x16_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vrshlq_u8
  #define vrshlq_u8(a, b) simde_vrshlq_u8((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint16x8_t
simde_vrshlq_u16 (const simde_uint16x8_t a, const simde_int16x8_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vrshlq_u16(a, b);
  #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    const __m128i zero = _mm_setzero_si128();
    const __m128i ff = _mm_cmpeq_epi16(zero, zero);
    __m128i b_ = _mm_srai_epi16(_mm_slli_epi16(b, 8), 8);
    __m128i a_shr = _mm_srlv_epi16(a, _mm_xor_si128(b_, ff));
    return _mm_blendv_epi8(_mm_sllv_epi16(a, b_),
                           _mm_srli_epi16(_mm_sub_epi16(a_shr, ff), 1),
                           _mm_cmpgt_epi16(zero, b_));
  #elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_ARCH_AMD64)
    const __m256i zero = _mm256_setzero_si256();
    const __m256i ff = _mm256_cmpeq_epi32(zero, zero);
    __m256i a256 = _mm256_cvtepu16_epi32(a);
    __m256i b256 = _mm256_cvtepi16_epi32(b);
    b256 = _mm256_srai_epi32(_mm256_slli_epi32(b256, 24), 24);
    __m256i a256_shr = _mm256_srlv_epi32(a256, _mm256_xor_si256(b256, ff));
    __m256i r256 = _mm256_blendv_epi8(_mm256_sllv_epi32(a256, b256),
                                      _mm256_srli_epi32(_mm256_sub_epi32(a256_shr, ff), 1),
                                      _mm256_cmpgt_epi32(zero, b256));
    r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi64x(0x0D0C090805040100));
    return _mm_set_epi64x(_mm256_extract_epi64(r256, 2), _mm256_extract_epi64(r256, 0));
  #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
    const SIMDE_POWER_ALTIVEC_VECTOR( signed short) zero = vec_splats(HEDLEY_STATIC_CAST( signed short, 0));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) ones = vec_splats(HEDLEY_STATIC_CAST(unsigned short, 1));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) shift = vec_splats(HEDLEY_STATIC_CAST(unsigned short, 16 - 8));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) max = vec_splats(HEDLEY_STATIC_CAST(unsigned short, 16));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) ff = vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0xFF));
    SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) b_abs, b_abs_dec, a_shr;

    b_abs = vec_and(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned short),
                                            vec_abs(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), b))),
                    ff);
    b_abs_dec = vec_sub(b_abs, ones);
    a_shr = vec_and(vec_sr(a, b_abs_dec), vec_cmplt(b_abs_dec, max));
    return vec_sel(vec_and(vec_sl(a, b_abs), vec_cmplt(b_abs, max)),
                   vec_sr(vec_add(a_shr, ones), ones),
                   vec_cmplt(vec_sl(b, shift), zero));
  #else
    simde_uint16x8_private
      r_,
      a_ = simde_uint16x8_to_private(a);
    simde_int16x8_private b_ = simde_int16x8_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] = HEDLEY_STATIC_CAST(uint16_t,
        (b_.values[i] >= 16) ? 0 :
        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
        (b_.values[i] >= -16) ? (((b_.values[i] == -16) ? 0 : (a_.values[i] >> -b_.values[i])) + ((a_.values[i] >> (-b_.values[i] - 1)) & 1)) :
        0);
    }

    return simde_uint16x8_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vrshlq_u16
  #define vrshlq_u16(a, b) simde_vrshlq_u16((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x4_t
simde_vrshlq_u32 (const simde_uint32x4_t a, const simde_int32x4_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vrshlq_u32(a, b);
  #elif defined(SIMDE_X86_AVX2_NATIVE)
    const __m128i zero = _mm_setzero_si128();
    const __m128i ff = _mm_cmpeq_epi32(zero, zero);
    __m128i b_ = _mm_srai_epi32(_mm_slli_epi32(b, 24), 24);
    __m128i a_shr = _mm_srlv_epi32(a, _mm_xor_si128(b_, ff));
    return _mm_blendv_epi8(_mm_sllv_epi32(a, b_),
                           _mm_srli_epi32(_mm_sub_epi32(a_shr, ff), 1),
                           _mm_cmpgt_epi32(zero, b_));
  #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
    const SIMDE_POWER_ALTIVEC_VECTOR( signed int) zero = vec_splats(HEDLEY_STATIC_CAST( signed int, 0));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) ones = vec_splats(HEDLEY_STATIC_CAST(unsigned int, 1));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) shift = vec_splats(HEDLEY_STATIC_CAST(unsigned int, 32 - 8));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) max = vec_splats(HEDLEY_STATIC_CAST(unsigned int, 32));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) ff = vec_splats(HEDLEY_STATIC_CAST(unsigned int, 0xFF));
    SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) b_abs, b_abs_dec, a_shr;

    b_abs = vec_and(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int),
                                            vec_abs(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), b))),
                    ff);
    b_abs_dec = vec_sub(b_abs, ones);
    a_shr = vec_and(vec_sr(a, b_abs_dec), vec_cmplt(b_abs_dec, max));
    return vec_sel(vec_and(vec_sl(a, b_abs), vec_cmplt(b_abs, max)),
                   vec_sr(vec_add(a_shr, ones), ones),
                   vec_cmplt(vec_sl(b, shift), zero));
  #else
    simde_uint32x4_private
      r_,
      a_ = simde_uint32x4_to_private(a);
    simde_int32x4_private b_ = simde_int32x4_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] =
        (b_.values[i] >= 32) ? 0 :
        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
        (b_.values[i] >= -32) ? (((b_.values[i] == -32) ? 0 : (a_.values[i] >> -b_.values[i])) + ((a_.values[i] >> (-b_.values[i] - 1)) & 1)) :
        0;
    }

    return simde_uint32x4_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vrshlq_u32
  #define vrshlq_u32(a, b) simde_vrshlq_u32((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_uint64x2_t
simde_vrshlq_u64 (const simde_uint64x2_t a, const simde_int64x2_t b) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vrshlq_u64(a, b);
  #elif defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    const __m128i zero = _mm_setzero_si128();
    const __m128i ff = _mm_cmpeq_epi64(zero, zero);
    __m128i b_ = _mm_srai_epi64(_mm_slli_epi64(b, 56), 56);
    __m128i a_shr = _mm_srlv_epi64(a, _mm_xor_si128(b_, ff));
    return _mm_blendv_epi8(_mm_sllv_epi64(a, b_),
                           _mm_srli_epi64(_mm_sub_epi64(a_shr, ff), 1),
                           _mm_cmpgt_epi64(zero, b_));
  #elif defined(SIMDE_X86_AVX2_NATIVE)
    const __m128i ones = _mm_set1_epi64x(1);
    __m128i b_abs = _mm_and_si128(_mm_abs_epi8(b), _mm_set1_epi64x(0xFF));
    __m128i a_shr = _mm_srlv_epi64(a, _mm_sub_epi64(b_abs, ones));
    return _mm_blendv_epi8(_mm_sllv_epi64(a, b_abs),
                           _mm_srli_epi64(_mm_add_epi64(a_shr, ones), 1),
                           _mm_cmpgt_epi64(_mm_setzero_si128(), _mm_slli_epi64(b, 56)));
  #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    const SIMDE_POWER_ALTIVEC_VECTOR( signed long long) zero = vec_splats(HEDLEY_STATIC_CAST( signed long long, 0));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) ones = vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 1));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) shift = vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 64 - 8));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) max = vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 64));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) ff = vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 0xFF));
    SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) b_abs, b_abs_dec, a_shr;

    b_abs = vec_and(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long),
                                            vec_abs(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), b))),
                    ff);
    b_abs_dec = vec_sub(b_abs, ones);
    a_shr = vec_and(vec_sr(a, b_abs_dec), vec_cmplt(b_abs_dec, max));
    HEDLEY_DIAGNOSTIC_PUSH
    #if defined(SIMDE_BUG_CLANG_46770)
      SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_
    #endif
    return vec_sel(vec_and(vec_sl(a, b_abs), vec_cmplt(b_abs, max)),
                   vec_sr(vec_add(a_shr, ones), ones),
                   vec_cmplt(vec_sl(b, shift), zero));
    HEDLEY_DIAGNOSTIC_POP
  #else
    simde_uint64x2_private
      r_,
      a_ = simde_uint64x2_to_private(a);
    simde_int64x2_private b_ = simde_int64x2_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]);
      r_.values[i] =
        (b_.values[i] >= 64) ? 0 :
        (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) :
        (b_.values[i] >= -64) ? (((b_.values[i] == -64) ? 0 : (a_.values[i] >> -b_.values[i])) + ((a_.values[i] >> (-b_.values[i] - 1)) & 1)) :
        0;
    }

    return simde_uint64x2_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vrshlq_u64
  #define vrshlq_u64(a, b) simde_vrshlq_u64((a), (b))
#endif

SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP

#endif /* !defined(SIMDE_ARM_NEON_RSHL_H) */