1 /* SPDX-License-Identifier: MIT
2  *
3  * Permission is hereby granted, free of charge, to any person
4  * obtaining a copy of this software and associated documentation
5  * files (the "Software"), to deal in the Software without
6  * restriction, including without limitation the rights to use, copy,
7  * modify, merge, publish, distribute, sublicense, and/or sell copies
8  * of the Software, and to permit persons to whom the Software is
9  * furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be
12  * included in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Copyright:
24  *   2017-2020 Evan Nemerson <evan@nemerson.com>
25  */
26 
27 #include "sse.h"
28 #if !defined(SIMDE_X86_SSE4_1_H)
29 #define SIMDE_X86_SSE4_1_H
30 
31 #include "ssse3.h"
32 
33 HEDLEY_DIAGNOSTIC_PUSH
34 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
35 SIMDE_BEGIN_DECLS_
36 
37 #if !defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES)
38 #  define SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES
39 #endif
40 
41 SIMDE_FUNCTION_ATTRIBUTES
42 simde__m128i
simde_mm_blend_epi16(simde__m128i a,simde__m128i b,const int imm8)43 simde_mm_blend_epi16 (simde__m128i a, simde__m128i b, const int imm8)
44     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
45   simde__m128i_private
46     r_,
47     a_ = simde__m128i_to_private(a),
48     b_ = simde__m128i_to_private(b);
49 
50   SIMDE_VECTORIZE
51   for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
52     r_.u16[i] = ((imm8 >> i) & 1) ? b_.u16[i] : a_.u16[i];
53   }
54 
55   return simde__m128i_from_private(r_);
56 }
57 #if defined(SIMDE_X86_SSE4_1_NATIVE)
58 #  define simde_mm_blend_epi16(a, b, imm8) _mm_blend_epi16(a, b, imm8)
59 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
60 #  define simde_mm_blend_epi16(a, b, imm8) \
61      (__extension__ ({ \
62            const uint16_t _mask[8] = {               \
63                ((imm8) & (1 << 0)) ? 0xFFFF : 0x0000, \
64                ((imm8) & (1 << 1)) ? 0xFFFF : 0x0000, \
65                ((imm8) & (1 << 2)) ? 0xFFFF : 0x0000, \
66                ((imm8) & (1 << 3)) ? 0xFFFF : 0x0000, \
67                ((imm8) & (1 << 4)) ? 0xFFFF : 0x0000, \
68                ((imm8) & (1 << 5)) ? 0xFFFF : 0x0000, \
69                ((imm8) & (1 << 6)) ? 0xFFFF : 0x0000, \
70                ((imm8) & (1 << 7)) ? 0xFFFF : 0x0000  \
71            };                                        \
72            uint16x8_t _mask_vec = vld1q_u16(_mask);  \
73            simde__m128i_from_neon_u16(vbslq_u16(_mask_vec, simde__m128i_to_neon_u16(b), simde__m128i_to_neon_u16(a))); \
74        }))
75 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
76 #  define simde_mm_blend_epi16(a, b, imm8)      \
77      (__extension__ ({ \
78            const SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) _mask = {      \
79                ((imm8) & (1 << 0)) ? 0xFFFF : 0x0000, \
80                ((imm8) & (1 << 1)) ? 0xFFFF : 0x0000, \
81                ((imm8) & (1 << 2)) ? 0xFFFF : 0x0000, \
82                ((imm8) & (1 << 3)) ? 0xFFFF : 0x0000, \
83                ((imm8) & (1 << 4)) ? 0xFFFF : 0x0000, \
84                ((imm8) & (1 << 5)) ? 0xFFFF : 0x0000, \
85                ((imm8) & (1 << 6)) ? 0xFFFF : 0x0000, \
86                ((imm8) & (1 << 7)) ? 0xFFFF : 0x0000  \
87            };                                         \
88            simde__m128i_from_altivec_u16(vec_sel(simde__m128i_to_altivec_u16(a), simde__m128i_to_altivec_u16(b), _mask)); \
89        }))
90 #endif
91 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
92   #undef _mm_blend_epi16
93   #define _mm_blend_epi16(a, b, imm8) simde_mm_blend_epi16(a, b, imm8)
94 #endif
95 
96 SIMDE_FUNCTION_ATTRIBUTES
97 simde__m128d
simde_mm_blend_pd(simde__m128d a,simde__m128d b,const int imm8)98 simde_mm_blend_pd (simde__m128d a, simde__m128d b, const int imm8)
99     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3)  {
100   simde__m128d_private
101     r_,
102     a_ = simde__m128d_to_private(a),
103     b_ = simde__m128d_to_private(b);
104 
105   SIMDE_VECTORIZE
106   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
107     r_.f64[i] = ((imm8 >> i) & 1) ? b_.f64[i] : a_.f64[i];
108   }
109   return simde__m128d_from_private(r_);
110 }
111 #if defined(SIMDE_X86_SSE4_1_NATIVE)
112 #  define simde_mm_blend_pd(a, b, imm8) _mm_blend_pd(a, b, imm8)
113 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
114 #  define simde_mm_blend_pd(a, b, imm8) \
115      (__extension__ ({ \
116            const uint64_t _mask[2] = {               \
117                ((imm8) & (1 << 0)) ? UINT64_MAX : 0, \
118                ((imm8) & (1 << 1)) ? UINT64_MAX : 0  \
119            };                                        \
120            uint64x2_t _mask_vec = vld1q_u64(_mask);  \
121            simde__m128d_from_neon_u64(vbslq_u64(_mask_vec, simde__m128d_to_neon_u64(b), simde__m128d_to_neon_u64(a))); \
122        }))
123 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
124 #  define simde_mm_blend_pd(a, b, imm8)         \
125      (__extension__ ({ \
126            const SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) _mask = { \
127                ((imm8) & (1 << 0)) ? UINT64_MAX : 0, \
128                ((imm8) & (1 << 1)) ? UINT64_MAX : 0  \
129            };                                        \
130            simde__m128d_from_altivec_f64(vec_sel(simde__m128d_to_altivec_f64(a), simde__m128d_to_altivec_f64(b), _mask)); \
131        }))
132 #endif
133 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
134   #undef _mm_blend_pd
135   #define _mm_blend_pd(a, b, imm8) simde_mm_blend_pd(a, b, imm8)
136 #endif
137 
138 SIMDE_FUNCTION_ATTRIBUTES
139 simde__m128
simde_mm_blend_ps(simde__m128 a,simde__m128 b,const int imm8)140 simde_mm_blend_ps (simde__m128 a, simde__m128 b, const int imm8)
141     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15)  {
142   simde__m128_private
143     r_,
144     a_ = simde__m128_to_private(a),
145     b_ = simde__m128_to_private(b);
146 
147   SIMDE_VECTORIZE
148   for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
149     r_.f32[i] = ((imm8 >> i) & 1) ? b_.f32[i] : a_.f32[i];
150   }
151   return simde__m128_from_private(r_);
152 }
153 #if defined(SIMDE_X86_SSE4_1_NATIVE)
154 #  define simde_mm_blend_ps(a, b, imm8) _mm_blend_ps(a, b, imm8)
155 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
156 #  define simde_mm_blend_ps(a, b, imm8) \
157      (__extension__ ({ \
158            const uint32_t _mask[4] = {               \
159                ((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
160                ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
161                ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
162                ((imm8) & (1 << 3)) ? UINT32_MAX : 0  \
163            };                                        \
164            uint32x4_t _mask_vec = vld1q_u32(_mask);  \
165            simde__m128_from_neon_f32(vbslq_f32(_mask_vec, simde__m128_to_neon_f32(b), simde__m128_to_neon_f32(a))); \
166        }))
167 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
168 #  define simde_mm_blend_ps(a, b, imm8) \
169      (__extension__ ({ \
170            const SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) _mask = {       \
171                ((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
172                ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
173                ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
174                ((imm8) & (1 << 3)) ? UINT32_MAX : 0  \
175            };                                        \
176            simde__m128_from_altivec_f32(vec_sel(simde__m128_to_altivec_f32(a), simde__m128_to_altivec_f32(b), _mask)); \
177        }))
178 #endif
179 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
180   #undef _mm_blend_ps
181   #define _mm_blend_ps(a, b, imm8) simde_mm_blend_ps(a, b, imm8)
182 #endif
183 
184 SIMDE_FUNCTION_ATTRIBUTES
185 simde__m128i
simde_mm_blendv_epi8(simde__m128i a,simde__m128i b,simde__m128i mask)186 simde_mm_blendv_epi8 (simde__m128i a, simde__m128i b, simde__m128i mask) {
187   #if defined(SIMDE_X86_SSE4_1_NATIVE)
188     return _mm_blendv_epi8(a, b, mask);
189   #else
190     simde__m128i_private
191       r_,
192       a_ = simde__m128i_to_private(a),
193       b_ = simde__m128i_to_private(b),
194       mask_ = simde__m128i_to_private(mask);
195 
196     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
197       /* Use a signed shift right to create a mask with the sign bit */
198       mask_.neon_i8 = vshrq_n_s8(mask_.neon_i8, 7);
199       r_.neon_i8 = vbslq_s8(mask_.neon_u8, b_.neon_i8, a_.neon_i8);
200     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
201       v128_t m = wasm_i8x16_shr(mask_.wasm_v128, 7);
202       r_.wasm_v128 = wasm_v128_or(wasm_v128_and(b_.wasm_v128, m), wasm_v128_andnot(a_.wasm_v128, m));
203     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
204       r_.altivec_i8 = vec_sel(a_.altivec_i8, b_.altivec_i8, vec_cmplt(mask_.altivec_i8, vec_splat_s8(0)));
205     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
206       /* https://software.intel.com/en-us/forums/intel-c-compiler/topic/850087 */
207       #if defined(HEDLEY_INTEL_VERSION_CHECK)
208         __typeof__(mask_.i8) z = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
209         mask_.i8 = HEDLEY_STATIC_CAST(__typeof__(mask_.i8), mask_.i8 < z);
210       #else
211         mask_.i8 >>= (CHAR_BIT * sizeof(mask_.i8[0])) - 1;
212       #endif
213 
214       r_.i8 = (mask_.i8 & b_.i8) | (~mask_.i8 & a_.i8);
215     #else
216       SIMDE_VECTORIZE
217       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
218         int8_t m = mask_.i8[i] >> 7;
219         r_.i8[i] = (m & b_.i8[i]) | (~m & a_.i8[i]);
220       }
221     #endif
222 
223     return simde__m128i_from_private(r_);
224   #endif
225 }
226 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
227   #undef _mm_blendv_epi8
228   #define _mm_blendv_epi8(a, b, mask) simde_mm_blendv_epi8(a, b, mask)
229 #endif
230 
231 SIMDE_FUNCTION_ATTRIBUTES
232 simde__m128i
simde_x_mm_blendv_epi16(simde__m128i a,simde__m128i b,simde__m128i mask)233 simde_x_mm_blendv_epi16 (simde__m128i a, simde__m128i b, simde__m128i mask) {
234   #if defined(SIMDE_X86_SSE2_NATIVE)
235     mask = simde_mm_srai_epi16(mask, 15);
236     return simde_mm_or_si128(simde_mm_and_si128(mask, b), simde_mm_andnot_si128(mask, a));
237   #else
238     simde__m128i_private
239       r_,
240       a_ = simde__m128i_to_private(a),
241       b_ = simde__m128i_to_private(b),
242       mask_ = simde__m128i_to_private(mask);
243 
244     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
245       mask_ = simde__m128i_to_private(simde_mm_cmplt_epi16(mask, simde_mm_setzero_si128()));
246       r_.neon_i16 = vbslq_s16(mask_.neon_u16, b_.neon_i16, a_.neon_i16);
247     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
248       r_.altivec_i16 = vec_sel(a_.altivec_i16, b_.altivec_i16, vec_cmplt(mask_.altivec_i16, vec_splat_s16(0)));
249     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
250       #if defined(HEDLEY_INTEL_VERSION_CHECK)
251         __typeof__(mask_.i16) z = { 0, 0, 0, 0, 0, 0, 0, 0 };
252         mask_.i16 = mask_.i16 < z;
253       #else
254         mask_.i16 >>= (CHAR_BIT * sizeof(mask_.i16[0])) - 1;
255       #endif
256 
257       r_.i16 = (mask_.i16 & b_.i16) | (~mask_.i16 & a_.i16);
258     #else
259       SIMDE_VECTORIZE
260       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
261         int16_t m = mask_.i16[i] >> 15;
262         r_.i16[i] = (m & b_.i16[i]) | (~m & a_.i16[i]);
263       }
264     #endif
265 
266     return simde__m128i_from_private(r_);
267   #endif
268 }
269 
270 SIMDE_FUNCTION_ATTRIBUTES
271 simde__m128i
simde_x_mm_blendv_epi32(simde__m128i a,simde__m128i b,simde__m128i mask)272 simde_x_mm_blendv_epi32 (simde__m128i a, simde__m128i b, simde__m128i mask) {
273   #if defined(SIMDE_X86_SSE4_1_NATIVE)
274     return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(mask)));
275   #else
276     simde__m128i_private
277       r_,
278       a_ = simde__m128i_to_private(a),
279       b_ = simde__m128i_to_private(b),
280       mask_ = simde__m128i_to_private(mask);
281 
282     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
283       mask_ = simde__m128i_to_private(simde_mm_cmplt_epi32(mask, simde_mm_setzero_si128()));
284       r_.neon_i32 = vbslq_s32(mask_.neon_u32, b_.neon_i32, a_.neon_i32);
285     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
286       v128_t m = wasm_i32x4_shr(mask_.wasm_v128, 31);
287       r_.wasm_v128 = wasm_v128_or(wasm_v128_and(b_.wasm_v128, m), wasm_v128_andnot(a_.wasm_v128, m));
288     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
289       r_.altivec_i32 = vec_sel(a_.altivec_i32, b_.altivec_i32, vec_cmplt(mask_.altivec_i32, vec_splat_s32(0)));
290     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
291       #if defined(HEDLEY_INTEL_VERSION_CHECK)
292         __typeof__(mask_.i32) z = { 0, 0, 0, 0 };
293         mask_.i32 = HEDLEY_STATIC_CAST(__typeof__(mask_.i32), mask_.i32 < z);
294       #else
295         mask_.i32 >>= (CHAR_BIT * sizeof(mask_.i32[0])) - 1;
296       #endif
297 
298       r_.i32 = (mask_.i32 & b_.i32) | (~mask_.i32 & a_.i32);
299     #else
300       SIMDE_VECTORIZE
301       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
302         int32_t m = mask_.i32[i] >> 31;
303         r_.i32[i] = (m & b_.i32[i]) | (~m & a_.i32[i]);
304       }
305     #endif
306 
307     return simde__m128i_from_private(r_);
308   #endif
309 }
310 
311 SIMDE_FUNCTION_ATTRIBUTES
312 simde__m128i
simde_x_mm_blendv_epi64(simde__m128i a,simde__m128i b,simde__m128i mask)313 simde_x_mm_blendv_epi64 (simde__m128i a, simde__m128i b, simde__m128i mask) {
314   #if defined(SIMDE_X86_SSE4_1_NATIVE)
315     return _mm_castpd_si128(_mm_blendv_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b), _mm_castsi128_pd(mask)));
316   #else
317     simde__m128i_private
318       r_,
319       a_ = simde__m128i_to_private(a),
320       b_ = simde__m128i_to_private(b),
321       mask_ = simde__m128i_to_private(mask);
322 
323     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
324       mask_.neon_u64 = vcltq_s64(mask_.neon_i64, vdupq_n_s64(UINT64_C(0)));
325       r_.neon_i64 = vbslq_s64(mask_.neon_u64, b_.neon_i64, a_.neon_i64);
326     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
327       v128_t m = wasm_i64x2_shr(mask_.wasm_v128, 63);
328       r_.wasm_v128 = wasm_v128_or(wasm_v128_and(b_.wasm_v128, m), wasm_v128_andnot(a_.wasm_v128, m));
329     #elif (defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(SIMDE_BUG_CLANG_46770)) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
330       r_.altivec_i64 = vec_sel(a_.altivec_i64, b_.altivec_i64, vec_cmplt(mask_.altivec_i64, vec_splats(HEDLEY_STATIC_CAST(signed long long, 0))));
331     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
332       SIMDE_POWER_ALTIVEC_VECTOR(signed long long) selector = vec_sra(mask_.altivec_i64, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 63)));
333       r_.altivec_i32 = vec_sel(a_.altivec_i32, b_.altivec_i32, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), selector));
334     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
335       #if defined(HEDLEY_INTEL_VERSION_CHECK)
336         __typeof__(mask_.i64) z = { 0, 0 };
337         mask_.i64 = HEDLEY_STATIC_CAST(__typeof__(mask_.i64), mask_.i64 < z);
338       #else
339         mask_.i64 >>= (CHAR_BIT * sizeof(mask_.i64[0])) - 1;
340       #endif
341 
342     r_.i64 = (mask_.i64 & b_.i64) | (~mask_.i64 & a_.i64);
343   #else
344     SIMDE_VECTORIZE
345     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
346       int64_t m = mask_.i64[i] >> 63;
347       r_.i64[i] = (m & b_.i64[i]) | (~m & a_.i64[i]);
348     }
349   #endif
350 
351     return simde__m128i_from_private(r_);
352   #endif
353 }
354 
355 SIMDE_FUNCTION_ATTRIBUTES
356 simde__m128d
simde_mm_blendv_pd(simde__m128d a,simde__m128d b,simde__m128d mask)357 simde_mm_blendv_pd (simde__m128d a, simde__m128d b, simde__m128d mask) {
358   #if defined(SIMDE_X86_SSE4_1_NATIVE)
359     return _mm_blendv_pd(a, b, mask);
360   #else
361     return simde_mm_castsi128_pd(simde_x_mm_blendv_epi64(simde_mm_castpd_si128(a), simde_mm_castpd_si128(b), simde_mm_castpd_si128(mask)));
362   #endif
363 }
364 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
365   #undef _mm_blendv_pd
366   #define _mm_blendv_pd(a, b, mask) simde_mm_blendv_pd(a, b, mask)
367 #endif
368 
369 SIMDE_FUNCTION_ATTRIBUTES
370 simde__m128
simde_mm_blendv_ps(simde__m128 a,simde__m128 b,simde__m128 mask)371 simde_mm_blendv_ps (simde__m128 a, simde__m128 b, simde__m128 mask) {
372   #if defined(SIMDE_X86_SSE4_1_NATIVE)
373     return _mm_blendv_ps(a, b, mask);
374   #else
375     return simde_mm_castsi128_ps(simde_x_mm_blendv_epi32(simde_mm_castps_si128(a), simde_mm_castps_si128(b), simde_mm_castps_si128(mask)));
376   #endif
377 }
378 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
379   #undef _mm_blendv_ps
380   #define _mm_blendv_ps(a, b, mask) simde_mm_blendv_ps(a, b, mask)
381 #endif
382 
383 SIMDE_FUNCTION_ATTRIBUTES
384 simde__m128d
simde_mm_round_pd(simde__m128d a,int rounding)385 simde_mm_round_pd (simde__m128d a, int rounding)
386     SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) {
387   simde__m128d_private
388     r_,
389     a_ = simde__m128d_to_private(a);
390 
391   /* For architectures which lack a current direction SIMD instruction. */
392   #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
393     if ((rounding & 7) == SIMDE_MM_FROUND_CUR_DIRECTION)
394       rounding = HEDLEY_STATIC_CAST(int, SIMDE_MM_GET_ROUNDING_MODE()) << 13;
395   #endif
396 
397   switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
398     case SIMDE_MM_FROUND_CUR_DIRECTION:
399       #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
400         r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.altivec_f64));
401       #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
402         r_.neon_f64 = vrndiq_f64(a_.neon_f64);
403       #elif defined(simde_math_nearbyint)
404         SIMDE_VECTORIZE
405         for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
406           r_.f64[i] = simde_math_nearbyint(a_.f64[i]);
407         }
408       #else
409         HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
410       #endif
411       break;
412 
413     case SIMDE_MM_FROUND_TO_NEAREST_INT:
414       #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
415         r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.altivec_f64));
416       #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
417         r_.neon_f64 = vrndaq_f64(a_.neon_f64);
418       #elif defined(simde_math_roundeven)
419         SIMDE_VECTORIZE
420         for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
421           r_.f64[i] = simde_math_roundeven(a_.f64[i]);
422         }
423       #else
424         HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
425       #endif
426       break;
427 
428     case SIMDE_MM_FROUND_TO_NEG_INF:
429       #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
430         r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_floor(a_.altivec_f64));
431       #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
432         r_.neon_f64 = vrndmq_f64(a_.neon_f64);
433       #else
434         SIMDE_VECTORIZE
435         for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
436           r_.f64[i] = simde_math_floor(a_.f64[i]);
437         }
438       #endif
439       break;
440 
441     case SIMDE_MM_FROUND_TO_POS_INF:
442       #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
443         r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_ceil(a_.altivec_f64));
444       #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
445         r_.neon_f64 = vrndpq_f64(a_.neon_f64);
446       #elif defined(simde_math_ceil)
447         SIMDE_VECTORIZE
448         for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
449           r_.f64[i] = simde_math_ceil(a_.f64[i]);
450         }
451       #else
452         HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
453       #endif
454       break;
455 
456     case SIMDE_MM_FROUND_TO_ZERO:
457       #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
458         r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_trunc(a_.altivec_f64));
459       #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
460         r_.neon_f64 = vrndq_f64(a_.neon_f64);
461       #else
462         SIMDE_VECTORIZE
463         for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
464           r_.f64[i] = simde_math_trunc(a_.f64[i]);
465         }
466       #endif
467       break;
468 
469     default:
470       HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
471   }
472 
473   return simde__m128d_from_private(r_);
474 }
475 #if defined(SIMDE_X86_SSE4_1_NATIVE)
476   #define simde_mm_round_pd(a, rounding) _mm_round_pd(a, rounding)
477 #endif
478 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
479   #undef _mm_round_pd
480   #define _mm_round_pd(a, rounding) simde_mm_round_pd(a, rounding)
481 #endif
482 
483 SIMDE_FUNCTION_ATTRIBUTES
484 simde__m128d
simde_mm_ceil_pd(simde__m128d a)485 simde_mm_ceil_pd (simde__m128d a) {
486   return simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_POS_INF);
487 }
488 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
489   #undef _mm_ceil_pd
490   #define _mm_ceil_pd(a) simde_mm_ceil_pd(a)
491 #endif
492 
493 SIMDE_FUNCTION_ATTRIBUTES
494 simde__m128
simde_mm_ceil_ps(simde__m128 a)495 simde_mm_ceil_ps (simde__m128 a) {
496   return simde_mm_round_ps(a, SIMDE_MM_FROUND_TO_POS_INF);
497 }
498 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
499   #undef _mm_ceil_ps
500   #define _mm_ceil_ps(a) simde_mm_ceil_ps(a)
501 #endif
502 
503 SIMDE_FUNCTION_ATTRIBUTES
504 simde__m128d
simde_mm_ceil_sd(simde__m128d a,simde__m128d b)505 simde_mm_ceil_sd (simde__m128d a, simde__m128d b) {
506   #if defined(SIMDE_X86_SSE4_1_NATIVE)
507     return _mm_ceil_sd(a, b);
508   #else
509     simde__m128d_private
510       r_,
511       a_ = simde__m128d_to_private(a),
512       b_ = simde__m128d_to_private(b);
513 
514     #if defined(simde_math_ceilf)
515       r_ = simde__m128d_to_private(simde_mm_set_pd(a_.f64[1], simde_math_ceil(b_.f64[0])));
516     #else
517       HEDLEY_UNREACHABLE();
518     #endif
519 
520     return simde__m128d_from_private(r_);
521   #endif
522 }
523 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
524   #undef _mm_ceil_sd
525   #define _mm_ceil_sd(a, b) simde_mm_ceil_sd(a, b)
526 #endif
527 
528 SIMDE_FUNCTION_ATTRIBUTES
529 simde__m128
simde_mm_ceil_ss(simde__m128 a,simde__m128 b)530 simde_mm_ceil_ss (simde__m128 a, simde__m128 b) {
531   #if defined(SIMDE_X86_SSE4_1_NATIVE)
532     return _mm_ceil_ss(a, b);
533   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
534     return simde_mm_move_ss(a, simde_mm_ceil_ps(b));
535   #else
536     simde__m128_private
537       r_,
538       a_ = simde__m128_to_private(a),
539       b_ = simde__m128_to_private(b);
540 
541     #if defined(simde_math_ceilf)
542       r_ = simde__m128_to_private(simde_mm_set_ps(a_.f32[3], a_.f32[2], a_.f32[1], simde_math_ceilf(b_.f32[0])));
543     #else
544       HEDLEY_UNREACHABLE();
545     #endif
546 
547     return simde__m128_from_private(r_);
548   #endif
549 }
550 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
551   #undef _mm_ceil_ss
552   #define _mm_ceil_ss(a, b) simde_mm_ceil_ss(a, b)
553 #endif
554 
555 SIMDE_FUNCTION_ATTRIBUTES
556 simde__m128i
simde_mm_cmpeq_epi64(simde__m128i a,simde__m128i b)557 simde_mm_cmpeq_epi64 (simde__m128i a, simde__m128i b) {
558   #if defined(SIMDE_X86_SSE4_1_NATIVE)
559     return _mm_cmpeq_epi64(a, b);
560   #else
561     simde__m128i_private
562       r_,
563       a_ = simde__m128i_to_private(a),
564       b_ = simde__m128i_to_private(b);
565 
566     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
567       r_.neon_u64 = vceqq_u64(a_.neon_u64, b_.neon_u64);
568     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
569       /* (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) */
570       uint32x4_t cmp = vceqq_u32(a_.neon_u32, b_.neon_u32);
571       uint32x4_t swapped = vrev64q_u32(cmp);
572       r_.neon_u32 = vandq_u32(cmp, swapped);
573     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
574       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), a_.i64 == b_.i64);
575     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
576       r_.altivec_i64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed long long), vec_cmpeq(a_.altivec_i64, b_.altivec_i64));
577     #else
578       SIMDE_VECTORIZE
579       for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
580         r_.u64[i] = (a_.u64[i] == b_.u64[i]) ? ~UINT64_C(0) : UINT64_C(0);
581       }
582     #endif
583 
584     return simde__m128i_from_private(r_);
585   #endif
586 }
587 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
588   #undef _mm_cmpeq_epi64
589   #define _mm_cmpeq_epi64(a, b) simde_mm_cmpeq_epi64(a, b)
590 #endif
591 
592 SIMDE_FUNCTION_ATTRIBUTES
593 simde__m128i
simde_mm_cvtepi8_epi16(simde__m128i a)594 simde_mm_cvtepi8_epi16 (simde__m128i a) {
595   #if defined(SIMDE_X86_SSE4_1_NATIVE)
596     return _mm_cvtepi8_epi16(a);
597   #elif defined(SIMDE_X86_SSE2_NATIVE)
598     return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
599   #else
600     simde__m128i_private
601       r_,
602       a_ = simde__m128i_to_private(a);
603 
604     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
605       int8x16_t s8x16 = a_.neon_i8;                   /* xxxx xxxx xxxx DCBA */
606       int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
607       r_.neon_i16 = s16x8;
608     #elif defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
609       r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, a_.i8,
610           -1,  0, -1,  1, -1,  2,  -1,  3,
611           -1,  4, -1,  5, -1,  6,  -1,  7));
612       r_.i16 >>= 8;
613     #elif defined(SIMDE_CONVERT_VECTOR_)
614       SIMDE_CONVERT_VECTOR_(r_.i16, a_.m64_private[0].i8);
615     #else
616       SIMDE_VECTORIZE
617       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
618         r_.i16[i] = a_.i8[i];
619       }
620     #endif
621 
622     return simde__m128i_from_private(r_);
623   #endif
624 }
625 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
626   #undef _mm_cvtepi8_epi16
627   #define _mm_cvtepi8_epi16(a) simde_mm_cvtepi8_epi16(a)
628 #endif
629 
630 SIMDE_FUNCTION_ATTRIBUTES
631 simde__m128i
simde_mm_cvtepi8_epi32(simde__m128i a)632 simde_mm_cvtepi8_epi32 (simde__m128i a) {
633   #if defined(SIMDE_X86_SSE4_1_NATIVE)
634     return _mm_cvtepi8_epi32(a);
635   #elif defined(SIMDE_X86_SSE2_NATIVE)
636     __m128i tmp = _mm_unpacklo_epi8(a, a);
637     tmp = _mm_unpacklo_epi16(tmp, tmp);
638     return _mm_srai_epi32(tmp, 24);
639   #else
640     simde__m128i_private
641       r_,
642       a_ = simde__m128i_to_private(a);
643 
644     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
645       int8x16_t s8x16 = a_.neon_i8;                     /* xxxx xxxx xxxx DCBA */
646       int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
647       int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
648       r_.neon_i32 = s32x4;
649     #elif defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
650       r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, a_.i8,
651           -1, -1, -1,  0, -1, -1,  -1,  1,
652           -1, -1, -1,  2, -1, -1,  -1,  3));
653       r_.i32 >>= 24;
654     #else
655       SIMDE_VECTORIZE
656       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
657         r_.i32[i] = a_.i8[i];
658       }
659     #endif
660 
661     return simde__m128i_from_private(r_);
662   #endif
663 }
664 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
665   #undef _mm_cvtepi8_epi32
666   #define _mm_cvtepi8_epi32(a) simde_mm_cvtepi8_epi32(a)
667 #endif
668 
669 SIMDE_FUNCTION_ATTRIBUTES
670 simde__m128i
simde_mm_cvtepi8_epi64(simde__m128i a)671 simde_mm_cvtepi8_epi64 (simde__m128i a) {
672   #if defined(SIMDE_X86_SSE4_1_NATIVE)
673     return _mm_cvtepi8_epi64(a);
674   #else
675     simde__m128i_private
676       r_,
677       a_ = simde__m128i_to_private(a);
678 
679     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
680       int8x16_t s8x16 = a_.neon_i8;                     /* xxxx xxxx xxxx xxBA */
681       int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
682       int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
683       int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
684       r_.neon_i64 = s64x2;
685     #elif (!defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
686       /* Disabled on x86 due to lack of 64-bit arithmetic shift until
687        * until AVX-512 (at which point we would be using the native
688        * _mm_cvtepi_epi64 anyways). */
689       r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, a_.i8,
690           -1, -1, -1, -1, -1, -1,  -1,  0,
691           -1, -1, -1, -1, -1, -1,  -1,  1));
692       r_.i64 >>= 56;
693     #else
694       SIMDE_VECTORIZE
695       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
696         r_.i64[i] = a_.i8[i];
697       }
698     #endif
699 
700     return simde__m128i_from_private(r_);
701   #endif
702 }
703 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
704   #undef _mm_cvtepi8_epi64
705   #define _mm_cvtepi8_epi64(a) simde_mm_cvtepi8_epi64(a)
706 #endif
707 
708 SIMDE_FUNCTION_ATTRIBUTES
709 simde__m128i
simde_mm_cvtepu8_epi16(simde__m128i a)710 simde_mm_cvtepu8_epi16 (simde__m128i a) {
711   #if defined(SIMDE_X86_SSE4_1_NATIVE)
712     return _mm_cvtepu8_epi16(a);
713   #elif defined(SIMDE_X86_SSE2_NATIVE)
714     return _mm_unpacklo_epi8(a, _mm_setzero_si128());
715   #else
716     simde__m128i_private
717       r_,
718       a_ = simde__m128i_to_private(a);
719 
720     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
721       uint8x16_t u8x16 = a_.neon_u8;                   /* xxxx xxxx xxxx DCBA */
722       uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
723       r_.neon_u16 = u16x8;
724     #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
725       __typeof__(r_.i8) z = { 0, };
726       r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z,
727           0, 16, 1, 17, 2, 18, 3, 19,
728           4, 20, 5, 21, 6, 22, 7, 23));
729     #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_CLANG_45541) && (!defined(SIMDE_ARCH_POWER) || !defined(__clang__))
730       SIMDE_CONVERT_VECTOR_(r_.i16, a_.m64_private[0].u8);
731     #else
732       SIMDE_VECTORIZE
733       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
734         r_.i16[i] = a_.u8[i];
735       }
736     #endif
737 
738     return simde__m128i_from_private(r_);
739   #endif
740 }
741 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
742   #undef _mm_cvtepu8_epi16
743   #define _mm_cvtepu8_epi16(a) simde_mm_cvtepu8_epi16(a)
744 #endif
745 
746 SIMDE_FUNCTION_ATTRIBUTES
747 simde__m128i
simde_mm_cvtepu8_epi32(simde__m128i a)748 simde_mm_cvtepu8_epi32 (simde__m128i a) {
749   #if defined(SIMDE_X86_SSE4_1_NATIVE)
750     return _mm_cvtepu8_epi32(a);
751   #elif defined(SIMDE_X86_SSSE3_NATIVE)
752     __m128i s = _mm_set_epi8(
753         0x80, 0x80, 0x80, 0x03, 0x80, 0x80, 0x80, 0x02,
754         0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x00);
755     return _mm_shuffle_epi8(a, s);
756   #elif defined(SIMDE_X86_SSE2_NATIVE)
757     __m128i z = _mm_setzero_si128();
758     return _mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z);
759   #else
760     simde__m128i_private
761       r_,
762       a_ = simde__m128i_to_private(a);
763 
764     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
765       uint8x16_t u8x16 = a_.neon_u8;                     /* xxxx xxxx xxxx DCBA */
766       uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
767       uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
768       r_.neon_u32 = u32x4;
769     #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
770       __typeof__(r_.i8) z = { 0, };
771       r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z,
772           0, 17, 18, 19, 1, 21, 22, 23,
773           2, 25, 26, 27, 3, 29, 30, 31));
774     #else
775       SIMDE_VECTORIZE
776       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
777         r_.i32[i] = a_.u8[i];
778       }
779     #endif
780 
781     return simde__m128i_from_private(r_);
782   #endif
783 }
784 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
785   #undef _mm_cvtepu8_epi32
786   #define _mm_cvtepu8_epi32(a) simde_mm_cvtepu8_epi32(a)
787 #endif
788 
789 SIMDE_FUNCTION_ATTRIBUTES
790 simde__m128i
simde_mm_cvtepu8_epi64(simde__m128i a)791 simde_mm_cvtepu8_epi64 (simde__m128i a) {
792   #if defined(SIMDE_X86_SSE4_1_NATIVE)
793     return _mm_cvtepu8_epi64(a);
794   #elif defined(SIMDE_X86_SSSE3_NATIVE)
795     __m128i s = _mm_set_epi8(
796         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01,
797         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00);
798     return _mm_shuffle_epi8(a, s);
799   #elif defined(SIMDE_X86_SSE2_NATIVE)
800     __m128i z = _mm_setzero_si128();
801     return _mm_unpacklo_epi32(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z), z);
802   #else
803     simde__m128i_private
804       r_,
805       a_ = simde__m128i_to_private(a);
806 
807     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
808       uint8x16_t u8x16 = a_.neon_u8;                     /* xxxx xxxx xxxx xxBA */
809       uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
810       uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
811       uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
812       r_.neon_u64 = u64x2;
813     #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
814       __typeof__(r_.i8) z = { 0, };
815       r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z,
816           0, 17, 18, 19, 20, 21, 22, 23,
817           1, 25, 26, 27, 28, 29, 30, 31));
818     #else
819       SIMDE_VECTORIZE
820       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
821         r_.i64[i] = a_.u8[i];
822       }
823     #endif
824 
825     return simde__m128i_from_private(r_);
826   #endif
827 }
828 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
829   #undef _mm_cvtepu8_epi64
830   #define _mm_cvtepu8_epi64(a) simde_mm_cvtepu8_epi64(a)
831 #endif
832 
833 SIMDE_FUNCTION_ATTRIBUTES
834 simde__m128i
simde_mm_cvtepi16_epi32(simde__m128i a)835 simde_mm_cvtepi16_epi32 (simde__m128i a) {
836   #if defined(SIMDE_X86_SSE4_1_NATIVE)
837     return _mm_cvtepi16_epi32(a);
838   #elif defined(SIMDE_X86_SSE2_NATIVE)
839     return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
840   #else
841     simde__m128i_private
842       r_,
843       a_ = simde__m128i_to_private(a);
844 
845     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
846       r_.neon_i32 = vmovl_s16(vget_low_s16(a_.neon_i16));
847     #elif !defined(SIMDE_ARCH_X86) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
848       r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, a_.i16, 8, 0, 10, 1, 12, 2, 14, 3));
849       r_.i32 >>= 16;
850     #else
851       SIMDE_VECTORIZE
852       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
853         r_.i32[i] = a_.i16[i];
854       }
855     #endif
856 
857     return simde__m128i_from_private(r_);
858   #endif
859 }
860 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
861   #undef _mm_cvtepi16_epi32
862   #define _mm_cvtepi16_epi32(a) simde_mm_cvtepi16_epi32(a)
863 #endif
864 
865 SIMDE_FUNCTION_ATTRIBUTES
866 simde__m128i
simde_mm_cvtepu16_epi32(simde__m128i a)867 simde_mm_cvtepu16_epi32 (simde__m128i a) {
868   #if defined(SIMDE_X86_SSE4_1_NATIVE)
869     return _mm_cvtepu16_epi32(a);
870   #elif defined(SIMDE_X86_SSE2_NATIVE)
871     return _mm_unpacklo_epi16(a, _mm_setzero_si128());
872   #else
873     simde__m128i_private
874       r_,
875       a_ = simde__m128i_to_private(a);
876 
877     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
878       r_.neon_u32 = vmovl_u16(vget_low_u16(a_.neon_u16));
879     #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
880       __typeof__(r_.u16) z = { 0, };
881       r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.u16, z,
882           0, 9, 1, 11, 2, 13, 3, 15));
883     #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_CLANG_45541) && (!defined(SIMDE_ARCH_POWER) || !defined(__clang__))
884       SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].u16);
885     #else
886       SIMDE_VECTORIZE
887       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
888         r_.i32[i] = a_.u16[i];
889       }
890     #endif
891 
892     return simde__m128i_from_private(r_);
893   #endif
894 }
895 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
896   #undef _mm_cvtepu16_epi32
897   #define _mm_cvtepu16_epi32(a) simde_mm_cvtepu16_epi32(a)
898 #endif
899 
900 SIMDE_FUNCTION_ATTRIBUTES
901 simde__m128i
simde_mm_cvtepu16_epi64(simde__m128i a)902 simde_mm_cvtepu16_epi64 (simde__m128i a) {
903   #if defined(SIMDE_X86_SSE4_1_NATIVE)
904     return _mm_cvtepu16_epi64(a);
905   #elif defined(SIMDE_X86_SSE2_NATIVE)
906     __m128i z = _mm_setzero_si128();
907     return _mm_unpacklo_epi32(_mm_unpacklo_epi16(a, z), z);
908   #else
909     simde__m128i_private
910       r_,
911       a_ = simde__m128i_to_private(a);
912 
913     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
914       uint16x8_t u16x8 = a_.neon_u16;                    /* xxxx xxxx xxxx 0B0A */
915       uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
916       uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
917       r_.neon_u64 = u64x2;
918     #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
919       __typeof__(r_.u16) z = { 0, };
920       r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.u16, z,
921           0,  9, 10, 11,
922           1, 13, 14, 15));
923     #else
924       SIMDE_VECTORIZE
925       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
926         r_.i64[i] = a_.u16[i];
927       }
928     #endif
929 
930     return simde__m128i_from_private(r_);
931   #endif
932 }
933 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
934   #undef _mm_cvtepu16_epi64
935   #define _mm_cvtepu16_epi64(a) simde_mm_cvtepu16_epi64(a)
936 #endif
937 
938 SIMDE_FUNCTION_ATTRIBUTES
939 simde__m128i
simde_mm_cvtepi16_epi64(simde__m128i a)940 simde_mm_cvtepi16_epi64 (simde__m128i a) {
941   #if defined(SIMDE_X86_SSE4_1_NATIVE)
942     return _mm_cvtepi16_epi64(a);
943   #else
944     simde__m128i_private
945       r_,
946       a_ = simde__m128i_to_private(a);
947 
948     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
949       int16x8_t s16x8 = a_.neon_i16;                    /* xxxx xxxx xxxx 0B0A */
950       int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
951       int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
952       r_.neon_i64 = s64x2;
953     #elif (!defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
954       r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, a_.i16,
955            8,  9, 10, 0,
956           12, 13, 14, 1));
957       r_.i64 >>= 48;
958     #else
959       SIMDE_VECTORIZE
960       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
961         r_.i64[i] = a_.i16[i];
962       }
963     #endif
964 
965     return simde__m128i_from_private(r_);
966   #endif
967 }
968 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
969   #undef _mm_cvtepi16_epi64
970   #define _mm_cvtepi16_epi64(a) simde_mm_cvtepi16_epi64(a)
971 #endif
972 
973 SIMDE_FUNCTION_ATTRIBUTES
974 simde__m128i
simde_mm_cvtepi32_epi64(simde__m128i a)975 simde_mm_cvtepi32_epi64 (simde__m128i a) {
976   #if defined(SIMDE_X86_SSE4_1_NATIVE)
977     return _mm_cvtepi32_epi64(a);
978   #elif defined(SIMDE_X86_SSE2_NATIVE)
979     __m128i tmp = _mm_shuffle_epi32(a, 0x50);
980     tmp = _mm_srai_epi32(tmp, 31);
981     tmp = _mm_shuffle_epi32(tmp, 0xed);
982     return _mm_unpacklo_epi32(a, tmp);
983   #else
984     simde__m128i_private
985       r_,
986       a_ = simde__m128i_to_private(a);
987 
988     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
989       r_.neon_i64 = vmovl_s32(vget_low_s32(a_.neon_i32));
990     #elif !defined(SIMDE_ARCH_X86) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
991       r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, a_.i32, -1, 0, -1, 1));
992       r_.i64 >>= 32;
993     #elif defined(SIMDE_CONVERT_VECTOR_)
994       SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].i32);
995     #else
996       SIMDE_VECTORIZE
997       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
998         r_.i64[i] = a_.i32[i];
999       }
1000     #endif
1001 
1002     return simde__m128i_from_private(r_);
1003   #endif
1004 }
1005 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1006   #undef _mm_cvtepi32_epi64
1007   #define _mm_cvtepi32_epi64(a) simde_mm_cvtepi32_epi64(a)
1008 #endif
1009 
1010 SIMDE_FUNCTION_ATTRIBUTES
1011 simde__m128i
simde_mm_cvtepu32_epi64(simde__m128i a)1012 simde_mm_cvtepu32_epi64 (simde__m128i a) {
1013   #if defined(SIMDE_X86_SSE4_1_NATIVE)
1014     return _mm_cvtepu32_epi64(a);
1015   #elif defined(SIMDE_X86_SSE2_NATIVE)
1016     return _mm_unpacklo_epi32(a, _mm_setzero_si128());
1017   #else
1018     simde__m128i_private
1019       r_,
1020       a_ = simde__m128i_to_private(a);
1021 
1022     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1023       r_.neon_u64 = vmovl_u32(vget_low_u32(a_.neon_u32));
1024     #elif defined(SIMDE_VECTOR_SCALAR) && defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
1025       __typeof__(r_.u32) z = { 0, };
1026       r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(32, 16, a_.u32, z, 0, 4, 1, 6));
1027     #elif defined(SIMDE_CONVERT_VECTOR_)
1028       SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].u32);
1029     #else
1030       SIMDE_VECTORIZE
1031       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
1032         r_.i64[i] = a_.u32[i];
1033       }
1034     #endif
1035 
1036     return simde__m128i_from_private(r_);
1037   #endif
1038 }
1039 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1040   #undef _mm_cvtepu32_epi64
1041   #define _mm_cvtepu32_epi64(a) simde_mm_cvtepu32_epi64(a)
1042 #endif
1043 
1044 SIMDE_FUNCTION_ATTRIBUTES
1045 simde__m128d
simde_mm_dp_pd(simde__m128d a,simde__m128d b,const int imm8)1046 simde_mm_dp_pd (simde__m128d a, simde__m128d b, const int imm8)
1047     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
1048   simde__m128d_private
1049     r_,
1050     a_ = simde__m128d_to_private(a),
1051     b_ = simde__m128d_to_private(b);
1052 
1053   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1054     r_.neon_f64 = vmulq_f64(a_.neon_f64, b_.neon_f64);
1055 
1056     switch (imm8) {
1057       case 0xff:
1058         r_.neon_f64 = vaddq_f64(r_.neon_f64, vextq_f64(r_.neon_f64, r_.neon_f64, 1));
1059         break;
1060       case 0x13:
1061         r_.neon_f64 = vdupq_lane_f64(vget_low_f64(r_.neon_f64), 0);
1062         break;
1063       default:
1064         { /* imm8 is a compile-time constant, so this all becomes just a load */
1065           uint64_t mask_data[] = {
1066             (imm8 & (1 << 4)) ? ~UINT64_C(0) : UINT64_C(0),
1067             (imm8 & (1 << 5)) ? ~UINT64_C(0) : UINT64_C(0),
1068           };
1069           r_.neon_f64 = vreinterpretq_f64_u64(vandq_u64(vld1q_u64(mask_data), vreinterpretq_u64_f64(r_.neon_f64)));
1070         }
1071 
1072         r_.neon_f64 = vdupq_n_f64(vaddvq_f64(r_.neon_f64));
1073 
1074         {
1075           uint64_t mask_data[] = {
1076             (imm8 & 1) ? ~UINT64_C(0) : UINT64_C(0),
1077             (imm8 & 2) ? ~UINT64_C(0) : UINT64_C(0)
1078           };
1079           r_.neon_f64 = vreinterpretq_f64_u64(vandq_u64(vld1q_u64(mask_data), vreinterpretq_u64_f64(r_.neon_f64)));
1080         }
1081         break;
1082     }
1083   #else
1084     simde_float64 sum = SIMDE_FLOAT64_C(0.0);
1085 
1086     SIMDE_VECTORIZE_REDUCTION(+:sum)
1087     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1088       sum += ((imm8 >> (i + 4)) & 1) ? (a_.f64[i] * b_.f64[i]) : 0.0;
1089     }
1090 
1091     SIMDE_VECTORIZE
1092     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1093       r_.f64[i] = ((imm8 >> i) & 1) ? sum : 0.0;
1094     }
1095   #endif
1096 
1097   return simde__m128d_from_private(r_);
1098 }
1099 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1100 #  define simde_mm_dp_pd(a, b, imm8) _mm_dp_pd(a, b, imm8)
1101 #endif
1102 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1103   #undef _mm_dp_pd
1104   #define _mm_dp_pd(a, b, imm8) simde_mm_dp_pd(a, b, imm8)
1105 #endif
1106 
1107 SIMDE_FUNCTION_ATTRIBUTES
1108 simde__m128
simde_mm_dp_ps(simde__m128 a,simde__m128 b,const int imm8)1109 simde_mm_dp_ps (simde__m128 a, simde__m128 b, const int imm8)
1110     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
1111   simde__m128_private
1112     r_,
1113     a_ = simde__m128_to_private(a),
1114     b_ = simde__m128_to_private(b);
1115 
1116   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1117     r_.neon_f32 = vmulq_f32(a_.neon_f32, b_.neon_f32);
1118 
1119     switch (imm8) {
1120       case 0xff:
1121         r_.neon_f32 = vdupq_n_f32(vaddvq_f32(r_.neon_f32));
1122         break;
1123       case 0x7f:
1124         r_.neon_f32 = vsetq_lane_f32(0, r_.neon_f32, 3);
1125         r_.neon_f32 = vdupq_n_f32(vaddvq_f32(r_.neon_f32));
1126         break;
1127       default:
1128         {
1129           {
1130             uint32_t mask_data[] = {
1131               (imm8 & (1 << 4)) ? ~UINT32_C(0) : UINT32_C(0),
1132               (imm8 & (1 << 5)) ? ~UINT32_C(0) : UINT32_C(0),
1133               (imm8 & (1 << 6)) ? ~UINT32_C(0) : UINT32_C(0),
1134               (imm8 & (1 << 7)) ? ~UINT32_C(0) : UINT32_C(0)
1135             };
1136             r_.neon_f32 = vreinterpretq_f32_u32(vandq_u32(vld1q_u32(mask_data), vreinterpretq_u32_f32(r_.neon_f32)));
1137           }
1138 
1139           r_.neon_f32 = vdupq_n_f32(vaddvq_f32(r_.neon_f32));
1140 
1141           {
1142             uint32_t mask_data[] = {
1143               (imm8 & 1) ? ~UINT32_C(0) : UINT32_C(0),
1144               (imm8 & 2) ? ~UINT32_C(0) : UINT32_C(0),
1145               (imm8 & 4) ? ~UINT32_C(0) : UINT32_C(0),
1146               (imm8 & 8) ? ~UINT32_C(0) : UINT32_C(0)
1147             };
1148             r_.neon_f32 = vreinterpretq_f32_u32(vandq_u32(vld1q_u32(mask_data), vreinterpretq_u32_f32(r_.neon_f32)));
1149           }
1150         }
1151         break;
1152     }
1153   #else
1154     simde_float32 sum = SIMDE_FLOAT32_C(0.0);
1155 
1156     SIMDE_VECTORIZE_REDUCTION(+:sum)
1157     for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1158       sum += ((imm8 >> (i + 4)) & 1) ? (a_.f32[i] * b_.f32[i]) : SIMDE_FLOAT32_C(0.0);
1159     }
1160 
1161     SIMDE_VECTORIZE
1162     for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1163       r_.f32[i] = ((imm8 >> i) & 1) ? sum : SIMDE_FLOAT32_C(0.0);
1164     }
1165   #endif
1166 
1167   return simde__m128_from_private(r_);
1168 }
1169 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1170 #  define simde_mm_dp_ps(a, b, imm8) _mm_dp_ps(a, b, imm8)
1171 #endif
1172 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1173   #undef _mm_dp_ps
1174   #define _mm_dp_ps(a, b, imm8) simde_mm_dp_ps(a, b, imm8)
1175 #endif
1176 
1177 #if defined(simde_mm_extract_epi8)
1178 #  undef simde_mm_extract_epi8
1179 #endif
1180 SIMDE_FUNCTION_ATTRIBUTES
1181 int8_t
simde_mm_extract_epi8(simde__m128i a,const int imm8)1182 simde_mm_extract_epi8 (simde__m128i a, const int imm8)
1183     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15)  {
1184   simde__m128i_private
1185     a_ = simde__m128i_to_private(a);
1186 
1187   #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1188     #if defined(SIMDE_BUG_GCC_95227)
1189       (void) a_;
1190       (void) imm8;
1191     #endif
1192     return vec_extract(a_.altivec_i8, imm8);
1193   #else
1194     return a_.i8[imm8 & 15];
1195   #endif
1196 }
1197 #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_EXTRACT_EPI8)
1198 #  define simde_mm_extract_epi8(a, imm8) HEDLEY_STATIC_CAST(int8_t, _mm_extract_epi8(a, imm8))
1199 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1200 #  define simde_mm_extract_epi8(a, imm8) vgetq_lane_s8(simde__m128i_to_private(a).neon_i8, imm8)
1201 #endif
1202 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1203   #undef _mm_extract_epi8
1204   #define _mm_extract_epi8(a, imm8) HEDLEY_STATIC_CAST(int, simde_mm_extract_epi8(a, imm8))
1205 #endif
1206 
1207 #if defined(simde_mm_extract_epi32)
1208 #  undef simde_mm_extract_epi32
1209 #endif
1210 SIMDE_FUNCTION_ATTRIBUTES
1211 int32_t
simde_mm_extract_epi32(simde__m128i a,const int imm8)1212 simde_mm_extract_epi32 (simde__m128i a, const int imm8)
1213     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3)  {
1214   simde__m128i_private
1215     a_ = simde__m128i_to_private(a);
1216 
1217   #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1218     #if defined(SIMDE_BUG_GCC_95227)
1219       (void) a_;
1220       (void) imm8;
1221     #endif
1222     return vec_extract(a_.altivec_i32, imm8);
1223   #else
1224     return a_.i32[imm8 & 3];
1225   #endif
1226 }
1227 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1228 #  define simde_mm_extract_epi32(a, imm8) _mm_extract_epi32(a, imm8)
1229 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1230 #  define simde_mm_extract_epi32(a, imm8) vgetq_lane_s32(simde__m128i_to_private(a).neon_i32, imm8)
1231 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1232 #  define simde_mm_extract_epi32(a, imm8) HEDLEY_STATIC_CAST(int32_t, vec_extract(simde__m128i_to_private(a).altivec_i32, imm8))
1233 #endif
1234 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1235   #undef _mm_extract_epi32
1236   #define _mm_extract_epi32(a, imm8) simde_mm_extract_epi32(a, imm8)
1237 #endif
1238 
1239 #if defined(simde_mm_extract_epi64)
1240 #  undef simde_mm_extract_epi64
1241 #endif
1242 SIMDE_FUNCTION_ATTRIBUTES
1243 int64_t
simde_mm_extract_epi64(simde__m128i a,const int imm8)1244 simde_mm_extract_epi64 (simde__m128i a, const int imm8)
1245     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1)  {
1246   simde__m128i_private
1247     a_ = simde__m128i_to_private(a);
1248 
1249   #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
1250     #if defined(SIMDE_BUG_GCC_95227)
1251       (void) a_;
1252       (void) imm8;
1253     #endif
1254     return vec_extract(a_.altivec_i64, imm8);
1255   #else
1256     return a_.i64[imm8 & 1];
1257   #endif
1258 }
1259 #if defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ARCH_AMD64)
1260 #  define simde_mm_extract_epi64(a, imm8) _mm_extract_epi64(a, imm8)
1261 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1262 #  define simde_mm_extract_epi64(a, imm8) vgetq_lane_s64(simde__m128i_to_private(a).neon_i64, imm8)
1263 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
1264 #  define simde_mm_extract_epi64(a, imm8) HEDLEY_STATIC_CAST(int64_t, vec_extract(simde__m128i_to_private(a).altivec_i64, imm8))
1265 #endif
1266 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
1267   #undef _mm_extract_epi64
1268   #define _mm_extract_epi64(a, imm8) simde_mm_extract_epi64(a, imm8)
1269 #endif
1270 
1271 #if defined(simde_mm_extract_ps)
1272 #  undef simde_mm_extract_ps
1273 #endif
1274 SIMDE_FUNCTION_ATTRIBUTES
1275 int32_t
simde_mm_extract_ps(simde__m128 a,const int imm8)1276 simde_mm_extract_ps (simde__m128 a, const int imm8)
1277     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3)  {
1278   simde__m128_private
1279     a_ = simde__m128_to_private(a);
1280 
1281   return a_.i32[imm8 & 3];
1282 }
1283 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1284   #define simde_mm_extract_ps(a, imm8) _mm_extract_ps(a, imm8)
1285 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1286   #define simde_mm_extract_ps(a, imm8) vgetq_lane_s32(simde__m128_to_private(a).neon_i32, imm8)
1287 #endif
1288 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1289   #undef _mm_extract_ps
1290   #define _mm_extract_ps(a, imm8) simde_mm_extract_ps(a, imm8)
1291 #endif
1292 
1293 SIMDE_FUNCTION_ATTRIBUTES
1294 simde__m128d
simde_mm_floor_pd(simde__m128d a)1295 simde_mm_floor_pd (simde__m128d a) {
1296   return simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_NEG_INF);
1297 }
1298 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1299   #undef _mm_floor_pd
1300   #define _mm_floor_pd(a) simde_mm_floor_pd(a)
1301 #endif
1302 
1303 SIMDE_FUNCTION_ATTRIBUTES
1304 simde__m128
simde_mm_floor_ps(simde__m128 a)1305 simde_mm_floor_ps (simde__m128 a) {
1306   return simde_mm_round_ps(a, SIMDE_MM_FROUND_TO_NEG_INF);
1307 }
1308 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1309   #undef _mm_floor_ps
1310   #define _mm_floor_ps(a) simde_mm_floor_ps(a)
1311 #endif
1312 
1313 SIMDE_FUNCTION_ATTRIBUTES
1314 simde__m128d
simde_mm_floor_sd(simde__m128d a,simde__m128d b)1315 simde_mm_floor_sd (simde__m128d a, simde__m128d b) {
1316   #if defined(SIMDE_X86_SSE4_1_NATIVE)
1317     return _mm_floor_sd(a, b);
1318   #else
1319     simde__m128d_private
1320       r_,
1321       a_ = simde__m128d_to_private(a),
1322       b_ = simde__m128d_to_private(b);
1323 
1324     #if defined(simde_math_floor)
1325       r_.f64[0] = simde_math_floor(b_.f64[0]);
1326       r_.f64[1] = a_.f64[1];
1327     #else
1328       HEDLEY_UNREACHABLE();
1329     #endif
1330 
1331     return simde__m128d_from_private(r_);
1332   #endif
1333 }
1334 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1335   #undef _mm_floor_sd
1336   #define _mm_floor_sd(a, b) simde_mm_floor_sd(a, b)
1337 #endif
1338 
1339 SIMDE_FUNCTION_ATTRIBUTES
1340 simde__m128
simde_mm_floor_ss(simde__m128 a,simde__m128 b)1341 simde_mm_floor_ss (simde__m128 a, simde__m128 b) {
1342   #if defined(SIMDE_X86_SSE4_1_NATIVE)
1343     return _mm_floor_ss(a, b);
1344   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1345       return simde_mm_move_ss(a, simde_mm_floor_ps(b));
1346   #else
1347     simde__m128_private
1348       r_,
1349       a_ = simde__m128_to_private(a),
1350       b_ = simde__m128_to_private(b);
1351 
1352     #if defined(simde_math_floorf)
1353       r_.f32[0] = simde_math_floorf(b_.f32[0]);
1354       for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1355         r_.f32[i] = a_.f32[i];
1356       }
1357     #else
1358       HEDLEY_UNREACHABLE();
1359     #endif
1360 
1361     return simde__m128_from_private(r_);
1362   #endif
1363 }
1364 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1365   #undef _mm_floor_ss
1366   #define _mm_floor_ss(a, b) simde_mm_floor_ss(a, b)
1367 #endif
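/* Usage sketch (illustrative): only lane 0 of the result comes from
 * floor(b); lanes 1-3 are copied from a.
 *
 *   simde__m128 a = simde_mm_set_ps(40.0f, 30.0f, 20.0f, 10.0f);
 *   simde__m128 b = simde_mm_set_ps(0.0f, 0.0f, 0.0f, 1.7f);
 *   simde__m128 r = simde_mm_floor_ss(a, b);  // r = { 1.0f, 20.0f, 30.0f, 40.0f }
 */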
1368 
1369 SIMDE_FUNCTION_ATTRIBUTES
1370 simde__m128i
1371 simde_mm_insert_epi8 (simde__m128i a, int i, const int imm8)
1372     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15)  {
1373   simde__m128i_private
1374     r_ = simde__m128i_to_private(a);
1375 
1376   r_.i8[imm8] = HEDLEY_STATIC_CAST(int8_t, i);
1377 
1378   return simde__m128i_from_private(r_);
1379 }
1380 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1381   /* clang-3.8 returns an incompatible type, so we need the cast.  MSVC
1382    * can't handle the cast ("error C2440: 'type cast': cannot convert
1383    * from '__m128i' to '__m128i'").  */
1384   #if defined(__clang__)
1385     #define simde_mm_insert_epi8(a, i, imm8) HEDLEY_STATIC_CAST(__m128i, _mm_insert_epi8(a, i, imm8))
1386   #else
1387     #define simde_mm_insert_epi8(a, i, imm8) _mm_insert_epi8(a, i, imm8)
1388   #endif
1389 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1390 #  define simde_mm_insert_epi8(a, i, imm8) simde__m128i_from_neon_i8(vsetq_lane_s8(i, simde__m128i_to_private(a).i8, imm8))
1391 #endif
1392 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1393   #undef _mm_insert_epi8
1394   #define _mm_insert_epi8(a, i, imm8) simde_mm_insert_epi8(a, i, imm8)
1395 #endif
1396 
1397 SIMDE_FUNCTION_ATTRIBUTES
1398 simde__m128i
1399 simde_mm_insert_epi32 (simde__m128i a, int i, const int imm8)
1400     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3)  {
1401   simde__m128i_private
1402     r_ = simde__m128i_to_private(a);
1403 
1404   r_.i32[imm8] = HEDLEY_STATIC_CAST(int32_t, i);
1405 
1406   return simde__m128i_from_private(r_);
1407 }
1408 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1409   #if defined(__clang__)
1410     #define simde_mm_insert_epi32(a, i, imm8) HEDLEY_STATIC_CAST(__m128i, _mm_insert_epi32(a, i, imm8))
1411   #else
1412     #define simde_mm_insert_epi32(a, i, imm8) _mm_insert_epi32(a, i, imm8)
1413   #endif
1414 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1415 #  define simde_mm_insert_epi32(a, i, imm8) simde__m128i_from_neon_i32(vsetq_lane_s32(i, simde__m128i_to_private(a).i32, imm8))
1416 #endif
1417 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1418   #undef _mm_insert_epi32
1419   #define _mm_insert_epi32(a, i, imm8) simde_mm_insert_epi32(a, i, imm8)
1420 #endif
1421 
1422 SIMDE_FUNCTION_ATTRIBUTES
1423 simde__m128i
1424 simde_mm_insert_epi64 (simde__m128i a, int64_t i, const int imm8)
1425     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1)  {
1426   #if defined(SIMDE_BUG_GCC_94482)
1427     simde__m128i_private
1428       a_ = simde__m128i_to_private(a);
1429 
1430     switch(imm8) {
1431       case 0:
1432         return simde_mm_set_epi64x(a_.i64[1], i);
1433         break;
1434       case 1:
1435         return simde_mm_set_epi64x(i, a_.i64[0]);
1436         break;
1437       default:
1438         HEDLEY_UNREACHABLE();
1439         break;
1440     }
1441   #else
1442     simde__m128i_private
1443       r_ = simde__m128i_to_private(a);
1444 
1445     r_.i64[imm8] = i;
1446     return simde__m128i_from_private(r_);
1447   #endif
1448 }
1449 #if defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ARCH_AMD64)
1450 #  define simde_mm_insert_epi64(a, i, imm8) _mm_insert_epi64(a, i, imm8)
1451 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1452 #  define simde_mm_insert_epi64(a, i, imm8) simde__m128i_from_neon_i64(vsetq_lane_s64(i, simde__m128i_to_private(a).i64, imm8))
1453 #endif
1454 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
1455   #undef _mm_insert_epi64
1456   #define _mm_insert_epi64(a, i, imm8) simde_mm_insert_epi64(a, i, imm8)
1457 #endif
1458 
1459 SIMDE_FUNCTION_ATTRIBUTES
1460 simde__m128
1461 simde_mm_insert_ps (simde__m128 a, simde__m128 b, const int imm8)
1462     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
1463   simde__m128_private
1464     r_,
1465     a_ = simde__m128_to_private(a),
1466     b_ = simde__m128_to_private(b);
1467 
1468   /* bits 7:6 of imm8 select the element of b; bits 5:4 select the element of a to replace */
1469   a_.f32[(imm8 >> 4) & 3] = b_.f32[(imm8 >> 6) & 3];
1470 
1471   SIMDE_VECTORIZE
1472   for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1473     r_.f32[i] = ((imm8 >> i) & 1) ? SIMDE_FLOAT32_C(0.0) : a_.f32[i];
1474   }
1475 
1476   return simde__m128_from_private(r_);
1477 }
1478 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1479 #  define simde_mm_insert_ps(a, b, imm8) _mm_insert_ps(a, b, imm8)
1480 #endif
1481 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1482   #undef _mm_insert_ps
1483   #define _mm_insert_ps(a, b, imm8) simde_mm_insert_ps(a, b, imm8)
1484 #endif
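/* imm8 layout for _mm_insert_ps (per Intel's description): bits 7:6 select
 * the lane of b to read, bits 5:4 select the lane of a to overwrite, and
 * bits 3:0 are a mask of result lanes to zero afterwards.
 *
 * Usage sketch (illustrative):
 *   simde__m128 a = simde_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
 *   simde__m128 b = simde_mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f);
 *   // take lane 3 of b (8.0f), write it to lane 1 of a, then zero lane 0:
 *   simde__m128 r = simde_mm_insert_ps(a, b, (3 << 6) | (1 << 4) | 0x1);
 *   // r = { 0.0f, 8.0f, 3.0f, 4.0f }
 */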
1485 
1486 SIMDE_FUNCTION_ATTRIBUTES
1487 simde__m128i
1488 simde_mm_max_epi8 (simde__m128i a, simde__m128i b) {
1489   #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI)
1490     return _mm_max_epi8(a, b);
1491   #else
1492     simde__m128i_private
1493       r_,
1494       a_ = simde__m128i_to_private(a),
1495       b_ = simde__m128i_to_private(b);
1496 
1497     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1498       r_.neon_i8 = vmaxq_s8(a_.neon_i8, b_.neon_i8);
1499     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1500       r_.wasm_v128 = wasm_i8x16_max(a_.wasm_v128, b_.wasm_v128);
1501     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1502       r_.altivec_i8 = vec_max(a_.altivec_i8, b_.altivec_i8);
1503     #else
1504       SIMDE_VECTORIZE
1505       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1506         r_.i8[i] = a_.i8[i] > b_.i8[i] ? a_.i8[i] : b_.i8[i];
1507       }
1508     #endif
1509 
1510     return simde__m128i_from_private(r_);
1511   #endif
1512 }
1513 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1514   #undef _mm_max_epi8
1515   #define _mm_max_epi8(a, b) simde_mm_max_epi8(a, b)
1516 #endif
1517 
1518 SIMDE_FUNCTION_ATTRIBUTES
1519 simde__m128i
1520 simde_mm_max_epi32 (simde__m128i a, simde__m128i b) {
1521   #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI)
1522     return _mm_max_epi32(a, b);
1523   #else
1524     simde__m128i_private
1525       r_,
1526       a_ = simde__m128i_to_private(a),
1527       b_ = simde__m128i_to_private(b);
1528 
1529     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1530       r_.neon_i32 = vmaxq_s32(a_.neon_i32, b_.neon_i32);
1531     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1532       r_.wasm_v128 = wasm_i32x4_max(a_.wasm_v128, b_.wasm_v128);
1533     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1534       r_.altivec_i32 = vec_max(a_.altivec_i32, b_.altivec_i32);
1535     #else
1536       SIMDE_VECTORIZE
1537       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1538         r_.i32[i] = a_.i32[i] > b_.i32[i] ? a_.i32[i] : b_.i32[i];
1539       }
1540     #endif
1541 
1542     return simde__m128i_from_private(r_);
1543   #endif
1544 }
1545 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1546   #undef _mm_max_epi32
1547   #define _mm_max_epi32(a, b) simde_mm_max_epi32(a, b)
1548 #endif
1549 
1550 SIMDE_FUNCTION_ATTRIBUTES
1551 simde__m128i
1552 simde_mm_max_epu16 (simde__m128i a, simde__m128i b) {
1553   #if defined(SIMDE_X86_SSE4_1_NATIVE)
1554     return _mm_max_epu16(a, b);
1555   #else
1556     simde__m128i_private
1557       r_,
1558       a_ = simde__m128i_to_private(a),
1559       b_ = simde__m128i_to_private(b);
1560 
1561     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1562       r_.neon_u16 = vmaxq_u16(a_.neon_u16, b_.neon_u16);
1563     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1564       r_.wasm_v128 = wasm_u16x8_max(a_.wasm_v128, b_.wasm_v128);
1565     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1566       r_.altivec_u16 = vec_max(a_.altivec_u16, b_.altivec_u16);
1567     #else
1568       SIMDE_VECTORIZE
1569       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
1570         r_.u16[i] = a_.u16[i] > b_.u16[i] ? a_.u16[i] : b_.u16[i];
1571       }
1572     #endif
1573 
1574     return simde__m128i_from_private(r_);
1575   #endif
1576 }
1577 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1578   #undef _mm_max_epu16
1579   #define _mm_max_epu16(a, b) simde_mm_max_epu16(a, b)
1580 #endif
1581 
1582 SIMDE_FUNCTION_ATTRIBUTES
1583 simde__m128i
1584 simde_mm_max_epu32 (simde__m128i a, simde__m128i b) {
1585   #if defined(SIMDE_X86_SSE4_1_NATIVE)
1586     return _mm_max_epu32(a, b);
1587   #else
1588     simde__m128i_private
1589       r_,
1590       a_ = simde__m128i_to_private(a),
1591       b_ = simde__m128i_to_private(b);
1592 
1593     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1594       r_.neon_u32 = vmaxq_u32(a_.neon_u32, b_.neon_u32);
1595     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1596       r_.wasm_v128 = wasm_u32x4_max(a_.wasm_v128, b_.wasm_v128);
1597     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1598       r_.altivec_u32 = vec_max(a_.altivec_u32, b_.altivec_u32);
1599     #else
1600       SIMDE_VECTORIZE
1601       for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
1602         r_.u32[i] = a_.u32[i] > b_.u32[i] ? a_.u32[i] : b_.u32[i];
1603       }
1604     #endif
1605 
1606     return simde__m128i_from_private(r_);
1607   #endif
1608 }
1609 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1610   #undef _mm_max_epu32
1611   #define _mm_max_epu32(a, b) simde_mm_max_epu32(a, b)
1612 #endif
1613 
1614 SIMDE_FUNCTION_ATTRIBUTES
1615 simde__m128i
1616 simde_mm_min_epi8 (simde__m128i a, simde__m128i b) {
1617   #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI)
1618     return _mm_min_epi8(a, b);
1619   #else
1620     simde__m128i_private
1621       r_,
1622       a_ = simde__m128i_to_private(a),
1623       b_ = simde__m128i_to_private(b);
1624 
1625     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1626       r_.neon_i8 = vminq_s8(a_.neon_i8, b_.neon_i8);
1627     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1628       r_.wasm_v128 = wasm_i8x16_min(a_.wasm_v128, b_.wasm_v128);
1629     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1630       r_.altivec_i8 = vec_min(a_.altivec_i8, b_.altivec_i8);
1631     #else
1632       SIMDE_VECTORIZE
1633       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1634         r_.i8[i] = a_.i8[i] < b_.i8[i] ? a_.i8[i] : b_.i8[i];
1635       }
1636     #endif
1637 
1638     return simde__m128i_from_private(r_);
1639   #endif
1640 }
1641 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1642   #undef _mm_min_epi8
1643   #define _mm_min_epi8(a, b) simde_mm_min_epi8(a, b)
1644 #endif
1645 
1646 SIMDE_FUNCTION_ATTRIBUTES
1647 simde__m128i
1648 simde_mm_min_epi32 (simde__m128i a, simde__m128i b) {
1649   #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI)
1650     return _mm_min_epi32(a, b);
1651   #else
1652     simde__m128i_private
1653       r_,
1654       a_ = simde__m128i_to_private(a),
1655       b_ = simde__m128i_to_private(b);
1656 
1657     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1658       r_.neon_i32 = vminq_s32(a_.neon_i32, b_.neon_i32);
1659     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1660       r_.wasm_v128 = wasm_i32x4_min(a_.wasm_v128, b_.wasm_v128);
1661     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1662       r_.altivec_i32 = vec_min(a_.altivec_i32, b_.altivec_i32);
1663     #else
1664       SIMDE_VECTORIZE
1665       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1666         r_.i32[i] = a_.i32[i] < b_.i32[i] ? a_.i32[i] : b_.i32[i];
1667       }
1668     #endif
1669 
1670     return simde__m128i_from_private(r_);
1671   #endif
1672 }
1673 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1674   #undef _mm_min_epi32
1675   #define _mm_min_epi32(a, b) simde_mm_min_epi32(a, b)
1676 #endif
1677 
1678 SIMDE_FUNCTION_ATTRIBUTES
1679 simde__m128i
1680 simde_mm_min_epu16 (simde__m128i a, simde__m128i b) {
1681   #if defined(SIMDE_X86_SSE4_1_NATIVE)
1682     return _mm_min_epu16(a, b);
1683   #else
1684     simde__m128i_private
1685       r_,
1686       a_ = simde__m128i_to_private(a),
1687       b_ = simde__m128i_to_private(b);
1688 
1689     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1690       r_.neon_u16 = vminq_u16(a_.neon_u16, b_.neon_u16);
1691     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1692       r_.wasm_v128 = wasm_u16x8_min(a_.wasm_v128, b_.wasm_v128);
1693     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1694       r_.altivec_u16 = vec_min(a_.altivec_u16, b_.altivec_u16);
1695     #else
1696       SIMDE_VECTORIZE
1697       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
1698         r_.u16[i] = a_.u16[i] < b_.u16[i] ? a_.u16[i] : b_.u16[i];
1699       }
1700     #endif
1701 
1702     return simde__m128i_from_private(r_);
1703   #endif
1704 }
1705 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1706   #undef _mm_min_epu16
1707   #define _mm_min_epu16(a, b) simde_mm_min_epu16(a, b)
1708 #endif
1709 
1710 SIMDE_FUNCTION_ATTRIBUTES
1711 simde__m128i
1712 simde_mm_min_epu32 (simde__m128i a, simde__m128i b) {
1713   #if defined(SIMDE_X86_SSE4_1_NATIVE)
1714     return _mm_min_epu32(a, b);
1715   #else
1716     simde__m128i_private
1717       r_,
1718       a_ = simde__m128i_to_private(a),
1719       b_ = simde__m128i_to_private(b);
1720 
1721     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1722       r_.neon_u32 = vminq_u32(a_.neon_u32, b_.neon_u32);
1723     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1724       r_.wasm_v128 = wasm_u32x4_min(a_.wasm_v128, b_.wasm_v128);
1725     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1726       r_.altivec_u32 = vec_min(a_.altivec_u32, b_.altivec_u32);
1727     #else
1728       SIMDE_VECTORIZE
1729       for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
1730         r_.u32[i] = a_.u32[i] < b_.u32[i] ? a_.u32[i] : b_.u32[i];
1731       }
1732     #endif
1733 
1734     return simde__m128i_from_private(r_);
1735   #endif
1736 }
1737 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1738   #undef _mm_min_epu32
1739   #define _mm_min_epu32(a, b) simde_mm_min_epu32(a, b)
1740 #endif
1741 
1742 SIMDE_FUNCTION_ATTRIBUTES
1743 simde__m128i
1744 simde_mm_minpos_epu16 (simde__m128i a) {
1745   #if defined(SIMDE_X86_SSE4_1_NATIVE)
1746     return _mm_minpos_epu16(a);
1747   #else
1748     simde__m128i_private
1749       r_ = simde__m128i_to_private(simde_mm_setzero_si128()),
1750       a_ = simde__m128i_to_private(a);
1751 
1752     r_.u16[0] = UINT16_MAX;
1753     for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
1754       if (a_.u16[i] < r_.u16[0]) {
1755         r_.u16[0] = a_.u16[i];
1756         r_.u16[1] = HEDLEY_STATIC_CAST(uint16_t, i);
1757       }
1758     }
1759 
1760     return simde__m128i_from_private(r_);
1761   #endif
1762 }
1763 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1764   #undef _mm_minpos_epu16
1765   #define _mm_minpos_epu16(a) simde_mm_minpos_epu16(a)
1766 #endif
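/* Usage sketch (illustrative): the minimum value goes to u16 lane 0, its
 * index to lane 1, and the remaining lanes are zeroed.
 *
 *   simde__m128i v = simde_mm_set_epi16(9, 8, 7, 3, 7, 8, 9, 5);
 *   simde__m128i r = simde_mm_minpos_epu16(v);
 *   // r.u16 = { 3, 4, 0, 0, 0, 0, 0, 0 }  (value 3 found at index 4)
 */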
1767 
1768 SIMDE_FUNCTION_ATTRIBUTES
1769 simde__m128i
1770 simde_mm_mpsadbw_epu8 (simde__m128i a, simde__m128i b, const int imm8)
1771     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
1772   simde__m128i_private
1773     r_,
1774     a_ = simde__m128i_to_private(a),
1775     b_ = simde__m128i_to_private(b);
1776 
1777   const int a_offset = imm8 & 4;
1778   const int b_offset = (imm8 & 3) << 2;
1779 
1780 #if defined(simde_math_abs)
1781   for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, (sizeof(r_.u16) / sizeof(r_.u16[0]))) ; i++) {
1782     r_.u16[i] =
1783       HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 0] - b_.u8[b_offset + 0]))) +
1784       HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 1] - b_.u8[b_offset + 1]))) +
1785       HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 2] - b_.u8[b_offset + 2]))) +
1786       HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 3] - b_.u8[b_offset + 3])));
1787   }
1788 #else
1789   HEDLEY_UNREACHABLE();
1790 #endif
1791 
1792   return simde__m128i_from_private(r_);
1793 }
1794 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1795 #  define simde_mm_mpsadbw_epu8(a, b, imm8) _mm_mpsadbw_epu8(a, b, imm8)
1796 #endif
1797 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1798   #undef _mm_mpsadbw_epu8
1799   #define _mm_mpsadbw_epu8(a, b, imm8) simde_mm_mpsadbw_epu8(a, b, imm8)
1800 #endif
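/* imm8 for _mm_mpsadbw_epu8: bit 2 selects the starting byte of a (0 or 4),
 * bits 1:0 select the 4-byte block of b (offset 0, 4, 8 or 12).  Each of the
 * eight u16 results is the sum of absolute differences between a 4-byte
 * window of a, sliding by one byte, and that fixed block of b.
 *
 * Worked example (illustrative): imm8 == 5 (0b101) compares
 * a[4 + i .. 7 + i] against b[4 .. 7] for i = 0..7.
 */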
1801 
1802 SIMDE_FUNCTION_ATTRIBUTES
1803 simde__m128i
1804 simde_mm_mul_epi32 (simde__m128i a, simde__m128i b) {
1805   #if defined(SIMDE_X86_SSE4_1_NATIVE)
1806     return _mm_mul_epi32(a, b);
1807   #else
1808     simde__m128i_private
1809       r_,
1810       a_ = simde__m128i_to_private(a),
1811       b_ = simde__m128i_to_private(b);
1812 
1813     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1814       // vmull_s32 upcasts instead of masking, so we downcast.
1815       int32x2_t a_lo = vmovn_s64(a_.neon_i64);
1816       int32x2_t b_lo = vmovn_s64(b_.neon_i64);
1817       r_.neon_i64 = vmull_s32(a_lo, b_lo);
1818     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1819       r_.wasm_v128 = wasm_i64x2_make(
1820         wasm_i32x4_extract_lane(a_.wasm_v128, 0) * HEDLEY_STATIC_CAST(int64_t, wasm_i32x4_extract_lane(b_.wasm_v128, 0)),
1821         wasm_i32x4_extract_lane(a_.wasm_v128, 2) * HEDLEY_STATIC_CAST(int64_t, wasm_i32x4_extract_lane(b_.wasm_v128, 2)));
1822     #else
1823       SIMDE_VECTORIZE
1824       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
1825         r_.i64[i] =
1826           HEDLEY_STATIC_CAST(int64_t, a_.i32[i * 2]) *
1827           HEDLEY_STATIC_CAST(int64_t, b_.i32[i * 2]);
1828       }
1829     #endif
1830 
1831     return simde__m128i_from_private(r_);
1832   #endif
1833 }
1834 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1835   #undef _mm_mul_epi32
1836   #define _mm_mul_epi32(a, b) simde_mm_mul_epi32(a, b)
1837 #endif
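/* Usage sketch (illustrative): _mm_mul_epi32 multiplies the even-indexed
 * (lanes 0 and 2) signed 32-bit elements and widens the products to 64 bits.
 *
 *   simde__m128i a = simde_mm_set_epi32(4, 3, 2, -1);
 *   simde__m128i b = simde_mm_set_epi32(8, 7, 6, 5);
 *   simde__m128i r = simde_mm_mul_epi32(a, b);  // r.i64 = { -5, 21 }
 */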
1838 
1839 SIMDE_FUNCTION_ATTRIBUTES
1840 simde__m128i
1841 simde_mm_mullo_epi32 (simde__m128i a, simde__m128i b) {
1842   #if defined(SIMDE_X86_SSE4_1_NATIVE)
1843     return _mm_mullo_epi32(a, b);
1844   #else
1845     simde__m128i_private
1846       r_,
1847       a_ = simde__m128i_to_private(a),
1848       b_ = simde__m128i_to_private(b);
1849 
1850     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1851       r_.neon_i32 = vmulq_s32(a_.neon_i32, b_.neon_i32);
1852     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1853       (void) a_;
1854       (void) b_;
1855       r_.altivec_i32 = vec_mul(a_.altivec_i32, b_.altivec_i32);
1856     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1857       r_.wasm_v128 = wasm_i32x4_mul(a_.wasm_v128, b_.wasm_v128);
1858     #else
1859       SIMDE_VECTORIZE
1860       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1861         r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (HEDLEY_STATIC_CAST(uint64_t, (HEDLEY_STATIC_CAST(int64_t, a_.i32[i]) * HEDLEY_STATIC_CAST(int64_t, b_.i32[i]))) & 0xffffffff));
1862       }
1863     #endif
1864 
1865     return simde__m128i_from_private(r_);
1866   #endif
1867 }
1868 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1869   #undef _mm_mullo_epi32
1870   #define _mm_mullo_epi32(a, b) simde_mm_mullo_epi32(a, b)
1871 #endif
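/* Usage sketch (illustrative): _mm_mullo_epi32 keeps only the low 32 bits of
 * each 64-bit product.
 *
 *   simde__m128i a = simde_mm_set1_epi32(65536);   // 2^16
 *   simde__m128i b = simde_mm_set1_epi32(65537);   // 2^16 + 1
 *   simde__m128i r = simde_mm_mullo_epi32(a, b);   // each i32 lane == 65536
 *   // the full product 2^32 + 2^16 wraps to 2^16 when truncated to 32 bits
 */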
1872 
1873 SIMDE_FUNCTION_ATTRIBUTES
1874 simde__m128i
1875 simde_x_mm_mullo_epu32 (simde__m128i a, simde__m128i b) {
1876   simde__m128i_private
1877     r_,
1878     a_ = simde__m128i_to_private(a),
1879     b_ = simde__m128i_to_private(b);
1880 
1881     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1882       r_.neon_u32 = vmulq_u32(a_.neon_u32, b_.neon_u32);
1883     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1884       r_.u32 = a_.u32 * b_.u32;
1885     #else
1886       SIMDE_VECTORIZE
1887       for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
1888         r_.u32[i] = a_.u32[i] * b_.u32[i];
1889       }
1890     #endif
1891 
1892   return simde__m128i_from_private(r_);
1893 }
1894 
1895 SIMDE_FUNCTION_ATTRIBUTES
1896 simde__m128i
1897 simde_mm_packus_epi32 (simde__m128i a, simde__m128i b) {
1898   #if defined(SIMDE_X86_SSE4_1_NATIVE)
1899     return _mm_packus_epi32(a, b);
1900   #else
1901     simde__m128i_private
1902       r_,
1903       a_ = simde__m128i_to_private(a),
1904       b_ = simde__m128i_to_private(b);
1905 
1906     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1907       const int32x4_t z = vdupq_n_s32(0);
1908       r_.neon_u16 = vcombine_u16(
1909           vqmovn_u32(vreinterpretq_u32_s32(vmaxq_s32(z, a_.neon_i32))),
1910           vqmovn_u32(vreinterpretq_u32_s32(vmaxq_s32(z, b_.neon_i32))));
1911     #else
1912       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1913         r_.u16[i + 0] = (a_.i32[i] < 0) ? UINT16_C(0) : ((a_.i32[i] > UINT16_MAX) ? (UINT16_MAX) : HEDLEY_STATIC_CAST(uint16_t, a_.i32[i]));
1914         r_.u16[i + 4] = (b_.i32[i] < 0) ? UINT16_C(0) : ((b_.i32[i] > UINT16_MAX) ? (UINT16_MAX) : HEDLEY_STATIC_CAST(uint16_t, b_.i32[i]));
1915       }
1916     #endif
1917 
1918     return simde__m128i_from_private(r_);
1919   #endif
1920 }
1921 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1922   #undef _mm_packus_epi32
1923   #define _mm_packus_epi32(a, b) simde_mm_packus_epi32(a, b)
1924 #endif
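/* Usage sketch (illustrative): signed 32-bit inputs are packed to unsigned
 * 16-bit with saturation; negative values clamp to 0, values above 65535
 * clamp to 65535.
 *
 *   simde__m128i a = simde_mm_set_epi32(70000, -1, 65535, 1);
 *   simde__m128i r = simde_mm_packus_epi32(a, a);
 *   // low four u16 lanes of r: { 1, 65535, 0, 65535 }
 */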
1925 
1926 SIMDE_FUNCTION_ATTRIBUTES
1927 simde__m128d
1928 simde_mm_round_sd (simde__m128d a, simde__m128d b, int rounding)
1929     SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) {
1930   simde__m128d_private
1931     r_ = simde__m128d_to_private(a),
1932     b_ = simde__m128d_to_private(b);
1933 
1934   switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
1935     #if defined(simde_math_nearbyint)
1936       case SIMDE_MM_FROUND_TO_NEAREST_INT:
1937       case SIMDE_MM_FROUND_CUR_DIRECTION:
1938         r_.f64[0] = simde_math_nearbyint(b_.f64[0]);
1939         break;
1940     #endif
1941 
1942     #if defined(simde_math_floor)
1943       case SIMDE_MM_FROUND_TO_NEG_INF:
1944         r_.f64[0] = simde_math_floor(b_.f64[0]);
1945         break;
1946     #endif
1947 
1948     #if defined(simde_math_ceil)
1949       case SIMDE_MM_FROUND_TO_POS_INF:
1950         r_.f64[0] = simde_math_ceil(b_.f64[0]);
1951         break;
1952     #endif
1953 
1954     #if defined(simde_math_trunc)
1955       case SIMDE_MM_FROUND_TO_ZERO:
1956         r_.f64[0] = simde_math_trunc(b_.f64[0]);
1957         break;
1958     #endif
1959 
1960     default:
1961       HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
1962   }
1963 
1964   return simde__m128d_from_private(r_);
1965 }
1966 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1967 #  define simde_mm_round_sd(a, b, rounding) _mm_round_sd(a, b, rounding)
1968 #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
1969 #  define simde_mm_round_sd(a, b, rounding) simde_mm_move_sd(a, simde_mm_round_pd(b, rounding))
1970 #endif
1971 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1972   #undef _mm_round_sd
1973   #define _mm_round_sd(a, b, rounding) simde_mm_round_sd(a, b, rounding)
1974 #endif
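/* Usage sketch (illustrative): the rounding mode is taken from the low bits
 * of the immediate; SIMDE_MM_FROUND_NO_EXC only suppresses exceptions.
 *
 *   simde__m128d a = simde_mm_set_pd(9.0, 9.0);
 *   simde__m128d b = simde_mm_set_pd(0.0, 2.5);
 *   simde__m128d r = simde_mm_round_sd(a, b, SIMDE_MM_FROUND_TO_NEG_INF);
 *   // r = { 2.0, 9.0 }: lane 0 is floor(b[0]), lane 1 is copied from a
 */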
1975 
1976 SIMDE_FUNCTION_ATTRIBUTES
1977 simde__m128
1978 simde_mm_round_ss (simde__m128 a, simde__m128 b, int rounding)
1979     SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) {
1980   simde__m128_private
1981     r_ = simde__m128_to_private(a),
1982     b_ = simde__m128_to_private(b);
1983 
1984   switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
1985     #if defined(simde_math_nearbyintf)
1986       case SIMDE_MM_FROUND_TO_NEAREST_INT:
1987       case SIMDE_MM_FROUND_CUR_DIRECTION:
1988         r_.f32[0] = simde_math_nearbyintf(b_.f32[0]);
1989         break;
1990     #endif
1991 
1992     #if defined(simde_math_floorf)
1993       case SIMDE_MM_FROUND_TO_NEG_INF:
1994         r_.f32[0] = simde_math_floorf(b_.f32[0]);
1995         break;
1996     #endif
1997 
1998     #if defined(simde_math_ceilf)
1999       case SIMDE_MM_FROUND_TO_POS_INF:
2000         r_.f32[0] = simde_math_ceilf(b_.f32[0]);
2001         break;
2002     #endif
2003 
2004     #if defined(simde_math_truncf)
2005       case SIMDE_MM_FROUND_TO_ZERO:
2006         r_.f32[0] = simde_math_truncf(b_.f32[0]);
2007         break;
2008     #endif
2009 
2010     default:
2011       HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_ps());
2012   }
2013 
2014   return simde__m128_from_private(r_);
2015 }
2016 #if defined(SIMDE_X86_SSE4_1_NATIVE)
2017 #  define simde_mm_round_ss(a, b, rounding) _mm_round_ss(a, b, rounding)
2018 #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
2019 #  define simde_mm_round_ss(a, b, rounding) simde_mm_move_ss(a, simde_mm_round_ps(b, rounding))
2020 #endif
2021 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2022   #undef _mm_round_ss
2023   #define _mm_round_ss(a, b, rounding) simde_mm_round_ss(a, b, rounding)
2024 #endif
2025 
2026 SIMDE_FUNCTION_ATTRIBUTES
2027 simde__m128i
2028 simde_mm_stream_load_si128 (const simde__m128i* mem_addr) {
2029   #if defined(SIMDE_X86_SSE4_1_NATIVE)
2030     return _mm_stream_load_si128(HEDLEY_CONST_CAST(simde__m128i*, mem_addr));
2031   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2032     return vreinterpretq_s64_s32(vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr)));
2033   #else
2034     return *mem_addr;
2035   #endif
2036 }
2037 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2038   #undef _mm_stream_load_si128
2039   #define _mm_stream_load_si128(mem_addr) simde_mm_stream_load_si128(mem_addr)
2040 #endif
2041 
2042 SIMDE_FUNCTION_ATTRIBUTES
2043 int
2044 simde_mm_test_all_ones (simde__m128i a) {
2045   #if defined(SIMDE_X86_SSE4_1_NATIVE)
2046     return _mm_test_all_ones(a);
2047   #else
2048     simde__m128i_private a_ = simde__m128i_to_private(a);
2049     int r;
2050 
2051     #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2052       r = vec_all_eq(a_.altivec_i32, vec_splats(~0));
2053     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2054       r = ((vgetq_lane_s64(a_.neon_i64, 0) & vgetq_lane_s64(a_.neon_i64, 1)) == ~HEDLEY_STATIC_CAST(int64_t, 0));
2055     #else
2056       int_fast32_t r_ = ~HEDLEY_STATIC_CAST(int_fast32_t, 0);
2057 
2058       SIMDE_VECTORIZE_REDUCTION(&:r_)
2059       for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) {
2060         r_ &= a_.i32f[i];
2061       }
2062 
2063       r = (r_ == ~HEDLEY_STATIC_CAST(int_fast32_t, 0));
2064     #endif
2065 
2066     return r;
2067   #endif
2068 }
2069 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2070   #undef _mm_test_all_ones
2071   #define _mm_test_all_ones(a) simde_mm_test_all_ones(a)
2072 #endif
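/* Illustrative note: _mm_test_all_ones(a) matches Intel's definition
 *   simde_mm_testc_si128(a, simde_mm_cmpeq_epi32(a, a))
 * i.e. it returns 1 iff every bit of a is set.
 */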
2073 
2074 SIMDE_FUNCTION_ATTRIBUTES
2075 int
2076 simde_mm_test_all_zeros (simde__m128i a, simde__m128i mask) {
2077   #if defined(SIMDE_X86_SSE4_1_NATIVE)
2078     return _mm_test_all_zeros(a, mask);
2079   #else
2080     simde__m128i_private tmp_ = simde__m128i_to_private(simde_mm_and_si128(a, mask));
2081     int r;
2082 
2083     #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2084       r = vec_all_eq(tmp_.altivec_i32, vec_splats(0));
2085     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2086       return !(vgetq_lane_s64(tmp_.neon_i64, 0) | vgetq_lane_s64(tmp_.neon_i64, 1));
2087     #else
2088       int_fast32_t r_ = HEDLEY_STATIC_CAST(int_fast32_t, 0);
2089 
2090       SIMDE_VECTORIZE_REDUCTION(|:r_)
2091       for (size_t i = 0 ; i < (sizeof(tmp_.i32f) / sizeof(tmp_.i32f[0])) ; i++) {
2092         r_ |= tmp_.i32f[i];
2093       }
2094 
2095       r = !r_;
2096     #endif
2097 
2098     return r;
2099   #endif
2100 }
2101 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2102   #undef _mm_test_all_zeros
2103   #define _mm_test_all_zeros(a, mask) simde_mm_test_all_zeros(a, mask)
2104 #endif
2105 
2106 SIMDE_FUNCTION_ATTRIBUTES
2107 int
2108 simde_mm_test_mix_ones_zeros (simde__m128i a, simde__m128i mask) {
2109   #if defined(SIMDE_X86_SSE4_1_NATIVE)
2110     return _mm_test_mix_ones_zeros(a, mask);
2111   #else
2112     simde__m128i_private
2113       a_ = simde__m128i_to_private(a),
2114       mask_ = simde__m128i_to_private(mask);
2115 
2116     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2117       int64x2_t s640 = vandq_s64(a_.neon_i64, mask_.neon_i64);
2118       int64x2_t s641 = vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(a_.neon_i64))), mask_.neon_i64);
2119       return (((vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) & (vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1)))!=0);
2120     #else
2121       for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++)
2122         if (((a_.u64[i] & mask_.u64[i]) != 0) && ((~a_.u64[i] & mask_.u64[i]) != 0))
2123           return 1;
2124 
2125       return 0;
2126     #endif
2127   #endif
2128 }
2129 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2130   #undef _mm_test_mix_ones_zeros
2131   #define _mm_test_mix_ones_zeros(a, mask) simde_mm_test_mix_ones_zeros(a, mask)
2132 #endif
2133 
2134 SIMDE_FUNCTION_ATTRIBUTES
2135 int
2136 simde_mm_testc_si128 (simde__m128i a, simde__m128i b) {
2137   #if defined(SIMDE_X86_SSE4_1_NATIVE)
2138     return _mm_testc_si128(a, b);
2139   #else
2140     simde__m128i_private
2141       a_ = simde__m128i_to_private(a),
2142       b_ = simde__m128i_to_private(b);
2143 
2144     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2145       int64x2_t s64 = vandq_s64(~a_.neon_i64, b_.neon_i64);
2146       return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
2147     #else
2148       int_fast32_t r = 0;
2149 
2150       SIMDE_VECTORIZE_REDUCTION(|:r)
2151       for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) {
2152         r |= ~a_.i32f[i] & b_.i32f[i];
2153       }
2154 
2155       return HEDLEY_STATIC_CAST(int, !r);
2156     #endif
2157   #endif
2158 }
2159 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2160   #undef _mm_testc_si128
2161   #define _mm_testc_si128(a, b) simde_mm_testc_si128(a, b)
2162 #endif
2163 
2164 SIMDE_FUNCTION_ATTRIBUTES
2165 int
2166 simde_mm_testnzc_si128 (simde__m128i a, simde__m128i b) {
2167   #if defined(SIMDE_X86_SSE4_1_NATIVE)
2168     return _mm_testnzc_si128(a, b);
2169   #else
2170     simde__m128i_private
2171       a_ = simde__m128i_to_private(a),
2172       b_ = simde__m128i_to_private(b);
2173 
2174     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2175       int64x2_t s640 = vandq_s64(a_.neon_i64, b_.neon_i64);
2176       int64x2_t s641 = vandq_s64(~a_.neon_i64, b_.neon_i64);
2177       return (((vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) & (vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1)))!=0);
2178     #else
2179       for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
2180         if (((a_.u64[i] & b_.u64[i]) != 0) && ((~a_.u64[i] & b_.u64[i]) != 0))
2181           return 1;
2182       }
2183 
2184       return 0;
2185     #endif
2186   #endif
2187 }
2188 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2189   #undef _mm_testnzc_si128
2190   #define _mm_testnzc_si128(a, b) simde_mm_testnzc_si128(a, b)
2191 #endif
2192 
2193 SIMDE_FUNCTION_ATTRIBUTES
2194 int
2195 simde_mm_testz_si128 (simde__m128i a, simde__m128i b) {
2196   #if defined(SIMDE_X86_SSE4_1_NATIVE)
2197     return _mm_testz_si128(a, b);
2198   #else
2199     simde__m128i_private
2200       a_ = simde__m128i_to_private(a),
2201       b_ = simde__m128i_to_private(b);
2202 
2203     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2204       int64x2_t s64 = vandq_s64(a_.neon_i64, b_.neon_i64);
2205       return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
2206     #else
2207       for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
2208         if ((a_.u64[i] & b_.u64[i]) != 0)
2209           return 0;
2210       }
2211     #endif
2212 
2213     return 1;
2214   #endif
2215 }
2216 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2217   #undef _mm_testz_si128
2218   #define _mm_testz_si128(a, b) simde_mm_testz_si128(a, b)
2219 #endif
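/* Illustrative summary of the PTEST-style predicates (ZF/CF semantics):
 *   _mm_testz_si128(a, b)   -> 1 iff (a & b)  is all zeros   (ZF)
 *   _mm_testc_si128(a, b)   -> 1 iff (~a & b) is all zeros   (CF)
 *   _mm_testnzc_si128(a, b) -> 1 iff both ZF and CF would be 0
 */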
2220 
2221 SIMDE_END_DECLS_
2222 
2223 HEDLEY_DIAGNOSTIC_POP
2224 
2225 #endif /* !defined(SIMDE_X86_SSE4_1_H) */
2226