1 /* SPDX-License-Identifier: MIT
2  *
3  * Permission is hereby granted, free of charge, to any person
4  * obtaining a copy of this software and associated documentation
5  * files (the "Software"), to deal in the Software without
6  * restriction, including without limitation the rights to use, copy,
7  * modify, merge, publish, distribute, sublicense, and/or sell copies
8  * of the Software, and to permit persons to whom the Software is
9  * furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be
12  * included in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Copyright:
24  *   2017-2020 Evan Nemerson <evan@nemerson.com>
25  */
26 
27 #include "sse.h"
28 #if !defined(SIMDE_X86_SSE4_1_H)
29 #define SIMDE_X86_SSE4_1_H
30 
31 #include "ssse3.h"
32 
33 HEDLEY_DIAGNOSTIC_PUSH
34 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
35 SIMDE_BEGIN_DECLS_
36 
37 #if !defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES)
38 #  define SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES
39 #endif
40 
41 SIMDE_FUNCTION_ATTRIBUTES
42 simde__m128i
43 simde_mm_blend_epi16 (simde__m128i a, simde__m128i b, const int imm8)
44     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
45   simde__m128i_private
46     r_,
47     a_ = simde__m128i_to_private(a),
48     b_ = simde__m128i_to_private(b);
49 
50   SIMDE_VECTORIZE
51   for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
52     r_.u16[i] = ((imm8 >> i) & 1) ? b_.u16[i] : a_.u16[i];
53   }
54 
55   return simde__m128i_from_private(r_);
56 }
57 #if defined(SIMDE_X86_SSE4_1_NATIVE)
58 #  define simde_mm_blend_epi16(a, b, imm8) _mm_blend_epi16(a, b, imm8)
59 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
60 #  define simde_mm_blend_epi16(a, b, imm8) \
61      (__extension__ ({ \
62            const uint16_t _mask[8] = {               \
63                ((imm8) & (1 << 0)) ? 0xFFFF : 0x0000, \
64                ((imm8) & (1 << 1)) ? 0xFFFF : 0x0000, \
65                ((imm8) & (1 << 2)) ? 0xFFFF : 0x0000, \
66                ((imm8) & (1 << 3)) ? 0xFFFF : 0x0000, \
67                ((imm8) & (1 << 4)) ? 0xFFFF : 0x0000, \
68                ((imm8) & (1 << 5)) ? 0xFFFF : 0x0000, \
69                ((imm8) & (1 << 6)) ? 0xFFFF : 0x0000, \
70                ((imm8) & (1 << 7)) ? 0xFFFF : 0x0000  \
71            };                                        \
72            uint16x8_t _mask_vec = vld1q_u16(_mask);  \
73            simde__m128i_from_neon_u16(vbslq_u16(_mask_vec, simde__m128i_to_neon_u16(b), simde__m128i_to_neon_u16(a))); \
74        }))
75 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
76 #  define simde_mm_blend_epi16(a, b, imm8)      \
77      (__extension__ ({ \
78            const SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) _mask = {      \
79                ((imm8) & (1 << 0)) ? 0xFFFF : 0x0000, \
80                ((imm8) & (1 << 1)) ? 0xFFFF : 0x0000, \
81                ((imm8) & (1 << 2)) ? 0xFFFF : 0x0000, \
82                ((imm8) & (1 << 3)) ? 0xFFFF : 0x0000, \
83                ((imm8) & (1 << 4)) ? 0xFFFF : 0x0000, \
84                ((imm8) & (1 << 5)) ? 0xFFFF : 0x0000, \
85                ((imm8) & (1 << 6)) ? 0xFFFF : 0x0000, \
86                ((imm8) & (1 << 7)) ? 0xFFFF : 0x0000  \
87            };                                         \
88            simde__m128i_from_altivec_u16(vec_sel(simde__m128i_to_altivec_u16(a), simde__m128i_to_altivec_u16(b), _mask)); \
89        }))
90 #endif
91 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
92   #undef _mm_blend_epi16
93   #define _mm_blend_epi16(a, b, imm8) simde_mm_blend_epi16(a, b, imm8)
94 #endif
95 
96 SIMDE_FUNCTION_ATTRIBUTES
97 simde__m128d
98 simde_mm_blend_pd (simde__m128d a, simde__m128d b, const int imm8)
99     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3)  {
100   simde__m128d_private
101     r_,
102     a_ = simde__m128d_to_private(a),
103     b_ = simde__m128d_to_private(b);
104 
105   SIMDE_VECTORIZE
106   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
107     r_.f64[i] = ((imm8 >> i) & 1) ? b_.f64[i] : a_.f64[i];
108   }
109   return simde__m128d_from_private(r_);
110 }
111 #if defined(SIMDE_X86_SSE4_1_NATIVE)
112 #  define simde_mm_blend_pd(a, b, imm8) _mm_blend_pd(a, b, imm8)
113 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
114 #  define simde_mm_blend_pd(a, b, imm8) \
115      (__extension__ ({ \
116            const uint64_t _mask[2] = {               \
117                ((imm8) & (1 << 0)) ? UINT64_MAX : 0, \
118                ((imm8) & (1 << 1)) ? UINT64_MAX : 0  \
119            };                                        \
120            uint64x2_t _mask_vec = vld1q_u64(_mask);  \
121            simde__m128d_from_neon_u64(vbslq_u64(_mask_vec, simde__m128d_to_neon_u64(b), simde__m128d_to_neon_u64(a))); \
122        }))
123 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
124 #  define simde_mm_blend_pd(a, b, imm8)         \
125      (__extension__ ({ \
126            const SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) _mask = { \
127                ((imm8) & (1 << 0)) ? UINT64_MAX : 0, \
128                ((imm8) & (1 << 1)) ? UINT64_MAX : 0  \
129            };                                        \
130            simde__m128d_from_altivec_f64(vec_sel(simde__m128d_to_altivec_f64(a), simde__m128d_to_altivec_f64(b), _mask)); \
131        }))
132 #endif
133 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
134   #undef _mm_blend_pd
135   #define _mm_blend_pd(a, b, imm8) simde_mm_blend_pd(a, b, imm8)
136 #endif
137 
138 SIMDE_FUNCTION_ATTRIBUTES
139 simde__m128
140 simde_mm_blend_ps (simde__m128 a, simde__m128 b, const int imm8)
141     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15)  {
142   simde__m128_private
143     r_,
144     a_ = simde__m128_to_private(a),
145     b_ = simde__m128_to_private(b);
146 
147   SIMDE_VECTORIZE
148   for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
149     r_.f32[i] = ((imm8 >> i) & 1) ? b_.f32[i] : a_.f32[i];
150   }
151   return simde__m128_from_private(r_);
152 }
153 #if defined(SIMDE_X86_SSE4_1_NATIVE)
154 #  define simde_mm_blend_ps(a, b, imm8) _mm_blend_ps(a, b, imm8)
155 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
156 #  define simde_mm_blend_ps(a, b, imm8) \
157      (__extension__ ({ \
158            const uint32_t _mask[4] = {               \
159                ((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
160                ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
161                ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
162                ((imm8) & (1 << 3)) ? UINT32_MAX : 0  \
163            };                                        \
164            uint32x4_t _mask_vec = vld1q_u32(_mask);  \
165            simde__m128_from_neon_f32(vbslq_f32(_mask_vec, simde__m128_to_neon_f32(b), simde__m128_to_neon_f32(a))); \
166        }))
167 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
168 #  define simde_mm_blend_ps(a, b, imm8) \
169      (__extension__ ({ \
170            const SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) _mask = {       \
171                ((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
172                ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
173                ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
174                ((imm8) & (1 << 3)) ? UINT32_MAX : 0  \
175            };                                        \
176            simde__m128_from_altivec_f32(vec_sel(simde__m128_to_altivec_f32(a), simde__m128_to_altivec_f32(b), _mask)); \
177        }))
178 #endif
179 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
180   #undef _mm_blend_ps
181   #define _mm_blend_ps(a, b, imm8) simde_mm_blend_ps(a, b, imm8)
182 #endif
183 
184 SIMDE_FUNCTION_ATTRIBUTES
185 simde__m128i
186 simde_mm_blendv_epi8 (simde__m128i a, simde__m128i b, simde__m128i mask) {
187   #if defined(SIMDE_X86_SSE4_1_NATIVE)
188     return _mm_blendv_epi8(a, b, mask);
189   #else
190     simde__m128i_private
191       r_,
192       a_ = simde__m128i_to_private(a),
193       b_ = simde__m128i_to_private(b),
194       mask_ = simde__m128i_to_private(mask);
195 
196     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
197       /* Use a signed shift right to create a mask with the sign bit */
198       mask_.neon_i8 = vshrq_n_s8(mask_.neon_i8, 7);
199       r_.neon_i8 = vbslq_s8(mask_.neon_u8, b_.neon_i8, a_.neon_i8);
200     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
201       v128_t m = wasm_i8x16_shr(mask_.wasm_v128, 7);
202       r_.wasm_v128 = wasm_v128_or(wasm_v128_and(b_.wasm_v128, m), wasm_v128_andnot(a_.wasm_v128, m));
203     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
204       r_.altivec_i8 = vec_sel(a_.altivec_i8, b_.altivec_i8, vec_cmplt(mask_.altivec_i8, vec_splat_s8(0)));
205     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
206       /* https://software.intel.com/en-us/forums/intel-c-compiler/topic/850087 */
207       #if defined(HEDLEY_INTEL_VERSION_CHECK)
208         __typeof__(mask_.i8) z = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
209         mask_.i8 = HEDLEY_STATIC_CAST(__typeof__(mask_.i8), mask_.i8 < z);
210       #else
211         mask_.i8 >>= (CHAR_BIT * sizeof(mask_.i8[0])) - 1;
212       #endif
213 
214       r_.i8 = (mask_.i8 & b_.i8) | (~mask_.i8 & a_.i8);
215     #else
216       SIMDE_VECTORIZE
217       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
218         int8_t m = mask_.i8[i] >> 7;
219         r_.i8[i] = (m & b_.i8[i]) | (~m & a_.i8[i]);
220       }
221     #endif
222 
223     return simde__m128i_from_private(r_);
224   #endif
225 }
226 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
227   #undef _mm_blendv_epi8
228   #define _mm_blendv_epi8(a, b, mask) simde_mm_blendv_epi8(a, b, mask)
229 #endif
230 
231 SIMDE_FUNCTION_ATTRIBUTES
232 simde__m128i
233 simde_x_mm_blendv_epi16 (simde__m128i a, simde__m128i b, simde__m128i mask) {
234   #if defined(SIMDE_X86_SSE2_NATIVE)
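    /* Arithmetic shift by 15 broadcasts each lane's sign bit, yielding an all-ones or all-zeros mask for the bitwise select below. */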
235     mask = simde_mm_srai_epi16(mask, 15);
236     return simde_mm_or_si128(simde_mm_and_si128(mask, b), simde_mm_andnot_si128(mask, a));
237   #else
238     simde__m128i_private
239       r_,
240       a_ = simde__m128i_to_private(a),
241       b_ = simde__m128i_to_private(b),
242       mask_ = simde__m128i_to_private(mask);
243 
244     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
245       mask_ = simde__m128i_to_private(simde_mm_cmplt_epi16(mask, simde_mm_setzero_si128()));
246       r_.neon_i16 = vbslq_s16(mask_.neon_u16, b_.neon_i16, a_.neon_i16);
247     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
248       r_.altivec_i16 = vec_sel(a_.altivec_i16, b_.altivec_i16, vec_cmplt(mask_.altivec_i16, vec_splat_s16(0)));
249     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
250       #if defined(HEDLEY_INTEL_VERSION_CHECK)
251         __typeof__(mask_.i16) z = { 0, 0, 0, 0, 0, 0, 0, 0 };
252         mask_.i16 = mask_.i16 < z;
253       #else
254         mask_.i16 >>= (CHAR_BIT * sizeof(mask_.i16[0])) - 1;
255       #endif
256 
257       r_.i16 = (mask_.i16 & b_.i16) | (~mask_.i16 & a_.i16);
258     #else
259       SIMDE_VECTORIZE
260       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
261         int16_t m = mask_.i16[i] >> 15;
262         r_.i16[i] = (m & b_.i16[i]) | (~m & a_.i16[i]);
263       }
264     #endif
265 
266     return simde__m128i_from_private(r_);
267   #endif
268 }
269 
270 SIMDE_FUNCTION_ATTRIBUTES
271 simde__m128i
272 simde_x_mm_blendv_epi32 (simde__m128i a, simde__m128i b, simde__m128i mask) {
273   #if defined(SIMDE_X86_SSE4_1_NATIVE)
274     return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(mask)));
275   #else
276     simde__m128i_private
277       r_,
278       a_ = simde__m128i_to_private(a),
279       b_ = simde__m128i_to_private(b),
280       mask_ = simde__m128i_to_private(mask);
281 
282     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
283       mask_ = simde__m128i_to_private(simde_mm_cmplt_epi32(mask, simde_mm_setzero_si128()));
284       r_.neon_i32 = vbslq_s32(mask_.neon_u32, b_.neon_i32, a_.neon_i32);
285     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
286       v128_t m = wasm_i32x4_shr(mask_.wasm_v128, 31);
287       r_.wasm_v128 = wasm_v128_or(wasm_v128_and(b_.wasm_v128, m), wasm_v128_andnot(a_.wasm_v128, m));
288     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
289       r_.altivec_i32 = vec_sel(a_.altivec_i32, b_.altivec_i32, vec_cmplt(mask_.altivec_i32, vec_splat_s32(0)));
290     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
291       #if defined(HEDLEY_INTEL_VERSION_CHECK)
292         __typeof__(mask_.i32) z = { 0, 0, 0, 0 };
293         mask_.i32 = HEDLEY_STATIC_CAST(__typeof__(mask_.i32), mask_.i32 < z);
294       #else
295         mask_.i32 >>= (CHAR_BIT * sizeof(mask_.i32[0])) - 1;
296       #endif
297 
298       r_.i32 = (mask_.i32 & b_.i32) | (~mask_.i32 & a_.i32);
299     #else
300       SIMDE_VECTORIZE
301       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
302         int32_t m = mask_.i32[i] >> 31;
303         r_.i32[i] = (m & b_.i32[i]) | (~m & a_.i32[i]);
304       }
305     #endif
306 
307     return simde__m128i_from_private(r_);
308   #endif
309 }
310 
311 SIMDE_FUNCTION_ATTRIBUTES
312 simde__m128i
313 simde_x_mm_blendv_epi64 (simde__m128i a, simde__m128i b, simde__m128i mask) {
314   #if defined(SIMDE_X86_SSE4_1_NATIVE)
315     return _mm_castpd_si128(_mm_blendv_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b), _mm_castsi128_pd(mask)));
316   #else
317     simde__m128i_private
318       r_,
319       a_ = simde__m128i_to_private(a),
320       b_ = simde__m128i_to_private(b),
321       mask_ = simde__m128i_to_private(mask);
322 
323     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
324       mask_.neon_u64 = vcltq_s64(mask_.neon_i64, vdupq_n_s64(INT64_C(0)));
325       r_.neon_i64 = vbslq_s64(mask_.neon_u64, b_.neon_i64, a_.neon_i64);
326     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
327       v128_t m = wasm_i64x2_shr(mask_.wasm_v128, 63);
328       r_.wasm_v128 = wasm_v128_or(wasm_v128_and(b_.wasm_v128, m), wasm_v128_andnot(a_.wasm_v128, m));
329     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
330       /* Using int due to clang bug #46770 */
331       SIMDE_POWER_ALTIVEC_VECTOR(signed long long) selector = vec_sra(mask_.altivec_i64, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 63)));
332       r_.altivec_i32 = vec_sel(a_.altivec_i32, b_.altivec_i32, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), selector));
333     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
334       #if defined(HEDLEY_INTEL_VERSION_CHECK)
335         __typeof__(mask_.i64) z = { 0, 0 };
336         mask_.i64 = HEDLEY_STATIC_CAST(__typeof__(mask_.i64), mask_.i64 < z);
337       #else
338         mask_.i64 >>= (CHAR_BIT * sizeof(mask_.i64[0])) - 1;
339       #endif
340 
341     r_.i64 = (mask_.i64 & b_.i64) | (~mask_.i64 & a_.i64);
342   #else
343     SIMDE_VECTORIZE
344     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
345       int64_t m = mask_.i64[i] >> 63;
346       r_.i64[i] = (m & b_.i64[i]) | (~m & a_.i64[i]);
347     }
348   #endif
349 
350     return simde__m128i_from_private(r_);
351   #endif
352 }
353 
354 SIMDE_FUNCTION_ATTRIBUTES
355 simde__m128d
356 simde_mm_blendv_pd (simde__m128d a, simde__m128d b, simde__m128d mask) {
357   #if defined(SIMDE_X86_SSE4_1_NATIVE)
358     return _mm_blendv_pd(a, b, mask);
359   #else
360     return simde_mm_castsi128_pd(simde_x_mm_blendv_epi64(simde_mm_castpd_si128(a), simde_mm_castpd_si128(b), simde_mm_castpd_si128(mask)));
361   #endif
362 }
363 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
364   #undef _mm_blendv_pd
365   #define _mm_blendv_pd(a, b, mask) simde_mm_blendv_pd(a, b, mask)
366 #endif
367 
368 SIMDE_FUNCTION_ATTRIBUTES
369 simde__m128
370 simde_mm_blendv_ps (simde__m128 a, simde__m128 b, simde__m128 mask) {
371   #if defined(SIMDE_X86_SSE4_1_NATIVE)
372     return _mm_blendv_ps(a, b, mask);
373   #else
374     return simde_mm_castsi128_ps(simde_x_mm_blendv_epi32(simde_mm_castps_si128(a), simde_mm_castps_si128(b), simde_mm_castps_si128(mask)));
375   #endif
376 }
377 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
378   #undef _mm_blendv_ps
379   #define _mm_blendv_ps(a, b, mask) simde_mm_blendv_ps(a, b, mask)
380 #endif
381 
382 SIMDE_FUNCTION_ATTRIBUTES
383 simde__m128d
384 simde_mm_round_pd (simde__m128d a, int rounding)
385     SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) {
386   simde__m128d_private
387     r_,
388     a_ = simde__m128d_to_private(a);
389 
390   /* For architectures which lack a current direction SIMD instruction. */
391   #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
392     if ((rounding & 7) == SIMDE_MM_FROUND_CUR_DIRECTION)
393       rounding = HEDLEY_STATIC_CAST(int, SIMDE_MM_GET_ROUNDING_MODE()) >> 13;
394   #endif
395 
396   switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
397     case SIMDE_MM_FROUND_CUR_DIRECTION:
398       #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
399         r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.altivec_f64));
400       #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
401         r_.neon_f64 = vrndiq_f64(a_.neon_f64);
402       #elif defined(simde_math_nearbyint)
403         SIMDE_VECTORIZE
404         for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
405           r_.f64[i] = simde_math_nearbyint(a_.f64[i]);
406         }
407       #else
408         HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
409       #endif
410       break;
411 
412     case SIMDE_MM_FROUND_TO_NEAREST_INT:
413       #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
414         r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.altivec_f64));
415       #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
416         r_.neon_f64 = vrndaq_f64(a_.neon_f64);
417       #elif defined(simde_math_roundeven)
418         SIMDE_VECTORIZE
419         for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
420           r_.f64[i] = simde_math_roundeven(a_.f64[i]);
421         }
422       #else
423         HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
424       #endif
425       break;
426 
427     case SIMDE_MM_FROUND_TO_NEG_INF:
428       #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
429         r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_floor(a_.altivec_f64));
430       #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
431         r_.neon_f64 = vrndmq_f64(a_.neon_f64);
432       #else
433         SIMDE_VECTORIZE
434         for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
435           r_.f64[i] = simde_math_floor(a_.f64[i]);
436         }
437       #endif
438       break;
439 
440     case SIMDE_MM_FROUND_TO_POS_INF:
441       #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
442         r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_ceil(a_.altivec_f64));
443       #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
444         r_.neon_f64 = vrndpq_f64(a_.neon_f64);
445       #elif defined(simde_math_ceil)
446         SIMDE_VECTORIZE
447         for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
448           r_.f64[i] = simde_math_ceil(a_.f64[i]);
449         }
450       #else
451         HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
452       #endif
453       break;
454 
455     case SIMDE_MM_FROUND_TO_ZERO:
456       #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
457         r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_trunc(a_.altivec_f64));
458       #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
459         r_.neon_f64 = vrndq_f64(a_.neon_f64);
460       #else
461         SIMDE_VECTORIZE
462         for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
463           r_.f64[i] = simde_math_trunc(a_.f64[i]);
464         }
465       #endif
466       break;
467 
468     default:
469       HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
470   }
471 
472   return simde__m128d_from_private(r_);
473 }
474 #if defined(SIMDE_X86_SSE4_1_NATIVE)
475   #define simde_mm_round_pd(a, rounding) _mm_round_pd(a, rounding)
476 #endif
477 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
478   #undef _mm_round_pd
479   #define _mm_round_pd(a, rounding) simde_mm_round_pd(a, rounding)
480 #endif
481 
482 SIMDE_FUNCTION_ATTRIBUTES
483 simde__m128d
484 simde_mm_ceil_pd (simde__m128d a) {
485   return simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_POS_INF);
486 }
487 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
488   #undef _mm_ceil_pd
489   #define _mm_ceil_pd(a) simde_mm_ceil_pd(a)
490 #endif
491 
492 SIMDE_FUNCTION_ATTRIBUTES
493 simde__m128
494 simde_mm_ceil_ps (simde__m128 a) {
495   return simde_mm_round_ps(a, SIMDE_MM_FROUND_TO_POS_INF);
496 }
497 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
498   #undef _mm_ceil_ps
499   #define _mm_ceil_ps(a) simde_mm_ceil_ps(a)
500 #endif
501 
502 SIMDE_FUNCTION_ATTRIBUTES
503 simde__m128d
504 simde_mm_ceil_sd (simde__m128d a, simde__m128d b) {
505   #if defined(SIMDE_X86_SSE4_1_NATIVE)
506     return _mm_ceil_sd(a, b);
507   #else
508     simde__m128d_private
509       r_,
510       a_ = simde__m128d_to_private(a),
511       b_ = simde__m128d_to_private(b);
512 
513     #if defined(simde_math_ceil)
514       r_ = simde__m128d_to_private(simde_mm_set_pd(a_.f64[1], simde_math_ceil(b_.f64[0])));
515     #else
516       HEDLEY_UNREACHABLE();
517     #endif
518 
519     return simde__m128d_from_private(r_);
520   #endif
521 }
522 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
523   #undef _mm_ceil_sd
524   #define _mm_ceil_sd(a, b) simde_mm_ceil_sd(a, b)
525 #endif
526 
527 SIMDE_FUNCTION_ATTRIBUTES
528 simde__m128
529 simde_mm_ceil_ss (simde__m128 a, simde__m128 b) {
530   #if defined(SIMDE_X86_SSE4_1_NATIVE)
531     return _mm_ceil_ss(a, b);
532   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
533     return simde_mm_move_ss(a, simde_mm_ceil_ps(b));
534   #else
535     simde__m128_private
536       r_,
537       a_ = simde__m128_to_private(a),
538       b_ = simde__m128_to_private(b);
539 
540     #if defined(simde_math_ceilf)
541       r_ = simde__m128_to_private(simde_mm_set_ps(a_.f32[3], a_.f32[2], a_.f32[1], simde_math_ceilf(b_.f32[0])));
542     #else
543       HEDLEY_UNREACHABLE();
544     #endif
545 
546     return simde__m128_from_private(r_);
547   #endif
548 }
549 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
550   #undef _mm_ceil_ss
551   #define _mm_ceil_ss(a, b) simde_mm_ceil_ss(a, b)
552 #endif
553 
554 SIMDE_FUNCTION_ATTRIBUTES
555 simde__m128i
556 simde_mm_cmpeq_epi64 (simde__m128i a, simde__m128i b) {
557   #if defined(SIMDE_X86_SSE4_1_NATIVE)
558     return _mm_cmpeq_epi64(a, b);
559   #else
560     simde__m128i_private
561       r_,
562       a_ = simde__m128i_to_private(a),
563       b_ = simde__m128i_to_private(b);
564 
565     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
566       r_.neon_u64 = vceqq_u64(a_.neon_u64, b_.neon_u64);
567     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
568       /* (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) */
569       uint32x4_t cmp = vceqq_u32(a_.neon_u32, b_.neon_u32);
570       uint32x4_t swapped = vrev64q_u32(cmp);
571       r_.neon_u32 = vandq_u32(cmp, swapped);
572     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
573       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), a_.i64 == b_.i64);
574     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
575       r_.altivec_i64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed long long), vec_cmpeq(a_.altivec_i64, b_.altivec_i64));
576     #else
577       SIMDE_VECTORIZE
578       for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
579         r_.u64[i] = (a_.u64[i] == b_.u64[i]) ? ~UINT64_C(0) : UINT64_C(0);
580       }
581     #endif
582 
583     return simde__m128i_from_private(r_);
584   #endif
585 }
586 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
587   #undef _mm_cmpeq_epi64
588   #define _mm_cmpeq_epi64(a, b) simde_mm_cmpeq_epi64(a, b)
589 #endif
590 
591 SIMDE_FUNCTION_ATTRIBUTES
592 simde__m128i
593 simde_mm_cvtepi8_epi16 (simde__m128i a) {
594   #if defined(SIMDE_X86_SSE4_1_NATIVE)
595     return _mm_cvtepi8_epi16(a);
596   #elif defined(SIMDE_X86_SSE2_NATIVE)
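    /* Duplicate each byte into both halves of a 16-bit lane, then arithmetic-shift right by 8 to sign-extend. */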
597     return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
598   #else
599     simde__m128i_private
600       r_,
601       a_ = simde__m128i_to_private(a);
602 
603     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
604       int8x16_t s8x16 = a_.neon_i8;                   /* xxxx xxxx xxxx DCBA */
605       int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
606       r_.neon_i16 = s16x8;
607     #elif defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
608       r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, a_.i8,
609           -1,  0, -1,  1, -1,  2,  -1,  3,
610           -1,  4, -1,  5, -1,  6,  -1,  7));
611       r_.i16 >>= 8;
612     #elif defined(SIMDE_CONVERT_VECTOR_)
613       SIMDE_CONVERT_VECTOR_(r_.i16, a_.m64_private[0].i8);
614     #else
615       SIMDE_VECTORIZE
616       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
617         r_.i16[i] = a_.i8[i];
618       }
619     #endif
620 
621     return simde__m128i_from_private(r_);
622   #endif
623 }
624 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
625   #undef _mm_cvtepi8_epi16
626   #define _mm_cvtepi8_epi16(a) simde_mm_cvtepi8_epi16(a)
627 #endif
628 
629 SIMDE_FUNCTION_ATTRIBUTES
630 simde__m128i
631 simde_mm_cvtepi8_epi32 (simde__m128i a) {
632   #if defined(SIMDE_X86_SSE4_1_NATIVE)
633     return _mm_cvtepi8_epi32(a);
634   #elif defined(SIMDE_X86_SSE2_NATIVE)
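    /* Widen 8 -> 32 bits by unpacking each byte against itself twice, then arithmetic-shift by 24 to sign-extend. */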
635     __m128i tmp = _mm_unpacklo_epi8(a, a);
636     tmp = _mm_unpacklo_epi16(tmp, tmp);
637     return _mm_srai_epi32(tmp, 24);
638   #else
639     simde__m128i_private
640       r_,
641       a_ = simde__m128i_to_private(a);
642 
643     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
644       int8x16_t s8x16 = a_.neon_i8;                     /* xxxx xxxx xxxx DCBA */
645       int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
646       int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
647       r_.neon_i32 = s32x4;
648     #elif defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
649       r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, a_.i8,
650           -1, -1, -1,  0, -1, -1,  -1,  1,
651           -1, -1, -1,  2, -1, -1,  -1,  3));
652       r_.i32 >>= 24;
653     #else
654       SIMDE_VECTORIZE
655       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
656         r_.i32[i] = a_.i8[i];
657       }
658     #endif
659 
660     return simde__m128i_from_private(r_);
661   #endif
662 }
663 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
664   #undef _mm_cvtepi8_epi32
665   #define _mm_cvtepi8_epi32(a) simde_mm_cvtepi8_epi32(a)
666 #endif
667 
668 SIMDE_FUNCTION_ATTRIBUTES
669 simde__m128i
670 simde_mm_cvtepi8_epi64 (simde__m128i a) {
671   #if defined(SIMDE_X86_SSE4_1_NATIVE)
672     return _mm_cvtepi8_epi64(a);
673   #else
674     simde__m128i_private
675       r_,
676       a_ = simde__m128i_to_private(a);
677 
678     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
679       int8x16_t s8x16 = a_.neon_i8;                     /* xxxx xxxx xxxx xxBA */
680       int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
681       int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
682       int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
683       r_.neon_i64 = s64x2;
684     #elif (!defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
685       /* Disabled on x86 due to lack of a 64-bit arithmetic shift until
686        * AVX-512 (at which point we would be using the native
687        * _mm_cvtepi8_epi64 anyway). */
688       r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, a_.i8,
689           -1, -1, -1, -1, -1, -1,  -1,  0,
690           -1, -1, -1, -1, -1, -1,  -1,  1));
691       r_.i64 >>= 56;
692     #else
693       SIMDE_VECTORIZE
694       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
695         r_.i64[i] = a_.i8[i];
696       }
697     #endif
698 
699     return simde__m128i_from_private(r_);
700   #endif
701 }
702 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
703   #undef _mm_cvtepi8_epi64
704   #define _mm_cvtepi8_epi64(a) simde_mm_cvtepi8_epi64(a)
705 #endif
706 
707 SIMDE_FUNCTION_ATTRIBUTES
708 simde__m128i
709 simde_mm_cvtepu8_epi16 (simde__m128i a) {
710   #if defined(SIMDE_X86_SSE4_1_NATIVE)
711     return _mm_cvtepu8_epi16(a);
712   #elif defined(SIMDE_X86_SSE2_NATIVE)
713     return _mm_unpacklo_epi8(a, _mm_setzero_si128());
714   #else
715     simde__m128i_private
716       r_,
717       a_ = simde__m128i_to_private(a);
718 
719     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
720       uint8x16_t u8x16 = a_.neon_u8;                   /* xxxx xxxx xxxx DCBA */
721       uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
722       r_.neon_u16 = u16x8;
723     #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
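      /* Shuffle indices of 16 and above select from the zero vector z, interleaving zero bytes to zero-extend. */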
724       __typeof__(r_.i8) z = { 0, };
725       r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z,
726           0, 16, 1, 17, 2, 18, 3, 19,
727           4, 20, 5, 21, 6, 22, 7, 23));
728     #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_CLANG_45541) && (!defined(SIMDE_ARCH_POWER) || !defined(__clang__))
729       SIMDE_CONVERT_VECTOR_(r_.i16, a_.m64_private[0].u8);
730     #else
731       SIMDE_VECTORIZE
732       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
733         r_.i16[i] = a_.u8[i];
734       }
735     #endif
736 
737     return simde__m128i_from_private(r_);
738   #endif
739 }
740 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
741   #undef _mm_cvtepu8_epi16
742   #define _mm_cvtepu8_epi16(a) simde_mm_cvtepu8_epi16(a)
743 #endif
744 
745 SIMDE_FUNCTION_ATTRIBUTES
746 simde__m128i
747 simde_mm_cvtepu8_epi32 (simde__m128i a) {
748   #if defined(SIMDE_X86_SSE4_1_NATIVE)
749     return _mm_cvtepu8_epi32(a);
750   #elif defined(SIMDE_X86_SSSE3_NATIVE)
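    /* A shuffle-control byte with its high bit set (0x80) zeroes the destination byte, so this zero-extends the low four bytes. */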
751     __m128i s = _mm_set_epi8(
752         0x80, 0x80, 0x80, 0x03, 0x80, 0x80, 0x80, 0x02,
753         0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x00);
754     return _mm_shuffle_epi8(a, s);
755   #elif defined(SIMDE_X86_SSE2_NATIVE)
756     __m128i z = _mm_setzero_si128();
757     return _mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z);
758   #else
759     simde__m128i_private
760       r_,
761       a_ = simde__m128i_to_private(a);
762 
763     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
764       uint8x16_t u8x16 = a_.neon_u8;                     /* xxxx xxxx xxxx DCBA */
765       uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
766       uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
767       r_.neon_u32 = u32x4;
768     #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
769       __typeof__(r_.i8) z = { 0, };
770       r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z,
771           0, 17, 18, 19, 1, 21, 22, 23,
772           2, 25, 26, 27, 3, 29, 30, 31));
773     #else
774       SIMDE_VECTORIZE
775       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
776         r_.i32[i] = a_.u8[i];
777       }
778     #endif
779 
780     return simde__m128i_from_private(r_);
781   #endif
782 }
783 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
784   #undef _mm_cvtepu8_epi32
785   #define _mm_cvtepu8_epi32(a) simde_mm_cvtepu8_epi32(a)
786 #endif
787 
788 SIMDE_FUNCTION_ATTRIBUTES
789 simde__m128i
790 simde_mm_cvtepu8_epi64 (simde__m128i a) {
791   #if defined(SIMDE_X86_SSE4_1_NATIVE)
792     return _mm_cvtepu8_epi64(a);
793   #elif defined(SIMDE_X86_SSSE3_NATIVE)
794     __m128i s = _mm_set_epi8(
795         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01,
796         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00);
797     return _mm_shuffle_epi8(a, s);
798   #elif defined(SIMDE_X86_SSE2_NATIVE)
799     __m128i z = _mm_setzero_si128();
800     return _mm_unpacklo_epi32(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z), z);
801   #else
802     simde__m128i_private
803       r_,
804       a_ = simde__m128i_to_private(a);
805 
806     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
807       uint8x16_t u8x16 = a_.neon_u8;                     /* xxxx xxxx xxxx xxBA */
808       uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
809       uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
810       uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
811       r_.neon_u64 = u64x2;
812     #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
813       __typeof__(r_.i8) z = { 0, };
814       r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z,
815           0, 17, 18, 19, 20, 21, 22, 23,
816           1, 25, 26, 27, 28, 29, 30, 31));
817     #else
818       SIMDE_VECTORIZE
819       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
820         r_.i64[i] = a_.u8[i];
821       }
822     #endif
823 
824     return simde__m128i_from_private(r_);
825   #endif
826 }
827 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
828   #undef _mm_cvtepu8_epi64
829   #define _mm_cvtepu8_epi64(a) simde_mm_cvtepu8_epi64(a)
830 #endif
831 
832 SIMDE_FUNCTION_ATTRIBUTES
833 simde__m128i
834 simde_mm_cvtepi16_epi32 (simde__m128i a) {
835   #if defined(SIMDE_X86_SSE4_1_NATIVE)
836     return _mm_cvtepi16_epi32(a);
837   #elif defined(SIMDE_X86_SSE2_NATIVE)
838     return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
839   #else
840     simde__m128i_private
841       r_,
842       a_ = simde__m128i_to_private(a);
843 
844     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
845       r_.neon_i32 = vmovl_s16(vget_low_s16(a_.neon_i16));
846     #elif !defined(SIMDE_ARCH_X86) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
847       r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, a_.i16, 8, 0, 10, 1, 12, 2, 14, 3));
848       r_.i32 >>= 16;
849     #else
850       SIMDE_VECTORIZE
851       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
852         r_.i32[i] = a_.i16[i];
853       }
854     #endif
855 
856     return simde__m128i_from_private(r_);
857   #endif
858 }
859 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
860   #undef _mm_cvtepi16_epi32
861   #define _mm_cvtepi16_epi32(a) simde_mm_cvtepi16_epi32(a)
862 #endif
863 
864 SIMDE_FUNCTION_ATTRIBUTES
865 simde__m128i
866 simde_mm_cvtepu16_epi32 (simde__m128i a) {
867   #if defined(SIMDE_X86_SSE4_1_NATIVE)
868     return _mm_cvtepu16_epi32(a);
869   #elif defined(SIMDE_X86_SSE2_NATIVE)
870     return _mm_unpacklo_epi16(a, _mm_setzero_si128());
871   #else
872     simde__m128i_private
873       r_,
874       a_ = simde__m128i_to_private(a);
875 
876     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
877       r_.neon_u32 = vmovl_u16(vget_low_u16(a_.neon_u16));
878     #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
879       __typeof__(r_.u16) z = { 0, };
880       r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.u16, z,
881           0, 9, 1, 11, 2, 13, 3, 15));
882     #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_CLANG_45541) && (!defined(SIMDE_ARCH_POWER) || !defined(__clang__))
883       SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].u16);
884     #else
885       SIMDE_VECTORIZE
886       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
887         r_.i32[i] = a_.u16[i];
888       }
889     #endif
890 
891     return simde__m128i_from_private(r_);
892   #endif
893 }
894 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
895   #undef _mm_cvtepu16_epi32
896   #define _mm_cvtepu16_epi32(a) simde_mm_cvtepu16_epi32(a)
897 #endif
898 
899 SIMDE_FUNCTION_ATTRIBUTES
900 simde__m128i
901 simde_mm_cvtepu16_epi64 (simde__m128i a) {
902   #if defined(SIMDE_X86_SSE4_1_NATIVE)
903     return _mm_cvtepu16_epi64(a);
904   #elif defined(SIMDE_X86_SSE2_NATIVE)
905     __m128i z = _mm_setzero_si128();
906     return _mm_unpacklo_epi32(_mm_unpacklo_epi16(a, z), z);
907   #else
908     simde__m128i_private
909       r_,
910       a_ = simde__m128i_to_private(a);
911 
912     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
913       uint16x8_t u16x8 = a_.neon_u16;                    /* xxxx xxxx xxxx 0B0A */
914       uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
915       uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
916       r_.neon_u64 = u64x2;
917     #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
918       __typeof__(r_.u16) z = { 0, };
919       r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.u16, z,
920           0,  9, 10, 11,
921           1, 13, 14, 15));
922     #else
923       SIMDE_VECTORIZE
924       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
925         r_.i64[i] = a_.u16[i];
926       }
927     #endif
928 
929     return simde__m128i_from_private(r_);
930   #endif
931 }
932 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
933   #undef _mm_cvtepu16_epi64
934   #define _mm_cvtepu16_epi64(a) simde_mm_cvtepu16_epi64(a)
935 #endif
936 
937 SIMDE_FUNCTION_ATTRIBUTES
938 simde__m128i
939 simde_mm_cvtepi16_epi64 (simde__m128i a) {
940   #if defined(SIMDE_X86_SSE4_1_NATIVE)
941     return _mm_cvtepi16_epi64(a);
942   #else
943     simde__m128i_private
944       r_,
945       a_ = simde__m128i_to_private(a);
946 
947     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
948       int16x8_t s16x8 = a_.neon_i16;                    /* xxxx xxxx xxxx 0B0A */
949       int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
950       int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
951       r_.neon_i64 = s64x2;
952     #elif (!defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
953       r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, a_.i16,
954            8,  9, 10, 0,
955           12, 13, 14, 1));
956       r_.i64 >>= 48;
957     #else
958       SIMDE_VECTORIZE
959       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
960         r_.i64[i] = a_.i16[i];
961       }
962     #endif
963 
964     return simde__m128i_from_private(r_);
965   #endif
966 }
967 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
968   #undef _mm_cvtepi16_epi64
969   #define _mm_cvtepi16_epi64(a) simde_mm_cvtepi16_epi64(a)
970 #endif
971 
972 SIMDE_FUNCTION_ATTRIBUTES
973 simde__m128i
974 simde_mm_cvtepi32_epi64 (simde__m128i a) {
975   #if defined(SIMDE_X86_SSE4_1_NATIVE)
976     return _mm_cvtepi32_epi64(a);
977   #elif defined(SIMDE_X86_SSE2_NATIVE)
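    /* 0x50 duplicates the two low dwords (a0 a0 a1 a1); the arithmetic shift turns the copies into sign masks, and 0xED moves those sign dwords into the low half so the final unpack interleaves value/sign pairs. */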
978     __m128i tmp = _mm_shuffle_epi32(a, 0x50);
979     tmp = _mm_srai_epi32(tmp, 31);
980     tmp = _mm_shuffle_epi32(tmp, 0xed);
981     return _mm_unpacklo_epi32(a, tmp);
982   #else
983     simde__m128i_private
984       r_,
985       a_ = simde__m128i_to_private(a);
986 
987     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
988       r_.neon_i64 = vmovl_s32(vget_low_s32(a_.neon_i32));
989     #elif !defined(SIMDE_ARCH_X86) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
990       r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, a_.i32, -1, 0, -1, 1));
991       r_.i64 >>= 32;
992     #elif defined(SIMDE_CONVERT_VECTOR_)
993       SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].i32);
994     #else
995       SIMDE_VECTORIZE
996       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
997         r_.i64[i] = a_.i32[i];
998       }
999     #endif
1000 
1001     return simde__m128i_from_private(r_);
1002   #endif
1003 }
1004 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1005   #undef _mm_cvtepi32_epi64
1006   #define _mm_cvtepi32_epi64(a) simde_mm_cvtepi32_epi64(a)
1007 #endif
1008 
1009 SIMDE_FUNCTION_ATTRIBUTES
1010 simde__m128i
1011 simde_mm_cvtepu32_epi64 (simde__m128i a) {
1012   #if defined(SIMDE_X86_SSE4_1_NATIVE)
1013     return _mm_cvtepu32_epi64(a);
1014   #elif defined(SIMDE_X86_SSE2_NATIVE)
1015     return _mm_unpacklo_epi32(a, _mm_setzero_si128());
1016   #else
1017     simde__m128i_private
1018       r_,
1019       a_ = simde__m128i_to_private(a);
1020 
1021     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1022       r_.neon_u64 = vmovl_u32(vget_low_u32(a_.neon_u32));
1023     #elif defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
1024       __typeof__(r_.u32) z = { 0, };
1025       r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(32, 16, a_.u32, z, 0, 4, 1, 6));
1026     #elif defined(SIMDE_CONVERT_VECTOR_)
1027       SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].u32);
1028     #else
1029       SIMDE_VECTORIZE
1030       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
1031         r_.i64[i] = a_.u32[i];
1032       }
1033     #endif
1034 
1035     return simde__m128i_from_private(r_);
1036   #endif
1037 }
1038 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1039   #undef _mm_cvtepu32_epi64
1040   #define _mm_cvtepu32_epi64(a) simde_mm_cvtepu32_epi64(a)
1041 #endif
1042 
1043 SIMDE_FUNCTION_ATTRIBUTES
1044 simde__m128d
1045 simde_mm_dp_pd (simde__m128d a, simde__m128d b, const int imm8)
1046     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
1047   simde__m128d_private
1048     r_,
1049     a_ = simde__m128d_to_private(a),
1050     b_ = simde__m128d_to_private(b);
1051 
1052   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1053     r_.neon_f64 = vmulq_f64(a_.neon_f64, b_.neon_f64);
1054 
1055     switch (imm8) {
1056       case 0xff:
1057         r_.neon_f64 = vaddq_f64(r_.neon_f64, vextq_f64(r_.neon_f64, r_.neon_f64, 1));
1058         break;
1059       case 0x13:
1060         r_.neon_f64 = vdupq_lane_f64(vget_low_f64(r_.neon_f64), 0);
1061         break;
1062       default:
1063         { /* imm8 is a compile-time constant, so this all becomes just a load */
1064           uint64_t mask_data[] = {
1065             (imm8 & (1 << 4)) ? ~UINT64_C(0) : UINT64_C(0),
1066             (imm8 & (1 << 5)) ? ~UINT64_C(0) : UINT64_C(0),
1067           };
1068           r_.neon_f64 = vreinterpretq_f64_u64(vandq_u64(vld1q_u64(mask_data), vreinterpretq_u64_f64(r_.neon_f64)));
1069         }
1070 
1071         r_.neon_f64 = vdupq_n_f64(vaddvq_f64(r_.neon_f64));
1072 
1073         {
1074           uint64_t mask_data[] = {
1075             (imm8 & 1) ? ~UINT64_C(0) : UINT64_C(0),
1076             (imm8 & 2) ? ~UINT64_C(0) : UINT64_C(0)
1077           };
1078           r_.neon_f64 = vreinterpretq_f64_u64(vandq_u64(vld1q_u64(mask_data), vreinterpretq_u64_f64(r_.neon_f64)));
1079         }
1080         break;
1081     }
1082   #else
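    /* Bits 4..5 of imm8 select which products contribute to the sum; bits 0..1 select which result lanes receive it. */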
1083     simde_float64 sum = SIMDE_FLOAT64_C(0.0);
1084 
1085     SIMDE_VECTORIZE_REDUCTION(+:sum)
1086     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1087       sum += ((imm8 >> (i + 4)) & 1) ? (a_.f64[i] * b_.f64[i]) : 0.0;
1088     }
1089 
1090     SIMDE_VECTORIZE
1091     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1092       r_.f64[i] = ((imm8 >> i) & 1) ? sum : 0.0;
1093     }
1094   #endif
1095 
1096   return simde__m128d_from_private(r_);
1097 }
1098 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1099 #  define simde_mm_dp_pd(a, b, imm8) _mm_dp_pd(a, b, imm8)
1100 #endif
1101 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1102   #undef _mm_dp_pd
1103   #define _mm_dp_pd(a, b, imm8) simde_mm_dp_pd(a, b, imm8)
1104 #endif
1105 
1106 SIMDE_FUNCTION_ATTRIBUTES
1107 simde__m128
1108 simde_mm_dp_ps (simde__m128 a, simde__m128 b, const int imm8)
1109     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
1110   simde__m128_private
1111     r_,
1112     a_ = simde__m128_to_private(a),
1113     b_ = simde__m128_to_private(b);
1114 
1115   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1116     r_.neon_f32 = vmulq_f32(a_.neon_f32, b_.neon_f32);
1117 
1118     switch (imm8) {
1119       case 0xff:
1120         r_.neon_f32 = vdupq_n_f32(vaddvq_f32(r_.neon_f32));
1121         break;
1122       case 0x7f:
1123         r_.neon_f32 = vsetq_lane_f32(0, r_.neon_f32, 3);
1124         r_.neon_f32 = vdupq_n_f32(vaddvq_f32(r_.neon_f32));
1125         break;
1126       default:
1127         {
1128           {
1129             uint32_t mask_data[] = {
1130               (imm8 & (1 << 4)) ? ~UINT32_C(0) : UINT32_C(0),
1131               (imm8 & (1 << 5)) ? ~UINT32_C(0) : UINT32_C(0),
1132               (imm8 & (1 << 6)) ? ~UINT32_C(0) : UINT32_C(0),
1133               (imm8 & (1 << 7)) ? ~UINT32_C(0) : UINT32_C(0)
1134             };
1135             r_.neon_f32 = vreinterpretq_f32_u32(vandq_u32(vld1q_u32(mask_data), vreinterpretq_u32_f32(r_.neon_f32)));
1136           }
1137 
1138           r_.neon_f32 = vdupq_n_f32(vaddvq_f32(r_.neon_f32));
1139 
1140           {
1141             uint32_t mask_data[] = {
1142               (imm8 & 1) ? ~UINT32_C(0) : UINT32_C(0),
1143               (imm8 & 2) ? ~UINT32_C(0) : UINT32_C(0),
1144               (imm8 & 4) ? ~UINT32_C(0) : UINT32_C(0),
1145               (imm8 & 8) ? ~UINT32_C(0) : UINT32_C(0)
1146             };
1147             r_.neon_f32 = vreinterpretq_f32_u32(vandq_u32(vld1q_u32(mask_data), vreinterpretq_u32_f32(r_.neon_f32)));
1148           }
1149         }
1150         break;
1151     }
1152   #else
1153     simde_float32 sum = SIMDE_FLOAT32_C(0.0);
1154 
1155     SIMDE_VECTORIZE_REDUCTION(+:sum)
1156     for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1157       sum += ((imm8 >> (i + 4)) & 1) ? (a_.f32[i] * b_.f32[i]) : SIMDE_FLOAT32_C(0.0);
1158     }
1159 
1160     SIMDE_VECTORIZE
1161     for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1162       r_.f32[i] = ((imm8 >> i) & 1) ? sum : SIMDE_FLOAT32_C(0.0);
1163     }
1164   #endif
1165 
1166   return simde__m128_from_private(r_);
1167 }
1168 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1169 #  define simde_mm_dp_ps(a, b, imm8) _mm_dp_ps(a, b, imm8)
1170 #endif
1171 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1172   #undef _mm_dp_ps
1173   #define _mm_dp_ps(a, b, imm8) simde_mm_dp_ps(a, b, imm8)
1174 #endif
1175 
1176 #if defined(simde_mm_extract_epi8)
1177 #  undef simde_mm_extract_epi8
1178 #endif
1179 SIMDE_FUNCTION_ATTRIBUTES
1180 int8_t
1181 simde_mm_extract_epi8 (simde__m128i a, const int imm8)
1182     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15)  {
1183   simde__m128i_private
1184     a_ = simde__m128i_to_private(a);
1185 
1186   #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1187     #if defined(SIMDE_BUG_GCC_95227)
1188       (void) a_;
1189       (void) imm8;
1190     #endif
1191     return vec_extract(a_.altivec_i8, imm8);
1192   #else
1193     return a_.i8[imm8 & 15];
1194   #endif
1195 }
1196 #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_EXTRACT_EPI8)
1197 #  define simde_mm_extract_epi8(a, imm8) HEDLEY_STATIC_CAST(int8_t, _mm_extract_epi8(a, imm8))
1198 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1199 #  define simde_mm_extract_epi8(a, imm8) vgetq_lane_s8(simde__m128i_to_private(a).neon_i8, imm8)
1200 #endif
1201 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1202   #undef _mm_extract_epi8
1203   #define _mm_extract_epi8(a, imm8) HEDLEY_STATIC_CAST(int, simde_mm_extract_epi8(a, imm8))
1204 #endif
1205 
1206 #if defined(simde_mm_extract_epi32)
1207 #  undef simde_mm_extract_epi32
1208 #endif
1209 SIMDE_FUNCTION_ATTRIBUTES
1210 int32_t
1211 simde_mm_extract_epi32 (simde__m128i a, const int imm8)
1212     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3)  {
1213   simde__m128i_private
1214     a_ = simde__m128i_to_private(a);
1215 
1216   #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1217     #if defined(SIMDE_BUG_GCC_95227)
1218       (void) a_;
1219       (void) imm8;
1220     #endif
1221     return vec_extract(a_.altivec_i32, imm8);
1222   #else
1223     return a_.i32[imm8 & 3];
1224   #endif
1225 }
1226 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1227 #  define simde_mm_extract_epi32(a, imm8) _mm_extract_epi32(a, imm8)
1228 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1229 #  define simde_mm_extract_epi32(a, imm8) vgetq_lane_s32(simde__m128i_to_private(a).neon_i32, imm8)
1230 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1231 #  define simde_mm_extract_epi32(a, imm8) HEDLEY_STATIC_CAST(int32_t, vec_extract(simde__m128i_to_private(a).altivec_i32, imm8))
1232 #endif
1233 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1234   #undef _mm_extract_epi32
1235   #define _mm_extract_epi32(a, imm8) simde_mm_extract_epi32(a, imm8)
1236 #endif
1237 
1238 #if defined(simde_mm_extract_epi64)
1239 #  undef simde_mm_extract_epi64
1240 #endif
1241 SIMDE_FUNCTION_ATTRIBUTES
1242 int64_t
1243 simde_mm_extract_epi64 (simde__m128i a, const int imm8)
1244     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1)  {
1245   simde__m128i_private
1246     a_ = simde__m128i_to_private(a);
1247 
1248   #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
1249     #if defined(SIMDE_BUG_GCC_95227)
1250       (void) a_;
1251       (void) imm8;
1252     #endif
1253     return vec_extract(a_.altivec_i64, imm8);
1254   #else
1255     return a_.i64[imm8 & 1];
1256   #endif
1257 }
1258 #if defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ARCH_AMD64)
1259 #  define simde_mm_extract_epi64(a, imm8) _mm_extract_epi64(a, imm8)
1260 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1261 #  define simde_mm_extract_epi64(a, imm8) vgetq_lane_s64(simde__m128i_to_private(a).neon_i64, imm8)
1262 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
1263 #  define simde_mm_extract_epi64(a, imm8) HEDLEY_STATIC_CAST(int64_t, vec_extract(simde__m128i_to_private(a).altivec_i64, imm8))
1264 #endif
1265 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1266   #undef _mm_extract_epi64
1267   #define _mm_extract_epi64(a, imm8) simde_mm_extract_epi64(a, imm8)
1268 #endif
1269 
1270 #if defined(simde_mm_extract_ps)
1271 #  undef simde_mm_extract_ps
1272 #endif
1273 SIMDE_FUNCTION_ATTRIBUTES
1274 int32_t
1275 simde_mm_extract_ps (simde__m128 a, const int imm8)
1276     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3)  {
1277   simde__m128_private
1278     a_ = simde__m128_to_private(a);
1279 
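  /* _mm_extract_ps returns the raw IEEE-754 bit pattern of the selected lane as an int. */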
1280   return a_.i32[imm8 & 3];
1281 }
1282 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1283   #define simde_mm_extract_ps(a, imm8) _mm_extract_ps(a, imm8)
1284 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1285   #define simde_mm_extract_ps(a, imm8) vgetq_lane_s32(simde__m128_to_private(a).neon_i32, imm8)
1286 #endif
1287 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1288   #undef _mm_extract_ps
1289   #define _mm_extract_ps(a, imm8) simde_mm_extract_ps(a, imm8)
1290 #endif
1291 
1292 SIMDE_FUNCTION_ATTRIBUTES
1293 simde__m128d
1294 simde_mm_floor_pd (simde__m128d a) {
1295   return simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_NEG_INF);
1296 }
1297 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1298   #undef _mm_floor_pd
1299   #define _mm_floor_pd(a) simde_mm_floor_pd(a)
1300 #endif
1301 
1302 SIMDE_FUNCTION_ATTRIBUTES
1303 simde__m128
1304 simde_mm_floor_ps (simde__m128 a) {
1305   return simde_mm_round_ps(a, SIMDE_MM_FROUND_TO_NEG_INF);
1306 }
1307 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1308   #undef _mm_floor_ps
1309   #define _mm_floor_ps(a) simde_mm_floor_ps(a)
1310 #endif
1311 
1312 SIMDE_FUNCTION_ATTRIBUTES
1313 simde__m128d
1314 simde_mm_floor_sd (simde__m128d a, simde__m128d b) {
1315   #if defined(SIMDE_X86_SSE4_1_NATIVE)
1316     return _mm_floor_sd(a, b);
1317   #else
1318     simde__m128d_private
1319       r_,
1320       a_ = simde__m128d_to_private(a),
1321       b_ = simde__m128d_to_private(b);
1322 
1323     #if defined(simde_math_floor)
1324       r_.f64[0] = simde_math_floor(b_.f64[0]);
1325       r_.f64[1] = a_.f64[1];
1326     #else
1327       HEDLEY_UNREACHABLE();
1328     #endif
1329 
1330     return simde__m128d_from_private(r_);
1331   #endif
1332 }
1333 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1334   #undef _mm_floor_sd
1335   #define _mm_floor_sd(a, b) simde_mm_floor_sd(a, b)
1336 #endif
1337 
1338 SIMDE_FUNCTION_ATTRIBUTES
1339 simde__m128
1340 simde_mm_floor_ss (simde__m128 a, simde__m128 b) {
1341   #if defined(SIMDE_X86_SSE4_1_NATIVE)
1342     return _mm_floor_ss(a, b);
1343   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1344       return simde_mm_move_ss(a, simde_mm_floor_ps(b));
1345   #else
1346     simde__m128_private
1347       r_,
1348       a_ = simde__m128_to_private(a),
1349       b_ = simde__m128_to_private(b);
1350 
1351     #if defined(simde_math_floorf)
1352       r_.f32[0] = simde_math_floorf(b_.f32[0]);
1353       for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1354         r_.f32[i] = a_.f32[i];
1355       }
1356     #else
1357       HEDLEY_UNREACHABLE();
1358     #endif
1359 
1360     return simde__m128_from_private(r_);
1361   #endif
1362 }
1363 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1364   #undef _mm_floor_ss
1365   #define _mm_floor_ss(a, b) simde_mm_floor_ss(a, b)
1366 #endif
1367 
1368 SIMDE_FUNCTION_ATTRIBUTES
1369 simde__m128i
1370 simde_mm_insert_epi8 (simde__m128i a, int i, const int imm8)
1371     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15)  {
1372   simde__m128i_private
1373     r_ = simde__m128i_to_private(a);
1374 
1375   r_.i8[imm8] = HEDLEY_STATIC_CAST(int8_t, i);
1376 
1377   return simde__m128i_from_private(r_);
1378 }
1379 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1380   /* clang-3.8 returns an incompatible type, so we need the cast.  MSVC
1381    * can't handle the cast ("error C2440: 'type cast': cannot convert
1382    * from '__m128i' to '__m128i'").  */
1383   #if defined(__clang__)
1384     #define simde_mm_insert_epi8(a, i, imm8) HEDLEY_STATIC_CAST(__m128i, _mm_insert_epi8(a, i, imm8))
1385   #else
1386     #define simde_mm_insert_epi8(a, i, imm8) _mm_insert_epi8(a, i, imm8)
1387   #endif
1388 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1389 #  define simde_mm_insert_epi8(a, i, imm8) simde__m128i_from_neon_i8(vsetq_lane_s8(HEDLEY_STATIC_CAST(int8_t, i), simde__m128i_to_private(a).neon_i8, imm8))
1390 #endif
1391 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1392   #undef _mm_insert_epi8
1393   #define _mm_insert_epi8(a, i, imm8) simde_mm_insert_epi8(a, i, imm8)
1394 #endif
1395 
1396 SIMDE_FUNCTION_ATTRIBUTES
1397 simde__m128i
1398 simde_mm_insert_epi32 (simde__m128i a, int i, const int imm8)
1399     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3)  {
1400   simde__m128i_private
1401     r_ = simde__m128i_to_private(a);
1402 
1403   r_.i32[imm8] = HEDLEY_STATIC_CAST(int32_t, i);
1404 
1405   return simde__m128i_from_private(r_);
1406 }
1407 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1408   #if defined(__clang__)
1409     #define simde_mm_insert_epi32(a, i, imm8) HEDLEY_STATIC_CAST(__m128i, _mm_insert_epi32(a, i, imm8))
1410   #else
1411     #define simde_mm_insert_epi32(a, i, imm8) _mm_insert_epi32(a, i, imm8)
1412   #endif
1413 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1414 #  define simde_mm_insert_epi32(a, i, imm8) simde__m128i_from_neon_i32(vsetq_lane_s32(i, simde__m128i_to_private(a).neon_i32, imm8))
1415 #endif
1416 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1417   #undef _mm_insert_epi32
1418   #define _mm_insert_epi32(a, i, imm8) simde_mm_insert_epi32(a, i, imm8)
1419 #endif
1420 
1421 SIMDE_FUNCTION_ATTRIBUTES
1422 simde__m128i
1423 simde_mm_insert_epi64 (simde__m128i a, int64_t i, const int imm8)
1424     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1)  {
1425   #if defined(SIMDE_BUG_GCC_94482)
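    /* When SIMDE_BUG_GCC_94482 is defined, avoid writing a single 64-bit lane in
     * place and instead rebuild the whole vector with simde_mm_set_epi64x; this
     * sidesteps the GCC code-generation problem referenced by the macro name. */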
1426     simde__m128i_private
1427       a_ = simde__m128i_to_private(a);
1428 
1429     switch(imm8) {
1430       case 0:
1431         return simde_mm_set_epi64x(a_.i64[1], i);
1432         break;
1433       case 1:
1434         return simde_mm_set_epi64x(i, a_.i64[0]);
1435         break;
1436       default:
1437         HEDLEY_UNREACHABLE();
1438         break;
1439     }
1440   #else
1441     simde__m128i_private
1442       r_ = simde__m128i_to_private(a);
1443 
1444     r_.i64[imm8] = i;
1445     return simde__m128i_from_private(r_);
1446   #endif
1447 }
1448 #if defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ARCH_AMD64)
1449 #  define simde_mm_insert_epi64(a, i, imm8) _mm_insert_epi64(a, i, imm8)
1450 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1451 #  define simde_mm_insert_epi64(a, i, imm8) simde__m128i_from_neon_i64(vsetq_lane_s64(i, simde__m128i_to_private(a).neon_i64, imm8))
1452 #endif
1453 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1454   #undef _mm_insert_epi64
1455   #define _mm_insert_epi64(a, i, imm8) simde_mm_insert_epi64(a, i, imm8)
1456 #endif
1457 
1458 SIMDE_FUNCTION_ATTRIBUTES
1459 simde__m128
1460 simde_mm_insert_ps (simde__m128 a, simde__m128 b, const int imm8)
1461     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
1462   simde__m128_private
1463     r_,
1464     a_ = simde__m128_to_private(a),
1465     b_ = simde__m128_to_private(b);
1466 
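  /* imm8 layout: bits 7:6 select the source element of b, bits 5:4 select the
   * destination element of a, and bits 3:0 form the zero mask applied to the result. */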
1467   simde_float32 tmp1_ = b_.f32[(imm8 >> 6) & 3];
1468   a_.f32[(imm8 >> 4) & 3] = tmp1_;
1469 
1470   SIMDE_VECTORIZE
1471   for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1472     r_.f32[i] = ((imm8 >> i) & 1) ? SIMDE_FLOAT32_C(0.0) : a_.f32[i];
1473   }
1474 
1475   return simde__m128_from_private(r_);
1476 }
1477 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1478 #  define simde_mm_insert_ps(a, b, imm8) _mm_insert_ps(a, b, imm8)
1479 #endif
1480 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1481   #undef _mm_insert_ps
1482   #define _mm_insert_ps(a, b, imm8) simde_mm_insert_ps(a, b, imm8)
1483 #endif
1484 
1485 SIMDE_FUNCTION_ATTRIBUTES
1486 simde__m128i
1487 simde_mm_max_epi8 (simde__m128i a, simde__m128i b) {
1488   #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI)
1489     return _mm_max_epi8(a, b);
1490   #else
1491     simde__m128i_private
1492       r_,
1493       a_ = simde__m128i_to_private(a),
1494       b_ = simde__m128i_to_private(b);
1495 
1496     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1497       r_.neon_i8 = vmaxq_s8(a_.neon_i8, b_.neon_i8);
1498     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1499       r_.wasm_v128 = wasm_i8x16_max(a_.wasm_v128, b_.wasm_v128);
1500     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1501       r_.altivec_i8 = vec_max(a_.altivec_i8, b_.altivec_i8);
1502     #else
1503       SIMDE_VECTORIZE
1504       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1505         r_.i8[i] = a_.i8[i] > b_.i8[i] ? a_.i8[i] : b_.i8[i];
1506       }
1507     #endif
1508 
1509     return simde__m128i_from_private(r_);
1510   #endif
1511 }
1512 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1513   #undef _mm_max_epi8
1514   #define _mm_max_epi8(a, b) simde_mm_max_epi8(a, b)
1515 #endif
1516 
1517 SIMDE_FUNCTION_ATTRIBUTES
1518 simde__m128i
1519 simde_mm_max_epi32 (simde__m128i a, simde__m128i b) {
1520   #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI)
1521     return _mm_max_epi32(a, b);
1522   #else
1523     simde__m128i_private
1524       r_,
1525       a_ = simde__m128i_to_private(a),
1526       b_ = simde__m128i_to_private(b);
1527 
1528     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1529       r_.neon_i32 = vmaxq_s32(a_.neon_i32, b_.neon_i32);
1530     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1531       r_.wasm_v128 = wasm_i32x4_max(a_.wasm_v128, b_.wasm_v128);
1532     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1533       r_.altivec_i32 = vec_max(a_.altivec_i32, b_.altivec_i32);
1534     #else
1535       SIMDE_VECTORIZE
1536       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1537         r_.i32[i] = a_.i32[i] > b_.i32[i] ? a_.i32[i] : b_.i32[i];
1538       }
1539     #endif
1540 
1541     return simde__m128i_from_private(r_);
1542   #endif
1543 }
1544 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1545   #undef _mm_max_epi32
1546   #define _mm_max_epi32(a, b) simde_mm_max_epi32(a, b)
1547 #endif
1548 
1549 SIMDE_FUNCTION_ATTRIBUTES
1550 simde__m128i
1551 simde_mm_max_epu16 (simde__m128i a, simde__m128i b) {
1552   #if defined(SIMDE_X86_SSE4_1_NATIVE)
1553     return _mm_max_epu16(a, b);
1554   #else
1555     simde__m128i_private
1556       r_,
1557       a_ = simde__m128i_to_private(a),
1558       b_ = simde__m128i_to_private(b);
1559 
1560     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1561       r_.neon_u16 = vmaxq_u16(a_.neon_u16, b_.neon_u16);
1562     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1563       r_.wasm_v128 = wasm_u16x8_max(a_.wasm_v128, b_.wasm_v128);
1564     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1565       r_.altivec_u16 = vec_max(a_.altivec_u16, b_.altivec_u16);
1566     #else
1567       SIMDE_VECTORIZE
1568       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
1569         r_.u16[i] = a_.u16[i] > b_.u16[i] ? a_.u16[i] : b_.u16[i];
1570       }
1571     #endif
1572 
1573     return simde__m128i_from_private(r_);
1574   #endif
1575 }
1576 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1577   #undef _mm_max_epu16
1578   #define _mm_max_epu16(a, b) simde_mm_max_epu16(a, b)
1579 #endif
1580 
1581 SIMDE_FUNCTION_ATTRIBUTES
1582 simde__m128i
1583 simde_mm_max_epu32 (simde__m128i a, simde__m128i b) {
1584   #if defined(SIMDE_X86_SSE4_1_NATIVE)
1585     return _mm_max_epu32(a, b);
1586   #else
1587     simde__m128i_private
1588       r_,
1589       a_ = simde__m128i_to_private(a),
1590       b_ = simde__m128i_to_private(b);
1591 
1592     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1593       r_.neon_u32 = vmaxq_u32(a_.neon_u32, b_.neon_u32);
1594     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1595       r_.wasm_v128 = wasm_u32x4_max(a_.wasm_v128, b_.wasm_v128);
1596     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1597       r_.altivec_u32 = vec_max(a_.altivec_u32, b_.altivec_u32);
1598     #else
1599       SIMDE_VECTORIZE
1600       for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
1601         r_.u32[i] = a_.u32[i] > b_.u32[i] ? a_.u32[i] : b_.u32[i];
1602       }
1603     #endif
1604 
1605     return simde__m128i_from_private(r_);
1606   #endif
1607 }
1608 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1609   #undef _mm_max_epu32
1610   #define _mm_max_epu32(a, b) simde_mm_max_epu32(a, b)
1611 #endif
1612 
1613 SIMDE_FUNCTION_ATTRIBUTES
1614 simde__m128i
1615 simde_mm_min_epi8 (simde__m128i a, simde__m128i b) {
1616   #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI)
1617     return _mm_min_epi8(a, b);
1618   #else
1619     simde__m128i_private
1620       r_,
1621       a_ = simde__m128i_to_private(a),
1622       b_ = simde__m128i_to_private(b);
1623 
1624     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1625       r_.neon_i8 = vminq_s8(a_.neon_i8, b_.neon_i8);
1626     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1627       r_.wasm_v128 = wasm_i8x16_min(a_.wasm_v128, b_.wasm_v128);
1628     #else
1629       SIMDE_VECTORIZE
1630       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1631         r_.i8[i] = a_.i8[i] < b_.i8[i] ? a_.i8[i] : b_.i8[i];
1632       }
1633     #endif
1634 
1635     return simde__m128i_from_private(r_);
1636   #endif
1637 }
1638 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1639   #undef _mm_min_epi8
1640   #define _mm_min_epi8(a, b) simde_mm_min_epi8(a, b)
1641 #endif
1642 
1643 SIMDE_FUNCTION_ATTRIBUTES
1644 simde__m128i
1645 simde_mm_min_epi32 (simde__m128i a, simde__m128i b) {
1646   #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI)
1647     return _mm_min_epi32(a, b);
1648   #else
1649     simde__m128i_private
1650       r_,
1651       a_ = simde__m128i_to_private(a),
1652       b_ = simde__m128i_to_private(b);
1653 
1654     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1655       r_.neon_i32 = vminq_s32(a_.neon_i32, b_.neon_i32);
1656     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1657       r_.wasm_v128 = wasm_i32x4_min(a_.wasm_v128, b_.wasm_v128);
1658     #else
1659       SIMDE_VECTORIZE
1660       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1661         r_.i32[i] = a_.i32[i] < b_.i32[i] ? a_.i32[i] : b_.i32[i];
1662       }
1663     #endif
1664 
1665     return simde__m128i_from_private(r_);
1666   #endif
1667 }
1668 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1669   #undef _mm_min_epi32
1670   #define _mm_min_epi32(a, b) simde_mm_min_epi32(a, b)
1671 #endif
1672 
1673 SIMDE_FUNCTION_ATTRIBUTES
1674 simde__m128i
1675 simde_mm_min_epu16 (simde__m128i a, simde__m128i b) {
1676   #if defined(SIMDE_X86_SSE4_1_NATIVE)
1677     return _mm_min_epu16(a, b);
1678   #else
1679     simde__m128i_private
1680       r_,
1681       a_ = simde__m128i_to_private(a),
1682       b_ = simde__m128i_to_private(b);
1683 
1684     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1685       r_.neon_u16 = vminq_u16(a_.neon_u16, b_.neon_u16);
1686     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1687       r_.wasm_v128 = wasm_u16x8_min(a_.wasm_v128, b_.wasm_v128);
1688     #else
1689       SIMDE_VECTORIZE
1690       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
1691         r_.u16[i] = a_.u16[i] < b_.u16[i] ? a_.u16[i] : b_.u16[i];
1692       }
1693     #endif
1694 
1695     return simde__m128i_from_private(r_);
1696   #endif
1697 }
1698 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1699   #undef _mm_min_epu16
1700   #define _mm_min_epu16(a, b) simde_mm_min_epu16(a, b)
1701 #endif
1702 
1703 SIMDE_FUNCTION_ATTRIBUTES
1704 simde__m128i
1705 simde_mm_min_epu32 (simde__m128i a, simde__m128i b) {
1706   #if defined(SIMDE_X86_SSE4_1_NATIVE)
1707     return _mm_min_epu32(a, b);
1708   #else
1709     simde__m128i_private
1710       r_,
1711       a_ = simde__m128i_to_private(a),
1712       b_ = simde__m128i_to_private(b);
1713 
1714     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1715       r_.neon_u32 = vminq_u32(a_.neon_u32, b_.neon_u32);
1716     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1717       r_.wasm_v128 = wasm_u32x4_min(a_.wasm_v128, b_.wasm_v128);
1718     #else
1719       SIMDE_VECTORIZE
1720       for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
1721         r_.u32[i] = a_.u32[i] < b_.u32[i] ? a_.u32[i] : b_.u32[i];
1722       }
1723     #endif
1724 
1725     return simde__m128i_from_private(r_);
1726   #endif
1727 }
1728 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1729   #undef _mm_min_epu32
1730   #define _mm_min_epu32(a, b) simde_mm_min_epu32(a, b)
1731 #endif
1732 
1733 SIMDE_FUNCTION_ATTRIBUTES
1734 simde__m128i
1735 simde_mm_minpos_epu16 (simde__m128i a) {
1736   #if defined(SIMDE_X86_SSE4_1_NATIVE)
1737     return _mm_minpos_epu16(a);
1738   #else
1739     simde__m128i_private
1740       r_ = simde__m128i_to_private(simde_mm_setzero_si128()),
1741       a_ = simde__m128i_to_private(a);
1742 
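    /* Linear scan: lane 0 of the result receives the smallest unsigned 16-bit value,
     * lane 1 receives its (first) index, and the remaining lanes stay zero. */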
1743     r_.u16[0] = UINT16_MAX;
1744     for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
1745       if (a_.u16[i] < r_.u16[0]) {
1746         r_.u16[0] = a_.u16[i];
1747         r_.u16[1] = HEDLEY_STATIC_CAST(uint16_t, i);
1748       }
1749     }
1750 
1751     return simde__m128i_from_private(r_);
1752   #endif
1753 }
1754 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1755   #undef _mm_minpos_epu16
1756   #define _mm_minpos_epu16(a) simde_mm_minpos_epu16(a)
1757 #endif
1758 
1759 SIMDE_FUNCTION_ATTRIBUTES
1760 simde__m128i
1761 simde_mm_mpsadbw_epu8 (simde__m128i a, simde__m128i b, const int imm8)
1762     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
1763   simde__m128i_private
1764     r_,
1765     a_ = simde__m128i_to_private(a),
1766     b_ = simde__m128i_to_private(b);
1767 
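  /* imm8 bit 2 selects the starting offset into a (0 or 4) and bits 1:0 select the
   * 4-byte block of b; each result lane is the sum of absolute differences between
   * a sliding 4-byte window of a and that block of b. */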
1768   const int a_offset = imm8 & 4;
1769   const int b_offset = (imm8 & 3) << 2;
1770 
1771 #if defined(simde_math_abs)
1772   for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, (sizeof(r_.u16) / sizeof(r_.u16[0]))) ; i++) {
1773     r_.u16[i] =
1774       HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 0] - b_.u8[b_offset + 0]))) +
1775       HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 1] - b_.u8[b_offset + 1]))) +
1776       HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 2] - b_.u8[b_offset + 2]))) +
1777       HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 3] - b_.u8[b_offset + 3])));
1778   }
1779 #else
1780   HEDLEY_UNREACHABLE();
1781 #endif
1782 
1783   return simde__m128i_from_private(r_);
1784 }
1785 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1786 #  define simde_mm_mpsadbw_epu8(a, b, imm8) _mm_mpsadbw_epu8(a, b, imm8)
1787 #endif
1788 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1789   #undef _mm_mpsadbw_epu8
1790   #define _mm_mpsadbw_epu8(a, b, imm8) simde_mm_mpsadbw_epu8(a, b, imm8)
1791 #endif
1792 
1793 SIMDE_FUNCTION_ATTRIBUTES
1794 simde__m128i
1795 simde_mm_mul_epi32 (simde__m128i a, simde__m128i b) {
1796   #if defined(SIMDE_X86_SSE4_1_NATIVE)
1797     return _mm_mul_epi32(a, b);
1798   #else
1799     simde__m128i_private
1800       r_,
1801       a_ = simde__m128i_to_private(a),
1802       b_ = simde__m128i_to_private(b);
1803 
1804     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1805       /* vmull_s32 upcasts instead of masking, so we downcast. */
1806       int32x2_t a_lo = vmovn_s64(a_.neon_i64);
1807       int32x2_t b_lo = vmovn_s64(b_.neon_i64);
1808       r_.neon_i64 = vmull_s32(a_lo, b_lo);
1809     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1810       r_.wasm_v128 = wasm_i64x2_make(
1811         wasm_i32x4_extract_lane(a_.wasm_v128, 0) * HEDLEY_STATIC_CAST(int64_t, wasm_i32x4_extract_lane(b_.wasm_v128, 0)),
1812         wasm_i32x4_extract_lane(a_.wasm_v128, 2) * HEDLEY_STATIC_CAST(int64_t, wasm_i32x4_extract_lane(b_.wasm_v128, 2)));
1813     #else
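      /* Multiply the low (even-indexed) signed 32-bit elements of a and b,
       * producing full 64-bit products. */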
1814       SIMDE_VECTORIZE
1815       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
1816         r_.i64[i] =
1817           HEDLEY_STATIC_CAST(int64_t, a_.i32[i * 2]) *
1818           HEDLEY_STATIC_CAST(int64_t, b_.i32[i * 2]);
1819       }
1820     #endif
1821 
1822     return simde__m128i_from_private(r_);
1823   #endif
1824 }
1825 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1826   #undef _mm_mul_epi32
1827   #define _mm_mul_epi32(a, b) simde_mm_mul_epi32(a, b)
1828 #endif
1829 
1830 SIMDE_FUNCTION_ATTRIBUTES
1831 simde__m128i
1832 simde_mm_mullo_epi32 (simde__m128i a, simde__m128i b) {
1833   #if defined(SIMDE_X86_SSE4_1_NATIVE)
1834     return _mm_mullo_epi32(a, b);
1835   #else
1836     simde__m128i_private
1837       r_,
1838       a_ = simde__m128i_to_private(a),
1839       b_ = simde__m128i_to_private(b);
1840 
1841     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1842       r_.neon_i32 = vmulq_s32(a_.neon_i32, b_.neon_i32);
1843     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1846       r_.altivec_i32 = vec_mul(a_.altivec_i32, b_.altivec_i32);
1847     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1848       r_.wasm_v128 = wasm_i32x4_mul(a_.wasm_v128, b_.wasm_v128);
1849     #else
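      /* Keep only the low 32 bits of each 32x32 product; the arithmetic goes
       * through 64-bit/unsigned casts to avoid signed-overflow issues. */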
1850       SIMDE_VECTORIZE
1851       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1852         r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (HEDLEY_STATIC_CAST(uint64_t, (HEDLEY_STATIC_CAST(int64_t, a_.i32[i]) * HEDLEY_STATIC_CAST(int64_t, b_.i32[i]))) & 0xffffffff));
1853       }
1854     #endif
1855 
1856     return simde__m128i_from_private(r_);
1857   #endif
1858 }
1859 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1860   #undef _mm_mullo_epi32
1861   #define _mm_mullo_epi32(a, b) simde_mm_mullo_epi32(a, b)
1862 #endif
1863 
1864 SIMDE_FUNCTION_ATTRIBUTES
1865 simde__m128i
1866 simde_x_mm_mullo_epu32 (simde__m128i a, simde__m128i b) {
1867   simde__m128i_private
1868     r_,
1869     a_ = simde__m128i_to_private(a),
1870     b_ = simde__m128i_to_private(b);
1871 
1872     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1873       r_.neon_u32 = vmulq_u32(a_.neon_u32, b_.neon_u32);
1874     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1875       r_.u32 = a_.u32 * b_.u32;
1876     #else
1877       SIMDE_VECTORIZE
1878       for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
1879         r_.u32[i] = a_.u32[i] * b_.u32[i];
1880       }
1881     #endif
1882 
1883   return simde__m128i_from_private(r_);
1884 }
1885 
1886 SIMDE_FUNCTION_ATTRIBUTES
1887 simde__m128i
1888 simde_mm_packus_epi32 (simde__m128i a, simde__m128i b) {
1889   #if defined(SIMDE_X86_SSE4_1_NATIVE)
1890     return _mm_packus_epi32(a, b);
1891   #else
1892     simde__m128i_private
1893       r_,
1894       a_ = simde__m128i_to_private(a),
1895       b_ = simde__m128i_to_private(b);
1896 
1897     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1898       const int32x4_t z = vdupq_n_s32(0);
1899       r_.neon_u16 = vcombine_u16(
1900           vqmovn_u32(vreinterpretq_u32_s32(vmaxq_s32(z, a_.neon_i32))),
1901           vqmovn_u32(vreinterpretq_u32_s32(vmaxq_s32(z, b_.neon_i32))));
1902     #else
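      /* Saturate each signed 32-bit input to the unsigned 16-bit range [0, UINT16_MAX];
       * a supplies lanes 0-3 of the result and b supplies lanes 4-7. */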
1903       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1904         r_.u16[i + 0] = (a_.i32[i] < 0) ? UINT16_C(0) : ((a_.i32[i] > UINT16_MAX) ? (UINT16_MAX) : HEDLEY_STATIC_CAST(uint16_t, a_.i32[i]));
1905         r_.u16[i + 4] = (b_.i32[i] < 0) ? UINT16_C(0) : ((b_.i32[i] > UINT16_MAX) ? (UINT16_MAX) : HEDLEY_STATIC_CAST(uint16_t, b_.i32[i]));
1906       }
1907     #endif
1908 
1909     return simde__m128i_from_private(r_);
1910   #endif
1911 }
1912 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1913   #undef _mm_packus_epi32
1914   #define _mm_packus_epi32(a, b) simde_mm_packus_epi32(a, b)
1915 #endif
1916 
1917 SIMDE_FUNCTION_ATTRIBUTES
1918 simde__m128d
1919 simde_mm_round_sd (simde__m128d a, simde__m128d b, int rounding)
1920     SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) {
1921   simde__m128d_private
1922     r_ = simde__m128d_to_private(a),
1923     b_ = simde__m128d_to_private(b);
1924 
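  /* Each case below is only compiled when the corresponding libm function was
   * detected; an unsupported rounding mode falls through to the unreachable default. */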
1925   switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
1926     #if defined(simde_math_nearbyint)
1927       case SIMDE_MM_FROUND_TO_NEAREST_INT:
1928       case SIMDE_MM_FROUND_CUR_DIRECTION:
1929         r_.f64[0] = simde_math_nearbyint(b_.f64[0]);
1930         break;
1931     #endif
1932 
1933     #if defined(simde_math_floor)
1934       case SIMDE_MM_FROUND_TO_NEG_INF:
1935         r_.f64[0] = simde_math_floor(b_.f64[0]);
1936         break;
1937     #endif
1938 
1939     #if defined(simde_math_ceil)
1940       case SIMDE_MM_FROUND_TO_POS_INF:
1941         r_.f64[0] = simde_math_ceil(b_.f64[0]);
1942         break;
1943     #endif
1944 
1945     #if defined(simde_math_trunc)
1946       case SIMDE_MM_FROUND_TO_ZERO:
1947         r_.f64[0] = simde_math_trunc(b_.f64[0]);
1948         break;
1949     #endif
1950 
1951     default:
1952       HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
1953   }
1954 
1955   return simde__m128d_from_private(r_);
1956 }
1957 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1958 #  define simde_mm_round_sd(a, b, rounding) _mm_round_sd(a, b, rounding)
1959 #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
1960 #  define simde_mm_round_sd(a, b, rounding) simde_mm_move_sd(a, simde_mm_round_pd(b, rounding))
1961 #endif
1962 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1963   #undef _mm_round_sd
1964   #define _mm_round_sd(a, b, rounding) simde_mm_round_sd(a, b, rounding)
1965 #endif
1966 
1967 SIMDE_FUNCTION_ATTRIBUTES
1968 simde__m128
1969 simde_mm_round_ss (simde__m128 a, simde__m128 b, int rounding)
1970     SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) {
1971   simde__m128_private
1972     r_ = simde__m128_to_private(a),
1973     b_ = simde__m128_to_private(b);
1974 
1975   switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
1976     #if defined(simde_math_nearbyintf)
1977       case SIMDE_MM_FROUND_TO_NEAREST_INT:
1978       case SIMDE_MM_FROUND_CUR_DIRECTION:
1979         r_.f32[0] = simde_math_nearbyintf(b_.f32[0]);
1980         break;
1981     #endif
1982 
1983     #if defined(simde_math_floorf)
1984       case SIMDE_MM_FROUND_TO_NEG_INF:
1985         r_.f32[0] = simde_math_floorf(b_.f32[0]);
1986         break;
1987     #endif
1988 
1989     #if defined(simde_math_ceilf)
1990       case SIMDE_MM_FROUND_TO_POS_INF:
1991         r_.f32[0] = simde_math_ceilf(b_.f32[0]);
1992         break;
1993     #endif
1994 
1995     #if defined(simde_math_truncf)
1996       case SIMDE_MM_FROUND_TO_ZERO:
1997         r_.f32[0] = simde_math_truncf(b_.f32[0]);
1998         break;
1999     #endif
2000 
2001     default:
2002       HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_ps());
2003   }
2004 
2005   return simde__m128_from_private(r_);
2006 }
2007 #if defined(SIMDE_X86_SSE4_1_NATIVE)
2008 #  define simde_mm_round_ss(a, b, rounding) _mm_round_ss(a, b, rounding)
2009 #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
2010 #  define simde_mm_round_ss(a, b, rounding) simde_mm_move_ss(a, simde_mm_round_ps(b, rounding))
2011 #endif
2012 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2013   #undef _mm_round_ss
2014   #define _mm_round_ss(a, b, rounding) simde_mm_round_ss(a, b, rounding)
2015 #endif
2016 
2017 SIMDE_FUNCTION_ATTRIBUTES
2018 simde__m128i
2019 simde_mm_stream_load_si128 (const simde__m128i* mem_addr) {
2020   #if defined(SIMDE_X86_SSE4_1_NATIVE)
2021     return _mm_stream_load_si128(HEDLEY_CONST_CAST(simde__m128i*, mem_addr));
2022   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2023     return vreinterpretq_s64_s32(vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr)));
2024   #else
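    /* No non-temporal load is available here, so fall back to an ordinary load. */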
2025     return *mem_addr;
2026   #endif
2027 }
2028 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2029   #undef _mm_stream_load_si128
2030   #define _mm_stream_load_si128(mem_addr) simde_mm_stream_load_si128(mem_addr)
2031 #endif
2032 
2033 SIMDE_FUNCTION_ATTRIBUTES
2034 int
2035 simde_mm_test_all_ones (simde__m128i a) {
2036   #if defined(SIMDE_X86_SSE4_1_NATIVE)
2037     return _mm_test_all_ones(a);
2038   #else
2039     simde__m128i_private a_ = simde__m128i_to_private(a);
2040     int r;
2041 
2042     #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2043       r = vec_all_eq(a_.altivec_i32, vec_splats(~0));
2044     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2045       r = ((vgetq_lane_s64(a_.neon_i64, 0) & vgetq_lane_s64(a_.neon_i64, 1)) == ~HEDLEY_STATIC_CAST(int64_t, 0));
2046     #else
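      /* AND-reduce all lanes; the result stays all ones only if every bit of a is set. */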
2047       int_fast32_t r_ = ~HEDLEY_STATIC_CAST(int_fast32_t, 0);
2048 
2049       SIMDE_VECTORIZE_REDUCTION(&:r_)
2050       for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) {
2051         r_ &= a_.i32f[i];
2052       }
2053 
2054       r = (r_ == ~HEDLEY_STATIC_CAST(int_fast32_t, 0));
2055     #endif
2056 
2057     return r;
2058   #endif
2059 }
2060 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2061   #undef _mm_test_all_ones
2062   #define _mm_test_all_ones(a) simde_mm_test_all_ones(a)
2063 #endif
2064 
2065 SIMDE_FUNCTION_ATTRIBUTES
2066 int
2067 simde_mm_test_all_zeros (simde__m128i a, simde__m128i mask) {
2068   #if defined(SIMDE_X86_SSE4_1_NATIVE)
2069     return _mm_test_all_zeros(a, mask);
2070   #else
2071     simde__m128i_private tmp_ = simde__m128i_to_private(simde_mm_and_si128(a, mask));
2072     int r;
2073 
2074     #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2075       r = vec_all_eq(tmp_.altivec_i32, vec_splats(0));
2076     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2077       return !(vgetq_lane_s64(tmp_.neon_i64, 0) | vgetq_lane_s64(tmp_.neon_i64, 1));
2078     #else
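      /* OR-reduce the lanes of (a & mask); the result is zero only if every bit of a
       * selected by mask is clear. */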
2079       int_fast32_t r_ = HEDLEY_STATIC_CAST(int_fast32_t, 0);
2080 
2081       SIMDE_VECTORIZE_REDUCTION(|:r_)
2082       for (size_t i = 0 ; i < (sizeof(tmp_.i32f) / sizeof(tmp_.i32f[0])) ; i++) {
2083         r_ |= tmp_.i32f[i];
2084       }
2085 
2086       r = !r_;
2087     #endif
2088 
2089     return r;
2090   #endif
2091 }
2092 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2093   #undef _mm_test_all_zeros
2094   #define _mm_test_all_zeros(a, mask) simde_mm_test_all_zeros(a, mask)
2095 #endif
2096 
2097 SIMDE_FUNCTION_ATTRIBUTES
2098 int
2099 simde_mm_test_mix_ones_zeros (simde__m128i a, simde__m128i mask) {
2100   #if defined(SIMDE_X86_SSE4_1_NATIVE)
2101     return _mm_test_mix_ones_zeros(a, mask);
2102   #else
2103     simde__m128i_private
2104       a_ = simde__m128i_to_private(a),
2105       mask_ = simde__m128i_to_private(mask);
2106 
2107     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2108       int64x2_t s640 = vandq_s64(a_.neon_i64, mask_.neon_i64);
2109       int64x2_t s641 = vbicq_s64(mask_.neon_i64, a_.neon_i64);
2110       return (((vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) != 0) && ((vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1)) != 0)) ? 1 : 0;
2111     #else
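      /* Return 1 only if mask covers at least one set bit and at least one clear bit of a. */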
2112       uint64_t and_ = 0, andnot_ = 0;
2113       for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
2114         and_ |= (a_.u64[i] & mask_.u64[i]); andnot_ |= (~a_.u64[i] & mask_.u64[i]);
2115       }
2116       return ((and_ != 0) && (andnot_ != 0)) ? 1 : 0;
2117     #endif
2118   #endif
2119 }
2120 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2121   #undef _mm_test_mix_ones_zeros
2122   #define _mm_test_mix_ones_zeros(a, mask) simde_mm_test_mix_ones_zeros(a, mask)
2123 #endif
2124 
2125 SIMDE_FUNCTION_ATTRIBUTES
2126 int
2127 simde_mm_testc_si128 (simde__m128i a, simde__m128i b) {
2128   #if defined(SIMDE_X86_SSE4_1_NATIVE)
2129     return _mm_testc_si128(a, b);
2130   #else
2131     simde__m128i_private
2132       a_ = simde__m128i_to_private(a),
2133       b_ = simde__m128i_to_private(b);
2134 
2135     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2136       int64x2_t s64 = vbicq_s64(b_.neon_i64, a_.neon_i64);
2137       return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
2138     #else
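      /* CF semantics: return 1 only if every bit set in b is also set in a, i.e. (~a & b) == 0. */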
2139       int_fast32_t r = 0;
2140 
2141       SIMDE_VECTORIZE_REDUCTION(|:r)
2142       for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) {
2143         r |= ~a_.i32f[i] & b_.i32f[i];
2144       }
2145 
2146       return HEDLEY_STATIC_CAST(int, !r);
2147     #endif
2148   #endif
2149 }
2150 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2151   #undef _mm_testc_si128
2152   #define _mm_testc_si128(a, b) simde_mm_testc_si128(a, b)
2153 #endif
2154 
2155 SIMDE_FUNCTION_ATTRIBUTES
2156 int
2157 simde_mm_testnzc_si128 (simde__m128i a, simde__m128i b) {
2158   #if defined(SIMDE_X86_SSE4_1_NATIVE)
2159     return _mm_testnzc_si128(a, b);
2160   #else
2161     simde__m128i_private
2162       a_ = simde__m128i_to_private(a),
2163       b_ = simde__m128i_to_private(b);
2164 
2165     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2166       int64x2_t s640 = vandq_s64(a_.neon_i64, b_.neon_i64);
2167       int64x2_t s641 = vbicq_s64(b_.neon_i64, a_.neon_i64);
2168       return (((vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) != 0) && ((vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1)) != 0)) ? 1 : 0;
2169     #else
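      /* Return 1 only if (a & b) and (~a & b) are both nonzero, i.e. neither ZF nor CF would be set. */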
2170       uint64_t and_ = 0, andnot_ = 0;
2171       for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
2172         and_ |= (a_.u64[i] & b_.u64[i]);
2173         andnot_ |= (~a_.u64[i] & b_.u64[i]);
2174       }
2175       return ((and_ != 0) && (andnot_ != 0)) ? 1 : 0;
2176     #endif
2177   #endif
2178 }
2179 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2180   #undef _mm_testnzc_si128
2181   #define _mm_testnzc_si128(a, b) simde_mm_testnzc_si128(a, b)
2182 #endif
2183 
2184 SIMDE_FUNCTION_ATTRIBUTES
2185 int
2186 simde_mm_testz_si128 (simde__m128i a, simde__m128i b) {
2187   #if defined(SIMDE_X86_SSE4_1_NATIVE)
2188     return _mm_testz_si128(a, b);
2189   #else
2190     simde__m128i_private
2191       a_ = simde__m128i_to_private(a),
2192       b_ = simde__m128i_to_private(b);
2193 
2194     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2195       int64x2_t s64 = vandq_s64(a_.neon_i64, b_.neon_i64);
2196       return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
2197     #else
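      /* ZF semantics: return 1 only if (a & b) is zero in every lane. */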
2198       for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
2199         if ((a_.u64[i] & b_.u64[i]) != 0)
2200           return 0;
2201       }
2202     #endif
2203 
2204     return 1;
2205   #endif
2206 }
2207 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2208   #undef _mm_testz_si128
2209   #define _mm_testz_si128(a, b) simde_mm_testz_si128(a, b)
2210 #endif
2211 
2212 SIMDE_END_DECLS_
2213 
2214 HEDLEY_DIAGNOSTIC_POP
2215 
2216 #endif /* !defined(SIMDE_X86_SSE4_1_H) */
2217