1 /* SPDX-License-Identifier: MIT
2  *
3  * Permission is hereby granted, free of charge, to any person
4  * obtaining a copy of this software and associated documentation
5  * files (the "Software"), to deal in the Software without
6  * restriction, including without limitation the rights to use, copy,
7  * modify, merge, publish, distribute, sublicense, and/or sell copies
8  * of the Software, and to permit persons to whom the Software is
9  * furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be
12  * included in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Copyright:
24  *   2017-2020 Evan Nemerson <evan@nemerson.com>
25  */
26 
27 #include "sse.h"
28 #if !defined(SIMDE_X86_SSE4_1_H)
29 #define SIMDE_X86_SSE4_1_H
30 
31 #include "ssse3.h"
32 
33 HEDLEY_DIAGNOSTIC_PUSH
34 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
35 SIMDE_BEGIN_DECLS_
36 
37 #if !defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES)
38 #  define SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES
39 #endif
40 
41 SIMDE_FUNCTION_ATTRIBUTES
42 simde__m128i
43 simde_mm_blend_epi16 (simde__m128i a, simde__m128i b, const int imm8)
44     SIMDE_REQUIRE_RANGE(imm8, 0, 255)  {
45   simde__m128i_private
46     r_,
47     a_ = simde__m128i_to_private(a),
48     b_ = simde__m128i_to_private(b);
49 
50   SIMDE_VECTORIZE
51   for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
52     r_.u16[i] = ((imm8 >> i) & 1) ? b_.u16[i] : a_.u16[i];
53   }
54 
55   return simde__m128i_from_private(r_);
56 }
57 #if defined(SIMDE_X86_SSE4_1_NATIVE)
58 #  define simde_mm_blend_epi16(a, b, imm8) _mm_blend_epi16(a, b, imm8)
59 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
60 #  define simde_mm_blend_epi16(a, b, imm8) \
61      (__extension__ ({ \
62            const uint16_t _mask[8] = {               \
63                ((imm8) & (1 << 0)) ? 0xFFFF : 0x0000, \
64                ((imm8) & (1 << 1)) ? 0xFFFF : 0x0000, \
65                ((imm8) & (1 << 2)) ? 0xFFFF : 0x0000, \
66                ((imm8) & (1 << 3)) ? 0xFFFF : 0x0000, \
67                ((imm8) & (1 << 4)) ? 0xFFFF : 0x0000, \
68                ((imm8) & (1 << 5)) ? 0xFFFF : 0x0000, \
69                ((imm8) & (1 << 6)) ? 0xFFFF : 0x0000, \
70                ((imm8) & (1 << 7)) ? 0xFFFF : 0x0000  \
71            };                                        \
72            uint16x8_t _mask_vec = vld1q_u16(_mask);  \
73            simde__m128i_from_neon_u16(vbslq_u16(_mask_vec, simde__m128i_to_neon_u16(b), simde__m128i_to_neon_u16(a))); \
74        }))
75 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
76 #  define simde_mm_blend_epi16(a, b, imm8)      \
77      (__extension__ ({ \
78            const vector unsigned short _mask = {      \
79                ((imm8) & (1 << 0)) ? 0xFFFF : 0x0000, \
80                ((imm8) & (1 << 1)) ? 0xFFFF : 0x0000, \
81                ((imm8) & (1 << 2)) ? 0xFFFF : 0x0000, \
82                ((imm8) & (1 << 3)) ? 0xFFFF : 0x0000, \
83                ((imm8) & (1 << 4)) ? 0xFFFF : 0x0000, \
84                ((imm8) & (1 << 5)) ? 0xFFFF : 0x0000, \
85                ((imm8) & (1 << 6)) ? 0xFFFF : 0x0000, \
86                ((imm8) & (1 << 7)) ? 0xFFFF : 0x0000  \
87            };                                         \
88            simde__m128i_from_altivec_u16(vec_sel(simde__m128i_to_altivec_u16(a), simde__m128i_to_altivec_u16(b), _mask)); \
89        }))
90 #endif
91 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
92   #undef _mm_blend_epi16
93   #define _mm_blend_epi16(a, b, imm8) simde_mm_blend_epi16(a, b, imm8)
94 #endif
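/* Usage note: bit i of imm8 selects the i-th 16-bit lane of the result from b
 * (bit set) or from a (bit clear).  For example,
 * simde_mm_blend_epi16(a, b, 0x0F) takes lanes 0-3 from b and lanes 4-7 from a. */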
95 
96 SIMDE_FUNCTION_ATTRIBUTES
97 simde__m128d
98 simde_mm_blend_pd (simde__m128d a, simde__m128d b, const int imm8)
99     SIMDE_REQUIRE_RANGE(imm8, 0, 3)  {
100   simde__m128d_private
101     r_,
102     a_ = simde__m128d_to_private(a),
103     b_ = simde__m128d_to_private(b);
104 
105   SIMDE_VECTORIZE
106   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
107     r_.f64[i] = ((imm8 >> i) & 1) ? b_.f64[i] : a_.f64[i];
108   }
109   return simde__m128d_from_private(r_);
110 }
111 #if defined(SIMDE_X86_SSE4_1_NATIVE)
112 #  define simde_mm_blend_pd(a, b, imm8) _mm_blend_pd(a, b, imm8)
113 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
114 #  define simde_mm_blend_pd(a, b, imm8) \
115      (__extension__ ({ \
116            const uint64_t _mask[2] = {               \
117                ((imm8) & (1 << 0)) ? UINT64_MAX : 0, \
118                ((imm8) & (1 << 1)) ? UINT64_MAX : 0  \
119            };                                        \
120            uint64x2_t _mask_vec = vld1q_u64(_mask);  \
121            simde__m128d_from_neon_u64(vbslq_u64(_mask_vec, simde__m128d_to_neon_u64(b), simde__m128d_to_neon_u64(a))); \
122        }))
123 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
124 #  define simde_mm_blend_pd(a, b, imm8)         \
125      (__extension__ ({ \
126            const vector unsigned long long _mask = { \
127                ((imm8) & (1 << 0)) ? UINT64_MAX : 0, \
128                ((imm8) & (1 << 1)) ? UINT64_MAX : 0  \
129            };                                        \
130            simde__m128d_from_altivec_f64(vec_sel(simde__m128d_to_altivec_f64(a), simde__m128d_to_altivec_f64(b), _mask)); \
131        }))
132 #endif
133 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
134   #undef _mm_blend_pd
135   #define _mm_blend_pd(a, b, imm8) simde_mm_blend_pd(a, b, imm8)
136 #endif
137 
138 SIMDE_FUNCTION_ATTRIBUTES
139 simde__m128
140 simde_mm_blend_ps (simde__m128 a, simde__m128 b, const int imm8)
141     SIMDE_REQUIRE_RANGE(imm8, 0, 15)  {
142   simde__m128_private
143     r_,
144     a_ = simde__m128_to_private(a),
145     b_ = simde__m128_to_private(b);
146 
147   SIMDE_VECTORIZE
148   for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
149     r_.f32[i] = ((imm8 >> i) & 1) ? b_.f32[i] : a_.f32[i];
150   }
151   return simde__m128_from_private(r_);
152 }
153 #if defined(SIMDE_X86_SSE4_1_NATIVE)
154 #  define simde_mm_blend_ps(a, b, imm8) _mm_blend_ps(a, b, imm8)
155 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
156 #  define simde_mm_blend_ps(a, b, imm8) \
157      (__extension__ ({ \
158            const uint32_t _mask[4] = {               \
159                ((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
160                ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
161                ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
162                ((imm8) & (1 << 3)) ? UINT32_MAX : 0  \
163            };                                        \
164            uint32x4_t _mask_vec = vld1q_u32(_mask);  \
165            simde__m128_from_neon_f32(vbslq_f32(_mask_vec, simde__m128_to_neon_f32(b), simde__m128_to_neon_f32(a))); \
166        }))
167 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
168 #  define simde_mm_blend_ps(a, b, imm8) \
169      (__extension__ ({ \
170            const vector unsigned int _mask = {       \
171                ((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
172                ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
173                ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
174                ((imm8) & (1 << 3)) ? UINT32_MAX : 0  \
175            };                                        \
176            simde__m128_from_altivec_f32(vec_sel(simde__m128_to_altivec_f32(a), simde__m128_to_altivec_f32(b), _mask)); \
177        }))
178 #endif
179 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
180   #undef _mm_blend_ps
181   #define _mm_blend_ps(a, b, imm8) simde_mm_blend_ps(a, b, imm8)
182 #endif
183 
184 SIMDE_FUNCTION_ATTRIBUTES
185 simde__m128i
186 simde_mm_blendv_epi8 (simde__m128i a, simde__m128i b, simde__m128i mask) {
187 #if defined(SIMDE_X86_SSE4_1_NATIVE)
188   return _mm_blendv_epi8(a, b, mask);
189 #else
190   simde__m128i_private
191     r_,
192     a_ = simde__m128i_to_private(a),
193     b_ = simde__m128i_to_private(b),
194     mask_ = simde__m128i_to_private(mask);
195 
196   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
197     // Use a signed shift right to create a mask with the sign bit
198     mask_.neon_i8 = vshrq_n_s8(mask_.neon_i8, 7);
199     r_.neon_i8 = vbslq_s8(mask_.neon_u8, b_.neon_i8, a_.neon_i8);
200   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
201     r_.altivec_i8 = vec_sel(a_.altivec_i8, b_.altivec_i8, vec_cmplt(mask_.altivec_i8, vec_splat_s8(0)));
202   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
203     /* https://software.intel.com/en-us/forums/intel-c-compiler/topic/850087 */
204     #if defined(HEDLEY_INTEL_VERSION_CHECK)
205       __typeof__(mask_.i8) z = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
206       mask_.i8 = HEDLEY_STATIC_CAST(__typeof__(mask_.i8), mask_.i8 < z);
207     #else
208       mask_.i8 >>= (CHAR_BIT * sizeof(mask_.i8[0])) - 1;
209     #endif
210 
211     r_.i8 = (mask_.i8 & b_.i8) | (~mask_.i8 & a_.i8);
212   #else
213     SIMDE_VECTORIZE
214     for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
215       int8_t m = mask_.i8[i] >> 7;
216       r_.i8[i] = (m & b_.i8[i]) | (~m & a_.i8[i]);
217     }
218   #endif
219 
220   return simde__m128i_from_private(r_);
221 #endif
222 }
223 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
224   #undef _mm_blendv_epi8
225   #define _mm_blendv_epi8(a, b, mask) simde_mm_blendv_epi8(a, b, mask)
226 #endif
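/* Usage note: only the most significant bit of each byte in `mask` matters; a
 * set sign bit selects the byte from b, a clear one the byte from a.  The
 * shift/compare variants above simply broadcast that sign bit across the lane
 * before the bitwise select. */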
227 
228 SIMDE_FUNCTION_ATTRIBUTES
229 simde__m128i
230 simde_x_mm_blendv_epi16 (simde__m128i a, simde__m128i b, simde__m128i mask) {
231 #if defined(SIMDE_X86_SSE2_NATIVE)
232   mask = simde_mm_srai_epi16(mask, 15);
233   return simde_mm_or_si128(simde_mm_and_si128(mask, b), simde_mm_andnot_si128(mask, a));
234 #else
235   simde__m128i_private
236     r_,
237     a_ = simde__m128i_to_private(a),
238     b_ = simde__m128i_to_private(b),
239     mask_ = simde__m128i_to_private(mask);
240 
241 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
242   mask_ = simde__m128i_to_private(simde_mm_cmplt_epi16(mask, simde_mm_setzero_si128()));
243   r_.neon_i16 = vbslq_s16(mask_.neon_u16, b_.neon_i16, a_.neon_i16);
244 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
245   r_.altivec_i16 = vec_sel(a_.altivec_i16, b_.altivec_i16, vec_cmplt(mask_.altivec_i16, vec_splat_s16(0)));
246 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
247   #if defined(HEDLEY_INTEL_VERSION_CHECK)
248     __typeof__(mask_.i16) z = { 0, 0, 0, 0, 0, 0, 0, 0 };
249     mask_.i16 = mask_.i16 < z;
250   #else
251     mask_.i16 >>= (CHAR_BIT * sizeof(mask_.i16[0])) - 1;
252   #endif
253 
254   r_.i16 = (mask_.i16 & b_.i16) | (~mask_.i16 & a_.i16);
255 #else
256   SIMDE_VECTORIZE
257   for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
258     int16_t m = mask_.i16[i] >> 15;
259     r_.i16[i] = (m & b_.i16[i]) | (~m & a_.i16[i]);
260   }
261 #endif
262 
263   return simde__m128i_from_private(r_);
264 #endif
265 }
266 
267 SIMDE_FUNCTION_ATTRIBUTES
268 simde__m128i
269 simde_x_mm_blendv_epi32 (simde__m128i a, simde__m128i b, simde__m128i mask) {
270 #if defined(SIMDE_X86_SSE4_1_NATIVE)
271   return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(mask)));
272 #else
273   simde__m128i_private
274     r_,
275     a_ = simde__m128i_to_private(a),
276     b_ = simde__m128i_to_private(b),
277     mask_ = simde__m128i_to_private(mask);
278 
279 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
280   mask_ = simde__m128i_to_private(simde_mm_cmplt_epi32(mask, simde_mm_setzero_si128()));
281   r_.neon_i32 = vbslq_s32(mask_.neon_u32, b_.neon_i32, a_.neon_i32);
282 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
283   r_.altivec_i32 = vec_sel(a_.altivec_i32, b_.altivec_i32, vec_cmplt(mask_.altivec_i32, vec_splat_s32(0)));
284 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
285   #if defined(HEDLEY_INTEL_VERSION_CHECK)
286     __typeof__(mask_.i32) z = { 0, 0, 0, 0 };
287     mask_.i32 = HEDLEY_STATIC_CAST(__typeof__(mask_.i32), mask_.i32 < z);
288   #else
289     mask_.i32 >>= (CHAR_BIT * sizeof(mask_.i32[0])) - 1;
290   #endif
291 
292   r_.i32 = (mask_.i32 & b_.i32) | (~mask_.i32 & a_.i32);
293 #else
294   SIMDE_VECTORIZE
295   for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
296     int32_t m = mask_.i32[i] >> 31;
297     r_.i32[i] = (m & b_.i32[i]) | (~m & a_.i32[i]);
298   }
299 #endif
300 
301   return simde__m128i_from_private(r_);
302 #endif
303 }
304 
305 SIMDE_FUNCTION_ATTRIBUTES
306 simde__m128i
307 simde_x_mm_blendv_epi64 (simde__m128i a, simde__m128i b, simde__m128i mask) {
308 #if defined(SIMDE_X86_SSE4_1_NATIVE)
309   return _mm_castpd_si128(_mm_blendv_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b), _mm_castsi128_pd(mask)));
310 #else
311   simde__m128i_private
312     r_,
313     a_ = simde__m128i_to_private(a),
314     b_ = simde__m128i_to_private(b),
315     mask_ = simde__m128i_to_private(mask);
316 
317 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
318   mask_.u64 = vcltq_s64(mask_.i64, vdupq_n_s64(UINT64_C(0)));
319   r_.neon_i64 = vbslq_s64(mask_.neon_u64, b_.neon_i64, a_.neon_i64);
320 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
321   r_.altivec_i64 = vec_sel(a_.altivec_i64, b_.altivec_i64,
322                            vec_cmplt(mask_.altivec_i64, vec_splats(HEDLEY_STATIC_CAST(signed long long, 0))));
323 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
324   #if defined(HEDLEY_INTEL_VERSION_CHECK)
325     __typeof__(mask_.i64) z = { 0, 0 };
326     mask_.i64 = HEDLEY_STATIC_CAST(__typeof__(mask_.i64), mask_.i64 < z);
327   #else
328     mask_.i64 >>= (CHAR_BIT * sizeof(mask_.i64[0])) - 1;
329   #endif
330 
331   r_.i64 = (mask_.i64 & b_.i64) | (~mask_.i64 & a_.i64);
332 #else
333   SIMDE_VECTORIZE
334   for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
335     int64_t m = mask_.i64[i] >> 63;
336     r_.i64[i] = (m & b_.i64[i]) | (~m & a_.i64[i]);
337   }
338 #endif
339 
340   return simde__m128i_from_private(r_);
341 #endif
342 }
343 
344 SIMDE_FUNCTION_ATTRIBUTES
345 simde__m128d
346 simde_mm_blendv_pd (simde__m128d a, simde__m128d b, simde__m128d mask) {
347 #if defined(SIMDE_X86_SSE4_1_NATIVE)
348   return _mm_blendv_pd(a, b, mask);
349 #else
350   return simde_mm_castsi128_pd(simde_x_mm_blendv_epi64(simde_mm_castpd_si128(a), simde_mm_castpd_si128(b), simde_mm_castpd_si128(mask)));
351 #endif
352 }
353 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
354   #undef _mm_blendv_pd
355   #define _mm_blendv_pd(a, b, mask) simde_mm_blendv_pd(a, b, mask)
356 #endif
357 
358 SIMDE_FUNCTION_ATTRIBUTES
359 simde__m128
360 simde_mm_blendv_ps (simde__m128 a, simde__m128 b, simde__m128 mask) {
361 #if defined(SIMDE_X86_SSE4_1_NATIVE)
362   return _mm_blendv_ps(a, b, mask);
363 #else
364   return simde_mm_castsi128_ps(simde_x_mm_blendv_epi32(simde_mm_castps_si128(a), simde_mm_castps_si128(b), simde_mm_castps_si128(mask)));
365 #endif
366 }
367 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
368   #undef _mm_blendv_ps
369   #define _mm_blendv_ps(a, b, mask) simde_mm_blendv_ps(a, b, mask)
370 #endif
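/* Note: simde_mm_blendv_pd and simde_mm_blendv_ps defer to the integer
 * simde_x_mm_blendv_* helpers through bit-preserving casts, so, as with the
 * native instructions, only the sign bit of each 64-/32-bit mask element is
 * consulted. */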
371 
372 SIMDE_FUNCTION_ATTRIBUTES
373 simde__m128d
374 simde_mm_round_pd (simde__m128d a, int rounding)
375     SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) {
376   simde__m128d_private
377     r_,
378     a_ = simde__m128d_to_private(a);
379 
380   /* For architectures which lack a current direction SIMD instruction. */
381   #if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
382     if ((rounding & 7) == SIMDE_MM_FROUND_CUR_DIRECTION)
383       rounding = HEDLEY_STATIC_CAST(int, SIMDE_MM_GET_ROUNDING_MODE()) << 13;
384   #endif
385 
386   switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
387     case SIMDE_MM_FROUND_CUR_DIRECTION:
388       #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
389         r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.altivec_f64));
390       #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && 0
391         r_.neon_f64 = vrndiq_f64(a_.neon_f64);
392       #elif defined(simde_math_nearbyint)
393         SIMDE_VECTORIZE
394         for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
395           r_.f64[i] = simde_math_nearbyint(a_.f64[i]);
396         }
397       #else
398         HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
399       #endif
400       break;
401 
402     case SIMDE_MM_FROUND_TO_NEAREST_INT:
403       #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
404         r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.altivec_f64));
405       #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && 0
406         r_.neon_f64 = vrndaq_f64(a_.neon_f64);
407       #elif defined(simde_math_round)
408         SIMDE_VECTORIZE
409         for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
410           r_.f64[i] = simde_math_round(a_.f64[i]);
411         }
412       #else
413         HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
414       #endif
415       break;
416 
417     case SIMDE_MM_FROUND_TO_NEG_INF:
418       #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
419         r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_floor(a_.altivec_f64));
420       #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && 0
421         r_.neon_f64 = vrndmq_f64(a_.neon_f64);
422       #else
423         SIMDE_VECTORIZE
424         for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
425           r_.f64[i] = simde_math_floor(a_.f64[i]);
426         }
427       #endif
428       break;
429 
430     case SIMDE_MM_FROUND_TO_POS_INF:
431       #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
432         r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_ceil(a_.altivec_f64));
433       #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && 0
434         r_.neon_f64 = vrndpq_f64(a_.neon_f64);
435       #elif defined(simde_math_ceil)
436         SIMDE_VECTORIZE
437         for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
438           r_.f64[i] = simde_math_ceil(a_.f64[i]);
439         }
440       #else
441         HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
442       #endif
443       break;
444 
445     case SIMDE_MM_FROUND_TO_ZERO:
446       #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
447         r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_trunc(a_.altivec_f64));
448       #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && 0
449         r_.neon_f64 = vrndq_f64(a_.neon_f64);
450       #else
451         SIMDE_VECTORIZE
452         for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
453           r_.f64[i] = simde_math_trunc(a_.f64[i]);
454         }
455       #endif
456       break;
457 
458     default:
459       HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
460   }
461 
462   return simde__m128d_from_private(r_);
463 }
464 #if defined(SIMDE_X86_SSE4_1_NATIVE)
465   #define simde_mm_round_pd(a, rounding) _mm_round_pd(a, rounding)
466 #endif
467 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
468   #undef _mm_round_pd
469   #define _mm_round_pd(a, rounding) simde_mm_round_pd(a, rounding)
470 #endif
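/* Usage note: `rounding` is one of the SIMDE_MM_FROUND_* modes
 * (TO_NEAREST_INT, TO_NEG_INF, TO_POS_INF, TO_ZERO or CUR_DIRECTION),
 * optionally OR'd with SIMDE_MM_FROUND_NO_EXC; the switch above masks the
 * NO_EXC flag off before dispatching.  For example,
 * simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_ZERO | SIMDE_MM_FROUND_NO_EXC)
 * truncates each lane toward zero. */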
471 
472 SIMDE_FUNCTION_ATTRIBUTES
473 simde__m128d
474 simde_mm_ceil_pd (simde__m128d a) {
475   return simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_POS_INF);
476 }
477 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
478   #undef _mm_ceil_pd
479   #define _mm_ceil_pd(a) simde_mm_ceil_pd(a)
480 #endif
481 
482 SIMDE_FUNCTION_ATTRIBUTES
483 simde__m128
484 simde_mm_ceil_ps (simde__m128 a) {
485   return simde_mm_round_ps(a, SIMDE_MM_FROUND_TO_POS_INF);
486 }
487 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
488   #undef _mm_ceil_ps
489   #define _mm_ceil_ps(a) simde_mm_ceil_ps(a)
490 #endif
491 
492 SIMDE_FUNCTION_ATTRIBUTES
493 simde__m128d
494 simde_mm_ceil_sd (simde__m128d a, simde__m128d b) {
495 #if defined(SIMDE_X86_SSE4_1_NATIVE)
496   return _mm_ceil_sd(a, b);
497 #else
498   simde__m128d_private
499     r_,
500     a_ = simde__m128d_to_private(a),
501     b_ = simde__m128d_to_private(b);
502 
503   #if defined(simde_math_ceil)
504     r_ = simde__m128d_to_private(simde_mm_set_pd(a_.f64[1], simde_math_ceil(b_.f64[0])));
505   #else
506     HEDLEY_UNREACHABLE();
507   #endif
508 
509   return simde__m128d_from_private(r_);
510 #endif
511 }
512 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
513   #undef _mm_ceil_sd
514   #define _mm_ceil_sd(a, b) simde_mm_ceil_sd(a, b)
515 #endif
516 
517 SIMDE_FUNCTION_ATTRIBUTES
518 simde__m128
519 simde_mm_ceil_ss (simde__m128 a, simde__m128 b) {
520   #if defined(SIMDE_X86_SSE4_1_NATIVE)
521     return _mm_ceil_ss(a, b);
522   #elif defined(SIMDE_ASSUME_VECTORIZATION)
523     return simde_mm_move_ss(a, simde_mm_ceil_ps(b));
524   #else
525     simde__m128_private
526       r_,
527       a_ = simde__m128_to_private(a),
528       b_ = simde__m128_to_private(b);
529 
530     #if defined(simde_math_ceilf)
531       r_ = simde__m128_to_private(simde_mm_set_ps(a_.f32[3], a_.f32[2], a_.f32[1], simde_math_ceilf(b_.f32[0])));
532     #else
533       HEDLEY_UNREACHABLE();
534     #endif
535 
536     return simde__m128_from_private(r_);
537   #endif
538 }
539 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
540   #undef _mm_ceil_ss
541   #define _mm_ceil_ss(a, b) simde_mm_ceil_ss(a, b)
542 #endif
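/* Note: as with the native instructions, simde_mm_ceil_sd and simde_mm_ceil_ss
 * round only lane 0 (taken from b); the remaining lanes are copied unchanged
 * from a. */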
543 
544 SIMDE_FUNCTION_ATTRIBUTES
545 simde__m128i
546 simde_mm_cmpeq_epi64 (simde__m128i a, simde__m128i b) {
547 #if defined(SIMDE_X86_SSE4_1_NATIVE)
548   return _mm_cmpeq_epi64(a, b);
549 #else
550   simde__m128i_private
551     r_,
552     a_ = simde__m128i_to_private(a),
553     b_ = simde__m128i_to_private(b);
554 
555   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
556     r_.neon_u64 = vceqq_u64(a_.neon_u64, b_.neon_u64);
557   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
558     // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
559     uint32x4_t cmp = vceqq_u32(a_.neon_u32, b_.neon_u32);
560     uint32x4_t swapped = vrev64q_u32(cmp);
561     r_.neon_u32 = vandq_u32(cmp, swapped);
562   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
563     r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), a_.i64 == b_.i64);
564   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
565     r_.altivec_i64 = (vector signed long long) vec_cmpeq(a_.altivec_i64, b_.altivec_i64);
566   #else
567     SIMDE_VECTORIZE
568     for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
569       r_.u64[i] = (a_.u64[i] == b_.u64[i]) ? ~UINT64_C(0) : UINT64_C(0);
570     }
571   #endif
572 
573   return simde__m128i_from_private(r_);
574 #endif
575 }
576 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
577   #undef _mm_cmpeq_epi64
578   #define _mm_cmpeq_epi64(a, b) simde_mm_cmpeq_epi64(a, b)
579 #endif
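/* Usage note: each 64-bit lane of the result is all ones when the
 * corresponding lanes of a and b are equal and all zeros otherwise, so the
 * result is directly usable as a mask for the blendv helpers above. */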
580 
581 SIMDE_FUNCTION_ATTRIBUTES
582 simde__m128i
583 simde_mm_cvtepi8_epi16 (simde__m128i a) {
584 #if defined(SIMDE_X86_SSE4_1_NATIVE)
585   return _mm_cvtepi8_epi16(a);
586 #else
587   simde__m128i_private
588     r_,
589     a_ = simde__m128i_to_private(a);
590 
591   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
592     int8x16_t s8x16 = a_.neon_i8;                   /* xxxx xxxx xxxx DCBA */
593     int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
594     r_.neon_i16 = s16x8;
595   #elif defined(SIMDE_CONVERT_VECTOR_)
596     SIMDE_CONVERT_VECTOR_(r_.i16, a_.m64_private[0].i8);
597   #else
598     SIMDE_VECTORIZE
599     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
600       r_.i16[i] = a_.i8[i];
601     }
602   #endif
603 
604   return simde__m128i_from_private(r_);
605 #endif
606 }
607 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
608   #undef _mm_cvtepi8_epi16
609   #define _mm_cvtepi8_epi16(a) simde_mm_cvtepi8_epi16(a)
610 #endif
611 
612 SIMDE_FUNCTION_ATTRIBUTES
613 simde__m128i
614 simde_mm_cvtepi8_epi32 (simde__m128i a) {
615 #if defined(SIMDE_X86_SSE4_1_NATIVE)
616   return _mm_cvtepi8_epi32(a);
617 #else
618   simde__m128i_private
619     r_,
620     a_ = simde__m128i_to_private(a);
621 
622   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
623     int8x16_t s8x16 = a_.neon_i8;                     /* xxxx xxxx xxxx DCBA */
624     int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
625     int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
626     r_.neon_i32 = s32x4;
627   #else
628     SIMDE_VECTORIZE
629     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
630       r_.i32[i] = a_.i8[i];
631     }
632   #endif
633 
634   return simde__m128i_from_private(r_);
635 #endif
636 }
637 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
638   #undef _mm_cvtepi8_epi32
639   #define _mm_cvtepi8_epi32(a) simde_mm_cvtepi8_epi32(a)
640 #endif
641 
642 SIMDE_FUNCTION_ATTRIBUTES
643 simde__m128i
644 simde_mm_cvtepi8_epi64 (simde__m128i a) {
645 #if defined(SIMDE_X86_SSE4_1_NATIVE)
646   return _mm_cvtepi8_epi64(a);
647 #else
648   simde__m128i_private
649     r_,
650     a_ = simde__m128i_to_private(a);
651 
652   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
653     int8x16_t s8x16 = a_.neon_i8;                     /* xxxx xxxx xxxx xxBA */
654     int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
655     int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
656     int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
657     r_.neon_i64 = s64x2;
658   #else
659     SIMDE_VECTORIZE
660     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
661       r_.i64[i] = a_.i8[i];
662     }
663   #endif
664 
665   return simde__m128i_from_private(r_);
666 #endif
667 }
668 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
669   #undef _mm_cvtepi8_epi64
670   #define _mm_cvtepi8_epi64(a) simde_mm_cvtepi8_epi64(a)
671 #endif
672 
673 SIMDE_FUNCTION_ATTRIBUTES
674 simde__m128i
675 simde_mm_cvtepu8_epi16 (simde__m128i a) {
676 #if defined(SIMDE_X86_SSE4_1_NATIVE)
677   return _mm_cvtepu8_epi16(a);
678 #else
679   simde__m128i_private
680     r_,
681     a_ = simde__m128i_to_private(a);
682 
683   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
684     uint8x16_t u8x16 = a_.neon_u8;                   /* xxxx xxxx xxxx DCBA */
685     uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
686     r_.neon_u16 = u16x8;
687   #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_CLANG_45541) && (!defined(SIMDE_ARCH_POWER) || !defined(__clang__))
688     SIMDE_CONVERT_VECTOR_(r_.i16, a_.m64_private[0].u8);
689   #else
690     SIMDE_VECTORIZE
691     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
692       r_.i16[i] = a_.u8[i];
693     }
694   #endif
695 
696   return simde__m128i_from_private(r_);
697 #endif
698 }
699 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
700   #undef _mm_cvtepu8_epi16
701   #define _mm_cvtepu8_epi16(a) simde_mm_cvtepu8_epi16(a)
702 #endif
703 
704 SIMDE_FUNCTION_ATTRIBUTES
705 simde__m128i
706 simde_mm_cvtepu8_epi32 (simde__m128i a) {
707 #if defined(SIMDE_X86_SSE4_1_NATIVE)
708   return _mm_cvtepu8_epi32(a);
709 #else
710   simde__m128i_private
711     r_,
712     a_ = simde__m128i_to_private(a);
713 
714   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
715     uint8x16_t u8x16 = a_.neon_u8;                     /* xxxx xxxx xxxx DCBA */
716     uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
717     uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
718     r_.neon_u32 = u32x4;
719   #else
720     SIMDE_VECTORIZE
721     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
722       r_.i32[i] = a_.u8[i];
723     }
724   #endif
725 
726   return simde__m128i_from_private(r_);
727 #endif
728 }
729 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
730   #undef _mm_cvtepu8_epi32
731   #define _mm_cvtepu8_epi32(a) simde_mm_cvtepu8_epi32(a)
732 #endif
733 
734 SIMDE_FUNCTION_ATTRIBUTES
735 simde__m128i
736 simde_mm_cvtepu8_epi64 (simde__m128i a) {
737 #if defined(SIMDE_X86_SSE4_1_NATIVE)
738   return _mm_cvtepu8_epi64(a);
739 #else
740   simde__m128i_private
741     r_,
742     a_ = simde__m128i_to_private(a);
743 
744   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
745     uint8x16_t u8x16 = a_.neon_u8;                     /* xxxx xxxx xxxx xxBA */
746     uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
747     uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
748     uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
749     r_.neon_u64 = u64x2;
750   #else
751     SIMDE_VECTORIZE
752     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
753       r_.i64[i] = a_.u8[i];
754     }
755   #endif
756 
757   return simde__m128i_from_private(r_);
758 #endif
759 }
760 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
761   #undef _mm_cvtepu8_epi64
762   #define _mm_cvtepu8_epi64(a) simde_mm_cvtepu8_epi64(a)
763 #endif
764 
765 SIMDE_FUNCTION_ATTRIBUTES
766 simde__m128i
767 simde_mm_cvtepi16_epi32 (simde__m128i a) {
768 #if defined(SIMDE_X86_SSE4_1_NATIVE)
769   return _mm_cvtepi16_epi32(a);
770 #else
771   simde__m128i_private
772     r_,
773     a_ = simde__m128i_to_private(a);
774 
775 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
776   r_.neon_i32 = vmovl_s16(vget_low_s16(a_.neon_i16));
777 #else
778   SIMDE_VECTORIZE
779   for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
780     r_.i32[i] = a_.i16[i];
781   }
782 #endif
783 
784   return simde__m128i_from_private(r_);
785 #endif
786 }
787 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
788   #undef _mm_cvtepi16_epi32
789   #define _mm_cvtepi16_epi32(a) simde_mm_cvtepi16_epi32(a)
790 #endif
791 
792 SIMDE_FUNCTION_ATTRIBUTES
793 simde__m128i
794 simde_mm_cvtepu16_epi32 (simde__m128i a) {
795 #if defined(SIMDE_X86_SSE4_1_NATIVE)
796   return _mm_cvtepu16_epi32(a);
797 #else
798   simde__m128i_private
799     r_,
800     a_ = simde__m128i_to_private(a);
801 
802   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
803     r_.neon_u32 = vmovl_u16(vget_low_u16(a_.neon_u16));
804   #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_CLANG_45541) && (!defined(SIMDE_ARCH_POWER) || !defined(__clang__))
805     SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].u16);
806   #else
807     SIMDE_VECTORIZE
808     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
809       r_.i32[i] = a_.u16[i];
810     }
811   #endif
812 
813   return simde__m128i_from_private(r_);
814 #endif
815 }
816 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
817   #undef _mm_cvtepu16_epi32
818   #define _mm_cvtepu16_epi32(a) simde_mm_cvtepu16_epi32(a)
819 #endif
820 
821 SIMDE_FUNCTION_ATTRIBUTES
822 simde__m128i
823 simde_mm_cvtepu16_epi64 (simde__m128i a) {
824 #if defined(SIMDE_X86_SSE4_1_NATIVE)
825   return _mm_cvtepu16_epi64(a);
826 #else
827   simde__m128i_private
828     r_,
829     a_ = simde__m128i_to_private(a);
830 
831   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
832     uint16x8_t u16x8 = a_.neon_u16;                    /* xxxx xxxx xxxx 0B0A */
833     uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
834     uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
835     r_.neon_u64 = u64x2;
836   #else
837     SIMDE_VECTORIZE
838     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
839       r_.i64[i] = a_.u16[i];
840     }
841   #endif
842 
843   return simde__m128i_from_private(r_);
844 #endif
845 }
846 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
847   #undef _mm_cvtepu16_epi64
848   #define _mm_cvtepu16_epi64(a) simde_mm_cvtepu16_epi64(a)
849 #endif
850 
851 SIMDE_FUNCTION_ATTRIBUTES
852 simde__m128i
853 simde_mm_cvtepi16_epi64 (simde__m128i a) {
854 #if defined(SIMDE_X86_SSE4_1_NATIVE)
855   return _mm_cvtepi16_epi64(a);
856 #else
857   simde__m128i_private
858     r_,
859     a_ = simde__m128i_to_private(a);
860 
861   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
862     int16x8_t s16x8 = a_.neon_i16;                    /* xxxx xxxx xxxx 0B0A */
863     int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
864     int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
865     r_.neon_i64 = s64x2;
866   #else
867     SIMDE_VECTORIZE
868     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
869       r_.i64[i] = a_.i16[i];
870     }
871   #endif
872 
873   return simde__m128i_from_private(r_);
874 #endif
875 }
876 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
877   #undef _mm_cvtepi16_epi64
878   #define _mm_cvtepi16_epi64(a) simde_mm_cvtepi16_epi64(a)
879 #endif
880 
881 SIMDE_FUNCTION_ATTRIBUTES
882 simde__m128i
883 simde_mm_cvtepi32_epi64 (simde__m128i a) {
884 #if defined(SIMDE_X86_SSE4_1_NATIVE)
885   return _mm_cvtepi32_epi64(a);
886 #else
887   simde__m128i_private
888     r_,
889     a_ = simde__m128i_to_private(a);
890 
891   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
892     r_.neon_i64 = vmovl_s32(vget_low_s32(a_.neon_i32));
893   #elif defined(SIMDE_CONVERT_VECTOR_)
894     SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].i32);
895   #else
896     SIMDE_VECTORIZE
897     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
898       r_.i64[i] = a_.i32[i];
899     }
900   #endif
901 
902   return simde__m128i_from_private(r_);
903 #endif
904 }
905 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
906   #undef _mm_cvtepi32_epi64
907   #define _mm_cvtepi32_epi64(a) simde_mm_cvtepi32_epi64(a)
908 #endif
909 
910 SIMDE_FUNCTION_ATTRIBUTES
911 simde__m128i
912 simde_mm_cvtepu32_epi64 (simde__m128i a) {
913 #if defined(SIMDE_X86_SSE4_1_NATIVE)
914   return _mm_cvtepu32_epi64(a);
915 #else
916   simde__m128i_private
917     r_,
918     a_ = simde__m128i_to_private(a);
919 
920   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
921     r_.neon_u64 = vmovl_u32(vget_low_u32(a_.neon_u32));
922   #elif defined(SIMDE_CONVERT_VECTOR_)
923     SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].u32);
924   #else
925     SIMDE_VECTORIZE
926     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
927       r_.i64[i] = a_.u32[i];
928     }
929   #endif
930 
931   return simde__m128i_from_private(r_);
932 #endif
933 }
934 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
935   #undef _mm_cvtepu32_epi64
936   #define _mm_cvtepu32_epi64(a) simde_mm_cvtepu32_epi64(a)
937 #endif
938 
939 SIMDE_FUNCTION_ATTRIBUTES
940 simde__m128d
941 simde_mm_dp_pd (simde__m128d a, simde__m128d b, const int imm8)
942     SIMDE_REQUIRE_RANGE(imm8, 0, 255)  {
943   simde__m128d_private
944     r_,
945     a_ = simde__m128d_to_private(a),
946     b_ = simde__m128d_to_private(b);
947 
948   simde_float64 sum = SIMDE_FLOAT64_C(0.0);
949 
950   SIMDE_VECTORIZE_REDUCTION(+:sum)
951   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
952     sum += ((imm8 >> (i + 4)) & 1) ? (a_.f64[i] * b_.f64[i]) : 0.0;
953   }
954 
955   SIMDE_VECTORIZE
956   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
957     r_.f64[i] = ((imm8 >> i) & 1) ? sum : 0.0;
958   }
959 
960   return simde__m128d_from_private(r_);
961 }
962 #if defined(SIMDE_X86_SSE4_1_NATIVE)
963 #  define simde_mm_dp_pd(a, b, imm8) _mm_dp_pd(a, b, imm8)
964 #endif
965 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
966   #undef _mm_dp_pd
967   #define _mm_dp_pd(a, b, imm8) simde_mm_dp_pd(a, b, imm8)
968 #endif
969 
970 SIMDE_FUNCTION_ATTRIBUTES
971 simde__m128
972 simde_mm_dp_ps (simde__m128 a, simde__m128 b, const int imm8)
973     SIMDE_REQUIRE_RANGE(imm8, 0, 255)  {
974   simde__m128_private
975     r_,
976     a_ = simde__m128_to_private(a),
977     b_ = simde__m128_to_private(b);
978 
979   simde_float32 sum = SIMDE_FLOAT32_C(0.0);
980 
981   SIMDE_VECTORIZE_REDUCTION(+:sum)
982   for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
983     sum += ((imm8 >> (i + 4)) & 1) ? (a_.f32[i] * b_.f32[i]) : SIMDE_FLOAT32_C(0.0);
984   }
985 
986   SIMDE_VECTORIZE
987   for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
988     r_.f32[i] = ((imm8 >> i) & 1) ? sum : SIMDE_FLOAT32_C(0.0);
989   }
990 
991   return simde__m128_from_private(r_);
992 }
993 #if defined(SIMDE_X86_SSE4_1_NATIVE)
994 #  define simde_mm_dp_ps(a, b, imm8) _mm_dp_ps(a, b, imm8)
995 #endif
996 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
997   #undef _mm_dp_ps
998   #define _mm_dp_ps(a, b, imm8) simde_mm_dp_ps(a, b, imm8)
999 #endif
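/* Usage note: for simde_mm_dp_ps, bits 4-7 of imm8 choose which per-lane
 * products are summed and bits 0-3 choose which result lanes receive that sum
 * (all other lanes are zeroed); simde_mm_dp_pd uses bits 4-5 and 0-1 the same
 * way.  For example, simde_mm_dp_ps(a, b, 0xFF) broadcasts the full
 * four-element dot product to every lane. */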
1000 
1001 #if defined(simde_mm_extract_epi8)
1002 #  undef simde_mm_extract_epi8
1003 #endif
1004 SIMDE_FUNCTION_ATTRIBUTES
1005 int8_t
1006 simde_mm_extract_epi8 (simde__m128i a, const int imm8)
1007     SIMDE_REQUIRE_RANGE(imm8, 0, 15)  {
1008   simde__m128i_private
1009     a_ = simde__m128i_to_private(a);
1010 
1011   #if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1012     #if defined(SIMDE_BUG_GCC_95227)
1013       (void) a_;
1014       (void) imm8;
1015     #endif
1016     return vec_extract(a_.altivec_i8, imm8);
1017   #else
1018     return a_.i8[imm8 & 15];
1019   #endif
1020 }
1021 #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_EXTRACT_EPI8)
1022 #  define simde_mm_extract_epi8(a, imm8) HEDLEY_STATIC_CAST(int8_t, _mm_extract_epi8(a, imm8))
1023 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1024 #  define simde_mm_extract_epi8(a, imm8) vgetq_lane_s8(simde__m128i_to_private(a).neon_i8, imm8)
1025 #endif
1026 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1027   #undef _mm_extract_epi8
1028   #define _mm_extract_epi8(a, imm8) HEDLEY_STATIC_CAST(int, simde_mm_extract_epi8(a, imm8))
1029 #endif
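/* Note: the native _mm_extract_epi8 and the NEON vgetq_lane_s8 paths require
 * imm8 to be a compile-time constant; only the portable fallback above
 * tolerates a runtime index.  The public _mm_extract_epi8 returns int, hence
 * the HEDLEY_STATIC_CAST(int, ...) in the alias. */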
1030 
1031 #if defined(simde_mm_extract_epi32)
1032 #  undef simde_mm_extract_epi32
1033 #endif
1034 SIMDE_FUNCTION_ATTRIBUTES
1035 int32_t
1036 simde_mm_extract_epi32 (simde__m128i a, const int imm8)
1037     SIMDE_REQUIRE_RANGE(imm8, 0, 3)  {
1038   simde__m128i_private
1039     a_ = simde__m128i_to_private(a);
1040 
1041   #if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1042     #if defined(SIMDE_BUG_GCC_95227)
1043       (void) a_;
1044       (void) imm8;
1045     #endif
1046     return vec_extract(a_.altivec_i32, imm8);
1047   #else
1048     return a_.i32[imm8 & 3];
1049   #endif
1050 }
1051 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1052 #  define simde_mm_extract_epi32(a, imm8) _mm_extract_epi32(a, imm8)
1053 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1054 #  define simde_mm_extract_epi32(a, imm8) vgetq_lane_s32(simde__m128i_to_private(a).neon_i32, imm8)
1055 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1056 #  define simde_mm_extract_epi32(a, imm8) HEDLEY_STATIC_CAST(int32_t, vec_extract(simde__m128i_to_private(a).altivec_i32, imm8))
1057 #endif
1058 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1059   #undef _mm_extract_epi32
1060   #define _mm_extract_epi32(a, imm8) simde_mm_extract_epi32(a, imm8)
1061 #endif
1062 
1063 #if defined(simde_mm_extract_epi64)
1064 #  undef simde_mm_extract_epi64
1065 #endif
1066 SIMDE_FUNCTION_ATTRIBUTES
1067 int64_t
1068 simde_mm_extract_epi64 (simde__m128i a, const int imm8)
1069     SIMDE_REQUIRE_RANGE(imm8, 0, 1)  {
1070   simde__m128i_private
1071     a_ = simde__m128i_to_private(a);
1072 
1073   #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
1074     #if defined(SIMDE_BUG_GCC_95227)
1075       (void) a_;
1076       (void) imm8;
1077     #endif
1078     return vec_extract(a_.altivec_i64, imm8);
1079   #else
1080     return a_.i64[imm8 & 1];
1081   #endif
1082 }
1083 #if defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ARCH_AMD64)
1084 #  define simde_mm_extract_epi64(a, imm8) _mm_extract_epi64(a, imm8)
1085 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1086 #  define simde_mm_extract_epi64(a, imm8) vgetq_lane_s64(simde__m128i_to_private(a).neon_i64, imm8)
1087 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
1088 #  define simde_mm_extract_epi64(a, imm8) HEDLEY_STATIC_CAST(int64_t, vec_extract(simde__m128i_to_private(a).altivec_i64, imm8))
1089 #endif
1090 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1091   #undef _mm_extract_epi64
1092   #define _mm_extract_epi64(a, imm8) simde_mm_extract_epi64(a, imm8)
1093 #endif
1094 
1095 SIMDE_FUNCTION_ATTRIBUTES
1096 simde__m128d
1097 simde_mm_floor_pd (simde__m128d a) {
1098   return simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_NEG_INF);
1099 }
1100 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1101   #undef _mm_floor_pd
1102   #define _mm_floor_pd(a) simde_mm_floor_pd(a)
1103 #endif
1104 
1105 SIMDE_FUNCTION_ATTRIBUTES
1106 simde__m128
1107 simde_mm_floor_ps (simde__m128 a) {
1108   return simde_mm_round_ps(a, SIMDE_MM_FROUND_TO_NEG_INF);
1109 }
1110 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1111   #undef _mm_floor_ps
1112   #define _mm_floor_ps(a) simde_mm_floor_ps(a)
1113 #endif
1114 
1115 SIMDE_FUNCTION_ATTRIBUTES
1116 simde__m128d
1117 simde_mm_floor_sd (simde__m128d a, simde__m128d b) {
1118 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1119   return _mm_floor_sd(a, b);
1120 #else
1121   simde__m128d_private
1122     r_,
1123     a_ = simde__m128d_to_private(a),
1124     b_ = simde__m128d_to_private(b);
1125 
1126 #if defined(simde_math_floor)
1127   r_.f64[0] = simde_math_floor(b_.f64[0]);
1128   r_.f64[1] = a_.f64[1];
1129 #else
1130   HEDLEY_UNREACHABLE();
1131 #endif
1132 
1133   return simde__m128d_from_private(r_);
1134 #endif
1135 }
1136 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1137   #undef _mm_floor_sd
1138   #define _mm_floor_sd(a, b) simde_mm_floor_sd(a, b)
1139 #endif
1140 
1141 SIMDE_FUNCTION_ATTRIBUTES
1142 simde__m128
1143 simde_mm_floor_ss (simde__m128 a, simde__m128 b) {
1144 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1145   return _mm_floor_ss(a, b);
1146 #elif defined(SIMDE_ASSUME_VECTORIZATION)
1147     return simde_mm_move_ss(a, simde_mm_floor_ps(b));
1148 #else
1149   simde__m128_private
1150     r_,
1151     a_ = simde__m128_to_private(a),
1152     b_ = simde__m128_to_private(b);
1153 
1154   #if defined(simde_math_floorf)
1155     r_.f32[0] = simde_math_floorf(b_.f32[0]);
1156     for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1157       r_.f32[i] = a_.f32[i];
1158     }
1159   #else
1160     HEDLEY_UNREACHABLE();
1161   #endif
1162 
1163   return simde__m128_from_private(r_);
1164 #endif
1165 }
1166 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1167   #undef _mm_floor_ss
1168   #define _mm_floor_ss(a, b) simde_mm_floor_ss(a, b)
1169 #endif
1170 
1171 SIMDE_FUNCTION_ATTRIBUTES
1172 simde__m128i
1173 simde_mm_insert_epi8 (simde__m128i a, int i, const int imm8)
1174     SIMDE_REQUIRE_RANGE(imm8, 0, 15)  {
1175   simde__m128i_private
1176     r_ = simde__m128i_to_private(a);
1177 
1178   r_.i8[imm8] = HEDLEY_STATIC_CAST(int8_t, i);
1179 
1180   return simde__m128i_from_private(r_);
1181 }
1182 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1183   /* clang-3.8 returns an incompatible type, so we need the cast.  MSVC
1184    * can't handle the cast ("error C2440: 'type cast': cannot convert
1185    * from '__m128i' to '__m128i'").  */
1186   #if defined(__clang__)
1187     #define simde_mm_insert_epi8(a, i, imm8) HEDLEY_STATIC_CAST(__m128i, _mm_insert_epi8(a, i, imm8))
1188   #else
1189     #define simde_mm_insert_epi8(a, i, imm8) _mm_insert_epi8(a, i, imm8)
1190   #endif
1191 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1192 #  define simde_mm_insert_epi8(a, i, imm8) simde__m128i_from_neon_i8(vsetq_lane_s8(i, simde__m128i_to_private(a).i8, imm8))
1193 #endif
1194 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1195   #undef _mm_insert_epi8
1196   #define _mm_insert_epi8(a, i, imm8) simde_mm_insert_epi8(a, i, imm8)
1197 #endif
1198 
1199 SIMDE_FUNCTION_ATTRIBUTES
1200 simde__m128i
1201 simde_mm_insert_epi32 (simde__m128i a, int i, const int imm8)
1202     SIMDE_REQUIRE_RANGE(imm8, 0, 3)  {
1203   simde__m128i_private
1204     r_ = simde__m128i_to_private(a);
1205 
1206   r_.i32[imm8] = HEDLEY_STATIC_CAST(int32_t, i);
1207 
1208   return simde__m128i_from_private(r_);
1209 }
1210 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1211   #if defined(__clang__)
1212     #define simde_mm_insert_epi32(a, i, imm8) HEDLEY_STATIC_CAST(__m128i, _mm_insert_epi32(a, i, imm8))
1213   #else
1214     #define simde_mm_insert_epi32(a, i, imm8) _mm_insert_epi32(a, i, imm8)
1215   #endif
1216 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1217 #  define simde_mm_insert_epi32(a, i, imm8) simde__m128i_from_neon_i32(vsetq_lane_s32(i, simde__m128i_to_private(a).i32, imm8))
1218 #endif
1219 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1220   #undef _mm_insert_epi32
1221   #define _mm_insert_epi32(a, i, imm8) simde_mm_insert_epi32(a, i, imm8)
1222 #endif
1223 
1224 SIMDE_FUNCTION_ATTRIBUTES
1225 simde__m128i
1226 simde_mm_insert_epi64 (simde__m128i a, int64_t i, const int imm8)
1227     SIMDE_REQUIRE_RANGE(imm8, 0, 1)  {
1228   #if defined(SIMDE_BUG_GCC_94482)
1229     simde__m128i_private
1230       a_ = simde__m128i_to_private(a);
1231 
1232     switch(imm8) {
1233       case 0:
1234         return simde_mm_set_epi64x(a_.i64[1], i);
1235         break;
1236       case 1:
1237         return simde_mm_set_epi64x(i, a_.i64[0]);
1238         break;
1239       default:
1240         HEDLEY_UNREACHABLE();
1241         break;
1242     }
1243   #else
1244     simde__m128i_private
1245       r_ = simde__m128i_to_private(a);
1246 
1247     r_.i64[imm8] = i;
1248     return simde__m128i_from_private(r_);
1249   #endif
1250 }
1251 #if defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ARCH_AMD64)
1252 #  define simde_mm_insert_epi64(a, i, imm8) _mm_insert_epi64(a, i, imm8)
1253 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1254 #  define simde_mm_insert_epi64(a, i, imm8) simde__m128i_from_neon_i64(vsetq_lane_s64(i, simde__m128i_to_private(a).i64, imm8))
1255 #endif
1256 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1257   #undef _mm_insert_epi64
1258   #define _mm_insert_epi64(a, i, imm8) simde_mm_insert_epi64(a, i, imm8)
1259 #endif
1260 
1261 SIMDE_FUNCTION_ATTRIBUTES
1262 simde__m128
1263 simde_mm_insert_ps (simde__m128 a, simde__m128 b, const int imm8)
1264     SIMDE_REQUIRE_RANGE(imm8, 0, 255)  {
1265   simde__m128_private
1266     r_,
1267     a_ = simde__m128_to_private(a),
1268     b_ = simde__m128_to_private(b);
1269 
1270   /* imm8 bits 6-7 pick the source lane of b, bits 4-5 the destination lane of a. */
1271   a_.f32[(imm8 >> 4) & 3] = b_.f32[(imm8 >> 6) & 3];
1272 
1273   SIMDE_VECTORIZE
1274   for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1275     r_.f32[i] = ((imm8 >> i) & 1) ? SIMDE_FLOAT32_C(0.0) : a_.f32[i];
1276   }
1277 
1278   return simde__m128_from_private(r_);
1279 }
1280 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1281 #  define simde_mm_insert_ps(a, b, imm8) _mm_insert_ps(a, b, imm8)
1282 #endif
1283 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1284   #undef _mm_insert_ps
1285   #define _mm_insert_ps(a, b, imm8) simde_mm_insert_ps(a, b, imm8)
1286 #endif
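/* Usage note: in simde_mm_insert_ps, imm8 bits 6-7 select the source lane of
 * b, bits 4-5 select the destination lane in a, and bits 0-3 form a zero mask
 * applied to the result.  For example, simde_mm_insert_ps(a, b, 0x30) copies
 * lane 0 of b into lane 3 of a without zeroing anything. */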
1287 
1288 SIMDE_FUNCTION_ATTRIBUTES
1289 simde__m128i
1290 simde_mm_max_epi8 (simde__m128i a, simde__m128i b) {
1291   #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI)
1292     return _mm_max_epi8(a, b);
1293   #else
1294     simde__m128i_private
1295       r_,
1296       a_ = simde__m128i_to_private(a),
1297       b_ = simde__m128i_to_private(b);
1298 
1299     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1300       r_.neon_i8 = vmaxq_s8(a_.neon_i8, b_.neon_i8);
1301     #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1302       r_.altivec_i8 = vec_max(a_.altivec_i8, b_.altivec_i8);
1303     #else
1304       SIMDE_VECTORIZE
1305       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1306         r_.i8[i] = a_.i8[i] > b_.i8[i] ? a_.i8[i] : b_.i8[i];
1307       }
1308     #endif
1309 
1310     return simde__m128i_from_private(r_);
1311   #endif
1312 }
1313 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1314   #undef _mm_max_epi8
1315   #define _mm_max_epi8(a, b) simde_mm_max_epi8(a, b)
1316 #endif
1317 
1318 SIMDE_FUNCTION_ATTRIBUTES
1319 simde__m128i
1320 simde_mm_max_epi32 (simde__m128i a, simde__m128i b) {
1321   #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI)
1322     return _mm_max_epi32(a, b);
1323   #else
1324     simde__m128i_private
1325       r_,
1326       a_ = simde__m128i_to_private(a),
1327       b_ = simde__m128i_to_private(b);
1328 
1329     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1330       r_.neon_i32 = vmaxq_s32(a_.neon_i32, b_.neon_i32);
1331     #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1332       r_.altivec_i32 = vec_max(a_.altivec_i32, b_.altivec_i32);
1333     #else
1334       SIMDE_VECTORIZE
1335       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1336         r_.i32[i] = a_.i32[i] > b_.i32[i] ? a_.i32[i] : b_.i32[i];
1337       }
1338     #endif
1339 
1340     return simde__m128i_from_private(r_);
1341   #endif
1342 }
1343 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1344   #undef _mm_max_epi32
1345   #define _mm_max_epi32(a, b) simde_mm_max_epi32(a, b)
1346 #endif
1347 
1348 SIMDE_FUNCTION_ATTRIBUTES
1349 simde__m128i
1350 simde_mm_max_epu16 (simde__m128i a, simde__m128i b) {
1351   #if defined(SIMDE_X86_SSE4_1_NATIVE)
1352     return _mm_max_epu16(a, b);
1353   #else
1354     simde__m128i_private
1355       r_,
1356       a_ = simde__m128i_to_private(a),
1357       b_ = simde__m128i_to_private(b);
1358 
1359     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1360       r_.neon_u16 = vmaxq_u16(a_.neon_u16, b_.neon_u16);
1361     #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1362       r_.altivec_u16 = vec_max(a_.altivec_u16, b_.altivec_u16);
1363     #else
1364       SIMDE_VECTORIZE
1365       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
1366         r_.u16[i] = a_.u16[i] > b_.u16[i] ? a_.u16[i] : b_.u16[i];
1367       }
1368     #endif
1369 
1370     return simde__m128i_from_private(r_);
1371   #endif
1372 }
1373 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1374   #undef _mm_max_epu16
1375   #define _mm_max_epu16(a, b) simde_mm_max_epu16(a, b)
1376 #endif
1377 
1378 SIMDE_FUNCTION_ATTRIBUTES
1379 simde__m128i
1380 simde_mm_max_epu32 (simde__m128i a, simde__m128i b) {
1381   #if defined(SIMDE_X86_SSE4_1_NATIVE)
1382     return _mm_max_epu32(a, b);
1383   #else
1384     simde__m128i_private
1385       r_,
1386       a_ = simde__m128i_to_private(a),
1387       b_ = simde__m128i_to_private(b);
1388 
1389     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1390       r_.neon_u32 = vmaxq_u32(a_.neon_u32, b_.neon_u32);
1391     #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1392       r_.altivec_u32 = vec_max(a_.altivec_u32, b_.altivec_u32);
1393     #else
1394       SIMDE_VECTORIZE
1395       for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
1396         r_.u32[i] = a_.u32[i] > b_.u32[i] ? a_.u32[i] : b_.u32[i];
1397       }
1398     #endif
1399 
1400     return simde__m128i_from_private(r_);
1401   #endif
1402 }
1403 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1404   #undef _mm_max_epu32
1405   #define _mm_max_epu32(a, b) simde_mm_max_epu32(a, b)
1406 #endif
1407 
1408 SIMDE_FUNCTION_ATTRIBUTES
1409 simde__m128i
1410 simde_mm_min_epi8 (simde__m128i a, simde__m128i b) {
1411 #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI)
1412   return _mm_min_epi8(a, b);
1413 #else
1414   simde__m128i_private
1415     r_,
1416     a_ = simde__m128i_to_private(a),
1417     b_ = simde__m128i_to_private(b);
1418 
1419 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1420   r_.neon_i8 = vminq_s8(a_.neon_i8, b_.neon_i8);
1421 #else
1422   SIMDE_VECTORIZE
1423   for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1424     r_.i8[i] = a_.i8[i] < b_.i8[i] ? a_.i8[i] : b_.i8[i];
1425   }
1426 #endif
1427 
1428   return simde__m128i_from_private(r_);
1429 #endif
1430 }
1431 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1432   #undef _mm_min_epi8
1433   #define _mm_min_epi8(a, b) simde_mm_min_epi8(a, b)
1434 #endif
1435 
1436 SIMDE_FUNCTION_ATTRIBUTES
1437 simde__m128i
1438 simde_mm_min_epi32 (simde__m128i a, simde__m128i b) {
1439 #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI)
1440   return _mm_min_epi32(a, b);
1441 #else
1442   simde__m128i_private
1443     r_,
1444     a_ = simde__m128i_to_private(a),
1445     b_ = simde__m128i_to_private(b);
1446 
1447   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1448     r_.neon_i32 = vminq_s32(a_.neon_i32, b_.neon_i32);
1449   #else
1450     SIMDE_VECTORIZE
1451     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1452       r_.i32[i] = a_.i32[i] < b_.i32[i] ? a_.i32[i] : b_.i32[i];
1453     }
1454   #endif
1455 
1456   return simde__m128i_from_private(r_);
1457 #endif
1458 }
1459 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1460   #undef _mm_min_epi32
1461   #define _mm_min_epi32(a, b) simde_mm_min_epi32(a, b)
1462 #endif
1463 
1464 SIMDE_FUNCTION_ATTRIBUTES
1465 simde__m128i
1466 simde_mm_min_epu16 (simde__m128i a, simde__m128i b) {
1467 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1468   return _mm_min_epu16(a, b);
1469 #else
1470   simde__m128i_private
1471     r_,
1472     a_ = simde__m128i_to_private(a),
1473     b_ = simde__m128i_to_private(b);
1474 
1475 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1476   r_.neon_u16 = vminq_u16(a_.neon_u16, b_.neon_u16);
1477 #else
1478   SIMDE_VECTORIZE
1479   for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
1480     r_.u16[i] = a_.u16[i] < b_.u16[i] ? a_.u16[i] : b_.u16[i];
1481   }
1482 #endif
1483 
1484   return simde__m128i_from_private(r_);
1485 #endif
1486 }
1487 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1488   #undef _mm_min_epu16
1489   #define _mm_min_epu16(a, b) simde_mm_min_epu16(a, b)
1490 #endif
1491 
1492 SIMDE_FUNCTION_ATTRIBUTES
1493 simde__m128i
1494 simde_mm_min_epu32 (simde__m128i a, simde__m128i b) {
1495 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1496   return _mm_min_epu32(a, b);
1497 #else
1498   simde__m128i_private
1499     r_,
1500     a_ = simde__m128i_to_private(a),
1501     b_ = simde__m128i_to_private(b);
1502 
1503 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1504   r_.neon_u32 = vminq_u32(a_.neon_u32, b_.neon_u32);
1505 #else
1506   SIMDE_VECTORIZE
1507   for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
1508     r_.u32[i] = a_.u32[i] < b_.u32[i] ? a_.u32[i] : b_.u32[i];
1509   }
1510 #endif
1511 
1512   return simde__m128i_from_private(r_);
1513 #endif
1514 }
1515 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1516   #undef _mm_min_epu32
1517   #define _mm_min_epu32(a, b) simde_mm_min_epu32(a, b)
1518 #endif
1519 
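/* _mm_minpos_epu16: returns the smallest of the eight unsigned 16-bit
 * elements of `a` in lane 0 and the index of that element in lane 1; the
 * remaining lanes are zero. */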
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_minpos_epu16 (simde__m128i a) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_minpos_epu16(a);
#else
  simde__m128i_private
    r_ = simde__m128i_to_private(simde_mm_setzero_si128()),
    a_ = simde__m128i_to_private(a);

  r_.u16[0] = UINT16_MAX;
  for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
    if (a_.u16[i] < r_.u16[0]) {
      r_.u16[0] = a_.u16[i];
      r_.u16[1] = HEDLEY_STATIC_CAST(uint16_t, i);
    }
  }

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
  #undef _mm_minpos_epu16
  #define _mm_minpos_epu16(a) simde_mm_minpos_epu16(a)
#endif

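/* _mm_mpsadbw_epu8: eight sums of absolute differences.  Bit 2 of imm8
 * selects the starting byte offset in `a` (0 or 4); bits 1:0 select a
 * 4-byte block of `b` (offset 0, 4, 8 or 12).  Result element i is the
 * SAD of a[a_offset+i .. a_offset+i+3] against that block. */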
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_mpsadbw_epu8 (simde__m128i a, simde__m128i b, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 7)  {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  const int a_offset = imm8 & 4;
  const int b_offset = (imm8 & 3) << 2;

#if defined(simde_math_abs)
  for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, (sizeof(r_.u16) / sizeof(r_.u16[0]))) ; i++) {
    r_.u16[i] =
      HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 0] - b_.u8[b_offset + 0]))) +
      HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 1] - b_.u8[b_offset + 1]))) +
      HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 2] - b_.u8[b_offset + 2]))) +
      HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 3] - b_.u8[b_offset + 3])));
  }
#else
  HEDLEY_UNREACHABLE();
#endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE4_1_NATIVE)
#  define simde_mm_mpsadbw_epu8(a, b, imm8) _mm_mpsadbw_epu8(a, b, imm8)
#endif
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
  #undef _mm_mpsadbw_epu8
  #define _mm_mpsadbw_epu8(a, b, imm8) simde_mm_mpsadbw_epu8(a, b, imm8)
#endif

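/* _mm_mul_epi32: multiplies the even-indexed (0 and 2) signed 32-bit
 * elements of `a` and `b`, producing two full 64-bit products. */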
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_mul_epi32 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_mul_epi32(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    // vmull_s32 upcasts instead of masking, so we downcast.
    int32x2_t a_lo = vmovn_s64(a_.neon_i64);
    int32x2_t b_lo = vmovn_s64(b_.neon_i64);
    r_.neon_i64 = vmull_s32(a_lo, b_lo);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
      r_.i64[i] =
        HEDLEY_STATIC_CAST(int64_t, a_.i32[i * 2]) *
        HEDLEY_STATIC_CAST(int64_t, b_.i32[i * 2]);
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
  #undef _mm_mul_epi32
  #define _mm_mul_epi32(a, b) simde_mm_mul_epi32(a, b)
#endif

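/* _mm_mullo_epi32: element-wise 32-bit multiply keeping the low 32 bits of
 * each product.  The scalar fallback widens to 64 bits before multiplying
 * to avoid signed-overflow undefined behaviour. */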
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_mullo_epi32 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_mullo_epi32(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i32 = vmulq_s32(a_.neon_i32, b_.neon_i32);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    (void) a_;
    (void) b_;
    r_.altivec_i32 = vec_mul(a_.altivec_i32, b_.altivec_i32);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (HEDLEY_STATIC_CAST(uint64_t, (HEDLEY_STATIC_CAST(int64_t, a_.i32[i]) * HEDLEY_STATIC_CAST(int64_t, b_.i32[i]))) & 0xffffffff));
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
  #undef _mm_mullo_epi32
  #define _mm_mullo_epi32(a, b) simde_mm_mullo_epi32(a, b)
#endif

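/* simde_x_mm_mullo_epu32: the simde_x_ prefix marks this as a SIMDe
 * extension rather than a standard intrinsic; it is the unsigned
 * counterpart of simde_mm_mullo_epi32. */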
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_mullo_epu32 (simde__m128i a, simde__m128i b) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.u32 = a_.u32 * b_.u32;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
      r_.u32[i] = a_.u32[i] * b_.u32[i];
    }
  #endif

  return simde__m128i_from_private(r_);
}

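/* _mm_packus_epi32: converts the signed 32-bit elements of `a` and `b` to
 * unsigned 16-bit with saturation (negative values become 0, values above
 * 0xFFFF become 0xFFFF); `a` fills result lanes 0-3 and `b` lanes 4-7. */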
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_packus_epi32 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_packus_epi32(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    const int32x4_t z = vdupq_n_s32(0);
    r_.neon_u16 = vcombine_u16(
        vqmovn_u32(vreinterpretq_u32_s32(vmaxq_s32(z, a_.neon_i32))),
        vqmovn_u32(vreinterpretq_u32_s32(vmaxq_s32(z, b_.neon_i32))));
  #else
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.u16[i + 0] = (a_.i32[i] < 0) ? UINT16_C(0) : ((a_.i32[i] > UINT16_MAX) ? (UINT16_MAX) : HEDLEY_STATIC_CAST(uint16_t, a_.i32[i]));
      r_.u16[i + 4] = (b_.i32[i] < 0) ? UINT16_C(0) : ((b_.i32[i] > UINT16_MAX) ? (UINT16_MAX) : HEDLEY_STATIC_CAST(uint16_t, b_.i32[i]));
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
  #undef _mm_packus_epi32
  #define _mm_packus_epi32(a, b) simde_mm_packus_epi32(a, b)
#endif

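/* _mm_round_sd: rounds the low double of `b` according to `rounding`
 * (nearest, down, up, toward zero, or the current rounding mode) and takes
 * the upper double from `a`.  Modes whose libm helper is unavailable fall
 * through to the unreachable default. */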
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_round_sd (simde__m128d a, simde__m128d b, int rounding)
    SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) {
  simde__m128d_private
    r_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
    #if defined(simde_math_nearbyint)
      case SIMDE_MM_FROUND_TO_NEAREST_INT:
      case SIMDE_MM_FROUND_CUR_DIRECTION:
        r_.f64[0] = simde_math_nearbyint(b_.f64[0]);
        break;
    #endif

    #if defined(simde_math_floor)
      case SIMDE_MM_FROUND_TO_NEG_INF:
        r_.f64[0] = simde_math_floor(b_.f64[0]);
        break;
    #endif

    #if defined(simde_math_ceil)
      case SIMDE_MM_FROUND_TO_POS_INF:
        r_.f64[0] = simde_math_ceil(b_.f64[0]);
        break;
    #endif

    #if defined(simde_math_trunc)
      case SIMDE_MM_FROUND_TO_ZERO:
        r_.f64[0] = simde_math_trunc(b_.f64[0]);
        break;
    #endif

    default:
      HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
  }

  return simde__m128d_from_private(r_);
}
#if defined(SIMDE_X86_SSE4_1_NATIVE)
#  define simde_mm_round_sd(a, b, rounding) _mm_round_sd(a, b, rounding)
#endif
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
  #undef _mm_round_sd
  #define _mm_round_sd(a, b, rounding) simde_mm_round_sd(a, b, rounding)
#endif

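/* _mm_round_ss: single-precision counterpart of _mm_round_sd; rounds the
 * low float of `b` and takes the upper three floats from `a`. */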
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_round_ss (simde__m128 a, simde__m128 b, int rounding)
    SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) {
  simde__m128_private
    r_ = simde__m128_to_private(a),
    b_ = simde__m128_to_private(b);

  switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
    #if defined(simde_math_nearbyintf)
      case SIMDE_MM_FROUND_TO_NEAREST_INT:
      case SIMDE_MM_FROUND_CUR_DIRECTION:
        r_.f32[0] = simde_math_nearbyintf(b_.f32[0]);
        break;
    #endif

    #if defined(simde_math_floorf)
      case SIMDE_MM_FROUND_TO_NEG_INF:
        r_.f32[0] = simde_math_floorf(b_.f32[0]);
        break;
    #endif

    #if defined(simde_math_ceilf)
      case SIMDE_MM_FROUND_TO_POS_INF:
        r_.f32[0] = simde_math_ceilf(b_.f32[0]);
        break;
    #endif

    #if defined(simde_math_truncf)
      case SIMDE_MM_FROUND_TO_ZERO:
        r_.f32[0] = simde_math_truncf(b_.f32[0]);
        break;
    #endif

    default:
      /* This function returns a simde__m128, so the single-precision
       * "undefined" helper is the right fallback value. */
      HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_ps());
  }

  return simde__m128_from_private(r_);
}
#if defined(SIMDE_X86_SSE4_1_NATIVE)
#  define simde_mm_round_ss(a, b, rounding) _mm_round_ss(a, b, rounding)
#endif
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
  #undef _mm_round_ss
  #define _mm_round_ss(a, b, rounding) simde_mm_round_ss(a, b, rounding)
#endif

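/* _mm_stream_load_si128: load with a non-temporal (cache-bypassing) hint.
 * The hint only affects performance, so the portable fallback is an
 * ordinary load. */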
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_stream_load_si128 (const simde__m128i* mem_addr) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_stream_load_si128(HEDLEY_CONST_CAST(simde__m128i*, mem_addr));
#else
  return *mem_addr;
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
  #undef _mm_stream_load_si128
  #define _mm_stream_load_si128(mem_addr) simde_mm_stream_load_si128(mem_addr)
#endif

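/* _mm_test_all_ones: returns 1 if every bit of `a` is set, otherwise 0. */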
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_test_all_ones (simde__m128i a) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_test_all_ones(a);
#else
  simde__m128i_private a_ = simde__m128i_to_private(a);

  for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
    if (a_.u64[i] != ~UINT64_C(0))
      return 0;
  }

  return 1;
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
  #undef _mm_test_all_ones
  #define _mm_test_all_ones(a) simde_mm_test_all_ones(a)
#endif

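/* _mm_test_all_zeros: returns 1 if `a & mask` is zero across all 128 bits
 * (the ZF result of PTEST), otherwise 0. */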
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_test_all_zeros (simde__m128i a, simde__m128i mask) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_test_all_zeros(a, mask);
#else
  simde__m128i_private
    a_ = simde__m128i_to_private(a),
    mask_ = simde__m128i_to_private(mask);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    int64x2_t a_and_mask = vandq_s64(a_.neon_i64, mask_.neon_i64);
    return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0 : 1;
  #else
    for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
      if ((a_.u64[i] & mask_.u64[i]) != 0)
        return 0;
    }
  #endif

  return 1;
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
  #undef _mm_test_all_zeros
  #define _mm_test_all_zeros(a, mask) simde_mm_test_all_zeros(a, mask)
#endif

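/* _mm_test_mix_ones_zeros: returns 1 if `mask` selects at least one set bit
 * and at least one clear bit of `a`, i.e. both the ZF and CF results of
 * PTEST are 0. */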
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_test_mix_ones_zeros (simde__m128i a, simde__m128i mask) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_test_mix_ones_zeros(a, mask);
#else
  simde__m128i_private
    a_ = simde__m128i_to_private(a),
    mask_ = simde__m128i_to_private(mask);

  /* Accumulate over the full 128 bits: mask may select set bits in one
   * 64-bit lane and clear bits in the other. */
  uint64_t ones = 0, zeros = 0;
  for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
    ones  |=  a_.u64[i] & mask_.u64[i];
    zeros |= ~a_.u64[i] & mask_.u64[i];
  }

  return (ones != 0) && (zeros != 0);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
  #undef _mm_test_mix_ones_zeros
  #define _mm_test_mix_ones_zeros(a, mask) simde_mm_test_mix_ones_zeros(a, mask)
#endif

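/* _mm_testc_si128: returns the CF result of PTEST, i.e. 1 if every bit set
 * in `b` is also set in `a` (`~a & b` is zero), otherwise 0. */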
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_testc_si128 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_testc_si128(a, b);
#else
  simde__m128i_private
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  int_fast32_t r = 0;

  SIMDE_VECTORIZE_REDUCTION(|:r)
  for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) {
    r |= ~a_.i32f[i] & b_.i32f[i];
  }

  return HEDLEY_STATIC_CAST(int, !r);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
  #undef _mm_testc_si128
  #define _mm_testc_si128(a, b) simde_mm_testc_si128(a, b)
#endif

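/* _mm_testnzc_si128: returns 1 if `a & b` is non-zero and `~a & b` is
 * non-zero (ZF == 0 and CF == 0), otherwise 0. */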
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_testnzc_si128 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_testnzc_si128(a, b);
#else
  simde__m128i_private
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  /* ZF and CF are computed over the whole 128-bit value, so accumulate
   * across both 64-bit lanes before testing. */
  uint64_t and_ = 0, andnot_ = 0;
  for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
    and_    |=  a_.u64[i] & b_.u64[i];
    andnot_ |= ~a_.u64[i] & b_.u64[i];
  }

  return (and_ != 0) && (andnot_ != 0);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
  #undef _mm_testnzc_si128
  #define _mm_testnzc_si128(a, b) simde_mm_testnzc_si128(a, b)
#endif

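/* _mm_testz_si128: returns the ZF result of PTEST, i.e. 1 if `a & b` is
 * zero across all 128 bits, otherwise 0. */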
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_testz_si128 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_testz_si128(a, b);
#else
  simde__m128i_private
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  /* ZF is set only when a & b is zero across the entire 128 bits, so any
   * non-zero lane means the result is 0. */
  for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
    if ((a_.u64[i] & b_.u64[i]) != 0)
      return 0;
  }

  return 1;
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
  #undef _mm_testz_si128
  #define _mm_testz_si128(a, b) simde_mm_testz_si128(a, b)
#endif

SIMDE_END_DECLS_

HEDLEY_DIAGNOSTIC_POP

#endif /* !defined(SIMDE_X86_SSE4_1_H) */