/* SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy,
 * modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Copyright:
 *   2017-2020 Evan Nemerson <evan@nemerson.com>
 */

#if !defined(SIMDE_X86_SSE4_1_H)
#define SIMDE_X86_SSE4_1_H

#include "sse.h"
#include "ssse3.h"

HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_

#if !defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES)
# define SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES
#endif

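/* In the compile-time blends below, bit i of imm8 selects lane i of the
 * result: from b when the bit is set, from a when it is clear.
 * Illustrative usage (ours, not part of the original source):
 *   simde__m128i r = simde_mm_blend_epi16(a, b, 0x0F);
 * takes the low four 16-bit lanes from b and the high four from a. */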
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_blend_epi16 (simde__m128i a, simde__m128i b, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
    r_.u16[i] = ((imm8 >> i) & 1) ? b_.u16[i] : a_.u16[i];
  }

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE4_1_NATIVE)
# define simde_mm_blend_epi16(a, b, imm8) _mm_blend_epi16(a, b, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
# define simde_mm_blend_epi16(a, b, imm8) \
    (__extension__ ({ \
      const uint16_t _mask[8] = { \
        ((imm8) & (1 << 0)) ? 0xFFFF : 0x0000, \
        ((imm8) & (1 << 1)) ? 0xFFFF : 0x0000, \
        ((imm8) & (1 << 2)) ? 0xFFFF : 0x0000, \
        ((imm8) & (1 << 3)) ? 0xFFFF : 0x0000, \
        ((imm8) & (1 << 4)) ? 0xFFFF : 0x0000, \
        ((imm8) & (1 << 5)) ? 0xFFFF : 0x0000, \
        ((imm8) & (1 << 6)) ? 0xFFFF : 0x0000, \
        ((imm8) & (1 << 7)) ? 0xFFFF : 0x0000 \
      }; \
      uint16x8_t _mask_vec = vld1q_u16(_mask); \
      simde__m128i_from_neon_u16(vbslq_u16(_mask_vec, simde__m128i_to_neon_u16(b), simde__m128i_to_neon_u16(a))); \
    }))
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
# define simde_mm_blend_epi16(a, b, imm8) \
    (__extension__ ({ \
      const vector unsigned short _mask = { \
        ((imm8) & (1 << 0)) ? 0xFFFF : 0x0000, \
        ((imm8) & (1 << 1)) ? 0xFFFF : 0x0000, \
        ((imm8) & (1 << 2)) ? 0xFFFF : 0x0000, \
        ((imm8) & (1 << 3)) ? 0xFFFF : 0x0000, \
        ((imm8) & (1 << 4)) ? 0xFFFF : 0x0000, \
        ((imm8) & (1 << 5)) ? 0xFFFF : 0x0000, \
        ((imm8) & (1 << 6)) ? 0xFFFF : 0x0000, \
        ((imm8) & (1 << 7)) ? 0xFFFF : 0x0000 \
      }; \
      simde__m128i_from_altivec_u16(vec_sel(simde__m128i_to_altivec_u16(a), simde__m128i_to_altivec_u16(b), _mask)); \
    }))
#endif
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_blend_epi16
#define _mm_blend_epi16(a, b, imm8) simde_mm_blend_epi16(a, b, imm8)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_blend_pd (simde__m128d a, simde__m128d b, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 3) {
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
    r_.f64[i] = ((imm8 >> i) & 1) ? b_.f64[i] : a_.f64[i];
  }
  return simde__m128d_from_private(r_);
}
#if defined(SIMDE_X86_SSE4_1_NATIVE)
# define simde_mm_blend_pd(a, b, imm8) _mm_blend_pd(a, b, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
# define simde_mm_blend_pd(a, b, imm8) \
    (__extension__ ({ \
      const uint64_t _mask[2] = { \
        ((imm8) & (1 << 0)) ? UINT64_MAX : 0, \
        ((imm8) & (1 << 1)) ? UINT64_MAX : 0 \
      }; \
      uint64x2_t _mask_vec = vld1q_u64(_mask); \
      simde__m128d_from_neon_u64(vbslq_u64(_mask_vec, simde__m128d_to_neon_u64(b), simde__m128d_to_neon_u64(a))); \
    }))
#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
# define simde_mm_blend_pd(a, b, imm8) \
    (__extension__ ({ \
      const vector unsigned long long _mask = { \
        ((imm8) & (1 << 0)) ? UINT64_MAX : 0, \
        ((imm8) & (1 << 1)) ? UINT64_MAX : 0 \
      }; \
      simde__m128d_from_altivec_f64(vec_sel(simde__m128d_to_altivec_f64(a), simde__m128d_to_altivec_f64(b), _mask)); \
    }))
#endif
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_blend_pd
#define _mm_blend_pd(a, b, imm8) simde_mm_blend_pd(a, b, imm8)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_blend_ps (simde__m128 a, simde__m128 b, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 15) {
  simde__m128_private
    r_,
    a_ = simde__m128_to_private(a),
    b_ = simde__m128_to_private(b);

  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
    r_.f32[i] = ((imm8 >> i) & 1) ? b_.f32[i] : a_.f32[i];
  }
  return simde__m128_from_private(r_);
}
#if defined(SIMDE_X86_SSE4_1_NATIVE)
# define simde_mm_blend_ps(a, b, imm8) _mm_blend_ps(a, b, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
# define simde_mm_blend_ps(a, b, imm8) \
    (__extension__ ({ \
      const uint32_t _mask[4] = { \
        ((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
        ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
        ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
        ((imm8) & (1 << 3)) ? UINT32_MAX : 0 \
      }; \
      uint32x4_t _mask_vec = vld1q_u32(_mask); \
      simde__m128_from_neon_f32(vbslq_f32(_mask_vec, simde__m128_to_neon_f32(b), simde__m128_to_neon_f32(a))); \
    }))
#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
# define simde_mm_blend_ps(a, b, imm8) \
    (__extension__ ({ \
      const vector unsigned int _mask = { \
        ((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
        ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
        ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
        ((imm8) & (1 << 3)) ? UINT32_MAX : 0 \
      }; \
      simde__m128_from_altivec_f32(vec_sel(simde__m128_to_altivec_f32(a), simde__m128_to_altivec_f32(b), _mask)); \
    }))
#endif
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_blend_ps
#define _mm_blend_ps(a, b, imm8) simde_mm_blend_ps(a, b, imm8)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_blendv_epi8 (simde__m128i a, simde__m128i b, simde__m128i mask) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_blendv_epi8(a, b, mask);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b),
    mask_ = simde__m128i_to_private(mask);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* Use a signed shift right to create a mask with the sign bit. */
    mask_.neon_i8 = vshrq_n_s8(mask_.neon_i8, 7);
    r_.neon_i8 = vbslq_s8(mask_.neon_u8, b_.neon_i8, a_.neon_i8);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_i8 = vec_sel(a_.altivec_i8, b_.altivec_i8, vec_cmplt(mask_.altivec_i8, vec_splat_s8(0)));
  #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    /* https://software.intel.com/en-us/forums/intel-c-compiler/topic/850087 */
    #if defined(HEDLEY_INTEL_VERSION_CHECK)
      __typeof__(mask_.i8) z = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
      mask_.i8 = HEDLEY_STATIC_CAST(__typeof__(mask_.i8), mask_.i8 < z);
    #else
      mask_.i8 >>= (CHAR_BIT * sizeof(mask_.i8[0])) - 1;
    #endif

    r_.i8 = (mask_.i8 & b_.i8) | (~mask_.i8 & a_.i8);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
      int8_t m = mask_.i8[i] >> 7;
      r_.i8[i] = (m & b_.i8[i]) | (~m & a_.i8[i]);
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_blendv_epi8
#define _mm_blendv_epi8(a, b, mask) simde_mm_blendv_epi8(a, b, mask)
#endif

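/* The simde_x_mm_blendv_epi16/epi32/epi64 helpers below have no Intel
 * equivalent (the simde_x_ prefix marks SIMDe-specific extensions); they
 * generalize the byte-wise variable blend to wider lanes and are reused
 * to implement simde_mm_blendv_pd and simde_mm_blendv_ps further down. */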
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_blendv_epi16 (simde__m128i a, simde__m128i b, simde__m128i mask) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  mask = simde_mm_srai_epi16(mask, 15);
  return simde_mm_or_si128(simde_mm_and_si128(mask, b), simde_mm_andnot_si128(mask, a));
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b),
    mask_ = simde__m128i_to_private(mask);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    mask_ = simde__m128i_to_private(simde_mm_cmplt_epi16(mask, simde_mm_setzero_si128()));
    r_.neon_i16 = vbslq_s16(mask_.neon_u16, b_.neon_i16, a_.neon_i16);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_i16 = vec_sel(a_.altivec_i16, b_.altivec_i16, vec_cmplt(mask_.altivec_i16, vec_splat_s16(0)));
  #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    #if defined(HEDLEY_INTEL_VERSION_CHECK)
      __typeof__(mask_.i16) z = { 0, 0, 0, 0, 0, 0, 0, 0 };
      mask_.i16 = HEDLEY_STATIC_CAST(__typeof__(mask_.i16), mask_.i16 < z);
    #else
      mask_.i16 >>= (CHAR_BIT * sizeof(mask_.i16[0])) - 1;
    #endif

    r_.i16 = (mask_.i16 & b_.i16) | (~mask_.i16 & a_.i16);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
      int16_t m = mask_.i16[i] >> 15;
      r_.i16[i] = (m & b_.i16[i]) | (~m & a_.i16[i]);
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_blendv_epi32 (simde__m128i a, simde__m128i b, simde__m128i mask) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(mask)));
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b),
    mask_ = simde__m128i_to_private(mask);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    mask_ = simde__m128i_to_private(simde_mm_cmplt_epi32(mask, simde_mm_setzero_si128()));
    r_.neon_i32 = vbslq_s32(mask_.neon_u32, b_.neon_i32, a_.neon_i32);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_i32 = vec_sel(a_.altivec_i32, b_.altivec_i32, vec_cmplt(mask_.altivec_i32, vec_splat_s32(0)));
  #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    #if defined(HEDLEY_INTEL_VERSION_CHECK)
      __typeof__(mask_.i32) z = { 0, 0, 0, 0 };
      mask_.i32 = HEDLEY_STATIC_CAST(__typeof__(mask_.i32), mask_.i32 < z);
    #else
      mask_.i32 >>= (CHAR_BIT * sizeof(mask_.i32[0])) - 1;
    #endif

    r_.i32 = (mask_.i32 & b_.i32) | (~mask_.i32 & a_.i32);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      int32_t m = mask_.i32[i] >> 31;
      r_.i32[i] = (m & b_.i32[i]) | (~m & a_.i32[i]);
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_blendv_epi64 (simde__m128i a, simde__m128i b, simde__m128i mask) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_castpd_si128(_mm_blendv_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b), _mm_castsi128_pd(mask)));
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b),
    mask_ = simde__m128i_to_private(mask);

  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    mask_.neon_u64 = vcltq_s64(mask_.neon_i64, vdupq_n_s64(INT64_C(0)));
    r_.neon_i64 = vbslq_s64(mask_.neon_u64, b_.neon_i64, a_.neon_i64);
  #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    r_.altivec_i64 = vec_sel(a_.altivec_i64, b_.altivec_i64,
                             vec_cmplt(mask_.altivec_i64, vec_splats(HEDLEY_STATIC_CAST(signed long long, 0))));
  #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    #if defined(HEDLEY_INTEL_VERSION_CHECK)
      __typeof__(mask_.i64) z = { 0, 0 };
      mask_.i64 = HEDLEY_STATIC_CAST(__typeof__(mask_.i64), mask_.i64 < z);
    #else
      mask_.i64 >>= (CHAR_BIT * sizeof(mask_.i64[0])) - 1;
    #endif

    r_.i64 = (mask_.i64 & b_.i64) | (~mask_.i64 & a_.i64);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
      int64_t m = mask_.i64[i] >> 63;
      r_.i64[i] = (m & b_.i64[i]) | (~m & a_.i64[i]);
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}

SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_blendv_pd (simde__m128d a, simde__m128d b, simde__m128d mask) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_blendv_pd(a, b, mask);
#else
  return simde_mm_castsi128_pd(simde_x_mm_blendv_epi64(simde_mm_castpd_si128(a), simde_mm_castpd_si128(b), simde_mm_castpd_si128(mask)));
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_blendv_pd
#define _mm_blendv_pd(a, b, mask) simde_mm_blendv_pd(a, b, mask)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_blendv_ps (simde__m128 a, simde__m128 b, simde__m128 mask) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_blendv_ps(a, b, mask);
#else
  return simde_mm_castsi128_ps(simde_x_mm_blendv_epi32(simde_mm_castps_si128(a), simde_mm_castps_si128(b), simde_mm_castps_si128(mask)));
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_blendv_ps
#define _mm_blendv_ps(a, b, mask) simde_mm_blendv_ps(a, b, mask)
#endif

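/* SIMDE_MM_FROUND_* mirrors Intel's _MM_FROUND_* layout: the low two bits
 * select the rounding mode (0 to nearest, 1 toward negative infinity, 2
 * toward positive infinity, 3 toward zero), bit 2
 * (SIMDE_MM_FROUND_CUR_DIRECTION) defers to the current MXCSR/FPCR mode,
 * and bit 3 (SIMDE_MM_FROUND_NO_EXC) only suppresses precision
 * exceptions, which is why it is masked off before the switch below. */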
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_round_pd (simde__m128d a, int rounding)
    SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) {
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a);

  /* For architectures which lack a current direction SIMD instruction. */
  #if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    if ((rounding & 7) == SIMDE_MM_FROUND_CUR_DIRECTION)
      rounding = HEDLEY_STATIC_CAST(int, SIMDE_MM_GET_ROUNDING_MODE()) << 13;
  #endif

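  /* Note: the SIMDE_ARM_NEON_A64V8_NATIVE branches below are present but
   * deliberately compiled out via "&& 0"; the vrnd*q_f64 paths appear to
   * be kept for reference pending validation (that rationale is our
   * reading; the original states none). */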
  switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
    case SIMDE_MM_FROUND_CUR_DIRECTION:
      #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
        r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.altivec_f64));
      #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && 0
        r_.neon_f64 = vrndiq_f64(a_.neon_f64);
      #elif defined(simde_math_nearbyint)
        SIMDE_VECTORIZE
        for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
          r_.f64[i] = simde_math_nearbyint(a_.f64[i]);
        }
      #else
        HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
      #endif
      break;

    case SIMDE_MM_FROUND_TO_NEAREST_INT:
      #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
        r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.altivec_f64));
      #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && 0
        r_.neon_f64 = vrndaq_f64(a_.neon_f64);
      #elif defined(simde_math_round)
        SIMDE_VECTORIZE
        for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
          r_.f64[i] = simde_math_round(a_.f64[i]);
        }
      #else
        HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
      #endif
      break;

    case SIMDE_MM_FROUND_TO_NEG_INF:
      #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
        r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_floor(a_.altivec_f64));
      #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && 0
        r_.neon_f64 = vrndmq_f64(a_.neon_f64);
      #else
        SIMDE_VECTORIZE
        for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
          r_.f64[i] = simde_math_floor(a_.f64[i]);
        }
      #endif
      break;

    case SIMDE_MM_FROUND_TO_POS_INF:
      #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
        r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_ceil(a_.altivec_f64));
      #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && 0
        r_.neon_f64 = vrndpq_f64(a_.neon_f64);
      #elif defined(simde_math_ceil)
        SIMDE_VECTORIZE
        for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
          r_.f64[i] = simde_math_ceil(a_.f64[i]);
        }
      #else
        HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
      #endif
      break;

    case SIMDE_MM_FROUND_TO_ZERO:
      #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
        r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_trunc(a_.altivec_f64));
      #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && 0
        r_.neon_f64 = vrndq_f64(a_.neon_f64);
      #else
        SIMDE_VECTORIZE
        for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
          r_.f64[i] = simde_math_trunc(a_.f64[i]);
        }
      #endif
      break;

    default:
      HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
  }

  return simde__m128d_from_private(r_);
}
#if defined(SIMDE_X86_SSE4_1_NATIVE)
#define simde_mm_round_pd(a, rounding) _mm_round_pd(a, rounding)
#endif
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_round_pd
#define _mm_round_pd(a, rounding) simde_mm_round_pd(a, rounding)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_ceil_pd (simde__m128d a) {
  return simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_POS_INF);
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_ceil_pd
#define _mm_ceil_pd(a) simde_mm_ceil_pd(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_ceil_ps (simde__m128 a) {
  return simde_mm_round_ps(a, SIMDE_MM_FROUND_TO_POS_INF);
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_ceil_ps
#define _mm_ceil_ps(a) simde_mm_ceil_ps(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_ceil_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_ceil_sd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  #if defined(simde_math_ceil)
    r_ = simde__m128d_to_private(simde_mm_set_pd(a_.f64[1], simde_math_ceil(b_.f64[0])));
  #else
    HEDLEY_UNREACHABLE();
  #endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_ceil_sd
#define _mm_ceil_sd(a, b) simde_mm_ceil_sd(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_ceil_ss (simde__m128 a, simde__m128 b) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_ceil_ss(a, b);
#elif defined(SIMDE_ASSUME_VECTORIZATION)
  return simde_mm_move_ss(a, simde_mm_ceil_ps(b));
#else
  simde__m128_private
    r_,
    a_ = simde__m128_to_private(a),
    b_ = simde__m128_to_private(b);

  #if defined(simde_math_ceilf)
    r_ = simde__m128_to_private(simde_mm_set_ps(a_.f32[3], a_.f32[2], a_.f32[1], simde_math_ceilf(b_.f32[0])));
  #else
    HEDLEY_UNREACHABLE();
  #endif

  return simde__m128_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_ceil_ss
#define _mm_ceil_ss(a, b) simde_mm_ceil_ss(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmpeq_epi64 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_cmpeq_epi64(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    r_.neon_u64 = vceqq_u64(a_.neon_u64, b_.neon_u64);
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) */
    uint32x4_t cmp = vceqq_u32(a_.neon_u32, b_.neon_u32);
    uint32x4_t swapped = vrev64q_u32(cmp);
    r_.neon_u32 = vandq_u32(cmp, swapped);
  #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), a_.i64 == b_.i64);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_i64 = (vector signed long long) vec_cmpeq(a_.altivec_i64, b_.altivec_i64);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
      r_.u64[i] = (a_.u64[i] == b_.u64[i]) ? ~UINT64_C(0) : UINT64_C(0);
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_cmpeq_epi64
#define _mm_cmpeq_epi64(a, b) simde_mm_cmpeq_epi64(a, b)
#endif

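/* The simde_mm_cvtep{i,u}8/16/32 conversions below sign- or zero-extend
 * the low elements of a to a wider lane width.  The NEON paths chain
 * vmovl_* widenings over the low half of the register; the lane diagrams
 * in the comments (A, B, C, D for source lanes, x for don't-care) trace
 * which elements survive each step. */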
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvtepi8_epi16 (simde__m128i a) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_cvtepi8_epi16(a);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    int8x16_t s8x16 = a_.neon_i8;                   /* xxxx xxxx xxxx DCBA */
    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
    r_.neon_i16 = s16x8;
  #elif defined(SIMDE_CONVERT_VECTOR_)
    SIMDE_CONVERT_VECTOR_(r_.i16, a_.m64_private[0].i8);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
      r_.i16[i] = a_.i8[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_cvtepi8_epi16
#define _mm_cvtepi8_epi16(a) simde_mm_cvtepi8_epi16(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvtepi8_epi32 (simde__m128i a) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_cvtepi8_epi32(a);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    int8x16_t s8x16 = a_.neon_i8;                      /* xxxx xxxx xxxx DCBA */
    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));    /* 0x0x 0x0x 0D0C 0B0A */
    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8));  /* 000D 000C 000B 000A */
    r_.neon_i32 = s32x4;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.i32[i] = a_.i8[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_cvtepi8_epi32
#define _mm_cvtepi8_epi32(a) simde_mm_cvtepi8_epi32(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvtepi8_epi64 (simde__m128i a) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_cvtepi8_epi64(a);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    int8x16_t s8x16 = a_.neon_i8;                      /* xxxx xxxx xxxx xxBA */
    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));    /* 0x0x 0x0x 0x0x 0B0A */
    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8));  /* 000x 000x 000B 000A */
    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4));  /* 0000 000B 0000 000A */
    r_.neon_i64 = s64x2;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
      r_.i64[i] = a_.i8[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_cvtepi8_epi64
#define _mm_cvtepi8_epi64(a) simde_mm_cvtepi8_epi64(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvtepu8_epi16 (simde__m128i a) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_cvtepu8_epi16(a);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    uint8x16_t u8x16 = a_.neon_u8;                    /* xxxx xxxx xxxx DCBA */
    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));  /* 0x0x 0x0x 0D0C 0B0A */
    r_.neon_u16 = u16x8;
  #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_CLANG_45541) && (!defined(SIMDE_ARCH_POWER) || !defined(__clang__))
    SIMDE_CONVERT_VECTOR_(r_.i16, a_.m64_private[0].u8);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
      r_.i16[i] = a_.u8[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_cvtepu8_epi16
#define _mm_cvtepu8_epi16(a) simde_mm_cvtepu8_epi16(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvtepu8_epi32 (simde__m128i a) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_cvtepu8_epi32(a);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    uint8x16_t u8x16 = a_.neon_u8;                      /* xxxx xxxx xxxx DCBA */
    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));    /* 0x0x 0x0x 0D0C 0B0A */
    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8));  /* 000D 000C 000B 000A */
    r_.neon_u32 = u32x4;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.i32[i] = a_.u8[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_cvtepu8_epi32
#define _mm_cvtepu8_epi32(a) simde_mm_cvtepu8_epi32(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvtepu8_epi64 (simde__m128i a) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_cvtepu8_epi64(a);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    uint8x16_t u8x16 = a_.neon_u8;                      /* xxxx xxxx xxxx xxBA */
    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));    /* 0x0x 0x0x 0x0x 0B0A */
    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8));  /* 000x 000x 000B 000A */
    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4));  /* 0000 000B 0000 000A */
    r_.neon_u64 = u64x2;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
      r_.i64[i] = a_.u8[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_cvtepu8_epi64
#define _mm_cvtepu8_epi64(a) simde_mm_cvtepu8_epi64(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvtepi16_epi32 (simde__m128i a) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_cvtepi16_epi32(a);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i32 = vmovl_s16(vget_low_s16(a_.neon_i16));
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.i32[i] = a_.i16[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_cvtepi16_epi32
#define _mm_cvtepi16_epi32(a) simde_mm_cvtepi16_epi32(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvtepu16_epi32 (simde__m128i a) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_cvtepu16_epi32(a);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_u32 = vmovl_u16(vget_low_u16(a_.neon_u16));
  #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_CLANG_45541) && (!defined(SIMDE_ARCH_POWER) || !defined(__clang__))
    SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].u16);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.i32[i] = a_.u16[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_cvtepu16_epi32
#define _mm_cvtepu16_epi32(a) simde_mm_cvtepu16_epi32(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvtepu16_epi64 (simde__m128i a) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_cvtepu16_epi64(a);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    uint16x8_t u16x8 = a_.neon_u16;                     /* xxxx xxxx xxxx 0B0A */
    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8));  /* 000x 000x 000B 000A */
    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4));  /* 0000 000B 0000 000A */
    r_.neon_u64 = u64x2;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
      r_.i64[i] = a_.u16[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_cvtepu16_epi64
#define _mm_cvtepu16_epi64(a) simde_mm_cvtepu16_epi64(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvtepi16_epi64 (simde__m128i a) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_cvtepi16_epi64(a);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    int16x8_t s16x8 = a_.neon_i16;                     /* xxxx xxxx xxxx 0B0A */
    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8));  /* 000x 000x 000B 000A */
    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4));  /* 0000 000B 0000 000A */
    r_.neon_i64 = s64x2;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
      r_.i64[i] = a_.i16[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_cvtepi16_epi64
#define _mm_cvtepi16_epi64(a) simde_mm_cvtepi16_epi64(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvtepi32_epi64 (simde__m128i a) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_cvtepi32_epi64(a);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i64 = vmovl_s32(vget_low_s32(a_.neon_i32));
  #elif defined(SIMDE_CONVERT_VECTOR_)
    SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].i32);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
      r_.i64[i] = a_.i32[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_cvtepi32_epi64
#define _mm_cvtepi32_epi64(a) simde_mm_cvtepi32_epi64(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvtepu32_epi64 (simde__m128i a) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_cvtepu32_epi64(a);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_u64 = vmovl_u32(vget_low_u32(a_.neon_u32));
  #elif defined(SIMDE_CONVERT_VECTOR_)
    SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].u32);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
      r_.i64[i] = a_.u32[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_cvtepu32_epi64
#define _mm_cvtepu32_epi64(a) simde_mm_cvtepu32_epi64(a)
#endif

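/* For the dot products below the immediate splits into two nibbles: the
 * high four bits select which lane products a[i]*b[i] enter the sum, and
 * the low four bits select which result lanes receive that sum (all
 * other lanes are zeroed).  Illustrative usage (ours, not part of the
 * original source):
 *   simde__m128 dot = simde_mm_dp_ps(a, b, 0xFF);
 * computes a four-way dot product and broadcasts it to every lane. */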
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_dp_pd (simde__m128d a, simde__m128d b, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  simde_float64 sum = SIMDE_FLOAT64_C(0.0);

  SIMDE_VECTORIZE_REDUCTION(+:sum)
  for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
    sum += ((imm8 >> (i + 4)) & 1) ? (a_.f64[i] * b_.f64[i]) : SIMDE_FLOAT64_C(0.0);
  }

  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
    r_.f64[i] = ((imm8 >> i) & 1) ? sum : SIMDE_FLOAT64_C(0.0);
  }

  return simde__m128d_from_private(r_);
}
#if defined(SIMDE_X86_SSE4_1_NATIVE)
# define simde_mm_dp_pd(a, b, imm8) _mm_dp_pd(a, b, imm8)
#endif
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_dp_pd
#define _mm_dp_pd(a, b, imm8) simde_mm_dp_pd(a, b, imm8)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_dp_ps (simde__m128 a, simde__m128 b, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  simde__m128_private
    r_,
    a_ = simde__m128_to_private(a),
    b_ = simde__m128_to_private(b);

  simde_float32 sum = SIMDE_FLOAT32_C(0.0);

  SIMDE_VECTORIZE_REDUCTION(+:sum)
  for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
    sum += ((imm8 >> (i + 4)) & 1) ? (a_.f32[i] * b_.f32[i]) : SIMDE_FLOAT32_C(0.0);
  }

  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
    r_.f32[i] = ((imm8 >> i) & 1) ? sum : SIMDE_FLOAT32_C(0.0);
  }

  return simde__m128_from_private(r_);
}
#if defined(SIMDE_X86_SSE4_1_NATIVE)
# define simde_mm_dp_ps(a, b, imm8) _mm_dp_ps(a, b, imm8)
#endif
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_dp_ps
#define _mm_dp_ps(a, b, imm8) simde_mm_dp_ps(a, b, imm8)
#endif

#if defined(simde_mm_extract_epi8)
# undef simde_mm_extract_epi8
#endif
SIMDE_FUNCTION_ATTRIBUTES
int8_t
simde_mm_extract_epi8 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 15) {
  simde__m128i_private
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    #if defined(SIMDE_BUG_GCC_95227)
      (void) a_;
      (void) imm8;
    #endif
    return vec_extract(a_.altivec_i8, imm8);
  #else
    return a_.i8[imm8 & 15];
  #endif
}
#if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_EXTRACT_EPI8)
# define simde_mm_extract_epi8(a, imm8) HEDLEY_STATIC_CAST(int8_t, _mm_extract_epi8(a, imm8))
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
# define simde_mm_extract_epi8(a, imm8) vgetq_lane_s8(simde__m128i_to_private(a).neon_i8, imm8)
#endif
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_extract_epi8
#define _mm_extract_epi8(a, imm8) HEDLEY_STATIC_CAST(int, simde_mm_extract_epi8(a, imm8))
#endif

#if defined(simde_mm_extract_epi32)
# undef simde_mm_extract_epi32
#endif
SIMDE_FUNCTION_ATTRIBUTES
int32_t
simde_mm_extract_epi32 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 3) {
  simde__m128i_private
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    #if defined(SIMDE_BUG_GCC_95227)
      (void) a_;
      (void) imm8;
    #endif
    return vec_extract(a_.altivec_i32, imm8);
  #else
    return a_.i32[imm8 & 3];
  #endif
}
#if defined(SIMDE_X86_SSE4_1_NATIVE)
# define simde_mm_extract_epi32(a, imm8) _mm_extract_epi32(a, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
# define simde_mm_extract_epi32(a, imm8) vgetq_lane_s32(simde__m128i_to_private(a).neon_i32, imm8)
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
# define simde_mm_extract_epi32(a, imm8) HEDLEY_STATIC_CAST(int32_t, vec_extract(simde__m128i_to_private(a).altivec_i32, imm8))
#endif
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_extract_epi32
#define _mm_extract_epi32(a, imm8) simde_mm_extract_epi32(a, imm8)
#endif

#if defined(simde_mm_extract_epi64)
# undef simde_mm_extract_epi64
#endif
SIMDE_FUNCTION_ATTRIBUTES
int64_t
simde_mm_extract_epi64 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 1) {
  simde__m128i_private
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
    #if defined(SIMDE_BUG_GCC_95227)
      (void) a_;
      (void) imm8;
    #endif
    return vec_extract(a_.altivec_i64, imm8);
  #else
    return a_.i64[imm8 & 1];
  #endif
}
#if defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ARCH_AMD64)
# define simde_mm_extract_epi64(a, imm8) _mm_extract_epi64(a, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
# define simde_mm_extract_epi64(a, imm8) vgetq_lane_s64(simde__m128i_to_private(a).neon_i64, imm8)
#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
# define simde_mm_extract_epi64(a, imm8) HEDLEY_STATIC_CAST(int64_t, vec_extract(simde__m128i_to_private(a).altivec_i64, imm8))
#endif
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_extract_epi64
#define _mm_extract_epi64(a, imm8) simde_mm_extract_epi64(a, imm8)
#endif

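/* Note that the native _mm_extract_epi64 above (and _mm_insert_epi64
 * further down) is only available on 64-bit x86, hence the additional
 * SIMDE_ARCH_AMD64 test before dispatching to it; 32-bit builds use the
 * portable or NEON/AltiVec paths instead. */
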
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_floor_pd (simde__m128d a) {
  return simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_NEG_INF);
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_floor_pd
#define _mm_floor_pd(a) simde_mm_floor_pd(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_floor_ps (simde__m128 a) {
  return simde_mm_round_ps(a, SIMDE_MM_FROUND_TO_NEG_INF);
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_floor_ps
#define _mm_floor_ps(a) simde_mm_floor_ps(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_floor_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_floor_sd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  #if defined(simde_math_floor)
    r_.f64[0] = simde_math_floor(b_.f64[0]);
    r_.f64[1] = a_.f64[1];
  #else
    HEDLEY_UNREACHABLE();
  #endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_floor_sd
#define _mm_floor_sd(a, b) simde_mm_floor_sd(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_floor_ss (simde__m128 a, simde__m128 b) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_floor_ss(a, b);
#elif defined(SIMDE_ASSUME_VECTORIZATION)
  return simde_mm_move_ss(a, simde_mm_floor_ps(b));
#else
  simde__m128_private
    r_,
    a_ = simde__m128_to_private(a),
    b_ = simde__m128_to_private(b);

  #if defined(simde_math_floorf)
    r_.f32[0] = simde_math_floorf(b_.f32[0]);
    for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
      r_.f32[i] = a_.f32[i];
    }
  #else
    HEDLEY_UNREACHABLE();
  #endif

  return simde__m128_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_floor_ss
#define _mm_floor_ss(a, b) simde_mm_floor_ss(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_insert_epi8 (simde__m128i a, int i, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 15) {
  simde__m128i_private
    r_ = simde__m128i_to_private(a);

  r_.i8[imm8] = HEDLEY_STATIC_CAST(int8_t, i);

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  /* clang-3.8 returns an incompatible type, so we need the cast.  MSVC
   * can't handle the cast ("error C2440: 'type cast': cannot convert
   * from '__m128i' to '__m128i'"). */
  #if defined(__clang__)
    #define simde_mm_insert_epi8(a, i, imm8) HEDLEY_STATIC_CAST(__m128i, _mm_insert_epi8(a, i, imm8))
  #else
    #define simde_mm_insert_epi8(a, i, imm8) _mm_insert_epi8(a, i, imm8)
  #endif
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
# define simde_mm_insert_epi8(a, i, imm8) simde__m128i_from_neon_i8(vsetq_lane_s8(i, simde__m128i_to_private(a).neon_i8, imm8))
#endif
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_insert_epi8
#define _mm_insert_epi8(a, i, imm8) simde_mm_insert_epi8(a, i, imm8)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_insert_epi32 (simde__m128i a, int i, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 3) {
  simde__m128i_private
    r_ = simde__m128i_to_private(a);

  r_.i32[imm8] = HEDLEY_STATIC_CAST(int32_t, i);

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  #if defined(__clang__)
    #define simde_mm_insert_epi32(a, i, imm8) HEDLEY_STATIC_CAST(__m128i, _mm_insert_epi32(a, i, imm8))
  #else
    #define simde_mm_insert_epi32(a, i, imm8) _mm_insert_epi32(a, i, imm8)
  #endif
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
# define simde_mm_insert_epi32(a, i, imm8) simde__m128i_from_neon_i32(vsetq_lane_s32(i, simde__m128i_to_private(a).neon_i32, imm8))
#endif
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_insert_epi32
#define _mm_insert_epi32(a, i, imm8) simde_mm_insert_epi32(a, i, imm8)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_insert_epi64 (simde__m128i a, int64_t i, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 1) {
#if defined(SIMDE_BUG_GCC_94482)
  simde__m128i_private
    a_ = simde__m128i_to_private(a);

  switch (imm8) {
    case 0:
      return simde_mm_set_epi64x(a_.i64[1], i);
    case 1:
      return simde_mm_set_epi64x(i, a_.i64[0]);
    default:
      HEDLEY_UNREACHABLE();
      break;
  }
#else
  simde__m128i_private
    r_ = simde__m128i_to_private(a);

  r_.i64[imm8] = i;
  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ARCH_AMD64)
# define simde_mm_insert_epi64(a, i, imm8) _mm_insert_epi64(a, i, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
# define simde_mm_insert_epi64(a, i, imm8) simde__m128i_from_neon_i64(vsetq_lane_s64(i, simde__m128i_to_private(a).neon_i64, imm8))
#endif
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_insert_epi64
#define _mm_insert_epi64(a, i, imm8) simde_mm_insert_epi64(a, i, imm8)
#endif

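/* _mm_insert_ps immediate layout: bits 7:6 select the source lane of b,
 * bits 5:4 select the destination lane in a, and bits 3:0 form a zero
 * mask applied to the result afterwards. */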
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_insert_ps (simde__m128 a, simde__m128 b, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  simde__m128_private
    r_,
    a_ = simde__m128_to_private(a),
    b_ = simde__m128_to_private(b);

  /* Copy the selected element of b into the selected slot of a; the
   * temporary avoids clobbering a_.f32[0] when the destination index is
   * non-zero. */
  simde_float32 tmp = b_.f32[(imm8 >> 6) & 3];
  a_.f32[(imm8 >> 4) & 3] = tmp;

  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
    r_.f32[i] = ((imm8 >> i) & 1) ? SIMDE_FLOAT32_C(0.0) : a_.f32[i];
  }

  return simde__m128_from_private(r_);
}
#if defined(SIMDE_X86_SSE4_1_NATIVE)
# define simde_mm_insert_ps(a, b, imm8) _mm_insert_ps(a, b, imm8)
#endif
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_insert_ps
#define _mm_insert_ps(a, b, imm8) simde_mm_insert_ps(a, b, imm8)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_max_epi8 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI)
  return _mm_max_epi8(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i8 = vmaxq_s8(a_.neon_i8, b_.neon_i8);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_i8 = vec_max(a_.altivec_i8, b_.altivec_i8);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
      r_.i8[i] = a_.i8[i] > b_.i8[i] ? a_.i8[i] : b_.i8[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_max_epi8
#define _mm_max_epi8(a, b) simde_mm_max_epi8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_max_epi32 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI)
  return _mm_max_epi32(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i32 = vmaxq_s32(a_.neon_i32, b_.neon_i32);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_i32 = vec_max(a_.altivec_i32, b_.altivec_i32);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.i32[i] = a_.i32[i] > b_.i32[i] ? a_.i32[i] : b_.i32[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_max_epi32
#define _mm_max_epi32(a, b) simde_mm_max_epi32(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_max_epu16 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_max_epu16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_u16 = vmaxq_u16(a_.neon_u16, b_.neon_u16);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_u16 = vec_max(a_.altivec_u16, b_.altivec_u16);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
      r_.u16[i] = a_.u16[i] > b_.u16[i] ? a_.u16[i] : b_.u16[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_max_epu16
#define _mm_max_epu16(a, b) simde_mm_max_epu16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_max_epu32 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_max_epu32(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_u32 = vmaxq_u32(a_.neon_u32, b_.neon_u32);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_u32 = vec_max(a_.altivec_u32, b_.altivec_u32);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
      r_.u32[i] = a_.u32[i] > b_.u32[i] ? a_.u32[i] : b_.u32[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_max_epu32
#define _mm_max_epu32(a, b) simde_mm_max_epu32(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_min_epi8 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI)
  return _mm_min_epi8(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i8 = vminq_s8(a_.neon_i8, b_.neon_i8);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
      r_.i8[i] = a_.i8[i] < b_.i8[i] ? a_.i8[i] : b_.i8[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_min_epi8
#define _mm_min_epi8(a, b) simde_mm_min_epi8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_min_epi32 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI)
  return _mm_min_epi32(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i32 = vminq_s32(a_.neon_i32, b_.neon_i32);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.i32[i] = a_.i32[i] < b_.i32[i] ? a_.i32[i] : b_.i32[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_min_epi32
#define _mm_min_epi32(a, b) simde_mm_min_epi32(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_min_epu16 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_min_epu16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_u16 = vminq_u16(a_.neon_u16, b_.neon_u16);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
      r_.u16[i] = a_.u16[i] < b_.u16[i] ? a_.u16[i] : b_.u16[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_min_epu16
#define _mm_min_epu16(a, b) simde_mm_min_epu16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_min_epu32 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_min_epu32(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_u32 = vminq_u32(a_.neon_u32, b_.neon_u32);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
      r_.u32[i] = a_.u32[i] < b_.u32[i] ? a_.u32[i] : b_.u32[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_min_epu32
#define _mm_min_epu32(a, b) simde_mm_min_epu32(a, b)
#endif

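/* _mm_minpos_epu16 returns the smallest of the eight unsigned 16-bit
 * lanes in lane 0, the index of that lane in lane 1, and zeros in the
 * remaining lanes.  The portable loop below scans left to right with a
 * strict '<', so ties keep the lowest index, matching the hardware. */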
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_minpos_epu16 (simde__m128i a) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_minpos_epu16(a);
#else
  simde__m128i_private
    r_ = simde__m128i_to_private(simde_mm_setzero_si128()),
    a_ = simde__m128i_to_private(a);

  r_.u16[0] = UINT16_MAX;
  for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
    if (a_.u16[i] < r_.u16[0]) {
      r_.u16[0] = a_.u16[i];
      r_.u16[1] = HEDLEY_STATIC_CAST(uint16_t, i);
    }
  }

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_minpos_epu16
#define _mm_minpos_epu16(a) simde_mm_minpos_epu16(a)
#endif

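/* _mm_mpsadbw_epu8: bit 2 of imm8 chooses a's starting byte offset (0 or
 * 4) and bits 1:0 choose the aligned four-byte block of b; each of the
 * eight 16-bit results is the sum of absolute differences between that
 * block of b and a sliding four-byte window of a. */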
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_mpsadbw_epu8 (simde__m128i a, simde__m128i b, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 7) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  const int a_offset = imm8 & 4;
  const int b_offset = (imm8 & 3) << 2;

  #if defined(simde_math_abs)
    for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, (sizeof(r_.u16) / sizeof(r_.u16[0]))) ; i++) {
      r_.u16[i] =
        HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 0] - b_.u8[b_offset + 0]))) +
        HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 1] - b_.u8[b_offset + 1]))) +
        HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 2] - b_.u8[b_offset + 2]))) +
        HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 3] - b_.u8[b_offset + 3])));
    }
  #else
    HEDLEY_UNREACHABLE();
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE4_1_NATIVE)
# define simde_mm_mpsadbw_epu8(a, b, imm8) _mm_mpsadbw_epu8(a, b, imm8)
#endif
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_mpsadbw_epu8
#define _mm_mpsadbw_epu8(a, b, imm8) simde_mm_mpsadbw_epu8(a, b, imm8)
#endif

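/* _mm_mul_epi32 multiplies only the even-indexed (0 and 2) signed 32-bit
 * elements of a and b, yielding two full 64-bit products; the odd lanes
 * are ignored.  simde_mm_mullo_epi32 further down is the variant that
 * instead keeps the low 32 bits of every lane's product. */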
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_mul_epi32 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_mul_epi32(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* vmull_s32 upcasts instead of masking, so we downcast. */
    int32x2_t a_lo = vmovn_s64(a_.neon_i64);
    int32x2_t b_lo = vmovn_s64(b_.neon_i64);
    r_.neon_i64 = vmull_s32(a_lo, b_lo);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
      r_.i64[i] =
        HEDLEY_STATIC_CAST(int64_t, a_.i32[i * 2]) *
        HEDLEY_STATIC_CAST(int64_t, b_.i32[i * 2]);
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_mul_epi32
#define _mm_mul_epi32(a, b) simde_mm_mul_epi32(a, b)
#endif

1613 SIMDE_FUNCTION_ATTRIBUTES
1614 simde__m128i
simde_mm_mullo_epi32(simde__m128i a,simde__m128i b)1615 simde_mm_mullo_epi32 (simde__m128i a, simde__m128i b) {
1616 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1617 return _mm_mullo_epi32(a, b);
1618 #else
1619 simde__m128i_private
1620 r_,
1621 a_ = simde__m128i_to_private(a),
1622 b_ = simde__m128i_to_private(b);
1623
1624 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1625 r_.neon_i32 = vmulq_s32(a_.neon_i32, b_.neon_i32);
1626 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1627 (void) a_;
1628 (void) b_;
1629 r_.altivec_i32 = vec_mul(a_.altivec_i32, b_.altivec_i32);
1630 #else
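  /* Widen to int64_t before multiplying so the product cannot overflow
   * (signed 32-bit overflow would be undefined behavior), then keep only
   * the low 32 bits to match the truncating semantics of _mm_mullo_epi32. */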
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
    r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (HEDLEY_STATIC_CAST(uint64_t, (HEDLEY_STATIC_CAST(int64_t, a_.i32[i]) * HEDLEY_STATIC_CAST(int64_t, b_.i32[i]))) & 0xffffffff));
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_mullo_epi32
#define _mm_mullo_epi32(a, b) simde_mm_mullo_epi32(a, b)
#endif

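/* Note: the simde_x_ prefix below marks a SIMDe-specific helper with no
 * Intel equivalent; it performs an unsigned lane-wise 32-bit multiply,
 * keeping the low 32 bits of each product. */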
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_mullo_epu32 (simde__m128i a, simde__m128i b) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  r_.u32 = a_.u32 * b_.u32;
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
    r_.u32[i] = a_.u32[i] * b_.u32[i];
  }
#endif

  return simde__m128i_from_private(r_);
}

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_packus_epi32 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_packus_epi32(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
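  /* Clamp negative lanes to zero with vmaxq_s32 first; vqmovn_u32 then
   * saturates anything above UINT16_MAX while narrowing to 16 bits. */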
  const int32x4_t z = vdupq_n_s32(0);
  r_.neon_u16 = vcombine_u16(
    vqmovn_u32(vreinterpretq_u32_s32(vmaxq_s32(z, a_.neon_i32))),
    vqmovn_u32(vreinterpretq_u32_s32(vmaxq_s32(z, b_.neon_i32))));
#else
  for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
    r_.u16[i + 0] = (a_.i32[i] < 0) ? UINT16_C(0) : ((a_.i32[i] > UINT16_MAX) ? (UINT16_MAX) : HEDLEY_STATIC_CAST(uint16_t, a_.i32[i]));
    r_.u16[i + 4] = (b_.i32[i] < 0) ? UINT16_C(0) : ((b_.i32[i] > UINT16_MAX) ? (UINT16_MAX) : HEDLEY_STATIC_CAST(uint16_t, b_.i32[i]));
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_packus_epi32
#define _mm_packus_epi32(a, b) simde_mm_packus_epi32(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_round_sd (simde__m128d a, simde__m128d b, int rounding)
    SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) {
  simde__m128d_private
    r_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

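  /* SIMDE_MM_FROUND_NO_EXC only asks for floating-point exceptions to be
   * suppressed, and this portable path never raises any, so the flag is
   * simply masked off before dispatching on the rounding mode. */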
  switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
#if defined(simde_math_nearbyint)
    case SIMDE_MM_FROUND_TO_NEAREST_INT:
    case SIMDE_MM_FROUND_CUR_DIRECTION:
      r_.f64[0] = simde_math_nearbyint(b_.f64[0]);
      break;
#endif

#if defined(simde_math_floor)
    case SIMDE_MM_FROUND_TO_NEG_INF:
      r_.f64[0] = simde_math_floor(b_.f64[0]);
      break;
#endif

#if defined(simde_math_ceil)
    case SIMDE_MM_FROUND_TO_POS_INF:
      r_.f64[0] = simde_math_ceil(b_.f64[0]);
      break;
#endif

#if defined(simde_math_trunc)
    case SIMDE_MM_FROUND_TO_ZERO:
      r_.f64[0] = simde_math_trunc(b_.f64[0]);
      break;
#endif

    default:
      HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
  }

  return simde__m128d_from_private(r_);
}
#if defined(SIMDE_X86_SSE4_1_NATIVE)
# define simde_mm_round_sd(a, b, rounding) _mm_round_sd(a, b, rounding)
#endif
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_round_sd
#define _mm_round_sd(a, b, rounding) simde_mm_round_sd(a, b, rounding)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_round_ss (simde__m128 a, simde__m128 b, int rounding)
    SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) {
  simde__m128_private
    r_ = simde__m128_to_private(a),
    b_ = simde__m128_to_private(b);

  switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
#if defined(simde_math_nearbyintf)
    case SIMDE_MM_FROUND_TO_NEAREST_INT:
    case SIMDE_MM_FROUND_CUR_DIRECTION:
      r_.f32[0] = simde_math_nearbyintf(b_.f32[0]);
      break;
#endif

#if defined(simde_math_floorf)
    case SIMDE_MM_FROUND_TO_NEG_INF:
      r_.f32[0] = simde_math_floorf(b_.f32[0]);
      break;
#endif

#if defined(simde_math_ceilf)
    case SIMDE_MM_FROUND_TO_POS_INF:
      r_.f32[0] = simde_math_ceilf(b_.f32[0]);
      break;
#endif

#if defined(simde_math_truncf)
    case SIMDE_MM_FROUND_TO_ZERO:
      r_.f32[0] = simde_math_truncf(b_.f32[0]);
      break;
#endif

    default:
      /* This function returns a simde__m128, so the undefined value must
       * be a float vector, not the double vector the _sd variant uses. */
      HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_ps());
  }

  return simde__m128_from_private(r_);
}
#if defined(SIMDE_X86_SSE4_1_NATIVE)
# define simde_mm_round_ss(a, b, rounding) _mm_round_ss(a, b, rounding)
#endif
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_round_ss
#define _mm_round_ss(a, b, rounding) simde_mm_round_ss(a, b, rounding)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_stream_load_si128 (const simde__m128i* mem_addr) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_stream_load_si128(HEDLEY_CONST_CAST(simde__m128i*, mem_addr));
#else
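  /* Without the native instruction there is no way to express the
   * non-temporal hint, so this portable fallback is an ordinary load. */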
  return *mem_addr;
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_stream_load_si128
#define _mm_stream_load_si128(mem_addr) simde_mm_stream_load_si128(mem_addr)
#endif

SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_test_all_ones (simde__m128i a) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_test_all_ones(a);
#else
  simde__m128i_private a_ = simde__m128i_to_private(a);

  for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
    if (a_.u64[i] != ~UINT64_C(0))
      return 0;
  }

  return 1;
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_test_all_ones
#define _mm_test_all_ones(a) simde_mm_test_all_ones(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_test_all_zeros (simde__m128i a, simde__m128i mask) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_test_all_zeros(a, mask);
#else
  simde__m128i_private
    a_ = simde__m128i_to_private(a),
    mask_ = simde__m128i_to_private(mask);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
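  /* OR the two 64-bit lanes of (a & mask) together: the result is zero
   * exactly when every bit of a covered by mask is zero. */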
  int64x2_t a_and_mask = vandq_s64(a_.neon_i64, mask_.neon_i64);
  return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0 : 1;
#else
  for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
    if ((a_.u64[i] & mask_.u64[i]) != 0)
      return 0;
  }
#endif

  return 1;
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_test_all_zeros
#define _mm_test_all_zeros(a, mask) simde_mm_test_all_zeros(a, mask)
#endif

SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_test_mix_ones_zeros (simde__m128i a, simde__m128i mask) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_test_mix_ones_zeros(a, mask);
#else
  simde__m128i_private
    a_ = simde__m128i_to_private(a),
    mask_ = simde__m128i_to_private(mask);

  /* ZF and CF must be computed over all 128 bits, not per 64-bit lane:
   * testing each lane in isolation would miss the case where the ones
   * and the zeros fall in different lanes. */
  uint64_t ones = 0, zeros = 0;
  for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
    ones  |=  a_.u64[i] & mask_.u64[i];
    zeros |= ~a_.u64[i] & mask_.u64[i];
  }

  return (ones != 0) && (zeros != 0);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_test_mix_ones_zeros
#define _mm_test_mix_ones_zeros(a, mask) simde_mm_test_mix_ones_zeros(a, mask)
#endif

SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_testc_si128 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_testc_si128(a, b);
#else
  simde__m128i_private
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

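  /* testc returns CF, which is set when every bit set in b is also set
   * in a, i.e. when (~a & b) is zero across all 128 bits. */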
  int_fast32_t r = 0;

  SIMDE_VECTORIZE_REDUCTION(|:r)
  for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) {
    r |= ~a_.i32f[i] & b_.i32f[i];
  }

  return HEDLEY_STATIC_CAST(int, !r);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_testc_si128
#define _mm_testc_si128(a, b) simde_mm_testc_si128(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_testnzc_si128 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_testnzc_si128(a, b);
#else
  simde__m128i_private
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  /* Returns 1 iff ZF == 0 and CF == 0, i.e. both (a & b) and (~a & b)
   * are non-zero somewhere in the full 128 bits.  Accumulate across
   * lanes instead of testing each 64-bit lane in isolation. */
  uint64_t and_bits = 0, andnot_bits = 0;
  for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
    and_bits    |=  a_.u64[i] & b_.u64[i];
    andnot_bits |= ~a_.u64[i] & b_.u64[i];
  }

  return (and_bits != 0) && (andnot_bits != 0);
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_testnzc_si128
#define _mm_testnzc_si128(a, b) simde_mm_testnzc_si128(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_testz_si128 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
  return _mm_testz_si128(a, b);
#else
  simde__m128i_private
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  /* ZF is set only when (a & b) is zero across the entire 128 bits, so a
   * single non-zero lane means the result must be 0. */
  for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
    if ((a_.u64[i] & b_.u64[i]) != 0)
      return 0;
  }

  return 1;
#endif
}
#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
#undef _mm_testz_si128
#define _mm_testz_si128(a, b) simde_mm_testz_si128(a, b)
#endif
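/* Illustrative example (not part of the original header): with
 * a = simde_mm_set1_epi8(0x0F) and b = simde_mm_set1_epi8(0x70) the two
 * vectors share no set bits, so simde_mm_testz_si128(a, b) returns 1,
 * while simde_mm_testc_si128(a, b) returns 0 (b has bits that a lacks). */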

SIMDE_END_DECLS_

HEDLEY_DIAGNOSTIC_POP

#endif /* !defined(SIMDE_X86_SSE4_1_H) */