1 /* SPDX-License-Identifier: MIT
2 *
3 * Permission is hereby granted, free of charge, to any person
4 * obtaining a copy of this software and associated documentation
5 * files (the "Software"), to deal in the Software without
6 * restriction, including without limitation the rights to use, copy,
7 * modify, merge, publish, distribute, sublicense, and/or sell copies
8 * of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be
12 * included in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Copyright:
24 * 2017-2020 Evan Nemerson <evan@nemerson.com>
25 */
26
#if !defined(SIMDE_X86_SSE4_1_H)
#define SIMDE_X86_SSE4_1_H

#include "sse.h"
#include "ssse3.h"
32
33 HEDLEY_DIAGNOSTIC_PUSH
34 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
35 SIMDE_BEGIN_DECLS_
36
37 #if !defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES)
38 # define SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES
39 #endif
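/* Note: when SIMDE_ENABLE_NATIVE_ALIASES is requested and native SSE4.1 is not
 * available, the blocks guarded by SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES below
 * re-#define the plain _mm_* names to the simde_mm_* implementations, so code
 * written against SSE4.1 intrinsics can build unmodified on other targets. */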
40
41 SIMDE_FUNCTION_ATTRIBUTES
42 simde__m128i
simde_mm_blend_epi16 (simde__m128i a, simde__m128i b, const int imm8)
44 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
45 simde__m128i_private
46 r_,
47 a_ = simde__m128i_to_private(a),
48 b_ = simde__m128i_to_private(b);
49
50 SIMDE_VECTORIZE
51 for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
52 r_.u16[i] = ((imm8 >> i) & 1) ? b_.u16[i] : a_.u16[i];
53 }
54
55 return simde__m128i_from_private(r_);
56 }
57 #if defined(SIMDE_X86_SSE4_1_NATIVE)
58 # define simde_mm_blend_epi16(a, b, imm8) _mm_blend_epi16(a, b, imm8)
59 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
60 # define simde_mm_blend_epi16(a, b, imm8) \
61 (__extension__ ({ \
62 const uint16_t _mask[8] = { \
63 ((imm8) & (1 << 0)) ? 0xFFFF : 0x0000, \
64 ((imm8) & (1 << 1)) ? 0xFFFF : 0x0000, \
65 ((imm8) & (1 << 2)) ? 0xFFFF : 0x0000, \
66 ((imm8) & (1 << 3)) ? 0xFFFF : 0x0000, \
67 ((imm8) & (1 << 4)) ? 0xFFFF : 0x0000, \
68 ((imm8) & (1 << 5)) ? 0xFFFF : 0x0000, \
69 ((imm8) & (1 << 6)) ? 0xFFFF : 0x0000, \
70 ((imm8) & (1 << 7)) ? 0xFFFF : 0x0000 \
71 }; \
72 uint16x8_t _mask_vec = vld1q_u16(_mask); \
73 simde__m128i_from_neon_u16(vbslq_u16(_mask_vec, simde__m128i_to_neon_u16(b), simde__m128i_to_neon_u16(a))); \
74 }))
75 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
76 # define simde_mm_blend_epi16(a, b, imm8) \
77 (__extension__ ({ \
78 const SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) _mask = { \
79 ((imm8) & (1 << 0)) ? 0xFFFF : 0x0000, \
80 ((imm8) & (1 << 1)) ? 0xFFFF : 0x0000, \
81 ((imm8) & (1 << 2)) ? 0xFFFF : 0x0000, \
82 ((imm8) & (1 << 3)) ? 0xFFFF : 0x0000, \
83 ((imm8) & (1 << 4)) ? 0xFFFF : 0x0000, \
84 ((imm8) & (1 << 5)) ? 0xFFFF : 0x0000, \
85 ((imm8) & (1 << 6)) ? 0xFFFF : 0x0000, \
86 ((imm8) & (1 << 7)) ? 0xFFFF : 0x0000 \
87 }; \
88 simde__m128i_from_altivec_u16(vec_sel(simde__m128i_to_altivec_u16(a), simde__m128i_to_altivec_u16(b), _mask)); \
89 }))
90 #endif
91 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
92 #undef _mm_blend_epi16
93 #define _mm_blend_epi16(a, b, imm8) simde_mm_blend_epi16(a, b, imm8)
94 #endif
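/* Illustrative usage sketch (not part of the implementation): bit i of imm8
 * selects lane i from b, a cleared bit keeps lane i of a.  Assuming the SSE2
 * helpers from sse2.h:
 *
 *   simde__m128i a = simde_mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
 *   simde__m128i b = simde_mm_set_epi16(80, 70, 60, 50, 40, 30, 20, 10);
 *   simde__m128i r = simde_mm_blend_epi16(a, b, 0x0F);
 *   // lanes 0-3 come from b, lanes 4-7 from a: { 10, 20, 30, 40, 5, 6, 7, 8 }
 */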
95
96 SIMDE_FUNCTION_ATTRIBUTES
97 simde__m128d
simde_mm_blend_pd (simde__m128d a, simde__m128d b, const int imm8)
99 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) {
100 simde__m128d_private
101 r_,
102 a_ = simde__m128d_to_private(a),
103 b_ = simde__m128d_to_private(b);
104
105 SIMDE_VECTORIZE
106 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
107 r_.f64[i] = ((imm8 >> i) & 1) ? b_.f64[i] : a_.f64[i];
108 }
109 return simde__m128d_from_private(r_);
110 }
111 #if defined(SIMDE_X86_SSE4_1_NATIVE)
112 # define simde_mm_blend_pd(a, b, imm8) _mm_blend_pd(a, b, imm8)
113 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
114 # define simde_mm_blend_pd(a, b, imm8) \
115 (__extension__ ({ \
116 const uint64_t _mask[2] = { \
117 ((imm8) & (1 << 0)) ? UINT64_MAX : 0, \
118 ((imm8) & (1 << 1)) ? UINT64_MAX : 0 \
119 }; \
120 uint64x2_t _mask_vec = vld1q_u64(_mask); \
121 simde__m128d_from_neon_u64(vbslq_u64(_mask_vec, simde__m128d_to_neon_u64(b), simde__m128d_to_neon_u64(a))); \
122 }))
123 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
124 # define simde_mm_blend_pd(a, b, imm8) \
125 (__extension__ ({ \
126 const SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) _mask = { \
127 ((imm8) & (1 << 0)) ? UINT64_MAX : 0, \
128 ((imm8) & (1 << 1)) ? UINT64_MAX : 0 \
129 }; \
130 simde__m128d_from_altivec_f64(vec_sel(simde__m128d_to_altivec_f64(a), simde__m128d_to_altivec_f64(b), _mask)); \
131 }))
132 #endif
133 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
134 #undef _mm_blend_pd
135 #define _mm_blend_pd(a, b, imm8) simde_mm_blend_pd(a, b, imm8)
136 #endif
137
138 SIMDE_FUNCTION_ATTRIBUTES
139 simde__m128
simde_mm_blend_ps (simde__m128 a, simde__m128 b, const int imm8)
141 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) {
142 simde__m128_private
143 r_,
144 a_ = simde__m128_to_private(a),
145 b_ = simde__m128_to_private(b);
146
147 SIMDE_VECTORIZE
148 for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
149 r_.f32[i] = ((imm8 >> i) & 1) ? b_.f32[i] : a_.f32[i];
150 }
151 return simde__m128_from_private(r_);
152 }
153 #if defined(SIMDE_X86_SSE4_1_NATIVE)
154 # define simde_mm_blend_ps(a, b, imm8) _mm_blend_ps(a, b, imm8)
155 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
156 # define simde_mm_blend_ps(a, b, imm8) \
157 (__extension__ ({ \
158 const uint32_t _mask[4] = { \
159 ((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
160 ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
161 ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
162 ((imm8) & (1 << 3)) ? UINT32_MAX : 0 \
163 }; \
164 uint32x4_t _mask_vec = vld1q_u32(_mask); \
165 simde__m128_from_neon_f32(vbslq_f32(_mask_vec, simde__m128_to_neon_f32(b), simde__m128_to_neon_f32(a))); \
166 }))
167 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
168 # define simde_mm_blend_ps(a, b, imm8) \
169 (__extension__ ({ \
170 const SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) _mask = { \
171 ((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
172 ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
173 ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
174 ((imm8) & (1 << 3)) ? UINT32_MAX : 0 \
175 }; \
176 simde__m128_from_altivec_f32(vec_sel(simde__m128_to_altivec_f32(a), simde__m128_to_altivec_f32(b), _mask)); \
177 }))
178 #endif
179 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
180 #undef _mm_blend_ps
181 #define _mm_blend_ps(a, b, imm8) simde_mm_blend_ps(a, b, imm8)
182 #endif
183
184 SIMDE_FUNCTION_ATTRIBUTES
185 simde__m128i
simde_mm_blendv_epi8 (simde__m128i a, simde__m128i b, simde__m128i mask) {
187 #if defined(SIMDE_X86_SSE4_1_NATIVE)
188 return _mm_blendv_epi8(a, b, mask);
189 #else
190 simde__m128i_private
191 r_,
192 a_ = simde__m128i_to_private(a),
193 b_ = simde__m128i_to_private(b),
194 mask_ = simde__m128i_to_private(mask);
195
196 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
197 /* Use a signed shift right to create a mask with the sign bit */
198 mask_.neon_i8 = vshrq_n_s8(mask_.neon_i8, 7);
199 r_.neon_i8 = vbslq_s8(mask_.neon_u8, b_.neon_i8, a_.neon_i8);
200 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
201 v128_t m = wasm_i8x16_shr(mask_.wasm_v128, 7);
202 r_.wasm_v128 = wasm_v128_or(wasm_v128_and(b_.wasm_v128, m), wasm_v128_andnot(a_.wasm_v128, m));
203 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
204 r_.altivec_i8 = vec_sel(a_.altivec_i8, b_.altivec_i8, vec_cmplt(mask_.altivec_i8, vec_splat_s8(0)));
205 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
206 /* https://software.intel.com/en-us/forums/intel-c-compiler/topic/850087 */
207 #if defined(HEDLEY_INTEL_VERSION_CHECK)
208 __typeof__(mask_.i8) z = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
209 mask_.i8 = HEDLEY_STATIC_CAST(__typeof__(mask_.i8), mask_.i8 < z);
210 #else
211 mask_.i8 >>= (CHAR_BIT * sizeof(mask_.i8[0])) - 1;
212 #endif
213
214 r_.i8 = (mask_.i8 & b_.i8) | (~mask_.i8 & a_.i8);
215 #else
216 SIMDE_VECTORIZE
217 for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
218 int8_t m = mask_.i8[i] >> 7;
219 r_.i8[i] = (m & b_.i8[i]) | (~m & a_.i8[i]);
220 }
221 #endif
222
223 return simde__m128i_from_private(r_);
224 #endif
225 }
226 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
227 #undef _mm_blendv_epi8
228 #define _mm_blendv_epi8(a, b, mask) simde_mm_blendv_epi8(a, b, mask)
229 #endif
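/* Illustrative sketch: only the most significant bit of each mask byte is
 * examined, matching the PBLENDVB semantics the fallbacks above emulate.
 *
 *   simde__m128i m = simde_mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, 0x80));
 *   simde__m128i r = simde_mm_blendv_epi8(a, b, m);                  // every byte taken from b
 *   r = simde_mm_blendv_epi8(a, b, simde_mm_set1_epi8(0x7F));        // every byte kept from a
 */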
230
231 SIMDE_FUNCTION_ATTRIBUTES
232 simde__m128i
simde_x_mm_blendv_epi16 (simde__m128i a, simde__m128i b, simde__m128i mask) {
234 #if defined(SIMDE_X86_SSE2_NATIVE)
235 mask = simde_mm_srai_epi16(mask, 15);
236 return simde_mm_or_si128(simde_mm_and_si128(mask, b), simde_mm_andnot_si128(mask, a));
237 #else
238 simde__m128i_private
239 r_,
240 a_ = simde__m128i_to_private(a),
241 b_ = simde__m128i_to_private(b),
242 mask_ = simde__m128i_to_private(mask);
243
244 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
245 mask_ = simde__m128i_to_private(simde_mm_cmplt_epi16(mask, simde_mm_setzero_si128()));
246 r_.neon_i16 = vbslq_s16(mask_.neon_u16, b_.neon_i16, a_.neon_i16);
247 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
248 r_.altivec_i16 = vec_sel(a_.altivec_i16, b_.altivec_i16, vec_cmplt(mask_.altivec_i16, vec_splat_s16(0)));
249 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
250 #if defined(HEDLEY_INTEL_VERSION_CHECK)
251 __typeof__(mask_.i16) z = { 0, 0, 0, 0, 0, 0, 0, 0 };
252 mask_.i16 = mask_.i16 < z;
253 #else
254 mask_.i16 >>= (CHAR_BIT * sizeof(mask_.i16[0])) - 1;
255 #endif
256
257 r_.i16 = (mask_.i16 & b_.i16) | (~mask_.i16 & a_.i16);
258 #else
259 SIMDE_VECTORIZE
260 for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
261 int16_t m = mask_.i16[i] >> 15;
262 r_.i16[i] = (m & b_.i16[i]) | (~m & a_.i16[i]);
263 }
264 #endif
265
266 return simde__m128i_from_private(r_);
267 #endif
268 }
269
270 SIMDE_FUNCTION_ATTRIBUTES
271 simde__m128i
simde_x_mm_blendv_epi32 (simde__m128i a, simde__m128i b, simde__m128i mask) {
273 #if defined(SIMDE_X86_SSE4_1_NATIVE)
274 return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(mask)));
275 #else
276 simde__m128i_private
277 r_,
278 a_ = simde__m128i_to_private(a),
279 b_ = simde__m128i_to_private(b),
280 mask_ = simde__m128i_to_private(mask);
281
282 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
283 mask_ = simde__m128i_to_private(simde_mm_cmplt_epi32(mask, simde_mm_setzero_si128()));
284 r_.neon_i32 = vbslq_s32(mask_.neon_u32, b_.neon_i32, a_.neon_i32);
285 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
286 v128_t m = wasm_i32x4_shr(mask_.wasm_v128, 31);
287 r_.wasm_v128 = wasm_v128_or(wasm_v128_and(b_.wasm_v128, m), wasm_v128_andnot(a_.wasm_v128, m));
288 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
289 r_.altivec_i32 = vec_sel(a_.altivec_i32, b_.altivec_i32, vec_cmplt(mask_.altivec_i32, vec_splat_s32(0)));
290 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
291 #if defined(HEDLEY_INTEL_VERSION_CHECK)
292 __typeof__(mask_.i32) z = { 0, 0, 0, 0 };
293 mask_.i32 = HEDLEY_STATIC_CAST(__typeof__(mask_.i32), mask_.i32 < z);
294 #else
295 mask_.i32 >>= (CHAR_BIT * sizeof(mask_.i32[0])) - 1;
296 #endif
297
298 r_.i32 = (mask_.i32 & b_.i32) | (~mask_.i32 & a_.i32);
299 #else
300 SIMDE_VECTORIZE
301 for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
302 int32_t m = mask_.i32[i] >> 31;
303 r_.i32[i] = (m & b_.i32[i]) | (~m & a_.i32[i]);
304 }
305 #endif
306
307 return simde__m128i_from_private(r_);
308 #endif
309 }
310
311 SIMDE_FUNCTION_ATTRIBUTES
312 simde__m128i
simde_x_mm_blendv_epi64 (simde__m128i a, simde__m128i b, simde__m128i mask) {
314 #if defined(SIMDE_X86_SSE4_1_NATIVE)
315 return _mm_castpd_si128(_mm_blendv_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b), _mm_castsi128_pd(mask)));
316 #else
317 simde__m128i_private
318 r_,
319 a_ = simde__m128i_to_private(a),
320 b_ = simde__m128i_to_private(b),
321 mask_ = simde__m128i_to_private(mask);
322
323 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
324 mask_.neon_u64 = vcltq_s64(mask_.neon_i64, vdupq_n_s64(UINT64_C(0)));
325 r_.neon_i64 = vbslq_s64(mask_.neon_u64, b_.neon_i64, a_.neon_i64);
326 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
327 v128_t m = wasm_i64x2_shr(mask_.wasm_v128, 63);
328 r_.wasm_v128 = wasm_v128_or(wasm_v128_and(b_.wasm_v128, m), wasm_v128_andnot(a_.wasm_v128, m));
329 #elif (defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(SIMDE_BUG_CLANG_46770)) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
330 r_.altivec_i64 = vec_sel(a_.altivec_i64, b_.altivec_i64, vec_cmplt(mask_.altivec_i64, vec_splats(HEDLEY_STATIC_CAST(signed long long, 0))));
331 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
332 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) selector = vec_sra(mask_.altivec_i64, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 63)));
333 r_.altivec_i32 = vec_sel(a_.altivec_i32, b_.altivec_i32, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), selector));
334 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
335 #if defined(HEDLEY_INTEL_VERSION_CHECK)
336 __typeof__(mask_.i64) z = { 0, 0 };
337 mask_.i64 = HEDLEY_STATIC_CAST(__typeof__(mask_.i64), mask_.i64 < z);
338 #else
339 mask_.i64 >>= (CHAR_BIT * sizeof(mask_.i64[0])) - 1;
340 #endif
341
342 r_.i64 = (mask_.i64 & b_.i64) | (~mask_.i64 & a_.i64);
343 #else
344 SIMDE_VECTORIZE
345 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
346 int64_t m = mask_.i64[i] >> 63;
347 r_.i64[i] = (m & b_.i64[i]) | (~m & a_.i64[i]);
348 }
349 #endif
350
351 return simde__m128i_from_private(r_);
352 #endif
353 }
354
355 SIMDE_FUNCTION_ATTRIBUTES
356 simde__m128d
simde_mm_blendv_pd (simde__m128d a, simde__m128d b, simde__m128d mask) {
358 #if defined(SIMDE_X86_SSE4_1_NATIVE)
359 return _mm_blendv_pd(a, b, mask);
360 #else
361 return simde_mm_castsi128_pd(simde_x_mm_blendv_epi64(simde_mm_castpd_si128(a), simde_mm_castpd_si128(b), simde_mm_castpd_si128(mask)));
362 #endif
363 }
364 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
365 #undef _mm_blendv_pd
366 #define _mm_blendv_pd(a, b, mask) simde_mm_blendv_pd(a, b, mask)
367 #endif
368
369 SIMDE_FUNCTION_ATTRIBUTES
370 simde__m128
simde_mm_blendv_ps (simde__m128 a, simde__m128 b, simde__m128 mask) {
372 #if defined(SIMDE_X86_SSE4_1_NATIVE)
373 return _mm_blendv_ps(a, b, mask);
374 #else
375 return simde_mm_castsi128_ps(simde_x_mm_blendv_epi32(simde_mm_castps_si128(a), simde_mm_castps_si128(b), simde_mm_castps_si128(mask)));
376 #endif
377 }
378 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
379 #undef _mm_blendv_ps
380 #define _mm_blendv_ps(a, b, mask) simde_mm_blendv_ps(a, b, mask)
381 #endif
382
383 SIMDE_FUNCTION_ATTRIBUTES
384 simde__m128d
simde_mm_round_pd (simde__m128d a, int rounding)
386 SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) {
387 simde__m128d_private
388 r_,
389 a_ = simde__m128d_to_private(a);
390
391 /* For architectures which lack a current direction SIMD instruction. */
392 #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
393 if ((rounding & 7) == SIMDE_MM_FROUND_CUR_DIRECTION)
394 rounding = HEDLEY_STATIC_CAST(int, SIMDE_MM_GET_ROUNDING_MODE()) << 13;
395 #endif
396
397 switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
398 case SIMDE_MM_FROUND_CUR_DIRECTION:
399 #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
400 r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.altivec_f64));
401 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
402 r_.neon_f64 = vrndiq_f64(a_.neon_f64);
403 #elif defined(simde_math_nearbyint)
404 SIMDE_VECTORIZE
405 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
406 r_.f64[i] = simde_math_nearbyint(a_.f64[i]);
407 }
408 #else
409 HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
410 #endif
411 break;
412
413 case SIMDE_MM_FROUND_TO_NEAREST_INT:
414 #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
415 r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.altivec_f64));
416 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
417 r_.neon_f64 = vrndaq_f64(a_.neon_f64);
418 #elif defined(simde_math_roundeven)
419 SIMDE_VECTORIZE
420 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
421 r_.f64[i] = simde_math_roundeven(a_.f64[i]);
422 }
423 #else
424 HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
425 #endif
426 break;
427
428 case SIMDE_MM_FROUND_TO_NEG_INF:
429 #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
430 r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_floor(a_.altivec_f64));
431 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
432 r_.neon_f64 = vrndmq_f64(a_.neon_f64);
433 #else
434 SIMDE_VECTORIZE
435 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
436 r_.f64[i] = simde_math_floor(a_.f64[i]);
437 }
438 #endif
439 break;
440
441 case SIMDE_MM_FROUND_TO_POS_INF:
442 #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
443 r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_ceil(a_.altivec_f64));
444 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
445 r_.neon_f64 = vrndpq_f64(a_.neon_f64);
446 #elif defined(simde_math_ceil)
447 SIMDE_VECTORIZE
448 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
449 r_.f64[i] = simde_math_ceil(a_.f64[i]);
450 }
451 #else
452 HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
453 #endif
454 break;
455
456 case SIMDE_MM_FROUND_TO_ZERO:
457 #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
458 r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_trunc(a_.altivec_f64));
459 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
460 r_.neon_f64 = vrndq_f64(a_.neon_f64);
461 #else
462 SIMDE_VECTORIZE
463 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
464 r_.f64[i] = simde_math_trunc(a_.f64[i]);
465 }
466 #endif
467 break;
468
469 default:
470 HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
471 }
472
473 return simde__m128d_from_private(r_);
474 }
475 #if defined(SIMDE_X86_SSE4_1_NATIVE)
476 #define simde_mm_round_pd(a, rounding) _mm_round_pd(a, rounding)
477 #endif
478 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
479 #undef _mm_round_pd
480 #define _mm_round_pd(a, rounding) simde_mm_round_pd(a, rounding)
481 #endif
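/* Worked example of the rounding-mode constants, assuming
 * a = simde_mm_set_pd(-1.5, 2.5) (element 0 is 2.5, element 1 is -1.5);
 * results are listed element 0 first:
 *
 *   simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_NEG_INF)  ->  { 2.0, -2.0 }
 *   simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_POS_INF)  ->  { 3.0, -1.0 }
 *   simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_ZERO)     ->  { 2.0, -1.0 }
 */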
482
483 SIMDE_FUNCTION_ATTRIBUTES
484 simde__m128d
simde_mm_ceil_pd (simde__m128d a) {
486 return simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_POS_INF);
487 }
488 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
489 #undef _mm_ceil_pd
490 #define _mm_ceil_pd(a) simde_mm_ceil_pd(a)
491 #endif
492
493 SIMDE_FUNCTION_ATTRIBUTES
494 simde__m128
simde_mm_ceil_ps (simde__m128 a) {
496 return simde_mm_round_ps(a, SIMDE_MM_FROUND_TO_POS_INF);
497 }
498 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
499 #undef _mm_ceil_ps
500 #define _mm_ceil_ps(a) simde_mm_ceil_ps(a)
501 #endif
502
503 SIMDE_FUNCTION_ATTRIBUTES
504 simde__m128d
simde_mm_ceil_sd (simde__m128d a, simde__m128d b) {
506 #if defined(SIMDE_X86_SSE4_1_NATIVE)
507 return _mm_ceil_sd(a, b);
508 #else
509 simde__m128d_private
510 r_,
511 a_ = simde__m128d_to_private(a),
512 b_ = simde__m128d_to_private(b);
513
  #if defined(simde_math_ceil)
515 r_ = simde__m128d_to_private(simde_mm_set_pd(a_.f64[1], simde_math_ceil(b_.f64[0])));
516 #else
517 HEDLEY_UNREACHABLE();
518 #endif
519
520 return simde__m128d_from_private(r_);
521 #endif
522 }
523 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
524 #undef _mm_ceil_sd
525 #define _mm_ceil_sd(a, b) simde_mm_ceil_sd(a, b)
526 #endif
527
528 SIMDE_FUNCTION_ATTRIBUTES
529 simde__m128
simde_mm_ceil_ss (simde__m128 a, simde__m128 b) {
531 #if defined(SIMDE_X86_SSE4_1_NATIVE)
532 return _mm_ceil_ss(a, b);
533 #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
534 return simde_mm_move_ss(a, simde_mm_ceil_ps(b));
535 #else
536 simde__m128_private
537 r_,
538 a_ = simde__m128_to_private(a),
539 b_ = simde__m128_to_private(b);
540
541 #if defined(simde_math_ceilf)
542 r_ = simde__m128_to_private(simde_mm_set_ps(a_.f32[3], a_.f32[2], a_.f32[1], simde_math_ceilf(b_.f32[0])));
543 #else
544 HEDLEY_UNREACHABLE();
545 #endif
546
547 return simde__m128_from_private(r_);
548 #endif
549 }
550 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
551 #undef _mm_ceil_ss
552 #define _mm_ceil_ss(a, b) simde_mm_ceil_ss(a, b)
553 #endif
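/* Illustrative sketch: like the other *_ss operations, only element 0 is
 * computed from b; elements 1-3 are copied from a.
 *
 *   simde__m128 a = simde_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
 *   simde__m128 b = simde_mm_set_ps(9.0f, 9.0f, 9.0f, 1.25f);
 *   simde_mm_ceil_ss(a, b);   // -> { 2.0f, 2.0f, 3.0f, 4.0f } (element 0 first)
 */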
554
555 SIMDE_FUNCTION_ATTRIBUTES
556 simde__m128i
simde_mm_cmpeq_epi64 (simde__m128i a, simde__m128i b) {
558 #if defined(SIMDE_X86_SSE4_1_NATIVE)
559 return _mm_cmpeq_epi64(a, b);
560 #else
561 simde__m128i_private
562 r_,
563 a_ = simde__m128i_to_private(a),
564 b_ = simde__m128i_to_private(b);
565
566 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
567 r_.neon_u64 = vceqq_u64(a_.neon_u64, b_.neon_u64);
568 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
569 /* (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) */
570 uint32x4_t cmp = vceqq_u32(a_.neon_u32, b_.neon_u32);
571 uint32x4_t swapped = vrev64q_u32(cmp);
572 r_.neon_u32 = vandq_u32(cmp, swapped);
573 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
574 r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), a_.i64 == b_.i64);
575 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
576 r_.altivec_i64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed long long), vec_cmpeq(a_.altivec_i64, b_.altivec_i64));
577 #else
578 SIMDE_VECTORIZE
579 for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
580 r_.u64[i] = (a_.u64[i] == b_.u64[i]) ? ~UINT64_C(0) : UINT64_C(0);
581 }
582 #endif
583
584 return simde__m128i_from_private(r_);
585 #endif
586 }
587 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
588 #undef _mm_cmpeq_epi64
589 #define _mm_cmpeq_epi64(a, b) simde_mm_cmpeq_epi64(a, b)
590 #endif
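/* Illustrative note: as with the other integer comparisons, each 64-bit lane
 * of the result is all ones when the lanes compare equal and all zeros
 * otherwise, e.g. simde_mm_cmpeq_epi64(x, x) yields { ~UINT64_C(0), ~UINT64_C(0) }.
 * The A32 NEON path above works because a 64-bit lane is equal exactly when
 * both of its 32-bit halves are equal; vrev64q_u32 swaps the halves within
 * each lane, so the AND folds the two half-comparisons into every half. */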
591
592 SIMDE_FUNCTION_ATTRIBUTES
593 simde__m128i
simde_mm_cvtepi8_epi16 (simde__m128i a) {
595 #if defined(SIMDE_X86_SSE4_1_NATIVE)
596 return _mm_cvtepi8_epi16(a);
597 #elif defined(SIMDE_X86_SSE2_NATIVE)
598 return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
599 #else
600 simde__m128i_private
601 r_,
602 a_ = simde__m128i_to_private(a);
603
604 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
605 int8x16_t s8x16 = a_.neon_i8; /* xxxx xxxx xxxx DCBA */
606 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
607 r_.neon_i16 = s16x8;
608 #elif defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
609 r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, a_.i8,
610 -1, 0, -1, 1, -1, 2, -1, 3,
611 -1, 4, -1, 5, -1, 6, -1, 7));
612 r_.i16 >>= 8;
613 #elif defined(SIMDE_CONVERT_VECTOR_)
614 SIMDE_CONVERT_VECTOR_(r_.i16, a_.m64_private[0].i8);
615 #else
616 SIMDE_VECTORIZE
617 for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
618 r_.i16[i] = a_.i8[i];
619 }
620 #endif
621
622 return simde__m128i_from_private(r_);
623 #endif
624 }
625 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
626 #undef _mm_cvtepi8_epi16
627 #define _mm_cvtepi8_epi16(a) simde_mm_cvtepi8_epi16(a)
628 #endif
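/* Illustrative sketch: the low eight bytes of a are sign-extended; the upper
 * eight bytes are ignored.
 *
 *   simde__m128i a = simde_mm_set1_epi8(-5);
 *   simde_mm_cvtepi8_epi16(a);   // eight int16_t lanes, each equal to -5
 */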
629
630 SIMDE_FUNCTION_ATTRIBUTES
631 simde__m128i
simde_mm_cvtepi8_epi32 (simde__m128i a) {
633 #if defined(SIMDE_X86_SSE4_1_NATIVE)
634 return _mm_cvtepi8_epi32(a);
635 #elif defined(SIMDE_X86_SSE2_NATIVE)
636 __m128i tmp = _mm_unpacklo_epi8(a, a);
637 tmp = _mm_unpacklo_epi16(tmp, tmp);
638 return _mm_srai_epi32(tmp, 24);
639 #else
640 simde__m128i_private
641 r_,
642 a_ = simde__m128i_to_private(a);
643
644 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
645 int8x16_t s8x16 = a_.neon_i8; /* xxxx xxxx xxxx DCBA */
646 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
647 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
648 r_.neon_i32 = s32x4;
649 #elif defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
650 r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, a_.i8,
651 -1, -1, -1, 0, -1, -1, -1, 1,
652 -1, -1, -1, 2, -1, -1, -1, 3));
653 r_.i32 >>= 24;
654 #else
655 SIMDE_VECTORIZE
656 for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
657 r_.i32[i] = a_.i8[i];
658 }
659 #endif
660
661 return simde__m128i_from_private(r_);
662 #endif
663 }
664 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
665 #undef _mm_cvtepi8_epi32
666 #define _mm_cvtepi8_epi32(a) simde_mm_cvtepi8_epi32(a)
667 #endif
668
669 SIMDE_FUNCTION_ATTRIBUTES
670 simde__m128i
simde_mm_cvtepi8_epi64 (simde__m128i a) {
672 #if defined(SIMDE_X86_SSE4_1_NATIVE)
673 return _mm_cvtepi8_epi64(a);
674 #else
675 simde__m128i_private
676 r_,
677 a_ = simde__m128i_to_private(a);
678
679 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
680 int8x16_t s8x16 = a_.neon_i8; /* xxxx xxxx xxxx xxBA */
681 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */
682 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
683 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
684 r_.neon_i64 = s64x2;
685 #elif (!defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
    /* Disabled on x86 due to lack of a 64-bit arithmetic shift until
     * AVX-512 (at which point we would be using the native
     * _mm_cvtepi8_epi64 anyways). */
689 r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, a_.i8,
690 -1, -1, -1, -1, -1, -1, -1, 0,
691 -1, -1, -1, -1, -1, -1, -1, 1));
692 r_.i64 >>= 56;
693 #else
694 SIMDE_VECTORIZE
695 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
696 r_.i64[i] = a_.i8[i];
697 }
698 #endif
699
700 return simde__m128i_from_private(r_);
701 #endif
702 }
703 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
704 #undef _mm_cvtepi8_epi64
705 #define _mm_cvtepi8_epi64(a) simde_mm_cvtepi8_epi64(a)
706 #endif
707
708 SIMDE_FUNCTION_ATTRIBUTES
709 simde__m128i
simde_mm_cvtepu8_epi16 (simde__m128i a) {
711 #if defined(SIMDE_X86_SSE4_1_NATIVE)
712 return _mm_cvtepu8_epi16(a);
713 #elif defined(SIMDE_X86_SSE2_NATIVE)
714 return _mm_unpacklo_epi8(a, _mm_setzero_si128());
715 #else
716 simde__m128i_private
717 r_,
718 a_ = simde__m128i_to_private(a);
719
720 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
721 uint8x16_t u8x16 = a_.neon_u8; /* xxxx xxxx xxxx DCBA */
722 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
723 r_.neon_u16 = u16x8;
724 #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
725 __typeof__(r_.i8) z = { 0, };
726 r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z,
727 0, 16, 1, 17, 2, 18, 3, 19,
728 4, 20, 5, 21, 6, 22, 7, 23));
729 #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_CLANG_45541) && (!defined(SIMDE_ARCH_POWER) || !defined(__clang__))
730 SIMDE_CONVERT_VECTOR_(r_.i16, a_.m64_private[0].u8);
731 #else
732 SIMDE_VECTORIZE
733 for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
734 r_.i16[i] = a_.u8[i];
735 }
736 #endif
737
738 return simde__m128i_from_private(r_);
739 #endif
740 }
741 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
742 #undef _mm_cvtepu8_epi16
743 #define _mm_cvtepu8_epi16(a) simde_mm_cvtepu8_epi16(a)
744 #endif
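/* Illustrative sketch: unlike the epi8 variant above, the low eight bytes are
 * zero-extended, so negative input bytes become large positive values.
 *
 *   simde__m128i a = simde_mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, 0xFF));
 *   simde_mm_cvtepu8_epi16(a);   // eight 16-bit lanes, each equal to 255
 */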
745
746 SIMDE_FUNCTION_ATTRIBUTES
747 simde__m128i
simde_mm_cvtepu8_epi32 (simde__m128i a) {
749 #if defined(SIMDE_X86_SSE4_1_NATIVE)
750 return _mm_cvtepu8_epi32(a);
751 #elif defined(SIMDE_X86_SSSE3_NATIVE)
752 __m128i s = _mm_set_epi8(
753 0x80, 0x80, 0x80, 0x03, 0x80, 0x80, 0x80, 0x02,
754 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x00);
755 return _mm_shuffle_epi8(a, s);
756 #elif defined(SIMDE_X86_SSE2_NATIVE)
757 __m128i z = _mm_setzero_si128();
758 return _mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z);
759 #else
760 simde__m128i_private
761 r_,
762 a_ = simde__m128i_to_private(a);
763
764 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
765 uint8x16_t u8x16 = a_.neon_u8; /* xxxx xxxx xxxx DCBA */
766 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
767 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
768 r_.neon_u32 = u32x4;
769 #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
770 __typeof__(r_.i8) z = { 0, };
771 r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z,
772 0, 17, 18, 19, 1, 21, 22, 23,
773 2, 25, 26, 27, 3, 29, 30, 31));
774 #else
775 SIMDE_VECTORIZE
776 for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
777 r_.i32[i] = a_.u8[i];
778 }
779 #endif
780
781 return simde__m128i_from_private(r_);
782 #endif
783 }
784 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
785 #undef _mm_cvtepu8_epi32
786 #define _mm_cvtepu8_epi32(a) simde_mm_cvtepu8_epi32(a)
787 #endif
788
789 SIMDE_FUNCTION_ATTRIBUTES
790 simde__m128i
simde_mm_cvtepu8_epi64 (simde__m128i a) {
792 #if defined(SIMDE_X86_SSE4_1_NATIVE)
793 return _mm_cvtepu8_epi64(a);
794 #elif defined(SIMDE_X86_SSSE3_NATIVE)
795 __m128i s = _mm_set_epi8(
796 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01,
797 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00);
798 return _mm_shuffle_epi8(a, s);
799 #elif defined(SIMDE_X86_SSE2_NATIVE)
800 __m128i z = _mm_setzero_si128();
801 return _mm_unpacklo_epi32(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z), z);
802 #else
803 simde__m128i_private
804 r_,
805 a_ = simde__m128i_to_private(a);
806
807 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
808 uint8x16_t u8x16 = a_.neon_u8; /* xxxx xxxx xxxx xxBA */
809 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */
810 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
811 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
812 r_.neon_u64 = u64x2;
813 #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
814 __typeof__(r_.i8) z = { 0, };
815 r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z,
816 0, 17, 18, 19, 20, 21, 22, 23,
817 1, 25, 26, 27, 28, 29, 30, 31));
818 #else
819 SIMDE_VECTORIZE
820 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
821 r_.i64[i] = a_.u8[i];
822 }
823 #endif
824
825 return simde__m128i_from_private(r_);
826 #endif
827 }
828 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
829 #undef _mm_cvtepu8_epi64
830 #define _mm_cvtepu8_epi64(a) simde_mm_cvtepu8_epi64(a)
831 #endif
832
833 SIMDE_FUNCTION_ATTRIBUTES
834 simde__m128i
simde_mm_cvtepi16_epi32 (simde__m128i a) {
836 #if defined(SIMDE_X86_SSE4_1_NATIVE)
837 return _mm_cvtepi16_epi32(a);
838 #elif defined(SIMDE_X86_SSE2_NATIVE)
839 return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
840 #else
841 simde__m128i_private
842 r_,
843 a_ = simde__m128i_to_private(a);
844
845 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
846 r_.neon_i32 = vmovl_s16(vget_low_s16(a_.neon_i16));
847 #elif !defined(SIMDE_ARCH_X86) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
848 r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, a_.i16, 8, 0, 10, 1, 12, 2, 14, 3));
849 r_.i32 >>= 16;
850 #else
851 SIMDE_VECTORIZE
852 for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
853 r_.i32[i] = a_.i16[i];
854 }
855 #endif
856
857 return simde__m128i_from_private(r_);
858 #endif
859 }
860 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
861 #undef _mm_cvtepi16_epi32
862 #define _mm_cvtepi16_epi32(a) simde_mm_cvtepi16_epi32(a)
863 #endif
864
865 SIMDE_FUNCTION_ATTRIBUTES
866 simde__m128i
simde_mm_cvtepu16_epi32 (simde__m128i a) {
868 #if defined(SIMDE_X86_SSE4_1_NATIVE)
869 return _mm_cvtepu16_epi32(a);
870 #elif defined(SIMDE_X86_SSE2_NATIVE)
871 return _mm_unpacklo_epi16(a, _mm_setzero_si128());
872 #else
873 simde__m128i_private
874 r_,
875 a_ = simde__m128i_to_private(a);
876
877 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
878 r_.neon_u32 = vmovl_u16(vget_low_u16(a_.neon_u16));
879 #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
880 __typeof__(r_.u16) z = { 0, };
881 r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.u16, z,
882 0, 9, 1, 11, 2, 13, 3, 15));
883 #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_CLANG_45541) && (!defined(SIMDE_ARCH_POWER) || !defined(__clang__))
884 SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].u16);
885 #else
886 SIMDE_VECTORIZE
887 for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
888 r_.i32[i] = a_.u16[i];
889 }
890 #endif
891
892 return simde__m128i_from_private(r_);
893 #endif
894 }
895 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
896 #undef _mm_cvtepu16_epi32
897 #define _mm_cvtepu16_epi32(a) simde_mm_cvtepu16_epi32(a)
898 #endif
899
900 SIMDE_FUNCTION_ATTRIBUTES
901 simde__m128i
simde_mm_cvtepu16_epi64 (simde__m128i a) {
903 #if defined(SIMDE_X86_SSE4_1_NATIVE)
904 return _mm_cvtepu16_epi64(a);
905 #elif defined(SIMDE_X86_SSE2_NATIVE)
906 __m128i z = _mm_setzero_si128();
907 return _mm_unpacklo_epi32(_mm_unpacklo_epi16(a, z), z);
908 #else
909 simde__m128i_private
910 r_,
911 a_ = simde__m128i_to_private(a);
912
913 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
914 uint16x8_t u16x8 = a_.neon_u16; /* xxxx xxxx xxxx 0B0A */
915 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
916 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
917 r_.neon_u64 = u64x2;
918 #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
919 __typeof__(r_.u16) z = { 0, };
920 r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.u16, z,
921 0, 9, 10, 11,
922 1, 13, 14, 15));
923 #else
924 SIMDE_VECTORIZE
925 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
926 r_.i64[i] = a_.u16[i];
927 }
928 #endif
929
930 return simde__m128i_from_private(r_);
931 #endif
932 }
933 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
934 #undef _mm_cvtepu16_epi64
935 #define _mm_cvtepu16_epi64(a) simde_mm_cvtepu16_epi64(a)
936 #endif
937
938 SIMDE_FUNCTION_ATTRIBUTES
939 simde__m128i
simde_mm_cvtepi16_epi64 (simde__m128i a) {
941 #if defined(SIMDE_X86_SSE4_1_NATIVE)
942 return _mm_cvtepi16_epi64(a);
943 #else
944 simde__m128i_private
945 r_,
946 a_ = simde__m128i_to_private(a);
947
948 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
949 int16x8_t s16x8 = a_.neon_i16; /* xxxx xxxx xxxx 0B0A */
950 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
951 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
952 r_.neon_i64 = s64x2;
953 #elif (!defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
954 r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, a_.i16,
955 8, 9, 10, 0,
956 12, 13, 14, 1));
957 r_.i64 >>= 48;
958 #else
959 SIMDE_VECTORIZE
960 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
961 r_.i64[i] = a_.i16[i];
962 }
963 #endif
964
965 return simde__m128i_from_private(r_);
966 #endif
967 }
968 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
969 #undef _mm_cvtepi16_epi64
970 #define _mm_cvtepi16_epi64(a) simde_mm_cvtepi16_epi64(a)
971 #endif
972
973 SIMDE_FUNCTION_ATTRIBUTES
974 simde__m128i
simde_mm_cvtepi32_epi64 (simde__m128i a) {
976 #if defined(SIMDE_X86_SSE4_1_NATIVE)
977 return _mm_cvtepi32_epi64(a);
978 #elif defined(SIMDE_X86_SSE2_NATIVE)
979 __m128i tmp = _mm_shuffle_epi32(a, 0x50);
980 tmp = _mm_srai_epi32(tmp, 31);
981 tmp = _mm_shuffle_epi32(tmp, 0xed);
982 return _mm_unpacklo_epi32(a, tmp);
983 #else
984 simde__m128i_private
985 r_,
986 a_ = simde__m128i_to_private(a);
987
988 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
989 r_.neon_i64 = vmovl_s32(vget_low_s32(a_.neon_i32));
990 #elif !defined(SIMDE_ARCH_X86) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
991 r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, a_.i32, -1, 0, -1, 1));
992 r_.i64 >>= 32;
993 #elif defined(SIMDE_CONVERT_VECTOR_)
994 SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].i32);
995 #else
996 SIMDE_VECTORIZE
997 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
998 r_.i64[i] = a_.i32[i];
999 }
1000 #endif
1001
1002 return simde__m128i_from_private(r_);
1003 #endif
1004 }
1005 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1006 #undef _mm_cvtepi32_epi64
1007 #define _mm_cvtepi32_epi64(a) simde_mm_cvtepi32_epi64(a)
1008 #endif
1009
1010 SIMDE_FUNCTION_ATTRIBUTES
1011 simde__m128i
simde_mm_cvtepu32_epi64 (simde__m128i a) {
1013 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1014 return _mm_cvtepu32_epi64(a);
1015 #elif defined(SIMDE_X86_SSE2_NATIVE)
1016 return _mm_unpacklo_epi32(a, _mm_setzero_si128());
1017 #else
1018 simde__m128i_private
1019 r_,
1020 a_ = simde__m128i_to_private(a);
1021
1022 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1023 r_.neon_u64 = vmovl_u32(vget_low_u32(a_.neon_u32));
1024 #elif defined(SIMDE_VECTOR_SCALAR) && defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
1025 __typeof__(r_.u32) z = { 0, };
1026 r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(32, 16, a_.u32, z, 0, 4, 1, 6));
1027 #elif defined(SIMDE_CONVERT_VECTOR_)
1028 SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].u32);
1029 #else
1030 SIMDE_VECTORIZE
1031 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
1032 r_.i64[i] = a_.u32[i];
1033 }
1034 #endif
1035
1036 return simde__m128i_from_private(r_);
1037 #endif
1038 }
1039 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1040 #undef _mm_cvtepu32_epi64
1041 #define _mm_cvtepu32_epi64(a) simde_mm_cvtepu32_epi64(a)
1042 #endif
1043
1044 SIMDE_FUNCTION_ATTRIBUTES
1045 simde__m128d
simde_mm_dp_pd (simde__m128d a, simde__m128d b, const int imm8)
1047 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
1048 simde__m128d_private
1049 r_,
1050 a_ = simde__m128d_to_private(a),
1051 b_ = simde__m128d_to_private(b);
1052
1053 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1054 r_.neon_f64 = vmulq_f64(a_.neon_f64, b_.neon_f64);
1055
1056 switch (imm8) {
1057 case 0xff:
1058 r_.neon_f64 = vaddq_f64(r_.neon_f64, vextq_f64(r_.neon_f64, r_.neon_f64, 1));
1059 break;
1060 case 0x13:
1061 r_.neon_f64 = vdupq_lane_f64(vget_low_f64(r_.neon_f64), 0);
1062 break;
1063 default:
1064 { /* imm8 is a compile-time constant, so this all becomes just a load */
1065 uint64_t mask_data[] = {
1066 (imm8 & (1 << 4)) ? ~UINT64_C(0) : UINT64_C(0),
1067 (imm8 & (1 << 5)) ? ~UINT64_C(0) : UINT64_C(0),
1068 };
1069 r_.neon_f64 = vreinterpretq_f64_u64(vandq_u64(vld1q_u64(mask_data), vreinterpretq_u64_f64(r_.neon_f64)));
1070 }
1071
1072 r_.neon_f64 = vdupq_n_f64(vaddvq_f64(r_.neon_f64));
1073
1074 {
1075 uint64_t mask_data[] = {
1076 (imm8 & 1) ? ~UINT64_C(0) : UINT64_C(0),
1077 (imm8 & 2) ? ~UINT64_C(0) : UINT64_C(0)
1078 };
1079 r_.neon_f64 = vreinterpretq_f64_u64(vandq_u64(vld1q_u64(mask_data), vreinterpretq_u64_f64(r_.neon_f64)));
1080 }
1081 break;
1082 }
1083 #else
1084 simde_float64 sum = SIMDE_FLOAT64_C(0.0);
1085
1086 SIMDE_VECTORIZE_REDUCTION(+:sum)
1087 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1088 sum += ((imm8 >> (i + 4)) & 1) ? (a_.f64[i] * b_.f64[i]) : 0.0;
1089 }
1090
1091 SIMDE_VECTORIZE
1092 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1093 r_.f64[i] = ((imm8 >> i) & 1) ? sum : 0.0;
1094 }
1095 #endif
1096
1097 return simde__m128d_from_private(r_);
1098 }
1099 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1100 # define simde_mm_dp_pd(a, b, imm8) _mm_dp_pd(a, b, imm8)
1101 #endif
1102 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1103 #undef _mm_dp_pd
1104 #define _mm_dp_pd(a, b, imm8) simde_mm_dp_pd(a, b, imm8)
1105 #endif
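/* Worked example of the imm8 encoding: bits 4 and 5 choose which products
 * enter the sum, bits 0 and 1 choose which result lanes receive it.
 *
 *   // sum both products, store the sum only in lane 0 (lane 1 becomes 0.0):
 *   simde__m128d r = simde_mm_dp_pd(a, b, 0x31);
 *   // lane 0 == a[0]*b[0] + a[1]*b[1], lane 1 == 0.0
 */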
1106
1107 SIMDE_FUNCTION_ATTRIBUTES
1108 simde__m128
simde_mm_dp_ps (simde__m128 a, simde__m128 b, const int imm8)
1110 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
1111 simde__m128_private
1112 r_,
1113 a_ = simde__m128_to_private(a),
1114 b_ = simde__m128_to_private(b);
1115
1116 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1117 r_.neon_f32 = vmulq_f32(a_.neon_f32, b_.neon_f32);
1118
1119 switch (imm8) {
1120 case 0xff:
1121 r_.neon_f32 = vdupq_n_f32(vaddvq_f32(r_.neon_f32));
1122 break;
1123 case 0x7f:
1124 r_.neon_f32 = vsetq_lane_f32(0, r_.neon_f32, 3);
1125 r_.neon_f32 = vdupq_n_f32(vaddvq_f32(r_.neon_f32));
1126 break;
1127 default:
1128 {
1129 {
1130 uint32_t mask_data[] = {
1131 (imm8 & (1 << 4)) ? ~UINT32_C(0) : UINT32_C(0),
1132 (imm8 & (1 << 5)) ? ~UINT32_C(0) : UINT32_C(0),
1133 (imm8 & (1 << 6)) ? ~UINT32_C(0) : UINT32_C(0),
1134 (imm8 & (1 << 7)) ? ~UINT32_C(0) : UINT32_C(0)
1135 };
1136 r_.neon_f32 = vreinterpretq_f32_u32(vandq_u32(vld1q_u32(mask_data), vreinterpretq_u32_f32(r_.neon_f32)));
1137 }
1138
1139 r_.neon_f32 = vdupq_n_f32(vaddvq_f32(r_.neon_f32));
1140
1141 {
1142 uint32_t mask_data[] = {
1143 (imm8 & 1) ? ~UINT32_C(0) : UINT32_C(0),
1144 (imm8 & 2) ? ~UINT32_C(0) : UINT32_C(0),
1145 (imm8 & 4) ? ~UINT32_C(0) : UINT32_C(0),
1146 (imm8 & 8) ? ~UINT32_C(0) : UINT32_C(0)
1147 };
1148 r_.neon_f32 = vreinterpretq_f32_u32(vandq_u32(vld1q_u32(mask_data), vreinterpretq_u32_f32(r_.neon_f32)));
1149 }
1150 }
1151 break;
1152 }
1153 #else
1154 simde_float32 sum = SIMDE_FLOAT32_C(0.0);
1155
1156 SIMDE_VECTORIZE_REDUCTION(+:sum)
1157 for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1158 sum += ((imm8 >> (i + 4)) & 1) ? (a_.f32[i] * b_.f32[i]) : SIMDE_FLOAT32_C(0.0);
1159 }
1160
1161 SIMDE_VECTORIZE
1162 for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1163 r_.f32[i] = ((imm8 >> i) & 1) ? sum : SIMDE_FLOAT32_C(0.0);
1164 }
1165 #endif
1166
1167 return simde__m128_from_private(r_);
1168 }
1169 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1170 # define simde_mm_dp_ps(a, b, imm8) _mm_dp_ps(a, b, imm8)
1171 #endif
1172 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1173 #undef _mm_dp_ps
1174 #define _mm_dp_ps(a, b, imm8) simde_mm_dp_ps(a, b, imm8)
1175 #endif
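/* Worked example for the single-precision version: imm8 = 0xFF computes the
 * full four-element dot product and broadcasts it to every lane, while
 * imm8 = 0x71 sums only the first three products and writes the result to
 * lane 0 alone.
 *
 *   simde__m128 dot  = simde_mm_dp_ps(a, b, 0xFF);   // { d, d, d, d }
 *   simde__m128 dot3 = simde_mm_dp_ps(a, b, 0x71);   // { d3, 0, 0, 0 }
 */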
1176
1177 #if defined(simde_mm_extract_epi8)
1178 # undef simde_mm_extract_epi8
1179 #endif
1180 SIMDE_FUNCTION_ATTRIBUTES
1181 int8_t
simde_mm_extract_epi8 (simde__m128i a, const int imm8)
1183 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) {
1184 simde__m128i_private
1185 a_ = simde__m128i_to_private(a);
1186
1187 #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1188 #if defined(SIMDE_BUG_GCC_95227)
1189 (void) a_;
1190 (void) imm8;
1191 #endif
1192 return vec_extract(a_.altivec_i8, imm8);
1193 #else
1194 return a_.i8[imm8 & 15];
1195 #endif
1196 }
1197 #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_EXTRACT_EPI8)
1198 # define simde_mm_extract_epi8(a, imm8) HEDLEY_STATIC_CAST(int8_t, _mm_extract_epi8(a, imm8))
1199 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1200 # define simde_mm_extract_epi8(a, imm8) vgetq_lane_s8(simde__m128i_to_private(a).neon_i8, imm8)
1201 #endif
1202 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1203 #undef _mm_extract_epi8
1204 #define _mm_extract_epi8(a, imm8) HEDLEY_STATIC_CAST(int, simde_mm_extract_epi8(a, imm8))
1205 #endif
1206
1207 #if defined(simde_mm_extract_epi32)
1208 # undef simde_mm_extract_epi32
1209 #endif
1210 SIMDE_FUNCTION_ATTRIBUTES
1211 int32_t
simde_mm_extract_epi32 (simde__m128i a, const int imm8)
1213 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) {
1214 simde__m128i_private
1215 a_ = simde__m128i_to_private(a);
1216
1217 #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1218 #if defined(SIMDE_BUG_GCC_95227)
1219 (void) a_;
1220 (void) imm8;
1221 #endif
1222 return vec_extract(a_.altivec_i32, imm8);
1223 #else
1224 return a_.i32[imm8 & 3];
1225 #endif
1226 }
1227 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1228 # define simde_mm_extract_epi32(a, imm8) _mm_extract_epi32(a, imm8)
1229 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1230 # define simde_mm_extract_epi32(a, imm8) vgetq_lane_s32(simde__m128i_to_private(a).neon_i32, imm8)
1231 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1232 # define simde_mm_extract_epi32(a, imm8) HEDLEY_STATIC_CAST(int32_t, vec_extract(simde__m128i_to_private(a).altivec_i32, imm8))
1233 #endif
1234 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1235 #undef _mm_extract_epi32
1236 #define _mm_extract_epi32(a, imm8) simde_mm_extract_epi32(a, imm8)
1237 #endif
1238
1239 #if defined(simde_mm_extract_epi64)
1240 # undef simde_mm_extract_epi64
1241 #endif
1242 SIMDE_FUNCTION_ATTRIBUTES
1243 int64_t
simde_mm_extract_epi64 (simde__m128i a, const int imm8)
1245 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) {
1246 simde__m128i_private
1247 a_ = simde__m128i_to_private(a);
1248
1249 #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
1250 #if defined(SIMDE_BUG_GCC_95227)
1251 (void) a_;
1252 (void) imm8;
1253 #endif
1254 return vec_extract(a_.altivec_i64, imm8);
1255 #else
1256 return a_.i64[imm8 & 1];
1257 #endif
1258 }
1259 #if defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ARCH_AMD64)
1260 # define simde_mm_extract_epi64(a, imm8) _mm_extract_epi64(a, imm8)
1261 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1262 # define simde_mm_extract_epi64(a, imm8) vgetq_lane_s64(simde__m128i_to_private(a).neon_i64, imm8)
1263 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
1264 # define simde_mm_extract_epi64(a, imm8) HEDLEY_STATIC_CAST(int64_t, vec_extract(simde__m128i_to_private(a).altivec_i64, imm8))
1265 #endif
1266 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
1267 #undef _mm_extract_epi64
1268 #define _mm_extract_epi64(a, imm8) simde_mm_extract_epi64(a, imm8)
1269 #endif
1270
1271 #if defined(simde_mm_extract_ps)
1272 # undef simde_mm_extract_ps
1273 #endif
1274 SIMDE_FUNCTION_ATTRIBUTES
1275 int32_t
simde_mm_extract_ps (simde__m128 a, const int imm8)
1277 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) {
1278 simde__m128_private
1279 a_ = simde__m128_to_private(a);
1280
1281 return a_.i32[imm8 & 3];
1282 }
1283 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1284 #define simde_mm_extract_ps(a, imm8) _mm_extract_ps(a, imm8)
1285 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1286 #define simde_mm_extract_ps(a, imm8) vgetq_lane_s32(simde__m128_to_private(a).neon_i32, imm8)
1287 #endif
1288 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1289 #undef _mm_extract_ps
1290 #define _mm_extract_ps(a, imm8) simde_mm_extract_ps(a, imm8)
1291 #endif
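/* Note: as with the native _mm_extract_ps, the return value is the raw
 * IEEE-754 bit pattern of the selected element reinterpreted as an int32_t,
 * not a numeric conversion; extracting 1.0f yields 0x3F800000. */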
1292
1293 SIMDE_FUNCTION_ATTRIBUTES
1294 simde__m128d
simde_mm_floor_pd (simde__m128d a) {
1296 return simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_NEG_INF);
1297 }
1298 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1299 #undef _mm_floor_pd
1300 #define _mm_floor_pd(a) simde_mm_floor_pd(a)
1301 #endif
1302
1303 SIMDE_FUNCTION_ATTRIBUTES
1304 simde__m128
simde_mm_floor_ps (simde__m128 a) {
1306 return simde_mm_round_ps(a, SIMDE_MM_FROUND_TO_NEG_INF);
1307 }
1308 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1309 #undef _mm_floor_ps
1310 #define _mm_floor_ps(a) simde_mm_floor_ps(a)
1311 #endif
1312
1313 SIMDE_FUNCTION_ATTRIBUTES
1314 simde__m128d
simde_mm_floor_sd (simde__m128d a, simde__m128d b) {
1316 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1317 return _mm_floor_sd(a, b);
1318 #else
1319 simde__m128d_private
1320 r_,
1321 a_ = simde__m128d_to_private(a),
1322 b_ = simde__m128d_to_private(b);
1323
1324 #if defined(simde_math_floor)
1325 r_.f64[0] = simde_math_floor(b_.f64[0]);
1326 r_.f64[1] = a_.f64[1];
1327 #else
1328 HEDLEY_UNREACHABLE();
1329 #endif
1330
1331 return simde__m128d_from_private(r_);
1332 #endif
1333 }
1334 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1335 #undef _mm_floor_sd
1336 #define _mm_floor_sd(a, b) simde_mm_floor_sd(a, b)
1337 #endif
1338
1339 SIMDE_FUNCTION_ATTRIBUTES
1340 simde__m128
simde_mm_floor_ss (simde__m128 a, simde__m128 b) {
1342 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1343 return _mm_floor_ss(a, b);
1344 #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1345 return simde_mm_move_ss(a, simde_mm_floor_ps(b));
1346 #else
1347 simde__m128_private
1348 r_,
1349 a_ = simde__m128_to_private(a),
1350 b_ = simde__m128_to_private(b);
1351
1352 #if defined(simde_math_floorf)
1353 r_.f32[0] = simde_math_floorf(b_.f32[0]);
1354 for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1355 r_.f32[i] = a_.f32[i];
1356 }
1357 #else
1358 HEDLEY_UNREACHABLE();
1359 #endif
1360
1361 return simde__m128_from_private(r_);
1362 #endif
1363 }
1364 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1365 #undef _mm_floor_ss
1366 #define _mm_floor_ss(a, b) simde_mm_floor_ss(a, b)
1367 #endif
1368
1369 SIMDE_FUNCTION_ATTRIBUTES
1370 simde__m128i
simde_mm_insert_epi8 (simde__m128i a, int i, const int imm8)
1372 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) {
1373 simde__m128i_private
1374 r_ = simde__m128i_to_private(a);
1375
1376 r_.i8[imm8] = HEDLEY_STATIC_CAST(int8_t, i);
1377
1378 return simde__m128i_from_private(r_);
1379 }
1380 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1381 /* clang-3.8 returns an incompatible type, so we need the cast. MSVC
1382 * can't handle the cast ("error C2440: 'type cast': cannot convert
1383 * from '__m128i' to '__m128i'"). */
1384 #if defined(__clang__)
1385 #define simde_mm_insert_epi8(a, i, imm8) HEDLEY_STATIC_CAST(__m128i, _mm_insert_epi8(a, i, imm8))
1386 #else
1387 #define simde_mm_insert_epi8(a, i, imm8) _mm_insert_epi8(a, i, imm8)
1388 #endif
1389 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
# define simde_mm_insert_epi8(a, i, imm8) simde__m128i_from_neon_i8(vsetq_lane_s8(i, simde__m128i_to_private(a).neon_i8, imm8))
1391 #endif
1392 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1393 #undef _mm_insert_epi8
1394 #define _mm_insert_epi8(a, i, imm8) simde_mm_insert_epi8(a, i, imm8)
1395 #endif
1396
1397 SIMDE_FUNCTION_ATTRIBUTES
1398 simde__m128i
simde_mm_insert_epi32 (simde__m128i a, int i, const int imm8)
1400 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) {
1401 simde__m128i_private
1402 r_ = simde__m128i_to_private(a);
1403
1404 r_.i32[imm8] = HEDLEY_STATIC_CAST(int32_t, i);
1405
1406 return simde__m128i_from_private(r_);
1407 }
1408 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1409 #if defined(__clang__)
1410 #define simde_mm_insert_epi32(a, i, imm8) HEDLEY_STATIC_CAST(__m128i, _mm_insert_epi32(a, i, imm8))
1411 #else
1412 #define simde_mm_insert_epi32(a, i, imm8) _mm_insert_epi32(a, i, imm8)
1413 #endif
1414 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
# define simde_mm_insert_epi32(a, i, imm8) simde__m128i_from_neon_i32(vsetq_lane_s32(i, simde__m128i_to_private(a).neon_i32, imm8))
1416 #endif
1417 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1418 #undef _mm_insert_epi32
1419 #define _mm_insert_epi32(a, i, imm8) simde_mm_insert_epi32(a, i, imm8)
1420 #endif
1421
1422 SIMDE_FUNCTION_ATTRIBUTES
1423 simde__m128i
simde_mm_insert_epi64 (simde__m128i a, int64_t i, const int imm8)
1425 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) {
1426 #if defined(SIMDE_BUG_GCC_94482)
1427 simde__m128i_private
1428 a_ = simde__m128i_to_private(a);
1429
1430 switch(imm8) {
1431 case 0:
1432 return simde_mm_set_epi64x(a_.i64[1], i);
1433 break;
1434 case 1:
1435 return simde_mm_set_epi64x(i, a_.i64[0]);
1436 break;
1437 default:
1438 HEDLEY_UNREACHABLE();
1439 break;
1440 }
1441 #else
1442 simde__m128i_private
1443 r_ = simde__m128i_to_private(a);
1444
1445 r_.i64[imm8] = i;
1446 return simde__m128i_from_private(r_);
1447 #endif
1448 }
1449 #if defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ARCH_AMD64)
1450 # define simde_mm_insert_epi64(a, i, imm8) _mm_insert_epi64(a, i, imm8)
1451 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
# define simde_mm_insert_epi64(a, i, imm8) simde__m128i_from_neon_i64(vsetq_lane_s64(i, simde__m128i_to_private(a).neon_i64, imm8))
1453 #endif
1454 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
1455 #undef _mm_insert_epi64
1456 #define _mm_insert_epi64(a, i, imm8) simde_mm_insert_epi64(a, i, imm8)
1457 #endif
1458
1459 SIMDE_FUNCTION_ATTRIBUTES
1460 simde__m128
simde_mm_insert_ps (simde__m128 a, simde__m128 b, const int imm8)
1462 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
1463 simde__m128_private
1464 r_,
1465 a_ = simde__m128_to_private(a),
1466 b_ = simde__m128_to_private(b);
1467
  /* Select the source element from b first so that lane 0 of a is not
   * clobbered when the destination lane (imm8 bits 5:4) is non-zero. */
  simde_float32 tmp_ = b_.f32[(imm8 >> 6) & 3];
  a_.f32[(imm8 >> 4) & 3] = tmp_;

  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
    r_.f32[i] = ((imm8 >> i) & 1) ? SIMDE_FLOAT32_C(0.0) : a_.f32[i];
1474 }
1475
1476 return simde__m128_from_private(r_);
1477 }
1478 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1479 # define simde_mm_insert_ps(a, b, imm8) _mm_insert_ps(a, b, imm8)
1480 #endif
1481 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1482 #undef _mm_insert_ps
1483 #define _mm_insert_ps(a, b, imm8) simde_mm_insert_ps(a, b, imm8)
1484 #endif
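/* Worked example of the imm8 fields: bits 7:6 select the source lane of b,
 * bits 5:4 select the destination lane in a, and bits 3:0 are a zero mask
 * applied to the result.  With imm8 = 0x30, lane 0 of b replaces lane 3 of a:
 *
 *   simde__m128 r = simde_mm_insert_ps(a, b, 0x30);
 *   // r == { a[0], a[1], a[2], b[0] }
 */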
1485
1486 SIMDE_FUNCTION_ATTRIBUTES
1487 simde__m128i
simde_mm_max_epi8 (simde__m128i a, simde__m128i b) {
1489 #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI)
1490 return _mm_max_epi8(a, b);
1491 #else
1492 simde__m128i_private
1493 r_,
1494 a_ = simde__m128i_to_private(a),
1495 b_ = simde__m128i_to_private(b);
1496
1497 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1498 r_.neon_i8 = vmaxq_s8(a_.neon_i8, b_.neon_i8);
1499 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1500 r_.wasm_v128 = wasm_i8x16_max(a_.wasm_v128, b_.wasm_v128);
1501 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1502 r_.altivec_i8 = vec_max(a_.altivec_i8, b_.altivec_i8);
1503 #else
1504 SIMDE_VECTORIZE
1505 for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1506 r_.i8[i] = a_.i8[i] > b_.i8[i] ? a_.i8[i] : b_.i8[i];
1507 }
1508 #endif
1509
1510 return simde__m128i_from_private(r_);
1511 #endif
1512 }
1513 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1514 #undef _mm_max_epi8
1515 #define _mm_max_epi8(a, b) simde_mm_max_epi8(a, b)
1516 #endif
1517
1518 SIMDE_FUNCTION_ATTRIBUTES
1519 simde__m128i
1520 simde_mm_max_epi32 (simde__m128i a, simde__m128i b) {
1521 #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI)
1522 return _mm_max_epi32(a, b);
1523 #else
1524 simde__m128i_private
1525 r_,
1526 a_ = simde__m128i_to_private(a),
1527 b_ = simde__m128i_to_private(b);
1528
1529 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1530 r_.neon_i32 = vmaxq_s32(a_.neon_i32, b_.neon_i32);
1531 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1532 r_.wasm_v128 = wasm_i32x4_max(a_.wasm_v128, b_.wasm_v128);
1533 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1534 r_.altivec_i32 = vec_max(a_.altivec_i32, b_.altivec_i32);
1535 #else
1536 SIMDE_VECTORIZE
1537 for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1538 r_.i32[i] = a_.i32[i] > b_.i32[i] ? a_.i32[i] : b_.i32[i];
1539 }
1540 #endif
1541
1542 return simde__m128i_from_private(r_);
1543 #endif
1544 }
1545 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1546 #undef _mm_max_epi32
1547 #define _mm_max_epi32(a, b) simde_mm_max_epi32(a, b)
1548 #endif
1549
1550 SIMDE_FUNCTION_ATTRIBUTES
1551 simde__m128i
1552 simde_mm_max_epu16 (simde__m128i a, simde__m128i b) {
1553 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1554 return _mm_max_epu16(a, b);
1555 #else
1556 simde__m128i_private
1557 r_,
1558 a_ = simde__m128i_to_private(a),
1559 b_ = simde__m128i_to_private(b);
1560
1561 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1562 r_.neon_u16 = vmaxq_u16(a_.neon_u16, b_.neon_u16);
1563 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1564 r_.wasm_v128 = wasm_u16x8_max(a_.wasm_v128, b_.wasm_v128);
1565 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1566 r_.altivec_u16 = vec_max(a_.altivec_u16, b_.altivec_u16);
1567 #else
1568 SIMDE_VECTORIZE
1569 for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
1570 r_.u16[i] = a_.u16[i] > b_.u16[i] ? a_.u16[i] : b_.u16[i];
1571 }
1572 #endif
1573
1574 return simde__m128i_from_private(r_);
1575 #endif
1576 }
1577 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1578 #undef _mm_max_epu16
1579 #define _mm_max_epu16(a, b) simde_mm_max_epu16(a, b)
1580 #endif
1581
1582 SIMDE_FUNCTION_ATTRIBUTES
1583 simde__m128i
1584 simde_mm_max_epu32 (simde__m128i a, simde__m128i b) {
1585 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1586 return _mm_max_epu32(a, b);
1587 #else
1588 simde__m128i_private
1589 r_,
1590 a_ = simde__m128i_to_private(a),
1591 b_ = simde__m128i_to_private(b);
1592
1593 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1594 r_.neon_u32 = vmaxq_u32(a_.neon_u32, b_.neon_u32);
1595 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1596 r_.wasm_v128 = wasm_u32x4_max(a_.wasm_v128, b_.wasm_v128);
1597 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1598 r_.altivec_u32 = vec_max(a_.altivec_u32, b_.altivec_u32);
1599 #else
1600 SIMDE_VECTORIZE
1601 for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
1602 r_.u32[i] = a_.u32[i] > b_.u32[i] ? a_.u32[i] : b_.u32[i];
1603 }
1604 #endif
1605
1606 return simde__m128i_from_private(r_);
1607 #endif
1608 }
1609 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1610 #undef _mm_max_epu32
1611 #define _mm_max_epu32(a, b) simde_mm_max_epu32(a, b)
1612 #endif
1613
1614 SIMDE_FUNCTION_ATTRIBUTES
1615 simde__m128i
1616 simde_mm_min_epi8 (simde__m128i a, simde__m128i b) {
1617 #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI)
1618 return _mm_min_epi8(a, b);
1619 #else
1620 simde__m128i_private
1621 r_,
1622 a_ = simde__m128i_to_private(a),
1623 b_ = simde__m128i_to_private(b);
1624
1625 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1626 r_.neon_i8 = vminq_s8(a_.neon_i8, b_.neon_i8);
1627 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1628 r_.wasm_v128 = wasm_i8x16_min(a_.wasm_v128, b_.wasm_v128);
1629 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1630 r_.altivec_i8 = vec_min(a_.altivec_i8, b_.altivec_i8);
1631 #else
1632 SIMDE_VECTORIZE
1633 for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1634 r_.i8[i] = a_.i8[i] < b_.i8[i] ? a_.i8[i] : b_.i8[i];
1635 }
1636 #endif
1637
1638 return simde__m128i_from_private(r_);
1639 #endif
1640 }
1641 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1642 #undef _mm_min_epi8
1643 #define _mm_min_epi8(a, b) simde_mm_min_epi8(a, b)
1644 #endif
1645
1646 SIMDE_FUNCTION_ATTRIBUTES
1647 simde__m128i
1648 simde_mm_min_epi32 (simde__m128i a, simde__m128i b) {
1649 #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI)
1650 return _mm_min_epi32(a, b);
1651 #else
1652 simde__m128i_private
1653 r_,
1654 a_ = simde__m128i_to_private(a),
1655 b_ = simde__m128i_to_private(b);
1656
1657 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1658 r_.neon_i32 = vminq_s32(a_.neon_i32, b_.neon_i32);
1659 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1660 r_.wasm_v128 = wasm_i32x4_min(a_.wasm_v128, b_.wasm_v128);
1661 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1662 r_.altivec_i32 = vec_min(a_.altivec_i32, b_.altivec_i32);
1663 #else
1664 SIMDE_VECTORIZE
1665 for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1666 r_.i32[i] = a_.i32[i] < b_.i32[i] ? a_.i32[i] : b_.i32[i];
1667 }
1668 #endif
1669
1670 return simde__m128i_from_private(r_);
1671 #endif
1672 }
1673 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1674 #undef _mm_min_epi32
1675 #define _mm_min_epi32(a, b) simde_mm_min_epi32(a, b)
1676 #endif
1677
1678 SIMDE_FUNCTION_ATTRIBUTES
1679 simde__m128i
1680 simde_mm_min_epu16 (simde__m128i a, simde__m128i b) {
1681 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1682 return _mm_min_epu16(a, b);
1683 #else
1684 simde__m128i_private
1685 r_,
1686 a_ = simde__m128i_to_private(a),
1687 b_ = simde__m128i_to_private(b);
1688
1689 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1690 r_.neon_u16 = vminq_u16(a_.neon_u16, b_.neon_u16);
1691 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1692 r_.wasm_v128 = wasm_u16x8_min(a_.wasm_v128, b_.wasm_v128);
1693 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1694 r_.altivec_u16 = vec_min(a_.altivec_u16, b_.altivec_u16);
1695 #else
1696 SIMDE_VECTORIZE
1697 for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
1698 r_.u16[i] = a_.u16[i] < b_.u16[i] ? a_.u16[i] : b_.u16[i];
1699 }
1700 #endif
1701
1702 return simde__m128i_from_private(r_);
1703 #endif
1704 }
1705 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1706 #undef _mm_min_epu16
1707 #define _mm_min_epu16(a, b) simde_mm_min_epu16(a, b)
1708 #endif
1709
1710 SIMDE_FUNCTION_ATTRIBUTES
1711 simde__m128i
1712 simde_mm_min_epu32 (simde__m128i a, simde__m128i b) {
1713 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1714 return _mm_min_epu32(a, b);
1715 #else
1716 simde__m128i_private
1717 r_,
1718 a_ = simde__m128i_to_private(a),
1719 b_ = simde__m128i_to_private(b);
1720
1721 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1722 r_.neon_u32 = vminq_u32(a_.neon_u32, b_.neon_u32);
1723 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1724 r_.wasm_v128 = wasm_u32x4_min(a_.wasm_v128, b_.wasm_v128);
1725 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1726 r_.altivec_u32 = vec_min(a_.altivec_u32, b_.altivec_u32);
1727 #else
1728 SIMDE_VECTORIZE
1729 for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
1730 r_.u32[i] = a_.u32[i] < b_.u32[i] ? a_.u32[i] : b_.u32[i];
1731 }
1732 #endif
1733
1734 return simde__m128i_from_private(r_);
1735 #endif
1736 }
1737 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1738 #undef _mm_min_epu32
1739 #define _mm_min_epu32(a, b) simde_mm_min_epu32(a, b)
1740 #endif
1741
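/* _mm_minpos_epu16 places the smallest of the eight unsigned 16-bit elements
 * of a in u16[0], its index in u16[1], and zeroes the remaining elements.
 * For example (illustrative values), with a = { 9, 4, 7, 11, ... } the
 * result has u16[0] == 4 and u16[1] == 1. */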
1742 SIMDE_FUNCTION_ATTRIBUTES
1743 simde__m128i
1744 simde_mm_minpos_epu16 (simde__m128i a) {
1745 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1746 return _mm_minpos_epu16(a);
1747 #else
1748 simde__m128i_private
1749 r_ = simde__m128i_to_private(simde_mm_setzero_si128()),
1750 a_ = simde__m128i_to_private(a);
1751
1752 r_.u16[0] = UINT16_MAX;
1753 for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
1754 if (a_.u16[i] < r_.u16[0]) {
1755 r_.u16[0] = a_.u16[i];
1756 r_.u16[1] = HEDLEY_STATIC_CAST(uint16_t, i);
1757 }
1758 }
1759
1760 return simde__m128i_from_private(r_);
1761 #endif
1762 }
1763 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1764 #undef _mm_minpos_epu16
1765 #define _mm_minpos_epu16(a) simde_mm_minpos_epu16(a)
1766 #endif
1767
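/* _mm_mpsadbw_epu8 computes eight 16-bit sums of absolute differences:
 * imm8[1:0] selects one aligned 4-byte block of b and imm8[2] selects the
 * starting offset (0 or 4) in a; result element i compares the four bytes of
 * a starting at that offset + i against the selected block of b. */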
1768 SIMDE_FUNCTION_ATTRIBUTES
1769 simde__m128i
1770 simde_mm_mpsadbw_epu8 (simde__m128i a, simde__m128i b, const int imm8)
1771 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
1772 simde__m128i_private
1773 r_,
1774 a_ = simde__m128i_to_private(a),
1775 b_ = simde__m128i_to_private(b);
1776
1777 const int a_offset = imm8 & 4;
1778 const int b_offset = (imm8 & 3) << 2;
1779
1780 #if defined(simde_math_abs)
1781 for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, (sizeof(r_.u16) / sizeof(r_.u16[0]))) ; i++) {
1782 r_.u16[i] =
1783 HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 0] - b_.u8[b_offset + 0]))) +
1784 HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 1] - b_.u8[b_offset + 1]))) +
1785 HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 2] - b_.u8[b_offset + 2]))) +
1786 HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 3] - b_.u8[b_offset + 3])));
1787 }
1788 #else
1789 HEDLEY_UNREACHABLE();
1790 #endif
1791
1792 return simde__m128i_from_private(r_);
1793 }
1794 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1795 # define simde_mm_mpsadbw_epu8(a, b, imm8) _mm_mpsadbw_epu8(a, b, imm8)
1796 #endif
1797 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1798 #undef _mm_mpsadbw_epu8
1799 #define _mm_mpsadbw_epu8(a, b, imm8) simde_mm_mpsadbw_epu8(a, b, imm8)
1800 #endif
1801
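/* _mm_mul_epi32 multiplies the signed low 32-bit half of each 64-bit element
 * (i.e. 32-bit elements 0 and 2 of a and b) and stores the full 64-bit
 * products. */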
1802 SIMDE_FUNCTION_ATTRIBUTES
1803 simde__m128i
1804 simde_mm_mul_epi32 (simde__m128i a, simde__m128i b) {
1805 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1806 return _mm_mul_epi32(a, b);
1807 #else
1808 simde__m128i_private
1809 r_,
1810 a_ = simde__m128i_to_private(a),
1811 b_ = simde__m128i_to_private(b);
1812
1813 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1814     /* vmull_s32 widens its 32-bit inputs to 64-bit products, so narrow each 64-bit lane to its low 32-bit half first. */
1815 int32x2_t a_lo = vmovn_s64(a_.neon_i64);
1816 int32x2_t b_lo = vmovn_s64(b_.neon_i64);
1817 r_.neon_i64 = vmull_s32(a_lo, b_lo);
1818 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1819 r_.wasm_v128 = wasm_i64x2_make(
1820 wasm_i32x4_extract_lane(a_.wasm_v128, 0) * HEDLEY_STATIC_CAST(int64_t, wasm_i32x4_extract_lane(b_.wasm_v128, 0)),
1821 wasm_i32x4_extract_lane(a_.wasm_v128, 2) * HEDLEY_STATIC_CAST(int64_t, wasm_i32x4_extract_lane(b_.wasm_v128, 2)));
1822 #else
1823 SIMDE_VECTORIZE
1824 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
1825 r_.i64[i] =
1826 HEDLEY_STATIC_CAST(int64_t, a_.i32[i * 2]) *
1827 HEDLEY_STATIC_CAST(int64_t, b_.i32[i * 2]);
1828 }
1829 #endif
1830
1831 return simde__m128i_from_private(r_);
1832 #endif
1833 }
1834 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1835 #undef _mm_mul_epi32
1836 #define _mm_mul_epi32(a, b) simde_mm_mul_epi32(a, b)
1837 #endif
1838
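/* _mm_mullo_epi32 multiplies 32-bit elements and keeps only the low 32 bits
 * of each product.  The portable loop below widens to 64 bits before
 * truncating so the multiplication cannot overflow a signed int, which would
 * be undefined behaviour. */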
1839 SIMDE_FUNCTION_ATTRIBUTES
1840 simde__m128i
1841 simde_mm_mullo_epi32 (simde__m128i a, simde__m128i b) {
1842 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1843 return _mm_mullo_epi32(a, b);
1844 #else
1845 simde__m128i_private
1846 r_,
1847 a_ = simde__m128i_to_private(a),
1848 b_ = simde__m128i_to_private(b);
1849
1850 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1851 r_.neon_i32 = vmulq_s32(a_.neon_i32, b_.neon_i32);
1852 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1855       r_.altivec_i32 = vec_mul(a_.altivec_i32, b_.altivec_i32);
1856 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1857 r_.wasm_v128 = wasm_i32x4_mul(a_.wasm_v128, b_.wasm_v128);
1858 #else
1859 SIMDE_VECTORIZE
1860 for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1861 r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (HEDLEY_STATIC_CAST(uint64_t, (HEDLEY_STATIC_CAST(int64_t, a_.i32[i]) * HEDLEY_STATIC_CAST(int64_t, b_.i32[i]))) & 0xffffffff));
1862 }
1863 #endif
1864
1865 return simde__m128i_from_private(r_);
1866 #endif
1867 }
1868 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1869 #undef _mm_mullo_epi32
1870 #define _mm_mullo_epi32(a, b) simde_mm_mullo_epi32(a, b)
1871 #endif
1872
1873 SIMDE_FUNCTION_ATTRIBUTES
1874 simde__m128i
1875 simde_x_mm_mullo_epu32 (simde__m128i a, simde__m128i b) {
1876 simde__m128i_private
1877 r_,
1878 a_ = simde__m128i_to_private(a),
1879 b_ = simde__m128i_to_private(b);
1880
1881 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1882 r_.neon_u32 = vmulq_u32(a_.neon_u32, b_.neon_u32);
1883 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1884 r_.u32 = a_.u32 * b_.u32;
1885 #else
1886 SIMDE_VECTORIZE
1887 for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
1888 r_.u32[i] = a_.u32[i] * b_.u32[i];
1889 }
1890 #endif
1891
1892 return simde__m128i_from_private(r_);
1893 }
1894
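/* _mm_packus_epi32 converts the eight signed 32-bit inputs (the four from a
 * followed by the four from b) to unsigned 16-bit values with saturation:
 * negative values become 0 and values above UINT16_MAX become UINT16_MAX. */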
1895 SIMDE_FUNCTION_ATTRIBUTES
1896 simde__m128i
1897 simde_mm_packus_epi32 (simde__m128i a, simde__m128i b) {
1898 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1899 return _mm_packus_epi32(a, b);
1900 #else
1901 simde__m128i_private
1902 r_,
1903 a_ = simde__m128i_to_private(a),
1904 b_ = simde__m128i_to_private(b);
1905
1906 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1907 const int32x4_t z = vdupq_n_s32(0);
1908 r_.neon_u16 = vcombine_u16(
1909 vqmovn_u32(vreinterpretq_u32_s32(vmaxq_s32(z, a_.neon_i32))),
1910 vqmovn_u32(vreinterpretq_u32_s32(vmaxq_s32(z, b_.neon_i32))));
1911 #else
1912 for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1913 r_.u16[i + 0] = (a_.i32[i] < 0) ? UINT16_C(0) : ((a_.i32[i] > UINT16_MAX) ? (UINT16_MAX) : HEDLEY_STATIC_CAST(uint16_t, a_.i32[i]));
1914 r_.u16[i + 4] = (b_.i32[i] < 0) ? UINT16_C(0) : ((b_.i32[i] > UINT16_MAX) ? (UINT16_MAX) : HEDLEY_STATIC_CAST(uint16_t, b_.i32[i]));
1915 }
1916 #endif
1917
1918 return simde__m128i_from_private(r_);
1919 #endif
1920 }
1921 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1922 #undef _mm_packus_epi32
1923 #define _mm_packus_epi32(a, b) simde_mm_packus_epi32(a, b)
1924 #endif
1925
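/* simde_mm_round_sd rounds the lower double of b according to `rounding` and
 * copies the upper double from a.  Each case below is compiled only when the
 * corresponding libm helper (simde_math_nearbyint, etc.) was detected, so a
 * mode whose helper is missing reaches the default branch. */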
1926 SIMDE_FUNCTION_ATTRIBUTES
1927 simde__m128d
1928 simde_mm_round_sd (simde__m128d a, simde__m128d b, int rounding)
1929 SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) {
1930 simde__m128d_private
1931 r_ = simde__m128d_to_private(a),
1932 b_ = simde__m128d_to_private(b);
1933
1934 switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
1935 #if defined(simde_math_nearbyint)
1936 case SIMDE_MM_FROUND_TO_NEAREST_INT:
1937 case SIMDE_MM_FROUND_CUR_DIRECTION:
1938 r_.f64[0] = simde_math_nearbyint(b_.f64[0]);
1939 break;
1940 #endif
1941
1942 #if defined(simde_math_floor)
1943 case SIMDE_MM_FROUND_TO_NEG_INF:
1944 r_.f64[0] = simde_math_floor(b_.f64[0]);
1945 break;
1946 #endif
1947
1948 #if defined(simde_math_ceil)
1949 case SIMDE_MM_FROUND_TO_POS_INF:
1950 r_.f64[0] = simde_math_ceil(b_.f64[0]);
1951 break;
1952 #endif
1953
1954 #if defined(simde_math_trunc)
1955 case SIMDE_MM_FROUND_TO_ZERO:
1956 r_.f64[0] = simde_math_trunc(b_.f64[0]);
1957 break;
1958 #endif
1959
1960 default:
1961 HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
1962 }
1963
1964 return simde__m128d_from_private(r_);
1965 }
1966 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1967 # define simde_mm_round_sd(a, b, rounding) _mm_round_sd(a, b, rounding)
1968 #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
1969 # define simde_mm_round_sd(a, b, rounding) simde_mm_move_sd(a, simde_mm_round_pd(b, rounding))
1970 #endif
1971 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1972 #undef _mm_round_sd
1973 #define _mm_round_sd(a, b, rounding) simde_mm_round_sd(a, b, rounding)
1974 #endif
1975
1976 SIMDE_FUNCTION_ATTRIBUTES
1977 simde__m128
1978 simde_mm_round_ss (simde__m128 a, simde__m128 b, int rounding)
1979 SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) {
1980 simde__m128_private
1981 r_ = simde__m128_to_private(a),
1982 b_ = simde__m128_to_private(b);
1983
1984 switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
1985 #if defined(simde_math_nearbyintf)
1986 case SIMDE_MM_FROUND_TO_NEAREST_INT:
1987 case SIMDE_MM_FROUND_CUR_DIRECTION:
1988 r_.f32[0] = simde_math_nearbyintf(b_.f32[0]);
1989 break;
1990 #endif
1991
1992 #if defined(simde_math_floorf)
1993 case SIMDE_MM_FROUND_TO_NEG_INF:
1994 r_.f32[0] = simde_math_floorf(b_.f32[0]);
1995 break;
1996 #endif
1997
1998 #if defined(simde_math_ceilf)
1999 case SIMDE_MM_FROUND_TO_POS_INF:
2000 r_.f32[0] = simde_math_ceilf(b_.f32[0]);
2001 break;
2002 #endif
2003
2004 #if defined(simde_math_truncf)
2005 case SIMDE_MM_FROUND_TO_ZERO:
2006 r_.f32[0] = simde_math_truncf(b_.f32[0]);
2007 break;
2008 #endif
2009
2010 default:
2011       HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_ps());
2012 }
2013
2014 return simde__m128_from_private(r_);
2015 }
2016 #if defined(SIMDE_X86_SSE4_1_NATIVE)
2017 # define simde_mm_round_ss(a, b, rounding) _mm_round_ss(a, b, rounding)
2018 #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
2019 # define simde_mm_round_ss(a, b, rounding) simde_mm_move_ss(a, simde_mm_round_ps(b, rounding))
2020 #endif
2021 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2022 #undef _mm_round_ss
2023 #define _mm_round_ss(a, b, rounding) simde_mm_round_ss(a, b, rounding)
2024 #endif
2025
2026 SIMDE_FUNCTION_ATTRIBUTES
2027 simde__m128i
2028 simde_mm_stream_load_si128 (const simde__m128i* mem_addr) {
2029 #if defined(SIMDE_X86_SSE4_1_NATIVE)
2030 return _mm_stream_load_si128(HEDLEY_CONST_CAST(simde__m128i*, mem_addr));
2031 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2032 return vreinterpretq_s64_s32(vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr)));
2033 #else
2034 return *mem_addr;
2035 #endif
2036 }
2037 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2038 #undef _mm_stream_load_si128
2039 #define _mm_stream_load_si128(mem_addr) simde_mm_stream_load_si128(mem_addr)
2040 #endif
2041
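/* _mm_test_all_ones returns 1 only when every bit of a is set, and 0
 * otherwise. */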
2042 SIMDE_FUNCTION_ATTRIBUTES
2043 int
2044 simde_mm_test_all_ones (simde__m128i a) {
2045 #if defined(SIMDE_X86_SSE4_1_NATIVE)
2046 return _mm_test_all_ones(a);
2047 #else
2048 simde__m128i_private a_ = simde__m128i_to_private(a);
2049 int r;
2050
2051 #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2052 r = vec_all_eq(a_.altivec_i32, vec_splats(~0));
2053 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2054     r = ((vgetq_lane_s64(a_.neon_i64, 0) & vgetq_lane_s64(a_.neon_i64, 1)) == ~HEDLEY_STATIC_CAST(int64_t, 0));
2055 #else
2056 int_fast32_t r_ = ~HEDLEY_STATIC_CAST(int_fast32_t, 0);
2057
2058 SIMDE_VECTORIZE_REDUCTION(&:r_)
2059 for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) {
2060 r_ &= a_.i32f[i];
2061 }
2062
2063 r = (r_ == ~HEDLEY_STATIC_CAST(int_fast32_t, 0));
2064 #endif
2065
2066 return r;
2067 #endif
2068 }
2069 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2070 #undef _mm_test_all_ones
2071 #define _mm_test_all_ones(a) simde_mm_test_all_ones(a)
2072 #endif
2073
2074 SIMDE_FUNCTION_ATTRIBUTES
2075 int
2076 simde_mm_test_all_zeros (simde__m128i a, simde__m128i mask) {
2077 #if defined(SIMDE_X86_SSE4_1_NATIVE)
2078 return _mm_test_all_zeros(a, mask);
2079 #else
2080 simde__m128i_private tmp_ = simde__m128i_to_private(simde_mm_and_si128(a, mask));
2081 int r;
2082
2083 #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2084 r = vec_all_eq(tmp_.altivec_i32, vec_splats(0));
2085 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2086 return !(vgetq_lane_s64(tmp_.neon_i64, 0) | vgetq_lane_s64(tmp_.neon_i64, 1));
2087 #else
2088 int_fast32_t r_ = HEDLEY_STATIC_CAST(int_fast32_t, 0);
2089
2090 SIMDE_VECTORIZE_REDUCTION(|:r_)
2091 for (size_t i = 0 ; i < (sizeof(tmp_.i32f) / sizeof(tmp_.i32f[0])) ; i++) {
2092 r_ |= tmp_.i32f[i];
2093 }
2094
2095 r = !r_;
2096 #endif
2097
2098 return r;
2099 #endif
2100 }
2101 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2102 #undef _mm_test_all_zeros
2103 #define _mm_test_all_zeros(a, mask) simde_mm_test_all_zeros(a, mask)
2104 #endif
2105
2106 SIMDE_FUNCTION_ATTRIBUTES
2107 int
2108 simde_mm_test_mix_ones_zeros (simde__m128i a, simde__m128i mask) {
2109 #if defined(SIMDE_X86_SSE4_1_NATIVE)
2110 return _mm_test_mix_ones_zeros(a, mask);
2111 #else
2112 simde__m128i_private
2113 a_ = simde__m128i_to_private(a),
2114 mask_ = simde__m128i_to_private(mask);
2115
2116 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2117 int64x2_t s640 = vandq_s64(a_.neon_i64, mask_.neon_i64);
2118 int64x2_t s641 = vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(a_.neon_i64))), mask_.neon_i64);
2119 return (((vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) & (vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1)))!=0);
2120 #else
2121     uint64_t ones = 0, zeros = 0;
2122     for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
2123       ones |= a_.u64[i] & mask_.u64[i]; zeros |= ~a_.u64[i] & mask_.u64[i];
2124     }
2125     return ((ones != 0) && (zeros != 0)) ? 1 : 0;
2126 #endif
2127 #endif
2128 }
2129 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2130 #undef _mm_test_mix_ones_zeros
2131 #define _mm_test_mix_ones_zeros(a, mask) simde_mm_test_mix_ones_zeros(a, mask)
2132 #endif
2133
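/* _mm_testc_si128 returns the CF result of PTEST: 1 when (~a & b) == 0, i.e.
 * every bit set in b is also set in a, and 0 otherwise. */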
2134 SIMDE_FUNCTION_ATTRIBUTES
2135 int
2136 simde_mm_testc_si128 (simde__m128i a, simde__m128i b) {
2137 #if defined(SIMDE_X86_SSE4_1_NATIVE)
2138 return _mm_testc_si128(a, b);
2139 #else
2140 simde__m128i_private
2141 a_ = simde__m128i_to_private(a),
2142 b_ = simde__m128i_to_private(b);
2143
2144 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2145 int64x2_t s64 = vandq_s64(~a_.neon_i64, b_.neon_i64);
2146 return !(vgetq_lane_s64(s64, 0) & vgetq_lane_s64(s64, 1));
2147 #else
2148 int_fast32_t r = 0;
2149
2150 SIMDE_VECTORIZE_REDUCTION(|:r)
2151 for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) {
2152 r |= ~a_.i32f[i] & b_.i32f[i];
2153 }
2154
2155 return HEDLEY_STATIC_CAST(int, !r);
2156 #endif
2157 #endif
2158 }
2159 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2160 #undef _mm_testc_si128
2161 #define _mm_testc_si128(a, b) simde_mm_testc_si128(a, b)
2162 #endif
2163
2164 SIMDE_FUNCTION_ATTRIBUTES
2165 int
2166 simde_mm_testnzc_si128 (simde__m128i a, simde__m128i b) {
2167 #if defined(SIMDE_X86_SSE4_1_NATIVE)
2168 return _mm_testnzc_si128(a, b);
2169 #else
2170 simde__m128i_private
2171 a_ = simde__m128i_to_private(a),
2172 b_ = simde__m128i_to_private(b);
2173
2174 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2175 int64x2_t s640 = vandq_s64(a_.neon_i64, b_.neon_i64);
2176 int64x2_t s641 = vandq_s64(~a_.neon_i64, b_.neon_i64);
2177 return (((vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) & (vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1)))!=0);
2178 #else
2179     uint64_t and_bits = 0, andnot_bits = 0;
2180     for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
2181       and_bits |= a_.u64[i] & b_.u64[i];
2182       andnot_bits |= ~a_.u64[i] & b_.u64[i];
2183     }
2184     return ((and_bits != 0) && (andnot_bits != 0)) ? 1 : 0;
2185 #endif
2186 #endif
2187 }
2188 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2189 #undef _mm_testnzc_si128
2190 #define _mm_testnzc_si128(a, b) simde_mm_testnzc_si128(a, b)
2191 #endif
2192
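/* _mm_testz_si128 returns the ZF result of PTEST: 1 when (a & b) == 0 across
 * all 128 bits, and 0 otherwise. */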
2193 SIMDE_FUNCTION_ATTRIBUTES
2194 int
2195 simde_mm_testz_si128 (simde__m128i a, simde__m128i b) {
2196 #if defined(SIMDE_X86_SSE4_1_NATIVE)
2197 return _mm_testz_si128(a, b);
2198 #else
2199 simde__m128i_private
2200 a_ = simde__m128i_to_private(a),
2201 b_ = simde__m128i_to_private(b);
2202
2203 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2204 int64x2_t s64 = vandq_s64(a_.neon_i64, b_.neon_i64);
2205 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
2206 #else
2207     for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
2208       if ((a_.u64[i] & b_.u64[i]) != 0)
2209         return 0;
2210 }
2211 #endif
2212
2213     return 1;
2214 #endif
2215 }
2216 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2217 #undef _mm_testz_si128
2218 #define _mm_testz_si128(a, b) simde_mm_testz_si128(a, b)
2219 #endif
2220
2221 SIMDE_END_DECLS_
2222
2223 HEDLEY_DIAGNOSTIC_POP
2224
2225 #endif /* !defined(SIMDE_X86_SSE4_1_H) */
2226