1 /* SPDX-License-Identifier: MIT
2 *
3 * Permission is hereby granted, free of charge, to any person
4 * obtaining a copy of this software and associated documentation
5 * files (the "Software"), to deal in the Software without
6 * restriction, including without limitation the rights to use, copy,
7 * modify, merge, publish, distribute, sublicense, and/or sell copies
8 * of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be
12 * included in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Copyright:
24 * 2017-2020 Evan Nemerson <evan@nemerson.com>
25 */
26
27 #include "sse.h"
28 #if !defined(SIMDE_X86_SSE4_1_H)
29 #define SIMDE_X86_SSE4_1_H
30
31 #include "ssse3.h"
32
33 HEDLEY_DIAGNOSTIC_PUSH
34 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
35 SIMDE_BEGIN_DECLS_
36
37 #if !defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES)
38 # define SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES
39 #endif
40
41 SIMDE_FUNCTION_ATTRIBUTES
42 simde__m128i
43 simde_mm_blend_epi16 (simde__m128i a, simde__m128i b, const int imm8)
44 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
45 simde__m128i_private
46 r_,
47 a_ = simde__m128i_to_private(a),
48 b_ = simde__m128i_to_private(b);
49
50 SIMDE_VECTORIZE
51 for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
52 r_.u16[i] = ((imm8 >> i) & 1) ? b_.u16[i] : a_.u16[i];
53 }
54
55 return simde__m128i_from_private(r_);
56 }
57 #if defined(SIMDE_X86_SSE4_1_NATIVE)
58 # define simde_mm_blend_epi16(a, b, imm8) _mm_blend_epi16(a, b, imm8)
59 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
60 # define simde_mm_blend_epi16(a, b, imm8) \
61 (__extension__ ({ \
62 const uint16_t _mask[8] = { \
63 ((imm8) & (1 << 0)) ? 0xFFFF : 0x0000, \
64 ((imm8) & (1 << 1)) ? 0xFFFF : 0x0000, \
65 ((imm8) & (1 << 2)) ? 0xFFFF : 0x0000, \
66 ((imm8) & (1 << 3)) ? 0xFFFF : 0x0000, \
67 ((imm8) & (1 << 4)) ? 0xFFFF : 0x0000, \
68 ((imm8) & (1 << 5)) ? 0xFFFF : 0x0000, \
69 ((imm8) & (1 << 6)) ? 0xFFFF : 0x0000, \
70 ((imm8) & (1 << 7)) ? 0xFFFF : 0x0000 \
71 }; \
72 uint16x8_t _mask_vec = vld1q_u16(_mask); \
73 simde__m128i_from_neon_u16(vbslq_u16(_mask_vec, simde__m128i_to_neon_u16(b), simde__m128i_to_neon_u16(a))); \
74 }))
75 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
76 # define simde_mm_blend_epi16(a, b, imm8) \
77 (__extension__ ({ \
78 const SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) _mask = { \
79 ((imm8) & (1 << 0)) ? 0xFFFF : 0x0000, \
80 ((imm8) & (1 << 1)) ? 0xFFFF : 0x0000, \
81 ((imm8) & (1 << 2)) ? 0xFFFF : 0x0000, \
82 ((imm8) & (1 << 3)) ? 0xFFFF : 0x0000, \
83 ((imm8) & (1 << 4)) ? 0xFFFF : 0x0000, \
84 ((imm8) & (1 << 5)) ? 0xFFFF : 0x0000, \
85 ((imm8) & (1 << 6)) ? 0xFFFF : 0x0000, \
86 ((imm8) & (1 << 7)) ? 0xFFFF : 0x0000 \
87 }; \
88 simde__m128i_from_altivec_u16(vec_sel(simde__m128i_to_altivec_u16(a), simde__m128i_to_altivec_u16(b), _mask)); \
89 }))
90 #endif
91 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
92 #undef _mm_blend_epi16
93 #define _mm_blend_epi16(a, b, imm8) simde_mm_blend_epi16(a, b, imm8)
94 #endif
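
/* Illustrative usage sketch (not part of the upstream header): bit i of imm8
 * selects 16-bit lane i from b when set and from a when clear, e.g.
 *
 *   simde__m128i a = simde_mm_set1_epi16(1);
 *   simde__m128i b = simde_mm_set1_epi16(2);
 *   simde__m128i r = simde_mm_blend_epi16(a, b, 0x0F);  // lanes 0-3 from b, lanes 4-7 from a
 */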
95
96 SIMDE_FUNCTION_ATTRIBUTES
97 simde__m128d
98 simde_mm_blend_pd (simde__m128d a, simde__m128d b, const int imm8)
99 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) {
100 simde__m128d_private
101 r_,
102 a_ = simde__m128d_to_private(a),
103 b_ = simde__m128d_to_private(b);
104
105 SIMDE_VECTORIZE
106 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
107 r_.f64[i] = ((imm8 >> i) & 1) ? b_.f64[i] : a_.f64[i];
108 }
109 return simde__m128d_from_private(r_);
110 }
111 #if defined(SIMDE_X86_SSE4_1_NATIVE)
112 # define simde_mm_blend_pd(a, b, imm8) _mm_blend_pd(a, b, imm8)
113 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
114 # define simde_mm_blend_pd(a, b, imm8) \
115 (__extension__ ({ \
116 const uint64_t _mask[2] = { \
117 ((imm8) & (1 << 0)) ? UINT64_MAX : 0, \
118 ((imm8) & (1 << 1)) ? UINT64_MAX : 0 \
119 }; \
120 uint64x2_t _mask_vec = vld1q_u64(_mask); \
121 simde__m128d_from_neon_u64(vbslq_u64(_mask_vec, simde__m128d_to_neon_u64(b), simde__m128d_to_neon_u64(a))); \
122 }))
123 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
124 # define simde_mm_blend_pd(a, b, imm8) \
125 (__extension__ ({ \
126 const SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) _mask = { \
127 ((imm8) & (1 << 0)) ? UINT64_MAX : 0, \
128 ((imm8) & (1 << 1)) ? UINT64_MAX : 0 \
129 }; \
130 simde__m128d_from_altivec_f64(vec_sel(simde__m128d_to_altivec_f64(a), simde__m128d_to_altivec_f64(b), _mask)); \
131 }))
132 #endif
133 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
134 #undef _mm_blend_pd
135 #define _mm_blend_pd(a, b, imm8) simde_mm_blend_pd(a, b, imm8)
136 #endif
137
138 SIMDE_FUNCTION_ATTRIBUTES
139 simde__m128
140 simde_mm_blend_ps (simde__m128 a, simde__m128 b, const int imm8)
141 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) {
142 simde__m128_private
143 r_,
144 a_ = simde__m128_to_private(a),
145 b_ = simde__m128_to_private(b);
146
147 SIMDE_VECTORIZE
148 for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
149 r_.f32[i] = ((imm8 >> i) & 1) ? b_.f32[i] : a_.f32[i];
150 }
151 return simde__m128_from_private(r_);
152 }
153 #if defined(SIMDE_X86_SSE4_1_NATIVE)
154 # define simde_mm_blend_ps(a, b, imm8) _mm_blend_ps(a, b, imm8)
155 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
156 # define simde_mm_blend_ps(a, b, imm8) \
157 (__extension__ ({ \
158 const uint32_t _mask[4] = { \
159 ((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
160 ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
161 ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
162 ((imm8) & (1 << 3)) ? UINT32_MAX : 0 \
163 }; \
164 uint32x4_t _mask_vec = vld1q_u32(_mask); \
165 simde__m128_from_neon_f32(vbslq_f32(_mask_vec, simde__m128_to_neon_f32(b), simde__m128_to_neon_f32(a))); \
166 }))
167 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
168 # define simde_mm_blend_ps(a, b, imm8) \
169 (__extension__ ({ \
170 const SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) _mask = { \
171 ((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
172 ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
173 ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
174 ((imm8) & (1 << 3)) ? UINT32_MAX : 0 \
175 }; \
176 simde__m128_from_altivec_f32(vec_sel(simde__m128_to_altivec_f32(a), simde__m128_to_altivec_f32(b), _mask)); \
177 }))
178 #endif
179 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
180 #undef _mm_blend_ps
181 #define _mm_blend_ps(a, b, imm8) simde_mm_blend_ps(a, b, imm8)
182 #endif
183
184 SIMDE_FUNCTION_ATTRIBUTES
185 simde__m128i
186 simde_mm_blendv_epi8 (simde__m128i a, simde__m128i b, simde__m128i mask) {
187 #if defined(SIMDE_X86_SSE4_1_NATIVE)
188 return _mm_blendv_epi8(a, b, mask);
189 #else
190 simde__m128i_private
191 r_,
192 a_ = simde__m128i_to_private(a),
193 b_ = simde__m128i_to_private(b),
194 mask_ = simde__m128i_to_private(mask);
195
196 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
197 /* Use a signed shift right to create a mask with the sign bit */
198 mask_.neon_i8 = vshrq_n_s8(mask_.neon_i8, 7);
199 r_.neon_i8 = vbslq_s8(mask_.neon_u8, b_.neon_i8, a_.neon_i8);
200 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
201 v128_t m = wasm_i8x16_shr(mask_.wasm_v128, 7);
202 r_.wasm_v128 = wasm_v128_or(wasm_v128_and(b_.wasm_v128, m), wasm_v128_andnot(a_.wasm_v128, m));
203 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
204 r_.altivec_i8 = vec_sel(a_.altivec_i8, b_.altivec_i8, vec_cmplt(mask_.altivec_i8, vec_splat_s8(0)));
205 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
206 /* https://software.intel.com/en-us/forums/intel-c-compiler/topic/850087 */
207 #if defined(HEDLEY_INTEL_VERSION_CHECK)
208 __typeof__(mask_.i8) z = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
209 mask_.i8 = HEDLEY_STATIC_CAST(__typeof__(mask_.i8), mask_.i8 < z);
210 #else
211 mask_.i8 >>= (CHAR_BIT * sizeof(mask_.i8[0])) - 1;
212 #endif
213
214 r_.i8 = (mask_.i8 & b_.i8) | (~mask_.i8 & a_.i8);
215 #else
216 SIMDE_VECTORIZE
217 for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
218 int8_t m = mask_.i8[i] >> 7;
219 r_.i8[i] = (m & b_.i8[i]) | (~m & a_.i8[i]);
220 }
221 #endif
222
223 return simde__m128i_from_private(r_);
224 #endif
225 }
226 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
227 #undef _mm_blendv_epi8
228 #define _mm_blendv_epi8(a, b, mask) simde_mm_blendv_epi8(a, b, mask)
229 #endif
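
/* Illustrative usage sketch (not part of the upstream header): only the top
 * bit of each byte in `mask` is consulted, so a compare result can be used
 * directly, e.g. as a branch-free per-byte maximum:
 *
 *   simde__m128i gt  = simde_mm_cmpgt_epi8(b, a);
 *   simde__m128i max = simde_mm_blendv_epi8(a, b, gt);
 */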
230
231 SIMDE_FUNCTION_ATTRIBUTES
232 simde__m128i
233 simde_x_mm_blendv_epi16 (simde__m128i a, simde__m128i b, simde__m128i mask) {
234 #if defined(SIMDE_X86_SSE2_NATIVE)
235 mask = simde_mm_srai_epi16(mask, 15);
236 return simde_mm_or_si128(simde_mm_and_si128(mask, b), simde_mm_andnot_si128(mask, a));
237 #else
238 simde__m128i_private
239 r_,
240 a_ = simde__m128i_to_private(a),
241 b_ = simde__m128i_to_private(b),
242 mask_ = simde__m128i_to_private(mask);
243
244 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
245 mask_ = simde__m128i_to_private(simde_mm_cmplt_epi16(mask, simde_mm_setzero_si128()));
246 r_.neon_i16 = vbslq_s16(mask_.neon_u16, b_.neon_i16, a_.neon_i16);
247 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
248 r_.altivec_i16 = vec_sel(a_.altivec_i16, b_.altivec_i16, vec_cmplt(mask_.altivec_i16, vec_splat_s16(0)));
249 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
250 #if defined(HEDLEY_INTEL_VERSION_CHECK)
251 __typeof__(mask_.i16) z = { 0, 0, 0, 0, 0, 0, 0, 0 };
252 mask_.i16 = mask_.i16 < z;
253 #else
254 mask_.i16 >>= (CHAR_BIT * sizeof(mask_.i16[0])) - 1;
255 #endif
256
257 r_.i16 = (mask_.i16 & b_.i16) | (~mask_.i16 & a_.i16);
258 #else
259 SIMDE_VECTORIZE
260 for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
261 int16_t m = mask_.i16[i] >> 15;
262 r_.i16[i] = (m & b_.i16[i]) | (~m & a_.i16[i]);
263 }
264 #endif
265
266 return simde__m128i_from_private(r_);
267 #endif
268 }
269
270 SIMDE_FUNCTION_ATTRIBUTES
271 simde__m128i
272 simde_x_mm_blendv_epi32 (simde__m128i a, simde__m128i b, simde__m128i mask) {
273 #if defined(SIMDE_X86_SSE4_1_NATIVE)
274 return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(mask)));
275 #else
276 simde__m128i_private
277 r_,
278 a_ = simde__m128i_to_private(a),
279 b_ = simde__m128i_to_private(b),
280 mask_ = simde__m128i_to_private(mask);
281
282 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
283 mask_ = simde__m128i_to_private(simde_mm_cmplt_epi32(mask, simde_mm_setzero_si128()));
284 r_.neon_i32 = vbslq_s32(mask_.neon_u32, b_.neon_i32, a_.neon_i32);
285 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
286 v128_t m = wasm_i32x4_shr(mask_.wasm_v128, 31);
287 r_.wasm_v128 = wasm_v128_or(wasm_v128_and(b_.wasm_v128, m), wasm_v128_andnot(a_.wasm_v128, m));
288 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
289 r_.altivec_i32 = vec_sel(a_.altivec_i32, b_.altivec_i32, vec_cmplt(mask_.altivec_i32, vec_splat_s32(0)));
290 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
291 #if defined(HEDLEY_INTEL_VERSION_CHECK)
292 __typeof__(mask_.i32) z = { 0, 0, 0, 0 };
293 mask_.i32 = HEDLEY_STATIC_CAST(__typeof__(mask_.i32), mask_.i32 < z);
294 #else
295 mask_.i32 >>= (CHAR_BIT * sizeof(mask_.i32[0])) - 1;
296 #endif
297
298 r_.i32 = (mask_.i32 & b_.i32) | (~mask_.i32 & a_.i32);
299 #else
300 SIMDE_VECTORIZE
301 for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
302 int32_t m = mask_.i32[i] >> 31;
303 r_.i32[i] = (m & b_.i32[i]) | (~m & a_.i32[i]);
304 }
305 #endif
306
307 return simde__m128i_from_private(r_);
308 #endif
309 }
310
311 SIMDE_FUNCTION_ATTRIBUTES
312 simde__m128i
313 simde_x_mm_blendv_epi64 (simde__m128i a, simde__m128i b, simde__m128i mask) {
314 #if defined(SIMDE_X86_SSE4_1_NATIVE)
315 return _mm_castpd_si128(_mm_blendv_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b), _mm_castsi128_pd(mask)));
316 #else
317 simde__m128i_private
318 r_,
319 a_ = simde__m128i_to_private(a),
320 b_ = simde__m128i_to_private(b),
321 mask_ = simde__m128i_to_private(mask);
322
323 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
324 mask_.u64 = vcltq_s64(mask_.i64, vdupq_n_s64(UINT64_C(0)));
325 r_.neon_i64 = vbslq_s64(mask_.neon_u64, b_.neon_i64, a_.neon_i64);
326 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
327 v128_t m = wasm_i64x2_shr(mask_.wasm_v128, 63);
328 r_.wasm_v128 = wasm_v128_or(wasm_v128_and(b_.wasm_v128, m), wasm_v128_andnot(a_.wasm_v128, m));
329 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
330 /* Using int due to clang bug #46770 */
331 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) selector = vec_sra(mask_.altivec_i64, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 63)));
332 r_.altivec_i32 = vec_sel(a_.altivec_i32, b_.altivec_i32, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), selector));
333 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
334 #if defined(HEDLEY_INTEL_VERSION_CHECK)
335 __typeof__(mask_.i64) z = { 0, 0 };
336 mask_.i64 = HEDLEY_STATIC_CAST(__typeof__(mask_.i64), mask_.i64 < z);
337 #else
338 mask_.i64 >>= (CHAR_BIT * sizeof(mask_.i64[0])) - 1;
339 #endif
340
341 r_.i64 = (mask_.i64 & b_.i64) | (~mask_.i64 & a_.i64);
342 #else
343 SIMDE_VECTORIZE
344 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
345 int64_t m = mask_.i64[i] >> 63;
346 r_.i64[i] = (m & b_.i64[i]) | (~m & a_.i64[i]);
347 }
348 #endif
349
350 return simde__m128i_from_private(r_);
351 #endif
352 }
353
354 SIMDE_FUNCTION_ATTRIBUTES
355 simde__m128d
356 simde_mm_blendv_pd (simde__m128d a, simde__m128d b, simde__m128d mask) {
357 #if defined(SIMDE_X86_SSE4_1_NATIVE)
358 return _mm_blendv_pd(a, b, mask);
359 #else
360 return simde_mm_castsi128_pd(simde_x_mm_blendv_epi64(simde_mm_castpd_si128(a), simde_mm_castpd_si128(b), simde_mm_castpd_si128(mask)));
361 #endif
362 }
363 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
364 #undef _mm_blendv_pd
365 #define _mm_blendv_pd(a, b, mask) simde_mm_blendv_pd(a, b, mask)
366 #endif
367
368 SIMDE_FUNCTION_ATTRIBUTES
369 simde__m128
370 simde_mm_blendv_ps (simde__m128 a, simde__m128 b, simde__m128 mask) {
371 #if defined(SIMDE_X86_SSE4_1_NATIVE)
372 return _mm_blendv_ps(a, b, mask);
373 #else
374 return simde_mm_castsi128_ps(simde_x_mm_blendv_epi32(simde_mm_castps_si128(a), simde_mm_castps_si128(b), simde_mm_castps_si128(mask)));
375 #endif
376 }
377 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
378 #undef _mm_blendv_ps
379 #define _mm_blendv_ps(a, b, mask) simde_mm_blendv_ps(a, b, mask)
380 #endif
381
382 SIMDE_FUNCTION_ATTRIBUTES
383 simde__m128d
384 simde_mm_round_pd (simde__m128d a, int rounding)
385 SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) {
386 simde__m128d_private
387 r_,
388 a_ = simde__m128d_to_private(a);
389
390 /* For architectures which lack a current direction SIMD instruction. */
391 #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
392 if ((rounding & 7) == SIMDE_MM_FROUND_CUR_DIRECTION)
393 rounding = HEDLEY_STATIC_CAST(int, SIMDE_MM_GET_ROUNDING_MODE()) << 13;
394 #endif
395
396 switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
397 case SIMDE_MM_FROUND_CUR_DIRECTION:
398 #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
399 r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.altivec_f64));
400 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
401 r_.neon_f64 = vrndiq_f64(a_.neon_f64);
402 #elif defined(simde_math_nearbyint)
403 SIMDE_VECTORIZE
404 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
405 r_.f64[i] = simde_math_nearbyint(a_.f64[i]);
406 }
407 #else
408 HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
409 #endif
410 break;
411
412 case SIMDE_MM_FROUND_TO_NEAREST_INT:
413 #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
414 r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.altivec_f64));
415 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
416 r_.neon_f64 = vrndaq_f64(a_.neon_f64);
417 #elif defined(simde_math_roundeven)
418 SIMDE_VECTORIZE
419 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
420 r_.f64[i] = simde_math_roundeven(a_.f64[i]);
421 }
422 #else
423 HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
424 #endif
425 break;
426
427 case SIMDE_MM_FROUND_TO_NEG_INF:
428 #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
429 r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_floor(a_.altivec_f64));
430 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
431 r_.neon_f64 = vrndmq_f64(a_.neon_f64);
432 #else
433 SIMDE_VECTORIZE
434 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
435 r_.f64[i] = simde_math_floor(a_.f64[i]);
436 }
437 #endif
438 break;
439
440 case SIMDE_MM_FROUND_TO_POS_INF:
441 #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
442 r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_ceil(a_.altivec_f64));
443 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
444 r_.neon_f64 = vrndpq_f64(a_.neon_f64);
445 #elif defined(simde_math_ceil)
446 SIMDE_VECTORIZE
447 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
448 r_.f64[i] = simde_math_ceil(a_.f64[i]);
449 }
450 #else
451 HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
452 #endif
453 break;
454
455 case SIMDE_MM_FROUND_TO_ZERO:
456 #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
457 r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_trunc(a_.altivec_f64));
458 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
459 r_.neon_f64 = vrndq_f64(a_.neon_f64);
460 #else
461 SIMDE_VECTORIZE
462 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
463 r_.f64[i] = simde_math_trunc(a_.f64[i]);
464 }
465 #endif
466 break;
467
468 default:
469 HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
470 }
471
472 return simde__m128d_from_private(r_);
473 }
474 #if defined(SIMDE_X86_SSE4_1_NATIVE)
475 #define simde_mm_round_pd(a, rounding) _mm_round_pd(a, rounding)
476 #endif
477 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
478 #undef _mm_round_pd
479 #define _mm_round_pd(a, rounding) simde_mm_round_pd(a, rounding)
480 #endif
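
/* Illustrative usage sketch (not part of the upstream header): `rounding`
 * must be a compile-time constant, e.g.
 *
 *   simde__m128d v    = simde_mm_set_pd(1.5, 2.5);
 *   simde__m128d down = simde_mm_round_pd(v, SIMDE_MM_FROUND_TO_NEG_INF);     // 1.5 -> 1.0, 2.5 -> 2.0
 *   simde__m128d near = simde_mm_round_pd(v, SIMDE_MM_FROUND_TO_NEAREST_INT); // both -> 2.0 (ties to even)
 */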
481
482 SIMDE_FUNCTION_ATTRIBUTES
483 simde__m128d
484 simde_mm_ceil_pd (simde__m128d a) {
485 return simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_POS_INF);
486 }
487 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
488 #undef _mm_ceil_pd
489 #define _mm_ceil_pd(a) simde_mm_ceil_pd(a)
490 #endif
491
492 SIMDE_FUNCTION_ATTRIBUTES
493 simde__m128
494 simde_mm_ceil_ps (simde__m128 a) {
495 return simde_mm_round_ps(a, SIMDE_MM_FROUND_TO_POS_INF);
496 }
497 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
498 #undef _mm_ceil_ps
499 #define _mm_ceil_ps(a) simde_mm_ceil_ps(a)
500 #endif
501
502 SIMDE_FUNCTION_ATTRIBUTES
503 simde__m128d
504 simde_mm_ceil_sd (simde__m128d a, simde__m128d b) {
505 #if defined(SIMDE_X86_SSE4_1_NATIVE)
506 return _mm_ceil_sd(a, b);
507 #else
508 simde__m128d_private
509 r_,
510 a_ = simde__m128d_to_private(a),
511 b_ = simde__m128d_to_private(b);
512
513 #if defined(simde_math_ceil)
514 r_ = simde__m128d_to_private(simde_mm_set_pd(a_.f64[1], simde_math_ceil(b_.f64[0])));
515 #else
516 HEDLEY_UNREACHABLE();
517 #endif
518
519 return simde__m128d_from_private(r_);
520 #endif
521 }
522 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
523 #undef _mm_ceil_sd
524 #define _mm_ceil_sd(a, b) simde_mm_ceil_sd(a, b)
525 #endif
526
527 SIMDE_FUNCTION_ATTRIBUTES
528 simde__m128
529 simde_mm_ceil_ss (simde__m128 a, simde__m128 b) {
530 #if defined(SIMDE_X86_SSE4_1_NATIVE)
531 return _mm_ceil_ss(a, b);
532 #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
533 return simde_mm_move_ss(a, simde_mm_ceil_ps(b));
534 #else
535 simde__m128_private
536 r_,
537 a_ = simde__m128_to_private(a),
538 b_ = simde__m128_to_private(b);
539
540 #if defined(simde_math_ceilf)
541 r_ = simde__m128_to_private(simde_mm_set_ps(a_.f32[3], a_.f32[2], a_.f32[1], simde_math_ceilf(b_.f32[0])));
542 #else
543 HEDLEY_UNREACHABLE();
544 #endif
545
546 return simde__m128_from_private(r_);
547 #endif
548 }
549 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
550 #undef _mm_ceil_ss
551 #define _mm_ceil_ss(a, b) simde_mm_ceil_ss(a, b)
552 #endif
553
554 SIMDE_FUNCTION_ATTRIBUTES
555 simde__m128i
556 simde_mm_cmpeq_epi64 (simde__m128i a, simde__m128i b) {
557 #if defined(SIMDE_X86_SSE4_1_NATIVE)
558 return _mm_cmpeq_epi64(a, b);
559 #else
560 simde__m128i_private
561 r_,
562 a_ = simde__m128i_to_private(a),
563 b_ = simde__m128i_to_private(b);
564
565 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
566 r_.neon_u64 = vceqq_u64(a_.neon_u64, b_.neon_u64);
567 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
568 /* (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) */
569 uint32x4_t cmp = vceqq_u32(a_.neon_u32, b_.neon_u32);
570 uint32x4_t swapped = vrev64q_u32(cmp);
571 r_.neon_u32 = vandq_u32(cmp, swapped);
572 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
573 r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), a_.i64 == b_.i64);
574 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
575 r_.altivec_i64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed long long), vec_cmpeq(a_.altivec_i64, b_.altivec_i64));
576 #else
577 SIMDE_VECTORIZE
578 for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
579 r_.u64[i] = (a_.u64[i] == b_.u64[i]) ? ~UINT64_C(0) : UINT64_C(0);
580 }
581 #endif
582
583 return simde__m128i_from_private(r_);
584 #endif
585 }
586 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
587 #undef _mm_cmpeq_epi64
588 #define _mm_cmpeq_epi64(a, b) simde_mm_cmpeq_epi64(a, b)
589 #endif
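
/* Illustrative usage sketch (not part of the upstream header): the result is
 * an all-ones/all-zeros mask per 64-bit lane, so it can drive the blendv
 * family via a cast, e.g.
 *
 *   simde__m128i eq  = simde_mm_cmpeq_epi64(x, y);
 *   simde__m128d sel = simde_mm_blendv_pd(p, q, simde_mm_castsi128_pd(eq));
 */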
590
591 SIMDE_FUNCTION_ATTRIBUTES
592 simde__m128i
593 simde_mm_cvtepi8_epi16 (simde__m128i a) {
594 #if defined(SIMDE_X86_SSE4_1_NATIVE)
595 return _mm_cvtepi8_epi16(a);
596 #elif defined(SIMDE_X86_SSE2_NATIVE)
597 return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
598 #else
599 simde__m128i_private
600 r_,
601 a_ = simde__m128i_to_private(a);
602
603 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
604 int8x16_t s8x16 = a_.neon_i8; /* xxxx xxxx xxxx DCBA */
605 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
606 r_.neon_i16 = s16x8;
607 #elif defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
608 r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, a_.i8,
609 -1, 0, -1, 1, -1, 2, -1, 3,
610 -1, 4, -1, 5, -1, 6, -1, 7));
611 r_.i16 >>= 8;
612 #elif defined(SIMDE_CONVERT_VECTOR_)
613 SIMDE_CONVERT_VECTOR_(r_.i16, a_.m64_private[0].i8);
614 #else
615 SIMDE_VECTORIZE
616 for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
617 r_.i16[i] = a_.i8[i];
618 }
619 #endif
620
621 return simde__m128i_from_private(r_);
622 #endif
623 }
624 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
625 #undef _mm_cvtepi8_epi16
626 #define _mm_cvtepi8_epi16(a) simde_mm_cvtepi8_epi16(a)
627 #endif
628
629 SIMDE_FUNCTION_ATTRIBUTES
630 simde__m128i
631 simde_mm_cvtepi8_epi32 (simde__m128i a) {
632 #if defined(SIMDE_X86_SSE4_1_NATIVE)
633 return _mm_cvtepi8_epi32(a);
634 #elif defined(SIMDE_X86_SSE2_NATIVE)
635 __m128i tmp = _mm_unpacklo_epi8(a, a);
636 tmp = _mm_unpacklo_epi16(tmp, tmp);
637 return _mm_srai_epi32(tmp, 24);
638 #else
639 simde__m128i_private
640 r_,
641 a_ = simde__m128i_to_private(a);
642
643 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
644 int8x16_t s8x16 = a_.neon_i8; /* xxxx xxxx xxxx DCBA */
645 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
646 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
647 r_.neon_i32 = s32x4;
648 #elif defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
649 r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, a_.i8,
650 -1, -1, -1, 0, -1, -1, -1, 1,
651 -1, -1, -1, 2, -1, -1, -1, 3));
652 r_.i32 >>= 24;
653 #else
654 SIMDE_VECTORIZE
655 for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
656 r_.i32[i] = a_.i8[i];
657 }
658 #endif
659
660 return simde__m128i_from_private(r_);
661 #endif
662 }
663 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
664 #undef _mm_cvtepi8_epi32
665 #define _mm_cvtepi8_epi32(a) simde_mm_cvtepi8_epi32(a)
666 #endif
667
668 SIMDE_FUNCTION_ATTRIBUTES
669 simde__m128i
670 simde_mm_cvtepi8_epi64 (simde__m128i a) {
671 #if defined(SIMDE_X86_SSE4_1_NATIVE)
672 return _mm_cvtepi8_epi64(a);
673 #else
674 simde__m128i_private
675 r_,
676 a_ = simde__m128i_to_private(a);
677
678 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
679 int8x16_t s8x16 = a_.neon_i8; /* xxxx xxxx xxxx xxBA */
680 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */
681 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
682 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
683 r_.neon_i64 = s64x2;
684 #elif (!defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
685 /* Disabled on x86 due to lack of a 64-bit arithmetic shift until
686 * AVX-512 (at which point we would be using the native
687 * _mm_cvtepi8_epi64 anyway). */
688 r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, a_.i8,
689 -1, -1, -1, -1, -1, -1, -1, 0,
690 -1, -1, -1, -1, -1, -1, -1, 1));
691 r_.i64 >>= 56;
692 #else
693 SIMDE_VECTORIZE
694 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
695 r_.i64[i] = a_.i8[i];
696 }
697 #endif
698
699 return simde__m128i_from_private(r_);
700 #endif
701 }
702 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
703 #undef _mm_cvtepi8_epi64
704 #define _mm_cvtepi8_epi64(a) simde_mm_cvtepi8_epi64(a)
705 #endif
706
707 SIMDE_FUNCTION_ATTRIBUTES
708 simde__m128i
709 simde_mm_cvtepu8_epi16 (simde__m128i a) {
710 #if defined(SIMDE_X86_SSE4_1_NATIVE)
711 return _mm_cvtepu8_epi16(a);
712 #elif defined(SIMDE_X86_SSE2_NATIVE)
713 return _mm_unpacklo_epi8(a, _mm_setzero_si128());
714 #else
715 simde__m128i_private
716 r_,
717 a_ = simde__m128i_to_private(a);
718
719 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
720 uint8x16_t u8x16 = a_.neon_u8; /* xxxx xxxx xxxx DCBA */
721 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
722 r_.neon_u16 = u16x8;
723 #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
724 __typeof__(r_.i8) z = { 0, };
725 r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z,
726 0, 16, 1, 17, 2, 18, 3, 19,
727 4, 20, 5, 21, 6, 22, 7, 23));
728 #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_CLANG_45541) && (!defined(SIMDE_ARCH_POWER) || !defined(__clang__))
729 SIMDE_CONVERT_VECTOR_(r_.i16, a_.m64_private[0].u8);
730 #else
731 SIMDE_VECTORIZE
732 for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
733 r_.i16[i] = a_.u8[i];
734 }
735 #endif
736
737 return simde__m128i_from_private(r_);
738 #endif
739 }
740 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
741 #undef _mm_cvtepu8_epi16
742 #define _mm_cvtepu8_epi16(a) simde_mm_cvtepu8_epi16(a)
743 #endif
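
/* Illustrative usage sketch (not part of the upstream header): the cvtepu*
 * conversions zero-extend the low lanes while the cvtepi* conversions
 * sign-extend them, e.g.
 *
 *   simde__m128i v = simde_mm_set1_epi8(-1);
 *   simde__m128i u = simde_mm_cvtepu8_epi16(v);  // eight lanes of 0x00FF (255)
 *   simde__m128i s = simde_mm_cvtepi8_epi16(v);  // eight lanes of 0xFFFF (-1)
 */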
744
745 SIMDE_FUNCTION_ATTRIBUTES
746 simde__m128i
747 simde_mm_cvtepu8_epi32 (simde__m128i a) {
748 #if defined(SIMDE_X86_SSE4_1_NATIVE)
749 return _mm_cvtepu8_epi32(a);
750 #elif defined(SIMDE_X86_SSSE3_NATIVE)
751 __m128i s = _mm_set_epi8(
752 0x80, 0x80, 0x80, 0x03, 0x80, 0x80, 0x80, 0x02,
753 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x00);
754 return _mm_shuffle_epi8(a, s);
755 #elif defined(SIMDE_X86_SSE2_NATIVE)
756 __m128i z = _mm_setzero_si128();
757 return _mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z);
758 #else
759 simde__m128i_private
760 r_,
761 a_ = simde__m128i_to_private(a);
762
763 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
764 uint8x16_t u8x16 = a_.neon_u8; /* xxxx xxxx xxxx DCBA */
765 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
766 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
767 r_.neon_u32 = u32x4;
768 #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
769 __typeof__(r_.i8) z = { 0, };
770 r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z,
771 0, 17, 18, 19, 1, 21, 22, 23,
772 2, 25, 26, 27, 3, 29, 30, 31));
773 #else
774 SIMDE_VECTORIZE
775 for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
776 r_.i32[i] = a_.u8[i];
777 }
778 #endif
779
780 return simde__m128i_from_private(r_);
781 #endif
782 }
783 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
784 #undef _mm_cvtepu8_epi32
785 #define _mm_cvtepu8_epi32(a) simde_mm_cvtepu8_epi32(a)
786 #endif
787
788 SIMDE_FUNCTION_ATTRIBUTES
789 simde__m128i
790 simde_mm_cvtepu8_epi64 (simde__m128i a) {
791 #if defined(SIMDE_X86_SSE4_1_NATIVE)
792 return _mm_cvtepu8_epi64(a);
793 #elif defined(SIMDE_X86_SSSE3_NATIVE)
794 __m128i s = _mm_set_epi8(
795 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01,
796 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00);
797 return _mm_shuffle_epi8(a, s);
798 #elif defined(SIMDE_X86_SSE2_NATIVE)
799 __m128i z = _mm_setzero_si128();
800 return _mm_unpacklo_epi32(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z), z);
801 #else
802 simde__m128i_private
803 r_,
804 a_ = simde__m128i_to_private(a);
805
806 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
807 uint8x16_t u8x16 = a_.neon_u8; /* xxxx xxxx xxxx xxBA */
808 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */
809 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
810 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
811 r_.neon_u64 = u64x2;
812 #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
813 __typeof__(r_.i8) z = { 0, };
814 r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z,
815 0, 17, 18, 19, 20, 21, 22, 23,
816 1, 25, 26, 27, 28, 29, 30, 31));
817 #else
818 SIMDE_VECTORIZE
819 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
820 r_.i64[i] = a_.u8[i];
821 }
822 #endif
823
824 return simde__m128i_from_private(r_);
825 #endif
826 }
827 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
828 #undef _mm_cvtepu8_epi64
829 #define _mm_cvtepu8_epi64(a) simde_mm_cvtepu8_epi64(a)
830 #endif
831
832 SIMDE_FUNCTION_ATTRIBUTES
833 simde__m128i
834 simde_mm_cvtepi16_epi32 (simde__m128i a) {
835 #if defined(SIMDE_X86_SSE4_1_NATIVE)
836 return _mm_cvtepi16_epi32(a);
837 #elif defined(SIMDE_X86_SSE2_NATIVE)
838 return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
839 #else
840 simde__m128i_private
841 r_,
842 a_ = simde__m128i_to_private(a);
843
844 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
845 r_.neon_i32 = vmovl_s16(vget_low_s16(a_.neon_i16));
846 #elif !defined(SIMDE_ARCH_X86) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
847 r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, a_.i16, 8, 0, 10, 1, 12, 2, 14, 3));
848 r_.i32 >>= 16;
849 #else
850 SIMDE_VECTORIZE
851 for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
852 r_.i32[i] = a_.i16[i];
853 }
854 #endif
855
856 return simde__m128i_from_private(r_);
857 #endif
858 }
859 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
860 #undef _mm_cvtepi16_epi32
861 #define _mm_cvtepi16_epi32(a) simde_mm_cvtepi16_epi32(a)
862 #endif
863
864 SIMDE_FUNCTION_ATTRIBUTES
865 simde__m128i
866 simde_mm_cvtepu16_epi32 (simde__m128i a) {
867 #if defined(SIMDE_X86_SSE4_1_NATIVE)
868 return _mm_cvtepu16_epi32(a);
869 #elif defined(SIMDE_X86_SSE2_NATIVE)
870 return _mm_unpacklo_epi16(a, _mm_setzero_si128());
871 #else
872 simde__m128i_private
873 r_,
874 a_ = simde__m128i_to_private(a);
875
876 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
877 r_.neon_u32 = vmovl_u16(vget_low_u16(a_.neon_u16));
878 #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
879 __typeof__(r_.u16) z = { 0, };
880 r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.u16, z,
881 0, 9, 1, 11, 2, 13, 3, 15));
882 #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_CLANG_45541) && (!defined(SIMDE_ARCH_POWER) || !defined(__clang__))
883 SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].u16);
884 #else
885 SIMDE_VECTORIZE
886 for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
887 r_.i32[i] = a_.u16[i];
888 }
889 #endif
890
891 return simde__m128i_from_private(r_);
892 #endif
893 }
894 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
895 #undef _mm_cvtepu16_epi32
896 #define _mm_cvtepu16_epi32(a) simde_mm_cvtepu16_epi32(a)
897 #endif
898
899 SIMDE_FUNCTION_ATTRIBUTES
900 simde__m128i
901 simde_mm_cvtepu16_epi64 (simde__m128i a) {
902 #if defined(SIMDE_X86_SSE4_1_NATIVE)
903 return _mm_cvtepu16_epi64(a);
904 #elif defined(SIMDE_X86_SSE2_NATIVE)
905 __m128i z = _mm_setzero_si128();
906 return _mm_unpacklo_epi32(_mm_unpacklo_epi16(a, z), z);
907 #else
908 simde__m128i_private
909 r_,
910 a_ = simde__m128i_to_private(a);
911
912 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
913 uint16x8_t u16x8 = a_.neon_u16; /* xxxx xxxx xxxx 0B0A */
914 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
915 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
916 r_.neon_u64 = u64x2;
917 #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
918 __typeof__(r_.u16) z = { 0, };
919 r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.u16, z,
920 0, 9, 10, 11,
921 1, 13, 14, 15));
922 #else
923 SIMDE_VECTORIZE
924 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
925 r_.i64[i] = a_.u16[i];
926 }
927 #endif
928
929 return simde__m128i_from_private(r_);
930 #endif
931 }
932 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
933 #undef _mm_cvtepu16_epi64
934 #define _mm_cvtepu16_epi64(a) simde_mm_cvtepu16_epi64(a)
935 #endif
936
937 SIMDE_FUNCTION_ATTRIBUTES
938 simde__m128i
939 simde_mm_cvtepi16_epi64 (simde__m128i a) {
940 #if defined(SIMDE_X86_SSE4_1_NATIVE)
941 return _mm_cvtepi16_epi64(a);
942 #else
943 simde__m128i_private
944 r_,
945 a_ = simde__m128i_to_private(a);
946
947 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
948 int16x8_t s16x8 = a_.neon_i16; /* xxxx xxxx xxxx 0B0A */
949 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
950 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
951 r_.neon_i64 = s64x2;
952 #elif (!defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
953 r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, a_.i16,
954 8, 9, 10, 0,
955 12, 13, 14, 1));
956 r_.i64 >>= 48;
957 #else
958 SIMDE_VECTORIZE
959 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
960 r_.i64[i] = a_.i16[i];
961 }
962 #endif
963
964 return simde__m128i_from_private(r_);
965 #endif
966 }
967 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
968 #undef _mm_cvtepi16_epi64
969 #define _mm_cvtepi16_epi64(a) simde_mm_cvtepi16_epi64(a)
970 #endif
971
972 SIMDE_FUNCTION_ATTRIBUTES
973 simde__m128i
974 simde_mm_cvtepi32_epi64 (simde__m128i a) {
975 #if defined(SIMDE_X86_SSE4_1_NATIVE)
976 return _mm_cvtepi32_epi64(a);
977 #elif defined(SIMDE_X86_SSE2_NATIVE)
978 __m128i tmp = _mm_shuffle_epi32(a, 0x50);
979 tmp = _mm_srai_epi32(tmp, 31);
980 tmp = _mm_shuffle_epi32(tmp, 0xed);
981 return _mm_unpacklo_epi32(a, tmp);
982 #else
983 simde__m128i_private
984 r_,
985 a_ = simde__m128i_to_private(a);
986
987 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
988 r_.neon_i64 = vmovl_s32(vget_low_s32(a_.neon_i32));
989 #elif !defined(SIMDE_ARCH_X86) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
990 r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, a_.i32, -1, 0, -1, 1));
991 r_.i64 >>= 32;
992 #elif defined(SIMDE_CONVERT_VECTOR_)
993 SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].i32);
994 #else
995 SIMDE_VECTORIZE
996 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
997 r_.i64[i] = a_.i32[i];
998 }
999 #endif
1000
1001 return simde__m128i_from_private(r_);
1002 #endif
1003 }
1004 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1005 #undef _mm_cvtepi32_epi64
1006 #define _mm_cvtepi32_epi64(a) simde_mm_cvtepi32_epi64(a)
1007 #endif
1008
1009 SIMDE_FUNCTION_ATTRIBUTES
1010 simde__m128i
1011 simde_mm_cvtepu32_epi64 (simde__m128i a) {
1012 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1013 return _mm_cvtepu32_epi64(a);
1014 #elif defined(SIMDE_X86_SSE2_NATIVE)
1015 return _mm_unpacklo_epi32(a, _mm_setzero_si128());
1016 #else
1017 simde__m128i_private
1018 r_,
1019 a_ = simde__m128i_to_private(a);
1020
1021 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1022 r_.neon_u64 = vmovl_u32(vget_low_u32(a_.neon_u32));
1023 #elif defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
1024 __typeof__(r_.u32) z = { 0, };
1025 r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(32, 16, a_.u32, z, 0, 4, 1, 6));
1026 #elif defined(SIMDE_CONVERT_VECTOR_)
1027 SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].u32);
1028 #else
1029 SIMDE_VECTORIZE
1030 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
1031 r_.i64[i] = a_.u32[i];
1032 }
1033 #endif
1034
1035 return simde__m128i_from_private(r_);
1036 #endif
1037 }
1038 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1039 #undef _mm_cvtepu32_epi64
1040 #define _mm_cvtepu32_epi64(a) simde_mm_cvtepu32_epi64(a)
1041 #endif
1042
1043 SIMDE_FUNCTION_ATTRIBUTES
1044 simde__m128d
1045 simde_mm_dp_pd (simde__m128d a, simde__m128d b, const int imm8)
1046 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
1047 simde__m128d_private
1048 r_,
1049 a_ = simde__m128d_to_private(a),
1050 b_ = simde__m128d_to_private(b);
1051
1052 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1053 r_.neon_f64 = vmulq_f64(a_.neon_f64, b_.neon_f64);
1054
1055 switch (imm8) {
1056 case 0xff:
1057 r_.neon_f64 = vaddq_f64(r_.neon_f64, vextq_f64(r_.neon_f64, r_.neon_f64, 1));
1058 break;
1059 case 0x13:
1060 r_.neon_f64 = vdupq_lane_f64(vget_low_f64(r_.neon_f64), 0);
1061 break;
1062 default:
1063 { /* imm8 is a compile-time constant, so this all becomes just a load */
1064 uint64_t mask_data[] = {
1065 (imm8 & (1 << 4)) ? ~UINT64_C(0) : UINT64_C(0),
1066 (imm8 & (1 << 5)) ? ~UINT64_C(0) : UINT64_C(0),
1067 };
1068 r_.neon_f64 = vreinterpretq_f64_u64(vandq_u64(vld1q_u64(mask_data), vreinterpretq_u64_f64(r_.neon_f64)));
1069 }
1070
1071 r_.neon_f64 = vdupq_n_f64(vaddvq_f64(r_.neon_f64));
1072
1073 {
1074 uint64_t mask_data[] = {
1075 (imm8 & 1) ? ~UINT64_C(0) : UINT64_C(0),
1076 (imm8 & 2) ? ~UINT64_C(0) : UINT64_C(0)
1077 };
1078 r_.neon_f64 = vreinterpretq_f64_u64(vandq_u64(vld1q_u64(mask_data), vreinterpretq_u64_f64(r_.neon_f64)));
1079 }
1080 break;
1081 }
1082 #else
1083 simde_float64 sum = SIMDE_FLOAT64_C(0.0);
1084
1085 SIMDE_VECTORIZE_REDUCTION(+:sum)
1086 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1087 sum += ((imm8 >> (i + 4)) & 1) ? (a_.f64[i] * b_.f64[i]) : 0.0;
1088 }
1089
1090 SIMDE_VECTORIZE
1091 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1092 r_.f64[i] = ((imm8 >> i) & 1) ? sum : 0.0;
1093 }
1094 #endif
1095
1096 return simde__m128d_from_private(r_);
1097 }
1098 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1099 # define simde_mm_dp_pd(a, b, imm8) _mm_dp_pd(a, b, imm8)
1100 #endif
1101 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1102 #undef _mm_dp_pd
1103 #define _mm_dp_pd(a, b, imm8) simde_mm_dp_pd(a, b, imm8)
1104 #endif
1105
1106 SIMDE_FUNCTION_ATTRIBUTES
1107 simde__m128
1108 simde_mm_dp_ps (simde__m128 a, simde__m128 b, const int imm8)
1109 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
1110 simde__m128_private
1111 r_,
1112 a_ = simde__m128_to_private(a),
1113 b_ = simde__m128_to_private(b);
1114
1115 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1116 r_.neon_f32 = vmulq_f32(a_.neon_f32, b_.neon_f32);
1117
1118 switch (imm8) {
1119 case 0xff:
1120 r_.neon_f32 = vdupq_n_f32(vaddvq_f32(r_.neon_f32));
1121 break;
1122 case 0x7f:
1123 r_.neon_f32 = vsetq_lane_f32(0, r_.neon_f32, 3);
1124 r_.neon_f32 = vdupq_n_f32(vaddvq_f32(r_.neon_f32));
1125 break;
1126 default:
1127 {
1128 {
1129 uint32_t mask_data[] = {
1130 (imm8 & (1 << 4)) ? ~UINT32_C(0) : UINT32_C(0),
1131 (imm8 & (1 << 5)) ? ~UINT32_C(0) : UINT32_C(0),
1132 (imm8 & (1 << 6)) ? ~UINT32_C(0) : UINT32_C(0),
1133 (imm8 & (1 << 7)) ? ~UINT32_C(0) : UINT32_C(0)
1134 };
1135 r_.neon_f32 = vreinterpretq_f32_u32(vandq_u32(vld1q_u32(mask_data), vreinterpretq_u32_f32(r_.neon_f32)));
1136 }
1137
1138 r_.neon_f32 = vdupq_n_f32(vaddvq_f32(r_.neon_f32));
1139
1140 {
1141 uint32_t mask_data[] = {
1142 (imm8 & 1) ? ~UINT32_C(0) : UINT32_C(0),
1143 (imm8 & 2) ? ~UINT32_C(0) : UINT32_C(0),
1144 (imm8 & 4) ? ~UINT32_C(0) : UINT32_C(0),
1145 (imm8 & 8) ? ~UINT32_C(0) : UINT32_C(0)
1146 };
1147 r_.neon_f32 = vreinterpretq_f32_u32(vandq_u32(vld1q_u32(mask_data), vreinterpretq_u32_f32(r_.neon_f32)));
1148 }
1149 }
1150 break;
1151 }
1152 #else
1153 simde_float32 sum = SIMDE_FLOAT32_C(0.0);
1154
1155 SIMDE_VECTORIZE_REDUCTION(+:sum)
1156 for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1157 sum += ((imm8 >> (i + 4)) & 1) ? (a_.f32[i] * b_.f32[i]) : SIMDE_FLOAT32_C(0.0);
1158 }
1159
1160 SIMDE_VECTORIZE
1161 for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1162 r_.f32[i] = ((imm8 >> i) & 1) ? sum : SIMDE_FLOAT32_C(0.0);
1163 }
1164 #endif
1165
1166 return simde__m128_from_private(r_);
1167 }
1168 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1169 # define simde_mm_dp_ps(a, b, imm8) _mm_dp_ps(a, b, imm8)
1170 #endif
1171 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1172 #undef _mm_dp_ps
1173 #define _mm_dp_ps(a, b, imm8) simde_mm_dp_ps(a, b, imm8)
1174 #endif
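
/* Illustrative usage sketch (not part of the upstream header): the high
 * nibble of imm8 selects which lane products enter the sum, the low nibble
 * selects which result lanes receive it (the rest are zeroed), e.g.
 *
 *   simde__m128 x = simde_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
 *   simde__m128 y = simde_mm_set1_ps(1.0f);
 *   simde__m128 d = simde_mm_dp_ps(x, y, 0xF1);  // lane 0 = 10.0f, other lanes zero
 */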
1175
1176 #if defined(simde_mm_extract_epi8)
1177 # undef simde_mm_extract_epi8
1178 #endif
1179 SIMDE_FUNCTION_ATTRIBUTES
1180 int8_t
1181 simde_mm_extract_epi8 (simde__m128i a, const int imm8)
1182 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) {
1183 simde__m128i_private
1184 a_ = simde__m128i_to_private(a);
1185
1186 #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1187 #if defined(SIMDE_BUG_GCC_95227)
1188 (void) a_;
1189 (void) imm8;
1190 #endif
1191 return vec_extract(a_.altivec_i8, imm8);
1192 #else
1193 return a_.i8[imm8 & 15];
1194 #endif
1195 }
1196 #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_EXTRACT_EPI8)
1197 # define simde_mm_extract_epi8(a, imm8) HEDLEY_STATIC_CAST(int8_t, _mm_extract_epi8(a, imm8))
1198 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1199 # define simde_mm_extract_epi8(a, imm8) vgetq_lane_s8(simde__m128i_to_private(a).neon_i8, imm8)
1200 #endif
1201 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1202 #undef _mm_extract_epi8
1203 #define _mm_extract_epi8(a, imm8) HEDLEY_STATIC_CAST(int, simde_mm_extract_epi8(a, imm8))
1204 #endif
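
/* Illustrative usage sketch (not part of the upstream header): imm8 must be
 * a constant lane index in [0, 15], counted from the least-significant byte:
 *
 *   simde__m128i v = simde_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
 *                                       7,  6,  5,  4,  3,  2, 1, 0);
 *   int8_t lane3 = simde_mm_extract_epi8(v, 3);  // == 3
 */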
1205
1206 #if defined(simde_mm_extract_epi32)
1207 # undef simde_mm_extract_epi32
1208 #endif
1209 SIMDE_FUNCTION_ATTRIBUTES
1210 int32_t
1211 simde_mm_extract_epi32 (simde__m128i a, const int imm8)
1212 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) {
1213 simde__m128i_private
1214 a_ = simde__m128i_to_private(a);
1215
1216 #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1217 #if defined(SIMDE_BUG_GCC_95227)
1218 (void) a_;
1219 (void) imm8;
1220 #endif
1221 return vec_extract(a_.altivec_i32, imm8);
1222 #else
1223 return a_.i32[imm8 & 3];
1224 #endif
1225 }
1226 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1227 # define simde_mm_extract_epi32(a, imm8) _mm_extract_epi32(a, imm8)
1228 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1229 # define simde_mm_extract_epi32(a, imm8) vgetq_lane_s32(simde__m128i_to_private(a).neon_i32, imm8)
1230 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1231 # define simde_mm_extract_epi32(a, imm8) HEDLEY_STATIC_CAST(int32_t, vec_extract(simde__m128i_to_private(a).altivec_i32, imm8))
1232 #endif
1233 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1234 #undef _mm_extract_epi32
1235 #define _mm_extract_epi32(a, imm8) simde_mm_extract_epi32(a, imm8)
1236 #endif
1237
1238 #if defined(simde_mm_extract_epi64)
1239 # undef simde_mm_extract_epi64
1240 #endif
1241 SIMDE_FUNCTION_ATTRIBUTES
1242 int64_t
1243 simde_mm_extract_epi64 (simde__m128i a, const int imm8)
1244 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) {
1245 simde__m128i_private
1246 a_ = simde__m128i_to_private(a);
1247
1248 #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
1249 #if defined(SIMDE_BUG_GCC_95227)
1250 (void) a_;
1251 (void) imm8;
1252 #endif
1253 return vec_extract(a_.altivec_i64, imm8);
1254 #else
1255 return a_.i64[imm8 & 1];
1256 #endif
1257 }
1258 #if defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ARCH_AMD64)
1259 # define simde_mm_extract_epi64(a, imm8) _mm_extract_epi64(a, imm8)
1260 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1261 # define simde_mm_extract_epi64(a, imm8) vgetq_lane_s64(simde__m128i_to_private(a).neon_i64, imm8)
1262 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
1263 # define simde_mm_extract_epi64(a, imm8) HEDLEY_STATIC_CAST(int64_t, vec_extract(simde__m128i_to_private(a).altivec_i64, imm8))
1264 #endif
1265 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1266 #undef _mm_extract_epi64
1267 #define _mm_extract_epi64(a, imm8) simde_mm_extract_epi64(a, imm8)
1268 #endif
1269
1270 #if defined(simde_mm_extract_ps)
1271 # undef simde_mm_extract_ps
1272 #endif
1273 SIMDE_FUNCTION_ATTRIBUTES
1274 int32_t
1275 simde_mm_extract_ps (simde__m128 a, const int imm8)
1276 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) {
1277 simde__m128_private
1278 a_ = simde__m128_to_private(a);
1279
1280 return a_.i32[imm8 & 3];
1281 }
1282 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1283 #define simde_mm_extract_ps(a, imm8) _mm_extract_ps(a, imm8)
1284 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1285 #define simde_mm_extract_ps(a, imm8) vgetq_lane_s32(simde__m128_to_private(a).neon_i32, imm8)
1286 #endif
1287 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1288 #undef _mm_extract_ps
1289 #define _mm_extract_ps(a, imm8) simde_mm_extract_ps(a, imm8)
1290 #endif
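
/* Illustrative usage sketch (not part of the upstream header): the return
 * value is the raw bit pattern of the selected lane as an int32_t; to recover
 * the float, reinterpret the bits (assumes <string.h> for memcpy):
 *
 *   int32_t bits = simde_mm_extract_ps(v, 2);
 *   float f;
 *   memcpy(&f, &bits, sizeof(f));
 */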
1291
1292 SIMDE_FUNCTION_ATTRIBUTES
1293 simde__m128d
1294 simde_mm_floor_pd (simde__m128d a) {
1295 return simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_NEG_INF);
1296 }
1297 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1298 #undef _mm_floor_pd
1299 #define _mm_floor_pd(a) simde_mm_floor_pd(a)
1300 #endif
1301
1302 SIMDE_FUNCTION_ATTRIBUTES
1303 simde__m128
1304 simde_mm_floor_ps (simde__m128 a) {
1305 return simde_mm_round_ps(a, SIMDE_MM_FROUND_TO_NEG_INF);
1306 }
1307 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1308 #undef _mm_floor_ps
1309 #define _mm_floor_ps(a) simde_mm_floor_ps(a)
1310 #endif
1311
1312 SIMDE_FUNCTION_ATTRIBUTES
1313 simde__m128d
1314 simde_mm_floor_sd (simde__m128d a, simde__m128d b) {
1315 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1316 return _mm_floor_sd(a, b);
1317 #else
1318 simde__m128d_private
1319 r_,
1320 a_ = simde__m128d_to_private(a),
1321 b_ = simde__m128d_to_private(b);
1322
1323 #if defined(simde_math_floor)
1324 r_.f64[0] = simde_math_floor(b_.f64[0]);
1325 r_.f64[1] = a_.f64[1];
1326 #else
1327 HEDLEY_UNREACHABLE();
1328 #endif
1329
1330 return simde__m128d_from_private(r_);
1331 #endif
1332 }
1333 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1334 #undef _mm_floor_sd
1335 #define _mm_floor_sd(a, b) simde_mm_floor_sd(a, b)
1336 #endif
1337
1338 SIMDE_FUNCTION_ATTRIBUTES
1339 simde__m128
1340 simde_mm_floor_ss (simde__m128 a, simde__m128 b) {
1341 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1342 return _mm_floor_ss(a, b);
1343 #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1344 return simde_mm_move_ss(a, simde_mm_floor_ps(b));
1345 #else
1346 simde__m128_private
1347 r_,
1348 a_ = simde__m128_to_private(a),
1349 b_ = simde__m128_to_private(b);
1350
1351 #if defined(simde_math_floorf)
1352 r_.f32[0] = simde_math_floorf(b_.f32[0]);
1353 for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1354 r_.f32[i] = a_.f32[i];
1355 }
1356 #else
1357 HEDLEY_UNREACHABLE();
1358 #endif
1359
1360 return simde__m128_from_private(r_);
1361 #endif
1362 }
1363 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1364 #undef _mm_floor_ss
1365 #define _mm_floor_ss(a, b) simde_mm_floor_ss(a, b)
1366 #endif
1367
1368 SIMDE_FUNCTION_ATTRIBUTES
1369 simde__m128i
1370 simde_mm_insert_epi8 (simde__m128i a, int i, const int imm8)
1371 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) {
1372 simde__m128i_private
1373 r_ = simde__m128i_to_private(a);
1374
1375 r_.i8[imm8] = HEDLEY_STATIC_CAST(int8_t, i);
1376
1377 return simde__m128i_from_private(r_);
1378 }
1379 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1380 /* clang-3.8 returns an incompatible type, so we need the cast. MSVC
1381 * can't handle the cast ("error C2440: 'type cast': cannot convert
1382 * from '__m128i' to '__m128i'"). */
1383 #if defined(__clang__)
1384 #define simde_mm_insert_epi8(a, i, imm8) HEDLEY_STATIC_CAST(__m128i, _mm_insert_epi8(a, i, imm8))
1385 #else
1386 #define simde_mm_insert_epi8(a, i, imm8) _mm_insert_epi8(a, i, imm8)
1387 #endif
1388 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1389 # define simde_mm_insert_epi8(a, i, imm8) simde__m128i_from_neon_i8(vsetq_lane_s8(i, simde__m128i_to_private(a).neon_i8, imm8))
1390 #endif
1391 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1392 #undef _mm_insert_epi8
1393 #define _mm_insert_epi8(a, i, imm8) simde_mm_insert_epi8(a, i, imm8)
1394 #endif
1395
1396 SIMDE_FUNCTION_ATTRIBUTES
1397 simde__m128i
1398 simde_mm_insert_epi32 (simde__m128i a, int i, const int imm8)
1399 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) {
1400 simde__m128i_private
1401 r_ = simde__m128i_to_private(a);
1402
1403 r_.i32[imm8] = HEDLEY_STATIC_CAST(int32_t, i);
1404
1405 return simde__m128i_from_private(r_);
1406 }
1407 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1408 #if defined(__clang__)
1409 #define simde_mm_insert_epi32(a, i, imm8) HEDLEY_STATIC_CAST(__m128i, _mm_insert_epi32(a, i, imm8))
1410 #else
1411 #define simde_mm_insert_epi32(a, i, imm8) _mm_insert_epi32(a, i, imm8)
1412 #endif
1413 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1414 # define simde_mm_insert_epi32(a, i, imm8) simde__m128i_from_neon_i32(vsetq_lane_s32(i, simde__m128i_to_private(a).neon_i32, imm8))
1415 #endif
1416 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1417 #undef _mm_insert_epi32
1418 #define _mm_insert_epi32(a, i, imm8) simde_mm_insert_epi32(a, i, imm8)
1419 #endif
1420
1421 SIMDE_FUNCTION_ATTRIBUTES
1422 simde__m128i
1423 simde_mm_insert_epi64 (simde__m128i a, int64_t i, const int imm8)
1424 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) {
1425 #if defined(SIMDE_BUG_GCC_94482)
1426 simde__m128i_private
1427 a_ = simde__m128i_to_private(a);
1428
1429 switch(imm8) {
1430 case 0:
1431 return simde_mm_set_epi64x(a_.i64[1], i);
1432 break;
1433 case 1:
1434 return simde_mm_set_epi64x(i, a_.i64[0]);
1435 break;
1436 default:
1437 HEDLEY_UNREACHABLE();
1438 break;
1439 }
1440 #else
1441 simde__m128i_private
1442 r_ = simde__m128i_to_private(a);
1443
1444 r_.i64[imm8] = i;
1445 return simde__m128i_from_private(r_);
1446 #endif
1447 }
1448 #if defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ARCH_AMD64)
1449 # define simde_mm_insert_epi64(a, i, imm8) _mm_insert_epi64(a, i, imm8)
1450 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1451 # define simde_mm_insert_epi64(a, i, imm8) simde__m128i_from_neon_i64(vsetq_lane_s64(i, simde__m128i_to_private(a).neon_i64, imm8))
1452 #endif
1453 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1454 #undef _mm_insert_epi64
1455 #define _mm_insert_epi64(a, i, imm8) simde_mm_insert_epi64(a, i, imm8)
1456 #endif
1457
1458 SIMDE_FUNCTION_ATTRIBUTES
1459 simde__m128
1460 simde_mm_insert_ps (simde__m128 a, simde__m128 b, const int imm8)
1461 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
1462 simde__m128_private
1463 r_,
1464 a_ = simde__m128_to_private(a),
1465 b_ = simde__m128_to_private(b);
1466
1467 simde_float32 tmp1_ = b_.f32[(imm8 >> 6) & 3];
1468 a_.f32[(imm8 >> 4) & 3] = tmp1_;
1469
1470 SIMDE_VECTORIZE
1471 for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
1472 r_.f32[i] = ((imm8 >> i) & 1) ? SIMDE_FLOAT32_C(0.0) : a_.f32[i];
1473 }
1474
1475 return simde__m128_from_private(r_);
1476 }
1477 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1478 # define simde_mm_insert_ps(a, b, imm8) _mm_insert_ps(a, b, imm8)
1479 #endif
1480 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1481 #undef _mm_insert_ps
1482 #define _mm_insert_ps(a, b, imm8) simde_mm_insert_ps(a, b, imm8)
1483 #endif
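
/* Illustrative usage sketch (not part of the upstream header): per the SSE4.1
 * definition, bits 7:6 of imm8 pick the source lane of b, bits 5:4 pick the
 * destination lane in a, and bits 3:0 zero the corresponding result lanes:
 *
 *   // copy lane 0 of b into lane 2 of a, zero nothing
 *   simde__m128 r = simde_mm_insert_ps(a, b, (0 << 6) | (2 << 4) | 0x0);
 */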
1484
1485 SIMDE_FUNCTION_ATTRIBUTES
1486 simde__m128i
1487 simde_mm_max_epi8 (simde__m128i a, simde__m128i b) {
1488 #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI)
1489 return _mm_max_epi8(a, b);
1490 #else
1491 simde__m128i_private
1492 r_,
1493 a_ = simde__m128i_to_private(a),
1494 b_ = simde__m128i_to_private(b);
1495
1496 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1497 r_.neon_i8 = vmaxq_s8(a_.neon_i8, b_.neon_i8);
1498 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1499 r_.wasm_v128 = wasm_i8x16_max(a_.wasm_v128, b_.wasm_v128);
1500 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1501 r_.altivec_i8 = vec_max(a_.altivec_i8, b_.altivec_i8);
1502 #else
1503 SIMDE_VECTORIZE
1504 for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1505 r_.i8[i] = a_.i8[i] > b_.i8[i] ? a_.i8[i] : b_.i8[i];
1506 }
1507 #endif
1508
1509 return simde__m128i_from_private(r_);
1510 #endif
1511 }
1512 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1513 #undef _mm_max_epi8
1514 #define _mm_max_epi8(a, b) simde_mm_max_epi8(a, b)
1515 #endif
1516
1517 SIMDE_FUNCTION_ATTRIBUTES
1518 simde__m128i
1519 simde_mm_max_epi32 (simde__m128i a, simde__m128i b) {
1520 #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI)
1521 return _mm_max_epi32(a, b);
1522 #else
1523 simde__m128i_private
1524 r_,
1525 a_ = simde__m128i_to_private(a),
1526 b_ = simde__m128i_to_private(b);
1527
1528 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1529 r_.neon_i32 = vmaxq_s32(a_.neon_i32, b_.neon_i32);
1530 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1531 r_.wasm_v128 = wasm_i32x4_max(a_.wasm_v128, b_.wasm_v128);
1532 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1533 r_.altivec_i32 = vec_max(a_.altivec_i32, b_.altivec_i32);
1534 #else
1535 SIMDE_VECTORIZE
1536 for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1537 r_.i32[i] = a_.i32[i] > b_.i32[i] ? a_.i32[i] : b_.i32[i];
1538 }
1539 #endif
1540
1541 return simde__m128i_from_private(r_);
1542 #endif
1543 }
1544 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1545 #undef _mm_max_epi32
1546 #define _mm_max_epi32(a, b) simde_mm_max_epi32(a, b)
1547 #endif
1548
1549 SIMDE_FUNCTION_ATTRIBUTES
1550 simde__m128i
1551 simde_mm_max_epu16 (simde__m128i a, simde__m128i b) {
1552 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1553 return _mm_max_epu16(a, b);
1554 #else
1555 simde__m128i_private
1556 r_,
1557 a_ = simde__m128i_to_private(a),
1558 b_ = simde__m128i_to_private(b);
1559
1560 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1561 r_.neon_u16 = vmaxq_u16(a_.neon_u16, b_.neon_u16);
1562 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1563 r_.wasm_v128 = wasm_u16x8_max(a_.wasm_v128, b_.wasm_v128);
1564 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1565 r_.altivec_u16 = vec_max(a_.altivec_u16, b_.altivec_u16);
1566 #else
1567 SIMDE_VECTORIZE
1568 for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
1569 r_.u16[i] = a_.u16[i] > b_.u16[i] ? a_.u16[i] : b_.u16[i];
1570 }
1571 #endif
1572
1573 return simde__m128i_from_private(r_);
1574 #endif
1575 }
1576 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1577 #undef _mm_max_epu16
1578 #define _mm_max_epu16(a, b) simde_mm_max_epu16(a, b)
1579 #endif
1580
1581 SIMDE_FUNCTION_ATTRIBUTES
1582 simde__m128i
1583 simde_mm_max_epu32 (simde__m128i a, simde__m128i b) {
1584 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1585 return _mm_max_epu32(a, b);
1586 #else
1587 simde__m128i_private
1588 r_,
1589 a_ = simde__m128i_to_private(a),
1590 b_ = simde__m128i_to_private(b);
1591
1592 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1593 r_.neon_u32 = vmaxq_u32(a_.neon_u32, b_.neon_u32);
1594 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1595 r_.wasm_v128 = wasm_u32x4_max(a_.wasm_v128, b_.wasm_v128);
1596 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1597 r_.altivec_u32 = vec_max(a_.altivec_u32, b_.altivec_u32);
1598 #else
1599 SIMDE_VECTORIZE
1600 for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
1601 r_.u32[i] = a_.u32[i] > b_.u32[i] ? a_.u32[i] : b_.u32[i];
1602 }
1603 #endif
1604
1605 return simde__m128i_from_private(r_);
1606 #endif
1607 }
1608 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1609 #undef _mm_max_epu32
1610 #define _mm_max_epu32(a, b) simde_mm_max_epu32(a, b)
1611 #endif
1612
1613 SIMDE_FUNCTION_ATTRIBUTES
1614 simde__m128i
1615 simde_mm_min_epi8 (simde__m128i a, simde__m128i b) {
1616 #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI)
1617 return _mm_min_epi8(a, b);
1618 #else
1619 simde__m128i_private
1620 r_,
1621 a_ = simde__m128i_to_private(a),
1622 b_ = simde__m128i_to_private(b);
1623
1624 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1625 r_.neon_i8 = vminq_s8(a_.neon_i8, b_.neon_i8);
1626 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1627 r_.wasm_v128 = wasm_i8x16_min(a_.wasm_v128, b_.wasm_v128);
1628 #else
1629 SIMDE_VECTORIZE
1630 for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1631 r_.i8[i] = a_.i8[i] < b_.i8[i] ? a_.i8[i] : b_.i8[i];
1632 }
1633 #endif
1634
1635 return simde__m128i_from_private(r_);
1636 #endif
1637 }
1638 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1639 #undef _mm_min_epi8
1640 #define _mm_min_epi8(a, b) simde_mm_min_epi8(a, b)
1641 #endif
1642
1643 SIMDE_FUNCTION_ATTRIBUTES
1644 simde__m128i
1645 simde_mm_min_epi32 (simde__m128i a, simde__m128i b) {
1646 #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI)
1647 return _mm_min_epi32(a, b);
1648 #else
1649 simde__m128i_private
1650 r_,
1651 a_ = simde__m128i_to_private(a),
1652 b_ = simde__m128i_to_private(b);
1653
1654 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1655 r_.neon_i32 = vminq_s32(a_.neon_i32, b_.neon_i32);
1656 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1657 r_.wasm_v128 = wasm_i32x4_min(a_.wasm_v128, b_.wasm_v128);
1658 #else
1659 SIMDE_VECTORIZE
1660 for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1661 r_.i32[i] = a_.i32[i] < b_.i32[i] ? a_.i32[i] : b_.i32[i];
1662 }
1663 #endif
1664
1665 return simde__m128i_from_private(r_);
1666 #endif
1667 }
1668 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1669 #undef _mm_min_epi32
1670 #define _mm_min_epi32(a, b) simde_mm_min_epi32(a, b)
1671 #endif
1672
1673 SIMDE_FUNCTION_ATTRIBUTES
1674 simde__m128i
1675 simde_mm_min_epu16 (simde__m128i a, simde__m128i b) {
1676 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1677 return _mm_min_epu16(a, b);
1678 #else
1679 simde__m128i_private
1680 r_,
1681 a_ = simde__m128i_to_private(a),
1682 b_ = simde__m128i_to_private(b);
1683
1684 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1685 r_.neon_u16 = vminq_u16(a_.neon_u16, b_.neon_u16);
1686 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1687 r_.wasm_v128 = wasm_u16x8_min(a_.wasm_v128, b_.wasm_v128);
1688 #else
1689 SIMDE_VECTORIZE
1690 for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
1691 r_.u16[i] = a_.u16[i] < b_.u16[i] ? a_.u16[i] : b_.u16[i];
1692 }
1693 #endif
1694
1695 return simde__m128i_from_private(r_);
1696 #endif
1697 }
1698 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1699 #undef _mm_min_epu16
1700 #define _mm_min_epu16(a, b) simde_mm_min_epu16(a, b)
1701 #endif
1702
1703 SIMDE_FUNCTION_ATTRIBUTES
1704 simde__m128i
1705 simde_mm_min_epu32 (simde__m128i a, simde__m128i b) {
1706 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1707 return _mm_min_epu32(a, b);
1708 #else
1709 simde__m128i_private
1710 r_,
1711 a_ = simde__m128i_to_private(a),
1712 b_ = simde__m128i_to_private(b);
1713
1714 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1715 r_.neon_u32 = vminq_u32(a_.neon_u32, b_.neon_u32);
1716 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1717 r_.wasm_v128 = wasm_u32x4_min(a_.wasm_v128, b_.wasm_v128);
1718 #else
1719 SIMDE_VECTORIZE
1720 for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
1721 r_.u32[i] = a_.u32[i] < b_.u32[i] ? a_.u32[i] : b_.u32[i];
1722 }
1723 #endif
1724
1725 return simde__m128i_from_private(r_);
1726 #endif
1727 }
1728 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1729 #undef _mm_min_epu32
1730 #define _mm_min_epu32(a, b) simde_mm_min_epu32(a, b)
1731 #endif
1732
1733 SIMDE_FUNCTION_ATTRIBUTES
1734 simde__m128i
1735 simde_mm_minpos_epu16 (simde__m128i a) {
1736 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1737 return _mm_minpos_epu16(a);
1738 #else
1739 simde__m128i_private
1740 r_ = simde__m128i_to_private(simde_mm_setzero_si128()),
1741 a_ = simde__m128i_to_private(a);
1742
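  /* Horizontal minimum: lane 0 receives the smallest u16 of a and lane 1 its
   * index; the remaining lanes stay zero since r_ starts from setzero. */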
1743 r_.u16[0] = UINT16_MAX;
1744 for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
1745 if (a_.u16[i] < r_.u16[0]) {
1746 r_.u16[0] = a_.u16[i];
1747 r_.u16[1] = HEDLEY_STATIC_CAST(uint16_t, i);
1748 }
1749 }
1750
1751 return simde__m128i_from_private(r_);
1752 #endif
1753 }
1754 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1755 #undef _mm_minpos_epu16
1756 #define _mm_minpos_epu16(a) simde_mm_minpos_epu16(a)
1757 #endif
1758
1759 SIMDE_FUNCTION_ATTRIBUTES
1760 simde__m128i
1761 simde_mm_mpsadbw_epu8 (simde__m128i a, simde__m128i b, const int imm8)
1762 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
1763 simde__m128i_private
1764 r_,
1765 a_ = simde__m128i_to_private(a),
1766 b_ = simde__m128i_to_private(b);
1767
1768 const int a_offset = imm8 & 4;
1769 const int b_offset = (imm8 & 3) << 2;
1770
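  /* imm8 bit 2 selects a's starting byte offset (0 or 4) and imm8 bits 1:0
   * select a 4-byte block of b; each of the eight u16 results is the sum of
   * absolute differences between a sliding 4-byte window of a and that block. */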
1771 #if defined(simde_math_abs)
1772 for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, (sizeof(r_.u16) / sizeof(r_.u16[0]))) ; i++) {
1773 r_.u16[i] =
1774 HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 0] - b_.u8[b_offset + 0]))) +
1775 HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 1] - b_.u8[b_offset + 1]))) +
1776 HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 2] - b_.u8[b_offset + 2]))) +
1777 HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 3] - b_.u8[b_offset + 3])));
1778 }
1779 #else
1780 HEDLEY_UNREACHABLE();
1781 #endif
1782
1783 return simde__m128i_from_private(r_);
1784 }
1785 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1786 # define simde_mm_mpsadbw_epu8(a, b, imm8) _mm_mpsadbw_epu8(a, b, imm8)
1787 #endif
1788 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1789 #undef _mm_mpsadbw_epu8
1790 #define _mm_mpsadbw_epu8(a, b, imm8) simde_mm_mpsadbw_epu8(a, b, imm8)
1791 #endif
1792
1793 SIMDE_FUNCTION_ATTRIBUTES
1794 simde__m128i
1795 simde_mm_mul_epi32 (simde__m128i a, simde__m128i b) {
1796 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1797 return _mm_mul_epi32(a, b);
1798 #else
1799 simde__m128i_private
1800 r_,
1801 a_ = simde__m128i_to_private(a),
1802 b_ = simde__m128i_to_private(b);
1803
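  /* Multiply the even-indexed (low) signed 32-bit element of each 64-bit lane,
   * producing two sign-extended 64-bit products. */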
1804 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1805 // vmull_s32 upcasts instead of masking, so we downcast.
1806 int32x2_t a_lo = vmovn_s64(a_.neon_i64);
1807 int32x2_t b_lo = vmovn_s64(b_.neon_i64);
1808 r_.neon_i64 = vmull_s32(a_lo, b_lo);
1809 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1810 r_.wasm_v128 = wasm_i64x2_make(
1811 wasm_i32x4_extract_lane(a_.wasm_v128, 0) * HEDLEY_STATIC_CAST(int64_t, wasm_i32x4_extract_lane(b_.wasm_v128, 0)),
1812 wasm_i32x4_extract_lane(a_.wasm_v128, 2) * HEDLEY_STATIC_CAST(int64_t, wasm_i32x4_extract_lane(b_.wasm_v128, 2)));
1813 #else
1814 SIMDE_VECTORIZE
1815 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
1816 r_.i64[i] =
1817 HEDLEY_STATIC_CAST(int64_t, a_.i32[i * 2]) *
1818 HEDLEY_STATIC_CAST(int64_t, b_.i32[i * 2]);
1819 }
1820 #endif
1821
1822 return simde__m128i_from_private(r_);
1823 #endif
1824 }
1825 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1826 #undef _mm_mul_epi32
1827 #define _mm_mul_epi32(a, b) simde_mm_mul_epi32(a, b)
1828 #endif
1829
1830 SIMDE_FUNCTION_ATTRIBUTES
1831 simde__m128i
1832 simde_mm_mullo_epi32 (simde__m128i a, simde__m128i b) {
1833 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1834 return _mm_mullo_epi32(a, b);
1835 #else
1836 simde__m128i_private
1837 r_,
1838 a_ = simde__m128i_to_private(a),
1839 b_ = simde__m128i_to_private(b);
1840
1841 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1842 r_.neon_i32 = vmulq_s32(a_.neon_i32, b_.neon_i32);
1843 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1844 (void) a_;
1845 (void) b_;
1846 r_.altivec_i32 = vec_mul(a_.altivec_i32, b_.altivec_i32);
1847 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1848 r_.wasm_v128 = wasm_i32x4_mul(a_.wasm_v128, b_.wasm_v128);
1849 #else
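    /* Widen to 64 bits before multiplying so signed overflow is well defined,
     * then keep only the low 32 bits of each product. */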
1850 SIMDE_VECTORIZE
1851 for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1852 r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (HEDLEY_STATIC_CAST(uint64_t, (HEDLEY_STATIC_CAST(int64_t, a_.i32[i]) * HEDLEY_STATIC_CAST(int64_t, b_.i32[i]))) & 0xffffffff));
1853 }
1854 #endif
1855
1856 return simde__m128i_from_private(r_);
1857 #endif
1858 }
1859 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1860 #undef _mm_mullo_epi32
1861 #define _mm_mullo_epi32(a, b) simde_mm_mullo_epi32(a, b)
1862 #endif
1863
1864 SIMDE_FUNCTION_ATTRIBUTES
1865 simde__m128i
1866 simde_x_mm_mullo_epu32 (simde__m128i a, simde__m128i b) {
1867 simde__m128i_private
1868 r_,
1869 a_ = simde__m128i_to_private(a),
1870 b_ = simde__m128i_to_private(b);
1871
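  /* simde_x_* functions have no corresponding Intel intrinsic; this helper
   * performs an element-wise unsigned 32-bit multiply. */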
1872 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1873 r_.neon_u32 = vmulq_u32(a_.neon_u32, b_.neon_u32);
1874 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1875 r_.u32 = a_.u32 * b_.u32;
1876 #else
1877 SIMDE_VECTORIZE
1878 for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
1879 r_.u32[i] = a_.u32[i] * b_.u32[i];
1880 }
1881 #endif
1882
1883 return simde__m128i_from_private(r_);
1884 }
1885
1886 SIMDE_FUNCTION_ATTRIBUTES
1887 simde__m128i
1888 simde_mm_packus_epi32 (simde__m128i a, simde__m128i b) {
1889 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1890 return _mm_packus_epi32(a, b);
1891 #else
1892 simde__m128i_private
1893 r_,
1894 a_ = simde__m128i_to_private(a),
1895 b_ = simde__m128i_to_private(b);
1896
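  /* Narrow signed 32-bit elements to unsigned 16-bit with unsigned saturation:
   * negatives clamp to 0, values above UINT16_MAX clamp to UINT16_MAX.
   * a supplies lanes 0-3 of the result, b supplies lanes 4-7. */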
1897 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1898 const int32x4_t z = vdupq_n_s32(0);
1899 r_.neon_u16 = vcombine_u16(
1900 vqmovn_u32(vreinterpretq_u32_s32(vmaxq_s32(z, a_.neon_i32))),
1901 vqmovn_u32(vreinterpretq_u32_s32(vmaxq_s32(z, b_.neon_i32))));
1902 #else
1903 for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1904 r_.u16[i + 0] = (a_.i32[i] < 0) ? UINT16_C(0) : ((a_.i32[i] > UINT16_MAX) ? (UINT16_MAX) : HEDLEY_STATIC_CAST(uint16_t, a_.i32[i]));
1905 r_.u16[i + 4] = (b_.i32[i] < 0) ? UINT16_C(0) : ((b_.i32[i] > UINT16_MAX) ? (UINT16_MAX) : HEDLEY_STATIC_CAST(uint16_t, b_.i32[i]));
1906 }
1907 #endif
1908
1909 return simde__m128i_from_private(r_);
1910 #endif
1911 }
1912 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1913 #undef _mm_packus_epi32
1914 #define _mm_packus_epi32(a, b) simde_mm_packus_epi32(a, b)
1915 #endif
1916
1917 SIMDE_FUNCTION_ATTRIBUTES
1918 simde__m128d
1919 simde_mm_round_sd (simde__m128d a, simde__m128d b, int rounding)
1920 SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) {
1921 simde__m128d_private
1922 r_ = simde__m128d_to_private(a),
1923 b_ = simde__m128d_to_private(b);
1924
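  /* r_ starts as a copy of a, so only the low double is replaced with the
   * rounded low element of b; the SIMDE_MM_FROUND_NO_EXC bit is masked off
   * because exception suppression is not modeled by the portable fallback. */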
1925 switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
1926 #if defined(simde_math_nearbyint)
1927 case SIMDE_MM_FROUND_TO_NEAREST_INT:
1928 case SIMDE_MM_FROUND_CUR_DIRECTION:
1929 r_.f64[0] = simde_math_nearbyint(b_.f64[0]);
1930 break;
1931 #endif
1932
1933 #if defined(simde_math_floor)
1934 case SIMDE_MM_FROUND_TO_NEG_INF:
1935 r_.f64[0] = simde_math_floor(b_.f64[0]);
1936 break;
1937 #endif
1938
1939 #if defined(simde_math_ceil)
1940 case SIMDE_MM_FROUND_TO_POS_INF:
1941 r_.f64[0] = simde_math_ceil(b_.f64[0]);
1942 break;
1943 #endif
1944
1945 #if defined(simde_math_trunc)
1946 case SIMDE_MM_FROUND_TO_ZERO:
1947 r_.f64[0] = simde_math_trunc(b_.f64[0]);
1948 break;
1949 #endif
1950
1951 default:
1952 HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
1953 }
1954
1955 return simde__m128d_from_private(r_);
1956 }
1957 #if defined(SIMDE_X86_SSE4_1_NATIVE)
1958 # define simde_mm_round_sd(a, b, rounding) _mm_round_sd(a, b, rounding)
1959 #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
1960 # define simde_mm_round_sd(a, b, rounding) simde_mm_move_sd(a, simde_mm_round_pd(b, rounding))
1961 #endif
1962 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
1963 #undef _mm_round_sd
1964 #define _mm_round_sd(a, b, rounding) simde_mm_round_sd(a, b, rounding)
1965 #endif
1966
1967 SIMDE_FUNCTION_ATTRIBUTES
1968 simde__m128
1969 simde_mm_round_ss (simde__m128 a, simde__m128 b, int rounding)
1970 SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) {
1971 simde__m128_private
1972 r_ = simde__m128_to_private(a),
1973 b_ = simde__m128_to_private(b);
1974
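  /* Same scheme as simde_mm_round_sd, applied to the low single-precision
   * element of b. */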
1975 switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
1976 #if defined(simde_math_nearbyintf)
1977 case SIMDE_MM_FROUND_TO_NEAREST_INT:
1978 case SIMDE_MM_FROUND_CUR_DIRECTION:
1979 r_.f32[0] = simde_math_nearbyintf(b_.f32[0]);
1980 break;
1981 #endif
1982
1983 #if defined(simde_math_floorf)
1984 case SIMDE_MM_FROUND_TO_NEG_INF:
1985 r_.f32[0] = simde_math_floorf(b_.f32[0]);
1986 break;
1987 #endif
1988
1989 #if defined(simde_math_ceilf)
1990 case SIMDE_MM_FROUND_TO_POS_INF:
1991 r_.f32[0] = simde_math_ceilf(b_.f32[0]);
1992 break;
1993 #endif
1994
1995 #if defined(simde_math_truncf)
1996 case SIMDE_MM_FROUND_TO_ZERO:
1997 r_.f32[0] = simde_math_truncf(b_.f32[0]);
1998 break;
1999 #endif
2000
2001 default:
2002       HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_ps());
2003 }
2004
2005 return simde__m128_from_private(r_);
2006 }
2007 #if defined(SIMDE_X86_SSE4_1_NATIVE)
2008 # define simde_mm_round_ss(a, b, rounding) _mm_round_ss(a, b, rounding)
2009 #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
2010 # define simde_mm_round_ss(a, b, rounding) simde_mm_move_ss(a, simde_mm_round_ps(b, rounding))
2011 #endif
2012 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2013 #undef _mm_round_ss
2014 #define _mm_round_ss(a, b, rounding) simde_mm_round_ss(a, b, rounding)
2015 #endif
2016
2017 SIMDE_FUNCTION_ATTRIBUTES
2018 simde__m128i
2019 simde_mm_stream_load_si128 (const simde__m128i* mem_addr) {
2020 #if defined(SIMDE_X86_SSE4_1_NATIVE)
2021 return _mm_stream_load_si128(HEDLEY_CONST_CAST(simde__m128i*, mem_addr));
2022 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2023 return vreinterpretq_s64_s32(vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr)));
2024 #else
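  /* The non-temporal hint cannot be expressed portably, so this is a plain load. */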
2025 return *mem_addr;
2026 #endif
2027 }
2028 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2029 #undef _mm_stream_load_si128
2030 #define _mm_stream_load_si128(mem_addr) simde_mm_stream_load_si128(mem_addr)
2031 #endif
2032
2033 SIMDE_FUNCTION_ATTRIBUTES
2034 int
2035 simde_mm_test_all_ones (simde__m128i a) {
2036 #if defined(SIMDE_X86_SSE4_1_NATIVE)
2037 return _mm_test_all_ones(a);
2038 #else
2039 simde__m128i_private a_ = simde__m128i_to_private(a);
2040 int r;
2041
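  /* Returns 1 only when every bit of a is set: AND-reduce all lanes and
   * compare against all-ones. */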
2042 #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2043 r = vec_all_eq(a_.altivec_i32, vec_splats(~0));
2044 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2045     r = ((vgetq_lane_s64(a_.neon_i64, 0) & vgetq_lane_s64(a_.neon_i64, 1)) == ~HEDLEY_STATIC_CAST(int64_t, 0));
2046 #else
2047 int_fast32_t r_ = ~HEDLEY_STATIC_CAST(int_fast32_t, 0);
2048
2049 SIMDE_VECTORIZE_REDUCTION(&:r_)
2050 for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) {
2051 r_ &= a_.i32f[i];
2052 }
2053
2054 r = (r_ == ~HEDLEY_STATIC_CAST(int_fast32_t, 0));
2055 #endif
2056
2057 return r;
2058 #endif
2059 }
2060 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2061 #undef _mm_test_all_ones
2062 #define _mm_test_all_ones(a) simde_mm_test_all_ones(a)
2063 #endif
2064
2065 SIMDE_FUNCTION_ATTRIBUTES
2066 int
2067 simde_mm_test_all_zeros (simde__m128i a, simde__m128i mask) {
2068 #if defined(SIMDE_X86_SSE4_1_NATIVE)
2069 return _mm_test_all_zeros(a, mask);
2070 #else
2071 simde__m128i_private tmp_ = simde__m128i_to_private(simde_mm_and_si128(a, mask));
2072 int r;
2073
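  /* ZF semantics of PTEST: returns 1 when (a & mask) has no bits set. */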
2074 #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2075 r = vec_all_eq(tmp_.altivec_i32, vec_splats(0));
2076 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2077 return !(vgetq_lane_s64(tmp_.neon_i64, 0) | vgetq_lane_s64(tmp_.neon_i64, 1));
2078 #else
2079 int_fast32_t r_ = HEDLEY_STATIC_CAST(int_fast32_t, 0);
2080
2081 SIMDE_VECTORIZE_REDUCTION(|:r_)
2082 for (size_t i = 0 ; i < (sizeof(tmp_.i32f) / sizeof(tmp_.i32f[0])) ; i++) {
2083 r_ |= tmp_.i32f[i];
2084 }
2085
2086 r = !r_;
2087 #endif
2088
2089 return r;
2090 #endif
2091 }
2092 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2093 #undef _mm_test_all_zeros
2094 #define _mm_test_all_zeros(a, mask) simde_mm_test_all_zeros(a, mask)
2095 #endif
2096
2097 SIMDE_FUNCTION_ATTRIBUTES
2098 int
2099 simde_mm_test_mix_ones_zeros (simde__m128i a, simde__m128i mask) {
2100 #if defined(SIMDE_X86_SSE4_1_NATIVE)
2101 return _mm_test_mix_ones_zeros(a, mask);
2102 #else
2103 simde__m128i_private
2104 a_ = simde__m128i_to_private(a),
2105 mask_ = simde__m128i_to_private(mask);
2106
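  /* Returns 1 when the bits selected by mask contain both ones and zeros,
   * i.e. (a & mask) and (~a & mask) are each non-zero. */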
2107 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2108 int64x2_t s640 = vandq_s64(a_.neon_i64, mask_.neon_i64);
2109 int64x2_t s641 = vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(a_.neon_i64))), mask_.neon_i64);
2110 return (((vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) & (vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1)))!=0);
2111 #else
2112 for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++)
2113 if (((a_.u64[i] & mask_.u64[i]) != 0) && ((~a_.u64[i] & mask_.u64[i]) != 0))
2114 return 1;
2115
2116 return 0;
2117 #endif
2118 #endif
2119 }
2120 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2121 #undef _mm_test_mix_ones_zeros
2122 #define _mm_test_mix_ones_zeros(a, mask) simde_mm_test_mix_ones_zeros(a, mask)
2123 #endif
2124
2125 SIMDE_FUNCTION_ATTRIBUTES
2126 int
2127 simde_mm_testc_si128 (simde__m128i a, simde__m128i b) {
2128 #if defined(SIMDE_X86_SSE4_1_NATIVE)
2129 return _mm_testc_si128(a, b);
2130 #else
2131 simde__m128i_private
2132 a_ = simde__m128i_to_private(a),
2133 b_ = simde__m128i_to_private(b);
2134
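  /* CF semantics of PTEST: returns 1 when every bit set in b is also set in a,
   * i.e. (~a & b) == 0. */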
2135 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2136 int64x2_t s64 = vandq_s64(~a_.neon_i64, b_.neon_i64);
2137     return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
2138 #else
2139 int_fast32_t r = 0;
2140
2141 SIMDE_VECTORIZE_REDUCTION(|:r)
2142 for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) {
2143 r |= ~a_.i32f[i] & b_.i32f[i];
2144 }
2145
2146 return HEDLEY_STATIC_CAST(int, !r);
2147 #endif
2148 #endif
2149 }
2150 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2151 #undef _mm_testc_si128
2152 #define _mm_testc_si128(a, b) simde_mm_testc_si128(a, b)
2153 #endif
2154
2155 SIMDE_FUNCTION_ATTRIBUTES
2156 int
2157 simde_mm_testnzc_si128 (simde__m128i a, simde__m128i b) {
2158 #if defined(SIMDE_X86_SSE4_1_NATIVE)
2159 return _mm_testnzc_si128(a, b);
2160 #else
2161 simde__m128i_private
2162 a_ = simde__m128i_to_private(a),
2163 b_ = simde__m128i_to_private(b);
2164
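  /* Returns 1 when both ZF and CF of PTEST would be clear: (a & b) and
   * (~a & b) each have at least one bit set. */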
2165 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2166 int64x2_t s640 = vandq_s64(a_.neon_i64, b_.neon_i64);
2167 int64x2_t s641 = vandq_s64(~a_.neon_i64, b_.neon_i64);
2168 return (((vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) & (vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1)))!=0);
2169 #else
2170 for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
2171 if (((a_.u64[i] & b_.u64[i]) != 0) && ((~a_.u64[i] & b_.u64[i]) != 0))
2172 return 1;
2173 }
2174
2175 return 0;
2176 #endif
2177 #endif
2178 }
2179 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2180 #undef _mm_testnzc_si128
2181 #define _mm_testnzc_si128(a, b) simde_mm_testnzc_si128(a, b)
2182 #endif
2183
2184 SIMDE_FUNCTION_ATTRIBUTES
2185 int
2186 simde_mm_testz_si128 (simde__m128i a, simde__m128i b) {
2187 #if defined(SIMDE_X86_SSE4_1_NATIVE)
2188 return _mm_testz_si128(a, b);
2189 #else
2190 simde__m128i_private
2191 a_ = simde__m128i_to_private(a),
2192 b_ = simde__m128i_to_private(b);
2193
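  /* ZF semantics of PTEST: returns 1 when (a & b) has no bits set. */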
2194 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2195 int64x2_t s64 = vandq_s64(a_.neon_i64, b_.neon_i64);
2196 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
2197 #else
2198     for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
2199       if ((a_.u64[i] & b_.u64[i]) != 0)
2200         return 0;
2201     }
2202   #endif
2203 
2204     return 1;
2205 #endif
2206 }
2207 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
2208 #undef _mm_testz_si128
2209 #define _mm_testz_si128(a, b) simde_mm_testz_si128(a, b)
2210 #endif
2211
2212 SIMDE_END_DECLS_
2213
2214 HEDLEY_DIAGNOSTIC_POP
2215
2216 #endif /* !defined(SIMDE_X86_SSE4_1_H) */
2217