1 /* SPDX-License-Identifier: MIT
2 *
3 * Permission is hereby granted, free of charge, to any person
4 * obtaining a copy of this software and associated documentation
5 * files (the "Software"), to deal in the Software without
6 * restriction, including without limitation the rights to use, copy,
7 * modify, merge, publish, distribute, sublicense, and/or sell copies
8 * of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be
12 * included in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Copyright:
24 * 2017-2020 Evan Nemerson <evan@nemerson.com>
25 * 2015-2017 John W. Ratcliff <jratcliffscarab@gmail.com>
26 * 2015 Brandon Rowlett <browlett@nvidia.com>
27 * 2015 Ken Fast <kfast@gdeb.com>
28 * 2017 Hasindu Gamaarachchi <hasindu@unsw.edu.au>
29 * 2018 Jeff Daily <jeff.daily@amd.com>
30 */
31
32 #if !defined(SIMDE_X86_SSE2_H)
33 #define SIMDE_X86_SSE2_H
34
35 #include "sse.h"
36
37 HEDLEY_DIAGNOSTIC_PUSH
38 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
39 SIMDE_BEGIN_DECLS_
40
/* Implementation-private representation of a 128-bit integer vector
 * (__m128i).  Every member aliases the same 16 bytes, so a value can be
 * reinterpreted as lanes of any element width, or as the current
 * target's native vector register type.  All members are 16-byte
 * aligned to match a real XMM register. */
typedef union {
#if defined(SIMDE_VECTOR_SUBSCRIPT)
  /* GCC-style vector extensions are available: expose subscriptable
   * 16-byte vector types so portable fallbacks can use vector operators
   * directly. */
  SIMDE_ALIGN_TO_16 int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN_TO_16 int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN_TO_16 int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN_TO_16 int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN_TO_16 uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN_TO_16 uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN_TO_16 uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN_TO_16 uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  #if defined(SIMDE_HAVE_INT128_)
  /* 128-bit-lane views, only when the compiler has a native 128-bit
   * integer type. */
  SIMDE_ALIGN_TO_16 simde_int128 i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN_TO_16 simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  #endif
  SIMDE_ALIGN_TO_16 simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN_TO_16 simde_float64 f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;

  /* "Fast" 32-bit views: the element type is int_fast32_t, so the lane
   * count depends on the platform's definition of int_fast32_t. */
  SIMDE_ALIGN_TO_16 int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN_TO_16 uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
#else
  /* No vector extensions: plain arrays with identical layout. */
  SIMDE_ALIGN_TO_16 int8_t i8[16];
  SIMDE_ALIGN_TO_16 int16_t i16[8];
  SIMDE_ALIGN_TO_16 int32_t i32[4];
  SIMDE_ALIGN_TO_16 int64_t i64[2];
  SIMDE_ALIGN_TO_16 uint8_t u8[16];
  SIMDE_ALIGN_TO_16 uint16_t u16[8];
  SIMDE_ALIGN_TO_16 uint32_t u32[4];
  SIMDE_ALIGN_TO_16 uint64_t u64[2];
  #if defined(SIMDE_HAVE_INT128_)
  SIMDE_ALIGN_TO_16 simde_int128 i128[1];
  SIMDE_ALIGN_TO_16 simde_uint128 u128[1];
  #endif
  SIMDE_ALIGN_TO_16 simde_float32 f32[4];
  SIMDE_ALIGN_TO_16 simde_float64 f64[2];

  SIMDE_ALIGN_TO_16 int_fast32_t i32f[16 / sizeof(int_fast32_t)];
  SIMDE_ALIGN_TO_16 uint_fast32_t u32f[16 / sizeof(uint_fast32_t)];
#endif

  /* Two 64-bit halves, for implementing intrinsics defined in terms of
   * __m64 operands. */
  SIMDE_ALIGN_TO_16 simde__m64_private m64_private[2];
  SIMDE_ALIGN_TO_16 simde__m64 m64[2];

#if defined(SIMDE_X86_SSE2_NATIVE)
  /* Native x86 register view. */
  SIMDE_ALIGN_TO_16 __m128i n;
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* Native ARM NEON register views, one per element type. */
  SIMDE_ALIGN_TO_16 int8x16_t neon_i8;
  SIMDE_ALIGN_TO_16 int16x8_t neon_i16;
  SIMDE_ALIGN_TO_16 int32x4_t neon_i32;
  SIMDE_ALIGN_TO_16 int64x2_t neon_i64;
  SIMDE_ALIGN_TO_16 uint8x16_t neon_u8;
  SIMDE_ALIGN_TO_16 uint16x8_t neon_u16;
  SIMDE_ALIGN_TO_16 uint32x4_t neon_u32;
  SIMDE_ALIGN_TO_16 uint64x2_t neon_u64;
  #if defined(__ARM_FP16_FORMAT_IEEE)
  SIMDE_ALIGN_TO_16 float16x8_t neon_f16;
  #endif
  SIMDE_ALIGN_TO_16 float32x4_t neon_f32;
  #if defined(SIMDE_ARCH_AARCH64)
  /* float64x2_t only exists on AArch64. */
  SIMDE_ALIGN_TO_16 float64x2_t neon_f64;
  #endif
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  /* WebAssembly SIMD uses a single untyped 128-bit vector type. */
  SIMDE_ALIGN_TO_16 v128_t wasm_v128;
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
  /* POWER AltiVec / z/Arch vector views. */
  SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8;
  SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16;
  SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32;
  #if defined(__UINT_FAST32_TYPE__) && (defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE))
  SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(__INT_FAST32_TYPE__) altivec_i32f;
  #else
  SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32f;
  #endif
  SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8;
  SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16;
  SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32;
  #if defined(__UINT_FAST32_TYPE__) && (defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE))
  SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(__UINT_FAST32_TYPE__) altivec_u32f;
  #else
  SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32f;
  #endif
  SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32;
  #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
  /* 64-bit element vectors require POWER7+ / z13+. */
  SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64;
  SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64;
  SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64;
  #endif
#endif
} simde__m128i_private;
128
/* Implementation-private representation of a 128-bit double-precision
 * vector (__m128d).  Same aliasing scheme as simde__m128i_private:
 * all members view the same 16 bytes, 16-byte aligned. */
typedef union {
#if defined(SIMDE_VECTOR_SUBSCRIPT)
  /* GCC-style vector extension views. */
  SIMDE_ALIGN_TO_16 int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN_TO_16 int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN_TO_16 int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN_TO_16 int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN_TO_16 uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN_TO_16 uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN_TO_16 uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN_TO_16 uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN_TO_16 simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN_TO_16 simde_float64 f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  /* "Fast" 32-bit views; lane count follows sizeof(int_fast32_t). */
  SIMDE_ALIGN_TO_16 int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN_TO_16 uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
#else
  /* Plain-array fallback with identical layout. */
  SIMDE_ALIGN_TO_16 int8_t i8[16];
  SIMDE_ALIGN_TO_16 int16_t i16[8];
  SIMDE_ALIGN_TO_16 int32_t i32[4];
  SIMDE_ALIGN_TO_16 int64_t i64[2];
  SIMDE_ALIGN_TO_16 uint8_t u8[16];
  SIMDE_ALIGN_TO_16 uint16_t u16[8];
  SIMDE_ALIGN_TO_16 uint32_t u32[4];
  SIMDE_ALIGN_TO_16 uint64_t u64[2];
  SIMDE_ALIGN_TO_16 simde_float32 f32[4];
  SIMDE_ALIGN_TO_16 simde_float64 f64[2];
  SIMDE_ALIGN_TO_16 int_fast32_t i32f[16 / sizeof(int_fast32_t)];
  SIMDE_ALIGN_TO_16 uint_fast32_t u32f[16 / sizeof(uint_fast32_t)];
#endif

  /* Two 64-bit halves for __m64-based operations. */
  SIMDE_ALIGN_TO_16 simde__m64_private m64_private[2];
  SIMDE_ALIGN_TO_16 simde__m64 m64[2];

#if defined(SIMDE_X86_SSE2_NATIVE)
  /* Native x86 register view. */
  SIMDE_ALIGN_TO_16 __m128d n;
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* Native ARM NEON register views. */
  SIMDE_ALIGN_TO_16 int8x16_t neon_i8;
  SIMDE_ALIGN_TO_16 int16x8_t neon_i16;
  SIMDE_ALIGN_TO_16 int32x4_t neon_i32;
  SIMDE_ALIGN_TO_16 int64x2_t neon_i64;
  SIMDE_ALIGN_TO_16 uint8x16_t neon_u8;
  SIMDE_ALIGN_TO_16 uint16x8_t neon_u16;
  SIMDE_ALIGN_TO_16 uint32x4_t neon_u32;
  SIMDE_ALIGN_TO_16 uint64x2_t neon_u64;
  SIMDE_ALIGN_TO_16 float32x4_t neon_f32;
  #if defined(SIMDE_ARCH_AARCH64)
  /* float64x2_t only exists on AArch64. */
  SIMDE_ALIGN_TO_16 float64x2_t neon_f64;
  #endif
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  /* WebAssembly's single untyped 128-bit vector type. */
  SIMDE_ALIGN_TO_16 v128_t wasm_v128;
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
  /* POWER AltiVec / z/Arch vector views. */
  SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8;
  SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16;
  SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32;
  #if defined(__INT_FAST32_TYPE__) && (defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE))
  SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(__INT_FAST32_TYPE__) altivec_i32f;
  #else
  SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32f;
  #endif
  SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8;
  SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16;
  SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32;
  #if defined(__UINT_FAST32_TYPE__) && (defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE))
  SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(__UINT_FAST32_TYPE__) altivec_u32f;
  #else
  SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32f;
  #endif
  SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32;
  #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
  /* 64-bit element vectors require POWER7+ / z13+. */
  SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64;
  SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64;
  SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64;
  #endif
#endif
} simde__m128d_private;
203
/* Public simde__m128i / simde__m128d types.  The cheapest available
 * representation is chosen per target; every choice is exactly 16
 * bytes.  Values are moved between the public type and the private
 * union via simde_memcpy (see the *_from_private / *_to_private
 * helpers below), so the specific element type picked here is not
 * otherwise significant. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  typedef __m128i simde__m128i;
  typedef __m128d simde__m128d;
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  typedef int64x2_t simde__m128i;
# if defined(SIMDE_ARCH_AARCH64)
  typedef float64x2_t simde__m128d;
# elif defined(SIMDE_VECTOR_SUBSCRIPT)
  /* AArch32 NEON has no float64x2_t; fall back to a generic vector. */
  typedef simde_float64 simde__m128d SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
# else
  typedef simde__m128d_private simde__m128d;
# endif
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  typedef v128_t simde__m128i;
  typedef v128_t simde__m128d;
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
  /* NOTE(review): vector float is used for simde__m128i; any 16-byte
   * vector type works here since conversions go through memcpy. */
  typedef SIMDE_POWER_ALTIVEC_VECTOR(float) simde__m128i;
  #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
  typedef SIMDE_POWER_ALTIVEC_VECTOR(double) simde__m128d;
  #else
  typedef simde__m128d_private simde__m128d;
  #endif
#elif defined(SIMDE_VECTOR_SUBSCRIPT)
  typedef int64_t simde__m128i SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  typedef simde_float64 simde__m128d SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
#else
  /* Last resort: use the private unions directly. */
  typedef simde__m128i_private simde__m128i;
  typedef simde__m128d_private simde__m128d;
#endif

/* When native aliases are requested, expose the Intel type names. */
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  typedef simde__m128i __m128i;
  typedef simde__m128d __m128d;
#endif

/* Sanity checks: every representation must be exactly 16 bytes and,
 * where the compiler lets us check, 16-byte aligned. */
HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i), "simde__m128i size incorrect");
HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i_private), "simde__m128i_private size incorrect");
HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d), "simde__m128d size incorrect");
HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d_private), "simde__m128d_private size incorrect");
#if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF)
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128i) == 16, "simde__m128i is not 16-byte aligned");
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128i_private) == 16, "simde__m128i_private is not 16-byte aligned");
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128d) == 16, "simde__m128d is not 16-byte aligned");
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128d_private) == 16, "simde__m128d_private is not 16-byte aligned");
#endif
249
250 SIMDE_FUNCTION_ATTRIBUTES
251 simde__m128i
simde__m128i_from_private(simde__m128i_private v)252 simde__m128i_from_private(simde__m128i_private v) {
253 simde__m128i r;
254 simde_memcpy(&r, &v, sizeof(r));
255 return r;
256 }
257
258 SIMDE_FUNCTION_ATTRIBUTES
259 simde__m128i_private
simde__m128i_to_private(simde__m128i v)260 simde__m128i_to_private(simde__m128i v) {
261 simde__m128i_private r;
262 simde_memcpy(&r, &v, sizeof(r));
263 return r;
264 }
265
266 SIMDE_FUNCTION_ATTRIBUTES
267 simde__m128d
simde__m128d_from_private(simde__m128d_private v)268 simde__m128d_from_private(simde__m128d_private v) {
269 simde__m128d r;
270 simde_memcpy(&r, &v, sizeof(r));
271 return r;
272 }
273
274 SIMDE_FUNCTION_ATTRIBUTES
275 simde__m128d_private
simde__m128d_to_private(simde__m128d v)276 simde__m128d_to_private(simde__m128d v) {
277 simde__m128d_private r;
278 simde_memcpy(&r, &v, sizeof(r));
279 return r;
280 }
281
/* simde__m128i <-> native-vector-type conversions for the current
 * target.  Each SIMDE_X86_GENERATE_CONVERSION_FUNCTION invocation
 * expands to a from/to function pair (e.g. simde__m128i_from_neon_i8
 * and simde__m128i_to_neon_i8). */
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int8x16_t, neon, i8)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int16x8_t, neon, i16)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int32x4_t, neon, i32)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int64x2_t, neon, i64)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint8x16_t, neon, u8)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint16x8_t, neon, u16)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint32x4_t, neon, u32)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint64x2_t, neon, u64)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, float32x4_t, neon, f32)
  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    /* float64x2_t conversions only exist on AArch64. */
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, float64x2_t, neon, f64)
  #endif
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed char), altivec, i8)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed short), altivec, i16)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed int), altivec, i32)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), altivec, u32)
  #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
    /* 64-bit element vectors require POWER7+ / z13+. */
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64)
  #endif
#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
307
/* simde__m128d <-> native-vector-type conversions for the current
 * target; each macro invocation expands to a from/to function pair.
 * Consistency fix: the two WASM invocations previously carried trailing
 * semicolons (no other invocation does); since the macro expands to
 * complete function definitions, the extra ';' formed an empty
 * file-scope declaration that trips -Wpedantic. */
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int8x16_t, neon, i8)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int16x8_t, neon, i16)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int32x4_t, neon, i32)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int64x2_t, neon, i64)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint8x16_t, neon, u8)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint16x8_t, neon, u16)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint32x4_t, neon, u32)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint64x2_t, neon, u64)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, float32x4_t, neon, f32)
  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    /* float64x2_t conversions only exist on AArch64. */
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, float64x2_t, neon, f64)
  #endif
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed char), altivec, i8)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed short), altivec, i16)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed int), altivec, i32)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), altivec, u32)
  #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64)
    #if defined(SIMDE_BUG_GCC_95782)
      /* Work around GCC bug 95782 by writing the f64 conversions out by
       * hand instead of using the generator macro. */
      SIMDE_FUNCTION_ATTRIBUTES
      SIMDE_POWER_ALTIVEC_VECTOR(double)
      simde__m128d_to_altivec_f64(simde__m128d value) {
        simde__m128d_private r_ = simde__m128d_to_private(value);
        return r_.altivec_f64;
      }

      SIMDE_FUNCTION_ATTRIBUTES
      simde__m128d
      simde__m128d_from_altivec_f64(SIMDE_POWER_ALTIVEC_VECTOR(double) value) {
        simde__m128d_private r_;
        r_.altivec_f64 = value;
        return simde__m128d_from_private(r_);
      }
    #else
      SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(double), altivec, f64)
    #endif
  #endif
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v128_t, wasm, v128)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v128_t, wasm, v128)
#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
354
/* Sets the two double-precision lanes of the result.  Argument order
 * follows the Intel convention: e1 is the high (index 1) lane, e0 the
 * low (index 0) lane. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_set_pd (simde_float64 e1, simde_float64 e0) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_set_pd(e1, e0);
  #else
    simde__m128d_private r_;

    #if defined(SIMDE_WASM_SIMD128_NATIVE)
      /* wasm_f64x2_make takes lanes in low-to-high order. */
      r_.wasm_v128 = wasm_f64x2_make(e0, e1);
    #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* vld1q_f64 loads from memory in low-to-high order. */
      SIMDE_ALIGN_TO_16 simde_float64 data[2] = { e0, e1 };
      r_.neon_f64 = vld1q_f64(data);
    #else
      r_.f64[0] = e0;
      r_.f64[1] = e1;
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_set_pd(e1, e0) simde_mm_set_pd(e1, e0)
#endif
379
380 SIMDE_FUNCTION_ATTRIBUTES
381 simde__m128d
simde_mm_set1_pd(simde_float64 a)382 simde_mm_set1_pd (simde_float64 a) {
383 #if defined(SIMDE_X86_SSE2_NATIVE)
384 return _mm_set1_pd(a);
385 #else
386 simde__m128d_private r_;
387
388 #if defined(SIMDE_WASM_SIMD128_NATIVE)
389 r_.wasm_v128 = wasm_f64x2_splat(a);
390 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
391 r_.neon_f64 = vdupq_n_f64(a);
392 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
393 r_.altivec_f64 = vec_splats(HEDLEY_STATIC_CAST(double, a));
394 #else
395 SIMDE_VECTORIZE
396 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
397 r_.f64[i] = a;
398 }
399 #endif
400
401 return simde__m128d_from_private(r_);
402 #endif
403 }
404 #define simde_mm_set_pd1(a) simde_mm_set1_pd(a)
405 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
406 #define _mm_set1_pd(a) simde_mm_set1_pd(a)
407 #define _mm_set_pd1(a) simde_mm_set1_pd(a)
408 #endif
409
410 SIMDE_FUNCTION_ATTRIBUTES
411 simde__m128d
simde_x_mm_abs_pd(simde__m128d a)412 simde_x_mm_abs_pd(simde__m128d a) {
413 #if defined(SIMDE_X86_SSE2_NATIVE)
414 simde_float64 mask_;
415 uint64_t u64_ = UINT64_C(0x7FFFFFFFFFFFFFFF);
416 simde_memcpy(&mask_, &u64_, sizeof(u64_));
417 return _mm_and_pd(_mm_set1_pd(mask_), a);
418 #else
419 simde__m128d_private
420 r_,
421 a_ = simde__m128d_to_private(a);
422
423 #if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
424 r_.neon_f64 = vabsq_f64(a_.neon_f64);
425 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
426 r_.altivec_f64 = vec_abs(a_.altivec_f64);
427 #else
428 SIMDE_VECTORIZE
429 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
430 r_.f64[i] = simde_math_fabs(a_.f64[i]);
431 }
432 #endif
433
434 return simde__m128d_from_private(r_);
435 #endif
436 }
437
/* Bitwise NOT of all 128 bits (SIMDe extension; there is no SSE2
 * instruction for this, hence the AVX-512VL ternary-logic shortcut). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_x_mm_not_pd(simde__m128d a) {
  #if defined(SIMDE_X86_AVX512VL_NATIVE)
    /* Truth table 0x55 of vpternlog computes ~a in one instruction. */
    __m128i ai = _mm_castpd_si128(a);
    return _mm_castsi128_pd(_mm_ternarylogic_epi64(ai, ai, ai, 0x55));
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a);

    /* The operation is purely bitwise, so any lane view may be used. */
    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i32 = vmvnq_s32(a_.neon_i32);
    #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
      r_.altivec_f64 = vec_nor(a_.altivec_f64, a_.altivec_f64);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i32 = vec_nor(a_.altivec_i32, a_.altivec_i32);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_v128_not(a_.wasm_v128);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32f = ~a_.i32f;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
        r_.i32f[i] = ~(a_.i32f[i]);
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
469
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_x_mm_select_pd(simde__m128d a, simde__m128d b, simde__m128d mask) {
  /* This function is for when you want to blend two elements together
   * according to a mask. It is similar to _mm_blendv_pd, except that
   * it is undefined whether the blend is based on the highest bit in
   * each lane (like blendv) or just bitwise operations. This allows
   * us to implement the function efficiently everywhere.
   *
   * Basically, you promise that all the lanes in mask are either 0 or
   * ~0. */
  #if defined(SIMDE_X86_SSE4_1_NATIVE)
    return _mm_blendv_pd(a, b, mask);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b),
      mask_ = simde__m128d_to_private(mask);

    /* Bitwise select: a ^ ((a ^ b) & mask) yields b where mask bits are
     * set and a where they are clear. */
    #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i64 = a_.i64 ^ ((a_.i64 ^ b_.i64) & mask_.i64);
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i64 = vbslq_s64(mask_.neon_u64, b_.neon_i64, a_.neon_i64);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
        r_.i64[i] = a_.i64[i] ^ ((a_.i64[i] ^ b_.i64[i]) & mask_.i64[i]);
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
504
/* Lane-wise wrapping addition of sixteen 8-bit integers. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_add_epi8 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_add_epi8(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i8 = vaddq_s8(a_.neon_i8, b_.neon_i8);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i8 = vec_add(a_.altivec_i8, b_.altivec_i8);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i8x16_add(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i8 = a_.i8 + b_.i8;
    #else
      /* Portable scalar fallback, one lane at a time. */
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
        r_.i8[i] = a_.i8[i] + b_.i8[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_add_epi8(a, b) simde_mm_add_epi8(a, b)
#endif
537
/* Lane-wise wrapping addition of eight 16-bit integers. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_add_epi16 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_add_epi16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i16 = vaddq_s16(a_.neon_i16, b_.neon_i16);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i16 = vec_add(a_.altivec_i16, b_.altivec_i16);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i16x8_add(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i16 = a_.i16 + b_.i16;
    #else
      /* Portable scalar fallback, one lane at a time. */
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        r_.i16[i] = a_.i16[i] + b_.i16[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_add_epi16(a, b) simde_mm_add_epi16(a, b)
#endif
570
/* Lane-wise wrapping addition of four 32-bit integers. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_add_epi32 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_add_epi32(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i32 = vaddq_s32(a_.neon_i32, b_.neon_i32);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i32 = vec_add(a_.altivec_i32, b_.altivec_i32);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i32x4_add(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32 = a_.i32 + b_.i32;
    #else
      /* Portable scalar fallback, one lane at a time. */
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        r_.i32[i] = a_.i32[i] + b_.i32[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_add_epi32(a, b) simde_mm_add_epi32(a, b)
#endif
603
/* Lane-wise wrapping addition of two 64-bit integers. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_add_epi64 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_add_epi64(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i64 = vaddq_s64(a_.neon_i64, b_.neon_i64);
    #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
      /* 64-bit vec_add requires POWER8. */
      r_.altivec_i64 = vec_add(a_.altivec_i64, b_.altivec_i64);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i64x2_add(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i64 = a_.i64 + b_.i64;
    #else
      /* Portable scalar fallback, one lane at a time. */
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
        r_.i64[i] = a_.i64[i] + b_.i64[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_add_epi64(a, b) simde_mm_add_epi64(a, b)
#endif
636
637 SIMDE_FUNCTION_ATTRIBUTES
638 simde__m128d
simde_mm_add_pd(simde__m128d a,simde__m128d b)639 simde_mm_add_pd (simde__m128d a, simde__m128d b) {
640 #if defined(SIMDE_X86_SSE2_NATIVE)
641 return _mm_add_pd(a, b);
642 #else
643 simde__m128d_private
644 r_,
645 a_ = simde__m128d_to_private(a),
646 b_ = simde__m128d_to_private(b);
647
648 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
649 r_.neon_f64 = vaddq_f64(a_.neon_f64, b_.neon_f64);
650 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
651 r_.wasm_v128 = wasm_f64x2_add(a_.wasm_v128, b_.wasm_v128);
652 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
653 r_.altivec_f64 = vec_add(a_.altivec_f64, b_.altivec_f64);
654 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
655 r_.wasm_v128 = wasm_f64x2_add(a_.wasm_v128, b_.wasm_v128);
656 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
657 r_.f64 = a_.f64 + b_.f64;
658 #else
659 SIMDE_VECTORIZE
660 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
661 r_.f64[i] = a_.f64[i] + b_.f64[i];
662 }
663 #endif
664
665 return simde__m128d_from_private(r_);
666 #endif
667 }
668 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
669 #define _mm_add_pd(a, b) simde_mm_add_pd(a, b)
670 #endif
671
/* Returns { b[0], a[1] }: the low lane comes from b, the high lane is
 * kept from a. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_move_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_move_sd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* Insert b's lane 0 into a's lane 0. */
      r_.neon_f64 = vsetq_lane_f64(vgetq_lane_f64(b_.neon_f64, 0), a_.neon_f64, 0);
    #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
      #if defined(HEDLEY_IBM_VERSION)
        /* NOTE(review): IBM XL's vec_xxpermdi apparently takes its
         * operands in the opposite order from GCC/Clang — confirm
         * against the XL compiler docs. */
        r_.altivec_f64 = vec_xxpermdi(a_.altivec_f64, b_.altivec_f64, 1);
      #else
        r_.altivec_f64 = vec_xxpermdi(b_.altivec_f64, a_.altivec_f64, 1);
      #endif
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      /* Shuffle indices: lane 2 = b[0] (second vector), lane 1 = a[1]. */
      r_.wasm_v128 = wasm_v64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 2, 1);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 2, 1);
    #else
      r_.f64[0] = b_.f64[0];
      r_.f64[1] = a_.f64[1];
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_move_sd(a, b) simde_mm_move_sd(a, b)
#endif
706
/* Scalar double add: result is { a[0] + b[0], a[1] } — only the low
 * lane is summed, the high lane passes through from a. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_add_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_add_sd(a, b);
  #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
    /* On targets with real vector support, a full add followed by a
     * move_sd lets both pieces use vector code paths. */
    return simde_mm_move_sd(a, simde_mm_add_pd(a, b));
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    r_.f64[0] = a_.f64[0] + b_.f64[0];
    r_.f64[1] = a_.f64[1];

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_add_sd(a, b) simde_mm_add_sd(a, b)
#endif
729
/* Adds two 64-bit integers held in __m64 (MMX) values; the native path
 * needs both SSE2 and MMX support. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m64
simde_mm_add_si64 (simde__m64 a, simde__m64 b) {
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    return _mm_add_si64(a, b);
  #else
    simde__m64_private
      r_,
      a_ = simde__m64_to_private(a),
      b_ = simde__m64_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i64 = vadd_s64(a_.neon_i64, b_.neon_i64);
    #else
      /* Single 64-bit lane; wrapping addition. */
      r_.i64[0] = a_.i64[0] + b_.i64[0];
    #endif

    return simde__m64_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_add_si64(a, b) simde_mm_add_si64(a, b)
#endif
753
/* Lane-wise saturating addition of sixteen signed 8-bit integers:
 * results clamp to [INT8_MIN, INT8_MAX] instead of wrapping. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_adds_epi8 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_adds_epi8(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i8 = vqaddq_s8(a_.neon_i8, b_.neon_i8);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i8x16_add_saturate(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i8 = vec_adds(a_.altivec_i8, b_.altivec_i8);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
        /* Widen before adding so the sum cannot overflow, then clamp
         * back into int8_t range. */
        const int_fast16_t tmp =
          HEDLEY_STATIC_CAST(int_fast16_t, a_.i8[i]) +
          HEDLEY_STATIC_CAST(int_fast16_t, b_.i8[i]);
        r_.i8[i] = HEDLEY_STATIC_CAST(int8_t, ((tmp < INT8_MAX) ? ((tmp > INT8_MIN) ? tmp : INT8_MIN) : INT8_MAX));
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_adds_epi8(a, b) simde_mm_adds_epi8(a, b)
#endif
787
/* Lane-wise saturating addition of eight signed 16-bit integers:
 * results clamp to [INT16_MIN, INT16_MAX] instead of wrapping. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_adds_epi16 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_adds_epi16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i16 = vqaddq_s16(a_.neon_i16, b_.neon_i16);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i16x8_add_saturate(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i16 = vec_adds(a_.altivec_i16, b_.altivec_i16);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        /* Widen before adding so the sum cannot overflow, then clamp
         * back into int16_t range. */
        const int_fast32_t tmp =
          HEDLEY_STATIC_CAST(int_fast32_t, a_.i16[i]) +
          HEDLEY_STATIC_CAST(int_fast32_t, b_.i16[i]);
        r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, ((tmp < INT16_MAX) ? ((tmp > INT16_MIN) ? tmp : INT16_MIN) : INT16_MAX));
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_adds_epi16(a, b) simde_mm_adds_epi16(a, b)
#endif
821
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Adds packed unsigned 8-bit integers with saturation: sums clamp to
 * UINT8_MAX instead of wrapping.  Portable _mm_adds_epu8. */
simde_mm_adds_epu8 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_adds_epu8(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u8 = vqaddq_u8(a_.neon_u8, b_.neon_u8);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_u8x16_add_saturate(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
      r_.altivec_u8 = vec_adds(a_.altivec_u8, b_.altivec_u8);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
        /* Overflow test without widening: a + b overflows iff b > UINT8_MAX - a. */
        r_.u8[i] = ((UINT8_MAX - a_.u8[i]) > b_.u8[i]) ? (a_.u8[i] + b_.u8[i]) : UINT8_MAX;
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_adds_epu8(a, b) simde_mm_adds_epu8(a, b)
#endif
852
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Adds packed unsigned 16-bit integers with saturation: sums clamp to
 * UINT16_MAX instead of wrapping.  Portable _mm_adds_epu16. */
simde_mm_adds_epu16 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_adds_epu16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u16 = vqaddq_u16(a_.neon_u16, b_.neon_u16);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_u16x8_add_saturate(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_u16 = vec_adds(a_.altivec_u16, b_.altivec_u16);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
        /* Overflow test without widening: a + b overflows iff b > UINT16_MAX - a. */
        r_.u16[i] = ((UINT16_MAX - a_.u16[i]) > b_.u16[i]) ? (a_.u16[i] + b_.u16[i]) : UINT16_MAX;
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_adds_epu16(a, b) simde_mm_adds_epu16(a, b)
#endif
883
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
/* Bitwise AND of the raw 128-bit contents of two double vectors
 * (operates on the bit patterns, not the floating-point values).
 * Portable _mm_and_pd. */
simde_mm_and_pd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_and_pd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* Reinterpret as integer lanes; AND is lane-size agnostic. */
      r_.neon_i32 = vandq_s32(a_.neon_i32, b_.neon_i32);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
      r_.altivec_f64 = vec_and(a_.altivec_f64, b_.altivec_f64);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32f = a_.i32f & b_.i32f;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
        r_.i32f[i] = a_.i32f[i] & b_.i32f[i];
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_and_pd(a, b) simde_mm_and_pd(a, b)
#endif
916
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Bitwise AND of two 128-bit integer vectors.  Portable _mm_and_si128. */
simde_mm_and_si128 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_and_si128(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* Operand order swapped vs. the others; harmless since AND commutes. */
      r_.neon_i32 = vandq_s32(b_.neon_i32, a_.neon_i32);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_u32f = vec_and(a_.altivec_u32f, b_.altivec_u32f);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32f = a_.i32f & b_.i32f;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
        r_.i32f[i] = a_.i32f[i] & b_.i32f[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_and_si128(a, b) simde_mm_and_si128(a, b)
#endif
947
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
/* Computes (~a) & b on the raw 128 bits — note the complement applies to
 * the FIRST argument, matching _mm_andnot_pd.  NEON/AltiVec "andc"
 * primitives complement their second operand, hence the swapped order. */
simde_mm_andnot_pd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_andnot_pd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
      r_.altivec_f64 = vec_andc(b_.altivec_f64, a_.altivec_f64);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i32f = vec_andc(b_.altivec_i32f, a_.altivec_i32f);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32f = ~a_.i32f & b_.i32f;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
        r_.u64[i] = ~a_.u64[i] & b_.u64[i];
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_andnot_pd(a, b) simde_mm_andnot_pd(a, b)
#endif
982
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Computes (~a) & b for 128-bit integer vectors — the complement applies
 * to the FIRST argument, matching _mm_andnot_si128. */
simde_mm_andnot_si128 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_andnot_si128(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* vbicq computes first & ~second, so operands are swapped. */
      r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
      r_.altivec_i32 = vec_andc(b_.altivec_i32, a_.altivec_i32);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32f = ~a_.i32f & b_.i32f;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
        r_.i32f[i] = ~(a_.i32f[i]) & b_.i32f[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_andnot_si128(a, b) simde_mm_andnot_si128(a, b)
#endif
1013
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
/* Bitwise XOR of the raw 128-bit contents of two double vectors.
 * Portable _mm_xor_pd. */
simde_mm_xor_pd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_xor_pd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32f = a_.i32f ^ b_.i32f;
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_v128_xor(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i64 = veorq_s64(a_.neon_i64, b_.neon_i64);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
        r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i];
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_xor_pd(a, b) simde_mm_xor_pd(a, b)
#endif
1044
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Rounding average of packed unsigned 8-bit integers:
 * r = (a + b + 1) >> 1 per lane.  Portable _mm_avg_epu8. */
simde_mm_avg_epu8 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_avg_epu8(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u8 = vrhaddq_u8(b_.neon_u8, a_.neon_u8);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_u8x16_avgr(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
      r_.altivec_u8 = vec_avg(a_.altivec_u8, b_.altivec_u8);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_)
      /* Widen to 16-bit lanes so a + b + 1 cannot overflow, then narrow. */
      uint16_t wa SIMDE_VECTOR(32);
      uint16_t wb SIMDE_VECTOR(32);
      uint16_t wr SIMDE_VECTOR(32);
      SIMDE_CONVERT_VECTOR_(wa, a_.u8);
      SIMDE_CONVERT_VECTOR_(wb, b_.u8);
      wr = (wa + wb + 1) >> 1;
      SIMDE_CONVERT_VECTOR_(r_.u8, wr);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
        /* Integer promotion to int makes the intermediate sum safe. */
        r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1;
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_avg_epu8(a, b) simde_mm_avg_epu8(a, b)
#endif
1083
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Rounding average of packed unsigned 16-bit integers:
 * r = (a + b + 1) >> 1 per lane.  Portable _mm_avg_epu16. */
simde_mm_avg_epu16 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_avg_epu16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u16 = vrhaddq_u16(b_.neon_u16, a_.neon_u16);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_u16x8_avgr(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
      r_.altivec_u16 = vec_avg(a_.altivec_u16, b_.altivec_u16);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_)
      /* Widen to 32-bit lanes so a + b + 1 cannot overflow, then narrow. */
      uint32_t wa SIMDE_VECTOR(32);
      uint32_t wb SIMDE_VECTOR(32);
      uint32_t wr SIMDE_VECTOR(32);
      SIMDE_CONVERT_VECTOR_(wa, a_.u16);
      SIMDE_CONVERT_VECTOR_(wb, b_.u16);
      wr = (wa + wb + 1) >> 1;
      SIMDE_CONVERT_VECTOR_(r_.u16, wr);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
        /* Integer promotion to int makes the intermediate sum safe. */
        r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1;
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_avg_epu16(a, b) simde_mm_avg_epu16(a, b)
#endif
1122
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Returns a 128-bit integer vector with all bits cleared.
 * Portable _mm_setzero_si128. */
simde_mm_setzero_si128 (void) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_setzero_si128();
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i32 = vdupq_n_s32(0);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
      r_.altivec_i32 = vec_splats(HEDLEY_STATIC_CAST(signed int, 0));
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i32x4_splat(INT32_C(0));
    #elif defined(SIMDE_VECTOR_SUBSCRIPT)
      r_.i32 = __extension__ (__typeof__(r_.i32)) { 0, 0, 0, 0 };
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
        r_.i32f[i] = 0;
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_setzero_si128() (simde_mm_setzero_si128())
#endif
1152
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Shifts the whole 128-bit vector LEFT by imm8 bytes, filling with zeros
 * (portable _mm_slli_si128 / _mm_bslli_si128).  Any imm8 outside 0..15
 * yields an all-zero result, matching the hardware behavior. */
simde_mm_bslli_si128 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  /* imm8 & ~15 is non-zero exactly when imm8 is outside 0..15. */
  if (HEDLEY_UNLIKELY((imm8 & ~15))) {
    return simde_mm_setzero_si128();
  }

  #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && defined(SIMDE_ENDIAN_ORDER)
    /* vec_slo/vec_sro shift by bits held in the splat; the direction is
     * endian-dependent because AltiVec shifts the register as stored. */
    r_.altivec_i8 =
    #if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
      vec_slo
    #else /* SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG */
      vec_sro
    #endif
        (a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, imm8 * 8)));
  #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
    r_.altivec_i8 = vec_srb(a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, (imm8 & 15) << 3)));
  #elif defined(SIMDE_HAVE_INT128_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
    /* On little-endian a byte shift left is a plain 128-bit shift. */
    r_.u128[0] = a_.u128[0] << (imm8 * 8);
  #else
    /* Scalar fallback: zero the result, then copy the surviving bytes. */
    r_ = simde__m128i_to_private(simde_mm_setzero_si128());
    for (int i = imm8 ; i < HEDLEY_STATIC_CAST(int, sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
      r_.i8[i] = a_.i8[i - imm8];
    }
  #endif

  return simde__m128i_from_private(r_);
}
/* When a compile-time-constant imm8 can be guaranteed, override the
 * function with a macro so the native instruction / NEON vextq_s8 /
 * generic shuffle can be used instead. */
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
  #define simde_mm_bslli_si128(a, imm8) _mm_slli_si128(a, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__)
  #define simde_mm_bslli_si128(a, imm8) \
  simde__m128i_from_neon_i8(((imm8) <= 0) ? simde__m128i_to_neon_i8(a) : (((imm8) > 15) ? (vdupq_n_s8(0)) : (vextq_s8(vdupq_n_s8(0), simde__m128i_to_neon_i8(a), 16 - (imm8)))))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && !defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
  /* Build the shift as a 32-byte shuffle of {zero vector, a}: index
   * (16 - imm8 + lane) & 31 selects either a zero byte or a byte of a. */
  #define simde_mm_bslli_si128(a, imm8) (__extension__ ({ \
    const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
    const simde__m128i_private simde__tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
    simde__m128i_private simde__tmp_r_; \
    if (HEDLEY_UNLIKELY(imm8 > 15)) { \
      simde__tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
    } else { \
      simde__tmp_r_.i8 = \
        SIMDE_SHUFFLE_VECTOR_(8, 16, \
          simde__tmp_z_.i8, \
          (simde__tmp_a_).i8, \
          HEDLEY_STATIC_CAST(int8_t, (16 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (17 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (18 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (19 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (20 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (21 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (22 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (23 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (24 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (25 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (26 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (27 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (28 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (29 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (30 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (31 - imm8) & 31)); \
    } \
    simde__m128i_from_private(simde__tmp_r_); }))
#endif
#define simde_mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_bslli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
  #define _mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
#endif
1227
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Shifts the whole 128-bit vector RIGHT by imm8 bytes, filling with zeros
 * (portable _mm_srli_si128 / _mm_bsrli_si128).  Any imm8 outside 0..15
 * yields an all-zero result, matching the hardware behavior. */
simde_mm_bsrli_si128 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  /* imm8 & ~15 is non-zero exactly when imm8 is outside 0..15. */
  if (HEDLEY_UNLIKELY((imm8 & ~15))) {
    return simde_mm_setzero_si128();
  }

  #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && defined(SIMDE_ENDIAN_ORDER)
    /* Mirror of bslli: shift direction flips with storage endianness. */
    r_.altivec_i8 =
    #if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
      vec_sro
    #else /* SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG */
      vec_slo
    #endif
        (a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, imm8 * 8)));
  #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
    r_.altivec_i8 = vec_slb(a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, (imm8 & 15) << 3)));
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
      /* Source byte index; bytes shifted in from beyond the end are zero. */
      const int e = HEDLEY_STATIC_CAST(int, i) + imm8;
      r_.i8[i] = (e < 16) ? a_.i8[e] : 0;
    }
  #endif

  return simde__m128i_from_private(r_);
}
/* Constant-imm8 macro overrides, mirroring simde_mm_bslli_si128. */
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
  #define simde_mm_bsrli_si128(a, imm8) _mm_srli_si128(a, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__)
  #define simde_mm_bsrli_si128(a, imm8) \
  simde__m128i_from_neon_i8(((imm8 < 0) || (imm8 > 15)) ? vdupq_n_s8(0) : (vextq_s8(simde__m128i_to_private(a).neon_i8, vdupq_n_s8(0), ((imm8 & 15) != 0) ? imm8 : (imm8 & 15))))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && !defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
  /* 32-byte shuffle of {zero vector, a}: index (imm8 + 16 + lane) & 31
   * selects a byte of a, or a zero byte once it runs past the end. */
  #define simde_mm_bsrli_si128(a, imm8) (__extension__ ({ \
    const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
    const simde__m128i_private simde__tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
    simde__m128i_private simde__tmp_r_ = simde__m128i_to_private(a); \
    if (HEDLEY_UNLIKELY(imm8 > 15)) { \
      simde__tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
    } else { \
      simde__tmp_r_.i8 = \
        SIMDE_SHUFFLE_VECTOR_(8, 16, \
          simde__tmp_z_.i8, \
          (simde__tmp_a_).i8, \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 16) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 17) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 18) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 19) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 20) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 21) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 22) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 23) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 24) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 25) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 26) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 27) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 28) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 29) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 30) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 31) & 31)); \
    } \
    simde__m128i_from_private(simde__tmp_r_); }))
#endif
#define simde_mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_bsrli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
  #define _mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
#endif
1301
1302 SIMDE_FUNCTION_ATTRIBUTES
1303 void
simde_mm_clflush(void const * p)1304 simde_mm_clflush (void const* p) {
1305 #if defined(SIMDE_X86_SSE2_NATIVE)
1306 _mm_clflush(p);
1307 #else
1308 (void) p;
1309 #endif
1310 }
1311 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1312 #define _mm_clflush(a, b) simde_mm_clflush()
1313 #endif
1314
SIMDE_FUNCTION_ATTRIBUTES
int
/* Compares the low double of each vector for equality, returning 1 or 0.
 * Portable _mm_comieq_sd; only lane 0 participates. */
simde_mm_comieq_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_comieq_sd(a, b);
  #else
    simde__m128d_private
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);
    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* vceqq yields an all-ones/all-zeros mask; !! collapses it to 0/1. */
      return !!vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      return wasm_f64x2_extract_lane(a_.wasm_v128, 0) == wasm_f64x2_extract_lane(b_.wasm_v128, 0);
    #else
      return a_.f64[0] == b_.f64[0];
    #endif
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_comieq_sd(a, b) simde_mm_comieq_sd(a, b)
#endif
1336
SIMDE_FUNCTION_ATTRIBUTES
int
/* Compares the low double of each vector: returns 1 if a >= b, else 0.
 * Portable _mm_comige_sd; only lane 0 participates. */
simde_mm_comige_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_comige_sd(a, b);
  #else
    simde__m128d_private
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);
    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      return !!vgetq_lane_u64(vcgeq_f64(a_.neon_f64, b_.neon_f64), 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      return wasm_f64x2_extract_lane(a_.wasm_v128, 0) >= wasm_f64x2_extract_lane(b_.wasm_v128, 0);
    #else
      return a_.f64[0] >= b_.f64[0];
    #endif
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_comige_sd(a, b) simde_mm_comige_sd(a, b)
#endif
1358
SIMDE_FUNCTION_ATTRIBUTES
int
/* Compares the low double of each vector: returns 1 if a > b, else 0.
 * Portable _mm_comigt_sd; only lane 0 participates. */
simde_mm_comigt_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_comigt_sd(a, b);
  #else
    simde__m128d_private
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);
    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      return !!vgetq_lane_u64(vcgtq_f64(a_.neon_f64, b_.neon_f64), 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      return wasm_f64x2_extract_lane(a_.wasm_v128, 0) > wasm_f64x2_extract_lane(b_.wasm_v128, 0);
    #else
      return a_.f64[0] > b_.f64[0];
    #endif
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_comigt_sd(a, b) simde_mm_comigt_sd(a, b)
#endif
1380
SIMDE_FUNCTION_ATTRIBUTES
int
/* Compares the low double of each vector: returns 1 if a <= b, else 0.
 * Portable _mm_comile_sd; only lane 0 participates. */
simde_mm_comile_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_comile_sd(a, b);
  #else
    simde__m128d_private
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);
    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      return !!vgetq_lane_u64(vcleq_f64(a_.neon_f64, b_.neon_f64), 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      return wasm_f64x2_extract_lane(a_.wasm_v128, 0) <= wasm_f64x2_extract_lane(b_.wasm_v128, 0);
    #else
      return a_.f64[0] <= b_.f64[0];
    #endif
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_comile_sd(a, b) simde_mm_comile_sd(a, b)
#endif
1402
SIMDE_FUNCTION_ATTRIBUTES
int
/* Compares the low double of each vector: returns 1 if a < b, else 0.
 * Portable _mm_comilt_sd; only lane 0 participates. */
simde_mm_comilt_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_comilt_sd(a, b);
  #else
    simde__m128d_private
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);
    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      return !!vgetq_lane_u64(vcltq_f64(a_.neon_f64, b_.neon_f64), 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      return wasm_f64x2_extract_lane(a_.wasm_v128, 0) < wasm_f64x2_extract_lane(b_.wasm_v128, 0);
    #else
      return a_.f64[0] < b_.f64[0];
    #endif
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_comilt_sd(a, b) simde_mm_comilt_sd(a, b)
#endif
1424
SIMDE_FUNCTION_ATTRIBUTES
int
/* Compares the low double of each vector: returns 1 if a != b, else 0.
 * Portable _mm_comineq_sd; only lane 0 participates. */
simde_mm_comineq_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_comineq_sd(a, b);
  #else
    simde__m128d_private
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);
    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* Single ! negates the equality mask lane into the 0/1 result. */
      return !vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      return wasm_f64x2_extract_lane(a_.wasm_v128, 0) != wasm_f64x2_extract_lane(b_.wasm_v128, 0);
    #else
      return a_.f64[0] != b_.f64[0];
    #endif
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_comineq_sd(a, b) simde_mm_comineq_sd(a, b)
#endif
1446
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
/* SIMDe extension (no SSE2 equivalent): per-lane copysign — each lane of
 * the result has the magnitude of `dest` and the sign bit of `src`. */
simde_x_mm_copysign_pd(simde__m128d dest, simde__m128d src) {
  simde__m128d_private
    r_,
    dest_ = simde__m128d_to_private(dest),
    src_ = simde__m128d_to_private(src);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      uint64x2_t sign_pos = vreinterpretq_u64_f64(vdupq_n_f64(-SIMDE_FLOAT64_C(0.0)));
    #else
      /* No f64 reinterpret on A32: type-pun -0.0 through memcpy to get
       * the sign-bit mask without violating strict aliasing. */
      simde_float64 dbl_nz = -SIMDE_FLOAT64_C(0.0);
      uint64_t u64_nz;
      simde_memcpy(&u64_nz, &dbl_nz, sizeof(u64_nz));
      uint64x2_t sign_pos = vdupq_n_u64(u64_nz);
    #endif
    /* Bit-select: take the sign bit from src, everything else from dest. */
    r_.neon_u64 = vbslq_u64(sign_pos, src_.neon_u64, dest_.neon_u64);
  #elif defined(SIMDE_POWER_ALTIVEC_P9_NATIVE)
    #if !defined(HEDLEY_IBM_VERSION)
      r_.altivec_f64 = vec_cpsgn(dest_.altivec_f64, src_.altivec_f64);
    #else
      /* IBM XL's vec_cpsgn takes its operands in the opposite order. */
      r_.altivec_f64 = vec_cpsgn(src_.altivec_f64, dest_.altivec_f64);
    #endif
  #elif defined(simde_math_copysign)
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
      r_.f64[i] = simde_math_copysign(dest_.f64[i], src_.f64[i]);
    }
  #else
    /* Pure bit-twiddling fallback: (src & signbit) | (dest & ~signbit). */
    simde__m128d sgnbit = simde_mm_set1_pd(-SIMDE_FLOAT64_C(0.0));
    return simde_mm_xor_pd(simde_mm_and_pd(sgnbit, src), simde_mm_andnot_pd(sgnbit, dest));
  #endif

  return simde__m128d_from_private(r_);
}
1483
1484 SIMDE_FUNCTION_ATTRIBUTES
1485 simde__m128d
simde_x_mm_xorsign_pd(simde__m128d dest,simde__m128d src)1486 simde_x_mm_xorsign_pd(simde__m128d dest, simde__m128d src) {
1487 return simde_mm_xor_pd(simde_mm_and_pd(simde_mm_set1_pd(-0.0), src), dest);
1488 }
1489
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
/* Reinterprets a double vector as a float vector — pure bit cast, no
 * value conversion.  Portable _mm_castpd_ps. */
simde_mm_castpd_ps (simde__m128d a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_castpd_ps(a);
  #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    return vreinterpretq_f32_f64(a);
  #else
    /* memcpy-based type pun avoids strict-aliasing violations. */
    simde__m128 r;
    simde_memcpy(&r, &a, sizeof(a));
    return r;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_castpd_ps(a) simde_mm_castpd_ps(a)
#endif
1506
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Reinterprets a double vector as an integer vector — pure bit cast, no
 * value conversion.  Portable _mm_castpd_si128. */
simde_mm_castpd_si128 (simde__m128d a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_castpd_si128(a);
  #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    return vreinterpretq_s64_f64(a);
  #else
    /* memcpy-based type pun avoids strict-aliasing violations. */
    simde__m128i r;
    simde_memcpy(&r, &a, sizeof(a));
    return r;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_castpd_si128(a) simde_mm_castpd_si128(a)
#endif
1523
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
/* Reinterprets a float vector as a double vector — pure bit cast, no
 * value conversion.  Portable _mm_castps_pd. */
simde_mm_castps_pd (simde__m128 a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_castps_pd(a);
  #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    return vreinterpretq_f64_f32(a);
  #else
    /* memcpy-based type pun avoids strict-aliasing violations. */
    simde__m128d r;
    simde_memcpy(&r, &a, sizeof(a));
    return r;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_castps_pd(a) simde_mm_castps_pd(a)
#endif
1540
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Reinterprets a float vector as an integer vector — pure bit cast, no
 * value conversion.  Portable _mm_castps_si128. */
simde_mm_castps_si128 (simde__m128 a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_castps_si128(a);
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return simde__m128i_from_neon_i32(simde__m128_to_private(a).neon_i32);
  #else
    /* memcpy-based type pun avoids strict-aliasing violations. */
    simde__m128i r;
    simde_memcpy(&r, &a, sizeof(a));
    return r;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_castps_si128(a) simde_mm_castps_si128(a)
#endif
1557
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
/* Reinterprets an integer vector as a double vector — pure bit cast, no
 * value conversion.  Portable _mm_castsi128_pd. */
simde_mm_castsi128_pd (simde__m128i a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_castsi128_pd(a);
  #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    return vreinterpretq_f64_s64(a);
  #else
    /* memcpy-based type pun avoids strict-aliasing violations. */
    simde__m128d r;
    simde_memcpy(&r, &a, sizeof(a));
    return r;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_castsi128_pd(a) simde_mm_castsi128_pd(a)
#endif
1574
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
/* Reinterprets an integer vector as a float vector — pure bit cast, no
 * value conversion.  Portable _mm_castsi128_ps. */
simde_mm_castsi128_ps (simde__m128i a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_castsi128_ps(a);
  #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
    return HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), a);
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return simde__m128_from_neon_i32(simde__m128i_to_private(a).neon_i32);
  #else
    /* memcpy-based type pun avoids strict-aliasing violations. */
    simde__m128 r;
    simde_memcpy(&r, &a, sizeof(a));
    return r;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_castsi128_ps(a) simde_mm_castsi128_ps(a)
#endif
1593
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Per-lane equality of packed 8-bit integers; each result lane is all
 * ones (0xFF) on equality, else zero.  Portable _mm_cmpeq_epi8. */
simde_mm_cmpeq_epi8 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpeq_epi8(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u8 = vceqq_s8(b_.neon_i8, a_.neon_i8);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i8x16_eq(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
      r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmpeq(a_.altivec_i8, b_.altivec_i8));
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      /* GNU vector == already produces the all-ones/all-zeros mask. */
      r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 == b_.i8));
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
        r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmpeq_epi8(a, b) simde_mm_cmpeq_epi8(a, b)
#endif
1626
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Per-lane equality of packed 16-bit integers; each result lane is all
 * ones (0xFFFF) on equality, else zero.  Portable _mm_cmpeq_epi16. */
simde_mm_cmpeq_epi16 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpeq_epi16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u16 = vceqq_s16(b_.neon_i16, a_.neon_i16);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i16x8_eq(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
      r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmpeq(a_.altivec_i16, b_.altivec_i16));
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      /* GNU vector == already produces the all-ones/all-zeros mask. */
      r_.i16 = (a_.i16 == b_.i16);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmpeq_epi16(a, b) simde_mm_cmpeq_epi16(a, b)
#endif
1659
/* simde_mm_cmpeq_epi32: compare packed signed 32-bit integers for
 * equality.  Each result lane is all ones where a == b, all zeros
 * otherwise. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmpeq_epi32 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpeq_epi32(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* Equality is commutative; operand order is harmless here. */
      r_.neon_u32 = vceqq_s32(b_.neon_i32, a_.neon_i32);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i32x4_eq(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
      r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmpeq(a_.altivec_i32, b_.altivec_i32));
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), a_.i32 == b_.i32);
    #else
      /* Portable scalar fallback. */
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmpeq_epi32(a, b) simde_mm_cmpeq_epi32(a, b)
#endif
1692
1693 SIMDE_FUNCTION_ATTRIBUTES
1694 simde__m128d
simde_mm_cmpeq_pd(simde__m128d a,simde__m128d b)1695 simde_mm_cmpeq_pd (simde__m128d a, simde__m128d b) {
1696 #if defined(SIMDE_X86_SSE2_NATIVE)
1697 return _mm_cmpeq_pd(a, b);
1698 #else
1699 simde__m128d_private
1700 r_,
1701 a_ = simde__m128d_to_private(a),
1702 b_ = simde__m128d_to_private(b);
1703
1704 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1705 r_.neon_u64 = vceqq_s64(b_.neon_i64, a_.neon_i64);
1706 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1707 r_.wasm_v128 = wasm_f64x2_eq(a_.wasm_v128, b_.wasm_v128);
1708 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1709 r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpeq(a_.altivec_f64, b_.altivec_f64));
1710 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1711 r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 == b_.f64));
1712 #else
1713 SIMDE_VECTORIZE
1714 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1715 r_.u64[i] = (a_.f64[i] == b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1716 }
1717 #endif
1718
1719 return simde__m128d_from_private(r_);
1720 #endif
1721 }
1722 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1723 #define _mm_cmpeq_pd(a, b) simde_mm_cmpeq_pd(a, b)
1724 #endif
1725
1726 SIMDE_FUNCTION_ATTRIBUTES
1727 simde__m128d
simde_mm_cmpeq_sd(simde__m128d a,simde__m128d b)1728 simde_mm_cmpeq_sd (simde__m128d a, simde__m128d b) {
1729 #if defined(SIMDE_X86_SSE2_NATIVE)
1730 return _mm_cmpeq_sd(a, b);
1731 #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1732 return simde_mm_move_sd(a, simde_mm_cmpeq_pd(a, b));
1733 #else
1734 simde__m128d_private
1735 r_,
1736 a_ = simde__m128d_to_private(a),
1737 b_ = simde__m128d_to_private(b);
1738
1739 r_.u64[0] = (a_.u64[0] == b_.u64[0]) ? ~UINT64_C(0) : 0;
1740 r_.u64[1] = a_.u64[1];
1741
1742 return simde__m128d_from_private(r_);
1743 #endif
1744 }
1745 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1746 #define _mm_cmpeq_sd(a, b) simde_mm_cmpeq_sd(a, b)
1747 #endif
1748
/* simde_mm_cmpneq_pd: not-equal compare of packed doubles.  Each
 * 64-bit lane is all ones where a != b (including when either operand
 * is NaN), all zeros otherwise. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpneq_pd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpneq_pd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* NEON has no "not equal" for f64: compute equal, then invert.
       * The invert is done on the u32 view since there is no vmvnq_u64. */
      r_.neon_u32 = vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(b_.neon_f64, a_.neon_f64)));
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_f64x2_ne(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 != b_.f64));
    #else
      /* Portable scalar fallback. */
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
        r_.u64[i] = (a_.f64[i] != b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmpneq_pd(a, b) simde_mm_cmpneq_pd(a, b)
#endif
1779
/* simde_mm_cmpneq_sd: not-equal compare of the low double; the upper
 * lane is passed through from 'a' unchanged. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpneq_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpneq_sd(a, b);
  #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
    return simde_mm_move_sd(a, simde_mm_cmpneq_pd(a, b));
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    /* Floating-point != is true for NaN operands, matching hardware. */
    r_.u64[0] = (a_.f64[0] != b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
    r_.u64[1] = a_.u64[1];


    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmpneq_sd(a, b) simde_mm_cmpneq_sd(a, b)
#endif
1803
/* simde_mm_cmplt_epi8: signed less-than compare of packed 8-bit
 * integers; each result lane is all ones where a < b, zeros otherwise. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmplt_epi8 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmplt_epi8(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u8 = vcltq_s8(a_.neon_i8, b_.neon_i8);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
      r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char),vec_cmplt(a_.altivec_i8, b_.altivec_i8));
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i8x16_lt(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 < b_.i8));
    #else
      /* Portable scalar fallback. */
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
        r_.i8[i] = (a_.i8[i] < b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmplt_epi8(a, b) simde_mm_cmplt_epi8(a, b)
#endif
1836
/* simde_mm_cmplt_epi16: signed less-than compare of packed 16-bit
 * integers; each result lane is all ones where a < b, zeros otherwise. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmplt_epi16 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmplt_epi16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u16 = vcltq_s16(a_.neon_i16, b_.neon_i16);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
      r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmplt(a_.altivec_i16, b_.altivec_i16));
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i16x8_lt(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i16 = HEDLEY_STATIC_CAST(__typeof__(r_.i16), (a_.i16 < b_.i16));
    #else
      /* Portable scalar fallback. */
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmplt_epi16(a, b) simde_mm_cmplt_epi16(a, b)
#endif
1869
/* simde_mm_cmplt_epi32: signed less-than compare of packed 32-bit
 * integers; each result lane is all ones where a < b, zeros otherwise. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmplt_epi32 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmplt_epi32(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u32 = vcltq_s32(a_.neon_i32, b_.neon_i32);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
      r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmplt(a_.altivec_i32, b_.altivec_i32));
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i32x4_lt(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.i32 < b_.i32));
    #else
      /* Portable scalar fallback. */
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        r_.i32[i] = (a_.i32[i] < b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmplt_epi32(a, b) simde_mm_cmplt_epi32(a, b)
#endif
1902
/* simde_mm_cmplt_pd: ordered less-than compare of packed doubles.
 * Each 64-bit lane is all ones where a < b, zeros otherwise (a NaN
 * operand makes '<' false, so NaN lanes yield zero). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmplt_pd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmplt_pd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_u64 = vcltq_f64(a_.neon_f64, b_.neon_f64);
    #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
      r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmplt(a_.altivec_f64, b_.altivec_f64));
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_f64x2_lt(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 < b_.f64));
    #else
      /* Portable scalar fallback. */
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
        r_.u64[i] = (a_.f64[i] < b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmplt_pd(a, b) simde_mm_cmplt_pd(a, b)
#endif
1935
/* simde_mm_cmplt_sd: less-than compare of the low double; the upper
 * lane is passed through from 'a' unchanged. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmplt_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmplt_sd(a, b);
  #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
    return simde_mm_move_sd(a, simde_mm_cmplt_pd(a, b));
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    r_.u64[0] = (a_.f64[0] < b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
    r_.u64[1] = a_.u64[1];

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmplt_sd(a, b) simde_mm_cmplt_sd(a, b)
#endif
1958
/* simde_mm_cmple_pd: ordered less-than-or-equal compare of packed
 * doubles; each 64-bit lane is all ones where a <= b, zeros otherwise
 * (NaN lanes yield zero). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmple_pd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmple_pd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    /* NOTE: unlike the sibling compares, the generic vector-extension
     * branch is checked first here; behavior is the same either way. */
    #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 <= b_.f64));
    #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_u64 = vcleq_f64(a_.neon_f64, b_.neon_f64);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_f64x2_le(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
      r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmple(a_.altivec_f64, b_.altivec_f64));
    #else
      /* Portable scalar fallback. */
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
        r_.u64[i] = (a_.f64[i] <= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmple_pd(a, b) simde_mm_cmple_pd(a, b)
#endif
1991
/* simde_mm_cmple_sd: less-than-or-equal compare of the low double;
 * the upper lane is passed through from 'a' unchanged. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmple_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmple_sd(a, b);
  #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
    return simde_mm_move_sd(a, simde_mm_cmple_pd(a, b));
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    r_.u64[0] = (a_.f64[0] <= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
    r_.u64[1] = a_.u64[1];

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmple_sd(a, b) simde_mm_cmple_sd(a, b)
#endif
2014
/* simde_mm_cmpgt_epi8: signed greater-than compare of packed 8-bit
 * integers; each result lane is all ones where a > b, zeros otherwise. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmpgt_epi8 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpgt_epi8(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u8 = vcgtq_s8(a_.neon_i8, b_.neon_i8);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i8x16_gt(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
      r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmpgt(a_.altivec_i8, b_.altivec_i8));
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 > b_.i8));
    #else
      /* Portable scalar fallback. */
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
        r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmpgt_epi8(a, b) simde_mm_cmpgt_epi8(a, b)
#endif
2047
/* simde_mm_cmpgt_epi16: signed greater-than compare of packed 16-bit
 * integers; each result lane is all ones where a > b, zeros otherwise. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmpgt_epi16 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpgt_epi16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u16 = vcgtq_s16(a_.neon_i16, b_.neon_i16);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i16x8_gt(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
      r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmpgt(a_.altivec_i16, b_.altivec_i16));
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i16 = HEDLEY_STATIC_CAST(__typeof__(r_.i16), (a_.i16 > b_.i16));
    #else
      /* Portable scalar fallback. */
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmpgt_epi16(a, b) simde_mm_cmpgt_epi16(a, b)
#endif
2080
/* simde_mm_cmpgt_epi32: signed greater-than compare of packed 32-bit
 * integers; each result lane is all ones where a > b, zeros otherwise. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmpgt_epi32 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpgt_epi32(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u32 = vcgtq_s32(a_.neon_i32, b_.neon_i32);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i32x4_gt(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
      r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmpgt(a_.altivec_i32, b_.altivec_i32));
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.i32 > b_.i32));
    #else
      /* Portable scalar fallback. */
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmpgt_epi32(a, b) simde_mm_cmpgt_epi32(a, b)
#endif
2113
2114 SIMDE_FUNCTION_ATTRIBUTES
2115 simde__m128d
simde_mm_cmpgt_pd(simde__m128d a,simde__m128d b)2116 simde_mm_cmpgt_pd (simde__m128d a, simde__m128d b) {
2117 #if defined(SIMDE_X86_SSE2_NATIVE)
2118 return _mm_cmpgt_pd(a, b);
2119 #else
2120 simde__m128d_private
2121 r_,
2122 a_ = simde__m128d_to_private(a),
2123 b_ = simde__m128d_to_private(b);
2124
2125 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2126 r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 > b_.f64));
2127 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2128 r_.neon_u64 = vcgtq_f64(a_.neon_f64, b_.neon_f64);
2129 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2130 r_.wasm_v128 = wasm_f64x2_gt(a_.wasm_v128, b_.wasm_v128);
2131 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
2132 r_.altivec_f64 = HEDLEY_STATIC_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpgt(a_.altivec_f64, b_.altivec_f64));
2133 #else
2134 SIMDE_VECTORIZE
2135 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2136 r_.u64[i] = (a_.f64[i] > b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
2137 }
2138 #endif
2139
2140 return simde__m128d_from_private(r_);
2141 #endif
2142 }
2143 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2144 #define _mm_cmpgt_pd(a, b) simde_mm_cmpgt_pd(a, b)
2145 #endif
2146
/* simde_mm_cmpgt_sd: greater-than compare of the low double; the
 * upper lane is passed through from 'a' unchanged.  The native path
 * is skipped under PGI, whose _mm_cmpgt_sd is problematic. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpgt_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
    return _mm_cmpgt_sd(a, b);
  #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
    return simde_mm_move_sd(a, simde_mm_cmpgt_pd(a, b));
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    r_.u64[0] = (a_.f64[0] > b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
    r_.u64[1] = a_.u64[1];

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmpgt_sd(a, b) simde_mm_cmpgt_sd(a, b)
#endif
2169
2170 SIMDE_FUNCTION_ATTRIBUTES
2171 simde__m128d
simde_mm_cmpge_pd(simde__m128d a,simde__m128d b)2172 simde_mm_cmpge_pd (simde__m128d a, simde__m128d b) {
2173 #if defined(SIMDE_X86_SSE2_NATIVE)
2174 return _mm_cmpge_pd(a, b);
2175 #else
2176 simde__m128d_private
2177 r_,
2178 a_ = simde__m128d_to_private(a),
2179 b_ = simde__m128d_to_private(b);
2180
2181 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2182 r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 >= b_.f64));
2183 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2184 r_.neon_u64 = vcgeq_f64(a_.neon_f64, b_.neon_f64);
2185 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2186 r_.wasm_v128 = wasm_f64x2_ge(a_.wasm_v128, b_.wasm_v128);
2187 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
2188 r_.altivec_f64 = HEDLEY_STATIC_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpge(a_.altivec_f64, b_.altivec_f64));
2189 #else
2190 SIMDE_VECTORIZE
2191 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2192 r_.u64[i] = (a_.f64[i] >= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
2193 }
2194 #endif
2195
2196 return simde__m128d_from_private(r_);
2197 #endif
2198 }
2199 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2200 #define _mm_cmpge_pd(a, b) simde_mm_cmpge_pd(a, b)
2201 #endif
2202
/* simde_mm_cmpge_sd: greater-than-or-equal compare of the low double;
 * the upper lane is passed through from 'a' unchanged.  Native path
 * skipped under PGI. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpge_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
    return _mm_cmpge_sd(a, b);
  #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
    return simde_mm_move_sd(a, simde_mm_cmpge_pd(a, b));
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    r_.u64[0] = (a_.f64[0] >= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
    r_.u64[1] = a_.u64[1];

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmpge_sd(a, b) simde_mm_cmpge_sd(a, b)
#endif
2225
/* simde_mm_cmpngt_pd: "not greater than" compare of packed doubles.
 * NOTE(review): the fallback delegates to cmple, which only matches
 * cmpngt for non-NaN operands — hardware cmpngt yields all ones when
 * either operand is NaN, while cmple yields zero.  Looks like an
 * intentional fast-math-style simplification; confirm. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpngt_pd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpngt_pd(a, b);
  #else
    return simde_mm_cmple_pd(a, b);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmpngt_pd(a, b) simde_mm_cmpngt_pd(a, b)
#endif
2238
/* simde_mm_cmpngt_sd: "not greater than" compare of the low double;
 * upper lane passed through.  Native path skipped under PGI.
 * NOTE(review): same NaN caveat as cmpngt_pd — cmple differs from
 * cmpngt when an operand is NaN; confirm intended. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpngt_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
    return _mm_cmpngt_sd(a, b);
  #else
    return simde_mm_cmple_sd(a, b);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmpngt_sd(a, b) simde_mm_cmpngt_sd(a, b)
#endif
2251
/* simde_mm_cmpnge_pd: "not greater than or equal" compare of packed
 * doubles.  NOTE(review): delegates to cmplt, which differs from
 * cmpnge only when an operand is NaN (cmpnge is true, cmplt false);
 * confirm intended. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpnge_pd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpnge_pd(a, b);
  #else
    return simde_mm_cmplt_pd(a, b);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmpnge_pd(a, b) simde_mm_cmpnge_pd(a, b)
#endif
2264
/* simde_mm_cmpnge_sd: "not greater than or equal" compare of the low
 * double; upper lane passed through.  Native path skipped under PGI.
 * NOTE(review): same NaN caveat as cmpnge_pd. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpnge_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
    return _mm_cmpnge_sd(a, b);
  #else
    return simde_mm_cmplt_sd(a, b);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmpnge_sd(a, b) simde_mm_cmpnge_sd(a, b)
#endif
2277
/* simde_mm_cmpnlt_pd: "not less than" compare of packed doubles.
 * NOTE(review): delegates to cmpge, which differs from cmpnlt only
 * when an operand is NaN (cmpnlt is true, cmpge false); confirm
 * intended. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpnlt_pd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpnlt_pd(a, b);
  #else
    return simde_mm_cmpge_pd(a, b);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmpnlt_pd(a, b) simde_mm_cmpnlt_pd(a, b)
#endif
2290
/* simde_mm_cmpnlt_sd: "not less than" compare of the low double;
 * upper lane passed through.  NOTE(review): same NaN caveat as
 * cmpnlt_pd. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpnlt_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpnlt_sd(a, b);
  #else
    return simde_mm_cmpge_sd(a, b);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmpnlt_sd(a, b) simde_mm_cmpnlt_sd(a, b)
#endif
2303
/* simde_mm_cmpnle_pd: "not less than or equal" compare of packed
 * doubles.  NOTE(review): delegates to cmpgt, which differs from
 * cmpnle only when an operand is NaN (cmpnle is true, cmpgt false);
 * confirm intended. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpnle_pd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpnle_pd(a, b);
  #else
    return simde_mm_cmpgt_pd(a, b);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmpnle_pd(a, b) simde_mm_cmpnle_pd(a, b)
#endif
2316
/* simde_mm_cmpnle_sd: "not less than or equal" compare of the low
 * double; upper lane passed through.  NOTE(review): same NaN caveat
 * as cmpnle_pd. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpnle_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpnle_sd(a, b);
  #else
    return simde_mm_cmpgt_sd(a, b);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmpnle_sd(a, b) simde_mm_cmpnle_sd(a, b)
#endif
2329
/* simde_mm_cmpord_pd: "ordered" compare — each 64-bit lane is all
 * ones when NEITHER operand in that lane is NaN, zeros otherwise. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpord_pd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpord_pd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* Note: NEON does not have ordered compare builtin.
         A value is NaN iff it compares unequal to itself, so compare
         a eq a and b eq b to check for NaN, then AND the results to
         get the final ordered mask. */
      uint64x2_t ceqaa = vceqq_f64(a_.neon_f64, a_.neon_f64);
      uint64x2_t ceqbb = vceqq_f64(b_.neon_f64, b_.neon_f64);
      r_.neon_u64 = vandq_u64(ceqaa, ceqbb);
    #elif defined(simde_math_isnan)
      /* Portable scalar fallback. */
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
        r_.u64[i] = (!simde_math_isnan(a_.f64[i]) && !simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0);
      }
    #else
      /* No isnan available: this configuration is unsupported. */
      HEDLEY_UNREACHABLE();
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmpord_pd(a, b) simde_mm_cmpord_pd(a, b)
#endif
2363
/* simde_mm_cvtsd_f64: extract the low double from a vector.  Native
 * path skipped under PGI. */
SIMDE_FUNCTION_ATTRIBUTES
simde_float64
simde_mm_cvtsd_f64 (simde__m128d a) {
  #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
    return _mm_cvtsd_f64(a);
  #else
    simde__m128d_private a_ = simde__m128d_to_private(a);
    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      return HEDLEY_STATIC_CAST(simde_float64, vgetq_lane_f64(a_.neon_f64, 0));
    #else
      /* Element 0 of the f64 view is the low lane. */
      return a_.f64[0];
    #endif
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvtsd_f64(a) simde_mm_cvtsd_f64(a)
#endif
2381
/* simde_mm_cmpord_sd: ordered compare of the low double (all ones
 * when neither low operand is NaN); upper lane passed through from
 * 'a' unchanged. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpord_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpord_sd(a, b);
  #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
    return simde_mm_move_sd(a, simde_mm_cmpord_pd(a, b));
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(simde_math_isnan)
      r_.u64[0] = (!simde_math_isnan(a_.f64[0]) && !simde_math_isnan(b_.f64[0])) ? ~UINT64_C(0) : UINT64_C(0);
      r_.u64[1] = a_.u64[1];
    #else
      /* No isnan available: this configuration is unsupported. */
      HEDLEY_UNREACHABLE();
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmpord_sd(a, b) simde_mm_cmpord_sd(a, b)
#endif
2408
/* simde_mm_cmpunord_pd: "unordered" compare — each 64-bit lane is
 * all ones when EITHER operand in that lane is NaN, zeros otherwise
 * (the complement of cmpord_pd). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpunord_pd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpunord_pd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* NaN iff x != x: build the ordered mask via self-equality,
       * then invert it (through the u32 view; NEON lacks vmvnq_u64). */
      uint64x2_t ceqaa = vceqq_f64(a_.neon_f64, a_.neon_f64);
      uint64x2_t ceqbb = vceqq_f64(b_.neon_f64, b_.neon_f64);
      r_.neon_u64 = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(ceqaa, ceqbb))));
    #elif defined(simde_math_isnan)
      /* Portable scalar fallback. */
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
        r_.u64[i] = (simde_math_isnan(a_.f64[i]) || simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0);
      }
    #else
      /* No isnan available: this configuration is unsupported. */
      HEDLEY_UNREACHABLE();
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmpunord_pd(a, b) simde_mm_cmpunord_pd(a, b)
#endif
2439
/* simde_mm_cmpunord_sd: unordered compare of the low double (all
 * ones when either low operand is NaN); upper lane passed through
 * from 'a' unchanged. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpunord_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpunord_sd(a, b);
  #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
    return simde_mm_move_sd(a, simde_mm_cmpunord_pd(a, b));
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(simde_math_isnan)
      r_.u64[0] = (simde_math_isnan(a_.f64[0]) || simde_math_isnan(b_.f64[0])) ? ~UINT64_C(0) : UINT64_C(0);
      r_.u64[1] = a_.u64[1];
    #else
      /* No isnan available: this configuration is unsupported. */
      HEDLEY_UNREACHABLE();
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmpunord_sd(a, b) simde_mm_cmpunord_sd(a, b)
#endif
2466
/* simde_mm_cvtepi32_pd: convert the two low packed 32-bit integers
 * to doubles (every int32 is exactly representable as a double). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cvtepi32_pd (simde__m128i a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cvtepi32_pd(a);
  #else
    simde__m128d_private r_;
    simde__m128i_private a_ = simde__m128i_to_private(a);

    #if defined(SIMDE_CONVERT_VECTOR_)
      /* Only the low half (m64_private[0], two i32 lanes) is
       * converted, matching the hardware instruction. */
      SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].i32);
    #else
      /* Portable scalar fallback: converts a_.i32[0..1]. */
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
        r_.f64[i] = (simde_float64) a_.i32[i];
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvtepi32_pd(a) simde_mm_cvtepi32_pd(a)
#endif
2491
/* simde_mm_cvtepi32_ps: convert four packed 32-bit integers to
 * floats (values beyond 2^24 may round, as on hardware). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_cvtepi32_ps (simde__m128i a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cvtepi32_ps(a);
  #else
    simde__m128_private r_;
    simde__m128i_private a_ = simde__m128i_to_private(a);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_f32 = vcvtq_f32_s32(a_.neon_i32);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_f32x4_convert_i32x4(a_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      /* vec_ctf's second argument (0) means no fractional scaling.
       * Clang warns about it as a C11 extension; suppress locally. */
      HEDLEY_DIAGNOSTIC_PUSH
      #if HEDLEY_HAS_WARNING("-Wc11-extensions")
        #pragma clang diagnostic ignored "-Wc11-extensions"
      #endif
      r_.altivec_f32 = vec_ctf(a_.altivec_i32, 0);
      HEDLEY_DIAGNOSTIC_POP
    #elif defined(SIMDE_CONVERT_VECTOR_)
      SIMDE_CONVERT_VECTOR_(r_.f32, a_.i32);
    #else
      /* Portable scalar fallback. */
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
        r_.f32[i] = (simde_float32) a_.i32[i];
      }
    #endif

    return simde__m128_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvtepi32_ps(a) simde_mm_cvtepi32_ps(a)
#endif
2527
2528 SIMDE_FUNCTION_ATTRIBUTES
2529 simde__m64
simde_mm_cvtpd_pi32(simde__m128d a)2530 simde_mm_cvtpd_pi32 (simde__m128d a) {
2531 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2532 return _mm_cvtpd_pi32(a);
2533 #else
2534 simde__m64_private r_;
2535 simde__m128d_private a_ = simde__m128d_to_private(a);
2536
2537 SIMDE_VECTORIZE
2538 for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
2539 simde_float64 v = simde_math_round(a_.f64[i]);
2540 #if defined(SIMDE_FAST_CONVERSION_RANGE)
2541 r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
2542 #else
2543 r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ?
2544 SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
2545 #endif
2546 }
2547
2548 return simde__m64_from_private(r_);
2549 #endif
2550 }
2551 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2552 #define _mm_cvtpd_pi32(a) simde_mm_cvtpd_pi32(a)
2553 #endif
2554
2555 SIMDE_FUNCTION_ATTRIBUTES
2556 simde__m128i
simde_mm_cvtpd_epi32(simde__m128d a)2557 simde_mm_cvtpd_epi32 (simde__m128d a) {
2558 #if defined(SIMDE_X86_SSE2_NATIVE)
2559 return _mm_cvtpd_epi32(a);
2560 #else
2561 simde__m128i_private r_;
2562
2563 r_.m64[0] = simde_mm_cvtpd_pi32(a);
2564 r_.m64[1] = simde_mm_setzero_si64();
2565
2566 return simde__m128i_from_private(r_);
2567 #endif
2568 }
2569 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2570 #define _mm_cvtpd_epi32(a) simde_mm_cvtpd_epi32(a)
2571 #endif
2572
/* Narrow the two doubles in `a` to floats in the low half of the result;
 * the upper two float lanes are zeroed (SSE2 `_mm_cvtpd_ps`). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_cvtpd_ps (simde__m128d a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cvtpd_ps(a);
  #else
    simde__m128_private r_;
    simde__m128d_private a_ = simde__m128d_to_private(a);

    #if defined(SIMDE_CONVERT_VECTOR_)
      SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, a_.f64);
      r_.m64_private[1] = simde__m64_to_private(simde_mm_setzero_si64());
    #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* Narrow f64 -> f32 into the low half, combine with a zeroed high half.
       * NOTE(review): vcvtx narrows with a different rounding mode than a
       * plain cast — confirm this matches SSE2 rounding closely enough. */
      r_.neon_f32 = vreinterpretq_f32_f64(vcombine_f64(vreinterpret_f64_f32(vcvtx_f32_f64(a_.neon_f64)), vdup_n_f64(0)));
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
        r_.f32[i] = (simde_float32) a_.f64[i];
      }
      /* Zero the upper 64 bits. */
      simde_memset(&(r_.m64_private[1]), 0, sizeof(r_.m64_private[1]));
    #endif

    return simde__m128_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvtpd_ps(a) simde_mm_cvtpd_ps(a)
#endif
2601
/* Widen the two signed 32-bit integers in `a` to doubles
 * (SSE2 `_mm_cvtpi32_pd`, MMX operand). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cvtpi32_pd (simde__m64 a) {
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    return _mm_cvtpi32_pd(a);
  #else
    simde__m128d_private r_;
    simde__m64_private a_ = simde__m64_to_private(a);

    #if defined(SIMDE_CONVERT_VECTOR_)
      SIMDE_CONVERT_VECTOR_(r_.f64, a_.i32);
    #else
      /* int32 -> double is always exact; no range handling needed. */
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
        r_.f64[i] = (simde_float64) a_.i32[i];
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvtpi32_pd(a) simde_mm_cvtpi32_pd(a)
#endif
2626
/* Convert four floats in `a` to signed 32-bit integers with rounding
 * (SSE2 `_mm_cvtps_epi32`).  Fast-math-style shortcuts are gated on the
 * SIMDE_FAST_* feature macros because SSE2 rounds ties to even and
 * saturates out-of-range inputs to 0x80000000. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvtps_epi32 (simde__m128 a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cvtps_epi32(a);
  #else
    simde__m128i_private r_;
    simde__m128_private a_ = simde__m128_to_private(a);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE)
      /* vcvtnq rounds to nearest, ties to even — same as SSE2's default. */
      r_.neon_i32 = vcvtnq_s32_f32(a_.neon_f32);
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_ROUND_TIES) && !defined(SIMDE_BUG_GCC_95399)
      r_.neon_i32 = vcvtnq_s32_f32(a_.neon_f32);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_ROUND_TIES)
      HEDLEY_DIAGNOSTIC_PUSH
      SIMDE_DIAGNOSTIC_DISABLE_C11_EXTENSIONS_
      SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_
      r_.altivec_i32 = vec_cts(a_.altivec_f32, 1);
      HEDLEY_DIAGNOSTIC_POP
    #else
      /* Round to integral values first (ties-to-even); the roundf below is
       * then a no-op on the already-integral lanes. */
      a_ = simde__m128_to_private(simde_x_mm_round_ps(a, SIMDE_MM_FROUND_TO_NEAREST_INT, 1));
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        simde_float32 v = simde_math_roundf(a_.f32[i]);
        #if defined(SIMDE_FAST_CONVERSION_RANGE)
          r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
        #else
          /* (float)INT32_MAX rounds up to 2^31, so `<` is the correct upper
           * bound; values at/above it saturate to INT32_MIN like hardware. */
          r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ?
            SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
        #endif
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvtps_epi32(a) simde_mm_cvtps_epi32(a)
#endif
2666
/* Widen the two low floats of `a` to doubles (SSE2 `_mm_cvtps_pd`).
 * float -> double is exact, so no rounding or range logic is required. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cvtps_pd (simde__m128 a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cvtps_pd(a);
  #else
    simde__m128d_private r_;
    simde__m128_private a_ = simde__m128_to_private(a);

    #if defined(SIMDE_CONVERT_VECTOR_)
      /* Only the low 64 bits (two floats) of `a` participate. */
      SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].f32);
    #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vcvt_f64_f32(vget_low_f32(a_.neon_f32));
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
        r_.f64[i] = a_.f32[i];
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvtps_pd(a) simde_mm_cvtps_pd(a)
#endif
2693
2694 SIMDE_FUNCTION_ATTRIBUTES
2695 int32_t
simde_mm_cvtsd_si32(simde__m128d a)2696 simde_mm_cvtsd_si32 (simde__m128d a) {
2697 #if defined(SIMDE_X86_SSE2_NATIVE)
2698 return _mm_cvtsd_si32(a);
2699 #else
2700 simde__m128d_private a_ = simde__m128d_to_private(a);
2701
2702 simde_float64 v = simde_math_round(a_.f64[0]);
2703 #if defined(SIMDE_FAST_CONVERSION_RANGE)
2704 return SIMDE_CONVERT_FTOI(int32_t, v);
2705 #else
2706 return ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ?
2707 SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
2708 #endif
2709 #endif
2710 }
2711 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2712 #define _mm_cvtsd_si32(a) simde_mm_cvtsd_si32(a)
2713 #endif
2714
/* Convert the low double of `a` to a signed 64-bit integer with rounding
 * (SSE2 `_mm_cvtsd_si64`).  The native form only exists on x86-64. */
SIMDE_FUNCTION_ATTRIBUTES
int64_t
simde_mm_cvtsd_si64 (simde__m128d a) {
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
    #if defined(__PGI)
      /* PGI only provides the `x` suffixed spelling. */
      return _mm_cvtsd_si64x(a);
    #else
      return _mm_cvtsd_si64(a);
    #endif
  #else
    simde__m128d_private a_ = simde__m128d_to_private(a);
    /* NOTE(review): no range clamp here — out-of-range doubles are UB on
     * conversion; other cvt* helpers clamp.  Confirm intended. */
    return SIMDE_CONVERT_FTOI(int64_t, simde_math_round(a_.f64[0]));
  #endif
}
#define simde_mm_cvtsd_si64x(a) simde_mm_cvtsd_si64(a)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
  #define _mm_cvtsd_si64(a) simde_mm_cvtsd_si64(a)
  #define _mm_cvtsd_si64x(a) simde_mm_cvtsd_si64x(a)
#endif
2734
/* Narrow the low double of `b` to a float in lane 0 of the result; the
 * upper three lanes are copied from `a` (SSE2 `_mm_cvtsd_ss`). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_cvtsd_ss (simde__m128 a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cvtsd_ss(a, b);
  #else
    simde__m128_private
      r_,
      a_ = simde__m128_to_private(a);
    simde__m128d_private b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f32 = vsetq_lane_f32(vcvtxd_f32_f64(vgetq_lane_f64(b_.neon_f64, 0)), a_.neon_f32, 0);
    #else
      r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b_.f64[0]);

      /* Copy lanes 1..3 bitwise from `a` via the i32 view. */
      SIMDE_VECTORIZE
      for (size_t i = 1 ; i < (sizeof(r_) / sizeof(r_.i32[0])) ; i++) {
        r_.i32[i] = a_.i32[i];
      }
    #endif
    return simde__m128_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvtsd_ss(a, b) simde_mm_cvtsd_ss(a, b)
#endif
2762
/* SIMDe extension (no SSE2 equivalent): extract the low 16-bit signed
 * integer (lane 0) of `a`. */
SIMDE_FUNCTION_ATTRIBUTES
int16_t
simde_x_mm_cvtsi128_si16 (simde__m128i a) {
  simde__m128i_private
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vgetq_lane_s16(a_.neon_i16, 0);
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    return HEDLEY_STATIC_CAST(int16_t, wasm_i16x8_extract_lane(a_.wasm_v128, 0));
  #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
    #if defined(SIMDE_BUG_GCC_95227)
      /* Silence a spurious unused warning triggered by the GCC bug. */
      (void) a_;
    #endif
    return vec_extract(a_.altivec_i16, 0);
  #else
    return a_.i16[0];
  #endif
}
2782
/* Extract the low 32-bit signed integer (lane 0) of `a`
 * (SSE2 `_mm_cvtsi128_si32`). */
SIMDE_FUNCTION_ATTRIBUTES
int32_t
simde_mm_cvtsi128_si32 (simde__m128i a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cvtsi128_si32(a);
  #else
    simde__m128i_private
      a_ = simde__m128i_to_private(a);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      return vgetq_lane_s32(a_.neon_i32, 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      return HEDLEY_STATIC_CAST(int32_t, wasm_i32x4_extract_lane(a_.wasm_v128, 0));
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      #if defined(SIMDE_BUG_GCC_95227)
        /* Silence a spurious unused warning triggered by the GCC bug. */
        (void) a_;
      #endif
      return vec_extract(a_.altivec_i32, 0);
    #else
      return a_.i32[0];
    #endif
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvtsi128_si32(a) simde_mm_cvtsi128_si32(a)
#endif
2809
/* Extract the low 64-bit signed integer (lane 0) of `a`
 * (SSE2 `_mm_cvtsi128_si64`; native only on x86-64). */
SIMDE_FUNCTION_ATTRIBUTES
int64_t
simde_mm_cvtsi128_si64 (simde__m128i a) {
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
    #if defined(__PGI)
      /* PGI only provides the `x` suffixed spelling. */
      return _mm_cvtsi128_si64x(a);
    #else
      return _mm_cvtsi128_si64(a);
    #endif
  #else
    simde__m128i_private a_ = simde__m128i_to_private(a);
    #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && !defined(HEDLEY_IBM_VERSION)
      return vec_extract(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed long long), a_.i64), 0);
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      return vgetq_lane_s64(a_.neon_i64, 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      return HEDLEY_STATIC_CAST(int64_t, wasm_i64x2_extract_lane(a_.wasm_v128, 0));
    #endif
    /* Portable fallback; unreachable when one of the branches above returned. */
    return a_.i64[0];
  #endif
}
#define simde_mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64(a)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
  #define _mm_cvtsi128_si64(a) simde_mm_cvtsi128_si64(a)
  #define _mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64x(a)
#endif
2836
/* Replace the low double of `a` with `b` converted to double; the upper
 * double is passed through (SSE2 `_mm_cvtsi32_sd`). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cvtsi32_sd (simde__m128d a, int32_t b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cvtsi32_sd(a, b);
  #else
    simde__m128d_private r_;
    simde__m128d_private a_ = simde__m128d_to_private(a);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vsetq_lane_f64(HEDLEY_STATIC_CAST(float64_t, b), a_.neon_f64, 0);
    #else
      r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b);
      /* Copy the upper lane bitwise via the i64 view. */
      r_.i64[1] = a_.i64[1];
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvtsi32_sd(a, b) simde_mm_cvtsi32_sd(a, b)
#endif
2859
2860 SIMDE_FUNCTION_ATTRIBUTES
2861 simde__m128i
simde_x_mm_cvtsi16_si128(int16_t a)2862 simde_x_mm_cvtsi16_si128 (int16_t a) {
2863 simde__m128i_private r_;
2864
2865 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2866 r_.neon_i16 = vsetq_lane_s16(a, vdupq_n_s16(0), 0);
2867 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2868 r_.wasm_v128 = wasm_i16x8_make(a, 0, 0, 0, 0, 0, 0, 0);
2869 #else
2870 r_.i16[0] = a;
2871 r_.i16[1] = 0;
2872 r_.i16[2] = 0;
2873 r_.i16[3] = 0;
2874 r_.i16[4] = 0;
2875 r_.i16[5] = 0;
2876 r_.i16[6] = 0;
2877 r_.i16[7] = 0;
2878 #endif
2879
2880 return simde__m128i_from_private(r_);
2881 }
2882
2883 SIMDE_FUNCTION_ATTRIBUTES
2884 simde__m128i
simde_mm_cvtsi32_si128(int32_t a)2885 simde_mm_cvtsi32_si128 (int32_t a) {
2886 #if defined(SIMDE_X86_SSE2_NATIVE)
2887 return _mm_cvtsi32_si128(a);
2888 #else
2889 simde__m128i_private r_;
2890
2891 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2892 r_.neon_i32 = vsetq_lane_s32(a, vdupq_n_s32(0), 0);
2893 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2894 r_.wasm_v128 = wasm_i32x4_make(a, 0, 0, 0);
2895 #else
2896 r_.i32[0] = a;
2897 r_.i32[1] = 0;
2898 r_.i32[2] = 0;
2899 r_.i32[3] = 0;
2900 #endif
2901
2902 return simde__m128i_from_private(r_);
2903 #endif
2904 }
2905 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2906 #define _mm_cvtsi32_si128(a) simde_mm_cvtsi32_si128(a)
2907 #endif
2908
/* Replace the low double of `a` with `b` converted to double; the upper
 * double is passed through (SSE2 `_mm_cvtsi64_sd`; native on x86-64 only). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cvtsi64_sd (simde__m128d a, int64_t b) {
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
    #if !defined(__PGI)
      return _mm_cvtsi64_sd(a, b);
    #else
      /* PGI only provides the `x` suffixed spelling. */
      return _mm_cvtsi64x_sd(a, b);
    #endif
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vsetq_lane_f64(HEDLEY_STATIC_CAST(float64_t, b), a_.neon_f64, 0);
    #else
      r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b);
      r_.f64[1] = a_.f64[1];
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#define simde_mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64_sd(a, b)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
  #define _mm_cvtsi64_sd(a, b) simde_mm_cvtsi64_sd(a, b)
  #define _mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64x_sd(a, b)
#endif
2938
/* Place `a` in the low 64-bit lane of a 128-bit vector, zeroing the high
 * lane (SSE2 `_mm_cvtsi64_si128`; native on x86-64 only). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvtsi64_si128 (int64_t a) {
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
    #if !defined(__PGI)
      return _mm_cvtsi64_si128(a);
    #else
      /* PGI only provides the `x` suffixed spelling. */
      return _mm_cvtsi64x_si128(a);
    #endif
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i64 = vsetq_lane_s64(a, vdupq_n_s64(0), 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i64x2_make(a, 0);
    #else
      r_.i64[0] = a;
      r_.i64[1] = 0;
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#define simde_mm_cvtsi64x_si128(a) simde_mm_cvtsi64_si128(a)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
  #define _mm_cvtsi64_si128(a) simde_mm_cvtsi64_si128(a)
  #define _mm_cvtsi64x_si128(a) simde_mm_cvtsi64x_si128(a)
#endif
2968
/* Widen the low float of `b` to a double in lane 0 of the result; lane 1
 * is copied from `a` (SSE2 `_mm_cvtss_sd`). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cvtss_sd (simde__m128d a, simde__m128 b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cvtss_sd(a, b);
  #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    /* Widen b[0] into lane 0, then restore a's upper lane. */
    float64x2_t temp = vcvt_f64_f32(vset_lane_f32(vgetq_lane_f32(simde__m128_to_private(b).neon_f32, 0), vdup_n_f32(0), 0));
    return vsetq_lane_f64(vgetq_lane_f64(simde__m128d_to_private(a).neon_f64, 1), temp, 1);
  #else
    simde__m128d_private
      a_ = simde__m128d_to_private(a);
    simde__m128_private b_ = simde__m128_to_private(b);

    /* Mutate the copy of `a` in place; lane 1 is already correct. */
    a_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b_.f32[0]);

    return simde__m128d_from_private(a_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvtss_sd(a, b) simde_mm_cvtss_sd(a, b)
#endif
2990
2991 SIMDE_FUNCTION_ATTRIBUTES
2992 simde__m64
simde_mm_cvttpd_pi32(simde__m128d a)2993 simde_mm_cvttpd_pi32 (simde__m128d a) {
2994 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2995 return _mm_cvttpd_pi32(a);
2996 #else
2997 simde__m64_private r_;
2998 simde__m128d_private a_ = simde__m128d_to_private(a);
2999
3000 #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE)
3001 SIMDE_CONVERT_VECTOR_(r_.i32, a_.f64);
3002 #else
3003 for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
3004 simde_float64 v = a_.f64[i];
3005 #if defined(SIMDE_FAST_CONVERSION_RANGE)
3006 r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
3007 #else
3008 r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ?
3009 SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
3010 #endif
3011 }
3012 #endif
3013
3014 return simde__m64_from_private(r_);
3015 #endif
3016 }
3017 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3018 #define _mm_cvttpd_pi32(a) simde_mm_cvttpd_pi32(a)
3019 #endif
3020
3021 SIMDE_FUNCTION_ATTRIBUTES
3022 simde__m128i
simde_mm_cvttpd_epi32(simde__m128d a)3023 simde_mm_cvttpd_epi32 (simde__m128d a) {
3024 #if defined(SIMDE_X86_SSE2_NATIVE)
3025 return _mm_cvttpd_epi32(a);
3026 #else
3027 simde__m128i_private r_;
3028
3029 r_.m64[0] = simde_mm_cvttpd_pi32(a);
3030 r_.m64[1] = simde_mm_setzero_si64();
3031
3032 return simde__m128i_from_private(r_);
3033 #endif
3034 }
3035 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3036 #define _mm_cvttpd_epi32(a) simde_mm_cvttpd_epi32(a)
3037 #endif
3038
/* Convert four floats in `a` to signed 32-bit integers with truncation
 * (SSE2 `_mm_cvttps_epi32`).  Out-of-range inputs yield INT32_MIN unless
 * SIMDE_FAST_CONVERSION_RANGE promises in-range inputs. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvttps_epi32 (simde__m128 a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cvttps_epi32(a);
  #else
    simde__m128i_private r_;
    simde__m128_private a_ = simde__m128_to_private(a);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE)
      /* vcvtq truncates toward zero, same as SSE2's cvtt. */
      r_.neon_i32 = vcvtq_s32_f32(a_.neon_f32);
    #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE)
      SIMDE_CONVERT_VECTOR_(r_.i32, a_.f32);
    #else
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        simde_float32 v = a_.f32[i];
        #if defined(SIMDE_FAST_CONVERSION_RANGE)
          r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
        #else
          /* (float)INT32_MAX rounds up to 2^31, so `<` is the correct upper
           * bound; values at/above it saturate to INT32_MIN like hardware. */
          r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ?
            SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
        #endif
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvttps_epi32(a) simde_mm_cvttps_epi32(a)
#endif
3070
3071 SIMDE_FUNCTION_ATTRIBUTES
3072 int32_t
simde_mm_cvttsd_si32(simde__m128d a)3073 simde_mm_cvttsd_si32 (simde__m128d a) {
3074 #if defined(SIMDE_X86_SSE2_NATIVE)
3075 return _mm_cvttsd_si32(a);
3076 #else
3077 simde__m128d_private a_ = simde__m128d_to_private(a);
3078 simde_float64 v = a_.f64[0];
3079 #if defined(SIMDE_FAST_CONVERSION_RANGE)
3080 return SIMDE_CONVERT_FTOI(int32_t, v);
3081 #else
3082 return ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ?
3083 SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
3084 #endif
3085 #endif
3086 }
3087 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3088 #define _mm_cvttsd_si32(a) simde_mm_cvttsd_si32(a)
3089 #endif
3090
/* Convert the low double of `a` to a signed 64-bit integer with
 * truncation (SSE2 `_mm_cvttsd_si64`; native on x86-64 only). */
SIMDE_FUNCTION_ATTRIBUTES
int64_t
simde_mm_cvttsd_si64 (simde__m128d a) {
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
    #if !defined(__PGI)
      return _mm_cvttsd_si64(a);
    #else
      /* PGI only provides the `x` suffixed spelling. */
      return _mm_cvttsd_si64x(a);
    #endif
  #else
    simde__m128d_private a_ = simde__m128d_to_private(a);
    /* NOTE(review): no range clamp here — out-of-range doubles are UB on
     * conversion; the 32-bit variants clamp.  Confirm intended. */
    return SIMDE_CONVERT_FTOI(int64_t, a_.f64[0]);
  #endif
}
#define simde_mm_cvttsd_si64x(a) simde_mm_cvttsd_si64(a)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
  #define _mm_cvttsd_si64(a) simde_mm_cvttsd_si64(a)
  #define _mm_cvttsd_si64x(a) simde_mm_cvttsd_si64x(a)
#endif
3110
/* Lane-wise double-precision division, a / b (SSE2 `_mm_div_pd`). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_div_pd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_div_pd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      /* GCC/Clang vector extensions divide element-wise. */
      r_.f64 = a_.f64 / b_.f64;
    #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vdivq_f64(a_.neon_f64, b_.neon_f64);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_f64x2_div(a_.wasm_v128, b_.wasm_v128);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
        r_.f64[i] = a_.f64[i] / b_.f64[i];
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_div_pd(a, b) simde_mm_div_pd(a, b)
#endif
3141
3142 SIMDE_FUNCTION_ATTRIBUTES
3143 simde__m128d
simde_mm_div_sd(simde__m128d a,simde__m128d b)3144 simde_mm_div_sd (simde__m128d a, simde__m128d b) {
3145 #if defined(SIMDE_X86_SSE2_NATIVE)
3146 return _mm_div_sd(a, b);
3147 #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
3148 return simde_mm_move_sd(a, simde_mm_div_pd(a, b));
3149 #else
3150 simde__m128d_private
3151 r_,
3152 a_ = simde__m128d_to_private(a),
3153 b_ = simde__m128d_to_private(b);
3154
3155 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3156 float64x2_t temp = vdivq_f64(a_.neon_f64, b_.neon_f64);
3157 r_.neon_f64 = vsetq_lane_f64(vgetq_lane(a_.neon_f64, 1), temp, 1);
3158 #else
3159 r_.f64[0] = a_.f64[0] / b_.f64[0];
3160 r_.f64[1] = a_.f64[1];
3161 #endif
3162
3163 return simde__m128d_from_private(r_);
3164 #endif
3165 }
3166 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3167 #define _mm_div_sd(a, b) simde_mm_div_sd(a, b)
3168 #endif
3169
/* Extract 16-bit lane `imm8` of `a`, zero-extended to 32 bits
 * (SSE2 `_mm_extract_epi16`). */
SIMDE_FUNCTION_ATTRIBUTES
int32_t
simde_mm_extract_epi16 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) {
  /* Read through the unsigned view so the widening is zero- (not sign-)
   * extension, matching PEXTRW. */
  uint16_t r;
  simde__m128i_private a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
    #if defined(SIMDE_BUG_GCC_95227)
      /* Silence spurious unused warnings triggered by the GCC bug. */
      (void) a_;
      (void) imm8;
    #endif
    r = HEDLEY_STATIC_CAST(uint16_t, vec_extract(a_.altivec_i16, imm8));
  #else
    r = a_.u16[imm8 & 7];
  #endif

  return HEDLEY_STATIC_CAST(int32_t, r);
}
#if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,6,0))
  #define simde_mm_extract_epi16(a, imm8) _mm_extract_epi16(a, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  #define simde_mm_extract_epi16(a, imm8) (HEDLEY_STATIC_CAST(int32_t, vgetq_lane_s16(simde__m128i_to_private(a).neon_i16, (imm8))) & (INT32_C(0x0000ffff)))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_extract_epi16(a, imm8) simde_mm_extract_epi16(a, imm8)
#endif
3197
/* Return `a` with 16-bit lane `imm8` replaced by `i`
 * (SSE2 `_mm_insert_epi16`). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_insert_epi16 (simde__m128i a, int16_t i, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) {
  simde__m128i_private a_ = simde__m128i_to_private(a);
  a_.i16[imm8 & 7] = i;
  return simde__m128i_from_private(a_);
}
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
  #define simde_mm_insert_epi16(a, i, imm8) _mm_insert_epi16((a), (i), (imm8))
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  #define simde_mm_insert_epi16(a, i, imm8) simde__m128i_from_neon_i16(vsetq_lane_s16((i), simde__m128i_to_neon_i16(a), (imm8)))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_insert_epi16(a, i, imm8) simde_mm_insert_epi16(a, i, imm8)
#endif
3214
/* Load two doubles from a 16-byte-aligned address (SSE2 `_mm_load_pd`). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_load_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_load_pd(mem_addr);
  #else
    simde__m128d_private r_;

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vld1q_f64(mem_addr);
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* A32 NEON has no f64 lanes; load the same 16 bytes as u32x4. */
      r_.neon_u32 = vld1q_u32(HEDLEY_REINTERPRET_CAST(uint32_t const*, mem_addr));
    #else
      /* Caller promises 16-byte alignment; tell the compiler so. */
      simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128d), sizeof(r_));
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_load_pd(mem_addr) simde_mm_load_pd(mem_addr)
#endif
3237
/* Load one double and broadcast it to both lanes (SSE2 `_mm_load1_pd` /
 * `_mm_load_pd1`). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_load1_pd (simde_float64 const* mem_addr) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_load1_pd(mem_addr);
  #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    return simde__m128d_from_neon_f64(vld1q_dup_f64(mem_addr));
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    return simde__m128d_from_wasm_v128(wasm_v64x2_load_splat(mem_addr));
  #else
    return simde_mm_set1_pd(*mem_addr);
  #endif
}
#define simde_mm_load_pd1(mem_addr) simde_mm_load1_pd(mem_addr)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_load_pd1(mem_addr) simde_mm_load1_pd(mem_addr)
  #define _mm_load1_pd(mem_addr) simde_mm_load1_pd(mem_addr)
#endif
3256
/* Load one double into lane 0 and zero lane 1 (SSE2 `_mm_load_sd`). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_load_sd (simde_float64 const* mem_addr) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_load_sd(mem_addr);
  #else
    simde__m128d_private r_;

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vsetq_lane_f64(*mem_addr, vdupq_n_f64(0), 0);
    #else
      r_.f64[0] = *mem_addr;
      /* Zero the upper lane bitwise via the u64 view. */
      r_.u64[1] = UINT64_C(0);
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_load_sd(mem_addr) simde_mm_load_sd(mem_addr)
#endif
3278
/* Load 128 bits of integer data from a 16-byte-aligned address
 * (SSE2 `_mm_load_si128`). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_load_si128 (simde__m128i const* mem_addr) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_load_si128(HEDLEY_REINTERPRET_CAST(__m128i const*, mem_addr));
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i32 = vec_ld(0, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(int) const*, mem_addr));
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr));
    #else
      /* Caller promises 16-byte alignment; tell the compiler so. */
      simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128i), sizeof(simde__m128i));
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_load_si128(mem_addr) simde_mm_load_si128(mem_addr)
#endif
3301
/* Load one double into lane 1; lane 0 is copied from `a`
 * (SSE2 `_mm_loadh_pd`).  mem_addr need not be aligned. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_loadh_pd (simde__m128d a, simde_float64 const* mem_addr) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_loadh_pd(a, mem_addr);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vcombine_f64(vget_low_f64(a_.neon_f64), vld1_f64(HEDLEY_REINTERPRET_CAST(const float64_t*, mem_addr)));
    #else
      simde_float64 t;

      /* memcpy avoids alignment and aliasing assumptions on mem_addr. */
      simde_memcpy(&t, mem_addr, sizeof(t));
      r_.f64[0] = a_.f64[0];
      r_.f64[1] = t;
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_loadh_pd(a, mem_addr) simde_mm_loadh_pd(a, mem_addr)
#endif
3328
3329 SIMDE_FUNCTION_ATTRIBUTES
3330 simde__m128i
simde_mm_loadl_epi64(simde__m128i const * mem_addr)3331 simde_mm_loadl_epi64 (simde__m128i const* mem_addr) {
3332 #if defined(SIMDE_X86_SSE2_NATIVE)
3333 return _mm_loadl_epi64(mem_addr);
3334 #else
3335 simde__m128i_private r_;
3336
3337 int64_t value;
3338 simde_memcpy(&value, mem_addr, sizeof(value));
3339
3340 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3341 r_.neon_i64 = vcombine_s64(vld1_s64(HEDLEY_REINTERPRET_CAST(int64_t const *, mem_addr)), vdup_n_s64(0));
3342 #else
3343 r_.i64[0] = value;
3344 r_.i64[1] = 0;
3345 #endif
3346
3347 return simde__m128i_from_private(r_);
3348 #endif
3349 }
3350 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3351 #define _mm_loadl_epi64(mem_addr) simde_mm_loadl_epi64(mem_addr)
3352 #endif
3353
/* Load one double into lane 0; lane 1 is copied from `a`
 * (SSE2 `_mm_loadl_pd`). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_loadl_pd (simde__m128d a, simde_float64 const* mem_addr) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_loadl_pd(a, mem_addr);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vcombine_f64(vld1_f64(
        HEDLEY_REINTERPRET_CAST(const float64_t*, mem_addr)), vget_high_f64(a_.neon_f64));
    #else
      r_.f64[0] = *mem_addr;
      /* Copy a's upper lane bitwise via the u64 view. */
      r_.u64[1] = a_.u64[1];
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_loadl_pd(a, mem_addr) simde_mm_loadl_pd(a, mem_addr)
#endif
3378
/* Load two doubles from a 16-byte-aligned address in reversed order:
 * result[0] = mem_addr[1], result[1] = mem_addr[0] (SSE2 `_mm_loadr_pd`). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_loadr_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_loadr_pd(mem_addr);
  #else
    simde__m128d_private
      r_;

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* Load then rotate the two 64-bit lanes to swap them. */
      r_.neon_f64 = vld1q_f64(mem_addr);
      r_.neon_f64 = vextq_f64(r_.neon_f64, r_.neon_f64, 1);
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i64 = vld1q_s64(HEDLEY_REINTERPRET_CAST(int64_t const *, mem_addr));
      r_.neon_i64 = vextq_s64(r_.neon_i64, r_.neon_i64, 1);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      v128_t tmp = wasm_v128_load(mem_addr);
      r_.wasm_v128 = wasm_v64x2_shuffle(tmp, tmp, 1, 0);
    #else
      r_.f64[0] = mem_addr[1];
      r_.f64[1] = mem_addr[0];
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_loadr_pd(mem_addr) simde_mm_loadr_pd(mem_addr)
#endif
3408
/* Load two doubles from an unaligned address (SSE2 `_mm_loadu_pd`). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_loadu_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_loadu_pd(mem_addr);
  #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    return vld1q_f64(mem_addr);
  #else
    simde__m128d_private r_;

    /* memcpy keeps the unaligned load free of alignment assumptions. */
    simde_memcpy(&r_, mem_addr, sizeof(r_));

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_loadu_pd(mem_addr) simde_mm_loadu_pd(mem_addr)
#endif
3427
/* Unaligned 128-bit load, 8-bit-element flavor (AVX-512VL/BW
 * `_mm_loadu_epi8`); functionally identical to `_mm_loadu_si128`. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_loadu_epi8(void const * mem_addr) {
  #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862)
    return _mm_loadu_epi8(mem_addr);
  #elif defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, mem_addr));
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i8 = vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr));
    #else
      simde_memcpy(&r_, mem_addr, sizeof(r_));
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#define simde_x_mm_loadu_epi8(mem_addr) simde_mm_loadu_epi8(mem_addr)
#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862)))
  #undef _mm_loadu_epi8
  #define _mm_loadu_epi8(a) simde_mm_loadu_epi8(a)
#endif
3452
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_loadu_epi16(void const * mem_addr) {
  /* Unaligned load of 8 x 16-bit lanes (_mm_loadu_epi16, AVX-512VL/BW).
   * Same bug-workaround gating as simde_mm_loadu_epi8. */
  #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862)
    return _mm_loadu_epi16(mem_addr);
  #elif defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, mem_addr));
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* Load as bytes (no alignment assumption), then reinterpret. */
      r_.neon_i16 = vreinterpretq_s16_s8(vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr)));
    #else
      simde_memcpy(&r_, mem_addr, sizeof(r_));
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#define simde_x_mm_loadu_epi16(mem_addr) simde_mm_loadu_epi16(mem_addr)
#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862)))
  #undef _mm_loadu_epi16
  #define _mm_loadu_epi16(a) simde_mm_loadu_epi16(a)
#endif
3477
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_loadu_epi32(void const * mem_addr) {
  /* Unaligned load of 4 x 32-bit lanes (_mm_loadu_epi32, AVX-512VL;
   * no BW requirement for the 32-bit form). */
  #if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862)
    return _mm_loadu_epi32(mem_addr);
  #elif defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, mem_addr));
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* Byte load avoids any alignment assumption; reinterpret to 32-bit. */
      r_.neon_i32 = vreinterpretq_s32_s8(vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr)));
    #else
      simde_memcpy(&r_, mem_addr, sizeof(r_));
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#define simde_x_mm_loadu_epi32(mem_addr) simde_mm_loadu_epi32(mem_addr)
#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862)))
  #undef _mm_loadu_epi32
  #define _mm_loadu_epi32(a) simde_mm_loadu_epi32(a)
#endif
3502
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_loadu_epi64(void const * mem_addr) {
  /* Unaligned load of 2 x 64-bit lanes (_mm_loadu_epi64, AVX-512VL). */
  #if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862)
    return _mm_loadu_epi64(mem_addr);
  #elif defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, mem_addr));
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* Byte load avoids any alignment assumption; reinterpret to 64-bit. */
      r_.neon_i64 = vreinterpretq_s64_s8(vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr)));
    #else
      simde_memcpy(&r_, mem_addr, sizeof(r_));
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#define simde_x_mm_loadu_epi64(mem_addr) simde_mm_loadu_epi64(mem_addr)
#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862)))
  #undef _mm_loadu_epi64
  #define _mm_loadu_epi64(a) simde_mm_loadu_epi64(a)
#endif
3527
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_loadu_si128 (void const* mem_addr) {
  /* Unaligned 128-bit load (_mm_loadu_si128). */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_loadu_si128(HEDLEY_STATIC_CAST(__m128i const*, mem_addr));
  #else
    simde__m128i_private r_;

    #if HEDLEY_GNUC_HAS_ATTRIBUTE(may_alias,3,3,0)
      HEDLEY_DIAGNOSTIC_PUSH
      SIMDE_DIAGNOSTIC_DISABLE_PACKED_
      /* A packed, may_alias wrapper struct lets GCC/Clang emit an
       * unaligned load directly instead of going through memcpy. */
      struct simde_mm_loadu_si128_s {
        __typeof__(r_) v;
      } __attribute__((__packed__, __may_alias__));
      r_ = HEDLEY_REINTERPRET_CAST(const struct simde_mm_loadu_si128_s *, mem_addr)->v;
      HEDLEY_DIAGNOSTIC_POP
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i8 = vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr));
    #else
      simde_memcpy(&r_, mem_addr, sizeof(r_));
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_loadu_si128(mem_addr) simde_mm_loadu_si128(mem_addr)
#endif
3556
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_madd_epi16 (simde__m128i a, simde__m128i b) {
  /* _mm_madd_epi16: multiply corresponding signed 16-bit lanes to 32-bit
   * products, then horizontally add adjacent product pairs, producing
   * four 32-bit results. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_madd_epi16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* Widening multiply of low/high halves, then pairwise add. */
      int32x4_t pl = vmull_s16(vget_low_s16(a_.neon_i16), vget_low_s16(b_.neon_i16));
      int32x4_t ph = vmull_high_s16(a_.neon_i16, b_.neon_i16);
      r_.neon_i32 = vpaddq_s32(pl, ph);
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* ARMv7 lacks vpaddq; pairwise-add 64-bit halves and recombine. */
      int32x4_t pl = vmull_s16(vget_low_s16(a_.neon_i16), vget_low_s16(b_.neon_i16));
      int32x4_t ph = vmull_s16(vget_high_s16(a_.neon_i16), vget_high_s16(b_.neon_i16));
      int32x2_t rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
      int32x2_t rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
      r_.neon_i32 = vcombine_s32(rl, rh);
    #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
      /* vec_msum with a zero accumulator is exactly madd. */
      static const SIMDE_POWER_ALTIVEC_VECTOR(int) tz = { 0, 0, 0, 0 };
      r_.altivec_i32 = vec_msum(a_.altivec_i16, b_.altivec_i16, tz);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i += 2) {
        r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) + (a_.i16[i + 1] * b_.i16[i + 1]);
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_madd_epi16(a, b) simde_mm_madd_epi16(a, b)
#endif
3594
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_maskmoveu_si128 (simde__m128i a, simde__m128i mask, int8_t mem_addr[HEDLEY_ARRAY_PARAM(16)]) {
  /* _mm_maskmoveu_si128: conditionally store each byte of a to mem_addr
   * when the high bit of the corresponding mask byte is set.  Bytes whose
   * mask bit is clear are left untouched in memory. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_maskmoveu_si128(a, mask, HEDLEY_REINTERPRET_CAST(char*, mem_addr));
  #else
    simde__m128i_private
      a_ = simde__m128i_to_private(a),
      mask_ = simde__m128i_to_private(mask);

    for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++) {
      if (mask_.u8[i] & 0x80) {
        mem_addr[i] = a_.i8[i];
      }
    }
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_maskmoveu_si128(a, mask, mem_addr) simde_mm_maskmoveu_si128((a), (mask), SIMDE_CHECKED_REINTERPRET_CAST(int8_t*, char*, (mem_addr)))
#endif
3615
SIMDE_FUNCTION_ATTRIBUTES
int32_t
simde_mm_movemask_epi8 (simde__m128i a) {
  /* _mm_movemask_epi8: gather the sign (high) bit of each of the 16 bytes
   * of a into the low 16 bits of the result. */
  #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__INTEL_COMPILER)
    /* ICC has trouble with _mm_movemask_epi8 at -O2 and above: */
    return _mm_movemask_epi8(a);
  #else
    int32_t r = 0;
    simde__m128i_private a_ = simde__m128i_to_private(a);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      // Use increasingly wide shifts+adds to collect the sign bits
      // together.
      // Since the widening shifts would be rather confusing to follow in little endian, everything
      // will be illustrated in big endian order instead. This has a different result - the bits
      // would actually be reversed on a big endian machine.

      // Starting input (only half the elements are shown):
      // 89 ff 1d c0 00 10 99 33
      uint8x16_t input = a_.neon_u8;

      // Shift out everything but the sign bits with an unsigned shift right.
      //
      // Bytes of the vector::
      // 89 ff 1d c0 00 10 99 33
      // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
      // |  |  |  |  |  |  |  |
      // 01 01 00 01 00 00 01 00
      //
      // Bits of first important lane(s):
      // 10001001 (89)
      // \______
      //        |
      // 00000001 (01)
      uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));

      // Merge the even lanes together with a 16-bit unsigned shift right + add.
      // 'xx' represents garbage data which will be ignored in the final result.
      // In the important bytes, the add functions like a binary OR.
      //
      // 01 01 00 01 00 00 01 00
      //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
      //    \|    \|    \|    \|
      // xx 03 xx 01 xx 00 xx 02
      //
      // 00000001 00000001 (01 01)
      //        \_______ |
      //                \|
      // xxxxxxxx xxxxxx11 (xx 03)
      uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));

      // Repeat with a wider 32-bit shift + add.
      // xx 03 xx 01 xx 00 xx 02
      //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >> 14))
      //          \|          \|
      // xx xx xx 0d xx xx xx 02
      //
      // 00000011 00000001 (03 01)
      //        \\_____ ||
      //         '----.\||
      // xxxxxxxx xxxx1101 (xx 0d)
      uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));

      // Last, an even wider 64-bit shift + add to get our result in the low 8 bit lanes.
      // xx xx xx 0d xx xx xx 02
      //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >> 28))
      //                      \|
      // xx xx xx xx xx xx xx d2
      //
      // 00001101 00000010 (0d 02)
      //     \   \___ |  |
      //      '---.  \|  |
      // xxxxxxxx 11010010 (xx d2)
      uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));

      // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
      // xx xx xx xx xx xx xx d2
      //                      ||  return paired64[0]
      //                      d2
      // Note: Little endian would return the correct value 4b (01001011) instead.
      r = vgetq_lane_u8(paired64, 0) | (HEDLEY_STATIC_CAST(int32_t, vgetq_lane_u8(paired64, 8)) << 8);
    #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(HEDLEY_IBM_VERSION) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
      /* vec_vbpermq gathers the bits selected by perm (here, the sign bit
       * of each byte, highest byte first); result lives in element 1 on LE. */
      static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) perm = { 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0 };
      r = HEDLEY_STATIC_CAST(int32_t, vec_extract(vec_vbpermq(a_.altivec_u8, perm), 1));
    #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(HEDLEY_IBM_VERSION) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG)
      /* Same gather; on BE the 16-bit result sits at halfword index 14. */
      static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) perm = { 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0 };
      r = HEDLEY_STATIC_CAST(int32_t, vec_extract(vec_vbpermq(a_.altivec_u8, perm), 14));
    #else
      SIMDE_VECTORIZE_REDUCTION(|:r)
      for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) {
        r |= (a_.u8[15 - i] >> 7) << (15 - i);
      }
    #endif

    return r;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_movemask_epi8(a) simde_mm_movemask_epi8(a)
#endif
3716
SIMDE_FUNCTION_ATTRIBUTES
int32_t
simde_mm_movemask_pd (simde__m128d a) {
  /* _mm_movemask_pd: pack the sign bits of the two doubles into the low
   * two bits of the result. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_movemask_pd(a);
  #else
    int32_t r = 0;
    simde__m128d_private a_ = simde__m128d_to_private(a);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* Isolate each sign bit (>> 63), shift lane i left by i, then
       * horizontally add so bit i of the result is lane i's sign. */
      static const int64_t shift_amount[] = { 0, 1 };
      const int64x2_t shift = vld1q_s64(shift_amount);
      uint64x2_t tmp = vshrq_n_u64(a_.neon_u64, 63);
      return HEDLEY_STATIC_CAST(int32_t, vaddvq_u64(vshlq_u64(tmp, shift)));
    #else
      SIMDE_VECTORIZE_REDUCTION(|:r)
      for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
        r |= (a_.u64[i] >> 63) << i;
      }
    #endif

    return r;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_movemask_pd(a) simde_mm_movemask_pd(a)
#endif
3744
SIMDE_FUNCTION_ATTRIBUTES
simde__m64
simde_mm_movepi64_pi64 (simde__m128i a) {
  /* _mm_movepi64_pi64: copy the low 64-bit lane of a into an MMX register. */
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    return _mm_movepi64_pi64(a);
  #else
    simde__m64_private r_;
    simde__m128i_private a_ = simde__m128i_to_private(a);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_i64 = vget_low_s64(a_.neon_i64);
    #else
      r_.i64[0] = a_.i64[0];
    #endif

    return simde__m64_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_movepi64_pi64(a) simde_mm_movepi64_pi64(a)
#endif
3766
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_movpi64_epi64 (simde__m64 a) {
  /* _mm_movpi64_epi64: widen an MMX value to 128 bits, zeroing the
   * upper 64-bit lane. */
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    return _mm_movpi64_epi64(a);
  #else
    simde__m128i_private r_;
    simde__m64_private a_ = simde__m64_to_private(a);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i64 = vcombine_s64(a_.neon_i64, vdup_n_s64(0));
    #else
      r_.i64[0] = a_.i64[0];
      r_.i64[1] = 0;
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_movpi64_epi64(a) simde_mm_movpi64_epi64(a)
#endif
3789
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_min_epi16 (simde__m128i a, simde__m128i b) {
  /* _mm_min_epi16: lane-wise minimum of signed 16-bit integers. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_min_epi16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i16 = vminq_s16(a_.neon_i16, b_.neon_i16);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i16x8_min(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
      r_.altivec_i16 = vec_min(a_.altivec_i16, b_.altivec_i16);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_min_epi16(a, b) simde_mm_min_epi16(a, b)
#endif
3820
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_min_epu8 (simde__m128i a, simde__m128i b) {
  /* _mm_min_epu8: lane-wise minimum of unsigned 8-bit integers. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_min_epu8(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u8 = vminq_u8(a_.neon_u8, b_.neon_u8);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_u8x16_min(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
      r_.altivec_u8 = vec_min(a_.altivec_u8, b_.altivec_u8);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
        r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_min_epu8(a, b) simde_mm_min_epu8(a, b)
#endif
3851
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_min_pd (simde__m128d a, simde__m128d b) {
  /* _mm_min_pd: lane-wise minimum of two doubles.
   * NOTE(review): NaN handling in the fallback follows (a < b) ? a : b,
   * which returns b when either operand is NaN — matches the x86
   * MINPD second-operand rule for the a-is-NaN case; confirm for b-NaN. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_min_pd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
      r_.altivec_f64 = vec_min(a_.altivec_f64, b_.altivec_f64);
    #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vminq_f64(a_.neon_f64, b_.neon_f64);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_f64x2_min(a_.wasm_v128, b_.wasm_v128);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
        r_.f64[i] = (a_.f64[i] < b_.f64[i]) ? a_.f64[i] : b_.f64[i];
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_min_pd(a, b) simde_mm_min_pd(a, b)
#endif
3882
3883 SIMDE_FUNCTION_ATTRIBUTES
3884 simde__m128d
simde_mm_min_sd(simde__m128d a,simde__m128d b)3885 simde_mm_min_sd (simde__m128d a, simde__m128d b) {
3886 #if defined(SIMDE_X86_SSE2_NATIVE)
3887 return _mm_min_sd(a, b);
3888 #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
3889 return simde_mm_move_sd(a, simde_mm_min_pd(a, b));
3890 #else
3891 simde__m128d_private
3892 r_,
3893 a_ = simde__m128d_to_private(a),
3894 b_ = simde__m128d_to_private(b);
3895
3896 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3897 float64x2_t temp = vminq_f64(a_.neon_f64, b_.neon_f64);
3898 r_.neon_f64 = vsetq_lane_f64(vgetq_lane(a_.neon_f64, 1), temp, 1);
3899 #else
3900 r_.f64[0] = (a_.f64[0] < b_.f64[0]) ? a_.f64[0] : b_.f64[0];
3901 r_.f64[1] = a_.f64[1];
3902 #endif
3903
3904 return simde__m128d_from_private(r_);
3905 #endif
3906 }
3907 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3908 #define _mm_min_sd(a, b) simde_mm_min_sd(a, b)
3909 #endif
3910
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_max_epi16 (simde__m128i a, simde__m128i b) {
  /* _mm_max_epi16: lane-wise maximum of signed 16-bit integers. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_max_epi16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i16 = vmaxq_s16(a_.neon_i16, b_.neon_i16);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i16x8_max(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
      r_.altivec_i16 = vec_max(a_.altivec_i16, b_.altivec_i16);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? a_.i16[i] : b_.i16[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_max_epi16(a, b) simde_mm_max_epi16(a, b)
#endif
3941
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_max_epu8 (simde__m128i a, simde__m128i b) {
  /* _mm_max_epu8: lane-wise maximum of unsigned 8-bit integers. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_max_epu8(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u8 = vmaxq_u8(a_.neon_u8, b_.neon_u8);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_u8x16_max(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
      r_.altivec_u8 = vec_max(a_.altivec_u8, b_.altivec_u8);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
        r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_max_epu8(a, b) simde_mm_max_epu8(a, b)
#endif
3972
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_max_pd (simde__m128d a, simde__m128d b) {
  /* _mm_max_pd: lane-wise maximum of two doubles. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_max_pd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
      r_.altivec_f64 = vec_max(a_.altivec_f64, b_.altivec_f64);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_f64x2_max(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vmaxq_f64(a_.neon_f64, b_.neon_f64);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
        r_.f64[i] = (a_.f64[i] > b_.f64[i]) ? a_.f64[i] : b_.f64[i];
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_max_pd(a, b) simde_mm_max_pd(a, b)
#endif
4003
4004 SIMDE_FUNCTION_ATTRIBUTES
4005 simde__m128d
simde_mm_max_sd(simde__m128d a,simde__m128d b)4006 simde_mm_max_sd (simde__m128d a, simde__m128d b) {
4007 #if defined(SIMDE_X86_SSE2_NATIVE)
4008 return _mm_max_sd(a, b);
4009 #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
4010 return simde_mm_move_sd(a, simde_mm_max_pd(a, b));
4011 #else
4012 simde__m128d_private
4013 r_,
4014 a_ = simde__m128d_to_private(a),
4015 b_ = simde__m128d_to_private(b);
4016
4017 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4018 float64x2_t temp = vmaxq_f64(a_.neon_f64, b_.neon_f64);
4019 r_.neon_f64 = vsetq_lane_f64(vgetq_lane(a_.neon_f64, 1), temp, 1);
4020 #else
4021 r_.f64[0] = (a_.f64[0] > b_.f64[0]) ? a_.f64[0] : b_.f64[0];
4022 r_.f64[1] = a_.f64[1];
4023 #endif
4024
4025 return simde__m128d_from_private(r_);
4026 #endif
4027 }
4028 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4029 #define _mm_max_sd(a, b) simde_mm_max_sd(a, b)
4030 #endif
4031
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_move_epi64 (simde__m128i a) {
  /* _mm_move_epi64: keep the low 64-bit lane of a, zero the upper lane. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_move_epi64(a);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i64 = vsetq_lane_s64(0, a_.neon_i64, 1);
    #else
      r_.i64[0] = a_.i64[0];
      r_.i64[1] = 0;
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_move_epi64(a) simde_mm_move_epi64(a)
#endif
4055
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_mul_epu32 (simde__m128i a, simde__m128i b) {
  /* _mm_mul_epu32: multiply the even-indexed unsigned 32-bit lanes
   * (lanes 0 and 2) to full 64-bit products. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_mul_epu32(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* vmovn_u64 keeps the low 32 bits of each 64-bit lane, i.e. the
       * even u32 lanes; vmull widens the products back to 64 bits. */
      uint32x2_t a_lo = vmovn_u64(a_.neon_u64);
      uint32x2_t b_lo = vmovn_u64(b_.neon_u64);
      r_.neon_u64 = vmull_u32(a_lo, b_lo);
    #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
      /* Zero the odd lanes so a plain 64-bit multiply of the reinterpreted
       * vectors yields the widened products of the even lanes. */
      __typeof__(a_.u32) z = { 0, };
      a_.u32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.u32, z, 0, 4, 2, 6);
      b_.u32 = SIMDE_SHUFFLE_VECTOR_(32, 16, b_.u32, z, 0, 4, 2, 6);
      r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), a_.u32) *
               HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), b_.u32);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
        r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[i * 2]) * HEDLEY_STATIC_CAST(uint64_t, b_.u32[i * 2]);
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_mul_epu32(a, b) simde_mm_mul_epu32(a, b)
#endif
4090
4091 SIMDE_FUNCTION_ATTRIBUTES
4092 simde__m128i
simde_x_mm_mul_epi64(simde__m128i a,simde__m128i b)4093 simde_x_mm_mul_epi64 (simde__m128i a, simde__m128i b) {
4094 simde__m128i_private
4095 r_,
4096 a_ = simde__m128i_to_private(a),
4097 b_ = simde__m128i_to_private(b);
4098
4099 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
4100 r_.i64 = a_.i64 * b_.i64;
4101 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4102 r_.neon_f64 = vmulq_s64(a_.neon_f64, b_.neon_f64);
4103 #else
4104 SIMDE_VECTORIZE
4105 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
4106 r_.i64[i] = a_.i64[i] * b_.i64[i];
4107 }
4108 #endif
4109
4110 return simde__m128i_from_private(r_);
4111 }
4112
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_mod_epi64 (simde__m128i a, simde__m128i b) {
  /* SIMDe extension: lane-wise signed 64-bit remainder (a % b).
   * NOTE(review): like C's %, a zero divisor or INT64_MIN % -1 in any
   * lane is undefined behavior — callers must ensure valid divisors. */
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.i64 = a_.i64 % b_.i64;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
      r_.i64[i] = a_.i64[i] % b_.i64[i];
    }
  #endif

  return simde__m128i_from_private(r_);
}
4132
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_mul_pd (simde__m128d a, simde__m128d b) {
  /* _mm_mul_pd: lane-wise multiplication of two doubles. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_mul_pd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.f64 = a_.f64 * b_.f64;
    #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vmulq_f64(a_.neon_f64, b_.neon_f64);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_f64x2_mul(a_.wasm_v128, b_.wasm_v128);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
        r_.f64[i] = a_.f64[i] * b_.f64[i];
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_mul_pd(a, b) simde_mm_mul_pd(a, b)
#endif
4163
4164 SIMDE_FUNCTION_ATTRIBUTES
4165 simde__m128d
simde_mm_mul_sd(simde__m128d a,simde__m128d b)4166 simde_mm_mul_sd (simde__m128d a, simde__m128d b) {
4167 #if defined(SIMDE_X86_SSE2_NATIVE)
4168 return _mm_mul_sd(a, b);
4169 #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
4170 return simde_mm_move_sd(a, simde_mm_mul_pd(a, b));
4171 #else
4172 simde__m128d_private
4173 r_,
4174 a_ = simde__m128d_to_private(a),
4175 b_ = simde__m128d_to_private(b);
4176
4177 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4178 float64x2_t temp = vmulq_f64(a_.neon_f64, b_.neon_f64);
4179 r_.neon_f64 = vsetq_lane_f64(vgetq_lane(a_.neon_f64, 1), temp, 1);
4180 #else
4181 r_.f64[0] = a_.f64[0] * b_.f64[0];
4182 r_.f64[1] = a_.f64[1];
4183 #endif
4184
4185 return simde__m128d_from_private(r_);
4186 #endif
4187 }
4188 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4189 #define _mm_mul_sd(a, b) simde_mm_mul_sd(a, b)
4190 #endif
4191
SIMDE_FUNCTION_ATTRIBUTES
simde__m64
simde_mm_mul_su32 (simde__m64 a, simde__m64 b) {
  /* _mm_mul_su32: multiply the low unsigned 32-bit values of two MMX
   * registers into a 64-bit product. */
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
    return _mm_mul_su32(a, b);
  #else
    simde__m64_private
      r_,
      a_ = simde__m64_to_private(a),
      b_ = simde__m64_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* Widening 32x32->64 multiply; only lane 0 of the result is kept. */
      r_.u64[0] = vget_lane_u64(vget_low_u64(vmull_u32(vreinterpret_u32_s64(a_.neon_i64), vreinterpret_u32_s64(b_.neon_i64))), 0);
    #else
      r_.u64[0] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[0]) * HEDLEY_STATIC_CAST(uint64_t, b_.u32[0]);
    #endif

    return simde__m64_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_mul_su32(a, b) simde_mm_mul_su32(a, b)
#endif
4215
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_mulhi_epi16 (simde__m128i a, simde__m128i b) {
  /* _mm_mulhi_epi16: lane-wise signed 16-bit multiply, keeping the high
   * 16 bits of each 32-bit product. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_mulhi_epi16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* Widening multiplies, then an unzip keeps the odd (high) 16-bit
       * halves of the 32-bit products. */
      int16x4_t a3210 = vget_low_s16(a_.neon_i16);
      int16x4_t b3210 = vget_low_s16(b_.neon_i16);
      int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
      #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
        int32x4_t ab7654 = vmull_high_s16(a_.neon_i16, b_.neon_i16);
        r_.neon_i16 = vuzp2q_s16(vreinterpretq_s16_s32(ab3210), vreinterpretq_s16_s32(ab7654));
      #else
        int16x4_t a7654 = vget_high_s16(a_.neon_i16);
        int16x4_t b7654 = vget_high_s16(b_.neon_i16);
        int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
        uint16x8x2_t rv = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
        r_.neon_u16 = rv.val[1];
      #endif
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        /* The unsigned cast before the shift avoids implementation-defined
         * right-shift of a negative value; the stored bit pattern matches. */
        r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (HEDLEY_STATIC_CAST(uint32_t, HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) >> 16));
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_mulhi_epi16(a, b) simde_mm_mulhi_epi16(a, b)
#endif
4254
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_mulhi_epu16 (simde__m128i a, simde__m128i b) {
  /* _mm_mulhi_epu16: lane-wise unsigned 16-bit multiply, keeping the
   * high 16 bits of each 32-bit product. */
  #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
    return _mm_mulhi_epu16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* Widening multiplies, then an unzip keeps the odd (high) 16-bit
       * halves of the 32-bit products. */
      uint16x4_t a3210 = vget_low_u16(a_.neon_u16);
      uint16x4_t b3210 = vget_low_u16(b_.neon_u16);
      uint32x4_t ab3210 = vmull_u16(a3210, b3210); /* 3333222211110000 */
      #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
        uint32x4_t ab7654 = vmull_high_u16(a_.neon_u16, b_.neon_u16);
        r_.neon_u16 = vuzp2q_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
      #else
        uint16x4_t a7654 = vget_high_u16(a_.neon_u16);
        uint16x4_t b7654 = vget_high_u16(b_.neon_u16);
        uint32x4_t ab7654 = vmull_u16(a7654, b7654); /* 7777666655554444 */
        uint16x8x2_t neon_r = vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
        r_.neon_u16 = neon_r.val[1];
      #endif
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
        r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]) >> 16);
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_mulhi_epu16(a, b) simde_mm_mulhi_epu16(a, b)
#endif
4293
4294 SIMDE_FUNCTION_ATTRIBUTES
4295 simde__m128i
simde_mm_mullo_epi16(simde__m128i a,simde__m128i b)4296 simde_mm_mullo_epi16 (simde__m128i a, simde__m128i b) {
4297 #if defined(SIMDE_X86_SSE2_NATIVE)
4298 return _mm_mullo_epi16(a, b);
4299 #else
4300 simde__m128i_private
4301 r_,
4302 a_ = simde__m128i_to_private(a),
4303 b_ = simde__m128i_to_private(b);
4304
4305 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4306 r_.neon_i16 = vmulq_s16(a_.neon_i16, b_.neon_i16);
4307 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
4308 (void) a_;
4309 (void) b_;
4310 r_.altivec_i16 = vec_mul(a_.altivec_i16, b_.altivec_i16);
4311 #else
4312 SIMDE_VECTORIZE
4313 for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
4314 r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]));
4315 }
4316 #endif
4317
4318 return simde__m128i_from_private(r_);
4319 #endif
4320 }
4321 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4322 #define _mm_mullo_epi16(a, b) simde_mm_mullo_epi16(a, b)
4323 #endif
4324
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_or_pd (simde__m128d a, simde__m128d b) {
  /* _mm_or_pd: bitwise OR of two double-precision vectors.  OR operates on
   * the raw bit pattern only, so every fallback works through the integer
   * view (i32f) of the union rather than the doubles themselves. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_or_pd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32f = a_.i32f | b_.i32f;
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_v128_or(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i64 = vorrq_s64(a_.neon_i64, b_.neon_i64);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
        r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_or_pd(a, b) simde_mm_or_pd(a, b)
#endif
4355
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_or_si128 (simde__m128i a, simde__m128i b) {
  /* _mm_or_si128: bitwise OR of two 128-bit integer vectors.  Lane width
   * is irrelevant for OR, so each backend uses whatever element type is
   * convenient (s32 on NEON/AltiVec, i32f portably). */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_or_si128(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i32 = vorrq_s32(a_.neon_i32, b_.neon_i32);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i32 = vec_or(a_.altivec_i32, b_.altivec_i32);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32f = a_.i32f | b_.i32f;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
        r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_or_si128(a, b) simde_mm_or_si128(a, b)
#endif
4386
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_packs_epi16 (simde__m128i a, simde__m128i b) {
  /* _mm_packs_epi16: narrow 16-bit -> 8-bit with signed saturation.
   * Lanes from a fill result bytes 0-7, lanes from b fill bytes 8-15. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_packs_epi16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* vqmovn_s16 performs the saturating narrow in hardware. */
      r_.neon_i8 = vcombine_s8(vqmovn_s16(a_.neon_i16), vqmovn_s16(b_.neon_i16));
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        /* Clamp each lane to [INT8_MIN, INT8_MAX] before narrowing. */
        r_.i8[i] = (a_.i16[i] > INT8_MAX) ? INT8_MAX : ((a_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, a_.i16[i]));
        r_.i8[i + 8] = (b_.i16[i] > INT8_MAX) ? INT8_MAX : ((b_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, b_.i16[i]));
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_packs_epi16(a, b) simde_mm_packs_epi16(a, b)
#endif
4414
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_packs_epi32 (simde__m128i a, simde__m128i b) {
  /* _mm_packs_epi32: narrow 32-bit -> 16-bit with signed saturation.
   * Lanes from a fill result elements 0-3, lanes from b fill 4-7. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_packs_epi32(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i16 = vcombine_s16(vqmovn_s32(a_.neon_i32), vqmovn_s32(b_.neon_i32));
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i16 = vec_packs(a_.altivec_i32, b_.altivec_i32);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        /* Clamp each lane to [INT16_MIN, INT16_MAX] before narrowing. */
        r_.i16[i] = (a_.i32[i] > INT16_MAX) ? INT16_MAX : ((a_.i32[i] < INT16_MIN) ? INT16_MIN : HEDLEY_STATIC_CAST(int16_t, a_.i32[i]));
        r_.i16[i + 4] = (b_.i32[i] > INT16_MAX) ? INT16_MAX : ((b_.i32[i] < INT16_MIN) ? INT16_MIN : HEDLEY_STATIC_CAST(int16_t, b_.i32[i]));
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_packs_epi32(a, b) simde_mm_packs_epi32(a, b)
#endif
4444
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_packus_epi16 (simde__m128i a, simde__m128i b) {
  /* _mm_packus_epi16: narrow signed 16-bit -> unsigned 8-bit with
   * saturation (negative -> 0, > 255 -> 255).  a fills bytes 0-7,
   * b fills bytes 8-15. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_packus_epi16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* vqmovun_s16: saturating signed-to-unsigned narrow. */
      r_.neon_u8 = vcombine_u8(vqmovun_s16(a_.neon_i16), vqmovun_s16(b_.neon_i16));
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_u8 = vec_packsu(a_.altivec_i16, b_.altivec_i16);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        r_.u8[i] = (a_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((a_.i16[i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, a_.i16[i]));
        r_.u8[i + 8] = (b_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((b_.i16[i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, b_.i16[i]));
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_packus_epi16(a, b) simde_mm_packus_epi16(a, b)
#endif
4474
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_pause (void) {
  /* _mm_pause: spin-wait loop hint.  Purely an optimization hint, so on
   * targets without the native intrinsic this is simply a no-op. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_pause();
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_pause() (simde_mm_pause())
#endif
4485
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sad_epu8 (simde__m128i a, simde__m128i b) {
  /* _mm_sad_epu8: sum of absolute differences of unsigned bytes.  Each
   * 8-byte half is reduced independently; the two sums land in the low
   * 16 bits of the two 64-bit result lanes (upper bits zero). */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_sad_epu8(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* vabdq: per-byte absolute difference; vpaddl chain: pairwise
       * widening adds that reduce u8 -> u16 -> u32 -> u64 per half. */
      const uint16x8_t t = vpaddlq_u8(vabdq_u8(a_.neon_u8, b_.neon_u8));
      r_.neon_u64 = vcombine_u64(
        vpaddl_u32(vpaddl_u16(vget_low_u16(t))),
        vpaddl_u32(vpaddl_u16(vget_high_u16(t))));
    #else
      for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
        /* Max possible sum is 8 * 255 = 2040, so uint16_t cannot overflow. */
        uint16_t tmp = 0;
        SIMDE_VECTORIZE_REDUCTION(+:tmp)
        for (size_t j = 0 ; j < ((sizeof(r_.u8) / sizeof(r_.u8[0])) / 2) ; j++) {
          const size_t e = j + (i * 8);
          tmp += (a_.u8[e] > b_.u8[e]) ? (a_.u8[e] - b_.u8[e]) : (b_.u8[e] - a_.u8[e]);
        }
        r_.i64[i] = tmp;
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_sad_epu8(a, b) simde_mm_sad_epu8(a, b)
#endif
4520
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set_epi8 (int8_t e15, int8_t e14, int8_t e13, int8_t e12,
                   int8_t e11, int8_t e10, int8_t  e9, int8_t  e8,
                   int8_t  e7, int8_t  e6, int8_t  e5, int8_t  e4,
                   int8_t  e3, int8_t  e2, int8_t  e1, int8_t  e0) {
  /* _mm_set_epi8: build a vector of 16 signed bytes.  Arguments are given
   * most-significant-first (e15..e0) but stored least-significant-first,
   * hence the reversed (e0-first) order in the non-native paths. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_set_epi8(
      e15, e14, e13, e12, e11, e10,  e9,  e8,
       e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0);
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i8x16_make(
         e0,  e1,  e2,  e3,  e4,  e5,  e6,  e7,
         e8,  e9, e10, e11, e12, e13, e14, e15);
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* vld1q_s8 wants a 16-byte-aligned buffer in memory (low-to-high). */
      SIMDE_ALIGN_LIKE_16(int8x16_t) int8_t data[16] = {
        e0,  e1,  e2,  e3,
        e4,  e5,  e6,  e7,
        e8,  e9,  e10, e11,
        e12, e13, e14, e15};
      r_.neon_i8 = vld1q_s8(data);
    #else
      r_.i8[ 0] =  e0;
      r_.i8[ 1] =  e1;
      r_.i8[ 2] =  e2;
      r_.i8[ 3] =  e3;
      r_.i8[ 4] =  e4;
      r_.i8[ 5] =  e5;
      r_.i8[ 6] =  e6;
      r_.i8[ 7] =  e7;
      r_.i8[ 8] =  e8;
      r_.i8[ 9] =  e9;
      r_.i8[10] = e10;
      r_.i8[11] = e11;
      r_.i8[12] = e12;
      r_.i8[13] = e13;
      r_.i8[14] = e14;
      r_.i8[15] = e15;
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)
#endif
4571
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set_epi16 (int16_t e7, int16_t e6, int16_t e5, int16_t e4,
                    int16_t e3, int16_t e2, int16_t e1, int16_t e0) {
  /* _mm_set_epi16: build a vector of 8 signed 16-bit values.  Arguments
   * are most-significant-first; storage is least-significant-first. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      SIMDE_ALIGN_LIKE_16(int16x8_t) int16_t data[8] = { e0, e1, e2, e3, e4, e5, e6, e7 };
      r_.neon_i16 = vld1q_s16(data);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i16x8_make(e0, e1, e2, e3, e4, e5, e6, e7);
    #else
      r_.i16[0] = e0;
      r_.i16[1] = e1;
      r_.i16[2] = e2;
      r_.i16[3] = e3;
      r_.i16[4] = e4;
      r_.i16[5] = e5;
      r_.i16[6] = e6;
      r_.i16[7] = e7;
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0)
#endif
4603
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_loadu_si16 (void const* mem_addr) {
  /* _mm_loadu_si16: load a single (possibly unaligned) 16-bit value into
   * lane 0, zeroing the rest of the vector.  The native intrinsic only
   * exists on newer compilers (clang 8+, GCC 11+, ICC), so older ones get
   * the memcpy fallback -- memcpy is the strict-aliasing-safe way to read
   * unaligned memory. */
  #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
      SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
      HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
      HEDLEY_INTEL_VERSION_CHECK(20,21,1))
    return _mm_loadu_si16(mem_addr);
  #else
    int16_t val;
    simde_memcpy(&val, mem_addr, sizeof(val));
    return simde_x_mm_cvtsi16_si128(val);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_loadu_si16(mem_addr) simde_mm_loadu_si16(mem_addr)
#endif
4621
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set_epi32 (int32_t e3, int32_t e2, int32_t e1, int32_t e0) {
  /* _mm_set_epi32: build a vector of 4 signed 32-bit values.  Arguments
   * are most-significant-first; storage is least-significant-first. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_set_epi32(e3, e2, e1, e0);
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      SIMDE_ALIGN_LIKE_16(int32x4_t) int32_t data[4] = { e0, e1, e2, e3 };
      r_.neon_i32 = vld1q_s32(data);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i32x4_make(e0, e1, e2, e3);
    #else
      r_.i32[0] = e0;
      r_.i32[1] = e1;
      r_.i32[2] = e2;
      r_.i32[3] = e3;
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_set_epi32(e3, e2, e1, e0) simde_mm_set_epi32(e3, e2, e1, e0)
#endif
4648
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_loadu_si32 (void const* mem_addr) {
  /* _mm_loadu_si32: load a single (possibly unaligned) 32-bit value into
   * lane 0, zeroing the rest.  Native intrinsic is gated on compiler
   * versions that actually provide it; otherwise read via memcpy (the
   * strict-aliasing-safe unaligned load) and splat through cvtsi32. */
  #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
      SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
      HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
      HEDLEY_INTEL_VERSION_CHECK(20,21,1))
    return _mm_loadu_si32(mem_addr);
  #else
    int32_t val;
    simde_memcpy(&val, mem_addr, sizeof(val));
    return simde_mm_cvtsi32_si128(val);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_loadu_si32(mem_addr) simde_mm_loadu_si32(mem_addr)
#endif
4666
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set_epi64 (simde__m64 e1, simde__m64 e0) {
  /* _mm_set_epi64: build a 128-bit vector from two __m64 halves; e0 is
   * the low half.  Requires native MMX as well as SSE2 for the native
   * path since __m64 is an MMX type. */
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    return _mm_set_epi64(e1, e0);
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i64 = vcombine_s64(simde__m64_to_neon_i64(e0), simde__m64_to_neon_i64(e1));
    #else
      r_.m64[0] = e0;
      r_.m64[1] = e1;
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_set_epi64(e1, e0) (simde_mm_set_epi64((e1), (e0)))
#endif
4688
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set_epi64x (int64_t e1, int64_t e0) {
  /* _mm_set_epi64x: build a vector from two 64-bit integers; e0 is the
   * low lane.  MSVC before 19.0 (VS2015) lacks _mm_set_epi64x, hence the
   * version check on the native path. */
  #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0))
    return _mm_set_epi64x(e1, e0);
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      SIMDE_ALIGN_LIKE_16(int64x2_t) int64_t data[2] = {e0, e1};
      r_.neon_i64 = vld1q_s64(data);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i64x2_make(e0, e1);
    #else
      r_.i64[0] = e0;
      r_.i64[1] = e1;
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_set_epi64x(e1, e0) simde_mm_set_epi64x(e1, e0)
#endif
4713
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_loadu_si64 (void const* mem_addr) {
  /* _mm_loadu_si64: load a single (possibly unaligned) 64-bit value into
   * lane 0, zeroing lane 1.  Same compiler-version gating and
   * memcpy-based unaligned-read fallback as loadu_si16/si32. */
  #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
      SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
      HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
      HEDLEY_INTEL_VERSION_CHECK(20,21,1))
    return _mm_loadu_si64(mem_addr);
  #else
    int64_t val;
    simde_memcpy(&val, mem_addr, sizeof(val));
    return simde_mm_cvtsi64_si128(val);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_loadu_si64(mem_addr) simde_mm_loadu_si64(mem_addr)
#endif
4731
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_set_epu8 (uint8_t e15, uint8_t e14, uint8_t e13, uint8_t e12,
                     uint8_t e11, uint8_t e10, uint8_t  e9, uint8_t  e8,
                     uint8_t  e7, uint8_t  e6, uint8_t  e5, uint8_t  e4,
                     uint8_t  e3, uint8_t  e2, uint8_t  e1, uint8_t  e0) {
  /* SIMDE extension (x_ prefix): unsigned-byte variant of _mm_set_epi8.
   * The native path reuses _mm_set_epi8 via char casts -- only the bit
   * pattern matters. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_set_epi8(
      HEDLEY_STATIC_CAST(char, e15), HEDLEY_STATIC_CAST(char, e14), HEDLEY_STATIC_CAST(char, e13), HEDLEY_STATIC_CAST(char, e12),
      HEDLEY_STATIC_CAST(char, e11), HEDLEY_STATIC_CAST(char, e10), HEDLEY_STATIC_CAST(char,  e9), HEDLEY_STATIC_CAST(char,  e8),
      HEDLEY_STATIC_CAST(char,  e7), HEDLEY_STATIC_CAST(char,  e6), HEDLEY_STATIC_CAST(char,  e5), HEDLEY_STATIC_CAST(char,  e4),
      HEDLEY_STATIC_CAST(char,  e3), HEDLEY_STATIC_CAST(char,  e2), HEDLEY_STATIC_CAST(char,  e1), HEDLEY_STATIC_CAST(char,  e0));
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      SIMDE_ALIGN_LIKE_16(uint8x16_t) uint8_t data[16] = {
        e0,  e1,  e2,  e3,
        e4,  e5,  e6,  e7,
        e8,  e9,  e10, e11,
        e12, e13, e14, e15};
      r_.neon_u8 = vld1q_u8(data);
    #else
      r_.u8[ 0] =  e0; r_.u8[ 1] =  e1; r_.u8[ 2] =  e2; r_.u8[ 3] =  e3;
      r_.u8[ 4] =  e4; r_.u8[ 5] =  e5; r_.u8[ 6] =  e6; r_.u8[ 7] =  e7;
      r_.u8[ 8] =  e8; r_.u8[ 9] =  e9; r_.u8[10] = e10; r_.u8[11] = e11;
      r_.u8[12] = e12; r_.u8[13] = e13; r_.u8[14] = e14; r_.u8[15] = e15;
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
4764
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_set_epu16 (uint16_t e7, uint16_t e6, uint16_t e5, uint16_t e4,
                      uint16_t e3, uint16_t e2, uint16_t e1, uint16_t e0) {
  /* SIMDE extension: unsigned 16-bit variant of _mm_set_epi16; native
   * path casts through short since only the bit pattern matters. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_set_epi16(
      HEDLEY_STATIC_CAST(short,  e7), HEDLEY_STATIC_CAST(short,  e6), HEDLEY_STATIC_CAST(short,  e5), HEDLEY_STATIC_CAST(short,  e4),
      HEDLEY_STATIC_CAST(short,  e3), HEDLEY_STATIC_CAST(short,  e2), HEDLEY_STATIC_CAST(short,  e1), HEDLEY_STATIC_CAST(short,  e0));
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      SIMDE_ALIGN_LIKE_16(uint16x8_t) uint16_t data[8] = { e0, e1, e2, e3, e4, e5, e6, e7 };
      r_.neon_u16 = vld1q_u16(data);
    #else
      r_.u16[0] = e0; r_.u16[1] = e1; r_.u16[2] = e2; r_.u16[3] = e3;
      r_.u16[4] = e4; r_.u16[5] = e5; r_.u16[6] = e6; r_.u16[7] = e7;
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
4787
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_set_epu32 (uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) {
  /* SIMDE extension: unsigned 32-bit variant of _mm_set_epi32; native
   * path casts through int since only the bit pattern matters. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_set_epi32(
      HEDLEY_STATIC_CAST(int,  e3), HEDLEY_STATIC_CAST(int,  e2), HEDLEY_STATIC_CAST(int,  e1), HEDLEY_STATIC_CAST(int,  e0));
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      SIMDE_ALIGN_LIKE_16(uint32x4_t) uint32_t data[4] = { e0, e1, e2, e3 };
      r_.neon_u32 = vld1q_u32(data);
    #else
      r_.u32[0] = e0;
      r_.u32[1] = e1;
      r_.u32[2] = e2;
      r_.u32[3] = e3;
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
4810
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_set_epu64x (uint64_t e1, uint64_t e0) {
  /* SIMDE extension: unsigned 64-bit variant of _mm_set_epi64x; same
   * MSVC < 19.0 gate as the signed version (intrinsic missing there). */
  #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0))
    return _mm_set_epi64x(HEDLEY_STATIC_CAST(int64_t,  e1), HEDLEY_STATIC_CAST(int64_t,  e0));
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      SIMDE_ALIGN_LIKE_16(uint64x2_t) uint64_t data[2] = {e0, e1};
      r_.neon_u64 = vld1q_u64(data);
    #else
      r_.u64[0] = e0;
      r_.u64[1] = e1;
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
4830
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_set_sd (simde_float64 a) {
  /* _mm_set_sd: place a in the low double lane and zero the high lane. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_set_sd(a);
  #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    return vsetq_lane_f64(a, vdupq_n_f64(SIMDE_FLOAT64_C(0.0)), 0);
  #else
    /* set_pd takes its arguments high-lane first. */
    return simde_mm_set_pd(SIMDE_FLOAT64_C(0.0), a);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_set_sd(a) simde_mm_set_sd(a)
#endif
4845
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set1_epi8 (int8_t a) {
  /* _mm_set1_epi8: broadcast one signed byte to all 16 lanes. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_set1_epi8(a);
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i8 = vdupq_n_s8(a);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i8x16_splat(a);
    #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
      r_.altivec_i8 = vec_splats(HEDLEY_STATIC_CAST(signed char, a));
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
        r_.i8[i] = a;
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_set1_epi8(a) simde_mm_set1_epi8(a)
#endif
4873
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set1_epi16 (int16_t a) {
  /* _mm_set1_epi16: broadcast one signed 16-bit value to all 8 lanes. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_set1_epi16(a);
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i16 = vdupq_n_s16(a);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i16x8_splat(a);
    #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
      r_.altivec_i16 = vec_splats(HEDLEY_STATIC_CAST(signed short, a));
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        r_.i16[i] = a;
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_set1_epi16(a) simde_mm_set1_epi16(a)
#endif
4901
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set1_epi32 (int32_t a) {
  /* _mm_set1_epi32: broadcast one signed 32-bit value to all 4 lanes. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_set1_epi32(a);
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i32 = vdupq_n_s32(a);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i32x4_splat(a);
    #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
      r_.altivec_i32 = vec_splats(HEDLEY_STATIC_CAST(signed int, a));
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        r_.i32[i] = a;
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_set1_epi32(a) simde_mm_set1_epi32(a)
#endif
4929
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set1_epi64x (int64_t a) {
  /* _mm_set1_epi64x: broadcast one 64-bit value to both lanes.  MSVC
   * before 19.0 (VS2015) lacks the intrinsic, hence the version gate. */
  #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0))
    return _mm_set1_epi64x(a);
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i64 = vdupq_n_s64(a);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i64x2_splat(a);
    #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
      r_.altivec_i64 = vec_splats(HEDLEY_STATIC_CAST(signed long long, a));
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
        r_.i64[i] = a;
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_set1_epi64x(a) simde_mm_set1_epi64x(a)
#endif
4957
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set1_epi64 (simde__m64 a) {
  /* _mm_set1_epi64: broadcast an __m64 to both 64-bit lanes.  Fallback
   * extracts the raw 64-bit value and delegates to set1_epi64x. */
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    return _mm_set1_epi64(a);
  #else
    simde__m64_private a_ = simde__m64_to_private(a);
    return simde_mm_set1_epi64x(a_.i64[0]);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_set1_epi64(a) simde_mm_set1_epi64(a)
#endif
4971
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_set1_epu8 (uint8_t value) {
  /* SIMDE extension: broadcast an unsigned byte to all 16 lanes; the
   * signed delegate produces the identical bit pattern. */
  #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    return simde__m128i_from_altivec_u8(vec_splats(HEDLEY_STATIC_CAST(unsigned char, value)));
  #else
    return simde_mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, value));
  #endif
}
4981
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_set1_epu16 (uint16_t value) {
  /* SIMDE extension: broadcast an unsigned 16-bit value to all 8 lanes. */
  #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    return simde__m128i_from_altivec_u16(vec_splats(HEDLEY_STATIC_CAST(unsigned short, value)));
  #else
    return simde_mm_set1_epi16(HEDLEY_STATIC_CAST(int16_t, value));
  #endif
}
4991
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_set1_epu32 (uint32_t value) {
  /* SIMDE extension: broadcast an unsigned 32-bit value to all 4 lanes. */
  #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    return simde__m128i_from_altivec_u32(vec_splats(HEDLEY_STATIC_CAST(unsigned int, value)));
  #else
    return simde_mm_set1_epi32(HEDLEY_STATIC_CAST(int32_t, value));
  #endif
}
5001
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_set1_epu64 (uint64_t value) {
  /* SIMDE extension: broadcast an unsigned 64-bit value to both lanes. */
  #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    return simde__m128i_from_altivec_u64(vec_splats(HEDLEY_STATIC_CAST(unsigned long long, value)));
  #else
    return simde_mm_set1_epi64x(HEDLEY_STATIC_CAST(int64_t, value));
  #endif
}
5011
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_setr_epi8 (int8_t e15, int8_t e14, int8_t e13, int8_t e12,
                    int8_t e11, int8_t e10, int8_t  e9, int8_t  e8,
                    int8_t  e7, int8_t  e6, int8_t  e5, int8_t  e4,
                    int8_t  e3, int8_t  e2, int8_t  e1, int8_t  e0) {
  /* _mm_setr_epi8: like _mm_set_epi8 but arguments are in memory
   * (low-lane-first) order; implemented by reversing into set_epi8. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_setr_epi8(
      e15, e14, e13, e12, e11, e10,  e9,    e8,
      e7,    e6,  e5,  e4,  e3,  e2,  e1,  e0);
  #else
    return simde_mm_set_epi8(
      e0, e1, e2, e3, e4, e5, e6, e7,
      e8, e9, e10, e11, e12, e13, e14, e15);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)
#endif
5031
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_setr_epi16 (int16_t e7, int16_t e6, int16_t e5, int16_t e4,
                     int16_t e3, int16_t e2, int16_t e1, int16_t e0) {
  /* _mm_setr_epi16: _mm_set_epi16 with arguments in low-lane-first order. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
  #else
    return simde_mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0)
#endif
5045
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_setr_epi32 (int32_t e3, int32_t e2, int32_t e1, int32_t e0) {
  /* _mm_setr_epi32: _mm_set_epi32 with arguments in low-lane-first order. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_setr_epi32(e3, e2, e1, e0);
  #else
    return simde_mm_set_epi32(e0, e1, e2, e3);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_setr_epi32(e3, e2, e1, e0) simde_mm_setr_epi32(e3, e2, e1, e0)
#endif
5058
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_setr_epi64 (simde__m64 e1, simde__m64 e0) {
  /* _mm_setr_epi64: _mm_set_epi64 with halves in low-first order; needs
   * native MMX for the __m64 arguments on the native path. */
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    return _mm_setr_epi64(e1, e0);
  #else
    return simde_mm_set_epi64(e0, e1);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_setr_epi64(e1, e0) (simde_mm_setr_epi64((e1), (e0)))
#endif
5071
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_setr_pd (simde_float64 e1, simde_float64 e0) {
  /* _mm_setr_pd: _mm_set_pd with doubles in low-lane-first order. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_setr_pd(e1, e0);
  #else
    return simde_mm_set_pd(e0, e1);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_setr_pd(e1, e0) simde_mm_setr_pd(e1, e0)
#endif
5084
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_setzero_pd (void) {
  /* _mm_setzero_pd: all-zero vector; the all-zero bit pattern is +0.0 in
   * both double lanes, so casting a zeroed integer vector is sufficient. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_setzero_pd();
  #else
    return simde_mm_castsi128_pd(simde_mm_setzero_si128());
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_setzero_pd() simde_mm_setzero_pd()
#endif
5097
5098 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
5099 HEDLEY_DIAGNOSTIC_PUSH
5100 SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
5101 #endif
5102
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_undefined_pd (void) {
  /* _mm_undefined_pd: a vector with unspecified contents.  When the
   * uninitialized-variable diagnostic cannot be suppressed (see the
   * surrounding push/pop), fall back to zeroing to stay warning-clean. */
  simde__m128d_private r_;

  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
    r_.n = _mm_undefined_pd();
  #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
    r_ = simde__m128d_to_private(simde_mm_setzero_pd());
  #endif

  return simde__m128d_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_undefined_pd() simde_mm_undefined_pd()
#endif
5119
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_undefined_si128 (void) {
  /* _mm_undefined_si128: integer-vector analogue of undefined_pd; the
   * contents are unspecified unless the compiler's uninitialized-use
   * diagnostic cannot be silenced, in which case the vector is zeroed. */
  simde__m128i_private r_;

  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
    r_.n = _mm_undefined_si128();
  #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
    r_ = simde__m128i_to_private(simde_mm_setzero_si128());
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_undefined_si128() (simde_mm_undefined_si128())
#endif
5136
5137 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
5138 HEDLEY_DIAGNOSTIC_POP
5139 #endif
5140
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_x_mm_setone_pd (void) {
  /* SIMDE extension: vector with every bit set, viewed as doubles. */
  return simde_mm_castps_pd(simde_x_mm_setone_ps());
}
5146
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_setone_si128 (void) {
  /* SIMDE extension: 128-bit integer vector with every bit set. */
  return simde_mm_castps_si128(simde_x_mm_setone_ps());
}
5152
/* Shuffle the four 32-bit lanes of `a` according to `imm8`: output lane i
 * takes input lane ((imm8 >> (2*i)) & 3). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_shuffle_epi32 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
    /* Each 2-bit field of imm8 selects the source lane for output lane i. */
    r_.i32[i] = a_.i32[(imm8 >> (i * 2)) & 3];
  }

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
  #define simde_mm_shuffle_epi32(a, imm8) _mm_shuffle_epi32((a), (imm8))
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* NEON has no generic 4-lane shuffle with a runtime pattern usable here, so
   * broadcast the lane selected for position 0, then overwrite lanes 1..3. */
  #define simde_mm_shuffle_epi32(a, imm8) \
    __extension__({ \
      int32x4_t ret; \
      ret = vmovq_n_s32( \
        vgetq_lane_s32(vreinterpretq_s32_s64(a), (imm8) & (0x3))); \
      ret = vsetq_lane_s32( \
        vgetq_lane_s32(vreinterpretq_s32_s64(a), ((imm8) >> 2) & 0x3), \
        ret, 1); \
      ret = vsetq_lane_s32( \
        vgetq_lane_s32(vreinterpretq_s32_s64(a), ((imm8) >> 4) & 0x3), \
        ret, 2); \
      ret = vsetq_lane_s32( \
        vgetq_lane_s32(vreinterpretq_s32_s64(a), ((imm8) >> 6) & 0x3), \
        ret, 3); \
      vreinterpretq_s64_s32(ret); \
    })
#elif defined(SIMDE_SHUFFLE_VECTOR_)
  /* Compiler-builtin shuffle; imm8 must be an integer constant expression. */
  #define simde_mm_shuffle_epi32(a, imm8) (__extension__ ({ \
      const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
      simde__m128i_from_private((simde__m128i_private) { .i32 = \
        SIMDE_SHUFFLE_VECTOR_(32, 16, \
          (simde__tmp_a_).i32, \
          (simde__tmp_a_).i32, \
          ((imm8)     ) & 3, \
          ((imm8) >> 2) & 3, \
          ((imm8) >> 4) & 3, \
          ((imm8) >> 6) & 3) }); }))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_shuffle_epi32(a, imm8) simde_mm_shuffle_epi32(a, imm8)
#endif
5201
/* Select doubles: low result lane from `a` (lane chosen by imm8 bit 0),
 * high result lane from `b` (lane chosen by imm8 bit 1). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_shuffle_pd (simde__m128d a, simde__m128d b, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) {
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  r_.f64[0] = ((imm8 & 1) == 0) ? a_.f64[0] : a_.f64[1];
  r_.f64[1] = ((imm8 & 2) == 0) ? b_.f64[0] : b_.f64[1];

  return simde__m128d_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
  /* PGI is excluded: its _mm_shuffle_pd is not usable here. */
  #define simde_mm_shuffle_pd(a, b, imm8) _mm_shuffle_pd((a), (b), (imm8))
#elif defined(SIMDE_SHUFFLE_VECTOR_)
  /* Builtin shuffle: indices 0-1 address a, 2-3 address b. */
  #define simde_mm_shuffle_pd(a, b, imm8) (__extension__ ({ \
      simde__m128d_from_private((simde__m128d_private) { .f64 = \
        SIMDE_SHUFFLE_VECTOR_(64, 16, \
          simde__m128d_to_private(a).f64, \
          simde__m128d_to_private(b).f64, \
          (((imm8)     ) & 1), \
          (((imm8) >> 1) & 1) + 2) }); }))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_shuffle_pd(a, b, imm8) simde_mm_shuffle_pd(a, b, imm8)
#endif
5230
/* Copy the low four 16-bit lanes of `a` unchanged; shuffle the high four
 * lanes among themselves using the 2-bit fields of imm8. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_shufflehi_epi16 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  /* Low half passes through untouched. */
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i++) {
    r_.i16[i] = a_.i16[i];
  }
  /* High half: lane i picks high-half lane ((imm8 >> (2*(i-4))) & 3) + 4. */
  for (size_t i = ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
    r_.i16[i] = a_.i16[((imm8 >> ((i - 4) * 2)) & 3) + 4];
  }

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
  #define simde_mm_shufflehi_epi16(a, imm8) _mm_shufflehi_epi16((a), (imm8))
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* NEON: extract the high 4 lanes and re-insert them in the imm8 order. */
  #define simde_mm_shufflehi_epi16(a, imm8) \
    __extension__({ \
      int16x8_t ret = vreinterpretq_s16_s64(a); \
      int16x4_t highBits = vget_high_s16(ret); \
      ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm8) & (0x3)), ret, 4); \
      ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm8) >> 2) & 0x3), ret, \
                           5); \
      ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm8) >> 4) & 0x3), ret, \
                           6); \
      ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm8) >> 6) & 0x3), ret, \
                           7); \
      vreinterpretq_s64_s16(ret); \
    })
#elif defined(SIMDE_SHUFFLE_VECTOR_)
  /* Builtin shuffle: identity for lanes 0-3, imm8-driven for lanes 4-7. */
  #define simde_mm_shufflehi_epi16(a, imm8) (__extension__ ({ \
      const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
      simde__m128i_from_private((simde__m128i_private) { .i16 = \
        SIMDE_SHUFFLE_VECTOR_(16, 16, \
          (simde__tmp_a_).i16, \
          (simde__tmp_a_).i16, \
          0, 1, 2, 3, \
          (((imm8)     ) & 3) + 4, \
          (((imm8) >> 2) & 3) + 4, \
          (((imm8) >> 4) & 3) + 4, \
          (((imm8) >> 6) & 3) + 4) }); }))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_shufflehi_epi16(a, imm8) simde_mm_shufflehi_epi16(a, imm8)
#endif
5281
/* Shuffle the low four 16-bit lanes of `a` using the 2-bit fields of imm8;
 * copy the high four lanes unchanged.  Mirror image of shufflehi. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_shufflelo_epi16 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  /* Low half: lane i picks low-half lane ((imm8 >> (2*i)) & 3). */
  for (size_t i = 0 ; i < ((sizeof(r_.i16) / sizeof(r_.i16[0])) / 2) ; i++) {
    r_.i16[i] = a_.i16[((imm8 >> (i * 2)) & 3)];
  }
  /* High half passes through untouched. */
  SIMDE_VECTORIZE
  for (size_t i = ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
    r_.i16[i] = a_.i16[i];
  }

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
  #define simde_mm_shufflelo_epi16(a, imm8) _mm_shufflelo_epi16((a), (imm8))
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* NEON: extract the low 4 lanes and re-insert them in the imm8 order. */
  #define simde_mm_shufflelo_epi16(a, imm8) \
    __extension__({ \
      int16x8_t ret = vreinterpretq_s16_s64(a); \
      int16x4_t lowBits = vget_low_s16(ret); \
      ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm8) & (0x3)), ret, 0); \
      ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm8) >> 2) & 0x3), ret, \
                           1); \
      ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm8) >> 4) & 0x3), ret, \
                           2); \
      ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm8) >> 6) & 0x3), ret, \
                           3); \
      vreinterpretq_s64_s16(ret); \
    })
#elif defined(SIMDE_SHUFFLE_VECTOR_)
  /* Builtin shuffle: imm8-driven for lanes 0-3, identity for lanes 4-7. */
  #define simde_mm_shufflelo_epi16(a, imm8) (__extension__ ({ \
      const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
      simde__m128i_from_private((simde__m128i_private) { .i16 = \
        SIMDE_SHUFFLE_VECTOR_(16, 16, \
          (simde__tmp_a_).i16, \
          (simde__tmp_a_).i16, \
          (((imm8)     ) & 3), \
          (((imm8) >> 2) & 3), \
          (((imm8) >> 4) & 3), \
          (((imm8) >> 6) & 3), \
          4, 5, 6, 7) }); }))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_shufflelo_epi16(a, imm8) simde_mm_shufflelo_epi16(a, imm8)
#endif
5332
/* Shift each 16-bit lane of `a` left by the count in the low 64 bits of
 * `count`; counts greater than 15 produce all zeros (Intel semantics). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sll_epi16 (simde__m128i a, simde__m128i count) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_sll_epi16(a, count);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    count_ = simde__m128i_to_private(count);

  /* Unsigned compare also catches "negative" counts (huge as u64). */
  if (count_.u64[0] > 15)
    return simde_mm_setzero_si128();

  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
    r_.u16 = (a_.u16 << count_.u64[0]);
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, count_.u64[0])));
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 16) ? wasm_i16x8_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_extract_lane(count_.wasm_v128, 0))) : wasm_i16x8_const(0,0,0,0,0,0,0,0));
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
      r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (a_.u16[i] << count_.u64[0]));
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_sll_epi16(a, count) simde_mm_sll_epi16((a), (count))
#endif
5366
/* Shift each 32-bit lane of `a` left by the count in the low 64 bits of
 * `count`; counts greater than 31 produce all zeros (Intel semantics). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sll_epi32 (simde__m128i a, simde__m128i count) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_sll_epi32(a, count);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    count_ = simde__m128i_to_private(count);

  /* Unsigned compare also catches "negative" counts (huge as u64). */
  if (count_.u64[0] > 31)
    return simde_mm_setzero_si128();

  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
    r_.u32 = (a_.u32 << count_.u64[0]);
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, count_.u64[0])));
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 32) ? wasm_i32x4_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_extract_lane(count_.wasm_v128, 0))) : wasm_i32x4_const(0,0,0,0));
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
      r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (a_.u32[i] << count_.u64[0]));
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_sll_epi32(a, count) (simde_mm_sll_epi32(a, (count)))
#endif
5400
/* Shift each 64-bit lane of `a` left by the count in the low 64 bits of
 * `count`; counts greater than 63 produce all zeros (Intel semantics). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sll_epi64 (simde__m128i a, simde__m128i count) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_sll_epi64(a, count);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    count_ = simde__m128i_to_private(count);

  /* Unsigned compare also catches "negative" counts (huge as u64). */
  if (count_.u64[0] > 63)
    return simde_mm_setzero_si128();

  /* Count fits in 0..63 here, so the narrowing cast is safe. */
  const int_fast16_t s = HEDLEY_STATIC_CAST(int_fast16_t, count_.u64[0]);
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, s)));
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = (s < 64) ? wasm_i64x2_shl(a_.wasm_v128, s) : wasm_i64x2_const(0,0);
  #else
    /* GCC bug 94488 miscompiles vectorized 64-bit shifts; skip the pragma. */
    #if !defined(SIMDE_BUG_GCC_94488)
      SIMDE_VECTORIZE
    #endif
    for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
      r_.u64[i] = a_.u64[i] << s;
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_sll_epi64(a, count) (simde_mm_sll_epi64(a, (count)))
#endif
5435
/* Square root of each double-precision lane of `a`. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_sqrt_pd (simde__m128d a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_sqrt_pd(a);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a);

  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    r_.neon_f64 = vsqrtq_f64(a_.neon_f64);
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_f64x2_sqrt(a_.wasm_v128);
  #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
    r_.altivec_f64 = vec_sqrt(a_.altivec_f64);
  #elif defined(simde_math_sqrt)
    /* Portable per-lane fallback using the libm sqrt wrapper. */
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
      r_.f64[i] = simde_math_sqrt(a_.f64[i]);
    }
  #else
    /* No sqrt available at all on this platform. */
    HEDLEY_UNREACHABLE();
  #endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_sqrt_pd(a) simde_mm_sqrt_pd(a)
#endif
5467
/* Lower lane: sqrt of b[0]; upper lane: copied from a[1]. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_sqrt_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_sqrt_sd(a, b);
#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
  /* Compose from the full-width sqrt plus a move of the low lane. */
  return simde_mm_move_sd(a, simde_mm_sqrt_pd(b));
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  #if defined(simde_math_sqrt)
    r_.f64[0] = simde_math_sqrt(b_.f64[0]);
    r_.f64[1] = a_.f64[1];
  #else
    HEDLEY_UNREACHABLE();
  #endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_sqrt_sd(a, b) simde_mm_sqrt_sd(a, b)
#endif
5494
5495 SIMDE_FUNCTION_ATTRIBUTES
5496 simde__m128i
simde_mm_srl_epi16(simde__m128i a,simde__m128i count)5497 simde_mm_srl_epi16 (simde__m128i a, simde__m128i count) {
5498 #if defined(SIMDE_X86_SSE2_NATIVE)
5499 return _mm_srl_epi16(a, count);
5500 #else
5501 simde__m128i_private
5502 r_,
5503 a_ = simde__m128i_to_private(a),
5504 count_ = simde__m128i_to_private(count);
5505
5506 const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 16 ? 16 : count_.i64[0]));
5507
5508 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5509 r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
5510 #else
5511 SIMDE_VECTORIZE
5512 for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
5513 r_.u16[i] = a_.u16[i] >> cnt;
5514 }
5515 #endif
5516
5517 return simde__m128i_from_private(r_);
5518 #endif
5519 }
5520 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5521 #define _mm_srl_epi16(a, count) (simde_mm_srl_epi16(a, (count)))
5522 #endif
5523
5524 SIMDE_FUNCTION_ATTRIBUTES
5525 simde__m128i
simde_mm_srl_epi32(simde__m128i a,simde__m128i count)5526 simde_mm_srl_epi32 (simde__m128i a, simde__m128i count) {
5527 #if defined(SIMDE_X86_SSE2_NATIVE)
5528 return _mm_srl_epi32(a, count);
5529 #else
5530 simde__m128i_private
5531 r_,
5532 a_ = simde__m128i_to_private(a),
5533 count_ = simde__m128i_to_private(count);
5534
5535 const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 32 ? 32 : count_.i64[0]));
5536
5537 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5538 r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt)));
5539 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5540 r_.wasm_v128 = wasm_u32x4_shr(a_.wasm_v128, cnt);
5541 #else
5542 SIMDE_VECTORIZE
5543 for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
5544 r_.u32[i] = a_.u32[i] >> cnt;
5545 }
5546 #endif
5547
5548 return simde__m128i_from_private(r_);
5549 #endif
5550 }
5551 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5552 #define _mm_srl_epi32(a, count) (simde_mm_srl_epi32(a, (count)))
5553 #endif
5554
5555 SIMDE_FUNCTION_ATTRIBUTES
5556 simde__m128i
simde_mm_srl_epi64(simde__m128i a,simde__m128i count)5557 simde_mm_srl_epi64 (simde__m128i a, simde__m128i count) {
5558 #if defined(SIMDE_X86_SSE2_NATIVE)
5559 return _mm_srl_epi64(a, count);
5560 #else
5561 simde__m128i_private
5562 r_,
5563 a_ = simde__m128i_to_private(a),
5564 count_ = simde__m128i_to_private(count);
5565
5566 const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 64 ? 64 : count_.i64[0]));
5567
5568 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5569 r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, -cnt)));
5570 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5571 r_.wasm_v128 = wasm_u64x2_shr(a_.wasm_v128, cnt);
5572 #else
5573 #if !defined(SIMDE_BUG_GCC_94488)
5574 SIMDE_VECTORIZE
5575 #endif
5576 for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
5577 r_.u64[i] = a_.u64[i] >> cnt;
5578 }
5579 #endif
5580
5581 return simde__m128i_from_private(r_);
5582 #endif
5583 }
5584 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5585 #define _mm_srl_epi64(a, count) (simde_mm_srl_epi64(a, (count)))
5586 #endif
5587
/* Arithmetic right shift of each 16-bit lane of `a` by the immediate imm8;
 * counts of 16+ saturate to 15, filling every lane with its sign bit. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srai_epi16 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  /* MSVC requires a range of (0, 255). */
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  /* Any bit above 3 set means imm8 >= 16: clamp to 15 (sign-fill). */
  const int cnt = (imm8 & ~15) ? 15 : imm8;

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* NEON shifts left by a signed count; negate for a right shift. */
    r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_i16x8_shr(a_.wasm_v128, cnt);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) {
      r_.i16[i] = a_.i16[i] >> cnt;
    }
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
  #define simde_mm_srai_epi16(a, imm8) _mm_srai_epi16((a), (imm8))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_srai_epi16(a, imm8) simde_mm_srai_epi16(a, imm8)
#endif
5618
/* Arithmetic right shift of each 32-bit lane of `a` by the immediate imm8;
 * counts of 32+ saturate to 31, filling every lane with its sign bit. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srai_epi32 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  /* MSVC requires a range of (0, 255). */
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  /* Any bit above 4 set means imm8 >= 32: clamp to 31 (sign-fill). */
  const int cnt = (imm8 & ~31) ? 31 : imm8;

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* NEON shifts left by a signed count; negate for a right shift. */
    r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(-cnt));
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_i32x4_shr(a_.wasm_v128, cnt);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i32[0])) ; i++) {
      r_.i32[i] = a_.i32[i] >> cnt;
    }
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
  #define simde_mm_srai_epi32(a, imm8) _mm_srai_epi32((a), (imm8))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_srai_epi32(a, imm8) simde_mm_srai_epi32(a, imm8)
#endif
5649
5650 SIMDE_FUNCTION_ATTRIBUTES
5651 simde__m128i
simde_mm_sra_epi16(simde__m128i a,simde__m128i count)5652 simde_mm_sra_epi16 (simde__m128i a, simde__m128i count) {
5653 #if defined(SIMDE_X86_SSE2_NATIVE)
5654 return _mm_sra_epi16(a, count);
5655 #else
5656 simde__m128i_private
5657 r_,
5658 a_ = simde__m128i_to_private(a),
5659 count_ = simde__m128i_to_private(count);
5660
5661 const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 15 ? 15 : count_.i64[0]));
5662
5663 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5664 r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
5665 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5666 r_.wasm_v128 = wasm_i16x8_shr(a_.wasm_v128, cnt);
5667 #else
5668 SIMDE_VECTORIZE
5669 for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
5670 r_.i16[i] = a_.i16[i] >> cnt;
5671 }
5672 #endif
5673
5674 return simde__m128i_from_private(r_);
5675 #endif
5676 }
5677 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5678 #define _mm_sra_epi16(a, count) (simde_mm_sra_epi16(a, count))
5679 #endif
5680
/* Arithmetic right shift of each 32-bit lane of `a` by the count in the low
 * 64 bits of `count`; counts of 32+ saturate to 31 (sign-fill). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sra_epi32 (simde__m128i a, simde__m128i count) {
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_SRA_EPI32)
  return _mm_sra_epi32(a, count);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    count_ = simde__m128i_to_private(count);

  /* Count read as u64: "negative" counts are huge and saturate to 31. */
  const int cnt = count_.u64[0] > 31 ? 31 : HEDLEY_STATIC_CAST(int, count_.u64[0]);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* NEON shifts left by a signed count; negate for a right shift. */
    r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt)));
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_i32x4_shr(a_.wasm_v128, cnt);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.i32[i] = a_.i32[i] >> cnt;
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_sra_epi32(a, count) (simde_mm_sra_epi32(a, (count)))
#endif
5711
/* Shift each 16-bit lane of `a` left by the immediate imm8; counts greater
 * than 15 produce all zeros. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_slli_epi16 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  if (HEDLEY_UNLIKELY((imm8 > 15))) {
    return simde_mm_setzero_si128();
  }

  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
    r_.i16 = a_.i16 << SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8 & 0xff);
  #else
    /* s falls back to 0 for out-of-range counts (already excluded above). */
    const int s = (imm8 > HEDLEY_STATIC_CAST(int, sizeof(r_.i16[0]) * CHAR_BIT) - 1) ? 0 : imm8;
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
      r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << s);
    }
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
  #define simde_mm_slli_epi16(a, imm8) _mm_slli_epi16(a, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* NEON vshlq_n_s16 requires a compile-time count in [0,15], so the
   * out-of-range cases are handled before it is reached. */
  #define simde_mm_slli_epi16(a, imm8) \
    (__extension__ ({ \
      simde__m128i ret; \
      if ((imm8) <= 0) { \
        ret = a; \
      } else if ((imm8) > 15) { \
        ret = simde_mm_setzero_si128(); \
      } else { \
        ret = simde__m128i_from_neon_i16( \
          vshlq_n_s16(simde__m128i_to_neon_i16(a), ((imm8) & 15))); \
      } \
      ret; \
    }))
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  #define simde_mm_slli_epi16(a, imm8) \
    ((imm8 < 16) ? wasm_i16x8_shl(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i16x8_const(0,0,0,0,0,0,0,0))
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
  #define simde_mm_slli_epi16(a, imm8) \
    ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sl(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8)))))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_slli_epi16(a, imm8) simde_mm_slli_epi16(a, imm8)
#endif
5762
/* Shift each 32-bit lane of `a` left by the immediate imm8; counts greater
 * than 31 produce all zeros. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_slli_epi32 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  if (HEDLEY_UNLIKELY((imm8 > 31))) {
    return simde_mm_setzero_si128();
  }
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
    r_.i32 = a_.i32 << imm8;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.i32[i] = a_.i32[i] << (imm8 & 0xff);
    }
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
  #define simde_mm_slli_epi32(a, imm8) _mm_slli_epi32(a, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* NEON vshlq_n_s32 requires a compile-time count in [0,31], so the
   * out-of-range cases are handled before it is reached. */
  #define simde_mm_slli_epi32(a, imm8) \
    (__extension__ ({ \
      simde__m128i ret; \
      if ((imm8) <= 0) { \
        ret = a; \
      } else if ((imm8) > 31) { \
        ret = simde_mm_setzero_si128(); \
      } else { \
        ret = simde__m128i_from_neon_i32( \
          vshlq_n_s32(simde__m128i_to_neon_i32(a), ((imm8) & 31))); \
      } \
      ret; \
    }))
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  #define simde_mm_slli_epi32(a, imm8) \
    ((imm8 < 32) ? wasm_i32x4_shl(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i32x4_const(0,0,0,0))
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
  #define simde_mm_slli_epi32(a, imm8) \
    (__extension__ ({ \
      simde__m128i ret; \
      if ((imm8) <= 0) { \
        ret = a; \
      } else if ((imm8) > 31) { \
        ret = simde_mm_setzero_si128(); \
      } else { \
        ret = simde__m128i_from_altivec_i32( \
          vec_sl(simde__m128i_to_altivec_i32(a), \
            vec_splats(HEDLEY_STATIC_CAST(unsigned int, (imm8) & 31)))); \
      } \
      ret; \
    }))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_slli_epi32(a, imm8) simde_mm_slli_epi32(a, imm8)
#endif
5823
/* Shift each 64-bit lane of `a` left by the immediate imm8; counts greater
 * than 63 produce all zeros. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_slli_epi64 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  if (HEDLEY_UNLIKELY((imm8 > 63))) {
    return simde_mm_setzero_si128();
  }
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
    r_.i64 = a_.i64 << imm8;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
      r_.i64[i] = a_.i64[i] << (imm8 & 0xff);
    }
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
  #define simde_mm_slli_epi64(a, imm8) _mm_slli_epi64(a, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* NEON vshlq_n_s64 requires a compile-time count in [0,63], so the
   * out-of-range cases are handled before it is reached. */
  #define simde_mm_slli_epi64(a, imm8) \
    (__extension__ ({ \
      simde__m128i ret; \
      if ((imm8) <= 0) { \
        ret = a; \
      } else if ((imm8) > 63) { \
        ret = simde_mm_setzero_si128(); \
      } else { \
        ret = simde__m128i_from_neon_i64( \
          vshlq_n_s64(simde__m128i_to_neon_i64(a), ((imm8) & 63))); \
      } \
      ret; \
    }))
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  #define simde_mm_slli_epi64(a, imm8) \
    ((imm8 < 64) ? wasm_i64x2_shl(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i64x2_const(0,0))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_slli_epi64(a, imm8) simde_mm_slli_epi64(a, imm8)
#endif
5869
/* Logical right shift of each 16-bit lane of `a` by the immediate imm8;
 * counts greater than 15 produce all zeros. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srli_epi16 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  if (HEDLEY_UNLIKELY((imm8 > 15))) {
    return simde_mm_setzero_si128();
  }
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
    r_.u16 = a_.u16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
      r_.u16[i] = a_.u16[i] >> (imm8 & 0xff);
    }
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
  #define simde_mm_srli_epi16(a, imm8) _mm_srli_epi16(a, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* vshrq_n_u16 requires a count in [1,16]; the `| (count == 0)` trick
   * maps a zero count to 1, but that branch is unreachable because
   * (imm8) <= 0 is handled first. */
  #define simde_mm_srli_epi16(a, imm8) \
    (__extension__ ({ \
      simde__m128i ret; \
      if ((imm8) <= 0) { \
        ret = a; \
      } else if ((imm8) > 15) { \
        ret = simde_mm_setzero_si128(); \
      } else { \
        ret = simde__m128i_from_neon_u16( \
          vshrq_n_u16(simde__m128i_to_neon_u16(a), (((imm8) & 15) | (((imm8) & 15) == 0)))); \
      } \
      ret; \
    }))
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  #define simde_mm_srli_epi16(a, imm8) \
    ((imm8 < 16) ? wasm_u16x8_shr(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i16x8_const(0,0,0,0,0,0,0,0))
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
  #define simde_mm_srli_epi16(a, imm8) \
    ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sr(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8)))))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_srli_epi16(a, imm8) simde_mm_srli_epi16(a, imm8)
#endif
5918
/* Logical right shift of each 32-bit lane of `a` by the immediate imm8;
 * counts greater than 31 produce all zeros. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srli_epi32 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  if (HEDLEY_UNLIKELY((imm8 > 31))) {
    return simde_mm_setzero_si128();
  }
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
    r_.u32 = a_.u32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8 & 0xff);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.u32[i] = a_.u32[i] >> (imm8 & 0xff);
    }
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
  #define simde_mm_srli_epi32(a, imm8) _mm_srli_epi32(a, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* vshrq_n_u32 requires a count in [1,32]; the `| (count == 0)` trick
   * maps a zero count to 1, but that branch is unreachable because
   * (imm8) <= 0 is handled first. */
  #define simde_mm_srli_epi32(a, imm8) \
    (__extension__ ({ \
      simde__m128i ret; \
      if ((imm8) <= 0) { \
        ret = a; \
      } else if ((imm8) > 31) { \
        ret = simde_mm_setzero_si128(); \
      } else { \
        ret = simde__m128i_from_neon_u32( \
          vshrq_n_u32(simde__m128i_to_neon_u32(a), (((imm8) & 31) | (((imm8) & 31) == 0)))); \
      } \
      ret; \
    }))
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  #define simde_mm_srli_epi32(a, imm8) \
    ((imm8 < 32) ? wasm_u32x4_shr(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i32x4_const(0,0,0,0))
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
  #define simde_mm_srli_epi32(a, imm8) \
    (__extension__ ({ \
      simde__m128i ret; \
      if ((imm8) <= 0) { \
        ret = a; \
      } else if ((imm8) > 31) { \
        ret = simde_mm_setzero_si128(); \
      } else { \
        ret = simde__m128i_from_altivec_i32( \
          vec_sr(simde__m128i_to_altivec_i32(a), \
            vec_splats(HEDLEY_STATIC_CAST(unsigned int, (imm8) & 31)))); \
      } \
      ret; \
    }))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_srli_epi32(a, imm8) simde_mm_srli_epi32(a, imm8)
#endif
5979
/* Logical right shift of each 64-bit lane of `a` by the immediate imm8;
 * counts greater than 63 produce all zeros. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srli_epi64 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  /* (imm8 & 63) != imm8 is true exactly when imm8 is outside [0,63]. */
  if (HEDLEY_UNLIKELY((imm8 & 63) != imm8))
    return simde_mm_setzero_si128();

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* NEON shifts left by a signed count; negate for a right shift. */
    r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(-imm8));
  #else
    /* GCC bug 94488 miscompiles vectorized 64-bit shifts; use the loop. */
    #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_94488)
      r_.u64 = a_.u64 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
        r_.u64[i] = a_.u64[i] >> imm8;
      }
    #endif
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
  #define simde_mm_srli_epi64(a, imm8) _mm_srli_epi64(a, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* vshrq_n_u64 requires a count in [1,64]; the `| (count == 0)` trick
   * maps a zero count to 1, but that branch is unreachable because
   * (imm8) <= 0 is handled first. */
  #define simde_mm_srli_epi64(a, imm8) \
    (__extension__ ({ \
      simde__m128i ret; \
      if ((imm8) <= 0) { \
        ret = a; \
      } else if ((imm8) > 63) { \
        ret = simde_mm_setzero_si128(); \
      } else { \
        ret = simde__m128i_from_neon_u64( \
          vshrq_n_u64(simde__m128i_to_neon_u64(a), (((imm8) & 63) | (((imm8) & 63) == 0)))); \
      } \
      ret; \
    }))
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  #define simde_mm_srli_epi64(a, imm8) \
    ((imm8 < 64) ? wasm_u64x2_shr(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i64x2_const(0,0))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_srli_epi64(a, imm8) simde_mm_srli_epi64(a, imm8)
#endif
6029
/* Store both double lanes of `a` to 16-byte-aligned memory. */
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_store_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_store_pd(mem_addr, a);
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* ARMv7 NEON has no f64 vectors; store the same bits via the i64 view. */
  vst1q_s64(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), simde__m128d_to_private(a).neon_i64);
#else
  /* memcpy avoids strict-aliasing issues; the alignment hint lets the
   * compiler emit an aligned store. */
  simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128d), &a, sizeof(a));
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_store_pd(mem_addr, a) simde_mm_store_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
6046
/* Store the low double lane of `a` to both elements of mem_addr. */
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_store1_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_store1_pd(mem_addr, a);
#else
  simde__m128d_private a_ = simde__m128d_to_private(a);

  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    /* Broadcast lane 0 across the vector, then store it whole. */
    vst1q_f64(mem_addr, vdupq_laneq_f64(a_.neon_f64, 0));
  #else
    mem_addr[0] = a_.f64[0];
    mem_addr[1] = a_.f64[0];
  #endif
#endif
}
/* _mm_store_pd1 is an alternate spelling of the same operation. */
#define simde_mm_store_pd1(mem_addr, a) simde_mm_store1_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_store1_pd(mem_addr, a) simde_mm_store1_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
  #define _mm_store_pd1(mem_addr, a) simde_mm_store_pd1(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
6068
/* Store only the low double-precision lane of `a` to `mem_addr`,
 * equivalent to SSE2 _mm_store_sd.  No alignment is required. */
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_store_sd (simde_float64* mem_addr, simde__m128d a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_store_sd(mem_addr, a);
  #else
    simde__m128d_private a_ = simde__m128d_to_private(a);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      const simde_float64 v = vgetq_lane_f64(a_.neon_f64, 0);
      simde_memcpy(mem_addr, &v, sizeof(v));
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* No f64 lanes on AArch32; move the same 64 bits through an int64. */
      const int64_t v = vgetq_lane_s64(a_.neon_i64, 0);
      simde_memcpy(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), &v, sizeof(v));
    #else
      /* Copy through a temporary with memcpy to avoid aliasing issues. */
      simde_float64 v = a_.f64[0];
      simde_memcpy(mem_addr, &v, sizeof(simde_float64));
    #endif
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_store_sd(mem_addr, a) simde_mm_store_sd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
6092
/* Store all 128 bits of `a` to `mem_addr` (16-byte aligned), equivalent to
 * SSE2 _mm_store_si128. */
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_store_si128 (simde__m128i* mem_addr, simde__m128i a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_store_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
  #else
    simde__m128i_private a_ = simde__m128i_to_private(a);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      vst1q_s32(HEDLEY_REINTERPRET_CAST(int32_t*, mem_addr), a_.neon_i32);
    #else
      /* memcpy sidesteps strict-aliasing; the destination is promised to be
       * aligned like a simde__m128i. */
      simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128i), &a_, sizeof(a_));
    #endif
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_store_si128(mem_addr, a) simde_mm_store_si128(mem_addr, a)
#endif
6111
/* Store the HIGH double-precision lane of `a` to `mem_addr`, equivalent to
 * SSE2 _mm_storeh_pd. */
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storeh_pd (simde_float64* mem_addr, simde__m128d a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_storeh_pd(mem_addr, a);
  #else
    simde__m128d_private a_ = simde__m128d_to_private(a);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      *mem_addr = vgetq_lane_f64(a_.neon_f64, 1);
    #else
      *mem_addr = a_.f64[1];
    #endif
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_storeh_pd(mem_addr, a) simde_mm_storeh_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
6130
/* Store the low 64 bits of `a` to `mem_addr`, equivalent to SSE2
 * _mm_storel_epi64.  Only 8 bytes are written even though the destination
 * pointer has vector type. */
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storel_epi64 (simde__m128i* mem_addr, simde__m128i a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_storel_epi64(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
  #else
    simde__m128i_private a_ = simde__m128i_to_private(a);
    int64_t tmp;

    /* memcpy to prevent aliasing, tmp because we can't take the
     * address of a vector element. */

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      tmp = vgetq_lane_s64(a_.neon_i64, 0);
    #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
      #if defined(SIMDE_BUG_GCC_95227)
        /* Work around GCC bug 95227 ("unused" false positive). */
        (void) a_;
      #endif
      tmp = vec_extract(a_.altivec_i64, 0);
    #else
      tmp = a_.i64[0];
    #endif

    simde_memcpy(mem_addr, &tmp, sizeof(tmp));
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_storel_epi64(mem_addr, a) simde_mm_storel_epi64(mem_addr, a)
#endif
6160
/* Store the LOW double-precision lane of `a` to `mem_addr`, equivalent to
 * SSE2 _mm_storel_pd. */
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storel_pd (simde_float64* mem_addr, simde__m128d a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_storel_pd(mem_addr, a);
  #else
    simde__m128d_private a_ = simde__m128d_to_private(a);

    /* Copy through a temporary with memcpy to avoid aliasing issues. */
    simde_float64 tmp;
    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      tmp = vgetq_lane_f64(a_.neon_f64, 0);
    #else
      tmp = a_.f64[0];
    #endif
    simde_memcpy(mem_addr, &tmp, sizeof(tmp));
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_storel_pd(mem_addr, a) simde_mm_storel_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
6181
/* Store the two double-precision lanes of `a` to `mem_addr` in REVERSED
 * order (high lane first), equivalent to SSE2 _mm_storer_pd. */
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storer_pd (simde_float64 mem_addr[2], simde__m128d a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_storer_pd(mem_addr, a);
  #else
    simde__m128d_private a_ = simde__m128d_to_private(a);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* vextq_s64 by 1 swaps the two 64-bit lanes before the store. */
      vst1q_s64(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), vextq_s64(a_.neon_i64, a_.neon_i64, 1));
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      a_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, a_.f64, 1, 0);
      simde_mm_store_pd(mem_addr, simde__m128d_from_private(a_));
    #else
      mem_addr[0] = a_.f64[1];
      mem_addr[1] = a_.f64[0];
    #endif
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_storer_pd(mem_addr, a) simde_mm_storer_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
6204
/* Unaligned store of both double-precision lanes of `a` to `mem_addr`,
 * equivalent to SSE2 _mm_storeu_pd. */
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storeu_pd (simde_float64* mem_addr, simde__m128d a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_storeu_pd(mem_addr, a);
  #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64);
  #else
    /* Plain memcpy: no alignment assumption, no aliasing trouble. */
    simde_memcpy(mem_addr, &a, sizeof(a));
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_storeu_pd(mem_addr, a) simde_mm_storeu_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
6219
/* Unaligned store of all 128 bits of `a` to `mem_addr`, equivalent to SSE2
 * _mm_storeu_si128. */
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storeu_si128 (void* mem_addr, simde__m128i a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_storeu_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
  #else
    /* Plain memcpy: no alignment assumption, no aliasing trouble. */
    simde_memcpy(mem_addr, &a, sizeof(a));
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_storeu_si128(mem_addr, a) simde_mm_storeu_si128(mem_addr, a)
#endif
6232
/* Unaligned store of the low 16 bits of `a` to `mem_addr` (_mm_storeu_si16).
 * The native intrinsic is recent, so it is gated on compiler versions that
 * actually provide it. */
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storeu_si16 (void* mem_addr, simde__m128i a) {
  #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
      SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
      HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
      HEDLEY_INTEL_VERSION_CHECK(20,21,1))
    _mm_storeu_si16(mem_addr, a);
  #else
    int16_t val = simde_x_mm_cvtsi128_si16(a);
    simde_memcpy(mem_addr, &val, sizeof(val));
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_storeu_si16(mem_addr, a) simde_mm_storeu_si16(mem_addr, a)
#endif
6249
/* Unaligned store of the low 32 bits of `a` to `mem_addr` (_mm_storeu_si32).
 * Same compiler-version gate as simde_mm_storeu_si16. */
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storeu_si32 (void* mem_addr, simde__m128i a) {
  #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
      SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
      HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
      HEDLEY_INTEL_VERSION_CHECK(20,21,1))
    _mm_storeu_si32(mem_addr, a);
  #else
    int32_t val = simde_mm_cvtsi128_si32(a);
    simde_memcpy(mem_addr, &val, sizeof(val));
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_storeu_si32(mem_addr, a) simde_mm_storeu_si32(mem_addr, a)
#endif
6266
/* Unaligned store of the low 64 bits of `a` to `mem_addr` (_mm_storeu_si64).
 * Same compiler-version gate as simde_mm_storeu_si16. */
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storeu_si64 (void* mem_addr, simde__m128i a) {
  #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
      SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
      HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
      HEDLEY_INTEL_VERSION_CHECK(20,21,1))
    _mm_storeu_si64(mem_addr, a);
  #else
    int64_t val = simde_mm_cvtsi128_si64(a);
    simde_memcpy(mem_addr, &val, sizeof(val));
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_storeu_si64(mem_addr, a) simde_mm_storeu_si64(mem_addr, a)
#endif
6283
/* Non-temporal store of `a` to `mem_addr` (_mm_stream_pd).  The portable
 * fallback is a plain copy: the cache-bypass hint is purely a performance
 * property, not observable behavior. */
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_stream_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_stream_pd(mem_addr, a);
  #else
    simde_memcpy(mem_addr, &a, sizeof(a));
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_stream_pd(mem_addr, a) simde_mm_stream_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
6296
/* Non-temporal store of `a` to `mem_addr` (_mm_stream_si128).  Note the
 * native path is additionally restricted to AMD64 builds; everything else
 * falls back to a plain copy. */
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_stream_si128 (simde__m128i* mem_addr, simde__m128i a) {
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
    _mm_stream_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
  #else
    simde_memcpy(mem_addr, &a, sizeof(a));
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_stream_si128(mem_addr, a) simde_mm_stream_si128(mem_addr, a)
#endif
6309
/* Non-temporal store of a 32-bit integer (_mm_stream_si32); the fallback is
 * an ordinary assignment. */
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_stream_si32 (int32_t* mem_addr, int32_t a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_stream_si32(mem_addr, a);
  #else
    *mem_addr = a;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_stream_si32(mem_addr, a) simde_mm_stream_si32(mem_addr, a)
#endif
6322
/* Non-temporal store of a 64-bit integer (_mm_stream_si64 / _mm_stream_si64x).
 * Native path only on AMD64 and not under MSVC; the cast adapts int64_t* to
 * the `long long int*` the native intrinsic expects. */
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_stream_si64 (int64_t* mem_addr, int64_t a) {
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(HEDLEY_MSVC_VERSION)
    _mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(long long int*, int64_t*, mem_addr), a);
  #else
    *mem_addr = a;
  #endif
}
#define simde_mm_stream_si64x(mem_addr, a) simde_mm_stream_si64(mem_addr, a)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
  #define _mm_stream_si64(mem_addr, a) simde_mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(int64_t*, __int64*, mem_addr), a)
  #define _mm_stream_si64x(mem_addr, a) simde_mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(int64_t*, __int64*, mem_addr), a)
#endif
6337
/* Element-wise difference of sixteen 8-bit integer lanes (_mm_sub_epi8). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sub_epi8 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_sub_epi8(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i8 = vsubq_s8(a_.neon_i8, b_.neon_i8);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i8 = a_.i8 - b_.i8;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
        r_.i8[i] = a_.i8[i] - b_.i8[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_sub_epi8(a, b) simde_mm_sub_epi8(a, b)
#endif
6366
/* Element-wise difference of eight 16-bit integer lanes (_mm_sub_epi16). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sub_epi16 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_sub_epi16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i16 = vsubq_s16(a_.neon_i16, b_.neon_i16);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i16 = a_.i16 - b_.i16;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        r_.i16[i] = a_.i16[i] - b_.i16[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_sub_epi16(a, b) simde_mm_sub_epi16(a, b)
#endif
6395
/* Element-wise difference of four 32-bit integer lanes (_mm_sub_epi32). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sub_epi32 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_sub_epi32(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i32 = vsubq_s32(a_.neon_i32, b_.neon_i32);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32 = a_.i32 - b_.i32;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        r_.i32[i] = a_.i32[i] - b_.i32[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_sub_epi32(a, b) simde_mm_sub_epi32(a, b)
#endif
6424
/* Element-wise difference of two 64-bit integer lanes (_mm_sub_epi64). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sub_epi64 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_sub_epi64(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i64 = vsubq_s64(a_.neon_i64, b_.neon_i64);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i64 = a_.i64 - b_.i64;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
        r_.i64[i] = a_.i64[i] - b_.i64[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_sub_epi64(a, b) simde_mm_sub_epi64(a, b)
#endif
6453
/* SIMDe extension (x_ prefix: no SSE2 equivalent): element-wise difference
 * of four unsigned 32-bit lanes. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_sub_epu32 (simde__m128i a, simde__m128i b) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.u32 = a_.u32 - b_.u32;
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_u32 = vsubq_u32(a_.neon_u32, b_.neon_u32);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
      r_.u32[i] = a_.u32[i] - b_.u32[i];
    }
  #endif

  return simde__m128i_from_private(r_);
}
6475
/* Element-wise difference of both double-precision lanes (_mm_sub_pd). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_sub_pd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_sub_pd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.f64 = a_.f64 - b_.f64;
    #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vsubq_f64(a_.neon_f64, b_.neon_f64);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_f64x2_sub(a_.wasm_v128, b_.wasm_v128);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
        r_.f64[i] = a_.f64[i] - b_.f64[i];
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_sub_pd(a, b) simde_mm_sub_pd(a, b)
#endif
6506
/* Scalar subtract (_mm_sub_sd): lane 0 is a[0] - b[0], lane 1 is copied
 * from `a` unchanged. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_sub_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_sub_sd(a, b);
  #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
    /* Full-width subtract, then merge only the low lane back into `a`. */
    return simde_mm_move_sd(a, simde_mm_sub_pd(a, b));
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    r_.f64[0] = a_.f64[0] - b_.f64[0];
    r_.f64[1] = a_.f64[1];

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_sub_sd(a, b) simde_mm_sub_sd(a, b)
#endif
6529
/* 64-bit MMX-register subtract (_mm_sub_si64); native path needs both the
 * SSE2 and MMX natives since it operates on an __m64. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m64
simde_mm_sub_si64 (simde__m64 a, simde__m64 b) {
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    return _mm_sub_si64(a, b);
  #else
    simde__m64_private
      r_,
      a_ = simde__m64_to_private(a),
      b_ = simde__m64_to_private(b);

    #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i64 = a_.i64 - b_.i64;
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i64 = vsub_s64(a_.neon_i64, b_.neon_i64);
    #else
      r_.i64[0] = a_.i64[0] - b_.i64[0];
    #endif

    return simde__m64_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_sub_si64(a, b) simde_mm_sub_si64(a, b)
#endif
6555
/* Saturating signed 8-bit subtract (_mm_subs_epi8): results are clamped to
 * [INT8_MIN, INT8_MAX] instead of wrapping. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_subs_epi8 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_subs_epi8(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i8 = vqsubq_s8(a_.neon_i8, b_.neon_i8);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i8x16_sub_saturate(a_.wasm_v128, b_.wasm_v128);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i8[0])) ; i++) {
        /* Detect overflow *before* subtracting, so the subtraction itself
         * never leaves the int8_t range. */
        if (((b_.i8[i]) > 0 && (a_.i8[i]) < INT8_MIN + (b_.i8[i]))) {
          r_.i8[i] = INT8_MIN;
        } else if ((b_.i8[i]) < 0 && (a_.i8[i]) > INT8_MAX + (b_.i8[i])) {
          r_.i8[i] = INT8_MAX;
        } else {
          r_.i8[i] = (a_.i8[i]) - (b_.i8[i]);
        }
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_subs_epi8(a, b) simde_mm_subs_epi8(a, b)
#endif
6590
/* Saturating signed 16-bit subtract (_mm_subs_epi16): results are clamped
 * to [INT16_MIN, INT16_MAX] instead of wrapping. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_subs_epi16 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_subs_epi16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i16 = vqsubq_s16(a_.neon_i16, b_.neon_i16);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i16x8_sub_saturate(a_.wasm_v128, b_.wasm_v128);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) {
        /* Detect overflow *before* subtracting, so the subtraction itself
         * never leaves the int16_t range. */
        if (((b_.i16[i]) > 0 && (a_.i16[i]) < INT16_MIN + (b_.i16[i]))) {
          r_.i16[i] = INT16_MIN;
        } else if ((b_.i16[i]) < 0 && (a_.i16[i]) > INT16_MAX + (b_.i16[i])) {
          r_.i16[i] = INT16_MAX;
        } else {
          r_.i16[i] = (a_.i16[i]) - (b_.i16[i]);
        }
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_subs_epi16(a, b) simde_mm_subs_epi16(a, b)
#endif
6625
6626 SIMDE_FUNCTION_ATTRIBUTES
6627 simde__m128i
simde_mm_subs_epu8(simde__m128i a,simde__m128i b)6628 simde_mm_subs_epu8 (simde__m128i a, simde__m128i b) {
6629 #if defined(SIMDE_X86_SSE2_NATIVE)
6630 return _mm_subs_epu8(a, b);
6631 #else
6632 simde__m128i_private
6633 r_,
6634 a_ = simde__m128i_to_private(a),
6635 b_ = simde__m128i_to_private(b);
6636
6637 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6638 r_.neon_u8 = vqsubq_u8(a_.neon_u8, b_.neon_u8);
6639 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
6640 r_.wasm_v128 = wasm_u8x16_sub_saturate(a_.wasm_v128, b_.wasm_v128);
6641 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
6642 r_.altivec_u8 = vec_subs(a_.altivec_u8, b_.altivec_u8);
6643 #else
6644 SIMDE_VECTORIZE
6645 for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i8[0])) ; i++) {
6646 const int32_t x = a_.u8[i] - b_.u8[i];
6647 if (x < 0) {
6648 r_.u8[i] = 0;
6649 } else if (x > UINT8_MAX) {
6650 r_.u8[i] = UINT8_MAX;
6651 } else {
6652 r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x);
6653 }
6654 }
6655 #endif
6656
6657 return simde__m128i_from_private(r_);
6658 #endif
6659 }
6660 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6661 #define _mm_subs_epu8(a, b) simde_mm_subs_epu8(a, b)
6662 #endif
6663
6664 SIMDE_FUNCTION_ATTRIBUTES
6665 simde__m128i
simde_mm_subs_epu16(simde__m128i a,simde__m128i b)6666 simde_mm_subs_epu16 (simde__m128i a, simde__m128i b) {
6667 #if defined(SIMDE_X86_SSE2_NATIVE)
6668 return _mm_subs_epu16(a, b);
6669 #else
6670 simde__m128i_private
6671 r_,
6672 a_ = simde__m128i_to_private(a),
6673 b_ = simde__m128i_to_private(b);
6674
6675 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6676 r_.neon_u16 = vqsubq_u16(a_.neon_u16, b_.neon_u16);
6677 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
6678 r_.wasm_v128 = wasm_u16x8_sub_saturate(a_.wasm_v128, b_.wasm_v128);
6679 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
6680 r_.altivec_u16 = vec_subs(a_.altivec_u16, b_.altivec_u16);
6681 #else
6682 SIMDE_VECTORIZE
6683 for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) {
6684 const int32_t x = a_.u16[i] - b_.u16[i];
6685 if (x < 0) {
6686 r_.u16[i] = 0;
6687 } else if (x > UINT16_MAX) {
6688 r_.u16[i] = UINT16_MAX;
6689 } else {
6690 r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x);
6691 }
6692 }
6693 #endif
6694
6695 return simde__m128i_from_private(r_);
6696 #endif
6697 }
6698 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6699 #define _mm_subs_epu16(a, b) simde_mm_subs_epu16(a, b)
6700 #endif
6701
/* Unordered scalar compare-equal (_mm_ucomieq_sd): returns 1 when the low
 * lanes compare equal OR either is NaN (UCOMISD sets ZF when unordered).
 * The fenv path holds the FP environment so a NaN operand cannot raise a
 * trapped exception, then restores it. */
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomieq_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_ucomieq_sd(a, b);
  #else
    simde__m128d_private
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);
    int r;

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* x != x detects NaN; result = (a NaN or b NaN) OR (a == b). */
      uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
      uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
      uint64x2_t a_or_b_nan = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(a_not_nan, b_not_nan))));
      uint64x2_t a_eq_b = vceqq_f64(a_.neon_f64, b_.neon_f64);
      r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_eq_b), 0) != 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      return wasm_f64x2_extract_lane(a_.wasm_v128, 0) == wasm_f64x2_extract_lane(b_.wasm_v128, 0);
    #elif defined(SIMDE_HAVE_FENV_H)
      fenv_t envp;
      int x = feholdexcept(&envp);
      r = a_.f64[0] == b_.f64[0];
      if (HEDLEY_LIKELY(x == 0))
        fesetenv(&envp);
    #else
      r = a_.f64[0] == b_.f64[0];
    #endif

    return r;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_ucomieq_sd(a, b) simde_mm_ucomieq_sd(a, b)
#endif
6737
/* Unordered scalar compare-greater-equal (_mm_ucomige_sd): returns 1 when
 * a[0] >= b[0]; the NEON path ANDs with not-NaN so a NaN operand yields 0. */
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomige_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_ucomige_sd(a, b);
  #else
    simde__m128d_private
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);
    int r;

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* x != x detects NaN; result = (both not NaN) AND (a >= b). */
      uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
      uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
      uint64x2_t a_and_b_not_nan = vandq_u64(a_not_nan, b_not_nan);
      uint64x2_t a_ge_b = vcgeq_f64(a_.neon_f64, b_.neon_f64);
      r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_ge_b), 0) != 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      return wasm_f64x2_extract_lane(a_.wasm_v128, 0) >= wasm_f64x2_extract_lane(b_.wasm_v128, 0);
    #elif defined(SIMDE_HAVE_FENV_H)
      /* Hold the FP environment so a NaN operand cannot raise a trapped
       * exception, then restore it. */
      fenv_t envp;
      int x = feholdexcept(&envp);
      r = a_.f64[0] >= b_.f64[0];
      if (HEDLEY_LIKELY(x == 0))
        fesetenv(&envp);
    #else
      r = a_.f64[0] >= b_.f64[0];
    #endif

    return r;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_ucomige_sd(a, b) simde_mm_ucomige_sd(a, b)
#endif
6773
/* Unordered scalar compare-greater-than (_mm_ucomigt_sd): returns 1 when
 * a[0] > b[0]; the NEON path ANDs with not-NaN so a NaN operand yields 0. */
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomigt_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_ucomigt_sd(a, b);
  #else
    simde__m128d_private
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);
    int r;

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* x != x detects NaN; result = (both not NaN) AND (a > b). */
      uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
      uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
      uint64x2_t a_and_b_not_nan = vandq_u64(a_not_nan, b_not_nan);
      uint64x2_t a_gt_b = vcgtq_f64(a_.neon_f64, b_.neon_f64);
      r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_gt_b), 0) != 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      return wasm_f64x2_extract_lane(a_.wasm_v128, 0) > wasm_f64x2_extract_lane(b_.wasm_v128, 0);
    #elif defined(SIMDE_HAVE_FENV_H)
      /* Hold the FP environment so a NaN operand cannot raise a trapped
       * exception, then restore it. */
      fenv_t envp;
      int x = feholdexcept(&envp);
      r = a_.f64[0] > b_.f64[0];
      if (HEDLEY_LIKELY(x == 0))
        fesetenv(&envp);
    #else
      r = a_.f64[0] > b_.f64[0];
    #endif

    return r;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_ucomigt_sd(a, b) simde_mm_ucomigt_sd(a, b)
#endif
6809
/* Unordered scalar compare-less-equal (_mm_ucomile_sd): returns 1 when
 * a[0] <= b[0] OR either operand is NaN (the NEON path ORs in the NaN
 * mask, mirroring UCOMISD's flag results when unordered). */
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomile_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_ucomile_sd(a, b);
  #else
    simde__m128d_private
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);
    int r;

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* x != x detects NaN; result = (a NaN or b NaN) OR (a <= b). */
      uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
      uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
      uint64x2_t a_or_b_nan = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(a_not_nan, b_not_nan))));
      uint64x2_t a_le_b = vcleq_f64(a_.neon_f64, b_.neon_f64);
      r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_le_b), 0) != 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      return wasm_f64x2_extract_lane(a_.wasm_v128, 0) <= wasm_f64x2_extract_lane(b_.wasm_v128, 0);
    #elif defined(SIMDE_HAVE_FENV_H)
      /* Hold the FP environment so a NaN operand cannot raise a trapped
       * exception, then restore it. */
      fenv_t envp;
      int x = feholdexcept(&envp);
      r = a_.f64[0] <= b_.f64[0];
      if (HEDLEY_LIKELY(x == 0))
        fesetenv(&envp);
    #else
      r = a_.f64[0] <= b_.f64[0];
    #endif

    return r;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_ucomile_sd(a, b) simde_mm_ucomile_sd(a, b)
#endif
6845
/* Unordered scalar compare-less-than (_mm_ucomilt_sd): returns 1 when
 * a[0] < b[0] OR either operand is NaN (the NEON path ORs in the NaN
 * mask, mirroring UCOMISD's flag results when unordered). */
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomilt_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_ucomilt_sd(a, b);
  #else
    simde__m128d_private
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);
    int r;

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* x != x detects NaN; result = (a NaN or b NaN) OR (a < b). */
      uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
      uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
      uint64x2_t a_or_b_nan = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(a_not_nan, b_not_nan))));
      uint64x2_t a_lt_b = vcltq_f64(a_.neon_f64, b_.neon_f64);
      r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_lt_b), 0) != 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      return wasm_f64x2_extract_lane(a_.wasm_v128, 0) < wasm_f64x2_extract_lane(b_.wasm_v128, 0);
    #elif defined(SIMDE_HAVE_FENV_H)
      /* Hold the FP environment so a NaN operand cannot raise a trapped
       * exception, then restore it. */
      fenv_t envp;
      int x = feholdexcept(&envp);
      r = a_.f64[0] < b_.f64[0];
      if (HEDLEY_LIKELY(x == 0))
        fesetenv(&envp);
    #else
      r = a_.f64[0] < b_.f64[0];
    #endif

    return r;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_ucomilt_sd(a, b) simde_mm_ucomilt_sd(a, b)
#endif
6881
/* Unordered scalar compare-not-equal (_mm_ucomineq_sd): returns 1 only
 * when both lanes are ordered (neither NaN) AND a[0] != b[0]. */
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomineq_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_ucomineq_sd(a, b);
  #else
    simde__m128d_private
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);
    int r;

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* x != x detects NaN; result = (both not NaN) AND NOT(a == b). */
      uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
      uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
      uint64x2_t a_and_b_not_nan = vandq_u64(a_not_nan, b_not_nan);
      uint64x2_t a_neq_b = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(a_.neon_f64, b_.neon_f64))));
      r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_neq_b), 0) != 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      return wasm_f64x2_extract_lane(a_.wasm_v128, 0) != wasm_f64x2_extract_lane(b_.wasm_v128, 0);
    #elif defined(SIMDE_HAVE_FENV_H)
      /* Hold the FP environment so a NaN operand cannot raise a trapped
       * exception, then restore it. */
      fenv_t envp;
      int x = feholdexcept(&envp);
      r = a_.f64[0] != b_.f64[0];
      if (HEDLEY_LIKELY(x == 0))
        fesetenv(&envp);
    #else
      r = a_.f64[0] != b_.f64[0];
    #endif

    return r;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_ucomineq_sd(a, b) simde_mm_ucomineq_sd(a, b)
#endif
6917
6918 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
6919 HEDLEY_DIAGNOSTIC_PUSH
6920 SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
6921 #endif
6922
6923 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
6924 HEDLEY_DIAGNOSTIC_POP
6925 #endif
6926
/* Load fence (_mm_lfence).  Without native SSE2 this falls back to
 * simde_mm_sfence, which serves as the portable full-barrier stand-in. */
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_lfence (void) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_lfence();
  #else
    simde_mm_sfence();
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_lfence() simde_mm_lfence()
#endif
6939
/* Memory fence (_mm_mfence).  Without native SSE2 this falls back to
 * simde_mm_sfence, which serves as the portable full-barrier stand-in. */
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_mfence (void) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_mfence();
  #else
    simde_mm_sfence();
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_mfence() simde_mm_mfence()
#endif
6952
/* Interleave the HIGH eight 8-bit lanes of `a` and `b`
 * (_mm_unpackhi_epi8): result = { a8, b8, a9, b9, ..., a15, b15 }. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_unpackhi_epi8 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_unpackhi_epi8(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_i8 = vzip2q_s8(a_.neon_i8, b_.neon_i8);
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* Take the high 64 bits of each input, zip them, and recombine. */
      int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(a_.neon_i16));
      int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(b_.neon_i16));
      int8x8x2_t result = vzip_s8(a1, b1);
      r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i8[0])) / 2) ; i++) {
        r_.i8[(i * 2)] = a_.i8[i + ((sizeof(r_) / sizeof(r_.i8[0])) / 2)];
        r_.i8[(i * 2) + 1] = b_.i8[i + ((sizeof(r_) / sizeof(r_.i8[0])) / 2)];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_unpackhi_epi8(a, b) simde_mm_unpackhi_epi8(a, b)
#endif
6987
6988 SIMDE_FUNCTION_ATTRIBUTES
6989 simde__m128i
simde_mm_unpackhi_epi16(simde__m128i a,simde__m128i b)6990 simde_mm_unpackhi_epi16 (simde__m128i a, simde__m128i b) {
6991 #if defined(SIMDE_X86_SSE2_NATIVE)
6992 return _mm_unpackhi_epi16(a, b);
6993 #else
6994 simde__m128i_private
6995 r_,
6996 a_ = simde__m128i_to_private(a),
6997 b_ = simde__m128i_to_private(b);
6998
6999 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7000 r_.neon_i16 = vzip2q_s16(a_.neon_i16, b_.neon_i16);
7001 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7002 int16x4_t a1 = vget_high_s16(a_.neon_i16);
7003 int16x4_t b1 = vget_high_s16(b_.neon_i16);
7004 int16x4x2_t result = vzip_s16(a1, b1);
7005 r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]);
7006 #elif defined(SIMDE_SHUFFLE_VECTOR_)
7007 r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 4, 12, 5, 13, 6, 14, 7, 15);
7008 #else
7009 SIMDE_VECTORIZE
7010 for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i16[0])) / 2) ; i++) {
7011 r_.i16[(i * 2)] = a_.i16[i + ((sizeof(r_) / sizeof(r_.i16[0])) / 2)];
7012 r_.i16[(i * 2) + 1] = b_.i16[i + ((sizeof(r_) / sizeof(r_.i16[0])) / 2)];
7013 }
7014 #endif
7015
7016 return simde__m128i_from_private(r_);
7017 #endif
7018 }
7019 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7020 #define _mm_unpackhi_epi16(a, b) simde_mm_unpackhi_epi16(a, b)
7021 #endif
7022
7023 SIMDE_FUNCTION_ATTRIBUTES
7024 simde__m128i
simde_mm_unpackhi_epi32(simde__m128i a,simde__m128i b)7025 simde_mm_unpackhi_epi32 (simde__m128i a, simde__m128i b) {
7026 #if defined(SIMDE_X86_SSE2_NATIVE)
7027 return _mm_unpackhi_epi32(a, b);
7028 #else
7029 simde__m128i_private
7030 r_,
7031 a_ = simde__m128i_to_private(a),
7032 b_ = simde__m128i_to_private(b);
7033
7034 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7035 r_.neon_i32 = vzip2q_s32(a_.neon_i32, b_.neon_i32);
7036 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7037 int32x2_t a1 = vget_high_s32(a_.neon_i32);
7038 int32x2_t b1 = vget_high_s32(b_.neon_i32);
7039 int32x2x2_t result = vzip_s32(a1, b1);
7040 r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]);
7041 #elif defined(SIMDE_SHUFFLE_VECTOR_)
7042 r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 2, 6, 3, 7);
7043 #else
7044 SIMDE_VECTORIZE
7045 for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i32[0])) / 2) ; i++) {
7046 r_.i32[(i * 2)] = a_.i32[i + ((sizeof(r_) / sizeof(r_.i32[0])) / 2)];
7047 r_.i32[(i * 2) + 1] = b_.i32[i + ((sizeof(r_) / sizeof(r_.i32[0])) / 2)];
7048 }
7049 #endif
7050
7051 return simde__m128i_from_private(r_);
7052 #endif
7053 }
7054 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7055 #define _mm_unpackhi_epi32(a, b) simde_mm_unpackhi_epi32(a, b)
7056 #endif
7057
7058 SIMDE_FUNCTION_ATTRIBUTES
7059 simde__m128i
simde_mm_unpackhi_epi64(simde__m128i a,simde__m128i b)7060 simde_mm_unpackhi_epi64 (simde__m128i a, simde__m128i b) {
7061 #if defined(SIMDE_X86_SSE2_NATIVE)
7062 return _mm_unpackhi_epi64(a, b);
7063 #else
7064 simde__m128i_private
7065 r_,
7066 a_ = simde__m128i_to_private(a),
7067 b_ = simde__m128i_to_private(b);
7068
7069 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7070 int64x1_t a_h = vget_high_s64(a_.neon_i64);
7071 int64x1_t b_h = vget_high_s64(b_.neon_i64);
7072 r_.neon_i64 = vcombine_s64(a_h, b_h);
7073 #elif defined(SIMDE_SHUFFLE_VECTOR_)
7074 r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 1, 3);
7075 #else
7076 SIMDE_VECTORIZE
7077 for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i64[0])) / 2) ; i++) {
7078 r_.i64[(i * 2)] = a_.i64[i + ((sizeof(r_) / sizeof(r_.i64[0])) / 2)];
7079 r_.i64[(i * 2) + 1] = b_.i64[i + ((sizeof(r_) / sizeof(r_.i64[0])) / 2)];
7080 }
7081 #endif
7082
7083 return simde__m128i_from_private(r_);
7084 #endif
7085 }
7086 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7087 #define _mm_unpackhi_epi64(a, b) simde_mm_unpackhi_epi64(a, b)
7088 #endif
7089
7090 SIMDE_FUNCTION_ATTRIBUTES
7091 simde__m128d
simde_mm_unpackhi_pd(simde__m128d a,simde__m128d b)7092 simde_mm_unpackhi_pd (simde__m128d a, simde__m128d b) {
7093 #if defined(SIMDE_X86_SSE2_NATIVE)
7094 return _mm_unpackhi_pd(a, b);
7095 #else
7096 simde__m128d_private
7097 r_,
7098 a_ = simde__m128d_to_private(a),
7099 b_ = simde__m128d_to_private(b);
7100
7101 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7102 float64x1_t a_l = vget_high_f64(a_.f64);
7103 float64x1_t b_l = vget_high_f64(b_.f64);
7104 r_.neon_f64 = vcombine_f64(a_l, b_l);
7105 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
7106 r_.wasm_v128 = wasm_v64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3);
7107 #elif defined(SIMDE_SHUFFLE_VECTOR_)
7108 r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 1, 3);
7109 #else
7110 SIMDE_VECTORIZE
7111 for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2) ; i++) {
7112 r_.f64[(i * 2)] = a_.f64[i + ((sizeof(r_) / sizeof(r_.f64[0])) / 2)];
7113 r_.f64[(i * 2) + 1] = b_.f64[i + ((sizeof(r_) / sizeof(r_.f64[0])) / 2)];
7114 }
7115 #endif
7116
7117 return simde__m128d_from_private(r_);
7118 #endif
7119 }
7120 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7121 #define _mm_unpackhi_pd(a, b) simde_mm_unpackhi_pd(a, b)
7122 #endif
7123
7124 SIMDE_FUNCTION_ATTRIBUTES
7125 simde__m128i
simde_mm_unpacklo_epi8(simde__m128i a,simde__m128i b)7126 simde_mm_unpacklo_epi8 (simde__m128i a, simde__m128i b) {
7127 #if defined(SIMDE_X86_SSE2_NATIVE)
7128 return _mm_unpacklo_epi8(a, b);
7129 #else
7130 simde__m128i_private
7131 r_,
7132 a_ = simde__m128i_to_private(a),
7133 b_ = simde__m128i_to_private(b);
7134
7135 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7136 r_.neon_i8 = vzip1q_s8(a_.neon_i8, b_.neon_i8);
7137 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7138 int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(a_.neon_i16));
7139 int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(b_.neon_i16));
7140 int8x8x2_t result = vzip_s8(a1, b1);
7141 r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]);
7142 #elif defined(SIMDE_SHUFFLE_VECTOR_)
7143 r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
7144 #else
7145 SIMDE_VECTORIZE
7146 for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i8[0])) / 2) ; i++) {
7147 r_.i8[(i * 2)] = a_.i8[i];
7148 r_.i8[(i * 2) + 1] = b_.i8[i];
7149 }
7150 #endif
7151
7152 return simde__m128i_from_private(r_);
7153 #endif
7154 }
7155 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7156 #define _mm_unpacklo_epi8(a, b) simde_mm_unpacklo_epi8(a, b)
7157 #endif
7158
7159 SIMDE_FUNCTION_ATTRIBUTES
7160 simde__m128i
simde_mm_unpacklo_epi16(simde__m128i a,simde__m128i b)7161 simde_mm_unpacklo_epi16 (simde__m128i a, simde__m128i b) {
7162 #if defined(SIMDE_X86_SSE2_NATIVE)
7163 return _mm_unpacklo_epi16(a, b);
7164 #else
7165 simde__m128i_private
7166 r_,
7167 a_ = simde__m128i_to_private(a),
7168 b_ = simde__m128i_to_private(b);
7169
7170 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7171 r_.neon_i16 = vzip1q_s16(a_.neon_i16, b_.neon_i16);
7172 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7173 int16x4_t a1 = vget_low_s16(a_.neon_i16);
7174 int16x4_t b1 = vget_low_s16(b_.neon_i16);
7175 int16x4x2_t result = vzip_s16(a1, b1);
7176 r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]);
7177 #elif defined(SIMDE_SHUFFLE_VECTOR_)
7178 r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 0, 8, 1, 9, 2, 10, 3, 11);
7179 #else
7180 SIMDE_VECTORIZE
7181 for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i16[0])) / 2) ; i++) {
7182 r_.i16[(i * 2)] = a_.i16[i];
7183 r_.i16[(i * 2) + 1] = b_.i16[i];
7184 }
7185 #endif
7186
7187 return simde__m128i_from_private(r_);
7188 #endif
7189 }
7190 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7191 #define _mm_unpacklo_epi16(a, b) simde_mm_unpacklo_epi16(a, b)
7192 #endif
7193
7194 SIMDE_FUNCTION_ATTRIBUTES
7195 simde__m128i
simde_mm_unpacklo_epi32(simde__m128i a,simde__m128i b)7196 simde_mm_unpacklo_epi32 (simde__m128i a, simde__m128i b) {
7197 #if defined(SIMDE_X86_SSE2_NATIVE)
7198 return _mm_unpacklo_epi32(a, b);
7199 #else
7200 simde__m128i_private
7201 r_,
7202 a_ = simde__m128i_to_private(a),
7203 b_ = simde__m128i_to_private(b);
7204
7205 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7206 r_.neon_i32 = vzip1q_s32(a_.neon_i32, b_.neon_i32);
7207 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7208 int32x2_t a1 = vget_low_s32(a_.neon_i32);
7209 int32x2_t b1 = vget_low_s32(b_.neon_i32);
7210 int32x2x2_t result = vzip_s32(a1, b1);
7211 r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]);
7212 #elif defined(SIMDE_SHUFFLE_VECTOR_)
7213 r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 0, 4, 1, 5);
7214 #else
7215 SIMDE_VECTORIZE
7216 for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i32[0])) / 2) ; i++) {
7217 r_.i32[(i * 2)] = a_.i32[i];
7218 r_.i32[(i * 2) + 1] = b_.i32[i];
7219 }
7220 #endif
7221
7222 return simde__m128i_from_private(r_);
7223 #endif
7224 }
7225 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7226 #define _mm_unpacklo_epi32(a, b) simde_mm_unpacklo_epi32(a, b)
7227 #endif
7228
7229 SIMDE_FUNCTION_ATTRIBUTES
7230 simde__m128i
simde_mm_unpacklo_epi64(simde__m128i a,simde__m128i b)7231 simde_mm_unpacklo_epi64 (simde__m128i a, simde__m128i b) {
7232 #if defined(SIMDE_X86_SSE2_NATIVE)
7233 return _mm_unpacklo_epi64(a, b);
7234 #else
7235 simde__m128i_private
7236 r_,
7237 a_ = simde__m128i_to_private(a),
7238 b_ = simde__m128i_to_private(b);
7239
7240 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7241 int64x1_t a_l = vget_low_s64(a_.neon_i64);
7242 int64x1_t b_l = vget_low_s64(b_.neon_i64);
7243 r_.neon_i64 = vcombine_s64(a_l, b_l);
7244 #elif defined(SIMDE_SHUFFLE_VECTOR_)
7245 r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 0, 2);
7246 #else
7247 SIMDE_VECTORIZE
7248 for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i64[0])) / 2) ; i++) {
7249 r_.i64[(i * 2)] = a_.i64[i];
7250 r_.i64[(i * 2) + 1] = b_.i64[i];
7251 }
7252 #endif
7253
7254 return simde__m128i_from_private(r_);
7255 #endif
7256 }
7257 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7258 #define _mm_unpacklo_epi64(a, b) simde_mm_unpacklo_epi64(a, b)
7259 #endif
7260
7261 SIMDE_FUNCTION_ATTRIBUTES
7262 simde__m128d
simde_mm_unpacklo_pd(simde__m128d a,simde__m128d b)7263 simde_mm_unpacklo_pd (simde__m128d a, simde__m128d b) {
7264 #if defined(SIMDE_X86_SSE2_NATIVE)
7265 return _mm_unpacklo_pd(a, b);
7266 #else
7267 simde__m128d_private
7268 r_,
7269 a_ = simde__m128d_to_private(a),
7270 b_ = simde__m128d_to_private(b);
7271
7272 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7273 float64x1_t a_l = vget_low_f64(a_.f64);
7274 float64x1_t b_l = vget_low_f64(b_.f64);
7275 r_.neon_f64 = vcombine_f64(a_l, b_l);
7276 #elif defined(SIMDE_SHUFFLE_VECTOR_)
7277 r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 0, 2);
7278 #else
7279 SIMDE_VECTORIZE
7280 for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2) ; i++) {
7281 r_.f64[(i * 2)] = a_.f64[i];
7282 r_.f64[(i * 2) + 1] = b_.f64[i];
7283 }
7284 #endif
7285
7286 return simde__m128d_from_private(r_);
7287 #endif
7288 }
7289 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7290 #define _mm_unpacklo_pd(a, b) simde_mm_unpacklo_pd(a, b)
7291 #endif
7292
7293 SIMDE_FUNCTION_ATTRIBUTES
7294 simde__m128d
simde_x_mm_negate_pd(simde__m128d a)7295 simde_x_mm_negate_pd(simde__m128d a) {
7296 #if defined(SIMDE_X86_SSE2_NATIVE)
7297 return simde_mm_xor_pd(a, _mm_set1_pd(SIMDE_FLOAT64_C(-0.0)));
7298 #else
7299 simde__m128d_private
7300 r_,
7301 a_ = simde__m128d_to_private(a);
7302
7303 #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && \
7304 (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,1,0))
7305 r_.altivec_f64 = vec_neg(a_.altivec_f64);
7306 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7307 r_.neon_f64 = vnegq_f64(a_.neon_f64);
7308 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
7309 r_.wasm_v128 = wasm_f64x2_neg(a_.wasm_v128);
7310 #elif defined(SIMDE_VECTOR_NEGATE)
7311 r_.f64 = -a_.f64;
7312 #else
7313 SIMDE_VECTORIZE
7314 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
7315 r_.f64[i] = -a_.f64[i];
7316 }
7317 #endif
7318
7319 return simde__m128d_from_private(r_);
7320 #endif
7321 }
7322
7323 SIMDE_FUNCTION_ATTRIBUTES
7324 simde__m128i
simde_mm_xor_si128(simde__m128i a,simde__m128i b)7325 simde_mm_xor_si128 (simde__m128i a, simde__m128i b) {
7326 #if defined(SIMDE_X86_SSE2_NATIVE)
7327 return _mm_xor_si128(a, b);
7328 #else
7329 simde__m128i_private
7330 r_,
7331 a_ = simde__m128i_to_private(a),
7332 b_ = simde__m128i_to_private(b);
7333
7334 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7335 r_.neon_i32 = veorq_s32(a_.neon_i32, b_.neon_i32);
7336 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
7337 r_.altivec_i32 = vec_xor(a_.altivec_i32, b_.altivec_i32);
7338 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
7339 r_.i32f = a_.i32f ^ b_.i32f;
7340 #else
7341 SIMDE_VECTORIZE
7342 for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
7343 r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i];
7344 }
7345 #endif
7346
7347 return simde__m128i_from_private(r_);
7348 #endif
7349 }
7350 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7351 #define _mm_xor_si128(a, b) simde_mm_xor_si128(a, b)
7352 #endif
7353
7354 SIMDE_FUNCTION_ATTRIBUTES
7355 simde__m128i
simde_x_mm_not_si128(simde__m128i a)7356 simde_x_mm_not_si128 (simde__m128i a) {
7357 #if defined(SIMDE_X86_AVX512VL_NATIVE)
7358 return _mm_ternarylogic_epi32(a, a, a, 0x55);
7359 #else
7360 simde__m128i_private
7361 r_,
7362 a_ = simde__m128i_to_private(a);
7363
7364 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7365 r_.neon_i32 = vmvnq_s32(a_.neon_i32);
7366 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
7367 r_.altivec_i32 = vec_nor(a_.altivec_i32, a_.altivec_i32);
7368 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
7369 r_.wasm_v128 = wasm_v128_not(a_.wasm_v128);
7370 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
7371 r_.i32f = ~a_.i32f;
7372 #else
7373 SIMDE_VECTORIZE
7374 for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
7375 r_.i32f[i] = ~(a_.i32f[i]);
7376 }
7377 #endif
7378
7379 return simde__m128i_from_private(r_);
7380 #endif
7381 }
7382
/* Build the 2-bit immediate used by _mm_shuffle_pd and friends:
 * bit 0 (y) selects the lane taken from the first operand and
 * bit 1 (x) the lane taken from the second. */
#define SIMDE_MM_SHUFFLE2(x, y) (((x) << 1) | (y))
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _MM_SHUFFLE2(x, y) SIMDE_MM_SHUFFLE2(x, y)
#endif
7387
7388 SIMDE_END_DECLS_
7389
7390 HEDLEY_DIAGNOSTIC_POP
7391
7392 #endif /* !defined(SIMDE_X86_SSE2_H) */
7393