1 /* SPDX-License-Identifier: MIT
2 *
3 * Permission is hereby granted, free of charge, to any person
4 * obtaining a copy of this software and associated documentation
5 * files (the "Software"), to deal in the Software without
6 * restriction, including without limitation the rights to use, copy,
7 * modify, merge, publish, distribute, sublicense, and/or sell copies
8 * of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be
12 * included in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Copyright:
24 * 2017-2020 Evan Nemerson <evan@nemerson.com>
25 * 2015-2017 John W. Ratcliff <jratcliffscarab@gmail.com>
26 * 2015 Brandon Rowlett <browlett@nvidia.com>
27 * 2015 Ken Fast <kfast@gdeb.com>
28 * 2017 Hasindu Gamaarachchi <hasindu@unsw.edu.au>
29 * 2018 Jeff Daily <jeff.daily@amd.com>
30 */
31
32 #if !defined(SIMDE_X86_SSE2_H)
33 #define SIMDE_X86_SSE2_H
34
35 #include "sse.h"
36
37 HEDLEY_DIAGNOSTIC_PUSH
38 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
39 SIMDE_BEGIN_DECLS_
40
41 typedef union {
42 #if defined(SIMDE_VECTOR_SUBSCRIPT)
43 SIMDE_ALIGN_TO_16 int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
44 SIMDE_ALIGN_TO_16 int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
45 SIMDE_ALIGN_TO_16 int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
46 SIMDE_ALIGN_TO_16 int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
47 SIMDE_ALIGN_TO_16 uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
48 SIMDE_ALIGN_TO_16 uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
49 SIMDE_ALIGN_TO_16 uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
50 SIMDE_ALIGN_TO_16 uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
51 #if defined(SIMDE_HAVE_INT128_)
52 SIMDE_ALIGN_TO_16 simde_int128 i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
53 SIMDE_ALIGN_TO_16 simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
54 #endif
55 SIMDE_ALIGN_TO_16 simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
56 SIMDE_ALIGN_TO_16 simde_float64 f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
57
58 SIMDE_ALIGN_TO_16 int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
59 SIMDE_ALIGN_TO_16 uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
60 #else
61 SIMDE_ALIGN_TO_16 int8_t i8[16];
62 SIMDE_ALIGN_TO_16 int16_t i16[8];
63 SIMDE_ALIGN_TO_16 int32_t i32[4];
64 SIMDE_ALIGN_TO_16 int64_t i64[2];
65 SIMDE_ALIGN_TO_16 uint8_t u8[16];
66 SIMDE_ALIGN_TO_16 uint16_t u16[8];
67 SIMDE_ALIGN_TO_16 uint32_t u32[4];
68 SIMDE_ALIGN_TO_16 uint64_t u64[2];
69 #if defined(SIMDE_HAVE_INT128_)
70 SIMDE_ALIGN_TO_16 simde_int128 i128[1];
71 SIMDE_ALIGN_TO_16 simde_uint128 u128[1];
72 #endif
73 SIMDE_ALIGN_TO_16 simde_float32 f32[4];
74 SIMDE_ALIGN_TO_16 simde_float64 f64[2];
75
76 SIMDE_ALIGN_TO_16 int_fast32_t i32f[16 / sizeof(int_fast32_t)];
77 SIMDE_ALIGN_TO_16 uint_fast32_t u32f[16 / sizeof(uint_fast32_t)];
78 #endif
79
80 SIMDE_ALIGN_TO_16 simde__m64_private m64_private[2];
81 SIMDE_ALIGN_TO_16 simde__m64 m64[2];
82
83 #if defined(SIMDE_X86_SSE2_NATIVE)
84 SIMDE_ALIGN_TO_16 __m128i n;
85 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
86 SIMDE_ALIGN_TO_16 int8x16_t neon_i8;
87 SIMDE_ALIGN_TO_16 int16x8_t neon_i16;
88 SIMDE_ALIGN_TO_16 int32x4_t neon_i32;
89 SIMDE_ALIGN_TO_16 int64x2_t neon_i64;
90 SIMDE_ALIGN_TO_16 uint8x16_t neon_u8;
91 SIMDE_ALIGN_TO_16 uint16x8_t neon_u16;
92 SIMDE_ALIGN_TO_16 uint32x4_t neon_u32;
93 SIMDE_ALIGN_TO_16 uint64x2_t neon_u64;
94 SIMDE_ALIGN_TO_16 float32x4_t neon_f32;
95 #if defined(SIMDE_ARCH_AARCH64)
96 SIMDE_ALIGN_TO_16 float64x2_t neon_f64;
97 #endif
98 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
99 SIMDE_ALIGN_TO_16 v128_t wasm_v128;
100 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
101 SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8;
102 SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16;
103 SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32;
104 #if defined(__UINT_FAST32_TYPE__) && defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
105 SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(__INT_FAST32_TYPE__) altivec_i32f;
106 #else
107 SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32f;
108 #endif
109 SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8;
110 SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16;
111 SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32;
112 #if defined(__UINT_FAST32_TYPE__) && defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
113 SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(__UINT_FAST32_TYPE__) altivec_u32f;
114 #else
115 SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32f;
116 #endif
117 SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32;
118 #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
119 SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64;
120 SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64;
121 SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64;
122 #endif
123 #endif
124 } simde__m128i_private;
125
/* Private, aliasable representation of a 128-bit double-precision vector
 * (__m128d).  All members overlay the same 16 bytes; layout mirrors
 * simde__m128i_private. */
typedef union {
  #if defined(SIMDE_VECTOR_SUBSCRIPT)
    /* GCC/Clang vector extensions: subscriptable 16-byte vectors. */
    SIMDE_ALIGN_TO_16 int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
    SIMDE_ALIGN_TO_16 int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
    SIMDE_ALIGN_TO_16 int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
    SIMDE_ALIGN_TO_16 int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
    SIMDE_ALIGN_TO_16 uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
    SIMDE_ALIGN_TO_16 uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
    SIMDE_ALIGN_TO_16 uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
    SIMDE_ALIGN_TO_16 uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
    SIMDE_ALIGN_TO_16 simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
    SIMDE_ALIGN_TO_16 simde_float64 f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
    SIMDE_ALIGN_TO_16 int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
    SIMDE_ALIGN_TO_16 uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  #else
    /* Plain-array fallback when vector extensions are unavailable. */
    SIMDE_ALIGN_TO_16 int8_t i8[16];
    SIMDE_ALIGN_TO_16 int16_t i16[8];
    SIMDE_ALIGN_TO_16 int32_t i32[4];
    SIMDE_ALIGN_TO_16 int64_t i64[2];
    SIMDE_ALIGN_TO_16 uint8_t u8[16];
    SIMDE_ALIGN_TO_16 uint16_t u16[8];
    SIMDE_ALIGN_TO_16 uint32_t u32[4];
    SIMDE_ALIGN_TO_16 uint64_t u64[2];
    SIMDE_ALIGN_TO_16 simde_float32 f32[4];
    SIMDE_ALIGN_TO_16 simde_float64 f64[2];
    SIMDE_ALIGN_TO_16 int_fast32_t i32f[16 / sizeof(int_fast32_t)];
    SIMDE_ALIGN_TO_16 uint_fast32_t u32f[16 / sizeof(uint_fast32_t)];
  #endif

    SIMDE_ALIGN_TO_16 simde__m64_private m64_private[2];
    SIMDE_ALIGN_TO_16 simde__m64 m64[2];

  #if defined(SIMDE_X86_SSE2_NATIVE)
    SIMDE_ALIGN_TO_16 __m128d n;
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    SIMDE_ALIGN_TO_16 int8x16_t neon_i8;
    SIMDE_ALIGN_TO_16 int16x8_t neon_i16;
    SIMDE_ALIGN_TO_16 int32x4_t neon_i32;
    SIMDE_ALIGN_TO_16 int64x2_t neon_i64;
    SIMDE_ALIGN_TO_16 uint8x16_t neon_u8;
    SIMDE_ALIGN_TO_16 uint16x8_t neon_u16;
    SIMDE_ALIGN_TO_16 uint32x4_t neon_u32;
    SIMDE_ALIGN_TO_16 uint64x2_t neon_u64;
    SIMDE_ALIGN_TO_16 float32x4_t neon_f32;
    /* float64x2_t only exists on AArch64. */
    #if defined(SIMDE_ARCH_AARCH64)
    SIMDE_ALIGN_TO_16 float64x2_t neon_f64;
    #endif
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    SIMDE_ALIGN_TO_16 v128_t wasm_v128;
  #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
    SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8;
    SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16;
    SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32;
    /* Use the compiler's fast-int type for the i32f view when available. */
    #if defined(__INT_FAST32_TYPE__) && defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
    SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(__INT_FAST32_TYPE__) altivec_i32f;
    #else
    SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32f;
    #endif
    SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8;
    SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16;
    SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32;
    #if defined(__UINT_FAST32_TYPE__) && defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
    SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(__UINT_FAST32_TYPE__) altivec_u32f;
    #else
    SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32f;
    #endif
    SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32;
    /* 64-bit element vectors require POWER7 (VSX) or later. */
    #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
    SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64;
    SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64;
    SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64;
    #endif
  #endif
} simde__m128d_private;
200
/* Public simde__m128i / simde__m128d types: use the native register type
 * when one is available, otherwise fall back to compiler vector extensions
 * or the private union. */
#if defined(SIMDE_X86_SSE2_NATIVE)
typedef __m128i simde__m128i;
typedef __m128d simde__m128d;
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
typedef int64x2_t simde__m128i;
# if defined(SIMDE_ARCH_AARCH64)
typedef float64x2_t simde__m128d;
# elif defined(SIMDE_VECTOR_SUBSCRIPT)
/* 32-bit ARM has no float64x2_t; emulate with a vector extension type. */
typedef simde_float64 simde__m128d SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
# else
typedef simde__m128d_private simde__m128d;
# endif
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
typedef v128_t simde__m128i;
typedef v128_t simde__m128d;
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
/* NOTE(review): m128i maps to vector *float* here; lane reinterpretation is
 * done through the private union — confirm this matches upstream intent. */
typedef SIMDE_POWER_ALTIVEC_VECTOR(float) simde__m128i;
#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
typedef SIMDE_POWER_ALTIVEC_VECTOR(double) simde__m128d;
#else
typedef simde__m128d_private simde__m128d;
#endif
#elif defined(SIMDE_VECTOR_SUBSCRIPT)
typedef int64_t simde__m128i SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
typedef simde_float64 simde__m128d SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
#else
/* Last resort: the private unions themselves. */
typedef simde__m128i_private simde__m128i;
typedef simde__m128d_private simde__m128d;
#endif
230
/* When native aliasing is enabled, expose the SIMDe types under the Intel
 * names so existing SSE2 code compiles unmodified. */
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
typedef simde__m128i __m128i;
typedef simde__m128d __m128d;
#endif

/* Public and private types must be exactly 16 bytes (and, when checkable,
 * 16-byte aligned) so the memcpy-based to/from-private conversions and
 * union overlays are valid. */
HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i), "simde__m128i size incorrect");
HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i_private), "simde__m128i_private size incorrect");
HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d), "simde__m128d size incorrect");
HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d_private), "simde__m128d_private size incorrect");
#if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF)
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128i) == 16, "simde__m128i is not 16-byte aligned");
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128i_private) == 16, "simde__m128i_private is not 16-byte aligned");
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128d) == 16, "simde__m128d is not 16-byte aligned");
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128d_private) == 16, "simde__m128d_private is not 16-byte aligned");
#endif
246
247 SIMDE_FUNCTION_ATTRIBUTES
248 simde__m128i
simde__m128i_from_private(simde__m128i_private v)249 simde__m128i_from_private(simde__m128i_private v) {
250 simde__m128i r;
251 simde_memcpy(&r, &v, sizeof(r));
252 return r;
253 }
254
255 SIMDE_FUNCTION_ATTRIBUTES
256 simde__m128i_private
simde__m128i_to_private(simde__m128i v)257 simde__m128i_to_private(simde__m128i v) {
258 simde__m128i_private r;
259 simde_memcpy(&r, &v, sizeof(r));
260 return r;
261 }
262
263 SIMDE_FUNCTION_ATTRIBUTES
264 simde__m128d
simde__m128d_from_private(simde__m128d_private v)265 simde__m128d_from_private(simde__m128d_private v) {
266 simde__m128d r;
267 simde_memcpy(&r, &v, sizeof(r));
268 return r;
269 }
270
271 SIMDE_FUNCTION_ATTRIBUTES
272 simde__m128d_private
simde__m128d_to_private(simde__m128d v)273 simde__m128d_to_private(simde__m128d v) {
274 simde__m128d_private r;
275 simde_memcpy(&r, &v, sizeof(r));
276 return r;
277 }
278
279 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i,int8x16_t,neon,i8)280 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int8x16_t, neon, i8)
281 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int16x8_t, neon, i16)
282 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int32x4_t, neon, i32)
283 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int64x2_t, neon, i64)
284 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint8x16_t, neon, u8)
285 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint16x8_t, neon, u16)
286 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint32x4_t, neon, u32)
287 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint64x2_t, neon, u64)
288 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, float32x4_t, neon, f32)
289 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
290 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, float64x2_t, neon, f64)
291 #endif
292 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
293 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed char), altivec, i8)
294 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed short), altivec, i16)
295 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed int), altivec, i32)
296 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8)
297 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16)
298 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), altivec, u32)
299 #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
300 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64)
301 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64)
302 #endif
303 #endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
304
305 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
306 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int8x16_t, neon, i8)
307 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int16x8_t, neon, i16)
308 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int32x4_t, neon, i32)
309 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int64x2_t, neon, i64)
310 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint8x16_t, neon, u8)
311 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint16x8_t, neon, u16)
312 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint32x4_t, neon, u32)
313 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint64x2_t, neon, u64)
314 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, float32x4_t, neon, f32)
315 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
316 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, float64x2_t, neon, f64)
317 #endif
318 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
319 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed char), altivec, i8)
320 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed short), altivec, i16)
321 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed int), altivec, i32)
322 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8)
323 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16)
324 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), altivec, u32)
325 #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
326 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64)
327 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64)
328 #if defined(SIMDE_BUG_GCC_95782)
329 SIMDE_FUNCTION_ATTRIBUTES
330 SIMDE_POWER_ALTIVEC_VECTOR(double)
331 simde__m128d_to_altivec_f64(simde__m128d value) {
332 simde__m128d_private r_ = simde__m128d_to_private(value);
333 return r_.altivec_f64;
334 }
335
336 SIMDE_FUNCTION_ATTRIBUTES
337 simde__m128d
338 simde__m128d_from_altivec_f64(SIMDE_POWER_ALTIVEC_VECTOR(double) value) {
339 simde__m128d_private r_;
340 r_.altivec_f64 = value;
341 return simde__m128d_from_private(r_);
342 }
343 #else
344 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(double), altivec, f64)
345 #endif
346 #endif
347 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
348 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v128_t, wasm, v128);
349 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v128_t, wasm, v128);
350 #endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
351
/* simde_mm_set_pd: build a double vector with e1 in the high lane and e0 in
 * the low lane, matching _mm_set_pd's argument order. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_set_pd (simde_float64 e1, simde_float64 e0) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_set_pd(e1, e0);
  #else
    simde__m128d_private r_;

    #if defined(SIMDE_WASM_SIMD128_NATIVE)
      /* wasm_f64x2_make takes lanes low-to-high, so e0 comes first. */
      r_.wasm_v128 = wasm_f64x2_make(e0, e1);
    #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* vld1q_f64 loads memory order = lane order, so data[0] is lane 0. */
      SIMDE_ALIGN_TO_16 simde_float64 data[2] = { e0, e1 };
      r_.neon_f64 = vld1q_f64(data);
    #else
      r_.f64[0] = e0;
      r_.f64[1] = e1;
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_set_pd(e1, e0) simde_mm_set_pd(e1, e0)
#endif
376
377 SIMDE_FUNCTION_ATTRIBUTES
378 simde__m128d
simde_mm_set1_pd(simde_float64 a)379 simde_mm_set1_pd (simde_float64 a) {
380 #if defined(SIMDE_X86_SSE2_NATIVE)
381 return _mm_set1_pd(a);
382 #else
383 simde__m128d_private r_;
384
385 #if defined(SIMDE_WASM_SIMD128_NATIVE)
386 r_.wasm_v128 = wasm_f64x2_splat(a);
387 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
388 r_.neon_f64 = vdupq_n_f64(a);
389 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
390 r_.altivec_f64 = vec_splats(HEDLEY_STATIC_CAST(double, a));
391 #else
392 SIMDE_VECTORIZE
393 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
394 r_.f64[i] = a;
395 }
396 #endif
397
398 return simde__m128d_from_private(r_);
399 #endif
400 }
401 #define simde_mm_set_pd1(a) simde_mm_set1_pd(a)
402 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
403 #define _mm_set1_pd(a) simde_mm_set1_pd(a)
404 #define _mm_set_pd1(a) simde_mm_set1_pd(a)
405 #endif
406
407 SIMDE_FUNCTION_ATTRIBUTES
408 simde__m128d
simde_x_mm_abs_pd(simde__m128d a)409 simde_x_mm_abs_pd(simde__m128d a) {
410 #if defined(SIMDE_X86_AVX512F_NATIVE) && \
411 (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,4,0))
412 return _mm512_castpd512_pd128(_mm512_abs_pd(_mm512_castpd128_pd512(a)));
413 #else
414 simde__m128d_private
415 r_,
416 a_ = simde__m128d_to_private(a);
417
418 #if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
419 r_.neon_f32 = vabsq_f32(a_.neon_f32);
420 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
421 r_.altivec_f32 = vec_abs(a_.altivec_f32);
422 #else
423 SIMDE_VECTORIZE
424 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
425 r_.f64[i] = simde_math_fabs(a_.f64[i]);
426 }
427 #endif
428
429 return simde__m128d_from_private(r_);
430 #endif
431 }
432
433 SIMDE_FUNCTION_ATTRIBUTES
434 simde__m128d
simde_x_mm_not_pd(simde__m128d a)435 simde_x_mm_not_pd(simde__m128d a) {
436 #if defined(SIMDE_X86_AVX512VL_NATIVE)
437 __m128i ai = _mm_castpd_si128(a);
438 return _mm_castsi128_pd(_mm_ternarylogic_epi64(ai, ai, ai, 0x55));
439 #else
440 simde__m128d_private
441 r_,
442 a_ = simde__m128d_to_private(a);
443
444 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
445 r_.neon_i32 = vmvnq_s32(a_.neon_i32);
446 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
447 r_.altivec_f64 = vec_nor(a_.altivec_f64, a_.altivec_f64);
448 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
449 r_.altivec_i32 = vec_nor(a_.altivec_i32, a_.altivec_i32);
450 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
451 r_.wasm_v128 = wasm_v128_not(a_.wasm_v128);
452 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
453 r_.i32f = ~a_.i32f;
454 #else
455 SIMDE_VECTORIZE
456 for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
457 r_.i32f[i] = ~(a_.i32f[i]);
458 }
459 #endif
460
461 return simde__m128d_from_private(r_);
462 #endif
463 }
464
465 SIMDE_FUNCTION_ATTRIBUTES
466 simde__m128d
simde_x_mm_select_pd(simde__m128d a,simde__m128d b,simde__m128d mask)467 simde_x_mm_select_pd(simde__m128d a, simde__m128d b, simde__m128d mask) {
468 /* This function is for when you want to blend two elements together
469 * according to a mask. It is similar to _mm_blendv_pd, except that
470 * it is undefined whether the blend is based on the highest bit in
471 * each lane (like blendv) or just bitwise operations. This allows
472 * us to implement the function efficiently everywhere.
473 *
474 * Basically, you promise that all the lanes in mask are either 0 or
475 * ~0. */
476 #if defined(SIMDE_X86_SSE4_1_NATIVE)
477 return _mm_blendv_pd(a, b, mask);
478 #else
479 simde__m128d_private
480 r_,
481 a_ = simde__m128d_to_private(a),
482 b_ = simde__m128d_to_private(b),
483 mask_ = simde__m128d_to_private(mask);
484
485 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
486 r_.i64 = a_.i64 ^ ((a_.i64 ^ b_.i64) & mask_.i64);
487 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
488 r_.neon_i64 = vbslq_s64(mask_.neon_u64, b_.neon_i64, a_.neon_i64);
489 #else
490 SIMDE_VECTORIZE
491 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
492 r_.i64[i] = a_.i64[i] ^ ((a_.i64[i] ^ b_.i64[i]) & mask_.i64[i]);
493 }
494 #endif
495
496 return simde__m128d_from_private(r_);
497 #endif
498 }
499
/* simde_mm_add_epi8: lane-wise addition of sixteen 8-bit integers,
 * equivalent to _mm_add_epi8. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_add_epi8 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_add_epi8(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i8 = vaddq_s8(a_.neon_i8, b_.neon_i8);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i8 = vec_add(a_.altivec_i8, b_.altivec_i8);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i8x16_add(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i8 = a_.i8 + b_.i8;
    #else
      /* Portable scalar fallback. */
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
        r_.i8[i] = a_.i8[i] + b_.i8[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_add_epi8(a, b) simde_mm_add_epi8(a, b)
#endif
532
/* simde_mm_add_epi16: lane-wise addition of eight 16-bit integers,
 * equivalent to _mm_add_epi16. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_add_epi16 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_add_epi16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i16 = vaddq_s16(a_.neon_i16, b_.neon_i16);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i16 = vec_add(a_.altivec_i16, b_.altivec_i16);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i16x8_add(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i16 = a_.i16 + b_.i16;
    #else
      /* Portable scalar fallback. */
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        r_.i16[i] = a_.i16[i] + b_.i16[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_add_epi16(a, b) simde_mm_add_epi16(a, b)
#endif
565
/* simde_mm_add_epi32: lane-wise addition of four 32-bit integers,
 * equivalent to _mm_add_epi32. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_add_epi32 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_add_epi32(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i32 = vaddq_s32(a_.neon_i32, b_.neon_i32);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i32 = vec_add(a_.altivec_i32, b_.altivec_i32);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i32x4_add(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32 = a_.i32 + b_.i32;
    #else
      /* Portable scalar fallback. */
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        r_.i32[i] = a_.i32[i] + b_.i32[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_add_epi32(a, b) simde_mm_add_epi32(a, b)
#endif
598
/* simde_mm_add_epi64: lane-wise addition of two 64-bit integers,
 * equivalent to _mm_add_epi64. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_add_epi64 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_add_epi64(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i64 = vaddq_s64(a_.neon_i64, b_.neon_i64);
    #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
      /* 64-bit vec_add requires POWER8, unlike the narrower widths. */
      r_.altivec_i64 = vec_add(a_.altivec_i64, b_.altivec_i64);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i64x2_add(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i64 = a_.i64 + b_.i64;
    #else
      /* Portable scalar fallback. */
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
        r_.i64[i] = a_.i64[i] + b_.i64[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_add_epi64(a, b) simde_mm_add_epi64(a, b)
#endif
631
632 SIMDE_FUNCTION_ATTRIBUTES
633 simde__m128d
simde_mm_add_pd(simde__m128d a,simde__m128d b)634 simde_mm_add_pd (simde__m128d a, simde__m128d b) {
635 #if defined(SIMDE_X86_SSE2_NATIVE)
636 return _mm_add_pd(a, b);
637 #else
638 simde__m128d_private
639 r_,
640 a_ = simde__m128d_to_private(a),
641 b_ = simde__m128d_to_private(b);
642
643 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
644 r_.neon_f64 = vaddq_f64(a_.neon_f64, b_.neon_f64);
645 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
646 r_.wasm_v128 = wasm_f64x2_add(a_.wasm_v128, b_.wasm_v128);
647 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
648 r_.altivec_f64 = vec_add(a_.altivec_f64, b_.altivec_f64);
649 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
650 r_.wasm_v128 = wasm_f64x2_add(a_.wasm_v128, b_.wasm_v128);
651 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
652 r_.f64 = a_.f64 + b_.f64;
653 #else
654 SIMDE_VECTORIZE
655 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
656 r_.f64[i] = a_.f64[i] + b_.f64[i];
657 }
658 #endif
659
660 return simde__m128d_from_private(r_);
661 #endif
662 }
663 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
664 #define _mm_add_pd(a, b) simde_mm_add_pd(a, b)
665 #endif
666
/* simde_mm_move_sd: result = { b[0], a[1] } — take the low lane from b and
 * the high lane from a, equivalent to _mm_move_sd. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_move_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_move_sd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vsetq_lane_f64(vgetq_lane_f64(b_.neon_f64, 0), a_.neon_f64, 0);
    #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
      /* xlC and GCC/Clang disagree on vec_xxpermdi operand order. */
      #if defined(HEDLEY_IBM_VERSION)
        r_.altivec_f64 = vec_xxpermdi(a_.altivec_f64, b_.altivec_f64, 1);
      #else
        r_.altivec_f64 = vec_xxpermdi(b_.altivec_f64, a_.altivec_f64, 1);
      #endif
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      /* Shuffle indices: 2 = b lane 0, 1 = a lane 1. */
      r_.wasm_v128 = wasm_v64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 2, 1);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 2, 1);
    #else
      r_.f64[0] = b_.f64[0];
      r_.f64[1] = a_.f64[1];
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_move_sd(a, b) simde_mm_move_sd(a, b)
#endif
701
/* simde_mm_add_sd: add the low lanes of a and b; the high lane is copied
 * from a.  Equivalent to _mm_add_sd. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_add_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_add_sd(a, b);
  #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
    /* Full-width add, then keep only the low lane of the sum. */
    return simde_mm_move_sd(a, simde_mm_add_pd(a, b));
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    r_.f64[0] = a_.f64[0] + b_.f64[0];
    r_.f64[1] = a_.f64[1];

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_add_sd(a, b) simde_mm_add_sd(a, b)
#endif
724
/* simde_mm_add_si64: add two 64-bit MMX values, equivalent to _mm_add_si64.
 * Requires both SSE2 and MMX natively since it touches MMX state. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m64
simde_mm_add_si64 (simde__m64 a, simde__m64 b) {
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    return _mm_add_si64(a, b);
  #else
    simde__m64_private
      r_,
      a_ = simde__m64_to_private(a),
      b_ = simde__m64_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i64 = vadd_s64(a_.neon_i64, b_.neon_i64);
    #else
      r_.i64[0] = a_.i64[0] + b_.i64[0];
    #endif

    return simde__m64_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_add_si64(a, b) simde_mm_add_si64(a, b)
#endif
748
/* simde_mm_adds_epi8: lane-wise *saturating* addition of sixteen signed
 * 8-bit integers (clamped to [INT8_MIN, INT8_MAX]), equivalent to
 * _mm_adds_epi8. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_adds_epi8 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_adds_epi8(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i8 = vqaddq_s8(a_.neon_i8, b_.neon_i8);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i8x16_add_saturate(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i8 = vec_adds(a_.altivec_i8, b_.altivec_i8);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
        /* Compute in a wider type, then clamp to the int8_t range. */
        const int_fast16_t tmp =
          HEDLEY_STATIC_CAST(int_fast16_t, a_.i8[i]) +
          HEDLEY_STATIC_CAST(int_fast16_t, b_.i8[i]);
        r_.i8[i] = HEDLEY_STATIC_CAST(int8_t, ((tmp < INT8_MAX) ? ((tmp > INT8_MIN) ? tmp : INT8_MIN) : INT8_MAX));
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_adds_epi8(a, b) simde_mm_adds_epi8(a, b)
#endif
782
/* simde_mm_adds_epi16: lane-wise *saturating* addition of eight signed
 * 16-bit integers (clamped to [INT16_MIN, INT16_MAX]), equivalent to
 * _mm_adds_epi16. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_adds_epi16 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_adds_epi16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i16 = vqaddq_s16(a_.neon_i16, b_.neon_i16);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i16x8_add_saturate(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i16 = vec_adds(a_.altivec_i16, b_.altivec_i16);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        /* Compute in a wider type, then clamp to the int16_t range. */
        const int_fast32_t tmp =
          HEDLEY_STATIC_CAST(int_fast32_t, a_.i16[i]) +
          HEDLEY_STATIC_CAST(int_fast32_t, b_.i16[i]);
        r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, ((tmp < INT16_MAX) ? ((tmp > INT16_MIN) ? tmp : INT16_MIN) : INT16_MAX));
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_adds_epi16(a, b) simde_mm_adds_epi16(a, b)
#endif
816
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_adds_epu8 (simde__m128i a, simde__m128i b) {
  /* _mm_adds_epu8: add packed unsigned 8-bit integers with saturation
   * (each lane clamped to UINT8_MAX). */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_adds_epu8(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u8 = vqaddq_u8(a_.neon_u8, b_.neon_u8);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_u8x16_add_saturate(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
      r_.altivec_u8 = vec_adds(a_.altivec_u8, b_.altivec_u8);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
        /* Overflow test without widening: saturate unless b fits in the
         * headroom remaining above a. */
        r_.u8[i] = ((UINT8_MAX - a_.u8[i]) > b_.u8[i]) ? (a_.u8[i] + b_.u8[i]) : UINT8_MAX;
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_adds_epu8(a, b) simde_mm_adds_epu8(a, b)
#endif
847
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_adds_epu16 (simde__m128i a, simde__m128i b) {
  /* _mm_adds_epu16: add packed unsigned 16-bit integers with saturation
   * (each lane clamped to UINT16_MAX). */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_adds_epu16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u16 = vqaddq_u16(a_.neon_u16, b_.neon_u16);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_u16x8_add_saturate(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_u16 = vec_adds(a_.altivec_u16, b_.altivec_u16);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
        /* Saturate unless b fits in the headroom remaining above a. */
        r_.u16[i] = ((UINT16_MAX - a_.u16[i]) > b_.u16[i]) ? (a_.u16[i] + b_.u16[i]) : UINT16_MAX;
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_adds_epu16(a, b) simde_mm_adds_epu16(a, b)
#endif
878
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_and_pd (simde__m128d a, simde__m128d b) {
  /* _mm_and_pd: bitwise AND of two double-precision vectors.  The
   * operation is on the raw bits, so the fallbacks work on the integer
   * views of the union. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_and_pd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i32 = vandq_s32(a_.neon_i32, b_.neon_i32);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
      r_.altivec_f64 = vec_and(a_.altivec_f64, b_.altivec_f64);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32f = a_.i32f & b_.i32f;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
        r_.i32f[i] = a_.i32f[i] & b_.i32f[i];
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_and_pd(a, b) simde_mm_and_pd(a, b)
#endif
911
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_and_si128 (simde__m128i a, simde__m128i b) {
  /* _mm_and_si128: bitwise AND of two 128-bit integer vectors. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_and_si128(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* Operand order swapped vs. the other branches; harmless since AND
       * is commutative. */
      r_.neon_i32 = vandq_s32(b_.neon_i32, a_.neon_i32);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_u32f = vec_and(a_.altivec_u32f, b_.altivec_u32f);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32f = a_.i32f & b_.i32f;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
        r_.i32f[i] = a_.i32f[i] & b_.i32f[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_and_si128(a, b) simde_mm_and_si128(a, b)
#endif
942
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_andnot_pd (simde__m128d a, simde__m128d b) {
  /* _mm_andnot_pd: computes (~a) & b bitwise.  Note the x86 convention:
   * the FIRST operand is the one complemented.  NEON's vbicq/AltiVec's
   * vec_andc compute (first & ~second), hence the swapped arguments. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_andnot_pd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
      r_.altivec_f64 = vec_andc(b_.altivec_f64, a_.altivec_f64);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i32f = vec_andc(b_.altivec_i32f, a_.altivec_i32f);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32f = ~a_.i32f & b_.i32f;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
        r_.u64[i] = ~a_.u64[i] & b_.u64[i];
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_andnot_pd(a, b) simde_mm_andnot_pd(a, b)
#endif
977
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_andnot_si128 (simde__m128i a, simde__m128i b) {
  /* _mm_andnot_si128: computes (~a) & b bitwise (x86 complements the
   * FIRST operand; vbicq/vec_andc complement their SECOND, hence the
   * swapped arguments). */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_andnot_si128(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i32 = vec_andc(b_.altivec_i32, a_.altivec_i32);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32f = ~a_.i32f & b_.i32f;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
        r_.i32f[i] = ~(a_.i32f[i]) & b_.i32f[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_andnot_si128(a, b) simde_mm_andnot_si128(a, b)
#endif
1008
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_xor_pd (simde__m128d a, simde__m128d b) {
  /* _mm_xor_pd: bitwise XOR of two double-precision vectors (operates on
   * raw bits via the integer views). */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_xor_pd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32f = a_.i32f ^ b_.i32f;
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_v128_xor(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i64 = veorq_s64(a_.neon_i64, b_.neon_i64);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
        r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i];
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_xor_pd(a, b) simde_mm_xor_pd(a, b)
#endif
1039
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_avg_epu8 (simde__m128i a, simde__m128i b) {
  /* _mm_avg_epu8: rounded average of packed unsigned 8-bit integers,
   * (a + b + 1) >> 1 per lane. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_avg_epu8(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u8 = vrhaddq_u8(b_.neon_u8, a_.neon_u8);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_u8x16_avgr(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_u8 = vec_avg(a_.altivec_u8, b_.altivec_u8);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_)
      /* Widen to 16 bits so a + b + 1 cannot wrap, average, narrow back. */
      uint16_t wa SIMDE_VECTOR(32);
      uint16_t wb SIMDE_VECTOR(32);
      uint16_t wr SIMDE_VECTOR(32);
      SIMDE_CONVERT_VECTOR_(wa, a_.u8);
      SIMDE_CONVERT_VECTOR_(wb, b_.u8);
      wr = (wa + wb + 1) >> 1;
      SIMDE_CONVERT_VECTOR_(r_.u8, wr);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
        /* Integer promotion makes the sum at least int-wide, so no wrap. */
        r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1;
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_avg_epu8(a, b) simde_mm_avg_epu8(a, b)
#endif
1078
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_avg_epu16 (simde__m128i a, simde__m128i b) {
  /* _mm_avg_epu16: rounded average of packed unsigned 16-bit integers,
   * (a + b + 1) >> 1 per lane. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_avg_epu16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u16 = vrhaddq_u16(b_.neon_u16, a_.neon_u16);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_u16x8_avgr(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_u16 = vec_avg(a_.altivec_u16, b_.altivec_u16);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_)
      /* Widen to 32 bits so a + b + 1 cannot wrap, average, narrow back. */
      uint32_t wa SIMDE_VECTOR(32);
      uint32_t wb SIMDE_VECTOR(32);
      uint32_t wr SIMDE_VECTOR(32);
      SIMDE_CONVERT_VECTOR_(wa, a_.u16);
      SIMDE_CONVERT_VECTOR_(wb, b_.u16);
      wr = (wa + wb + 1) >> 1;
      SIMDE_CONVERT_VECTOR_(r_.u16, wr);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
        /* Integer promotion makes the sum at least int-wide, so no wrap. */
        r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1;
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_avg_epu16(a, b) simde_mm_avg_epu16(a, b)
#endif
1117
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_setzero_si128 (void) {
  /* _mm_setzero_si128: return a 128-bit integer vector of all zero bits. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_setzero_si128();
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i32 = vdupq_n_s32(0);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i32 = vec_splats(HEDLEY_STATIC_CAST(signed int, 0));
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i32x4_splat(INT32_C(0));
    #elif defined(SIMDE_VECTOR_SUBSCRIPT)
      r_.i32 = __extension__ (__typeof__(r_.i32)) { 0, 0, 0, 0 };
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
        r_.i32f[i] = 0;
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_setzero_si128() (simde_mm_setzero_si128())
#endif
1147
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_bslli_si128 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  /* _mm_bslli_si128 / _mm_slli_si128: shift the whole 128-bit vector left
   * by imm8 BYTES, filling with zeros.  Shifts of more than 15 bytes
   * yield zero, matching the hardware instruction. */
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  if (HEDLEY_UNLIKELY((imm8 & ~15))) {
    return simde_mm_setzero_si128();
  }

  #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && defined(SIMDE_ENDIAN_ORDER)
    /* vec_slo/vec_sro shift by octets taken from bits of the second
     * operand, hence imm8 * 8; the direction depends on endianness. */
    r_.altivec_i8 =
    #if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
      vec_slo
    #else /* SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG */
      vec_sro
    #endif
        (a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, imm8 * 8)));
  #elif defined(SIMDE_HAVE_INT128_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
    r_.u128[0] = a_.u128[0] << (imm8 * 8);
  #else
    /* Portable path: zero the result, then copy the surviving bytes up. */
    r_ = simde__m128i_to_private(simde_mm_setzero_si128());
    for (int i = imm8 ; i < HEDLEY_STATIC_CAST(int, sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
      r_.i8[i] = a_.i8[i - imm8];
    }
  #endif

  return simde__m128i_from_private(r_);
}
/* When imm8 is a literal, prefer a macro expansion so the compiler sees a
 * compile-time constant (required by the native intrinsic and vextq_s8). */
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
  #define simde_mm_bslli_si128(a, imm8) _mm_slli_si128(a, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__)
  #define simde_mm_bslli_si128(a, imm8) \
    simde__m128i_from_neon_i8(((imm8) <= 0) ? simde__m128i_to_neon_i8(a) : (((imm8) > 15) ? (vdupq_n_s8(0)) : (vextq_s8(vdupq_n_s8(0), simde__m128i_to_neon_i8(a), 16 - (imm8)))))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  /* Shuffle a zero vector together with `a`: indices 0-15 select zeros,
   * 16-31 select bytes of `a`, so (16 - imm8 + lane) & 31 produces the
   * byte-shifted result. */
  #define simde_mm_bslli_si128(a, imm8) (__extension__ ({ \
    const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
    const simde__m128i_private simde__tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
    simde__m128i_private simde__tmp_r_; \
    if (HEDLEY_UNLIKELY(imm8 > 15)) { \
      simde__tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
    } else { \
      simde__tmp_r_.i8 = \
        SIMDE_SHUFFLE_VECTOR_(8, 16, \
          simde__tmp_z_.i8, \
          (simde__tmp_a_).i8, \
          HEDLEY_STATIC_CAST(int8_t, (16 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (17 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (18 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (19 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (20 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (21 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (22 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (23 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (24 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (25 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (26 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (27 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (28 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (29 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (30 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (31 - imm8) & 31)); \
    } \
    simde__m128i_from_private(simde__tmp_r_); }))
#endif
#define simde_mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_bslli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
  #define _mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
#endif
1220
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_bsrli_si128 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  /* _mm_bsrli_si128 / _mm_srli_si128: shift the whole 128-bit vector right
   * by imm8 BYTES, filling with zeros.  Shifts of more than 15 bytes
   * yield zero, matching the hardware instruction. */
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  if (HEDLEY_UNLIKELY((imm8 & ~15))) {
    return simde_mm_setzero_si128();
  }

  #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && defined(SIMDE_ENDIAN_ORDER)
    /* Octet shift; direction depends on endianness (mirror of bslli). */
    r_.altivec_i8 =
    #if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
      vec_sro
    #else /* SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG */
      vec_slo
    #endif
        (a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, imm8 * 8)));
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
      /* Source byte index; past-the-end positions are zero-filled. */
      const int e = HEDLEY_STATIC_CAST(int, i) + imm8;
      r_.i8[i] = (e < 16) ? a_.i8[e] : 0;
    }
  #endif

  return simde__m128i_from_private(r_);
}
/* Macro forms so a literal imm8 remains a compile-time constant for the
 * native intrinsic / vextq_s8 / SIMDE_SHUFFLE_VECTOR_. */
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
  #define simde_mm_bsrli_si128(a, imm8) _mm_srli_si128(a, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__)
  #define simde_mm_bsrli_si128(a, imm8) \
    simde__m128i_from_neon_i8(((imm8 < 0) || (imm8 > 15)) ? vdupq_n_s8(0) : (vextq_s8(simde__m128i_to_private(a).neon_i8, vdupq_n_s8(0), ((imm8 & 15) != 0) ? imm8 : (imm8 & 15))))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  /* Indices 0-15 select zeros, 16-31 select bytes of `a`; (imm8 + 16 +
   * lane) & 31 yields the right-shifted result with zero fill. */
  #define simde_mm_bsrli_si128(a, imm8) (__extension__ ({ \
    const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
    const simde__m128i_private simde__tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
    simde__m128i_private simde__tmp_r_ = simde__m128i_to_private(a); \
    if (HEDLEY_UNLIKELY(imm8 > 15)) { \
      simde__tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
    } else { \
      simde__tmp_r_.i8 = \
        SIMDE_SHUFFLE_VECTOR_(8, 16, \
          simde__tmp_z_.i8, \
          (simde__tmp_a_).i8, \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 16) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 17) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 18) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 19) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 20) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 21) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 22) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 23) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 24) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 25) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 26) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 27) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 28) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 29) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 30) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 31) & 31)); \
    } \
    simde__m128i_from_private(simde__tmp_r_); }))
#endif
#define simde_mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_bsrli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
  #define _mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
#endif
1292
1293 SIMDE_FUNCTION_ATTRIBUTES
1294 void
simde_mm_clflush(void const * p)1295 simde_mm_clflush (void const* p) {
1296 #if defined(SIMDE_X86_SSE2_NATIVE)
1297 _mm_clflush(p);
1298 #else
1299 (void) p;
1300 #endif
1301 }
1302 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1303 #define _mm_clflush(a, b) simde_mm_clflush()
1304 #endif
1305
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_comieq_sd (simde__m128d a, simde__m128d b) {
  /* _mm_comieq_sd: compare the low double of a and b for equality,
   * returning 1/0.
   * NOTE(review): hardware comisd reports unordered (NaN) operands with
   * ZF=1, so native _mm_comieq_sd returns 1 for NaN inputs; the portable
   * fallbacks below return 0 — confirm this divergence is acceptable. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_comieq_sd(a, b);
  #else
    simde__m128d_private
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);
    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r/* lane 0 of the 64-bit comparison mask, collapsed to 0/1 */
      return !!vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      return wasm_f64x2_extract_lane(a_.wasm_v128, 0) == wasm_f64x2_extract_lane(b_.wasm_v128, 0);
    #else
      return a_.f64[0] == b_.f64[0];
    #endif
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_comieq_sd(a, b) simde_mm_comieq_sd(a, b)
#endif
1327
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_comige_sd (simde__m128d a, simde__m128d b) {
  /* _mm_comige_sd: return 1 if the low double of a is >= that of b,
   * else 0 (NaN handling may diverge from hardware comisd; see
   * simde_mm_comieq_sd). */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_comige_sd(a, b);
  #else
    simde__m128d_private
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);
    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      return !!vgetq_lane_u64(vcgeq_f64(a_.neon_f64, b_.neon_f64), 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      return wasm_f64x2_extract_lane(a_.wasm_v128, 0) >= wasm_f64x2_extract_lane(b_.wasm_v128, 0);
    #else
      return a_.f64[0] >= b_.f64[0];
    #endif
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_comige_sd(a, b) simde_mm_comige_sd(a, b)
#endif
1349
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_comigt_sd (simde__m128d a, simde__m128d b) {
  /* _mm_comigt_sd: return 1 if the low double of a is > that of b,
   * else 0. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_comigt_sd(a, b);
  #else
    simde__m128d_private
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);
    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      return !!vgetq_lane_u64(vcgtq_f64(a_.neon_f64, b_.neon_f64), 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      return wasm_f64x2_extract_lane(a_.wasm_v128, 0) > wasm_f64x2_extract_lane(b_.wasm_v128, 0);
    #else
      return a_.f64[0] > b_.f64[0];
    #endif
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_comigt_sd(a, b) simde_mm_comigt_sd(a, b)
#endif
1371
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_comile_sd (simde__m128d a, simde__m128d b) {
  /* _mm_comile_sd: return 1 if the low double of a is <= that of b,
   * else 0. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_comile_sd(a, b);
  #else
    simde__m128d_private
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);
    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      return !!vgetq_lane_u64(vcleq_f64(a_.neon_f64, b_.neon_f64), 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      return wasm_f64x2_extract_lane(a_.wasm_v128, 0) <= wasm_f64x2_extract_lane(b_.wasm_v128, 0);
    #else
      return a_.f64[0] <= b_.f64[0];
    #endif
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_comile_sd(a, b) simde_mm_comile_sd(a, b)
#endif
1393
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_comilt_sd (simde__m128d a, simde__m128d b) {
  /* _mm_comilt_sd: return 1 if the low double of a is < that of b,
   * else 0. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_comilt_sd(a, b);
  #else
    simde__m128d_private
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);
    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      return !!vgetq_lane_u64(vcltq_f64(a_.neon_f64, b_.neon_f64), 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      return wasm_f64x2_extract_lane(a_.wasm_v128, 0) < wasm_f64x2_extract_lane(b_.wasm_v128, 0);
    #else
      return a_.f64[0] < b_.f64[0];
    #endif
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_comilt_sd(a, b) simde_mm_comilt_sd(a, b)
#endif
1415
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_comineq_sd (simde__m128d a, simde__m128d b) {
  /* _mm_comineq_sd: return 1 if the low doubles of a and b are not
   * equal, else 0. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_comineq_sd(a, b);
  #else
    simde__m128d_private
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);
    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* single `!`: invert the equality mask to get "not equal" */
      return !vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      return wasm_f64x2_extract_lane(a_.wasm_v128, 0) != wasm_f64x2_extract_lane(b_.wasm_v128, 0);
    #else
      return a_.f64[0] != b_.f64[0];
    #endif
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_comineq_sd(a, b) simde_mm_comineq_sd(a, b)
#endif
1437
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_x_mm_copysign_pd(simde__m128d dest, simde__m128d src) {
  /* SIMDe-internal helper (no SSE2 equivalent): per-lane copysign —
   * the magnitude of each lane of dest combined with the sign of the
   * corresponding lane of src. */
  simde__m128d_private
    r_,
    dest_ = simde__m128d_to_private(dest),
    src_ = simde__m128d_to_private(src);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* -0.0 has only the sign bit set, so its bit pattern is the mask. */
      uint64x2_t sign_pos = vreinterpretq_u64_f64(vdupq_n_f64(-SIMDE_FLOAT64_C(0.0)));
    #else
      /* No f64 vectors on ARMv7: build the mask via memcpy type-punning
       * (avoids strict-aliasing UB). */
      simde_float64 dbl_nz = -SIMDE_FLOAT64_C(0.0);
      uint64_t u64_nz;
      simde_memcpy(&u64_nz, &dbl_nz, sizeof(u64_nz));
      uint64x2_t sign_pos = vdupq_n_u64(u64_nz);
    #endif
    /* Bit-select: sign bit from src, remaining bits from dest. */
    r_.neon_u64 = vbslq_u64(sign_pos, src_.neon_u64, dest_.neon_u64);
  #elif defined(SIMDE_POWER_ALTIVEC_P9_NATIVE)
    #if !defined(HEDLEY_IBM_VERSION)
      /* GCC/Clang take vec_cpsgn arguments in the opposite order from
       * the IBM XL compiler. */
      r_.altivec_f64 = vec_cpsgn(dest_.altivec_f64, src_.altivec_f64);
    #else
      r_.altivec_f64 = vec_cpsgn(src_.altivec_f64, dest_.altivec_f64);
    #endif
  #elif defined(simde_math_copysign)
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
      r_.f64[i] = simde_math_copysign(dest_.f64[i], src_.f64[i]);
    }
  #else
    /* Pure bit arithmetic: (sign of src) | (magnitude of dest). */
    simde__m128d sgnbit = simde_mm_set1_pd(-SIMDE_FLOAT64_C(0.0));
    return simde_mm_xor_pd(simde_mm_and_pd(sgnbit, src), simde_mm_andnot_pd(sgnbit, dest));
  #endif

  return simde__m128d_from_private(r_);
}
1474
1475 SIMDE_FUNCTION_ATTRIBUTES
1476 simde__m128d
simde_x_mm_xorsign_pd(simde__m128d dest,simde__m128d src)1477 simde_x_mm_xorsign_pd(simde__m128d dest, simde__m128d src) {
1478 return simde_mm_xor_pd(simde_mm_and_pd(simde_mm_set1_pd(-0.0), src), dest);
1479 }
1480
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_castpd_ps (simde__m128d a) {
  /* _mm_castpd_ps: reinterpret the bits of a double vector as a float
   * vector; no conversion, no instructions generated on x86. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_castpd_ps(a);
  #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    return vreinterpretq_f32_f64(a);
  #else
    /* memcpy type-punning avoids strict-aliasing UB. */
    simde__m128 r;
    simde_memcpy(&r, &a, sizeof(a));
    return r;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_castpd_ps(a) simde_mm_castpd_ps(a)
#endif
1497
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_castpd_si128 (simde__m128d a) {
  /* _mm_castpd_si128: reinterpret the bits of a double vector as a
   * 128-bit integer vector; no conversion performed. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_castpd_si128(a);
  #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    return vreinterpretq_s64_f64(a);
  #else
    /* memcpy type-punning avoids strict-aliasing UB. */
    simde__m128i r;
    simde_memcpy(&r, &a, sizeof(a));
    return r;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_castpd_si128(a) simde_mm_castpd_si128(a)
#endif
1514
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_castps_pd (simde__m128 a) {
  /* _mm_castps_pd: reinterpret the bits of a float vector as a double
   * vector; no conversion performed. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_castps_pd(a);
  #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    return vreinterpretq_f64_f32(a);
  #else
    /* memcpy type-punning avoids strict-aliasing UB. */
    simde__m128d r;
    simde_memcpy(&r, &a, sizeof(a));
    return r;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_castps_pd(a) simde_mm_castps_pd(a)
#endif
1531
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_castps_si128 (simde__m128 a) {
  /* _mm_castps_si128: reinterpret the bits of a float vector as a
   * 128-bit integer vector; no conversion performed. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_castps_si128(a);
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return simde__m128i_from_neon_i32(simde__m128_to_private(a).neon_i32);
  #else
    /* memcpy type-punning avoids strict-aliasing UB. */
    simde__m128i r;
    simde_memcpy(&r, &a, sizeof(a));
    return r;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_castps_si128(a) simde_mm_castps_si128(a)
#endif
1548
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_castsi128_pd (simde__m128i a) {
  /* _mm_castsi128_pd: reinterpret the bits of a 128-bit integer vector
   * as a double vector; no conversion performed. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_castsi128_pd(a);
  #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    return vreinterpretq_f64_s64(a);
  #else
    /* memcpy type-punning avoids strict-aliasing UB. */
    simde__m128d r;
    simde_memcpy(&r, &a, sizeof(a));
    return r;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_castsi128_pd(a) simde_mm_castsi128_pd(a)
#endif
1565
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_castsi128_ps (simde__m128i a) {
  /* _mm_castsi128_ps: reinterpret the bits of a 128-bit integer vector
   * as a float vector; no conversion performed. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_castsi128_ps(a);
  #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
    return HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), a);
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return simde__m128_from_neon_i32(simde__m128i_to_private(a).neon_i32);
  #else
    /* memcpy type-punning avoids strict-aliasing UB. */
    simde__m128 r;
    simde_memcpy(&r, &a, sizeof(a));
    return r;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_castsi128_ps(a) simde_mm_castsi128_ps(a)
#endif
1584
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmpeq_epi8 (simde__m128i a, simde__m128i b) {
  /* _mm_cmpeq_epi8: compare packed 8-bit integers for equality; each
   * lane becomes all-ones (0xFF) when equal, zero otherwise. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpeq_epi8(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u8 = vceqq_s8(b_.neon_i8, a_.neon_i8);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i8x16_eq(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmpeq(a_.altivec_i8, b_.altivec_i8));
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      /* Vector == yields an all-ones/all-zeros mask; cast pins its type. */
      r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 == b_.i8));
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
        r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmpeq_epi8(a, b) simde_mm_cmpeq_epi8(a, b)
#endif
1617
1618 SIMDE_FUNCTION_ATTRIBUTES
1619 simde__m128i
simde_mm_cmpeq_epi16(simde__m128i a,simde__m128i b)1620 simde_mm_cmpeq_epi16 (simde__m128i a, simde__m128i b) {
1621 #if defined(SIMDE_X86_SSE2_NATIVE)
1622 return _mm_cmpeq_epi16(a, b);
1623 #else
1624 simde__m128i_private
1625 r_,
1626 a_ = simde__m128i_to_private(a),
1627 b_ = simde__m128i_to_private(b);
1628
1629 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1630 r_.neon_u16 = vceqq_s16(b_.neon_i16, a_.neon_i16);
1631 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1632 r_.wasm_v128 = wasm_i16x8_eq(a_.wasm_v128, b_.wasm_v128);
1633 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1634 r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmpeq(a_.altivec_i16, b_.altivec_i16));
1635 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1636 r_.i16 = (a_.i16 == b_.i16);
1637 #else
1638 SIMDE_VECTORIZE
1639 for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1640 r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
1641 }
1642 #endif
1643
1644 return simde__m128i_from_private(r_);
1645 #endif
1646 }
1647 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1648 #define _mm_cmpeq_epi16(a, b) simde_mm_cmpeq_epi16(a, b)
1649 #endif
1650
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmpeq_epi32 (simde__m128i a, simde__m128i b) {
  /* _mm_cmpeq_epi32: compare packed 32-bit integers for equality; each
   * lane becomes all-ones when equal, zero otherwise. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpeq_epi32(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u32 = vceqq_s32(b_.neon_i32, a_.neon_i32);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i32x4_eq(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmpeq(a_.altivec_i32, b_.altivec_i32));
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      /* Vector == yields an all-ones/all-zeros mask; cast pins its type. */
      r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), a_.i32 == b_.i32);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmpeq_epi32(a, b) simde_mm_cmpeq_epi32(a, b)
#endif
1683
1684 SIMDE_FUNCTION_ATTRIBUTES
1685 simde__m128d
simde_mm_cmpeq_pd(simde__m128d a,simde__m128d b)1686 simde_mm_cmpeq_pd (simde__m128d a, simde__m128d b) {
1687 #if defined(SIMDE_X86_SSE2_NATIVE)
1688 return _mm_cmpeq_pd(a, b);
1689 #else
1690 simde__m128d_private
1691 r_,
1692 a_ = simde__m128d_to_private(a),
1693 b_ = simde__m128d_to_private(b);
1694
1695 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1696 r_.neon_u64 = vceqq_s64(b_.neon_i64, a_.neon_i64);
1697 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1698 r_.wasm_v128 = wasm_f64x2_eq(a_.wasm_v128, b_.wasm_v128);
1699 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
1700 r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpeq(a_.altivec_f64, b_.altivec_f64));
1701 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1702 r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 == b_.f64));
1703 #else
1704 SIMDE_VECTORIZE
1705 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1706 r_.u64[i] = (a_.f64[i] == b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1707 }
1708 #endif
1709
1710 return simde__m128d_from_private(r_);
1711 #endif
1712 }
1713 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1714 #define _mm_cmpeq_pd(a, b) simde_mm_cmpeq_pd(a, b)
1715 #endif
1716
1717 SIMDE_FUNCTION_ATTRIBUTES
1718 simde__m128d
simde_mm_cmpeq_sd(simde__m128d a,simde__m128d b)1719 simde_mm_cmpeq_sd (simde__m128d a, simde__m128d b) {
1720 #if defined(SIMDE_X86_SSE2_NATIVE)
1721 return _mm_cmpeq_sd(a, b);
1722 #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1723 return simde_mm_move_sd(a, simde_mm_cmpeq_pd(a, b));
1724 #else
1725 simde__m128d_private
1726 r_,
1727 a_ = simde__m128d_to_private(a),
1728 b_ = simde__m128d_to_private(b);
1729
1730 r_.u64[0] = (a_.u64[0] == b_.u64[0]) ? ~UINT64_C(0) : 0;
1731 r_.u64[1] = a_.u64[1];
1732
1733 return simde__m128d_from_private(r_);
1734 #endif
1735 }
1736 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1737 #define _mm_cmpeq_sd(a, b) simde_mm_cmpeq_sd(a, b)
1738 #endif
1739
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpneq_pd (simde__m128d a, simde__m128d b) {
  /* Lane-wise IEEE-754 inequality on doubles: all-ones when a[i] != b[i]
   * (_mm_cmpneq_pd).  This is the unordered NEQ predicate: any NaN lane
   * compares not-equal and produces all-ones. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpneq_pd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* NEON has no float64 'not equal'; negate the EQ mask instead. */
      r_.neon_u32 = vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(b_.neon_f64, a_.neon_f64)));
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_f64x2_ne(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 != b_.f64));
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
        r_.u64[i] = (a_.f64[i] != b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
1767 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1768 #define _mm_cmpneq_pd(a, b) simde_mm_cmpneq_pd(a, b)
1769 #endif
1770
1771 SIMDE_FUNCTION_ATTRIBUTES
1772 simde__m128d
simde_mm_cmpneq_sd(simde__m128d a,simde__m128d b)1773 simde_mm_cmpneq_sd (simde__m128d a, simde__m128d b) {
1774 #if defined(SIMDE_X86_SSE2_NATIVE)
1775 return _mm_cmpneq_sd(a, b);
1776 #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1777 return simde_mm_move_sd(a, simde_mm_cmpneq_pd(a, b));
1778 #else
1779 simde__m128d_private
1780 r_,
1781 a_ = simde__m128d_to_private(a),
1782 b_ = simde__m128d_to_private(b);
1783
1784 r_.u64[0] = (a_.f64[0] != b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1785 r_.u64[1] = a_.u64[1];
1786
1787
1788 return simde__m128d_from_private(r_);
1789 #endif
1790 }
1791 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1792 #define _mm_cmpneq_sd(a, b) simde_mm_cmpneq_sd(a, b)
1793 #endif
1794
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmplt_epi8 (simde__m128i a, simde__m128i b) {
  /* Lane-wise signed '<' on 8-bit integers: all-ones when a[i] < b[i]
   * (_mm_cmplt_epi8). */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmplt_epi8(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u8 = vcltq_s8(a_.neon_i8, b_.neon_i8);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      /* vec_cmplt yields a bool vector; reinterpret to signed char lanes. */
      r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmplt(a_.altivec_i8, b_.altivec_i8));
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i8x16_lt(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 < b_.i8));
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
        r_.i8[i] = (a_.i8[i] < b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
1824 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1825 #define _mm_cmplt_epi8(a, b) simde_mm_cmplt_epi8(a, b)
1826 #endif
1827
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmplt_epi16 (simde__m128i a, simde__m128i b) {
  /* Lane-wise signed '<' on 16-bit integers: all-ones when a[i] < b[i]
   * (_mm_cmplt_epi16). */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmplt_epi16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u16 = vcltq_s16(a_.neon_i16, b_.neon_i16);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmplt(a_.altivec_i16, b_.altivec_i16));
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i16x8_lt(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i16 = HEDLEY_STATIC_CAST(__typeof__(r_.i16), (a_.i16 < b_.i16));
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
1857 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1858 #define _mm_cmplt_epi16(a, b) simde_mm_cmplt_epi16(a, b)
1859 #endif
1860
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmplt_epi32 (simde__m128i a, simde__m128i b) {
  /* Lane-wise signed '<' on 32-bit integers: all-ones when a[i] < b[i]
   * (_mm_cmplt_epi32). */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmplt_epi32(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u32 = vcltq_s32(a_.neon_i32, b_.neon_i32);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmplt(a_.altivec_i32, b_.altivec_i32));
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i32x4_lt(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.i32 < b_.i32));
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        r_.i32[i] = (a_.i32[i] < b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
1890 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1891 #define _mm_cmplt_epi32(a, b) simde_mm_cmplt_epi32(a, b)
1892 #endif
1893
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmplt_pd (simde__m128d a, simde__m128d b) {
  /* Lane-wise '<' on doubles: all-ones when a[i] < b[i] (_mm_cmplt_pd).
   * Ordered predicate: a NaN in either lane yields all-zeros. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmplt_pd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 < b_.f64));
    #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_u64 = vcltq_f64(a_.neon_f64, b_.neon_f64);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_f64x2_lt(a_.wasm_v128, b_.wasm_v128);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
        r_.u64[i] = (a_.f64[i] < b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
1921 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1922 #define _mm_cmplt_pd(a, b) simde_mm_cmplt_pd(a, b)
1923 #endif
1924
1925 SIMDE_FUNCTION_ATTRIBUTES
1926 simde__m128d
simde_mm_cmplt_sd(simde__m128d a,simde__m128d b)1927 simde_mm_cmplt_sd (simde__m128d a, simde__m128d b) {
1928 #if defined(SIMDE_X86_SSE2_NATIVE)
1929 return _mm_cmplt_sd(a, b);
1930 #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1931 return simde_mm_move_sd(a, simde_mm_cmplt_pd(a, b));
1932 #else
1933 simde__m128d_private
1934 r_,
1935 a_ = simde__m128d_to_private(a),
1936 b_ = simde__m128d_to_private(b);
1937
1938 r_.u64[0] = (a_.f64[0] < b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1939 r_.u64[1] = a_.u64[1];
1940
1941 return simde__m128d_from_private(r_);
1942 #endif
1943 }
1944 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1945 #define _mm_cmplt_sd(a, b) simde_mm_cmplt_sd(a, b)
1946 #endif
1947
1948 SIMDE_FUNCTION_ATTRIBUTES
1949 simde__m128d
simde_mm_cmple_pd(simde__m128d a,simde__m128d b)1950 simde_mm_cmple_pd (simde__m128d a, simde__m128d b) {
1951 #if defined(SIMDE_X86_SSE2_NATIVE)
1952 return _mm_cmple_pd(a, b);
1953 #else
1954 simde__m128d_private
1955 r_,
1956 a_ = simde__m128d_to_private(a),
1957 b_ = simde__m128d_to_private(b);
1958
1959 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1960 r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 <= b_.f64));
1961 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1962 r_.neon_u64 = vcleq_f64(a_.neon_f64, b_.neon_f64);
1963 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1964 r_.wasm_v128 = wasm_f64x2_le(a_.wasm_v128, b_.wasm_v128);
1965 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1966 r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmple(a_.altivec_f64, b_.altivec_f64));
1967 #else
1968 SIMDE_VECTORIZE
1969 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1970 r_.u64[i] = (a_.f64[i] <= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1971 }
1972 #endif
1973
1974 return simde__m128d_from_private(r_);
1975 #endif
1976 }
1977 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1978 #define _mm_cmple_pd(a, b) simde_mm_cmple_pd(a, b)
1979 #endif
1980
1981 SIMDE_FUNCTION_ATTRIBUTES
1982 simde__m128d
simde_mm_cmple_sd(simde__m128d a,simde__m128d b)1983 simde_mm_cmple_sd (simde__m128d a, simde__m128d b) {
1984 #if defined(SIMDE_X86_SSE2_NATIVE)
1985 return _mm_cmple_sd(a, b);
1986 #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1987 return simde_mm_move_sd(a, simde_mm_cmple_pd(a, b));
1988 #else
1989 simde__m128d_private
1990 r_,
1991 a_ = simde__m128d_to_private(a),
1992 b_ = simde__m128d_to_private(b);
1993
1994 r_.u64[0] = (a_.f64[0] <= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1995 r_.u64[1] = a_.u64[1];
1996
1997 return simde__m128d_from_private(r_);
1998 #endif
1999 }
2000 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2001 #define _mm_cmple_sd(a, b) simde_mm_cmple_sd(a, b)
2002 #endif
2003
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmpgt_epi8 (simde__m128i a, simde__m128i b) {
  /* Lane-wise signed '>' on 8-bit integers: all-ones when a[i] > b[i]
   * (_mm_cmpgt_epi8). */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpgt_epi8(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u8 = vcgtq_s8(a_.neon_i8, b_.neon_i8);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i8x16_gt(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmpgt(a_.altivec_i8, b_.altivec_i8));
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 > b_.i8));
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
        r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
2033 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2034 #define _mm_cmpgt_epi8(a, b) simde_mm_cmpgt_epi8(a, b)
2035 #endif
2036
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmpgt_epi16 (simde__m128i a, simde__m128i b) {
  /* Lane-wise signed '>' on 16-bit integers: all-ones when a[i] > b[i]
   * (_mm_cmpgt_epi16). */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpgt_epi16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u16 = vcgtq_s16(a_.neon_i16, b_.neon_i16);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i16x8_gt(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmpgt(a_.altivec_i16, b_.altivec_i16));
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i16 = HEDLEY_STATIC_CAST(__typeof__(r_.i16), (a_.i16 > b_.i16));
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
2066 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2067 #define _mm_cmpgt_epi16(a, b) simde_mm_cmpgt_epi16(a, b)
2068 #endif
2069
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmpgt_epi32 (simde__m128i a, simde__m128i b) {
  /* Lane-wise signed '>' on 32-bit integers: all-ones when a[i] > b[i]
   * (_mm_cmpgt_epi32). */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpgt_epi32(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u32 = vcgtq_s32(a_.neon_i32, b_.neon_i32);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i32x4_gt(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmpgt(a_.altivec_i32, b_.altivec_i32));
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.i32 > b_.i32));
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
2099 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2100 #define _mm_cmpgt_epi32(a, b) simde_mm_cmpgt_epi32(a, b)
2101 #endif
2102
2103 SIMDE_FUNCTION_ATTRIBUTES
2104 simde__m128d
simde_mm_cmpgt_pd(simde__m128d a,simde__m128d b)2105 simde_mm_cmpgt_pd (simde__m128d a, simde__m128d b) {
2106 #if defined(SIMDE_X86_SSE2_NATIVE)
2107 return _mm_cmpgt_pd(a, b);
2108 #else
2109 simde__m128d_private
2110 r_,
2111 a_ = simde__m128d_to_private(a),
2112 b_ = simde__m128d_to_private(b);
2113
2114 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2115 r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 > b_.f64));
2116 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2117 r_.neon_u64 = vcgtq_f64(a_.neon_f64, b_.neon_f64);
2118 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2119 r_.wasm_v128 = wasm_f64x2_gt(a_.wasm_v128, b_.wasm_v128);
2120 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2121 r_.altivec_f64 = HEDLEY_STATIC_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpgt(a_.altivec_f64, b_.altivec_f64));
2122 #else
2123 SIMDE_VECTORIZE
2124 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2125 r_.u64[i] = (a_.f64[i] > b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
2126 }
2127 #endif
2128
2129 return simde__m128d_from_private(r_);
2130 #endif
2131 }
2132 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2133 #define _mm_cmpgt_pd(a, b) simde_mm_cmpgt_pd(a, b)
2134 #endif
2135
2136 SIMDE_FUNCTION_ATTRIBUTES
2137 simde__m128d
simde_mm_cmpgt_sd(simde__m128d a,simde__m128d b)2138 simde_mm_cmpgt_sd (simde__m128d a, simde__m128d b) {
2139 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
2140 return _mm_cmpgt_sd(a, b);
2141 #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
2142 return simde_mm_move_sd(a, simde_mm_cmpgt_pd(a, b));
2143 #else
2144 simde__m128d_private
2145 r_,
2146 a_ = simde__m128d_to_private(a),
2147 b_ = simde__m128d_to_private(b);
2148
2149 r_.u64[0] = (a_.f64[0] > b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
2150 r_.u64[1] = a_.u64[1];
2151
2152 return simde__m128d_from_private(r_);
2153 #endif
2154 }
2155 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2156 #define _mm_cmpgt_sd(a, b) simde_mm_cmpgt_sd(a, b)
2157 #endif
2158
2159 SIMDE_FUNCTION_ATTRIBUTES
2160 simde__m128d
simde_mm_cmpge_pd(simde__m128d a,simde__m128d b)2161 simde_mm_cmpge_pd (simde__m128d a, simde__m128d b) {
2162 #if defined(SIMDE_X86_SSE2_NATIVE)
2163 return _mm_cmpge_pd(a, b);
2164 #else
2165 simde__m128d_private
2166 r_,
2167 a_ = simde__m128d_to_private(a),
2168 b_ = simde__m128d_to_private(b);
2169
2170 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2171 r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 >= b_.f64));
2172 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2173 r_.neon_u64 = vcgeq_f64(a_.neon_f64, b_.neon_f64);
2174 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2175 r_.wasm_v128 = wasm_f64x2_ge(a_.wasm_v128, b_.wasm_v128);
2176 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2177 r_.altivec_f64 = HEDLEY_STATIC_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpge(a_.altivec_f64, b_.altivec_f64));
2178 #else
2179 SIMDE_VECTORIZE
2180 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2181 r_.u64[i] = (a_.f64[i] >= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
2182 }
2183 #endif
2184
2185 return simde__m128d_from_private(r_);
2186 #endif
2187 }
2188 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2189 #define _mm_cmpge_pd(a, b) simde_mm_cmpge_pd(a, b)
2190 #endif
2191
2192 SIMDE_FUNCTION_ATTRIBUTES
2193 simde__m128d
simde_mm_cmpge_sd(simde__m128d a,simde__m128d b)2194 simde_mm_cmpge_sd (simde__m128d a, simde__m128d b) {
2195 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
2196 return _mm_cmpge_sd(a, b);
2197 #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
2198 return simde_mm_move_sd(a, simde_mm_cmpge_pd(a, b));
2199 #else
2200 simde__m128d_private
2201 r_,
2202 a_ = simde__m128d_to_private(a),
2203 b_ = simde__m128d_to_private(b);
2204
2205 r_.u64[0] = (a_.f64[0] >= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
2206 r_.u64[1] = a_.u64[1];
2207
2208 return simde__m128d_from_private(r_);
2209 #endif
2210 }
2211 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2212 #define _mm_cmpge_sd(a, b) simde_mm_cmpge_sd(a, b)
2213 #endif
2214
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpngt_pd (simde__m128d a, simde__m128d b) {
  /* "Not greater than" (_mm_cmpngt_pd), implemented via LE on the
   * fallback path.  NOTE(review): NGT and LE differ when a lane is NaN
   * (NGT is true, ordered LE is false); this follows the file's existing
   * approximation — confirm if exact NaN semantics are required. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpngt_pd(a, b);
  #else
    return simde_mm_cmple_pd(a, b);
  #endif
}
2224 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2225 #define _mm_cmpngt_pd(a, b) simde_mm_cmpngt_pd(a, b)
2226 #endif
2227
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpngt_sd (simde__m128d a, simde__m128d b) {
  /* Scalar "not greater than" (_mm_cmpngt_sd) via LE on the fallback
   * path; see the NaN caveat on the _pd variant. */
  #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
    return _mm_cmpngt_sd(a, b);
  #else
    return simde_mm_cmple_sd(a, b);
  #endif
}
2237 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2238 #define _mm_cmpngt_sd(a, b) simde_mm_cmpngt_sd(a, b)
2239 #endif
2240
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpnge_pd (simde__m128d a, simde__m128d b) {
  /* "Not greater than or equal" (_mm_cmpnge_pd), implemented via LT on
   * the fallback path.  NOTE(review): NGE and ordered LT differ when a
   * lane is NaN (NGE is true, LT is false); follows the file's existing
   * approximation. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpnge_pd(a, b);
  #else
    return simde_mm_cmplt_pd(a, b);
  #endif
}
2250 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2251 #define _mm_cmpnge_pd(a, b) simde_mm_cmpnge_pd(a, b)
2252 #endif
2253
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpnge_sd (simde__m128d a, simde__m128d b) {
  /* Scalar "not greater than or equal" (_mm_cmpnge_sd) via LT on the
   * fallback path; see the NaN caveat on the _pd variant. */
  #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
    return _mm_cmpnge_sd(a, b);
  #else
    return simde_mm_cmplt_sd(a, b);
  #endif
}
2263 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2264 #define _mm_cmpnge_sd(a, b) simde_mm_cmpnge_sd(a, b)
2265 #endif
2266
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpnlt_pd (simde__m128d a, simde__m128d b) {
  /* "Not less than" (_mm_cmpnlt_pd), implemented via GE on the fallback
   * path.  NOTE(review): NLT and ordered GE differ when a lane is NaN
   * (NLT is true, GE is false); follows the file's existing
   * approximation. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpnlt_pd(a, b);
  #else
    return simde_mm_cmpge_pd(a, b);
  #endif
}
2276 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2277 #define _mm_cmpnlt_pd(a, b) simde_mm_cmpnlt_pd(a, b)
2278 #endif
2279
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpnlt_sd (simde__m128d a, simde__m128d b) {
  /* Scalar "not less than" (_mm_cmpnlt_sd) via GE on the fallback path;
   * see the NaN caveat on the _pd variant. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpnlt_sd(a, b);
  #else
    return simde_mm_cmpge_sd(a, b);
  #endif
}
2289 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2290 #define _mm_cmpnlt_sd(a, b) simde_mm_cmpnlt_sd(a, b)
2291 #endif
2292
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpnle_pd (simde__m128d a, simde__m128d b) {
  /* "Not less than or equal" (_mm_cmpnle_pd), implemented via GT on the
   * fallback path.  NOTE(review): NLE and ordered GT differ when a lane
   * is NaN (NLE is true, GT is false); follows the file's existing
   * approximation. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpnle_pd(a, b);
  #else
    return simde_mm_cmpgt_pd(a, b);
  #endif
}
2302 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2303 #define _mm_cmpnle_pd(a, b) simde_mm_cmpnle_pd(a, b)
2304 #endif
2305
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpnle_sd (simde__m128d a, simde__m128d b) {
  /* Scalar "not less than or equal" (_mm_cmpnle_sd) via GT on the
   * fallback path; see the NaN caveat on the _pd variant. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpnle_sd(a, b);
  #else
    return simde_mm_cmpgt_sd(a, b);
  #endif
}
2315 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2316 #define _mm_cmpnle_sd(a, b) simde_mm_cmpnle_sd(a, b)
2317 #endif
2318
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpord_pd (simde__m128d a, simde__m128d b) {
  /* Ordered compare (_mm_cmpord_pd): a lane is all-ones when neither
   * a[i] nor b[i] is NaN, all-zeros otherwise. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpord_pd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* Note: NEON does not have ordered compare builtin
         Need to compare a eq a and b eq b to check for NaN
         Do AND of results to get final */
      uint64x2_t ceqaa = vceqq_f64(a_.neon_f64, a_.neon_f64);
      uint64x2_t ceqbb = vceqq_f64(b_.neon_f64, b_.neon_f64);
      r_.neon_u64 = vandq_u64(ceqaa, ceqbb);
    #elif defined(simde_math_isnan)
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
        r_.u64[i] = (!simde_math_isnan(a_.f64[i]) && !simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0);
      }
    #else
      /* No NaN test available on this target. */
      HEDLEY_UNREACHABLE();
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
2349 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2350 #define _mm_cmpord_pd(a, b) simde_mm_cmpord_pd(a, b)
2351 #endif
2352
SIMDE_FUNCTION_ATTRIBUTES
simde_float64
simde_mm_cvtsd_f64 (simde__m128d a) {
  /* Extract the low double from a (_mm_cvtsd_f64). */
  #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
    return _mm_cvtsd_f64(a);
  #else
    simde__m128d_private a_ = simde__m128d_to_private(a);
    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      return HEDLEY_STATIC_CAST(simde_float64, vgetq_lane_f64(a_.neon_f64, 0));
    #else
      return a_.f64[0];
    #endif
  #endif
}
2367 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2368 #define _mm_cvtsd_f64(a) simde_mm_cvtsd_f64(a)
2369 #endif
2370
2371 SIMDE_FUNCTION_ATTRIBUTES
2372 simde__m128d
simde_mm_cmpord_sd(simde__m128d a,simde__m128d b)2373 simde_mm_cmpord_sd (simde__m128d a, simde__m128d b) {
2374 #if defined(SIMDE_X86_SSE2_NATIVE)
2375 return _mm_cmpord_sd(a, b);
2376 #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
2377 return simde_mm_move_sd(a, simde_mm_cmpord_pd(a, b));
2378 #else
2379 simde__m128d_private
2380 r_,
2381 a_ = simde__m128d_to_private(a),
2382 b_ = simde__m128d_to_private(b);
2383
2384 #if defined(simde_math_isnan)
2385 r_.u64[0] = (!simde_math_isnan(a_.f64[0]) && !simde_math_isnan(b_.f64[0])) ? ~UINT64_C(0) : UINT64_C(0);
2386 r_.u64[1] = a_.u64[1];
2387 #else
2388 HEDLEY_UNREACHABLE();
2389 #endif
2390
2391 return simde__m128d_from_private(r_);
2392 #endif
2393 }
2394 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2395 #define _mm_cmpord_sd(a, b) simde_mm_cmpord_sd(a, b)
2396 #endif
2397
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpunord_pd (simde__m128d a, simde__m128d b) {
  /* Unordered compare (_mm_cmpunord_pd): a lane is all-ones when a[i] or
   * b[i] is NaN, all-zeros otherwise — the complement of cmpord_pd. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cmpunord_pd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* NaN != NaN, so (a == a) AND (b == b) marks the ordered lanes;
       * invert that mask to get the unordered ones. */
      uint64x2_t ceqaa = vceqq_f64(a_.neon_f64, a_.neon_f64);
      uint64x2_t ceqbb = vceqq_f64(b_.neon_f64, b_.neon_f64);
      r_.neon_u64 = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(ceqaa, ceqbb))));
    #elif defined(simde_math_isnan)
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
        r_.u64[i] = (simde_math_isnan(a_.f64[i]) || simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0);
      }
    #else
      /* No NaN test available on this target. */
      HEDLEY_UNREACHABLE();
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
2425 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2426 #define _mm_cmpunord_pd(a, b) simde_mm_cmpunord_pd(a, b)
2427 #endif
2428
2429 SIMDE_FUNCTION_ATTRIBUTES
2430 simde__m128d
simde_mm_cmpunord_sd(simde__m128d a,simde__m128d b)2431 simde_mm_cmpunord_sd (simde__m128d a, simde__m128d b) {
2432 #if defined(SIMDE_X86_SSE2_NATIVE)
2433 return _mm_cmpunord_sd(a, b);
2434 #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
2435 return simde_mm_move_sd(a, simde_mm_cmpunord_pd(a, b));
2436 #else
2437 simde__m128d_private
2438 r_,
2439 a_ = simde__m128d_to_private(a),
2440 b_ = simde__m128d_to_private(b);
2441
2442 #if defined(simde_math_isnan)
2443 r_.u64[0] = (simde_math_isnan(a_.f64[0]) || simde_math_isnan(b_.f64[0])) ? ~UINT64_C(0) : UINT64_C(0);
2444 r_.u64[1] = a_.u64[1];
2445 #else
2446 HEDLEY_UNREACHABLE();
2447 #endif
2448
2449 return simde__m128d_from_private(r_);
2450 #endif
2451 }
2452 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2453 #define _mm_cmpunord_sd(a, b) simde_mm_cmpunord_sd(a, b)
2454 #endif
2455
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cvtepi32_pd (simde__m128i a) {
  /* Convert the two low 32-bit integers of a to doubles
   * (_mm_cvtepi32_pd).  Exact: every int32 is representable in f64. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cvtepi32_pd(a);
  #else
    simde__m128d_private r_;
    simde__m128i_private a_ = simde__m128i_to_private(a);

    #if defined(SIMDE_CONVERT_VECTOR_)
      /* Only the low 64-bit half (two i32 lanes) is converted. */
      SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].i32);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
        r_.f64[i] = (simde_float64) a_.i32[i];
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
2477 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2478 #define _mm_cvtepi32_pd(a) simde_mm_cvtepi32_pd(a)
2479 #endif
2480
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_cvtepi32_ps (simde__m128i a) {
  /* Convert four 32-bit integers to floats (_mm_cvtepi32_ps).  Values
   * beyond 2^24 may round, as with the native instruction. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cvtepi32_ps(a);
  #else
    simde__m128_private r_;
    simde__m128i_private a_ = simde__m128i_to_private(a);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_f32 = vcvtq_f32_s32(a_.neon_i32);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_f32x4_convert_i32x4(a_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      HEDLEY_DIAGNOSTIC_PUSH
      #if HEDLEY_HAS_WARNING("-Wc11-extensions")
        #pragma clang diagnostic ignored "-Wc11-extensions"
      #endif
      /* vec_ctf with a scale of 0 is a plain int->float conversion. */
      r_.altivec_f32 = vec_ctf(a_.altivec_i32, 0);
      HEDLEY_DIAGNOSTIC_POP
    #elif defined(SIMDE_CONVERT_VECTOR_)
      SIMDE_CONVERT_VECTOR_(r_.f32, a_.i32);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
        r_.f32[i] = (simde_float32) a_.i32[i];
      }
    #endif

    return simde__m128_from_private(r_);
  #endif
}
2513 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2514 #define _mm_cvtepi32_ps(a) simde_mm_cvtepi32_ps(a)
2515 #endif
2516
SIMDE_FUNCTION_ATTRIBUTES
simde__m64
simde_mm_cvtpd_pi32 (simde__m128d a) {
  /* Convert two doubles to two 32-bit integers with rounding
   * (_mm_cvtpd_pi32).  NOTE(review): simde_math_round rounds halves away
   * from zero, while the native instruction uses the current MXCSR
   * rounding mode (round-to-nearest-even by default) — results can
   * differ on exact .5 values; confirm whether that matters to callers. */
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    return _mm_cvtpd_pi32(a);
  #else
    simde__m64_private r_;
    simde__m128d_private a_ = simde__m128d_to_private(a);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      simde_float64 v = simde_math_round(a_.f64[i]);
      #if defined(SIMDE_FAST_CONVERSION_RANGE)
        r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
      #else
        /* Out-of-range (and NaN) inputs produce INT32_MIN, matching the
         * x86 "integer indefinite" result. */
        r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ?
          SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
      #endif
    }

    return simde__m64_from_private(r_);
  #endif
}
2540 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2541 #define _mm_cvtpd_pi32(a) simde_mm_cvtpd_pi32(a)
2542 #endif
2543
2544 SIMDE_FUNCTION_ATTRIBUTES
2545 simde__m128i
simde_mm_cvtpd_epi32(simde__m128d a)2546 simde_mm_cvtpd_epi32 (simde__m128d a) {
2547 #if defined(SIMDE_X86_SSE2_NATIVE)
2548 return _mm_cvtpd_epi32(a);
2549 #else
2550 simde__m128i_private r_;
2551
2552 r_.m64[0] = simde_mm_cvtpd_pi32(a);
2553 r_.m64[1] = simde_mm_setzero_si64();
2554
2555 return simde__m128i_from_private(r_);
2556 #endif
2557 }
2558 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2559 #define _mm_cvtpd_epi32(a) simde_mm_cvtpd_epi32(a)
2560 #endif
2561
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_cvtpd_ps (simde__m128d a) {
  /* Narrow two doubles to floats in the low half of the result; the high
   * two float lanes are zeroed (_mm_cvtpd_ps). */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cvtpd_ps(a);
  #else
    simde__m128_private r_;
    simde__m128d_private a_ = simde__m128d_to_private(a);

    #if defined(SIMDE_CONVERT_VECTOR_)
      SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, a_.f64);
      r_.m64_private[1] = simde__m64_to_private(simde_mm_setzero_si64());
    #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* NOTE(review): vcvtx_f32_f64 narrows with round-to-odd rather than
       * the current rounding mode — confirm this precision trade-off is
       * acceptable for callers. */
      r_.neon_f32 = vreinterpretq_f32_f64(vcombine_f64(vreinterpret_f64_f32(vcvtx_f32_f64(a_.neon_f64)), vdup_n_f64(0)));
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
        r_.f32[i] = (simde_float32) a_.f64[i];
      }
      /* Zero the upper two float lanes. */
      simde_memset(&(r_.m64_private[1]), 0, sizeof(r_.m64_private[1]));
    #endif

    return simde__m128_from_private(r_);
  #endif
}
2587 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2588 #define _mm_cvtpd_ps(a) simde_mm_cvtpd_ps(a)
2589 #endif
2590
/* Widen the two int32 lanes of the MMX value `a` to float64 (CVTPI2PD).
 * Exact conversion: every int32 is representable as a double. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cvtpi32_pd (simde__m64 a) {
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    return _mm_cvtpi32_pd(a);
  #else
    simde__m128d_private r_;
    simde__m64_private a_ = simde__m64_to_private(a);

    #if defined(SIMDE_CONVERT_VECTOR_)
      SIMDE_CONVERT_VECTOR_(r_.f64, a_.i32);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
        r_.f64[i] = (simde_float64) a_.i32[i];
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvtpi32_pd(a) simde_mm_cvtpi32_pd(a)
#endif
2615
/* Convert four float32 lanes to int32 with rounding (CVTPS2DQ). The native
 * instruction rounds per MXCSR (round-to-nearest-even by default); fast-path
 * NEON/AltiVec variants are only used when the relevant SIMDE_FAST_* knobs
 * accept their rounding/range differences. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvtps_epi32 (simde__m128 a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cvtps_epi32(a);
  #else
    simde__m128i_private r_;
    simde__m128_private a_ = simde__m128_to_private(a);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE)
      /* vcvtnq rounds to nearest-even, matching default x86 behavior. */
      r_.neon_i32 = vcvtnq_s32_f32(a_.neon_f32);
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_ROUND_TIES)
      r_.neon_i32 = vcvtnq_s32_f32(a_.neon_f32);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_ROUND_TIES)
      HEDLEY_DIAGNOSTIC_PUSH
      SIMDE_DIAGNOSTIC_DISABLE_C11_EXTENSIONS_
      SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_
      r_.altivec_i32 = vec_cts(a_.altivec_f32, 1);
      HEDLEY_DIAGNOSTIC_POP
    #else
      /* Round to nearest first, then convert each lane; out-of-range inputs
       * produce INT32_MIN (the x86 "integer indefinite") unless
       * SIMDE_FAST_CONVERSION_RANGE waives the check. */
      a_ = simde__m128_to_private(simde_x_mm_round_ps(a, SIMDE_MM_FROUND_TO_NEAREST_INT, 1));
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        simde_float32 v = simde_math_roundf(a_.f32[i]);
        #if defined(SIMDE_FAST_CONVERSION_RANGE)
          r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
        #else
          r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ?
            SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
        #endif
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvtps_epi32(a) simde_mm_cvtps_epi32(a)
#endif
2655
/* Widen the low two float32 lanes of `a` to float64 (CVTPS2PD); the high
 * two f32 lanes are ignored. Widening is exact. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cvtps_pd (simde__m128 a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cvtps_pd(a);
  #else
    simde__m128d_private r_;
    simde__m128_private a_ = simde__m128_to_private(a);

    #if defined(SIMDE_CONVERT_VECTOR_)
      SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].f32);
    #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vcvt_f64_f32(vget_low_f32(a_.neon_f32));
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
        r_.f64[i] = a_.f32[i];
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvtps_pd(a) simde_mm_cvtps_pd(a)
#endif
2682
/* Convert the low float64 lane of `a` to int32 with rounding (CVTSD2SI).
 * Out-of-range values yield INT32_MIN unless SIMDE_FAST_CONVERSION_RANGE
 * disables the check. */
SIMDE_FUNCTION_ATTRIBUTES
int32_t
simde_mm_cvtsd_si32 (simde__m128d a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cvtsd_si32(a);
  #else
    simde__m128d_private a_ = simde__m128d_to_private(a);

    simde_float64 v = simde_math_round(a_.f64[0]);
    #if defined(SIMDE_FAST_CONVERSION_RANGE)
      return SIMDE_CONVERT_FTOI(int32_t, v);
    #else
      return ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ?
        SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
    #endif
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvtsd_si32(a) simde_mm_cvtsd_si32(a)
#endif
2703
/* Convert the low float64 lane of `a` to int64 with rounding (CVTSD2SI,
 * 64-bit form). PGI spells the native intrinsic with the `x` suffix.
 * NOTE(review): the portable path performs no range check, unlike the
 * 32-bit variant — out-of-range doubles are UB in the cast; confirm this
 * relies on SIMDE_CONVERT_FTOI's behavior. */
SIMDE_FUNCTION_ATTRIBUTES
int64_t
simde_mm_cvtsd_si64 (simde__m128d a) {
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
    #if defined(__PGI)
      return _mm_cvtsd_si64x(a);
    #else
      return _mm_cvtsd_si64(a);
    #endif
  #else
    simde__m128d_private a_ = simde__m128d_to_private(a);
    return SIMDE_CONVERT_FTOI(int64_t, simde_math_round(a_.f64[0]));
  #endif
}
#define simde_mm_cvtsd_si64x(a) simde_mm_cvtsd_si64(a)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvtsd_si64(a) simde_mm_cvtsd_si64(a)
  #define _mm_cvtsd_si64x(a) simde_mm_cvtsd_si64x(a)
#endif
2723
/* CVTSD2SS: lane 0 of the result is `b`'s low double narrowed to float;
 * the remaining three lanes are copied from `a`. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_cvtsd_ss (simde__m128 a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cvtsd_ss(a, b);
  #else
    simde__m128_private
      r_,
      a_ = simde__m128_to_private(a);
    simde__m128d_private b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* NOTE(review): vcvtxd narrows with round-to-odd; may differ from the
       * x86 rounding mode in the last bit — confirm acceptable. */
      r_.neon_f32 = vsetq_lane_f32(vcvtxd_f32_f64(vgetq_lane_f64(b_.neon_f64, 0)), a_.neon_f32, 0);
    #else
      r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b_.f64[0]);

      /* Copy lanes 1..3 from `a` bit-exactly (as i32, so NaN payloads survive). */
      SIMDE_VECTORIZE
      for (size_t i = 1 ; i < (sizeof(r_) / sizeof(r_.i32[0])) ; i++) {
        r_.i32[i] = a_.i32[i];
      }
    #endif
    return simde__m128_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvtsd_ss(a, b) simde_mm_cvtsd_ss(a, b)
#endif
2751
/* SIMDe extension (no x86 equivalent): extract the lowest int16 lane of `a`. */
SIMDE_FUNCTION_ATTRIBUTES
int16_t
simde_x_mm_cvtsi128_si16 (simde__m128i a) {
  simde__m128i_private
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vgetq_lane_s16(a_.neon_i16, 0);
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    return HEDLEY_STATIC_CAST(int16_t, wasm_i16x8_extract_lane(a_.wasm_v128, 0));
  #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
    #if defined(SIMDE_BUG_GCC_95227)
      /* Silence a spurious unused-variable warning from the GCC bug. */
      (void) a_;
    #endif
    return vec_extract(a_.altivec_i16, 0);
  #else
    return a_.i16[0];
  #endif
}
2771
/* MOVD: extract the lowest int32 lane of `a`. */
SIMDE_FUNCTION_ATTRIBUTES
int32_t
simde_mm_cvtsi128_si32 (simde__m128i a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cvtsi128_si32(a);
  #else
    simde__m128i_private
      a_ = simde__m128i_to_private(a);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      return vgetq_lane_s32(a_.neon_i32, 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      return HEDLEY_STATIC_CAST(int32_t, wasm_i32x4_extract_lane(a_.wasm_v128, 0));
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      #if defined(SIMDE_BUG_GCC_95227)
        /* Silence a spurious unused-variable warning from the GCC bug. */
        (void) a_;
      #endif
      return vec_extract(a_.altivec_i32, 0);
    #else
      return a_.i32[0];
    #endif
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvtsi128_si32(a) simde_mm_cvtsi128_si32(a)
#endif
2798
/* MOVQ: extract the lowest int64 lane of `a`. PGI uses the `x`-suffixed
 * native name. Note the platform branches `return` directly; the final
 * `return a_.i64[0]` is the fallback when no branch was taken. */
SIMDE_FUNCTION_ATTRIBUTES
int64_t
simde_mm_cvtsi128_si64 (simde__m128i a) {
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
    #if defined(__PGI)
      return _mm_cvtsi128_si64x(a);
    #else
      return _mm_cvtsi128_si64(a);
    #endif
  #else
    simde__m128i_private a_ = simde__m128i_to_private(a);
    #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && !defined(HEDLEY_IBM_VERSION)
      return vec_extract(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed long long), a_.i64), 0);
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      return vgetq_lane_s64(a_.neon_i64, 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      return HEDLEY_STATIC_CAST(int64_t, wasm_i64x2_extract_lane(a_.wasm_v128, 0));
    #endif
    return a_.i64[0];
  #endif
}
#define simde_mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64(a)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvtsi128_si64(a) simde_mm_cvtsi128_si64(a)
  #define _mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64x(a)
#endif
2825
2826 SIMDE_FUNCTION_ATTRIBUTES
2827 simde__m128d
simde_mm_cvtsi32_sd(simde__m128d a,int32_t b)2828 simde_mm_cvtsi32_sd (simde__m128d a, int32_t b) {
2829 #if defined(SIMDE_X86_SSE2_NATIVE)
2830 return _mm_cvtsi32_sd(a, b);
2831 #else
2832 simde__m128d_private r_;
2833 simde__m128d_private a_ = simde__m128d_to_private(a);
2834
2835 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_AMD64)
2836 r_.neon_f64 = vsetq_lane_f64(HEDLEY_STATIC_CAST(float64_t, b), a_.neon_f64, 0);
2837 #else
2838 r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b);
2839 r_.i64[1] = a_.i64[1];
2840 #endif
2841
2842 return simde__m128d_from_private(r_);
2843 #endif
2844 }
2845 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2846 #define _mm_cvtsi32_sd(a, b) simde_mm_cvtsi32_sd(a, b)
2847 #endif
2848
/* SIMDe extension (no x86 equivalent): place `a` in the lowest int16 lane
 * and zero the remaining seven lanes. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_cvtsi16_si128 (int16_t a) {
  simde__m128i_private r_;

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i16 = vsetq_lane_s16(a, vdupq_n_s16(0), 0);
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_i16x8_make(a, 0, 0, 0, 0, 0, 0, 0);
  #else
    r_.i16[0] = a;
    r_.i16[1] = 0;
    r_.i16[2] = 0;
    r_.i16[3] = 0;
    r_.i16[4] = 0;
    r_.i16[5] = 0;
    r_.i16[6] = 0;
    r_.i16[7] = 0;
  #endif

  return simde__m128i_from_private(r_);
}
2871
/* MOVD: place `a` in the lowest int32 lane and zero the upper three lanes. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvtsi32_si128 (int32_t a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cvtsi32_si128(a);
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i32 = vsetq_lane_s32(a, vdupq_n_s32(0), 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i32x4_make(a, 0, 0, 0);
    #else
      r_.i32[0] = a;
      r_.i32[1] = 0;
      r_.i32[2] = 0;
      r_.i32[3] = 0;
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvtsi32_si128(a) simde_mm_cvtsi32_si128(a)
#endif
2897
/* CVTSI2SD (64-bit): lane 0 of the result is `b` converted to double;
 * lane 1 is copied from `a`. PGI uses the `x`-suffixed native name.
 * NOTE(review): int64 -> double can round for |b| > 2^53 (matches hardware). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cvtsi64_sd (simde__m128d a, int64_t b) {
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
    #if !defined(__PGI)
      return _mm_cvtsi64_sd(a, b);
    #else
      return _mm_cvtsi64x_sd(a, b);
    #endif
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vsetq_lane_f64(HEDLEY_STATIC_CAST(float64_t, b), a_.neon_f64, 0);
    #else
      r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b);
      r_.f64[1] = a_.f64[1];
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#define simde_mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64_sd(a, b)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvtsi64_sd(a, b) simde_mm_cvtsi64_sd(a, b)
  #define _mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64x_sd(a, b)
#endif
2927
/* MOVQ: place `a` in the lowest int64 lane and zero the upper lane.
 * PGI uses the `x`-suffixed native name. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvtsi64_si128 (int64_t a) {
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
    #if !defined(__PGI)
      return _mm_cvtsi64_si128(a);
    #else
      return _mm_cvtsi64x_si128(a);
    #endif
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i64 = vsetq_lane_s64(a, vdupq_n_s64(0), 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i64x2_make(a, 0);
    #else
      r_.i64[0] = a;
      r_.i64[1] = 0;
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#define simde_mm_cvtsi64x_si128(a) simde_mm_cvtsi64_si128(a)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvtsi64_si128(a) simde_mm_cvtsi64_si128(a)
  #define _mm_cvtsi64x_si128(a) simde_mm_cvtsi64x_si128(a)
#endif
2957
/* CVTSS2SD: lane 0 of the result is `b`'s low float widened to double
 * (exact); lane 1 is copied from `a`. Portable path writes into a_'s copy
 * in place instead of using a separate r_. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cvtss_sd (simde__m128d a, simde__m128 b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cvtss_sd(a, b);
  #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    /* Widen b[0] into a temporary, then restore a[1] into the upper lane. */
    float64x2_t temp = vcvt_f64_f32(vset_lane_f32(vgetq_lane_f32(simde__m128_to_private(b).neon_f32, 0), vdup_n_f32(0), 0));
    return vsetq_lane_f64(vgetq_lane_f64(simde__m128d_to_private(a).neon_f64, 1), temp, 1);
  #else
    simde__m128d_private
      a_ = simde__m128d_to_private(a);
    simde__m128_private b_ = simde__m128_to_private(b);

    a_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b_.f32[0]);

    return simde__m128d_from_private(a_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvtss_sd(a, b) simde_mm_cvtss_sd(a, b)
#endif
2979
/* CVTTPD2PI: convert the two float64 lanes of `a` to int32 with truncation
 * (the extra 't'), returning an MMX value. Out-of-range inputs produce
 * INT32_MIN unless SIMDE_FAST_CONVERSION_RANGE waives the check. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m64
simde_mm_cvttpd_pi32 (simde__m128d a) {
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    return _mm_cvttpd_pi32(a);
  #else
    simde__m64_private r_;
    simde__m128d_private a_ = simde__m128d_to_private(a);

    #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE)
      SIMDE_CONVERT_VECTOR_(r_.i32, a_.f64);
    #else
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        simde_float64 v = a_.f64[i];
        #if defined(SIMDE_FAST_CONVERSION_RANGE)
          r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
        #else
          r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ?
            SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
        #endif
      }
    #endif

    return simde__m64_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvttpd_pi32(a) simde_mm_cvttpd_pi32(a)
#endif
3009
/* CVTTPD2DQ: truncating double -> int32 conversion; results in the low two
 * lanes, high two lanes zeroed. Delegates to simde_mm_cvttpd_pi32. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvttpd_epi32 (simde__m128d a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cvttpd_epi32(a);
  #else
    simde__m128i_private r_;

    r_.m64[0] = simde_mm_cvttpd_pi32(a);
    r_.m64[1] = simde_mm_setzero_si64();

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvttpd_epi32(a) simde_mm_cvttpd_epi32(a)
#endif
3027
/* CVTTPS2DQ: convert four float32 lanes to int32 with truncation.
 * Out-of-range inputs produce INT32_MIN unless SIMDE_FAST_CONVERSION_RANGE
 * waives the check (NEON/vector-convert fast paths require that flag). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvttps_epi32 (simde__m128 a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cvttps_epi32(a);
  #else
    simde__m128i_private r_;
    simde__m128_private a_ = simde__m128_to_private(a);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE)
      /* vcvtq truncates toward zero, matching CVTTPS2DQ for in-range values. */
      r_.neon_i32 = vcvtq_s32_f32(a_.neon_f32);
    #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE)
      SIMDE_CONVERT_VECTOR_(r_.i32, a_.f32);
    #else
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        simde_float32 v = a_.f32[i];
        #if defined(SIMDE_FAST_CONVERSION_RANGE)
          r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
        #else
          r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ?
            SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
        #endif
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvttps_epi32(a) simde_mm_cvttps_epi32(a)
#endif
3059
/* CVTTSD2SI: convert the low float64 lane of `a` to int32 with truncation.
 * Out-of-range values yield INT32_MIN unless SIMDE_FAST_CONVERSION_RANGE
 * disables the check. */
SIMDE_FUNCTION_ATTRIBUTES
int32_t
simde_mm_cvttsd_si32 (simde__m128d a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cvttsd_si32(a);
  #else
    simde__m128d_private a_ = simde__m128d_to_private(a);
    simde_float64 v = a_.f64[0];
    #if defined(SIMDE_FAST_CONVERSION_RANGE)
      return SIMDE_CONVERT_FTOI(int32_t, v);
    #else
      return ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ?
        SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
    #endif
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvttsd_si32(a) simde_mm_cvttsd_si32(a)
#endif
3079
/* CVTTSD2SI (64-bit): truncating double -> int64 conversion of the low lane.
 * PGI uses the `x`-suffixed native name. NOTE(review): the portable path
 * performs no range check, unlike the 32-bit variant — confirm intended. */
SIMDE_FUNCTION_ATTRIBUTES
int64_t
simde_mm_cvttsd_si64 (simde__m128d a) {
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
    #if !defined(__PGI)
      return _mm_cvttsd_si64(a);
    #else
      return _mm_cvttsd_si64x(a);
    #endif
  #else
    simde__m128d_private a_ = simde__m128d_to_private(a);
    return SIMDE_CONVERT_FTOI(int64_t, a_.f64[0]);
  #endif
}
#define simde_mm_cvttsd_si64x(a) simde_mm_cvttsd_si64(a)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvttsd_si64(a) simde_mm_cvttsd_si64(a)
  #define _mm_cvttsd_si64x(a) simde_mm_cvttsd_si64x(a)
#endif
3099
/* DIVPD: lane-wise double-precision division, r[i] = a[i] / b[i]. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_div_pd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_div_pd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.f64 = a_.f64 / b_.f64;
    #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vdivq_f64(a_.neon_f64, b_.neon_f64);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_f64x2_div(a_.wasm_v128, b_.wasm_v128);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
        r_.f64[i] = a_.f64[i] / b_.f64[i];
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_div_pd(a, b) simde_mm_div_pd(a, b)
#endif
3130
3131 SIMDE_FUNCTION_ATTRIBUTES
3132 simde__m128d
simde_mm_div_sd(simde__m128d a,simde__m128d b)3133 simde_mm_div_sd (simde__m128d a, simde__m128d b) {
3134 #if defined(SIMDE_X86_SSE2_NATIVE)
3135 return _mm_div_sd(a, b);
3136 #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
3137 return simde_mm_move_sd(a, simde_mm_div_pd(a, b));
3138 #else
3139 simde__m128d_private
3140 r_,
3141 a_ = simde__m128d_to_private(a),
3142 b_ = simde__m128d_to_private(b);
3143
3144 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3145 float64x2_t temp = vdivq_f64(a_.neon_f64, b_.neon_f64);
3146 r_.neon_f64 = vsetq_lane_f64(vgetq_lane(a_.neon_f64, 1), temp, 1);
3147 #else
3148 r_.f64[0] = a_.f64[0] / b_.f64[0];
3149 r_.f64[1] = a_.f64[1];
3150 #endif
3151
3152 return simde__m128d_from_private(r_);
3153 #endif
3154 }
3155 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3156 #define _mm_div_sd(a, b) simde_mm_div_sd(a, b)
3157 #endif
3158
/* PEXTRW: extract the 16-bit lane selected by `imm8` (0-7), zero-extended
 * into an int32 (hence the intermediate uint16_t). The function form is
 * shadowed by macro versions below when a native/NEON path can take a
 * compile-time immediate. */
SIMDE_FUNCTION_ATTRIBUTES
int32_t
simde_mm_extract_epi16 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) {
  uint16_t r;
  simde__m128i_private a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
    #if defined(SIMDE_BUG_GCC_95227)
      /* Silence spurious unused warnings from the GCC bug. */
      (void) a_;
      (void) imm8;
    #endif
    r = HEDLEY_STATIC_CAST(uint16_t, vec_extract(a_.altivec_i16, imm8));
  #else
    r = a_.u16[imm8 & 7];
  #endif

  return HEDLEY_STATIC_CAST(int32_t, r);
}
#if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,6,0))
  #define simde_mm_extract_epi16(a, imm8) _mm_extract_epi16(a, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* Mask to 16 bits to reproduce the zero-extension of the scalar path. */
  #define simde_mm_extract_epi16(a, imm8) (HEDLEY_STATIC_CAST(int32_t, vgetq_lane_s16(simde__m128i_to_private(a).neon_i16, (imm8))) & (INT32_C(0x0000ffff)))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_extract_epi16(a, imm8) simde_mm_extract_epi16(a, imm8)
#endif
3186
/* PINSRW: return `a` with the 16-bit lane selected by `imm8` (0-7) replaced
 * by `i`. Shadowed by macro versions when the immediate can be passed
 * through to a native/NEON intrinsic. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_insert_epi16 (simde__m128i a, int16_t i, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) {
  simde__m128i_private a_ = simde__m128i_to_private(a);
  a_.i16[imm8 & 7] = i;
  return simde__m128i_from_private(a_);
}
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
  #define simde_mm_insert_epi16(a, i, imm8) _mm_insert_epi16((a), (i), (imm8))
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  #define simde_mm_insert_epi16(a, i, imm8) simde__m128i_from_neon_i16(vsetq_lane_s16((i), simde__m128i_to_neon_i16(a), (imm8)))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_insert_epi16(a, i, imm8) simde_mm_insert_epi16(a, i, imm8)
#endif
3203
/* MOVAPD load: read two doubles from 16-byte-aligned `mem_addr`. Caller
 * must guarantee alignment (the memcpy path asserts it via
 * SIMDE_ALIGN_ASSUME_LIKE). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_load_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_load_pd(mem_addr);
  #else
    simde__m128d_private r_;

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vld1q_f64(mem_addr);
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* No f64 vectors on A32; load the raw bits as u32x4. */
      r_.neon_u32 = vld1q_u32(HEDLEY_REINTERPRET_CAST(uint32_t const*, mem_addr));
    #else
      simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128d), sizeof(r_));
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_load_pd(mem_addr) simde_mm_load_pd(mem_addr)
#endif
3226
/* MOVDDUP-style load: broadcast the double at `mem_addr` into both lanes.
 * Also exposed as _mm_load_pd1 via the alias below. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_load1_pd (simde_float64 const* mem_addr) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_load1_pd(mem_addr);
  #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    return simde__m128d_from_neon_f64(vld1q_dup_f64(mem_addr));
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    return simde__m128d_from_wasm_v128(wasm_v64x2_load_splat(mem_addr));
  #else
    return simde_mm_set1_pd(*mem_addr);
  #endif
}
#define simde_mm_load_pd1(mem_addr) simde_mm_load1_pd(mem_addr)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_load_pd1(mem_addr) simde_mm_load1_pd(mem_addr)
  #define _mm_load1_pd(mem_addr) simde_mm_load1_pd(mem_addr)
#endif
3245
/* MOVSD load: lane 0 is the double at `mem_addr`; lane 1 is zeroed. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_load_sd (simde_float64 const* mem_addr) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_load_sd(mem_addr);
  #else
    simde__m128d_private r_;

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vsetq_lane_f64(*mem_addr, vdupq_n_f64(0), 0);
    #else
      r_.f64[0] = *mem_addr;
      /* Zero the upper lane as raw bits. */
      r_.u64[1] = UINT64_C(0);
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_load_sd(mem_addr) simde_mm_load_sd(mem_addr)
#endif
3267
/* MOVDQA load: read 128 bits from 16-byte-aligned `mem_addr`. Caller must
 * guarantee alignment. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_load_si128 (simde__m128i const* mem_addr) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_load_si128(HEDLEY_REINTERPRET_CAST(__m128i const*, mem_addr));
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      /* vec_ld requires (and exploits) 16-byte alignment. */
      r_.altivec_i32 = vec_ld(0, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(int) const*, mem_addr));
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr));
    #else
      simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128i), sizeof(simde__m128i));
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_load_si128(mem_addr) simde_mm_load_si128(mem_addr)
#endif
3290
/* MOVHPD: lane 0 is copied from `a`; lane 1 is the (possibly unaligned)
 * double at `mem_addr` — read via memcpy to avoid alignment UB. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_loadh_pd (simde__m128d a, simde_float64 const* mem_addr) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_loadh_pd(a, mem_addr);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vcombine_f64(vget_low_f64(a_.neon_f64), vld1_f64(HEDLEY_REINTERPRET_CAST(const float64_t*, mem_addr)));
    #else
      simde_float64 t;

      simde_memcpy(&t, mem_addr, sizeof(t));
      r_.f64[0] = a_.f64[0];
      r_.f64[1] = t;
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_loadh_pd(a, mem_addr) simde_mm_loadh_pd(a, mem_addr)
#endif
3317
/* MOVQ load: low 64 bits from (possibly unaligned) `mem_addr`, upper 64
 * bits zeroed. The unconditional memcpy keeps the access alignment-safe;
 * NOTE(review): `value` is unused on the NEON branch, which re-loads
 * directly — harmless but may warn. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_loadl_epi64 (simde__m128i const* mem_addr) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_loadl_epi64(mem_addr);
  #else
    simde__m128i_private r_;

    int64_t value;
    simde_memcpy(&value, mem_addr, sizeof(value));

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i64 = vcombine_s64(vld1_s64(HEDLEY_REINTERPRET_CAST(int64_t const *, mem_addr)), vdup_n_s64(0));
    #else
      r_.i64[0] = value;
      r_.i64[1] = 0;
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_loadl_epi64(mem_addr) simde_mm_loadl_epi64(mem_addr)
#endif
3342
/* MOVLPD: lane 0 is the double at `mem_addr`; lane 1 is copied from `a`. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_loadl_pd (simde__m128d a, simde_float64 const* mem_addr) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_loadl_pd(a, mem_addr);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vcombine_f64(vld1_f64(
        HEDLEY_REINTERPRET_CAST(const float64_t*, mem_addr)), vget_high_f64(a_.neon_f64));
    #else
      r_.f64[0] = *mem_addr;
      /* Copy lane 1 as raw bits so NaN payloads survive. */
      r_.u64[1] = a_.u64[1];
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_loadl_pd(a, mem_addr) simde_mm_loadl_pd(a, mem_addr)
#endif
3367
/* Reversed load (MOVAPD + SHUFPD): r[0] = mem_addr[1], r[1] = mem_addr[0].
 * Requires 16-byte-aligned `mem_addr`, like the native intrinsic. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_loadr_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_loadr_pd(mem_addr);
  #else
    simde__m128d_private
      r_;

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* Load then rotate the two lanes. */
      r_.neon_f64 = vld1q_f64(mem_addr);
      r_.neon_f64 = vextq_f64(r_.neon_f64, r_.neon_f64, 1);
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i64 = vld1q_s64(HEDLEY_REINTERPRET_CAST(int64_t const *, mem_addr));
      r_.neon_i64 = vextq_s64(r_.neon_i64, r_.neon_i64, 1);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      v128_t tmp = wasm_v128_load(mem_addr);
      r_.wasm_v128 = wasm_v64x2_shuffle(tmp, tmp, 1, 0);
    #else
      r_.f64[0] = mem_addr[1];
      r_.f64[1] = mem_addr[0];
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_loadr_pd(mem_addr) simde_mm_loadr_pd(mem_addr)
#endif
3397
/* MOVUPD: unaligned load of two doubles (memcpy fallback avoids alignment UB). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_loadu_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_loadu_pd(mem_addr);
  #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    return vld1q_f64(mem_addr);
  #else
    simde__m128d_private r_;

    simde_memcpy(&r_, mem_addr, sizeof(r_));

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_loadu_pd(mem_addr) simde_mm_loadu_pd(mem_addr)
#endif
3416
/* SIMDe extension: unaligned 128-bit load from an int8_t pointer
 * (typed counterpart of _mm_loadu_si128). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_loadu_epi8(int8_t const* mem_addr) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_loadu_si128(SIMDE_ALIGN_CAST(simde__m128i const*, mem_addr));
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i8 = vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr));
    #else
      simde_memcpy(&r_, mem_addr, sizeof(r_));
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
3434
/* SIMDe extension: unaligned 128-bit load from an int16_t pointer. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_loadu_epi16(int16_t const* mem_addr) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_loadu_si128(SIMDE_ALIGN_CAST(simde__m128i const*, mem_addr));
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i16 = vld1q_s16(HEDLEY_REINTERPRET_CAST(int16_t const*, mem_addr));
    #else
      simde_memcpy(&r_, mem_addr, sizeof(r_));
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
3452
/* SIMDe extension: unaligned 128-bit load from an int32_t pointer. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_loadu_epi32(int32_t const* mem_addr) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_loadu_si128(SIMDE_ALIGN_CAST(simde__m128i const*, mem_addr));
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr));
    #else
      simde_memcpy(&r_, mem_addr, sizeof(r_));
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
3470
/* SIMDe extension: unaligned 128-bit load from an int64_t pointer. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_loadu_epi64(int64_t const* mem_addr) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_loadu_si128(SIMDE_ALIGN_CAST(simde__m128i const*, mem_addr));
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i64 = vld1q_s64(HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr));
    #else
      simde_memcpy(&r_, mem_addr, sizeof(r_));
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
3488
/* MOVDQU: unaligned 128-bit load. Takes void* so callers need not cast.
 * The packed/may_alias struct path tells GCC-family compilers the access is
 * unaligned and may alias anything, which generates better code than memcpy
 * on some targets. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_loadu_si128 (void const* mem_addr) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_loadu_si128(HEDLEY_STATIC_CAST(__m128i const*, mem_addr));
  #else
    simde__m128i_private r_;

    #if HEDLEY_GNUC_HAS_ATTRIBUTE(may_alias,3,3,0)
      HEDLEY_DIAGNOSTIC_PUSH
      SIMDE_DIAGNOSTIC_DISABLE_PACKED_
      struct simde_mm_loadu_si128_s {
        __typeof__(r_) v;
      } __attribute__((__packed__, __may_alias__));
      r_ = HEDLEY_REINTERPRET_CAST(const struct simde_mm_loadu_si128_s *, mem_addr)->v;
      HEDLEY_DIAGNOSTIC_POP
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* Note that this is a lower priority than the struct above since
       * clang assumes mem_addr is aligned (since it is a __m128i*). */
      r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr));
    #else
      simde_memcpy(&r_, mem_addr, sizeof(r_));
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_loadu_si128(mem_addr) simde_mm_loadu_si128(mem_addr)
#endif
3519
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_madd_epi16 (simde__m128i a, simde__m128i b) {
  /* _mm_madd_epi16: multiply corresponding int16 lanes to int32, then
   * horizontally add adjacent products, yielding four int32 results. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_madd_epi16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* Widening multiply of low/high halves, then pairwise add. */
      int32x4_t pl = vmull_s16(vget_low_s16(a_.neon_i16), vget_low_s16(b_.neon_i16));
      int32x4_t ph = vmull_high_s16(a_.neon_i16, b_.neon_i16);
      r_.neon_i32 = vpaddq_s32(pl, ph);
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* ARMv7 has no 128-bit pairwise add; do it on 64-bit halves. */
      int32x4_t pl = vmull_s16(vget_low_s16(a_.neon_i16), vget_low_s16(b_.neon_i16));
      int32x4_t ph = vmull_s16(vget_high_s16(a_.neon_i16), vget_high_s16(b_.neon_i16));
      int32x2_t rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
      int32x2_t rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
      r_.neon_i32 = vcombine_s32(rl, rh);
    #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
      /* vec_msum is multiply-sum with an accumulator; use a zero vector. */
      static const SIMDE_POWER_ALTIVEC_VECTOR(int) tz = { 0, 0, 0, 0 };
      r_.altivec_i32 = vec_msum(a_.altivec_i16, b_.altivec_i16, tz);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i += 2) {
        r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) + (a_.i16[i + 1] * b_.i16[i + 1]);
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_madd_epi16(a, b) simde_mm_madd_epi16(a, b)
#endif
3557
3558 SIMDE_FUNCTION_ATTRIBUTES
3559 void
simde_mm_maskmoveu_si128(simde__m128i a,simde__m128i mask,int8_t mem_addr[HEDLEY_ARRAY_PARAM (16)])3560 simde_mm_maskmoveu_si128 (simde__m128i a, simde__m128i mask, int8_t mem_addr[HEDLEY_ARRAY_PARAM(16)]) {
3561 #if defined(SIMDE_X86_SSE2_NATIVE)
3562 _mm_maskmoveu_si128(a, mask, HEDLEY_REINTERPRET_CAST(char*, mem_addr));
3563 #else
3564 simde__m128i_private
3565 a_ = simde__m128i_to_private(a),
3566 mask_ = simde__m128i_to_private(mask);
3567
3568 for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++) {
3569 if (mask_.u8[i] & 0x80) {
3570 mem_addr[i] = a_.i8[i];
3571 }
3572 }
3573 #endif
3574 }
3575 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3576 #define _mm_maskmoveu_si128(a, mask, mem_addr) simde_mm_maskmoveu_si128((a), (mask), SIMDE_CHECKED_REINTERPRET_CAST(int8_t*, char*, (mem_addr)))
3577 #endif
3578
SIMDE_FUNCTION_ATTRIBUTES
int32_t
simde_mm_movemask_epi8 (simde__m128i a) {
  /* _mm_movemask_epi8: gather the most-significant bit of each of the
   * 16 bytes of `a` into the low 16 bits of the result. */
  #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__INTEL_COMPILER)
    /* ICC has trouble with _mm_movemask_epi8 at -O2 and above: */
    return _mm_movemask_epi8(a);
  #else
    int32_t r = 0;
    simde__m128i_private a_ = simde__m128i_to_private(a);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* Isolate each byte's sign bit, shift it into the bit position for
       * that lane (negative shift counts shift right), then horizontally
       * add each half to pack 8 bits per half. */
      uint8x16_t input = a_.neon_u8;
      const int8_t xr[16] = {-7, -6, -5, -4, -3, -2, -1, 0, -7, -6, -5, -4, -3, -2, -1, 0};
      const uint8x16_t mask_and = vdupq_n_u8(0x80);
      const int8x16_t mask_shift = vld1q_s8(xr);
      const uint8x16_t mask_result =
          vshlq_u8(vandq_u8(input, mask_and), mask_shift);
      uint8x8_t lo = vget_low_u8(mask_result);
      uint8x8_t hi = vget_high_u8(mask_result);
      r = vaddv_u8(lo) + (vaddv_u8(hi) << 8);
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      // Use increasingly wide shifts+adds to collect the sign bits
      // together.
      // Since the widening shifts would be rather confusing to follow in little endian, everything
      // will be illustrated in big endian order instead. This has a different result - the bits
      // would actually be reversed on a big endian machine.

      // Starting input (only half the elements are shown):
      // 89 ff 1d c0 00 10 99 33
      uint8x16_t input = a_.neon_u8;

      // Shift out everything but the sign bits with an unsigned shift right.
      //
      // Bytes of the vector::
      // 89 ff 1d c0 00 10 99 33
      // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
      // |  |  |  |  |  |  |  |
      // 01 01 00 01 00 00 01 00
      //
      // Bits of first important lane(s):
      // 10001001 (89)
      // \______
      //        |
      // 00000001 (01)
      uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));

      // Merge the even lanes together with a 16-bit unsigned shift right + add.
      // 'xx' represents garbage data which will be ignored in the final result.
      // In the important bytes, the add functions like a binary OR.
      //
      // 01 01 00 01 00 00 01 00
      //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
      //    \|    \|    \|    \|
      // xx 03 xx 01 xx 00 xx 02
      //
      // 00000001 00000001 (01 01)
      //        \_______ |
      //                \|
      // xxxxxxxx xxxxxx11 (xx 03)
      uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));

      // Repeat with a wider 32-bit shift + add.
      // xx 03 xx 01 xx 00 xx 02
      //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >> 14))
      //          \|          \|
      // xx xx xx 0d xx xx xx 02
      //
      // 00000011 00000001 (03 01)
      //        \\_____ ||
      //         '----.\||
      // xxxxxxxx xxxx1101 (xx 0d)
      uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));

      // Last, an even wider 64-bit shift + add to get our result in the low 8 bit lanes.
      // xx xx xx 0d xx xx xx 02
      //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >> 28))
      //                      \|
      // xx xx xx xx xx xx xx d2
      //
      // 00001101 00000010 (0d 02)
      //     \   \___ |  |
      //      '---.  \|  |
      // xxxxxxxx 11010010 (xx d2)
      uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));

      // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
      // xx xx xx xx xx xx xx d2
      //                      ||  return paired64[0]
      //                      d2
      // Note: Little endian would return the correct value 4b (01001011) instead.
      r = vgetq_lane_u8(paired64, 0) | (HEDLEY_STATIC_CAST(int32_t, vgetq_lane_u8(paired64, 8)) << 8);
    #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(HEDLEY_IBM_VERSION) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
      /* vbpermq gathers arbitrary bits by index; this permute selects the
       * sign bit of each byte (endian-specific extract lane below). */
      static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) perm = { 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0 };
      r = HEDLEY_STATIC_CAST(int32_t, vec_extract(vec_vbpermq(a_.altivec_u8, perm), 1));
    #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(HEDLEY_IBM_VERSION) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG)
      static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) perm = { 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0 };
      r = HEDLEY_STATIC_CAST(int32_t, vec_extract(vec_vbpermq(a_.altivec_u8, perm), 14));
    #else
      SIMDE_VECTORIZE_REDUCTION(|:r)
      for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) {
        r |= (a_.u8[15 - i] >> 7) << (15 - i);
      }
    #endif

    return r;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_movemask_epi8(a) simde_mm_movemask_epi8(a)
#endif
3689
3690 SIMDE_FUNCTION_ATTRIBUTES
3691 int32_t
simde_mm_movemask_pd(simde__m128d a)3692 simde_mm_movemask_pd (simde__m128d a) {
3693 #if defined(SIMDE_X86_SSE2_NATIVE)
3694 return _mm_movemask_pd(a);
3695 #else
3696 int32_t r = 0;
3697 simde__m128d_private a_ = simde__m128d_to_private(a);
3698
3699 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3700 static const int64_t shift_amount[] = { 0, 1 };
3701 const int64x2_t shift = vld1q_s64(shift_amount);
3702 uint64x2_t tmp = vshrq_n_u64(a_.neon_u64, 63);
3703 return HEDLEY_STATIC_CAST(int32_t, vaddvq_u64(vshlq_u64(tmp, shift)));
3704 #else
3705 SIMDE_VECTORIZE_REDUCTION(|:r)
3706 for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
3707 r |= (a_.u64[i] >> 63) << i;
3708 }
3709 #endif
3710
3711 return r;
3712 #endif
3713 }
3714 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3715 #define _mm_movemask_pd(a) simde_mm_movemask_pd(a)
3716 #endif
3717
3718 SIMDE_FUNCTION_ATTRIBUTES
3719 simde__m64
simde_mm_movepi64_pi64(simde__m128i a)3720 simde_mm_movepi64_pi64 (simde__m128i a) {
3721 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
3722 return _mm_movepi64_pi64(a);
3723 #else
3724 simde__m64_private r_;
3725 simde__m128i_private a_ = simde__m128i_to_private(a);
3726
3727 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3728 r_.neon_i64 = vget_low_s64(a_.neon_i64);
3729 #else
3730 r_.i64[0] = a_.i64[0];
3731 #endif
3732
3733 return simde__m64_from_private(r_);
3734 #endif
3735 }
3736 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3737 #define _mm_movepi64_pi64(a) simde_mm_movepi64_pi64(a)
3738 #endif
3739
3740 SIMDE_FUNCTION_ATTRIBUTES
3741 simde__m128i
simde_mm_movpi64_epi64(simde__m64 a)3742 simde_mm_movpi64_epi64 (simde__m64 a) {
3743 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
3744 return _mm_movpi64_epi64(a);
3745 #else
3746 simde__m128i_private r_;
3747 simde__m64_private a_ = simde__m64_to_private(a);
3748
3749 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3750 r_.neon_i64 = vcombine_s64(a_.neon_i64, vdup_n_s64(0));
3751 #else
3752 r_.i64[0] = a_.i64[0];
3753 r_.i64[1] = 0;
3754 #endif
3755
3756 return simde__m128i_from_private(r_);
3757 #endif
3758 }
3759 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3760 #define _mm_movpi64_epi64(a) simde_mm_movpi64_epi64(a)
3761 #endif
3762
3763 SIMDE_FUNCTION_ATTRIBUTES
3764 simde__m128i
simde_mm_min_epi16(simde__m128i a,simde__m128i b)3765 simde_mm_min_epi16 (simde__m128i a, simde__m128i b) {
3766 #if defined(SIMDE_X86_SSE2_NATIVE)
3767 return _mm_min_epi16(a, b);
3768 #else
3769 simde__m128i_private
3770 r_,
3771 a_ = simde__m128i_to_private(a),
3772 b_ = simde__m128i_to_private(b);
3773
3774 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3775 r_.neon_i16 = vminq_s16(a_.neon_i16, b_.neon_i16);
3776 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3777 r_.wasm_v128 = wasm_i16x8_min(a_.wasm_v128, b_.wasm_v128);
3778 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3779 r_.altivec_i16 = vec_min(a_.altivec_i16, b_.altivec_i16);
3780 #else
3781 SIMDE_VECTORIZE
3782 for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
3783 r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i];
3784 }
3785 #endif
3786
3787 return simde__m128i_from_private(r_);
3788 #endif
3789 }
3790 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3791 #define _mm_min_epi16(a, b) simde_mm_min_epi16(a, b)
3792 #endif
3793
3794 SIMDE_FUNCTION_ATTRIBUTES
3795 simde__m128i
simde_mm_min_epu8(simde__m128i a,simde__m128i b)3796 simde_mm_min_epu8 (simde__m128i a, simde__m128i b) {
3797 #if defined(SIMDE_X86_SSE2_NATIVE)
3798 return _mm_min_epu8(a, b);
3799 #else
3800 simde__m128i_private
3801 r_,
3802 a_ = simde__m128i_to_private(a),
3803 b_ = simde__m128i_to_private(b);
3804
3805 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3806 r_.neon_u8 = vminq_u8(a_.neon_u8, b_.neon_u8);
3807 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3808 r_.wasm_v128 = wasm_u8x16_min(a_.wasm_v128, b_.wasm_v128);
3809 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3810 r_.altivec_u8 = vec_min(a_.altivec_u8, b_.altivec_u8);
3811 #else
3812 SIMDE_VECTORIZE
3813 for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
3814 r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i];
3815 }
3816 #endif
3817
3818 return simde__m128i_from_private(r_);
3819 #endif
3820 }
3821 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3822 #define _mm_min_epu8(a, b) simde_mm_min_epu8(a, b)
3823 #endif
3824
3825 SIMDE_FUNCTION_ATTRIBUTES
3826 simde__m128d
simde_mm_min_pd(simde__m128d a,simde__m128d b)3827 simde_mm_min_pd (simde__m128d a, simde__m128d b) {
3828 #if defined(SIMDE_X86_SSE2_NATIVE)
3829 return _mm_min_pd(a, b);
3830 #else
3831 simde__m128d_private
3832 r_,
3833 a_ = simde__m128d_to_private(a),
3834 b_ = simde__m128d_to_private(b);
3835
3836 #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
3837 r_.altivec_f64 = vec_min(a_.altivec_f64, b_.altivec_f64);
3838 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3839 r_.neon_f64 = vminq_f64(a_.neon_f64, b_.neon_f64);
3840 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3841 r_.wasm_v128 = wasm_f64x2_min(a_.wasm_v128, b_.wasm_v128);
3842 #else
3843 SIMDE_VECTORIZE
3844 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
3845 r_.f64[i] = (a_.f64[i] < b_.f64[i]) ? a_.f64[i] : b_.f64[i];
3846 }
3847 #endif
3848
3849 return simde__m128d_from_private(r_);
3850 #endif
3851 }
3852 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3853 #define _mm_min_pd(a, b) simde_mm_min_pd(a, b)
3854 #endif
3855
3856 SIMDE_FUNCTION_ATTRIBUTES
3857 simde__m128d
simde_mm_min_sd(simde__m128d a,simde__m128d b)3858 simde_mm_min_sd (simde__m128d a, simde__m128d b) {
3859 #if defined(SIMDE_X86_SSE2_NATIVE)
3860 return _mm_min_sd(a, b);
3861 #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
3862 return simde_mm_move_sd(a, simde_mm_min_pd(a, b));
3863 #else
3864 simde__m128d_private
3865 r_,
3866 a_ = simde__m128d_to_private(a),
3867 b_ = simde__m128d_to_private(b);
3868
3869 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3870 float64x2_t temp = vminq_f64(a_.neon_f64, b_.neon_f64);
3871 r_.neon_f64 = vsetq_lane_f64(vgetq_lane(a_.neon_f64, 1), temp, 1);
3872 #else
3873 r_.f64[0] = (a_.f64[0] < b_.f64[0]) ? a_.f64[0] : b_.f64[0];
3874 r_.f64[1] = a_.f64[1];
3875 #endif
3876
3877 return simde__m128d_from_private(r_);
3878 #endif
3879 }
3880 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3881 #define _mm_min_sd(a, b) simde_mm_min_sd(a, b)
3882 #endif
3883
3884 SIMDE_FUNCTION_ATTRIBUTES
3885 simde__m128i
simde_mm_max_epi16(simde__m128i a,simde__m128i b)3886 simde_mm_max_epi16 (simde__m128i a, simde__m128i b) {
3887 #if defined(SIMDE_X86_SSE2_NATIVE)
3888 return _mm_max_epi16(a, b);
3889 #else
3890 simde__m128i_private
3891 r_,
3892 a_ = simde__m128i_to_private(a),
3893 b_ = simde__m128i_to_private(b);
3894
3895 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3896 r_.neon_i16 = vmaxq_s16(a_.neon_i16, b_.neon_i16);
3897 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3898 r_.wasm_v128 = wasm_i16x8_max(a_.wasm_v128, b_.wasm_v128);
3899 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3900 r_.altivec_i16 = vec_max(a_.altivec_i16, b_.altivec_i16);
3901 #else
3902 SIMDE_VECTORIZE
3903 for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
3904 r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? a_.i16[i] : b_.i16[i];
3905 }
3906 #endif
3907
3908 return simde__m128i_from_private(r_);
3909 #endif
3910 }
3911 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3912 #define _mm_max_epi16(a, b) simde_mm_max_epi16(a, b)
3913 #endif
3914
3915 SIMDE_FUNCTION_ATTRIBUTES
3916 simde__m128i
simde_mm_max_epu8(simde__m128i a,simde__m128i b)3917 simde_mm_max_epu8 (simde__m128i a, simde__m128i b) {
3918 #if defined(SIMDE_X86_SSE2_NATIVE)
3919 return _mm_max_epu8(a, b);
3920 #else
3921 simde__m128i_private
3922 r_,
3923 a_ = simde__m128i_to_private(a),
3924 b_ = simde__m128i_to_private(b);
3925
3926 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3927 r_.neon_u8 = vmaxq_u8(a_.neon_u8, b_.neon_u8);
3928 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3929 r_.wasm_v128 = wasm_u8x16_max(a_.wasm_v128, b_.wasm_v128);
3930 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3931 r_.altivec_u8 = vec_max(a_.altivec_u8, b_.altivec_u8);
3932 #else
3933 SIMDE_VECTORIZE
3934 for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
3935 r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i];
3936 }
3937 #endif
3938
3939 return simde__m128i_from_private(r_);
3940 #endif
3941 }
3942 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3943 #define _mm_max_epu8(a, b) simde_mm_max_epu8(a, b)
3944 #endif
3945
3946 SIMDE_FUNCTION_ATTRIBUTES
3947 simde__m128d
simde_mm_max_pd(simde__m128d a,simde__m128d b)3948 simde_mm_max_pd (simde__m128d a, simde__m128d b) {
3949 #if defined(SIMDE_X86_SSE2_NATIVE)
3950 return _mm_max_pd(a, b);
3951 #else
3952 simde__m128d_private
3953 r_,
3954 a_ = simde__m128d_to_private(a),
3955 b_ = simde__m128d_to_private(b);
3956
3957 #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
3958 r_.altivec_f64 = vec_max(a_.altivec_f64, b_.altivec_f64);
3959 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3960 r_.wasm_v128 = wasm_f64x2_max(a_.wasm_v128, b_.wasm_v128);
3961 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3962 r_.neon_f64 = vmaxq_f64(a_.neon_f64, b_.neon_f64);
3963 #else
3964 SIMDE_VECTORIZE
3965 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
3966 r_.f64[i] = (a_.f64[i] > b_.f64[i]) ? a_.f64[i] : b_.f64[i];
3967 }
3968 #endif
3969
3970 return simde__m128d_from_private(r_);
3971 #endif
3972 }
3973 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3974 #define _mm_max_pd(a, b) simde_mm_max_pd(a, b)
3975 #endif
3976
3977 SIMDE_FUNCTION_ATTRIBUTES
3978 simde__m128d
simde_mm_max_sd(simde__m128d a,simde__m128d b)3979 simde_mm_max_sd (simde__m128d a, simde__m128d b) {
3980 #if defined(SIMDE_X86_SSE2_NATIVE)
3981 return _mm_max_sd(a, b);
3982 #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
3983 return simde_mm_move_sd(a, simde_mm_max_pd(a, b));
3984 #else
3985 simde__m128d_private
3986 r_,
3987 a_ = simde__m128d_to_private(a),
3988 b_ = simde__m128d_to_private(b);
3989
3990 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3991 float64x2_t temp = vmaxq_f64(a_.neon_f64, b_.neon_f64);
3992 r_.neon_f64 = vsetq_lane_f64(vgetq_lane(a_.neon_f64, 1), temp, 1);
3993 #else
3994 r_.f64[0] = (a_.f64[0] > b_.f64[0]) ? a_.f64[0] : b_.f64[0];
3995 r_.f64[1] = a_.f64[1];
3996 #endif
3997
3998 return simde__m128d_from_private(r_);
3999 #endif
4000 }
4001 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4002 #define _mm_max_sd(a, b) simde_mm_max_sd(a, b)
4003 #endif
4004
4005 SIMDE_FUNCTION_ATTRIBUTES
4006 simde__m128i
simde_mm_move_epi64(simde__m128i a)4007 simde_mm_move_epi64 (simde__m128i a) {
4008 #if defined(SIMDE_X86_SSE2_NATIVE)
4009 return _mm_move_epi64(a);
4010 #else
4011 simde__m128i_private
4012 r_,
4013 a_ = simde__m128i_to_private(a);
4014
4015 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4016 r_.neon_i64 = vsetq_lane_s64(0, a_.neon_i64, 1);
4017 #else
4018 r_.i64[0] = a_.i64[0];
4019 r_.i64[1] = 0;
4020 #endif
4021
4022 return simde__m128i_from_private(r_);
4023 #endif
4024 }
4025 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4026 #define _mm_move_epi64(a) simde_mm_move_epi64(a)
4027 #endif
4028
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_mul_epu32 (simde__m128i a, simde__m128i b) {
  /* _mm_mul_epu32: multiply the even-indexed unsigned 32-bit lanes
   * (lanes 0 and 2) producing two unsigned 64-bit results. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_mul_epu32(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* vmovn_u64 keeps the low 32 bits of each 64-bit lane, i.e. the
       * even u32 lanes; vmull_u32 then widens the products to 64 bits. */
      uint32x2_t a_lo = vmovn_u64(a_.neon_u64);
      uint32x2_t b_lo = vmovn_u64(b_.neon_u64);
      r_.neon_u64 = vmull_u32(a_lo, b_lo);
    #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
      /* Shuffle zeros into the odd lanes so a plain 64-bit multiply of
       * the reinterpreted vectors yields the widened products.
       * Little-endian only: the zero must land in the high half. */
      __typeof__(a_.u32) z = { 0, };
      a_.u32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.u32, z, 0, 4, 2, 6);
      b_.u32 = SIMDE_SHUFFLE_VECTOR_(32, 16, b_.u32, z, 0, 4, 2, 6);
      r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), a_.u32) *
               HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), b_.u32);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
        r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[i * 2]) * HEDLEY_STATIC_CAST(uint64_t, b_.u32[i * 2]);
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_mul_epu32(a, b) simde_mm_mul_epu32(a, b)
#endif
4063
4064 SIMDE_FUNCTION_ATTRIBUTES
4065 simde__m128i
simde_x_mm_mul_epi64(simde__m128i a,simde__m128i b)4066 simde_x_mm_mul_epi64 (simde__m128i a, simde__m128i b) {
4067 simde__m128i_private
4068 r_,
4069 a_ = simde__m128i_to_private(a),
4070 b_ = simde__m128i_to_private(b);
4071
4072 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
4073 r_.i64 = a_.i64 * b_.i64;
4074 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4075 r_.neon_f64 = vmulq_s64(a_.neon_f64, b_.neon_f64);
4076 #else
4077 SIMDE_VECTORIZE
4078 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
4079 r_.i64[i] = a_.i64[i] * b_.i64[i];
4080 }
4081 #endif
4082
4083 return simde__m128i_from_private(r_);
4084 }
4085
4086 SIMDE_FUNCTION_ATTRIBUTES
4087 simde__m128i
simde_x_mm_mod_epi64(simde__m128i a,simde__m128i b)4088 simde_x_mm_mod_epi64 (simde__m128i a, simde__m128i b) {
4089 simde__m128i_private
4090 r_,
4091 a_ = simde__m128i_to_private(a),
4092 b_ = simde__m128i_to_private(b);
4093
4094 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
4095 r_.i64 = a_.i64 % b_.i64;
4096 #else
4097 SIMDE_VECTORIZE
4098 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
4099 r_.i64[i] = a_.i64[i] % b_.i64[i];
4100 }
4101 #endif
4102
4103 return simde__m128i_from_private(r_);
4104 }
4105
4106 SIMDE_FUNCTION_ATTRIBUTES
4107 simde__m128d
simde_mm_mul_pd(simde__m128d a,simde__m128d b)4108 simde_mm_mul_pd (simde__m128d a, simde__m128d b) {
4109 #if defined(SIMDE_X86_SSE2_NATIVE)
4110 return _mm_mul_pd(a, b);
4111 #else
4112 simde__m128d_private
4113 r_,
4114 a_ = simde__m128d_to_private(a),
4115 b_ = simde__m128d_to_private(b);
4116
4117 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
4118 r_.f64 = a_.f64 * b_.f64;
4119 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4120 r_.neon_f64 = vmulq_f64(a_.neon_f64, b_.neon_f64);
4121 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4122 r_.wasm_v128 = wasm_f64x2_mul(a_.wasm_v128, b_.wasm_v128);
4123 #else
4124 SIMDE_VECTORIZE
4125 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
4126 r_.f64[i] = a_.f64[i] * b_.f64[i];
4127 }
4128 #endif
4129
4130 return simde__m128d_from_private(r_);
4131 #endif
4132 }
4133 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4134 #define _mm_mul_pd(a, b) simde_mm_mul_pd(a, b)
4135 #endif
4136
4137 SIMDE_FUNCTION_ATTRIBUTES
4138 simde__m128d
simde_mm_mul_sd(simde__m128d a,simde__m128d b)4139 simde_mm_mul_sd (simde__m128d a, simde__m128d b) {
4140 #if defined(SIMDE_X86_SSE2_NATIVE)
4141 return _mm_mul_sd(a, b);
4142 #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
4143 return simde_mm_move_sd(a, simde_mm_mul_pd(a, b));
4144 #else
4145 simde__m128d_private
4146 r_,
4147 a_ = simde__m128d_to_private(a),
4148 b_ = simde__m128d_to_private(b);
4149
4150 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4151 float64x2_t temp = vmulq_f64(a_.neon_f64, b_.neon_f64);
4152 r_.neon_f64 = vsetq_lane_f64(vgetq_lane(a_.neon_f64, 1), temp, 1);
4153 #else
4154 r_.f64[0] = a_.f64[0] * b_.f64[0];
4155 r_.f64[1] = a_.f64[1];
4156 #endif
4157
4158 return simde__m128d_from_private(r_);
4159 #endif
4160 }
4161 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4162 #define _mm_mul_sd(a, b) simde_mm_mul_sd(a, b)
4163 #endif
4164
4165 SIMDE_FUNCTION_ATTRIBUTES
4166 simde__m64
simde_mm_mul_su32(simde__m64 a,simde__m64 b)4167 simde_mm_mul_su32 (simde__m64 a, simde__m64 b) {
4168 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
4169 return _mm_mul_su32(a, b);
4170 #else
4171 simde__m64_private
4172 r_,
4173 a_ = simde__m64_to_private(a),
4174 b_ = simde__m64_to_private(b);
4175
4176 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4177 r_.u64[0] = vget_lane_u64(vget_low_u64(vmull_u32(vreinterpret_u32_s64(a_.neon_i64), vreinterpret_u32_s64(b_.neon_i64))), 0);
4178 #else
4179 r_.u64[0] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[0]) * HEDLEY_STATIC_CAST(uint64_t, b_.u32[0]);
4180 #endif
4181
4182 return simde__m64_from_private(r_);
4183 #endif
4184 }
4185 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4186 #define _mm_mul_su32(a, b) simde_mm_mul_su32(a, b)
4187 #endif
4188
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_mulhi_epi16 (simde__m128i a, simde__m128i b) {
  /* _mm_mulhi_epi16: signed 16x16 -> 32-bit multiply per lane, keeping
   * only the high 16 bits of each product. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_mulhi_epi16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* Widening multiplies give full 32-bit products; the uzp step then
       * keeps the odd (high) 16-bit halves of each product. */
      int16x4_t a3210 = vget_low_s16(a_.neon_i16);
      int16x4_t b3210 = vget_low_s16(b_.neon_i16);
      int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
      #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
        int32x4_t ab7654 = vmull_high_s16(a_.neon_i16, b_.neon_i16);
        r_.neon_i16 = vuzp2q_s16(vreinterpretq_s16_s32(ab3210), vreinterpretq_s16_s32(ab7654));
      #else
        int16x4_t a7654 = vget_high_s16(a_.neon_i16);
        int16x4_t b7654 = vget_high_s16(b_.neon_i16);
        int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
        uint16x8x2_t rv = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
        r_.neon_u16 = rv.val[1];
      #endif
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        /* Shift in unsigned to avoid implementation-defined behavior on
         * negative products. */
        r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (HEDLEY_STATIC_CAST(uint32_t, HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) >> 16));
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_mulhi_epi16(a, b) simde_mm_mulhi_epi16(a, b)
#endif
4227
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_mulhi_epu16 (simde__m128i a, simde__m128i b) {
  /* _mm_mulhi_epu16: unsigned 16x16 -> 32-bit multiply per lane,
   * keeping only the high 16 bits of each product. */
  #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
    return _mm_mulhi_epu16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* Widening multiplies give full 32-bit products; the uzp step then
       * keeps the odd (high) 16-bit halves of each product. */
      uint16x4_t a3210 = vget_low_u16(a_.neon_u16);
      uint16x4_t b3210 = vget_low_u16(b_.neon_u16);
      uint32x4_t ab3210 = vmull_u16(a3210, b3210); /* 3333222211110000 */
      #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
        uint32x4_t ab7654 = vmull_high_u16(a_.neon_u16, b_.neon_u16);
        r_.neon_u16 = vuzp2q_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
      #else
        uint16x4_t a7654 = vget_high_u16(a_.neon_u16);
        uint16x4_t b7654 = vget_high_u16(b_.neon_u16);
        uint32x4_t ab7654 = vmull_u16(a7654, b7654); /* 7777666655554444 */
        uint16x8x2_t neon_r = vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
        r_.neon_u16 = neon_r.val[1];
      #endif
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
        r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]) >> 16);
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_mulhi_epu16(a, b) simde_mm_mulhi_epu16(a, b)
#endif
4266
4267 SIMDE_FUNCTION_ATTRIBUTES
4268 simde__m128i
simde_mm_mullo_epi16(simde__m128i a,simde__m128i b)4269 simde_mm_mullo_epi16 (simde__m128i a, simde__m128i b) {
4270 #if defined(SIMDE_X86_SSE2_NATIVE)
4271 return _mm_mullo_epi16(a, b);
4272 #else
4273 simde__m128i_private
4274 r_,
4275 a_ = simde__m128i_to_private(a),
4276 b_ = simde__m128i_to_private(b);
4277
4278 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4279 r_.neon_i16 = vmulq_s16(a_.neon_i16, b_.neon_i16);
4280 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
4281 (void) a_;
4282 (void) b_;
4283 r_.altivec_i16 = vec_mul(a_.altivec_i16, b_.altivec_i16);
4284 #else
4285 SIMDE_VECTORIZE
4286 for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
4287 r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]));
4288 }
4289 #endif
4290
4291 return simde__m128i_from_private(r_);
4292 #endif
4293 }
4294 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4295 #define _mm_mullo_epi16(a, b) simde_mm_mullo_epi16(a, b)
4296 #endif
4297
4298 SIMDE_FUNCTION_ATTRIBUTES
4299 simde__m128d
simde_mm_or_pd(simde__m128d a,simde__m128d b)4300 simde_mm_or_pd (simde__m128d a, simde__m128d b) {
4301 #if defined(SIMDE_X86_SSE2_NATIVE)
4302 return _mm_or_pd(a, b);
4303 #else
4304 simde__m128d_private
4305 r_,
4306 a_ = simde__m128d_to_private(a),
4307 b_ = simde__m128d_to_private(b);
4308
4309 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
4310 r_.i32f = a_.i32f | b_.i32f;
4311 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4312 r_.wasm_v128 = wasm_v128_or(a_.wasm_v128, b_.wasm_v128);
4313 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4314 r_.neon_i64 = vorrq_s64(a_.neon_i64, b_.neon_i64);
4315 #else
4316 SIMDE_VECTORIZE
4317 for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
4318 r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
4319 }
4320 #endif
4321
4322 return simde__m128d_from_private(r_);
4323 #endif
4324 }
4325 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4326 #define _mm_or_pd(a, b) simde_mm_or_pd(a, b)
4327 #endif
4328
4329 SIMDE_FUNCTION_ATTRIBUTES
4330 simde__m128i
simde_mm_or_si128(simde__m128i a,simde__m128i b)4331 simde_mm_or_si128 (simde__m128i a, simde__m128i b) {
4332 #if defined(SIMDE_X86_SSE2_NATIVE)
4333 return _mm_or_si128(a, b);
4334 #else
4335 simde__m128i_private
4336 r_,
4337 a_ = simde__m128i_to_private(a),
4338 b_ = simde__m128i_to_private(b);
4339
4340 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4341 r_.neon_i32 = vorrq_s32(a_.neon_i32, b_.neon_i32);
4342 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
4343 r_.altivec_i32 = vec_or(a_.altivec_i32, b_.altivec_i32);
4344 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
4345 r_.i32f = a_.i32f | b_.i32f;
4346 #else
4347 SIMDE_VECTORIZE
4348 for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
4349 r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
4350 }
4351 #endif
4352
4353 return simde__m128i_from_private(r_);
4354 #endif
4355 }
4356 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4357 #define _mm_or_si128(a, b) simde_mm_or_si128(a, b)
4358 #endif
4359
4360 SIMDE_FUNCTION_ATTRIBUTES
4361 simde__m128i
simde_mm_packs_epi16(simde__m128i a,simde__m128i b)4362 simde_mm_packs_epi16 (simde__m128i a, simde__m128i b) {
4363 #if defined(SIMDE_X86_SSE2_NATIVE)
4364 return _mm_packs_epi16(a, b);
4365 #else
4366 simde__m128i_private
4367 r_,
4368 a_ = simde__m128i_to_private(a),
4369 b_ = simde__m128i_to_private(b);
4370
4371 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4372 r_.neon_i8 = vcombine_s8(vqmovn_s16(a_.neon_i16), vqmovn_s16(b_.neon_i16));
4373 #else
4374 SIMDE_VECTORIZE
4375 for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
4376 r_.i8[i] = (a_.i16[i] > INT8_MAX) ? INT8_MAX : ((a_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, a_.i16[i]));
4377 r_.i8[i + 8] = (b_.i16[i] > INT8_MAX) ? INT8_MAX : ((b_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, b_.i16[i]));
4378 }
4379 #endif
4380
4381 return simde__m128i_from_private(r_);
4382 #endif
4383 }
4384 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4385 #define _mm_packs_epi16(a, b) simde_mm_packs_epi16(a, b)
4386 #endif
4387
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_packs_epi32 (simde__m128i a, simde__m128i b) {
  /* Narrow the four signed 32-bit lanes of `a` and of `b` to signed 16-bit
   * lanes with saturation; `a` fills result lanes 0-3, `b` fills lanes 4-7. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_packs_epi32(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* Saturating narrow of each source, then combine the halves. */
      r_.neon_i16 = vcombine_s16(vqmovn_s32(a_.neon_i32), vqmovn_s32(b_.neon_i32));
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i16 = vec_packs(a_.altivec_i32, b_.altivec_i32);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        /* Clamp each 32-bit value into [INT16_MIN, INT16_MAX] before narrowing. */
        r_.i16[i] = (a_.i32[i] > INT16_MAX) ? INT16_MAX : ((a_.i32[i] < INT16_MIN) ? INT16_MIN : HEDLEY_STATIC_CAST(int16_t, a_.i32[i]));
        r_.i16[i + 4] = (b_.i32[i] > INT16_MAX) ? INT16_MAX : ((b_.i32[i] < INT16_MIN) ? INT16_MIN : HEDLEY_STATIC_CAST(int16_t, b_.i32[i]));
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_packs_epi32(a, b) simde_mm_packs_epi32(a, b)
#endif
4417
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_packus_epi16 (simde__m128i a, simde__m128i b) {
  /* Narrow the eight signed 16-bit lanes of `a` and of `b` to UNSIGNED 8-bit
   * lanes with saturation (negative -> 0, > 255 -> 255); `a` fills result
   * lanes 0-7, `b` fills lanes 8-15. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_packus_epi16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* vqmovun_s16: signed-to-unsigned saturating narrow. */
      r_.neon_u8 = vcombine_u8(vqmovun_s16(a_.neon_i16), vqmovun_s16(b_.neon_i16));
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_u8 = vec_packsu(a_.altivec_i16, b_.altivec_i16);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        /* Clamp into [0, UINT8_MAX] before narrowing. */
        r_.u8[i] = (a_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((a_.i16[i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, a_.i16[i]));
        r_.u8[i + 8] = (b_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((b_.i16[i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, b_.i16[i]));
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_packus_epi16(a, b) simde_mm_packus_epi16(a, b)
#endif
4447
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_pause (void) {
  /* Spin-wait hint. On non-x86 targets this is simply a no-op; there is no
   * portable equivalent emitted here. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_pause();
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_pause() (simde_mm_pause())
#endif
4458
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sad_epu8 (simde__m128i a, simde__m128i b) {
  /* Sum of absolute differences of unsigned 8-bit lanes: bytes 0-7 are
   * accumulated into the low 64-bit lane, bytes 8-15 into the high lane. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_sad_epu8(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* vabdq_u8 gives per-byte absolute differences; the chain of pairwise
       * add-long operations (vpaddl*) widens and reduces each half to 64 bits. */
      const uint16x8_t t = vpaddlq_u8(vabdq_u8(a_.neon_u8, b_.neon_u8));
      r_.neon_u64 = vcombine_u64(
        vpaddl_u32(vpaddl_u16(vget_low_u16(t))),
        vpaddl_u32(vpaddl_u16(vget_high_u16(t))));
    #else
      for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
        /* uint16_t cannot overflow here: 8 * 255 = 2040 < UINT16_MAX. */
        uint16_t tmp = 0;
        SIMDE_VECTORIZE_REDUCTION(+:tmp)
        for (size_t j = 0 ; j < ((sizeof(r_.u8) / sizeof(r_.u8[0])) / 2) ; j++) {
          /* e walks bytes 0-7 for the low lane (i == 0), 8-15 for the high. */
          const size_t e = j + (i * 8);
          tmp += (a_.u8[e] > b_.u8[e]) ? (a_.u8[e] - b_.u8[e]) : (b_.u8[e] - a_.u8[e]);
        }
        r_.i64[i] = tmp;
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_sad_epu8(a, b) simde_mm_sad_epu8(a, b)
#endif
4493
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set_epi8 (int8_t e15, int8_t e14, int8_t e13, int8_t e12,
       int8_t e11, int8_t e10, int8_t  e9, int8_t  e8,
       int8_t  e7, int8_t  e6, int8_t  e5, int8_t  e4,
       int8_t  e3, int8_t  e2, int8_t  e1, int8_t  e0) {
  /* Build a vector of sixteen 8-bit integers. Note the Intel convention:
   * e0 is the LOWEST lane, so the argument order is reversed relative to
   * memory order. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_set_epi8(
      e15, e14, e13, e12, e11, e10,  e9,  e8,
       e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0);
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_WASM_SIMD128_NATIVE)
      /* wasm_i8x16_make takes lanes in ascending (memory) order. */
      r_.wasm_v128 = wasm_i8x16_make(
         e0,  e1,  e2,  e3,  e4,  e5,  e6,  e7,
         e8,  e9, e10, e11, e12, e13, e14, e15);
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* Stage the lanes in an aligned array and load it in one go. */
      SIMDE_ALIGN_LIKE_16(int8x16_t) int8_t data[16] = {
        e0,  e1,  e2,  e3,
        e4,  e5,  e6,  e7,
        e8,  e9,  e10, e11,
        e12, e13, e14, e15};
      r_.neon_i8 = vld1q_s8(data);
    #else
      r_.i8[ 0] =  e0;
      r_.i8[ 1] =  e1;
      r_.i8[ 2] =  e2;
      r_.i8[ 3] =  e3;
      r_.i8[ 4] =  e4;
      r_.i8[ 5] =  e5;
      r_.i8[ 6] =  e6;
      r_.i8[ 7] =  e7;
      r_.i8[ 8] =  e8;
      r_.i8[ 9] =  e9;
      r_.i8[10] = e10;
      r_.i8[11] = e11;
      r_.i8[12] = e12;
      r_.i8[13] = e13;
      r_.i8[14] = e14;
      r_.i8[15] = e15;
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)
#endif
4544
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set_epi16 (int16_t e7, int16_t e6, int16_t e5, int16_t e4,
       int16_t e3, int16_t e2, int16_t e1, int16_t e0) {
  /* Build a vector of eight 16-bit integers; e0 is the lowest lane
   * (Intel argument order is high-to-low). */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* Stage in an aligned array (ascending lane order) and load once. */
      SIMDE_ALIGN_LIKE_16(int16x8_t) int16_t data[8] = { e0, e1, e2, e3, e4, e5, e6, e7 };
      r_.neon_i16 = vld1q_s16(data);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i16x8_make(e0, e1, e2, e3, e4, e5, e6, e7);
    #else
      r_.i16[0] = e0;
      r_.i16[1] = e1;
      r_.i16[2] = e2;
      r_.i16[3] = e3;
      r_.i16[4] = e4;
      r_.i16[5] = e5;
      r_.i16[6] = e6;
      r_.i16[7] = e7;
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_set_epi16(e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0) simde_mm_set_epi16(e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0)
#endif
4576
4577 SIMDE_FUNCTION_ATTRIBUTES
4578 simde__m128i
simde_mm_loadu_si16(void const * mem_addr)4579 simde_mm_loadu_si16 (void const* mem_addr) {
4580 #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
4581 SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
4582 HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
4583 HEDLEY_INTEL_VERSION_CHECK(20,21,1))
4584 return _mm_loadu_si16(mem_addr);
4585 #else
4586 int16_t val;
4587 simde_memcpy(&val, mem_addr, sizeof(val));
4588 return simde_x_mm_cvtsi16_si128(val);
4589 #endif
4590 }
4591 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4592 #define _mm_loadu_si16(mem_addr) simde_mm_loadu_si16(mem_addr)
4593 #endif
4594
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set_epi32 (int32_t e3, int32_t e2, int32_t e1, int32_t e0) {
  /* Build a vector of four 32-bit integers; e0 is the lowest lane. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_set_epi32(e3, e2, e1, e0);
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* Stage in an aligned array (ascending lane order) and load once. */
      SIMDE_ALIGN_LIKE_16(int32x4_t) int32_t data[4] = { e0, e1, e2, e3 };
      r_.neon_i32 = vld1q_s32(data);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i32x4_make(e0, e1, e2, e3);
    #else
      r_.i32[0] = e0;
      r_.i32[1] = e1;
      r_.i32[2] = e2;
      r_.i32[3] = e3;
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_set_epi32(e3,  e2,  e1,  e0) simde_mm_set_epi32(e3,  e2,  e1,  e0)
#endif
4621
4622 SIMDE_FUNCTION_ATTRIBUTES
4623 simde__m128i
simde_mm_loadu_si32(void const * mem_addr)4624 simde_mm_loadu_si32 (void const* mem_addr) {
4625 #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
4626 SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
4627 HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
4628 HEDLEY_INTEL_VERSION_CHECK(20,21,1))
4629 return _mm_loadu_si32(mem_addr);
4630 #else
4631 int32_t val;
4632 simde_memcpy(&val, mem_addr, sizeof(val));
4633 return simde_mm_cvtsi32_si128(val);
4634 #endif
4635 }
4636 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4637 #define _mm_loadu_si32(mem_addr) simde_mm_loadu_si32(mem_addr)
4638 #endif
4639
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set_epi64 (simde__m64 e1, simde__m64 e0) {
  /* Build a 128-bit vector from two 64-bit (__m64) halves; e0 is the low
   * half.  Requires native MMX as well as SSE2 for the native path. */
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    return _mm_set_epi64(e1, e0);
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* vcombine_s64 takes (low, high). */
      r_.neon_i64 = vcombine_s64(simde__m64_to_neon_i64(e0), simde__m64_to_neon_i64(e1));
    #else
      r_.m64[0] = e0;
      r_.m64[1] = e1;
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_set_epi64(e1, e0) (simde_mm_set_epi64((e1), (e0)))
#endif
4661
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set_epi64x (int64_t e1, int64_t e0) {
  /* Build a vector of two 64-bit integers; e0 is the low lane.  The native
   * intrinsic is avoided on MSVC older than 19.0, which lacks it. */
  #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0))
    return _mm_set_epi64x(e1, e0);
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* Stage in an aligned array (low lane first) and load once. */
      SIMDE_ALIGN_LIKE_16(int64x2_t) int64_t data[2] = {e0, e1};
      r_.neon_i64 = vld1q_s64(data);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i64x2_make(e0, e1);
    #else
      r_.i64[0] = e0;
      r_.i64[1] = e1;
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_set_epi64x(e1, e0) simde_mm_set_epi64x(e1, e0)
#endif
4686
4687 SIMDE_FUNCTION_ATTRIBUTES
4688 simde__m128i
simde_mm_loadu_si64(void const * mem_addr)4689 simde_mm_loadu_si64 (void const* mem_addr) {
4690 #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
4691 SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
4692 HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
4693 HEDLEY_INTEL_VERSION_CHECK(20,21,1))
4694 return _mm_loadu_si64(mem_addr);
4695 #else
4696 int64_t val;
4697 simde_memcpy(&val, mem_addr, sizeof(val));
4698 return simde_mm_cvtsi64_si128(val);
4699 #endif
4700 }
4701 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4702 #define _mm_loadu_si64(mem_addr) simde_mm_loadu_si64(mem_addr)
4703 #endif
4704
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_set_epu8 (uint8_t e15, uint8_t e14, uint8_t e13, uint8_t e12,
         uint8_t e11, uint8_t e10, uint8_t  e9, uint8_t  e8,
         uint8_t  e7, uint8_t  e6, uint8_t  e5, uint8_t  e4,
         uint8_t  e3, uint8_t  e2, uint8_t  e1, uint8_t  e0) {
  /* SIMDe extension: unsigned variant of simde_mm_set_epi8.  e0 is the
   * lowest lane.  The native path reinterprets the bits via char casts. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_set_epi8(
      HEDLEY_STATIC_CAST(char, e15), HEDLEY_STATIC_CAST(char, e14), HEDLEY_STATIC_CAST(char, e13), HEDLEY_STATIC_CAST(char, e12),
      HEDLEY_STATIC_CAST(char, e11), HEDLEY_STATIC_CAST(char, e10), HEDLEY_STATIC_CAST(char,  e9), HEDLEY_STATIC_CAST(char,  e8),
      HEDLEY_STATIC_CAST(char,  e7), HEDLEY_STATIC_CAST(char,  e6), HEDLEY_STATIC_CAST(char,  e5), HEDLEY_STATIC_CAST(char,  e4),
      HEDLEY_STATIC_CAST(char,  e3), HEDLEY_STATIC_CAST(char,  e2), HEDLEY_STATIC_CAST(char,  e1), HEDLEY_STATIC_CAST(char,  e0));
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* Stage in an aligned array (ascending lane order) and load once. */
      SIMDE_ALIGN_LIKE_16(uint8x16_t) uint8_t data[16] = {
        e0,  e1,  e2,  e3,
        e4,  e5,  e6,  e7,
        e8,  e9,  e10, e11,
        e12, e13, e14, e15};
      r_.neon_u8 = vld1q_u8(data);
    #else
      r_.u8[ 0] =  e0; r_.u8[ 1] =  e1; r_.u8[ 2] =  e2; r_.u8[ 3] =  e3;
      r_.u8[ 4] =  e4; r_.u8[ 5] =  e5; r_.u8[ 6] =  e6; r_.u8[ 7] =  e7;
      r_.u8[ 8] =  e8; r_.u8[ 9] =  e9; r_.u8[10] = e10; r_.u8[11] = e11;
      r_.u8[12] = e12; r_.u8[13] = e13; r_.u8[14] = e14; r_.u8[15] = e15;
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
4737
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_set_epu16 (uint16_t e7, uint16_t e6, uint16_t e5, uint16_t e4,
          uint16_t e3, uint16_t e2, uint16_t e1, uint16_t e0) {
  /* SIMDe extension: unsigned variant of simde_mm_set_epi16.  e0 is the
   * lowest lane; bits are reinterpreted through short casts natively. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_set_epi16(
      HEDLEY_STATIC_CAST(short,  e7), HEDLEY_STATIC_CAST(short,  e6), HEDLEY_STATIC_CAST(short,  e5), HEDLEY_STATIC_CAST(short,  e4),
      HEDLEY_STATIC_CAST(short,  e3), HEDLEY_STATIC_CAST(short,  e2), HEDLEY_STATIC_CAST(short,  e1), HEDLEY_STATIC_CAST(short,  e0));
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* Stage in an aligned array (ascending lane order) and load once. */
      SIMDE_ALIGN_LIKE_16(uint16x8_t) uint16_t data[8] = { e0, e1, e2, e3, e4, e5, e6, e7 };
      r_.neon_u16 = vld1q_u16(data);
    #else
      r_.u16[0] = e0; r_.u16[1] = e1; r_.u16[2] = e2; r_.u16[3] = e3;
      r_.u16[4] = e4; r_.u16[5] = e5; r_.u16[6] = e6; r_.u16[7] = e7;
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
4760
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_set_epu32 (uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) {
  /* SIMDe extension: unsigned variant of simde_mm_set_epi32.  e0 is the
   * lowest lane; bits are reinterpreted through int casts natively. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_set_epi32(
      HEDLEY_STATIC_CAST(int,  e3), HEDLEY_STATIC_CAST(int,  e2), HEDLEY_STATIC_CAST(int,  e1), HEDLEY_STATIC_CAST(int,  e0));
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* Stage in an aligned array (ascending lane order) and load once. */
      SIMDE_ALIGN_LIKE_16(uint32x4_t) uint32_t data[4] = { e0, e1, e2, e3 };
      r_.neon_u32 = vld1q_u32(data);
    #else
      r_.u32[0] = e0;
      r_.u32[1] = e1;
      r_.u32[2] = e2;
      r_.u32[3] = e3;
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
4783
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_set_epu64x (uint64_t e1, uint64_t e0) {
  /* SIMDe extension: unsigned variant of simde_mm_set_epi64x.  e0 is the
   * low lane.  The native path is skipped on MSVC < 19.0 (no _mm_set_epi64x). */
  #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0))
    return _mm_set_epi64x(HEDLEY_STATIC_CAST(int64_t,  e1), HEDLEY_STATIC_CAST(int64_t,  e0));
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* Stage in an aligned array (low lane first) and load once. */
      SIMDE_ALIGN_LIKE_16(uint64x2_t) uint64_t data[2] = {e0, e1};
      r_.neon_u64 = vld1q_u64(data);
    #else
      r_.u64[0] = e0;
      r_.u64[1] = e1;
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
4803
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_set_sd (simde_float64 a) {
  /* Place `a` in the low double lane and zero the high lane. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_set_sd(a);
  #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    /* Start from an all-zero vector and overwrite lane 0. */
    return vsetq_lane_f64(a, vdupq_n_f64(SIMDE_FLOAT64_C(0.0)), 0);
  #else
    return simde_mm_set_pd(SIMDE_FLOAT64_C(0.0), a);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_set_sd(a) simde_mm_set_sd(a)
#endif
4818
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set1_epi8 (int8_t a) {
  /* Broadcast an 8-bit value to all sixteen lanes. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_set1_epi8(a);
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i8 = vdupq_n_s8(a);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i8x16_splat(a);
    #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
      r_.altivec_i8 = vec_splats(HEDLEY_STATIC_CAST(signed char, a));
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
        r_.i8[i] = a;
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_set1_epi8(a) simde_mm_set1_epi8(a)
#endif
4846
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set1_epi16 (int16_t a) {
  /* Broadcast a 16-bit value to all eight lanes. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_set1_epi16(a);
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i16 = vdupq_n_s16(a);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i16x8_splat(a);
    #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
      r_.altivec_i16 = vec_splats(HEDLEY_STATIC_CAST(signed short, a));
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        r_.i16[i] = a;
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_set1_epi16(a) simde_mm_set1_epi16(a)
#endif
4874
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set1_epi32 (int32_t a) {
  /* Broadcast a 32-bit value to all four lanes. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_set1_epi32(a);
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i32 = vdupq_n_s32(a);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i32x4_splat(a);
    #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
      r_.altivec_i32 = vec_splats(HEDLEY_STATIC_CAST(signed int, a));
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        r_.i32[i] = a;
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_set1_epi32(a) simde_mm_set1_epi32(a)
#endif
4902
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set1_epi64x (int64_t a) {
  /* Broadcast a 64-bit value to both lanes.  Native path skipped on
   * MSVC < 19.0, which lacks _mm_set1_epi64x. */
  #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0))
    return _mm_set1_epi64x(a);
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i64 = vdupq_n_s64(a);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i64x2_splat(a);
    #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
      r_.altivec_i64 = vec_splats(HEDLEY_STATIC_CAST(signed long long, a));
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
        r_.i64[i] = a;
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_set1_epi64x(a) simde_mm_set1_epi64x(a)
#endif
4930
4931 SIMDE_FUNCTION_ATTRIBUTES
4932 simde__m128i
simde_mm_set1_epi64(simde__m64 a)4933 simde_mm_set1_epi64 (simde__m64 a) {
4934 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
4935 return _mm_set1_epi64(a);
4936 #else
4937 simde__m64_private a_ = simde__m64_to_private(a);
4938 return simde_mm_set1_epi64x(a_.i64[0]);
4939 #endif
4940 }
4941 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4942 #define _mm_set1_epi64(a) simde_mm_set1_epi64(a)
4943 #endif
4944
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_set1_epu8 (uint8_t value) {
  /* SIMDe extension: broadcast an unsigned 8-bit value to all lanes. */
  #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    return simde__m128i_from_altivec_u8(vec_splats(HEDLEY_STATIC_CAST(unsigned char, value)));
  #else
    /* Same bit pattern via the signed broadcast. */
    return simde_mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, value));
  #endif
}
4954
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_set1_epu16 (uint16_t value) {
  /* SIMDe extension: broadcast an unsigned 16-bit value to all lanes. */
  #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    return simde__m128i_from_altivec_u16(vec_splats(HEDLEY_STATIC_CAST(unsigned short, value)));
  #else
    /* Same bit pattern via the signed broadcast. */
    return simde_mm_set1_epi16(HEDLEY_STATIC_CAST(int16_t, value));
  #endif
}
4964
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_set1_epu32 (uint32_t value) {
  /* SIMDe extension: broadcast an unsigned 32-bit value to all lanes. */
  #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    return simde__m128i_from_altivec_u32(vec_splats(HEDLEY_STATIC_CAST(unsigned int, value)));
  #else
    /* Same bit pattern via the signed broadcast. */
    return simde_mm_set1_epi32(HEDLEY_STATIC_CAST(int32_t, value));
  #endif
}
4974
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_set1_epu64 (uint64_t value) {
  /* SIMDe extension: broadcast an unsigned 64-bit value to both lanes. */
  #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    return simde__m128i_from_altivec_u64(vec_splats(HEDLEY_STATIC_CAST(unsigned long long, value)));
  #else
    /* Same bit pattern via the signed broadcast. */
    return simde_mm_set1_epi64x(HEDLEY_STATIC_CAST(int64_t, value));
  #endif
}
4984
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_setr_epi8 (int8_t e15, int8_t e14, int8_t e13, int8_t e12,
        int8_t e11, int8_t e10, int8_t  e9, int8_t  e8,
        int8_t  e7, int8_t  e6, int8_t  e5, int8_t  e4,
        int8_t  e3, int8_t  e2, int8_t  e1, int8_t  e0) {
  /* "Reversed" set: the FIRST argument (e15 here) lands in the LOWEST lane.
   * Implemented by forwarding arguments to set_epi8 in the opposite order. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_setr_epi8(
      e15, e14, e13, e12, e11, e10,  e9,    e8,
      e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0);
  #else
    return simde_mm_set_epi8(
      e0, e1, e2, e3, e4, e5, e6, e7,
      e8, e9, e10, e11, e12, e13, e14, e15);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)
#endif
5004
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_setr_epi16 (int16_t e7, int16_t e6, int16_t e5, int16_t e4,
        int16_t e3, int16_t e2, int16_t e1, int16_t e0) {
  /* "Reversed" set: the FIRST argument (e7) lands in the LOWEST lane. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
  #else
    return simde_mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0)
#endif
5018
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_setr_epi32 (int32_t e3, int32_t e2, int32_t e1, int32_t e0) {
  /* "Reversed" set: the FIRST argument (e3) lands in the LOWEST lane. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_setr_epi32(e3, e2, e1, e0);
  #else
    return simde_mm_set_epi32(e0, e1, e2, e3);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_setr_epi32(e3, e2, e1, e0) simde_mm_setr_epi32(e3, e2, e1, e0)
#endif
5031
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_setr_epi64 (simde__m64 e1, simde__m64 e0) {
  /* "Reversed" set: the FIRST argument (e1) becomes the LOW 64-bit half. */
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    return _mm_setr_epi64(e1, e0);
  #else
    return simde_mm_set_epi64(e0, e1);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_setr_epi64(e1, e0) (simde_mm_setr_epi64((e1), (e0)))
#endif
5044
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_setr_pd (simde_float64 e1, simde_float64 e0) {
  /* "Reversed" set: the FIRST argument (e1) lands in the LOWEST lane. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_setr_pd(e1, e0);
  #else
    return simde_mm_set_pd(e0, e1);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_setr_pd(e1, e0) simde_mm_setr_pd(e1, e0)
#endif
5057
5058 SIMDE_FUNCTION_ATTRIBUTES
5059 simde__m128d
simde_mm_setzero_pd(void)5060 simde_mm_setzero_pd (void) {
5061 #if defined(SIMDE_X86_SSE2_NATIVE)
5062 return _mm_setzero_pd();
5063 #else
5064 return simde_mm_castsi128_pd(simde_mm_setzero_si128());
5065 #endif
5066 }
5067 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5068 #define _mm_setzero_pd() simde_mm_setzero_pd()
5069 #endif
5070
5071 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
5072 HEDLEY_DIAGNOSTIC_PUSH
5073 SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
5074 #endif
5075
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_undefined_pd (void) {
  /* Return a vector with unspecified contents.  Callers must not rely on
   * any particular value. */
  simde__m128d_private r_;

  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
    r_.n = _mm_undefined_pd();
  #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
    /* If we can't silence uninitialized-use diagnostics, fall back to zero
     * so the (otherwise uninitialized) read below is well defined. */
    r_ = simde__m128d_to_private(simde_mm_setzero_pd());
  #endif

  return simde__m128d_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_undefined_pd() simde_mm_undefined_pd()
#endif
5092
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_undefined_si128 (void) {
  /* Return an integer vector with unspecified contents.  Callers must not
   * rely on any particular value. */
  simde__m128i_private r_;

  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
    r_.n = _mm_undefined_si128();
  #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
    /* Without suppressible diagnostics, zero-fill to keep reads defined. */
    r_ = simde__m128i_to_private(simde_mm_setzero_si128());
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_undefined_si128() (simde_mm_undefined_si128())
#endif
5109
5110 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
5111 HEDLEY_DIAGNOSTIC_POP
5112 #endif
5113
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_x_mm_setone_pd (void) {
  /* SIMDe extension: all 128 bits set, reinterpreted as doubles. */
  return simde_mm_castps_pd(simde_x_mm_setone_ps());
}
5119
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_setone_si128 (void) {
  /* SIMDe extension: all 128 bits set, reinterpreted as integers. */
  return simde_mm_castps_si128(simde_x_mm_setone_ps());
}
5125
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_shuffle_epi32 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
  /* Permute the four 32-bit lanes of `a`: result lane i is selected by
   * bits [2i+1:2i] of imm8.  This portable function is overridden by the
   * macros below when a faster platform-specific form is available. */
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
    r_.i32[i] = a_.i32[(imm8 >> (i * 2)) & 3];
  }

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
  #define simde_mm_shuffle_epi32(a, imm8) _mm_shuffle_epi32((a), (imm8))
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* NEON: build the result one lane at a time with get/set-lane. */
  #define simde_mm_shuffle_epi32(a, imm8)                                   \
    __extension__({                                                         \
        int32x4_t ret;                                                      \
        ret = vmovq_n_s32(                                                  \
            vgetq_lane_s32(vreinterpretq_s32_s64(a), (imm8) & (0x3)));     \
        ret = vsetq_lane_s32(                                               \
            vgetq_lane_s32(vreinterpretq_s32_s64(a), ((imm8) >> 2) & 0x3), \
            ret, 1);                                                        \
        ret = vsetq_lane_s32(                                               \
            vgetq_lane_s32(vreinterpretq_s32_s64(a), ((imm8) >> 4) & 0x3), \
            ret, 2);                                                        \
        ret = vsetq_lane_s32(                                               \
            vgetq_lane_s32(vreinterpretq_s32_s64(a), ((imm8) >> 6) & 0x3), \
            ret, 3);                                                        \
        vreinterpretq_s64_s32(ret);                                         \
    })
#elif defined(SIMDE_SHUFFLE_VECTOR_)
  /* Compiler shuffle builtin: one vector shuffle with compile-time indices. */
  #define simde_mm_shuffle_epi32(a, imm8) (__extension__ ({ \
      const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
      simde__m128i_from_private((simde__m128i_private) { .i32 = \
        SIMDE_SHUFFLE_VECTOR_(32, 16, \
          (simde__tmp_a_).i32, \
          (simde__tmp_a_).i32, \
          ((imm8)     ) & 3, \
          ((imm8) >> 2) & 3, \
          ((imm8) >> 4) & 3, \
          ((imm8) >> 6) & 3) }); }))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_shuffle_epi32(a, imm8) simde_mm_shuffle_epi32(a, imm8)
#endif
5174
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_shuffle_pd (simde__m128d a, simde__m128d b, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3)  {
  /* Result lane 0 comes from `a` (bit 0 of imm8 selects which lane),
   * result lane 1 comes from `b` (bit 1 selects).  Overridden by the
   * macros below when a faster form is available. */
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  r_.f64[0] = ((imm8 & 1) == 0) ? a_.f64[0] : a_.f64[1];
  r_.f64[1] = ((imm8 & 2) == 0) ? b_.f64[0] : b_.f64[1];

  return simde__m128d_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
  #define simde_mm_shuffle_pd(a, b, imm8) _mm_shuffle_pd((a), (b), (imm8))
#elif defined(SIMDE_SHUFFLE_VECTOR_)
  /* Indices 0-1 address lanes of `a`, 2-3 address lanes of `b`. */
  #define simde_mm_shuffle_pd(a, b, imm8) (__extension__ ({ \
      simde__m128d_from_private((simde__m128d_private) { .f64 = \
        SIMDE_SHUFFLE_VECTOR_(64, 16, \
          simde__m128d_to_private(a).f64, \
          simde__m128d_to_private(b).f64, \
          (((imm8)     ) & 1), \
          (((imm8) >> 1) & 1) + 2) }); }))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_shuffle_pd(a, b, imm8) simde_mm_shuffle_pd(a, b, imm8)
#endif
5203
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_shufflehi_epi16 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
  /* Copy lanes 0-3 of `a` unchanged; permute lanes 4-7 among themselves,
   * with result lane 4+i selected by bits [2i+1:2i] of imm8.  Overridden
   * by the macros below when a faster form is available. */
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i++) {
    r_.i16[i] = a_.i16[i];
  }
  for (size_t i = ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
    /* (i - 4) maps the output position into the imm8 selector field;
     * + 4 keeps the source inside the high half. */
    r_.i16[i] = a_.i16[((imm8 >> ((i - 4) * 2)) & 3) + 4];
  }

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
  #define simde_mm_shufflehi_epi16(a, imm8) _mm_shufflehi_epi16((a), (imm8))
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* NEON: pick from the extracted high half, writing lanes 4-7. */
  #define simde_mm_shufflehi_epi16(a, imm8)                                  \
    __extension__({                                                          \
        int16x8_t ret = vreinterpretq_s16_s64(a);                           \
        int16x4_t highBits = vget_high_s16(ret);                            \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm8) & (0x3)), ret, 4); \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm8) >> 2) & 0x3), ret, \
                             5);                                            \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm8) >> 4) & 0x3), ret, \
                             6);                                            \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm8) >> 6) & 0x3), ret, \
                             7);                                            \
        vreinterpretq_s64_s16(ret);                                         \
    })
#elif defined(SIMDE_SHUFFLE_VECTOR_)
  /* Low lanes pass through as 0-3; high selectors are offset by +4. */
  #define simde_mm_shufflehi_epi16(a, imm8) (__extension__ ({ \
      const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
      simde__m128i_from_private((simde__m128i_private) { .i16 = \
        SIMDE_SHUFFLE_VECTOR_(16, 16, \
          (simde__tmp_a_).i16, \
          (simde__tmp_a_).i16, \
          0, 1, 2, 3, \
          (((imm8)     ) & 3) + 4, \
          (((imm8) >> 2) & 3) + 4, \
          (((imm8) >> 4) & 3) + 4, \
          (((imm8) >> 6) & 3) + 4) }); }))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_shufflehi_epi16(a, imm8) simde_mm_shufflehi_epi16(a, imm8)
#endif
5254
5255 SIMDE_FUNCTION_ATTRIBUTES
5256 simde__m128i
simde_mm_shufflelo_epi16(simde__m128i a,const int imm8)5257 simde_mm_shufflelo_epi16 (simde__m128i a, const int imm8)
5258 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
5259 simde__m128i_private
5260 r_,
5261 a_ = simde__m128i_to_private(a);
5262
5263 for (size_t i = 0 ; i < ((sizeof(r_.i16) / sizeof(r_.i16[0])) / 2) ; i++) {
5264 r_.i16[i] = a_.i16[((imm8 >> (i * 2)) & 3)];
5265 }
5266 SIMDE_VECTORIZE
5267 for (size_t i = ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
5268 r_.i16[i] = a_.i16[i];
5269 }
5270
5271 return simde__m128i_from_private(r_);
5272 }
/* Accelerated overrides for simde_mm_shufflelo_epi16 (portable function
 * definition above). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  #define simde_mm_shufflelo_epi16(a, imm8) _mm_shufflelo_epi16((a), (imm8))
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* Rebuild the low four lanes from the 2-bit selectors in imm8; the high
   * four lanes pass through untouched. */
  #define simde_mm_shufflelo_epi16(a, imm8) \
    __extension__({ \
      int16x8_t ret = vreinterpretq_s16_s64(a); \
      int16x4_t lowBits = vget_low_s16(ret); \
      ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm8) & (0x3)), ret, 0); \
      ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm8) >> 2) & 0x3), ret, \
                           1); \
      ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm8) >> 4) & 0x3), ret, \
                           2); \
      ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm8) >> 6) & 0x3), ret, \
                           3); \
      vreinterpretq_s64_s16(ret); \
    })
#elif defined(SIMDE_SHUFFLE_VECTOR_)
  /* Selectors pick from lanes 0-3; indices 4-7 keep the high half. */
  #define simde_mm_shufflelo_epi16(a, imm8) (__extension__ ({ \
      const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
      simde__m128i_from_private((simde__m128i_private) { .i16 = \
        SIMDE_SHUFFLE_VECTOR_(16, 16, \
          (simde__tmp_a_).i16, \
          (simde__tmp_a_).i16, \
          (((imm8)     ) & 3), \
          (((imm8) >> 2) & 3), \
          (((imm8) >> 4) & 3), \
          (((imm8) >> 6) & 3), \
          4, 5, 6, 7) }); }))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_shufflelo_epi16(a, imm8) simde_mm_shufflelo_epi16(a, imm8)
#endif
5305
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sll_epi16 (simde__m128i a, simde__m128i count) {
  /* _mm_sll_epi16: shift each 16-bit lane of 'a' left by the count held in
   * the low 64 bits of 'count'; counts above 15 zero the whole vector. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_sll_epi16(a, count);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      count_ = simde__m128i_to_private(count);

    /* Out-of-range counts produce zero, and returning early keeps the
     * shift amount below the lane width for the branches below. */
    if (count_.u64[0] > 15)
      return simde_mm_setzero_si128();

    #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
      r_.u16 = (a_.u16 << count_.u64[0]);
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* NEON shifts by a (signed) per-lane register value. */
      r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, count_.u64[0])));
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 16) ? wasm_i16x8_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_extract_lane(count_.wasm_v128, 0))) : wasm_i16x8_const(0,0,0,0,0,0,0,0));
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
        r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (a_.u16[i] << count_.u64[0]));
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_sll_epi16(a, count) simde_mm_sll_epi16((a), (count))
#endif
5339
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sll_epi32 (simde__m128i a, simde__m128i count) {
  /* _mm_sll_epi32: shift each 32-bit lane of 'a' left by the count held in
   * the low 64 bits of 'count'; counts above 31 zero the whole vector. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_sll_epi32(a, count);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      count_ = simde__m128i_to_private(count);

    /* Early return keeps the shift amount below the lane width below. */
    if (count_.u64[0] > 31)
      return simde_mm_setzero_si128();

    #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
      r_.u32 = (a_.u32 << count_.u64[0]);
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, count_.u64[0])));
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 32) ? wasm_i32x4_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_extract_lane(count_.wasm_v128, 0))) : wasm_i32x4_const(0,0,0,0));
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
        r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (a_.u32[i] << count_.u64[0]));
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_sll_epi32(a, count) (simde_mm_sll_epi32(a, (count)))
#endif
5373
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sll_epi64 (simde__m128i a, simde__m128i count) {
  /* _mm_sll_epi64: shift each 64-bit lane of 'a' left by the count held in
   * the low 64 bits of 'count'; counts above 63 zero the whole vector. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_sll_epi64(a, count);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      count_ = simde__m128i_to_private(count);

    /* Early return keeps the shift amount below the lane width below. */
    if (count_.u64[0] > 63)
      return simde_mm_setzero_si128();

    /* Narrow to a small scalar once; count is known to be 0..63 here. */
    const int_fast16_t s = HEDLEY_STATIC_CAST(int_fast16_t, count_.u64[0]);
    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, s)));
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = (s < 64) ? wasm_i64x2_shl(a_.wasm_v128, s) : wasm_i64x2_const(0,0);
    #else
      /* GCC bug 94488 miscompiles vectorized 64-bit shifts on some
       * targets, so the pragma is suppressed there. */
      #if !defined(SIMDE_BUG_GCC_94488)
        SIMDE_VECTORIZE
      #endif
      for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
        r_.u64[i] = a_.u64[i] << s;
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_sll_epi64(a, count) (simde_mm_sll_epi64(a, (count)))
#endif
5408
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_sqrt_pd (simde__m128d a) {
  /* _mm_sqrt_pd: element-wise square root of both double lanes. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_sqrt_pd(a);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vsqrtq_f64(a_.neon_f64);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_f64x2_sqrt(a_.wasm_v128);
    #elif defined(simde_math_sqrt)
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
        r_.f64[i] = simde_math_sqrt(a_.f64[i]);
      }
    #else
      /* No sqrt available on this platform: configuration error. */
      HEDLEY_UNREACHABLE();
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_sqrt_pd(a) simde_mm_sqrt_pd(a)
#endif
5438
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_sqrt_sd (simde__m128d a, simde__m128d b) {
  /* _mm_sqrt_sd: lane 0 is sqrt(b[0]); lane 1 is copied from a[1]. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_sqrt_sd(a, b);
  #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
    /* Compose from full-width sqrt plus a move of the low lane. */
    return simde_mm_move_sd(a, simde_mm_sqrt_pd(b));
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(simde_math_sqrt)
      r_.f64[0] = simde_math_sqrt(b_.f64[0]);
      r_.f64[1] = a_.f64[1];
    #else
      HEDLEY_UNREACHABLE();
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_sqrt_sd(a, b) simde_mm_sqrt_sd(a, b)
#endif
5465
5466 SIMDE_FUNCTION_ATTRIBUTES
5467 simde__m128i
simde_mm_srl_epi16(simde__m128i a,simde__m128i count)5468 simde_mm_srl_epi16 (simde__m128i a, simde__m128i count) {
5469 #if defined(SIMDE_X86_SSE2_NATIVE)
5470 return _mm_srl_epi16(a, count);
5471 #else
5472 simde__m128i_private
5473 r_,
5474 a_ = simde__m128i_to_private(a),
5475 count_ = simde__m128i_to_private(count);
5476
5477 const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 16 ? 16 : count_.i64[0]));
5478
5479 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5480 r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
5481 #else
5482 SIMDE_VECTORIZE
5483 for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
5484 r_.u16[i] = a_.u16[i] >> cnt;
5485 }
5486 #endif
5487
5488 return simde__m128i_from_private(r_);
5489 #endif
5490 }
5491 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5492 #define _mm_srl_epi16(a, count) (simde_mm_srl_epi16(a, (count)))
5493 #endif
5494
5495 SIMDE_FUNCTION_ATTRIBUTES
5496 simde__m128i
simde_mm_srl_epi32(simde__m128i a,simde__m128i count)5497 simde_mm_srl_epi32 (simde__m128i a, simde__m128i count) {
5498 #if defined(SIMDE_X86_SSE2_NATIVE)
5499 return _mm_srl_epi32(a, count);
5500 #else
5501 simde__m128i_private
5502 r_,
5503 a_ = simde__m128i_to_private(a),
5504 count_ = simde__m128i_to_private(count);
5505
5506 const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 32 ? 32 : count_.i64[0]));
5507
5508 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5509 r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt)));
5510 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5511 r_.wasm_v128 = wasm_u32x4_shr(a_.wasm_v128, cnt);
5512 #else
5513 SIMDE_VECTORIZE
5514 for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
5515 r_.u32[i] = a_.u32[i] >> cnt;
5516 }
5517 #endif
5518
5519 return simde__m128i_from_private(r_);
5520 #endif
5521 }
5522 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5523 #define _mm_srl_epi32(a, count) (simde_mm_srl_epi32(a, (count)))
5524 #endif
5525
5526 SIMDE_FUNCTION_ATTRIBUTES
5527 simde__m128i
simde_mm_srl_epi64(simde__m128i a,simde__m128i count)5528 simde_mm_srl_epi64 (simde__m128i a, simde__m128i count) {
5529 #if defined(SIMDE_X86_SSE2_NATIVE)
5530 return _mm_srl_epi64(a, count);
5531 #else
5532 simde__m128i_private
5533 r_,
5534 a_ = simde__m128i_to_private(a),
5535 count_ = simde__m128i_to_private(count);
5536
5537 const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 64 ? 64 : count_.i64[0]));
5538
5539 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5540 r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, -cnt)));
5541 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5542 r_.wasm_v128 = wasm_u64x2_shr(a_.wasm_v128, cnt);
5543 #else
5544 #if !defined(SIMDE_BUG_GCC_94488)
5545 SIMDE_VECTORIZE
5546 #endif
5547 for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
5548 r_.u64[i] = a_.u64[i] >> cnt;
5549 }
5550 #endif
5551
5552 return simde__m128i_from_private(r_);
5553 #endif
5554 }
5555 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5556 #define _mm_srl_epi64(a, count) (simde_mm_srl_epi64(a, (count)))
5557 #endif
5558
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srai_epi16 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  /* _mm_srai_epi16: arithmetic right shift of each 16-bit lane by imm8. */
  /* MSVC requires a range of (0, 255). */
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  /* Arithmetic shifts saturate: any count with a bit above the low four
   * set behaves like 15 (every lane becomes its sign fill). */
  const int cnt = (imm8 & ~15) ? 15 : imm8;

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* NEON expresses a right shift as a left shift by a negative count. */
    r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_i16x8_shr(a_.wasm_v128, cnt);
  #else
    SIMDE_VECTORIZE
    /* sizeof(r_) is the full 16-byte union, so this is the lane count. */
    for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) {
      r_.i16[i] = a_.i16[i] >> cnt;
    }
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
  #define simde_mm_srai_epi16(a, imm8) _mm_srai_epi16((a), (imm8))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_srai_epi16(a, imm8) simde_mm_srai_epi16(a, imm8)
#endif
5589
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srai_epi32 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  /* _mm_srai_epi32: arithmetic right shift of each 32-bit lane by imm8. */
  /* MSVC requires a range of (0, 255). */
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  /* Arithmetic shifts saturate: any count with a bit above the low five
   * set behaves like 31 (every lane becomes its sign fill). */
  const int cnt = (imm8 & ~31) ? 31 : imm8;

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* NEON expresses a right shift as a left shift by a negative count. */
    r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(-cnt));
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_i32x4_shr(a_.wasm_v128, cnt);
  #else
    SIMDE_VECTORIZE
    /* sizeof(r_) is the full 16-byte union, so this is the lane count. */
    for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i32[0])) ; i++) {
      r_.i32[i] = a_.i32[i] >> cnt;
    }
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
  #define simde_mm_srai_epi32(a, imm8) _mm_srai_epi32((a), (imm8))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_srai_epi32(a, imm8) simde_mm_srai_epi32(a, imm8)
#endif
5620
5621 SIMDE_FUNCTION_ATTRIBUTES
5622 simde__m128i
simde_mm_sra_epi16(simde__m128i a,simde__m128i count)5623 simde_mm_sra_epi16 (simde__m128i a, simde__m128i count) {
5624 #if defined(SIMDE_X86_SSE2_NATIVE)
5625 return _mm_sra_epi16(a, count);
5626 #else
5627 simde__m128i_private
5628 r_,
5629 a_ = simde__m128i_to_private(a),
5630 count_ = simde__m128i_to_private(count);
5631
5632 const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 15 ? 15 : count_.i64[0]));
5633
5634 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5635 r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
5636 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5637 r_.wasm_v128 = wasm_i16x8_shr(a_.wasm_v128, cnt);
5638 #else
5639 SIMDE_VECTORIZE
5640 for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
5641 r_.i16[i] = a_.i16[i] >> cnt;
5642 }
5643 #endif
5644
5645 return simde__m128i_from_private(r_);
5646 #endif
5647 }
5648 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5649 #define _mm_sra_epi16(a, count) (simde_mm_sra_epi16(a, count))
5650 #endif
5651
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sra_epi32 (simde__m128i a, simde__m128i count) {
  /* _mm_sra_epi32: arithmetic right shift of each 32-bit lane by the count
   * in the low 64 bits of 'count'; counts above 31 behave like 31. */
  #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_SRA_EPI32)
    return _mm_sra_epi32(a, count);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      count_ = simde__m128i_to_private(count);

    /* Count is unsigned; clamping to 31 yields the sign fill for large
     * counts, matching the arithmetic-shift saturation semantics. */
    const int cnt = count_.u64[0] > 31 ? 31 : HEDLEY_STATIC_CAST(int, count_.u64[0]);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* NEON expresses a right shift as a left shift by a negative count. */
      r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt)));
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i32x4_shr(a_.wasm_v128, cnt);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        r_.i32[i] = a_.i32[i] >> cnt;
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_sra_epi32(a, count) (simde_mm_sra_epi32(a, (count)))
#endif
5682
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_slli_epi16 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  /* _mm_slli_epi16: shift each 16-bit lane left by the immediate; counts
   * above 15 zero the whole vector. */
  if (HEDLEY_UNLIKELY((imm8 > 15))) {
    return simde_mm_setzero_si128();
  }

  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
    r_.i16 = a_.i16 << (imm8 & 0xff);
  #else
    /* imm8 is already <= 15 at this point, so s == imm8; the clamp only
     * guards against out-of-contract immediates. */
    const int s = (imm8 > HEDLEY_STATIC_CAST(int, sizeof(r_.i16[0]) * CHAR_BIT) - 1) ? 0 : imm8;
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
      r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << s);
    }
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
  #define simde_mm_slli_epi16(a, imm8) _mm_slli_epi16(a, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* vshlq_n_s16 takes a compile-time count, so the out-of-range cases are
   * handled by explicit branches. */
  #define simde_mm_slli_epi16(a, imm8) \
    (__extension__ ({ \
        simde__m128i ret; \
        if ((imm8) <= 0) { \
          ret = a; \
        } else if ((imm8) > 15) { \
          ret = simde_mm_setzero_si128(); \
        } else { \
          ret = simde__m128i_from_neon_i16( \
            vshlq_n_s16(simde__m128i_to_neon_i16(a), ((imm8) & 15))); \
        } \
        ret; \
      }))
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  #define simde_mm_slli_epi16(a, imm8) \
    ((imm8 < 16) ? wasm_i16x8_shl(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i16x8_const(0,0,0,0,0,0,0,0))
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
  #define simde_mm_slli_epi16(a, imm8) \
    ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sl(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8)))))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_slli_epi16(a, imm8) simde_mm_slli_epi16(a, imm8)
#endif
5733
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_slli_epi32 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  /* _mm_slli_epi32: shift each 32-bit lane left by the immediate; counts
   * above 31 zero the whole vector. */
  if (HEDLEY_UNLIKELY((imm8 > 31))) {
    return simde_mm_setzero_si128();
  }
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
    r_.i32 = a_.i32 << imm8;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.i32[i] = a_.i32[i] << (imm8 & 0xff);
    }
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
  #define simde_mm_slli_epi32(a, imm8) _mm_slli_epi32(a, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* vshlq_n_s32 takes a compile-time count, so the out-of-range cases are
   * handled by explicit branches. */
  #define simde_mm_slli_epi32(a, imm8) \
    (__extension__ ({ \
        simde__m128i ret; \
        if ((imm8) <= 0) { \
          ret = a; \
        } else if ((imm8) > 31) { \
          ret = simde_mm_setzero_si128(); \
        } else { \
          ret = simde__m128i_from_neon_i32( \
            vshlq_n_s32(simde__m128i_to_neon_i32(a), ((imm8) & 31))); \
        } \
        ret; \
      }))
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  #define simde_mm_slli_epi32(a, imm8) \
    ((imm8 < 32) ? wasm_i32x4_shl(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i32x4_const(0,0,0,0))
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
  #define simde_mm_slli_epi32(a, imm8) \
    (__extension__ ({ \
        simde__m128i ret; \
        if ((imm8) <= 0) { \
          ret = a; \
        } else if ((imm8) > 31) { \
          ret = simde_mm_setzero_si128(); \
        } else { \
          ret = simde__m128i_from_altivec_i32( \
            vec_sl(simde__m128i_to_altivec_i32(a), \
              vec_splats(HEDLEY_STATIC_CAST(unsigned int, (imm8) & 31)))); \
        } \
        ret; \
      }))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_slli_epi32(a, imm8) simde_mm_slli_epi32(a, imm8)
#endif
5794
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_slli_epi64 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  /* _mm_slli_epi64: shift each 64-bit lane left by the immediate; counts
   * above 63 zero the whole vector. */
  if (HEDLEY_UNLIKELY((imm8 > 63))) {
    return simde_mm_setzero_si128();
  }
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
    r_.i64 = a_.i64 << imm8;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
      r_.i64[i] = a_.i64[i] << (imm8 & 0xff);
    }
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
  #define simde_mm_slli_epi64(a, imm8) _mm_slli_epi64(a, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* vshlq_n_s64 takes a compile-time count, so the out-of-range cases are
   * handled by explicit branches. */
  #define simde_mm_slli_epi64(a, imm8) \
    (__extension__ ({ \
        simde__m128i ret; \
        if ((imm8) <= 0) { \
          ret = a; \
        } else if ((imm8) > 63) { \
          ret = simde_mm_setzero_si128(); \
        } else { \
          ret = simde__m128i_from_neon_i64( \
            vshlq_n_s64(simde__m128i_to_neon_i64(a), ((imm8) & 63))); \
        } \
        ret; \
      }))
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  #define simde_mm_slli_epi64(a, imm8) \
    ((imm8 < 64) ? wasm_i64x2_shl(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i64x2_const(0,0))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_slli_epi64(a, imm8) simde_mm_slli_epi64(a, imm8)
#endif
5840
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srli_epi16 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  /* _mm_srli_epi16: logical right shift of each 16-bit lane by the
   * immediate; counts above 15 zero the whole vector. */
  if (HEDLEY_UNLIKELY((imm8 > 15))) {
    return simde_mm_setzero_si128();
  }
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
    r_.u16 = a_.u16 >> imm8;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
      r_.u16[i] = a_.u16[i] >> (imm8 & 0xff);
    }
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
  #define simde_mm_srli_epi16(a, imm8) _mm_srli_epi16(a, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* vshrq_n_u16 requires a compile-time count in 1..16; the `| (count ==
   * 0)` term keeps the immediate in range in branches the compiler still
   * type-checks but never takes. */
  #define simde_mm_srli_epi16(a, imm8) \
    (__extension__ ({ \
        simde__m128i ret; \
        if ((imm8) <= 0) { \
          ret = a; \
        } else if ((imm8) > 15) { \
          ret = simde_mm_setzero_si128(); \
        } else { \
          ret = simde__m128i_from_neon_u16( \
            vshrq_n_u16(simde__m128i_to_neon_u16(a), (((imm8) & 15) | (((imm8) & 15) == 0)))); \
        } \
        ret; \
      }))
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  #define simde_mm_srli_epi16(a, imm8) \
    ((imm8 < 16) ? wasm_u16x8_shr(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i16x8_const(0,0,0,0,0,0,0,0))
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
  #define simde_mm_srli_epi16(a, imm8) \
    ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sr(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8)))))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_srli_epi16(a, imm8) simde_mm_srli_epi16(a, imm8)
#endif
5889
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srli_epi32 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  /* _mm_srli_epi32: logical right shift of each 32-bit lane by the
   * immediate; counts above 31 zero the whole vector. */
  if (HEDLEY_UNLIKELY((imm8 > 31))) {
    return simde_mm_setzero_si128();
  }
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
    r_.u32 = a_.u32 >> (imm8 & 0xff);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.u32[i] = a_.u32[i] >> (imm8 & 0xff);
    }
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
  #define simde_mm_srli_epi32(a, imm8) _mm_srli_epi32(a, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* vshrq_n_u32 requires a compile-time count in 1..32; the `| (count ==
   * 0)` term keeps the immediate in range in branches the compiler still
   * type-checks but never takes. */
  #define simde_mm_srli_epi32(a, imm8) \
    (__extension__ ({ \
        simde__m128i ret; \
        if ((imm8) <= 0) { \
          ret = a; \
        } else if ((imm8) > 31) { \
          ret = simde_mm_setzero_si128(); \
        } else { \
          ret = simde__m128i_from_neon_u32( \
            vshrq_n_u32(simde__m128i_to_neon_u32(a), (((imm8) & 31) | (((imm8) & 31) == 0)))); \
        } \
        ret; \
      }))
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  #define simde_mm_srli_epi32(a, imm8) \
    ((imm8 < 32) ? wasm_u32x4_shr(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i32x4_const(0,0,0,0))
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
  #define simde_mm_srli_epi32(a, imm8) \
    (__extension__ ({ \
        simde__m128i ret; \
        if ((imm8) <= 0) { \
          ret = a; \
        } else if ((imm8) > 31) { \
          ret = simde_mm_setzero_si128(); \
        } else { \
          ret = simde__m128i_from_altivec_i32( \
            vec_sr(simde__m128i_to_altivec_i32(a), \
              vec_splats(HEDLEY_STATIC_CAST(unsigned int, (imm8) & 31)))); \
        } \
        ret; \
      }))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_srli_epi32(a, imm8) simde_mm_srli_epi32(a, imm8)
#endif
5950
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srli_epi64 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  /* _mm_srli_epi64: logical right shift of each 64-bit lane by the
   * immediate; counts outside 0..63 zero the whole vector. */
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  /* True whenever any bit outside the low six is set. */
  if (HEDLEY_UNLIKELY((imm8 & 63) != imm8))
    return simde_mm_setzero_si128();

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* NEON expresses a right shift as a left shift by a negative count. */
    r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(-imm8));
  #else
    /* GCC bug 94488 miscompiles vectorized 64-bit shifts on some targets. */
    #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_94488)
      r_.u64 = a_.u64 >> imm8;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
        r_.u64[i] = a_.u64[i] >> imm8;
      }
    #endif
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
  #define simde_mm_srli_epi64(a, imm8) _mm_srli_epi64(a, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* vshrq_n_u64 requires a compile-time count in 1..64; the `| (count ==
   * 0)` term keeps the immediate in range in branches the compiler still
   * type-checks but never takes. */
  #define simde_mm_srli_epi64(a, imm8) \
    (__extension__ ({ \
        simde__m128i ret; \
        if ((imm8) <= 0) { \
          ret = a; \
        } else if ((imm8) > 63) { \
          ret = simde_mm_setzero_si128(); \
        } else { \
          ret = simde__m128i_from_neon_u64( \
            vshrq_n_u64(simde__m128i_to_neon_u64(a), (((imm8) & 63) | (((imm8) & 63) == 0)))); \
        } \
        ret; \
      }))
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  #define simde_mm_srli_epi64(a, imm8) \
    ((imm8 < 64) ? wasm_u64x2_shr(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i64x2_const(0,0))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_srli_epi64(a, imm8) simde_mm_srli_epi64(a, imm8)
#endif
6000
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_store_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) {
  /* _mm_store_pd: store both double lanes to 16-byte-aligned memory. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_store_pd(mem_addr, a);
  #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64);
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* A32 has no f64 vector ops; store the same bits through int64 lanes. */
    vst1q_s64(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), simde__m128d_to_private(a).neon_i64);
  #else
    /* memcpy avoids aliasing issues; the alignment hint preserves the
     * aligned-store contract for the optimizer. */
    simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128d), &a, sizeof(a));
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_store_pd(mem_addr, a) simde_mm_store_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
6017
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_store1_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) {
  /* _mm_store1_pd: store lane 0 of 'a' to both memory slots. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_store1_pd(mem_addr, a);
  #else
    simde__m128d_private a_ = simde__m128d_to_private(a);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* Broadcast lane 0 across the vector, then store both lanes. */
      vst1q_f64(mem_addr, vdupq_laneq_f64(a_.neon_f64, 0));
    #else
      mem_addr[0] = a_.f64[0];
      mem_addr[1] = a_.f64[0];
    #endif
  #endif
}
/* _mm_store_pd1 is an alternate spelling of _mm_store1_pd. */
#define simde_mm_store_pd1(mem_addr, a) simde_mm_store1_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_store1_pd(mem_addr, a) simde_mm_store1_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
  #define _mm_store_pd1(mem_addr, a) simde_mm_store_pd1(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
6039
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_store_sd (simde_float64* mem_addr, simde__m128d a) {
  /* _mm_store_sd: store lane 0 of 'a' to (possibly unaligned) memory. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_store_sd(mem_addr, a);
  #else
    simde__m128d_private a_ = simde__m128d_to_private(a);

    /* Each branch extracts the lane to a temporary and memcpys it out:
     * the lane itself is not addressable, and memcpy avoids aliasing and
     * alignment assumptions at the destination. */
    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      const simde_float64 v = vgetq_lane_f64(a_.neon_f64, 0);
      simde_memcpy(mem_addr, &v, sizeof(v));
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* A32 has no f64 vector ops; move the same bits through int64. */
      const int64_t v = vgetq_lane_s64(a_.neon_i64, 0);
      simde_memcpy(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), &v, sizeof(v));
    #else
      simde_float64 v = a_.f64[0];
      simde_memcpy(mem_addr, &v, sizeof(simde_float64));
    #endif
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_store_sd(mem_addr, a) simde_mm_store_sd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
6063
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_store_si128 (simde__m128i* mem_addr, simde__m128i a) {
  /* _mm_store_si128: store the full 128-bit vector to aligned memory. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_store_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
  #else
    simde__m128i_private a_ = simde__m128i_to_private(a);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      vst1q_s32(HEDLEY_REINTERPRET_CAST(int32_t*, mem_addr), a_.neon_i32);
    #else
      /* memcpy avoids aliasing issues; the alignment hint preserves the
       * aligned-store contract for the optimizer. */
      simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128i), &a_, sizeof(a_));
    #endif
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_store_si128(mem_addr, a) simde_mm_store_si128(mem_addr, a)
#endif
6082
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storeh_pd (simde_float64* mem_addr, simde__m128d a) {
  /* _mm_storeh_pd: store the high double lane (lane 1) of 'a'. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_storeh_pd(mem_addr, a);
  #else
    simde__m128d_private a_ = simde__m128d_to_private(a);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      *mem_addr = vgetq_lane_f64(a_.neon_f64, 1);
    #else
      *mem_addr = a_.f64[1];
    #endif
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_storeh_pd(mem_addr, a) simde_mm_storeh_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
6101
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storel_epi64 (simde__m128i* mem_addr, simde__m128i a) {
  /* _mm_storel_epi64: store the low 64 bits of 'a' to memory. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_storel_epi64(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
  #else
    simde__m128i_private a_ = simde__m128i_to_private(a);
    int64_t tmp;

    /* memcpy to prevent aliasing, tmp because we can't take the
     * address of a vector element. */

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      tmp = vgetq_lane_s64(a_.neon_i64, 0);
    #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
      /* GCC bug 95227 spuriously warns that a_ is unused here. */
      #if defined(SIMDE_BUG_GCC_95227)
        (void) a_;
      #endif
      tmp = vec_extract(a_.altivec_i64, 0);
    #else
      tmp = a_.i64[0];
    #endif

    simde_memcpy(mem_addr, &tmp, sizeof(tmp));
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_storel_epi64(mem_addr, a) simde_mm_storel_epi64(mem_addr, a)
#endif
6131
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storel_pd (simde_float64* mem_addr, simde__m128d a) {
  /* _mm_storel_pd: store the low double lane (lane 0) of 'a'. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_storel_pd(mem_addr, a);
  #else
    simde__m128d_private a_ = simde__m128d_to_private(a);

    /* Extract to a temporary, then memcpy: the lane is not addressable
     * and memcpy avoids aliasing/alignment assumptions at the target. */
    simde_float64 tmp;
    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      tmp = vgetq_lane_f64(a_.neon_f64, 0);
    #else
      tmp = a_.f64[0];
    #endif
    simde_memcpy(mem_addr, &tmp, sizeof(tmp));
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_storel_pd(mem_addr, a) simde_mm_storel_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
6152
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storer_pd (simde_float64 mem_addr[2], simde__m128d a) {
  /* _mm_storer_pd: store the two double lanes in reversed order. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_storer_pd(mem_addr, a);
  #else
    simde__m128d_private a_ = simde__m128d_to_private(a);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* vextq_s64 by 1 swaps the two 64-bit halves before the store. */
      vst1q_s64(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), vextq_s64(a_.neon_i64, a_.neon_i64, 1));
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      /* Swap lanes in-register, then reuse the aligned store. */
      a_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, a_.f64, 1, 0);
      simde_mm_store_pd(mem_addr, simde__m128d_from_private(a_));
    #else
      mem_addr[0] = a_.f64[1];
      mem_addr[1] = a_.f64[0];
    #endif
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_storer_pd(mem_addr, a) simde_mm_storer_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
6175
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storeu_pd (simde_float64* mem_addr, simde__m128d a) {
  /* Unaligned store of both doubles of `a` to mem_addr. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_storeu_pd(mem_addr, a);
  #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64);
  #else
    /* memcpy handles any alignment and sidesteps strict-aliasing. */
    simde_memcpy(mem_addr, &a, sizeof(a));
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_storeu_pd(mem_addr, a) simde_mm_storeu_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
6190
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storeu_si128 (simde__m128i* mem_addr, simde__m128i a) {
  /* Unaligned store of the full 128-bit integer vector to mem_addr. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_storeu_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
  #else
    simde_memcpy(mem_addr, &a, sizeof(a));
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_storeu_si128(mem_addr, a) simde_mm_storeu_si128(mem_addr, a)
#endif
6203
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storeu_si16 (void* mem_addr, simde__m128i a) {
  /* Store the low 16 bits of `a` to mem_addr (unaligned).
   * The native intrinsic is only available on compilers that provide
   * _mm_storeu_si16: clang 8+, GCC 11+, recent ICC. */
  #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
      SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
      HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
      HEDLEY_INTEL_VERSION_CHECK(20,21,1))
    _mm_storeu_si16(mem_addr, a);
  #else
    int16_t val = simde_x_mm_cvtsi128_si16(a);
    simde_memcpy(mem_addr, &val, sizeof(val));
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_storeu_si16(mem_addr, a) simde_mm_storeu_si16(mem_addr, a)
#endif
6220
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storeu_si32 (void* mem_addr, simde__m128i a) {
  /* Store the low 32 bits of `a` to mem_addr (unaligned).
   * Native path gated on compilers that actually ship _mm_storeu_si32. */
  #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
      SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
      HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
      HEDLEY_INTEL_VERSION_CHECK(20,21,1))
    _mm_storeu_si32(mem_addr, a);
  #else
    int32_t val = simde_mm_cvtsi128_si32(a);
    simde_memcpy(mem_addr, &val, sizeof(val));
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_storeu_si32(mem_addr, a) simde_mm_storeu_si32(mem_addr, a)
#endif
6237
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storeu_si64 (void* mem_addr, simde__m128i a) {
  /* Store the low 64 bits of `a` to mem_addr (unaligned).
   * Native path gated on compilers that actually ship _mm_storeu_si64. */
  #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
      SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
      HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
      HEDLEY_INTEL_VERSION_CHECK(20,21,1))
    _mm_storeu_si64(mem_addr, a);
  #else
    int64_t val = simde_mm_cvtsi128_si64(a);
    simde_memcpy(mem_addr, &val, sizeof(val));
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_storeu_si64(mem_addr, a) simde_mm_storeu_si64(mem_addr, a)
#endif
6254
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_stream_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) {
  /* Non-temporal store of `a`. The portable fallback is a plain copy:
   * the cache-bypass hint is a performance detail, not a semantic one. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_stream_pd(mem_addr, a);
  #else
    simde_memcpy(mem_addr, &a, sizeof(a));
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_stream_pd(mem_addr, a) simde_mm_stream_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
6267
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_stream_si128 (simde__m128i* mem_addr, simde__m128i a) {
  /* Non-temporal 128-bit integer store; falls back to a plain copy. */
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
    _mm_stream_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
  #else
    simde_memcpy(mem_addr, &a, sizeof(a));
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_stream_si128(mem_addr, a) simde_mm_stream_si128(mem_addr, a)
#endif
6280
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_stream_si32 (int32_t* mem_addr, int32_t a) {
  /* Non-temporal 32-bit store; portable fallback is a direct assignment. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_stream_si32(mem_addr, a);
  #else
    *mem_addr = a;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_stream_si32(mem_addr, a) simde_mm_stream_si32(mem_addr, a)
#endif
6293
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_stream_si64 (int64_t* mem_addr, int64_t a) {
  /* Non-temporal 64-bit store. Native only on AMD64 and not MSVC (whose
   * intrinsic takes `long long*`, hence the checked cast). */
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(HEDLEY_MSVC_VERSION)
    _mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(long long int*, int64_t*, mem_addr), a);
  #else
    *mem_addr = a;
  #endif
}
#define simde_mm_stream_si64x(mem_addr, a) simde_mm_stream_si64(mem_addr, a)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_stream_si64(mem_addr, a) simde_mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(int64_t*, __int64*, mem_addr), a)
  #define _mm_stream_si64x(mem_addr, a) simde_mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(int64_t*, __int64*, mem_addr), a)
#endif
6308
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sub_epi8 (simde__m128i a, simde__m128i b) {
  /* Lane-wise 8-bit integer subtraction (wraps on overflow). */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_sub_epi8(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i8 = vsubq_s8(a_.neon_i8, b_.neon_i8);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i8 = a_.i8 - b_.i8;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
        r_.i8[i] = a_.i8[i] - b_.i8[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_sub_epi8(a, b) simde_mm_sub_epi8(a, b)
#endif
6337
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sub_epi16 (simde__m128i a, simde__m128i b) {
  /* Lane-wise 16-bit integer subtraction (wraps on overflow). */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_sub_epi16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i16 = vsubq_s16(a_.neon_i16, b_.neon_i16);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i16 = a_.i16 - b_.i16;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        r_.i16[i] = a_.i16[i] - b_.i16[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_sub_epi16(a, b) simde_mm_sub_epi16(a, b)
#endif
6366
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sub_epi32 (simde__m128i a, simde__m128i b) {
  /* Lane-wise 32-bit integer subtraction (wraps on overflow). */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_sub_epi32(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i32 = vsubq_s32(a_.neon_i32, b_.neon_i32);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32 = a_.i32 - b_.i32;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        r_.i32[i] = a_.i32[i] - b_.i32[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_sub_epi32(a, b) simde_mm_sub_epi32(a, b)
#endif
6395
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sub_epi64 (simde__m128i a, simde__m128i b) {
  /* Lane-wise 64-bit integer subtraction (wraps on overflow). */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_sub_epi64(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i64 = vsubq_s64(a_.neon_i64, b_.neon_i64);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i64 = a_.i64 - b_.i64;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
        r_.i64[i] = a_.i64[i] - b_.i64[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_sub_epi64(a, b) simde_mm_sub_epi64(a, b)
#endif
6424
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_sub_epu32 (simde__m128i a, simde__m128i b) {
  /* SIMDe extension (no SSE2 equivalent, hence the x_ prefix):
   * lane-wise unsigned 32-bit subtraction with wrap-around. */
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.u32 = a_.u32 - b_.u32;
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_u32 = vsubq_u32(a_.neon_u32, b_.neon_u32);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
      r_.u32[i] = a_.u32[i] - b_.u32[i];
    }
  #endif

  return simde__m128i_from_private(r_);
}
6446
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_sub_pd (simde__m128d a, simde__m128d b) {
  /* Lane-wise double-precision subtraction. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_sub_pd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.f64 = a_.f64 - b_.f64;
    #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vsubq_f64(a_.neon_f64, b_.neon_f64);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_f64x2_sub(a_.wasm_v128, b_.wasm_v128);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
        r_.f64[i] = a_.f64[i] - b_.f64[i];
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_sub_pd(a, b) simde_mm_sub_pd(a, b)
#endif
6477
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_sub_sd (simde__m128d a, simde__m128d b) {
  /* Subtract the low doubles (a[0] - b[0]); upper lane passes through
   * from `a` unchanged. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_sub_sd(a, b);
  #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
    /* Full-width subtract, then merge only lane 0 into `a`. */
    return simde_mm_move_sd(a, simde_mm_sub_pd(a, b));
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    r_.f64[0] = a_.f64[0] - b_.f64[0];
    r_.f64[1] = a_.f64[1];

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_sub_sd(a, b) simde_mm_sub_sd(a, b)
#endif
6500
SIMDE_FUNCTION_ATTRIBUTES
simde__m64
simde_mm_sub_si64 (simde__m64 a, simde__m64 b) {
  /* 64-bit subtraction on MMX-width (__m64) operands. */
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    return _mm_sub_si64(a, b);
  #else
    simde__m64_private
      r_,
      a_ = simde__m64_to_private(a),
      b_ = simde__m64_to_private(b);

    #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i64 = a_.i64 - b_.i64;
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i64 = vsub_s64(a_.neon_i64, b_.neon_i64);
    #else
      r_.i64[0] = a_.i64[0] - b_.i64[0];
    #endif

    return simde__m64_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_sub_si64(a, b) simde_mm_sub_si64(a, b)
#endif
6526
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_subs_epi8 (simde__m128i a, simde__m128i b) {
  /* Lane-wise signed 8-bit saturating subtraction: results are clamped
   * to [INT8_MIN, INT8_MAX] instead of wrapping. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_subs_epi8(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i8 = vqsubq_s8(a_.neon_i8, b_.neon_i8);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i8x16_sub_saturate(a_.wasm_v128, b_.wasm_v128);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i8[0])) ; i++) {
        /* Overflow checks are phrased to avoid UB: compare against the
         * limit shifted by b before subtracting. */
        if (((b_.i8[i]) > 0 && (a_.i8[i]) < INT8_MIN + (b_.i8[i]))) {
          r_.i8[i] = INT8_MIN;
        } else if ((b_.i8[i]) < 0 && (a_.i8[i]) > INT8_MAX + (b_.i8[i])) {
          r_.i8[i] = INT8_MAX;
        } else {
          r_.i8[i] = (a_.i8[i]) - (b_.i8[i]);
        }
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_subs_epi8(a, b) simde_mm_subs_epi8(a, b)
#endif
6561
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_subs_epi16 (simde__m128i a, simde__m128i b) {
  /* Lane-wise signed 16-bit saturating subtraction: results are clamped
   * to [INT16_MIN, INT16_MAX] instead of wrapping. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_subs_epi16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i16 = vqsubq_s16(a_.neon_i16, b_.neon_i16);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_i16x8_sub_saturate(a_.wasm_v128, b_.wasm_v128);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) {
        /* Overflow checks phrased to avoid signed-overflow UB. */
        if (((b_.i16[i]) > 0 && (a_.i16[i]) < INT16_MIN + (b_.i16[i]))) {
          r_.i16[i] = INT16_MIN;
        } else if ((b_.i16[i]) < 0 && (a_.i16[i]) > INT16_MAX + (b_.i16[i])) {
          r_.i16[i] = INT16_MAX;
        } else {
          r_.i16[i] = (a_.i16[i]) - (b_.i16[i]);
        }
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_subs_epi16(a, b) simde_mm_subs_epi16(a, b)
#endif
6596
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_subs_epu8 (simde__m128i a, simde__m128i b) {
  /* Lane-wise unsigned 8-bit saturating subtraction: results are
   * clamped to [0, UINT8_MAX] instead of wrapping. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_subs_epu8(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u8 = vqsubq_u8(a_.neon_u8, b_.neon_u8);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_u8x16_sub_saturate(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_u8 = vec_subs(a_.altivec_u8, b_.altivec_u8);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i8[0])) ; i++) {
        /* Widen to int32_t so the difference can go negative safely. */
        const int32_t x = a_.u8[i] - b_.u8[i];
        if (x < 0) {
          r_.u8[i] = 0;
        } else if (x > UINT8_MAX) {
          r_.u8[i] = UINT8_MAX;
        } else {
          r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x);
        }
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_subs_epu8(a, b) simde_mm_subs_epu8(a, b)
#endif
6634
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_subs_epu16 (simde__m128i a, simde__m128i b) {
  /* Lane-wise unsigned 16-bit saturating subtraction: results are
   * clamped to [0, UINT16_MAX] instead of wrapping. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_subs_epu16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u16 = vqsubq_u16(a_.neon_u16, b_.neon_u16);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_u16x8_sub_saturate(a_.wasm_v128, b_.wasm_v128);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_u16 = vec_subs(a_.altivec_u16, b_.altivec_u16);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) {
        /* Widen to int32_t so the difference can go negative safely. */
        const int32_t x = a_.u16[i] - b_.u16[i];
        if (x < 0) {
          r_.u16[i] = 0;
        } else if (x > UINT16_MAX) {
          r_.u16[i] = UINT16_MAX;
        } else {
          r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x);
        }
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_subs_epu16(a, b) simde_mm_subs_epu16(a, b)
#endif
6672
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomieq_sd (simde__m128d a, simde__m128d b) {
  /* Unordered (quiet) compare of the low doubles for equality.
   * Returns 1 when a[0] == b[0] OR either operand is NaN (matching
   * x86 UCOMISD, where an unordered result sets ZF). */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_ucomieq_sd(a, b);
  #else
    simde__m128d_private
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);
    int r;

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* x == x is false only for NaN; OR the NaN mask into the result. */
      uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
      uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
      uint64x2_t a_or_b_nan = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(a_not_nan, b_not_nan))));
      uint64x2_t a_eq_b = vceqq_f64(a_.neon_f64, b_.neon_f64);
      r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_eq_b), 0) != 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      return wasm_f64x2_extract_lane(a_.wasm_v128, 0) == wasm_f64x2_extract_lane(b_.wasm_v128, 0);
    #elif defined(SIMDE_HAVE_FENV_H)
      /* "Quiet" semantics: suppress any FP exception the compare raises
       * by saving and restoring the floating-point environment. */
      fenv_t envp;
      int x = feholdexcept(&envp);
      r = a_.f64[0] == b_.f64[0];
      if (HEDLEY_LIKELY(x == 0))
        fesetenv(&envp);
    #else
      r = a_.f64[0] == b_.f64[0];
    #endif

    return r;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_ucomieq_sd(a, b) simde_mm_ucomieq_sd(a, b)
#endif
6708
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomige_sd (simde__m128d a, simde__m128d b) {
  /* Unordered (quiet) compare: returns 1 when a[0] >= b[0] and neither
   * operand is NaN; 0 otherwise (unordered => 0 for >=). */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_ucomige_sd(a, b);
  #else
    simde__m128d_private
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);
    int r;

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* Mask off lanes where either input is NaN (x == x is false for NaN). */
      uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
      uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
      uint64x2_t a_and_b_not_nan = vandq_u64(a_not_nan, b_not_nan);
      uint64x2_t a_ge_b = vcgeq_f64(a_.neon_f64, b_.neon_f64);
      r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_ge_b), 0) != 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      return wasm_f64x2_extract_lane(a_.wasm_v128, 0) >= wasm_f64x2_extract_lane(b_.wasm_v128, 0);
    #elif defined(SIMDE_HAVE_FENV_H)
      /* Suppress FP exceptions to match the quiet-compare contract. */
      fenv_t envp;
      int x = feholdexcept(&envp);
      r = a_.f64[0] >= b_.f64[0];
      if (HEDLEY_LIKELY(x == 0))
        fesetenv(&envp);
    #else
      r = a_.f64[0] >= b_.f64[0];
    #endif

    return r;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_ucomige_sd(a, b) simde_mm_ucomige_sd(a, b)
#endif
6744
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomigt_sd (simde__m128d a, simde__m128d b) {
  /* Unordered (quiet) compare: returns 1 when a[0] > b[0] and neither
   * operand is NaN; 0 otherwise (unordered => 0 for >). */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_ucomigt_sd(a, b);
  #else
    simde__m128d_private
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);
    int r;

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* Mask off lanes where either input is NaN (x == x is false for NaN). */
      uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
      uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
      uint64x2_t a_and_b_not_nan = vandq_u64(a_not_nan, b_not_nan);
      uint64x2_t a_gt_b = vcgtq_f64(a_.neon_f64, b_.neon_f64);
      r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_gt_b), 0) != 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      return wasm_f64x2_extract_lane(a_.wasm_v128, 0) > wasm_f64x2_extract_lane(b_.wasm_v128, 0);
    #elif defined(SIMDE_HAVE_FENV_H)
      /* Suppress FP exceptions to match the quiet-compare contract. */
      fenv_t envp;
      int x = feholdexcept(&envp);
      r = a_.f64[0] > b_.f64[0];
      if (HEDLEY_LIKELY(x == 0))
        fesetenv(&envp);
    #else
      r = a_.f64[0] > b_.f64[0];
    #endif

    return r;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_ucomigt_sd(a, b) simde_mm_ucomigt_sd(a, b)
#endif
6780
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomile_sd (simde__m128d a, simde__m128d b) {
  /* Unordered (quiet) compare: returns 1 when a[0] <= b[0] OR either
   * operand is NaN (x86 UCOMISD sets ZF|CF on unordered, so <= is 1). */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_ucomile_sd(a, b);
  #else
    simde__m128d_private
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);
    int r;

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* OR the NaN mask into the result: unordered compares succeed. */
      uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
      uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
      uint64x2_t a_or_b_nan = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(a_not_nan, b_not_nan))));
      uint64x2_t a_le_b = vcleq_f64(a_.neon_f64, b_.neon_f64);
      r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_le_b), 0) != 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      return wasm_f64x2_extract_lane(a_.wasm_v128, 0) <= wasm_f64x2_extract_lane(b_.wasm_v128, 0);
    #elif defined(SIMDE_HAVE_FENV_H)
      /* Suppress FP exceptions to match the quiet-compare contract. */
      fenv_t envp;
      int x = feholdexcept(&envp);
      r = a_.f64[0] <= b_.f64[0];
      if (HEDLEY_LIKELY(x == 0))
        fesetenv(&envp);
    #else
      r = a_.f64[0] <= b_.f64[0];
    #endif

    return r;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_ucomile_sd(a, b) simde_mm_ucomile_sd(a, b)
#endif
6816
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomilt_sd (simde__m128d a, simde__m128d b) {
  /* Unordered (quiet) compare: returns 1 when a[0] < b[0] OR either
   * operand is NaN (x86 UCOMISD sets CF on unordered, so < is 1). */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_ucomilt_sd(a, b);
  #else
    simde__m128d_private
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);
    int r;

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* OR the NaN mask into the result: unordered compares succeed. */
      uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
      uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
      uint64x2_t a_or_b_nan = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(a_not_nan, b_not_nan))));
      uint64x2_t a_lt_b = vcltq_f64(a_.neon_f64, b_.neon_f64);
      r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_lt_b), 0) != 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      return wasm_f64x2_extract_lane(a_.wasm_v128, 0) < wasm_f64x2_extract_lane(b_.wasm_v128, 0);
    #elif defined(SIMDE_HAVE_FENV_H)
      /* Suppress FP exceptions to match the quiet-compare contract. */
      fenv_t envp;
      int x = feholdexcept(&envp);
      r = a_.f64[0] < b_.f64[0];
      if (HEDLEY_LIKELY(x == 0))
        fesetenv(&envp);
    #else
      r = a_.f64[0] < b_.f64[0];
    #endif

    return r;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_ucomilt_sd(a, b) simde_mm_ucomilt_sd(a, b)
#endif
6852
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomineq_sd (simde__m128d a, simde__m128d b) {
  /* Unordered (quiet) compare: returns 1 when a[0] != b[0] and neither
   * operand is NaN; 0 otherwise (unordered => 0 for !=). */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_ucomineq_sd(a, b);
  #else
    simde__m128d_private
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);
    int r;

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* a != b computed as NOT(a == b); NaN lanes are then masked off. */
      uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
      uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
      uint64x2_t a_and_b_not_nan = vandq_u64(a_not_nan, b_not_nan);
      uint64x2_t a_neq_b = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(a_.neon_f64, b_.neon_f64))));
      r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_neq_b), 0) != 0);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      return wasm_f64x2_extract_lane(a_.wasm_v128, 0) != wasm_f64x2_extract_lane(b_.wasm_v128, 0);
    #elif defined(SIMDE_HAVE_FENV_H)
      /* Suppress FP exceptions to match the quiet-compare contract. */
      fenv_t envp;
      int x = feholdexcept(&envp);
      r = a_.f64[0] != b_.f64[0];
      if (HEDLEY_LIKELY(x == 0))
        fesetenv(&envp);
    #else
      r = a_.f64[0] != b_.f64[0];
    #endif

    return r;
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_ucomineq_sd(a, b) simde_mm_ucomineq_sd(a, b)
#endif
6888
6889 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
6890 HEDLEY_DIAGNOSTIC_PUSH
6891 SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
6892 #endif
6893
6894 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
6895 HEDLEY_DIAGNOSTIC_POP
6896 #endif
6897
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_lfence (void) {
  /* Load fence. Portable fallback reuses sfence (a full barrier in the
   * SSE1 implementation), which is stronger than required but correct. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_lfence();
  #else
    simde_mm_sfence();
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_lfence() simde_mm_lfence()
#endif
6910
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_mfence (void) {
  /* Full memory fence; fallback delegates to the SSE1 sfence barrier. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_mfence();
  #else
    simde_mm_sfence();
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_mfence() simde_mm_mfence()
#endif
6923
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_unpackhi_epi8 (simde__m128i a, simde__m128i b) {
  /* Interleave the upper-half bytes of `a` and `b`:
   * r = { a8, b8, a9, b9, ..., a15, b15 }. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_unpackhi_epi8(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_i8 = vzip2q_s8(a_.neon_i8, b_.neon_i8);
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* ARMv7 has no vzip2q; take the high halves and zip 64-bit vectors. */
      int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(a_.neon_i16));
      int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(b_.neon_i16));
      int8x8x2_t result = vzip_s8(a1, b1);
      r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i8[0])) / 2) ; i++) {
        r_.i8[(i * 2)] = a_.i8[i + ((sizeof(r_) / sizeof(r_.i8[0])) / 2)];
        r_.i8[(i * 2) + 1] = b_.i8[i + ((sizeof(r_) / sizeof(r_.i8[0])) / 2)];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_unpackhi_epi8(a, b) simde_mm_unpackhi_epi8(a, b)
#endif
6958
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_unpackhi_epi16 (simde__m128i a, simde__m128i b) {
  /* Interleave the upper-half 16-bit lanes of `a` and `b`:
   * r = { a4, b4, a5, b5, a6, b6, a7, b7 }. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_unpackhi_epi16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_i16 = vzip2q_s16(a_.neon_i16, b_.neon_i16);
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* ARMv7 path: zip the high 64-bit halves. */
      int16x4_t a1 = vget_high_s16(a_.neon_i16);
      int16x4_t b1 = vget_high_s16(b_.neon_i16);
      int16x4x2_t result = vzip_s16(a1, b1);
      r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 4, 12, 5, 13, 6, 14, 7, 15);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i16[0])) / 2) ; i++) {
        r_.i16[(i * 2)] = a_.i16[i + ((sizeof(r_) / sizeof(r_.i16[0])) / 2)];
        r_.i16[(i * 2) + 1] = b_.i16[i + ((sizeof(r_) / sizeof(r_.i16[0])) / 2)];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_unpackhi_epi16(a, b) simde_mm_unpackhi_epi16(a, b)
#endif
6993
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_unpackhi_epi32 (simde__m128i a, simde__m128i b) {
  /* Interleave the upper-half 32-bit lanes of `a` and `b`:
   * r = { a2, b2, a3, b3 }. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_unpackhi_epi32(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_i32 = vzip2q_s32(a_.neon_i32, b_.neon_i32);
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      /* ARMv7 path: zip the high 64-bit halves. */
      int32x2_t a1 = vget_high_s32(a_.neon_i32);
      int32x2_t b1 = vget_high_s32(b_.neon_i32);
      int32x2x2_t result = vzip_s32(a1, b1);
      r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 2, 6, 3, 7);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i32[0])) / 2) ; i++) {
        r_.i32[(i * 2)] = a_.i32[i + ((sizeof(r_) / sizeof(r_.i32[0])) / 2)];
        r_.i32[(i * 2) + 1] = b_.i32[i + ((sizeof(r_) / sizeof(r_.i32[0])) / 2)];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_unpackhi_epi32(a, b) simde_mm_unpackhi_epi32(a, b)
#endif
7028
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_unpackhi_epi64 (simde__m128i a, simde__m128i b) {
  /* Interleave the upper 64-bit lanes of `a` and `b`: r = { a1, b1 }. */
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_unpackhi_epi64(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      int64x1_t a_h = vget_high_s64(a_.neon_i64);
      int64x1_t b_h = vget_high_s64(b_.neon_i64);
      r_.neon_i64 = vcombine_s64(a_h, b_h);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 1, 3);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i64[0])) / 2) ; i++) {
        r_.i64[(i * 2)] = a_.i64[i + ((sizeof(r_) / sizeof(r_.i64[0])) / 2)];
        r_.i64[(i * 2) + 1] = b_.i64[i + ((sizeof(r_) / sizeof(r_.i64[0])) / 2)];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_unpackhi_epi64(a, b) simde_mm_unpackhi_epi64(a, b)
#endif
7060
7061 SIMDE_FUNCTION_ATTRIBUTES
7062 simde__m128d
simde_mm_unpackhi_pd(simde__m128d a,simde__m128d b)7063 simde_mm_unpackhi_pd (simde__m128d a, simde__m128d b) {
7064 #if defined(SIMDE_X86_SSE2_NATIVE)
7065 return _mm_unpackhi_pd(a, b);
7066 #else
7067 simde__m128d_private
7068 r_,
7069 a_ = simde__m128d_to_private(a),
7070 b_ = simde__m128d_to_private(b);
7071
7072 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7073 float64x1_t a_l = vget_high_f64(a_.f64);
7074 float64x1_t b_l = vget_high_f64(b_.f64);
7075 r_.neon_f64 = vcombine_f64(a_l, b_l);
7076 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
7077 r_.wasm_v128 = wasm_v64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3);
7078 #elif defined(SIMDE_SHUFFLE_VECTOR_)
7079 r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 1, 3);
7080 #else
7081 SIMDE_VECTORIZE
7082 for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2) ; i++) {
7083 r_.f64[(i * 2)] = a_.f64[i + ((sizeof(r_) / sizeof(r_.f64[0])) / 2)];
7084 r_.f64[(i * 2) + 1] = b_.f64[i + ((sizeof(r_) / sizeof(r_.f64[0])) / 2)];
7085 }
7086 #endif
7087
7088 return simde__m128d_from_private(r_);
7089 #endif
7090 }
7091 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7092 #define _mm_unpackhi_pd(a, b) simde_mm_unpackhi_pd(a, b)
7093 #endif
7094
7095 SIMDE_FUNCTION_ATTRIBUTES
7096 simde__m128i
simde_mm_unpacklo_epi8(simde__m128i a,simde__m128i b)7097 simde_mm_unpacklo_epi8 (simde__m128i a, simde__m128i b) {
7098 #if defined(SIMDE_X86_SSE2_NATIVE)
7099 return _mm_unpacklo_epi8(a, b);
7100 #else
7101 simde__m128i_private
7102 r_,
7103 a_ = simde__m128i_to_private(a),
7104 b_ = simde__m128i_to_private(b);
7105
7106 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7107 r_.neon_i8 = vzip1q_s8(a_.neon_i8, b_.neon_i8);
7108 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7109 int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(a_.neon_i16));
7110 int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(b_.neon_i16));
7111 int8x8x2_t result = vzip_s8(a1, b1);
7112 r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]);
7113 #elif defined(SIMDE_SHUFFLE_VECTOR_)
7114 r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
7115 #else
7116 SIMDE_VECTORIZE
7117 for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i8[0])) / 2) ; i++) {
7118 r_.i8[(i * 2)] = a_.i8[i];
7119 r_.i8[(i * 2) + 1] = b_.i8[i];
7120 }
7121 #endif
7122
7123 return simde__m128i_from_private(r_);
7124 #endif
7125 }
7126 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7127 #define _mm_unpacklo_epi8(a, b) simde_mm_unpacklo_epi8(a, b)
7128 #endif
7129
7130 SIMDE_FUNCTION_ATTRIBUTES
7131 simde__m128i
simde_mm_unpacklo_epi16(simde__m128i a,simde__m128i b)7132 simde_mm_unpacklo_epi16 (simde__m128i a, simde__m128i b) {
7133 #if defined(SIMDE_X86_SSE2_NATIVE)
7134 return _mm_unpacklo_epi16(a, b);
7135 #else
7136 simde__m128i_private
7137 r_,
7138 a_ = simde__m128i_to_private(a),
7139 b_ = simde__m128i_to_private(b);
7140
7141 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7142 r_.neon_i16 = vzip1q_s16(a_.neon_i16, b_.neon_i16);
7143 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7144 int16x4_t a1 = vget_low_s16(a_.neon_i16);
7145 int16x4_t b1 = vget_low_s16(b_.neon_i16);
7146 int16x4x2_t result = vzip_s16(a1, b1);
7147 r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]);
7148 #elif defined(SIMDE_SHUFFLE_VECTOR_)
7149 r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 0, 8, 1, 9, 2, 10, 3, 11);
7150 #else
7151 SIMDE_VECTORIZE
7152 for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i16[0])) / 2) ; i++) {
7153 r_.i16[(i * 2)] = a_.i16[i];
7154 r_.i16[(i * 2) + 1] = b_.i16[i];
7155 }
7156 #endif
7157
7158 return simde__m128i_from_private(r_);
7159 #endif
7160 }
7161 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7162 #define _mm_unpacklo_epi16(a, b) simde_mm_unpacklo_epi16(a, b)
7163 #endif
7164
7165 SIMDE_FUNCTION_ATTRIBUTES
7166 simde__m128i
simde_mm_unpacklo_epi32(simde__m128i a,simde__m128i b)7167 simde_mm_unpacklo_epi32 (simde__m128i a, simde__m128i b) {
7168 #if defined(SIMDE_X86_SSE2_NATIVE)
7169 return _mm_unpacklo_epi32(a, b);
7170 #else
7171 simde__m128i_private
7172 r_,
7173 a_ = simde__m128i_to_private(a),
7174 b_ = simde__m128i_to_private(b);
7175
7176 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7177 r_.neon_i32 = vzip1q_s32(a_.neon_i32, b_.neon_i32);
7178 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7179 int32x2_t a1 = vget_low_s32(a_.neon_i32);
7180 int32x2_t b1 = vget_low_s32(b_.neon_i32);
7181 int32x2x2_t result = vzip_s32(a1, b1);
7182 r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]);
7183 #elif defined(SIMDE_SHUFFLE_VECTOR_)
7184 r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 0, 4, 1, 5);
7185 #else
7186 SIMDE_VECTORIZE
7187 for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i32[0])) / 2) ; i++) {
7188 r_.i32[(i * 2)] = a_.i32[i];
7189 r_.i32[(i * 2) + 1] = b_.i32[i];
7190 }
7191 #endif
7192
7193 return simde__m128i_from_private(r_);
7194 #endif
7195 }
7196 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7197 #define _mm_unpacklo_epi32(a, b) simde_mm_unpacklo_epi32(a, b)
7198 #endif
7199
7200 SIMDE_FUNCTION_ATTRIBUTES
7201 simde__m128i
simde_mm_unpacklo_epi64(simde__m128i a,simde__m128i b)7202 simde_mm_unpacklo_epi64 (simde__m128i a, simde__m128i b) {
7203 #if defined(SIMDE_X86_SSE2_NATIVE)
7204 return _mm_unpacklo_epi64(a, b);
7205 #else
7206 simde__m128i_private
7207 r_,
7208 a_ = simde__m128i_to_private(a),
7209 b_ = simde__m128i_to_private(b);
7210
7211 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7212 int64x1_t a_l = vget_low_s64(a_.i64);
7213 int64x1_t b_l = vget_low_s64(b_.i64);
7214 r_.neon_i64 = vcombine_s64(a_l, b_l);
7215 #elif defined(SIMDE_SHUFFLE_VECTOR_)
7216 r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 0, 2);
7217 #else
7218 SIMDE_VECTORIZE
7219 for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i64[0])) / 2) ; i++) {
7220 r_.i64[(i * 2)] = a_.i64[i];
7221 r_.i64[(i * 2) + 1] = b_.i64[i];
7222 }
7223 #endif
7224
7225 return simde__m128i_from_private(r_);
7226 #endif
7227 }
7228 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7229 #define _mm_unpacklo_epi64(a, b) simde_mm_unpacklo_epi64(a, b)
7230 #endif
7231
7232 SIMDE_FUNCTION_ATTRIBUTES
7233 simde__m128d
simde_mm_unpacklo_pd(simde__m128d a,simde__m128d b)7234 simde_mm_unpacklo_pd (simde__m128d a, simde__m128d b) {
7235 #if defined(SIMDE_X86_SSE2_NATIVE)
7236 return _mm_unpacklo_pd(a, b);
7237 #else
7238 simde__m128d_private
7239 r_,
7240 a_ = simde__m128d_to_private(a),
7241 b_ = simde__m128d_to_private(b);
7242
7243 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7244 float64x1_t a_l = vget_low_f64(a_.f64);
7245 float64x1_t b_l = vget_low_f64(b_.f64);
7246 r_.neon_f64 = vcombine_f64(a_l, b_l);
7247 #elif defined(SIMDE_SHUFFLE_VECTOR_)
7248 r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 0, 2);
7249 #else
7250 SIMDE_VECTORIZE
7251 for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2) ; i++) {
7252 r_.f64[(i * 2)] = a_.f64[i];
7253 r_.f64[(i * 2) + 1] = b_.f64[i];
7254 }
7255 #endif
7256
7257 return simde__m128d_from_private(r_);
7258 #endif
7259 }
7260 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7261 #define _mm_unpacklo_pd(a, b) simde_mm_unpacklo_pd(a, b)
7262 #endif
7263
7264 SIMDE_FUNCTION_ATTRIBUTES
7265 simde__m128d
simde_x_mm_negate_pd(simde__m128d a)7266 simde_x_mm_negate_pd(simde__m128d a) {
7267 #if defined(SIMDE_X86_SSE_NATIVE)
7268 return simde_mm_xor_pd(a, _mm_set1_pd(SIMDE_FLOAT64_C(-0.0)));
7269 #else
7270 simde__m128d_private
7271 r_,
7272 a_ = simde__m128d_to_private(a);
7273
7274 #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && \
7275 (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,1,0))
7276 r_.altivec_f64 = vec_neg(a_.altivec_f64);
7277 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7278 r_.neon_f64 = vnegq_f64(a_.neon_f64);
7279 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
7280 r_.wasm_v128 = wasm_f64x2_neg(a_.wasm_v128);
7281 #elif defined(SIMDE_VECTOR_NEGATE)
7282 r_.f64 = -a_.f64;
7283 #else
7284 SIMDE_VECTORIZE
7285 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
7286 r_.f64[i] = -a_.f64[i];
7287 }
7288 #endif
7289
7290 return simde__m128d_from_private(r_);
7291 #endif
7292 }
7293
7294 SIMDE_FUNCTION_ATTRIBUTES
7295 simde__m128i
simde_mm_xor_si128(simde__m128i a,simde__m128i b)7296 simde_mm_xor_si128 (simde__m128i a, simde__m128i b) {
7297 #if defined(SIMDE_X86_SSE2_NATIVE)
7298 return _mm_xor_si128(a, b);
7299 #else
7300 simde__m128i_private
7301 r_,
7302 a_ = simde__m128i_to_private(a),
7303 b_ = simde__m128i_to_private(b);
7304
7305 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7306 r_.neon_i32 = veorq_s32(a_.neon_i32, b_.neon_i32);
7307 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
7308 r_.altivec_i32 = vec_xor(a_.altivec_i32, b_.altivec_i32);
7309 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
7310 r_.i32f = a_.i32f ^ b_.i32f;
7311 #else
7312 SIMDE_VECTORIZE
7313 for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
7314 r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i];
7315 }
7316 #endif
7317
7318 return simde__m128i_from_private(r_);
7319 #endif
7320 }
7321 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7322 #define _mm_xor_si128(a, b) simde_mm_xor_si128(a, b)
7323 #endif
7324
7325 SIMDE_FUNCTION_ATTRIBUTES
7326 simde__m128i
simde_x_mm_not_si128(simde__m128i a)7327 simde_x_mm_not_si128 (simde__m128i a) {
7328 #if defined(SIMDE_X86_AVX512VL_NATIVE)
7329 return _mm_ternarylogic_epi32(a, a, a, 0x55);
7330 #else
7331 simde__m128i_private
7332 r_,
7333 a_ = simde__m128i_to_private(a);
7334
7335 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7336 r_.neon_i32 = vmvnq_s32(a_.neon_i32);
7337 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
7338 r_.altivec_i32 = vec_nor(a_.altivec_i32, a_.altivec_i32);
7339 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
7340 r_.wasm_v128 = wasm_v128_not(a_.wasm_v128);
7341 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
7342 r_.i32f = ~a_.i32f;
7343 #else
7344 SIMDE_VECTORIZE
7345 for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
7346 r_.i32f[i] = ~(a_.i32f[i]);
7347 }
7348 #endif
7349
7350 return simde__m128i_from_private(r_);
7351 #endif
7352 }
7353
/* Packs two 1-bit lane selectors into the 2-bit immediate used by
 * _mm_shuffle_pd: `x` occupies bit 1 and `y` bit 0 (each should be 0 or 1). */
#define SIMDE_MM_SHUFFLE2(x, y) (((x) << 1) | (y))
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#define _MM_SHUFFLE2(x, y) SIMDE_MM_SHUFFLE2(x, y)
#endif
7358
7359 SIMDE_END_DECLS_
7360
7361 HEDLEY_DIAGNOSTIC_POP
7362
7363 #endif /* !defined(SIMDE_X86_SSE2_H) */
7364