1 /* SPDX-License-Identifier: MIT
2 *
3 * Permission is hereby granted, free of charge, to any person
4 * obtaining a copy of this software and associated documentation
5 * files (the "Software"), to deal in the Software without
6 * restriction, including without limitation the rights to use, copy,
7 * modify, merge, publish, distribute, sublicense, and/or sell copies
8 * of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be
12 * included in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Copyright:
24 * 2017-2020 Evan Nemerson <evan@nemerson.com>
25 * 2015-2017 John W. Ratcliff <jratcliffscarab@gmail.com>
26 * 2015 Brandon Rowlett <browlett@nvidia.com>
27 * 2015 Ken Fast <kfast@gdeb.com>
28 * 2017 Hasindu Gamaarachchi <hasindu@unsw.edu.au>
29 * 2018 Jeff Daily <jeff.daily@amd.com>
30 */
31
32 #if !defined(SIMDE_X86_SSE2_H)
33 #define SIMDE_X86_SSE2_H
34
35 #include "sse.h"
36
37 HEDLEY_DIAGNOSTIC_PUSH
38 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
39 SIMDE_BEGIN_DECLS_
40
41 typedef union {
42 #if defined(SIMDE_VECTOR_SUBSCRIPT)
43 SIMDE_ALIGN(16) int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
44 SIMDE_ALIGN(16) int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
45 SIMDE_ALIGN(16) int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
46 SIMDE_ALIGN(16) int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
47 SIMDE_ALIGN(16) uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
48 SIMDE_ALIGN(16) uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
49 SIMDE_ALIGN(16) uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
50 SIMDE_ALIGN(16) uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
51 #if defined(SIMDE_HAVE_INT128_)
52 SIMDE_ALIGN(16) simde_int128 i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
53 SIMDE_ALIGN(16) simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
54 #endif
55 SIMDE_ALIGN(16) simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
56 SIMDE_ALIGN(16) simde_float64 f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
57
58 SIMDE_ALIGN(16) int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
59 SIMDE_ALIGN(16) uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
60 #else
61 SIMDE_ALIGN(16) int8_t i8[16];
62 SIMDE_ALIGN(16) int16_t i16[8];
63 SIMDE_ALIGN(16) int32_t i32[4];
64 SIMDE_ALIGN(16) int64_t i64[2];
65 SIMDE_ALIGN(16) uint8_t u8[16];
66 SIMDE_ALIGN(16) uint16_t u16[8];
67 SIMDE_ALIGN(16) uint32_t u32[4];
68 SIMDE_ALIGN(16) uint64_t u64[2];
69 #if defined(SIMDE_HAVE_INT128_)
70 SIMDE_ALIGN(16) simde_int128 i128[1];
71 SIMDE_ALIGN(16) simde_uint128 u128[1];
72 #endif
73 SIMDE_ALIGN(16) simde_float32 f32[4];
74 SIMDE_ALIGN(16) simde_float64 f64[2];
75
76 SIMDE_ALIGN(16) int_fast32_t i32f[16 / sizeof(int_fast32_t)];
77 SIMDE_ALIGN(16) uint_fast32_t u32f[16 / sizeof(uint_fast32_t)];
78 #endif
79
80 SIMDE_ALIGN(16) simde__m64_private m64_private[2];
81 SIMDE_ALIGN(16) simde__m64 m64[2];
82
83 #if defined(SIMDE_X86_SSE2_NATIVE)
84 SIMDE_ALIGN(16) __m128i n;
85 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
86 SIMDE_ALIGN(16) int8x16_t neon_i8;
87 SIMDE_ALIGN(16) int16x8_t neon_i16;
88 SIMDE_ALIGN(16) int32x4_t neon_i32;
89 SIMDE_ALIGN(16) int64x2_t neon_i64;
90 SIMDE_ALIGN(16) uint8x16_t neon_u8;
91 SIMDE_ALIGN(16) uint16x8_t neon_u16;
92 SIMDE_ALIGN(16) uint32x4_t neon_u32;
93 SIMDE_ALIGN(16) uint64x2_t neon_u64;
94 SIMDE_ALIGN(16) float32x4_t neon_f32;
95 #if defined(SIMDE_ARCH_AARCH64)
96 SIMDE_ALIGN(16) float64x2_t neon_f64;
97 #endif
98 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
99 SIMDE_ALIGN(16) v128_t wasm_v128;
100 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
101 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8;
102 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16;
103 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32;
104 #if defined(__UINT_FAST32_TYPE__) && defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
105 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(__INT_FAST32_TYPE__) altivec_i32f;
106 #else
107 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32f;
108 #endif
109 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8;
110 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16;
111 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32;
112 #if defined(__UINT_FAST32_TYPE__) && defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
113 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(__UINT_FAST32_TYPE__) altivec_u32f;
114 #else
115 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32f;
116 #endif
117 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32;
118 #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
119 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64;
120 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64;
121 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64;
122 #endif
123 #endif
124 } simde__m128i_private;
125
126 typedef union {
127 #if defined(SIMDE_VECTOR_SUBSCRIPT)
128 SIMDE_ALIGN(16) int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
129 SIMDE_ALIGN(16) int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
130 SIMDE_ALIGN(16) int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
131 SIMDE_ALIGN(16) int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
132 SIMDE_ALIGN(16) uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
133 SIMDE_ALIGN(16) uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
134 SIMDE_ALIGN(16) uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
135 SIMDE_ALIGN(16) uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
136 SIMDE_ALIGN(16) simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
137 SIMDE_ALIGN(16) simde_float64 f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
138 SIMDE_ALIGN(16) int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
139 SIMDE_ALIGN(16) uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
140 #else
141 SIMDE_ALIGN(16) int8_t i8[16];
142 SIMDE_ALIGN(16) int16_t i16[8];
143 SIMDE_ALIGN(16) int32_t i32[4];
144 SIMDE_ALIGN(16) int64_t i64[2];
145 SIMDE_ALIGN(16) uint8_t u8[16];
146 SIMDE_ALIGN(16) uint16_t u16[8];
147 SIMDE_ALIGN(16) uint32_t u32[4];
148 SIMDE_ALIGN(16) uint64_t u64[2];
149 SIMDE_ALIGN(16) simde_float32 f32[4];
150 SIMDE_ALIGN(16) simde_float64 f64[2];
151 SIMDE_ALIGN(16) int_fast32_t i32f[16 / sizeof(int_fast32_t)];
152 SIMDE_ALIGN(16) uint_fast32_t u32f[16 / sizeof(uint_fast32_t)];
153 #endif
154
155 SIMDE_ALIGN(16) simde__m64_private m64_private[2];
156 SIMDE_ALIGN(16) simde__m64 m64[2];
157
158 #if defined(SIMDE_X86_SSE2_NATIVE)
159 SIMDE_ALIGN(16) __m128d n;
160 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
161 SIMDE_ALIGN(16) int8x16_t neon_i8;
162 SIMDE_ALIGN(16) int16x8_t neon_i16;
163 SIMDE_ALIGN(16) int32x4_t neon_i32;
164 SIMDE_ALIGN(16) int64x2_t neon_i64;
165 SIMDE_ALIGN(16) uint8x16_t neon_u8;
166 SIMDE_ALIGN(16) uint16x8_t neon_u16;
167 SIMDE_ALIGN(16) uint32x4_t neon_u32;
168 SIMDE_ALIGN(16) uint64x2_t neon_u64;
169 SIMDE_ALIGN(16) float32x4_t neon_f32;
170 #if defined(SIMDE_ARCH_AARCH64)
171 SIMDE_ALIGN(16) float64x2_t neon_f64;
172 #endif
173 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
174 SIMDE_ALIGN(16) v128_t wasm_v128;
175 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
176 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8;
177 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16;
178 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32;
179 #if defined(__INT_FAST32_TYPE__) && defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
180 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(__INT_FAST32_TYPE__) altivec_i32f;
181 #else
182 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32f;
183 #endif
184 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8;
185 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16;
186 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32;
187 #if defined(__UINT_FAST32_TYPE__) && defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
188 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(__UINT_FAST32_TYPE__) altivec_u32f;
189 #else
190 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32f;
191 #endif
192 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32;
193 #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
194 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64;
195 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64;
196 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64;
197 #endif
198 #endif
199 } simde__m128d_private;
200
201 #if defined(SIMDE_X86_SSE2_NATIVE)
202 typedef __m128i simde__m128i;
203 typedef __m128d simde__m128d;
204 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
205 typedef int64x2_t simde__m128i;
206 # if defined(SIMDE_ARCH_AARCH64)
207 typedef float64x2_t simde__m128d;
208 # elif defined(SIMDE_VECTOR_SUBSCRIPT)
209 typedef simde_float64 simde__m128d SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
210 # else
211 typedef simde__m128d_private simde__m128d;
212 # endif
213 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
214 typedef v128_t simde__m128i;
215 typedef v128_t simde__m128d;
216 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
217 typedef SIMDE_POWER_ALTIVEC_VECTOR(float) simde__m128i;
218 #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
219 typedef SIMDE_POWER_ALTIVEC_VECTOR(double) simde__m128d;
220 #else
221 typedef simde__m128d_private simde__m128d;
222 #endif
223 #elif defined(SIMDE_VECTOR_SUBSCRIPT)
224 typedef int64_t simde__m128i SIMDE_ALIGN(16) SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
225 typedef simde_float64 simde__m128d SIMDE_ALIGN(16) SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
226 #else
227 typedef simde__m128i_private simde__m128i;
228 typedef simde__m128d_private simde__m128d;
229 #endif
230
231 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
232 typedef simde__m128i __m128i;
233 typedef simde__m128d __m128d;
234 #endif
235
236 HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i), "simde__m128i size incorrect");
237 HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i_private), "simde__m128i_private size incorrect");
238 HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d), "simde__m128d size incorrect");
239 HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d_private), "simde__m128d_private size incorrect");
240 #if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF)
241 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128i) == 16, "simde__m128i is not 16-byte aligned");
242 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128i_private) == 16, "simde__m128i_private is not 16-byte aligned");
243 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128d) == 16, "simde__m128d is not 16-byte aligned");
244 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128d_private) == 16, "simde__m128d_private is not 16-byte aligned");
245 #endif
246
247 SIMDE_FUNCTION_ATTRIBUTES
248 simde__m128i
simde__m128i_from_private(simde__m128i_private v)249 simde__m128i_from_private(simde__m128i_private v) {
250 simde__m128i r;
251 simde_memcpy(&r, &v, sizeof(r));
252 return r;
253 }
254
255 SIMDE_FUNCTION_ATTRIBUTES
256 simde__m128i_private
simde__m128i_to_private(simde__m128i v)257 simde__m128i_to_private(simde__m128i v) {
258 simde__m128i_private r;
259 simde_memcpy(&r, &v, sizeof(r));
260 return r;
261 }
262
263 SIMDE_FUNCTION_ATTRIBUTES
264 simde__m128d
simde__m128d_from_private(simde__m128d_private v)265 simde__m128d_from_private(simde__m128d_private v) {
266 simde__m128d r;
267 simde_memcpy(&r, &v, sizeof(r));
268 return r;
269 }
270
271 SIMDE_FUNCTION_ATTRIBUTES
272 simde__m128d_private
simde__m128d_to_private(simde__m128d v)273 simde__m128d_to_private(simde__m128d v) {
274 simde__m128d_private r;
275 simde_memcpy(&r, &v, sizeof(r));
276 return r;
277 }
278
279 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i,int8x16_t,neon,i8)280 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int8x16_t, neon, i8)
281 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int16x8_t, neon, i16)
282 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int32x4_t, neon, i32)
283 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int64x2_t, neon, i64)
284 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint8x16_t, neon, u8)
285 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint16x8_t, neon, u16)
286 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint32x4_t, neon, u32)
287 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint64x2_t, neon, u64)
288 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, float32x4_t, neon, f32)
289 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
290 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, float64x2_t, neon, f64)
291 #endif
292 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
293 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed char), altivec, i8)
294 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed short), altivec, i16)
295 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed int), altivec, i32)
296 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8)
297 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16)
298 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), altivec, u32)
299 #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
300 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64)
301 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64)
302 #endif
303 #endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
304
/* Same conversion helpers, but for simde__m128d. */
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int8x16_t, neon, i8)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int16x8_t, neon, i16)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int32x4_t, neon, i32)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int64x2_t, neon, i64)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint8x16_t, neon, u8)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint16x8_t, neon, u16)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint32x4_t, neon, u32)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint64x2_t, neon, u64)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, float32x4_t, neon, f32)
  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, float64x2_t, neon, f64)
  #endif
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed char), altivec, i8)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed short), altivec, i16)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed int), altivec, i32)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), altivec, u32)
  #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64)
    #if defined(SIMDE_BUG_GCC_95782)
      /* GCC bug 95782 breaks the generated f64 conversions, so spell the
       * two helpers out by hand; they are otherwise identical in behavior. */
      SIMDE_FUNCTION_ATTRIBUTES
      SIMDE_POWER_ALTIVEC_VECTOR(double)
      simde__m128d_to_altivec_f64(simde__m128d value) {
        simde__m128d_private r_ = simde__m128d_to_private(value);
        return r_.altivec_f64;
      }

      SIMDE_FUNCTION_ATTRIBUTES
      simde__m128d
      simde__m128d_from_altivec_f64(SIMDE_POWER_ALTIVEC_VECTOR(double) value) {
        simde__m128d_private r_;
        r_.altivec_f64 = value;
        return simde__m128d_from_private(r_);
      }
    #else
      SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(double), altivec, f64)
    #endif
  #endif
#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
348
349 SIMDE_FUNCTION_ATTRIBUTES
350 simde__m128i
351 simde_mm_add_epi8 (simde__m128i a, simde__m128i b) {
352 #if defined(SIMDE_X86_SSE2_NATIVE)
353 return _mm_add_epi8(a, b);
354 #else
355 simde__m128i_private
356 r_,
357 a_ = simde__m128i_to_private(a),
358 b_ = simde__m128i_to_private(b);
359
360 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
361 r_.neon_i8 = vaddq_s8(a_.neon_i8, b_.neon_i8);
362 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
363 r_.altivec_i8 = vec_add(a_.altivec_i8, b_.altivec_i8);
364 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
365 r_.i8 = a_.i8 + b_.i8;
366 #else
367 SIMDE_VECTORIZE
368 for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
369 r_.i8[i] = a_.i8[i] + b_.i8[i];
370 }
371 #endif
372
373 return simde__m128i_from_private(r_);
374 #endif
375 }
376 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
377 # define _mm_add_epi8(a, b) simde_mm_add_epi8(a, b)
378 #endif
379
380 SIMDE_FUNCTION_ATTRIBUTES
381 simde__m128i
simde_mm_add_epi16(simde__m128i a,simde__m128i b)382 simde_mm_add_epi16 (simde__m128i a, simde__m128i b) {
383 #if defined(SIMDE_X86_SSE2_NATIVE)
384 return _mm_add_epi16(a, b);
385 #else
386 simde__m128i_private
387 r_,
388 a_ = simde__m128i_to_private(a),
389 b_ = simde__m128i_to_private(b);
390
391 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
392 r_.neon_i16 = vaddq_s16(a_.neon_i16, b_.neon_i16);
393 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
394 r_.altivec_i16 = vec_add(a_.altivec_i16, b_.altivec_i16);
395 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
396 r_.i16 = a_.i16 + b_.i16;
397 #else
398 SIMDE_VECTORIZE
399 for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
400 r_.i16[i] = a_.i16[i] + b_.i16[i];
401 }
402 #endif
403
404 return simde__m128i_from_private(r_);
405 #endif
406 }
407 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
408 # define _mm_add_epi16(a, b) simde_mm_add_epi16(a, b)
409 #endif
410
411 SIMDE_FUNCTION_ATTRIBUTES
412 simde__m128i
simde_mm_add_epi32(simde__m128i a,simde__m128i b)413 simde_mm_add_epi32 (simde__m128i a, simde__m128i b) {
414 #if defined(SIMDE_X86_SSE2_NATIVE)
415 return _mm_add_epi32(a, b);
416 #else
417 simde__m128i_private
418 r_,
419 a_ = simde__m128i_to_private(a),
420 b_ = simde__m128i_to_private(b);
421
422 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
423 r_.neon_i32 = vaddq_s32(a_.neon_i32, b_.neon_i32);
424 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
425 r_.altivec_i32 = vec_add(a_.altivec_i32, b_.altivec_i32);
426 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
427 r_.i32 = a_.i32 + b_.i32;
428 #else
429 SIMDE_VECTORIZE
430 for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
431 r_.i32[i] = a_.i32[i] + b_.i32[i];
432 }
433 #endif
434
435 return simde__m128i_from_private(r_);
436 #endif
437 }
438 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
439 # define _mm_add_epi32(a, b) simde_mm_add_epi32(a, b)
440 #endif
441
442 SIMDE_FUNCTION_ATTRIBUTES
443 simde__m128i
simde_mm_add_epi64(simde__m128i a,simde__m128i b)444 simde_mm_add_epi64 (simde__m128i a, simde__m128i b) {
445 #if defined(SIMDE_X86_SSE2_NATIVE)
446 return _mm_add_epi64(a, b);
447 #else
448 simde__m128i_private
449 r_,
450 a_ = simde__m128i_to_private(a),
451 b_ = simde__m128i_to_private(b);
452
453 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
454 r_.neon_i64 = vaddq_s64(a_.neon_i64, b_.neon_i64);
455 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
456 r_.altivec_i64 = vec_add(a_.altivec_i64, b_.altivec_i64);
457 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
458 r_.i64 = a_.i64 + b_.i64;
459 #else
460 SIMDE_VECTORIZE
461 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
462 r_.i64[i] = a_.i64[i] + b_.i64[i];
463 }
464 #endif
465
466 return simde__m128i_from_private(r_);
467 #endif
468 }
469 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
470 # define _mm_add_epi64(a, b) simde_mm_add_epi64(a, b)
471 #endif
472
473 SIMDE_FUNCTION_ATTRIBUTES
474 simde__m128d
simde_mm_add_pd(simde__m128d a,simde__m128d b)475 simde_mm_add_pd (simde__m128d a, simde__m128d b) {
476 #if defined(SIMDE_X86_SSE2_NATIVE)
477 return _mm_add_pd(a, b);
478 #else
479 simde__m128d_private
480 r_,
481 a_ = simde__m128d_to_private(a),
482 b_ = simde__m128d_to_private(b);
483
484 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
485 r_.neon_f64 = vaddq_f64(a_.neon_f64, b_.neon_f64);
486 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
487 r_.wasm_v128 = wasm_f64x2_add(a_.wasm_v128, b_.wasm_v128);
488 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
489 r_.altivec_f64 = vec_add(a_.altivec_f64, b_.altivec_f64);
490 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
491 r_.f64 = a_.f64 + b_.f64;
492 #else
493 SIMDE_VECTORIZE
494 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
495 r_.f64[i] = a_.f64[i] + b_.f64[i];
496 }
497 #endif
498
499 return simde__m128d_from_private(r_);
500 #endif
501 }
502 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
503 # define _mm_add_pd(a, b) simde_mm_add_pd(a, b)
504 #endif
505
506 SIMDE_FUNCTION_ATTRIBUTES
507 simde__m128d
simde_mm_move_sd(simde__m128d a,simde__m128d b)508 simde_mm_move_sd (simde__m128d a, simde__m128d b) {
509 #if defined(SIMDE_X86_SSE2_NATIVE)
510 return _mm_move_sd(a, b);
511 #else
512 simde__m128d_private
513 r_,
514 a_ = simde__m128d_to_private(a),
515 b_ = simde__m128d_to_private(b);
516
517 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
518 r_.neon_f64 = vsetq_lane_f64(vgetq_lane_f64(b_.neon_f64, 0), a_.neon_f64, 0);
519 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
520 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) m = {
521 16, 17, 18, 19, 20, 21, 22, 23,
522 8, 9, 10, 11, 12, 13, 14, 15
523 };
524 r_.altivec_f64 = vec_perm(a_.altivec_f64, b_.altivec_f64, m);
525 #elif defined(SIMDE_SHUFFLE_VECTOR_)
526 r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 2, 1);
527 #else
528 r_.f64[0] = b_.f64[0];
529 r_.f64[1] = a_.f64[1];
530 #endif
531
532 return simde__m128d_from_private(r_);
533 #endif
534 }
535 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
536 # define _mm_move_sd(a, b) simde_mm_move_sd(a, b)
537 #endif
538
539 SIMDE_FUNCTION_ATTRIBUTES
540 simde__m128d
simde_mm_add_sd(simde__m128d a,simde__m128d b)541 simde_mm_add_sd (simde__m128d a, simde__m128d b) {
542 #if defined(SIMDE_X86_SSE2_NATIVE)
543 return _mm_add_sd(a, b);
544 #else
545 simde__m128d_private
546 r_,
547 a_ = simde__m128d_to_private(a),
548 b_ = simde__m128d_to_private(b);
549
550 r_.f64[0] = a_.f64[0] + b_.f64[0];
551 r_.f64[1] = a_.f64[1];
552
553 #if defined(SIMDE_ASSUME_VECTORIZATION)
554 return simde_mm_move_sd(a, simde_mm_add_pd(a, b));
555 #else
556 r_.f64[0] = a_.f64[0] + b_.f64[0];
557 r_.f64[1] = a_.f64[1];
558 #endif
559
560 return simde__m128d_from_private(r_);
561 #endif
562 }
563 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
564 # define _mm_add_sd(a, b) simde_mm_add_sd(a, b)
565 #endif
566
567 SIMDE_FUNCTION_ATTRIBUTES
568 simde__m64
simde_mm_add_si64(simde__m64 a,simde__m64 b)569 simde_mm_add_si64 (simde__m64 a, simde__m64 b) {
570 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
571 return _mm_add_si64(a, b);
572 #else
573 simde__m64_private
574 r_,
575 a_ = simde__m64_to_private(a),
576 b_ = simde__m64_to_private(b);
577
578 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
579 r_.neon_i64 = vadd_s64(a_.neon_i64, b_.neon_i64);
580 #else
581 r_.i64[0] = a_.i64[0] + b_.i64[0];
582 #endif
583
584 return simde__m64_from_private(r_);
585 #endif
586 }
587 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
588 # define _mm_add_si64(a, b) simde_mm_add_si64(a, b)
589 #endif
590
591 SIMDE_FUNCTION_ATTRIBUTES
592 simde__m128i
simde_mm_adds_epi8(simde__m128i a,simde__m128i b)593 simde_mm_adds_epi8 (simde__m128i a, simde__m128i b) {
594 #if defined(SIMDE_X86_SSE2_NATIVE)
595 return _mm_adds_epi8(a, b);
596 #else
597 simde__m128i_private
598 r_,
599 a_ = simde__m128i_to_private(a),
600 b_ = simde__m128i_to_private(b);
601
602 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
603 r_.neon_i8 = vqaddq_s8(a_.neon_i8, b_.neon_i8);
604 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
605 r_.altivec_i8 = vec_adds(a_.altivec_i8, b_.altivec_i8);
606 #else
607 SIMDE_VECTORIZE
608 for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
609 const int32_t tmp =
610 HEDLEY_STATIC_CAST(int16_t, a_.i8[i]) +
611 HEDLEY_STATIC_CAST(int16_t, b_.i8[i]);
612 r_.i8[i] = HEDLEY_STATIC_CAST(int8_t, ((tmp < INT8_MAX) ? ((tmp > INT8_MIN) ? tmp : INT8_MIN) : INT8_MAX));
613 }
614 #endif
615
616 return simde__m128i_from_private(r_);
617 #endif
618 }
619 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
620 # define _mm_adds_epi8(a, b) simde_mm_adds_epi8(a, b)
621 #endif
622
623 SIMDE_FUNCTION_ATTRIBUTES
624 simde__m128i
simde_mm_adds_epi16(simde__m128i a,simde__m128i b)625 simde_mm_adds_epi16 (simde__m128i a, simde__m128i b) {
626 #if defined(SIMDE_X86_SSE2_NATIVE)
627 return _mm_adds_epi16(a, b);
628 #else
629 simde__m128i_private
630 r_,
631 a_ = simde__m128i_to_private(a),
632 b_ = simde__m128i_to_private(b);
633
634
635 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
636 r_.neon_i16 = vqaddq_s16(a_.neon_i16, b_.neon_i16);
637 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
638 r_.altivec_i16 = vec_adds(a_.altivec_i16, b_.altivec_i16);
639 #else
640 SIMDE_VECTORIZE
641 for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
642 const int32_t tmp =
643 HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) +
644 HEDLEY_STATIC_CAST(int32_t, b_.i16[i]);
645 r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, ((tmp < INT16_MAX) ? ((tmp > INT16_MIN) ? tmp : INT16_MIN) : INT16_MAX));
646 }
647 #endif
648
649 return simde__m128i_from_private(r_);
650 #endif
651 }
652 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
653 # define _mm_adds_epi16(a, b) simde_mm_adds_epi16(a, b)
654 #endif
655
656 SIMDE_FUNCTION_ATTRIBUTES
657 simde__m128i
simde_mm_adds_epu8(simde__m128i a,simde__m128i b)658 simde_mm_adds_epu8 (simde__m128i a, simde__m128i b) {
659 #if defined(SIMDE_X86_SSE2_NATIVE)
660 return _mm_adds_epu8(a, b);
661 #else
662 simde__m128i_private
663 r_,
664 a_ = simde__m128i_to_private(a),
665 b_ = simde__m128i_to_private(b);
666
667 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
668 r_.neon_u8 = vqaddq_u8(a_.neon_u8, b_.neon_u8);
669 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
670 r_.altivec_u8 = vec_adds(a_.altivec_u8, b_.altivec_u8);
671 #else
672 SIMDE_VECTORIZE
673 for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
674 r_.u8[i] = ((UINT8_MAX - a_.u8[i]) > b_.u8[i]) ? (a_.u8[i] + b_.u8[i]) : UINT8_MAX;
675 }
676 #endif
677
678 return simde__m128i_from_private(r_);
679 #endif
680 }
681 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
682 # define _mm_adds_epu8(a, b) simde_mm_adds_epu8(a, b)
683 #endif
684
685 SIMDE_FUNCTION_ATTRIBUTES
686 simde__m128i
simde_mm_adds_epu16(simde__m128i a,simde__m128i b)687 simde_mm_adds_epu16 (simde__m128i a, simde__m128i b) {
688 #if defined(SIMDE_X86_SSE2_NATIVE)
689 return _mm_adds_epu16(a, b);
690 #else
691 simde__m128i_private
692 r_,
693 a_ = simde__m128i_to_private(a),
694 b_ = simde__m128i_to_private(b);
695
696 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
697 r_.neon_u16 = vqaddq_u16(a_.neon_u16, b_.neon_u16);
698 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
699 r_.altivec_u16 = vec_adds(a_.altivec_u16, b_.altivec_u16);
700 #else
701 SIMDE_VECTORIZE
702 for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
703 r_.u16[i] = ((UINT16_MAX - a_.u16[i]) > b_.u16[i]) ? (a_.u16[i] + b_.u16[i]) : UINT16_MAX;
704 }
705 #endif
706
707 return simde__m128i_from_private(r_);
708 #endif
709 }
710 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
711 # define _mm_adds_epu16(a, b) simde_mm_adds_epu16(a, b)
712 #endif
713
714 SIMDE_FUNCTION_ATTRIBUTES
715 simde__m128d
simde_mm_and_pd(simde__m128d a,simde__m128d b)716 simde_mm_and_pd (simde__m128d a, simde__m128d b) {
717 #if defined(SIMDE_X86_SSE2_NATIVE)
718 return _mm_and_pd(a, b);
719 #else
720 simde__m128d_private
721 r_,
722 a_ = simde__m128d_to_private(a),
723 b_ = simde__m128d_to_private(b);
724
725 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
726 r_.neon_i32 = vandq_s32(a_.neon_i32, b_.neon_i32);
727 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
728 r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128);
729 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
730 r_.altivec_f64 = vec_and(a_.altivec_f64, b_.altivec_f64);
731 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
732 r_.i32f = a_.i32f & b_.i32f;
733 #else
734 SIMDE_VECTORIZE
735 for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
736 r_.i32f[i] = a_.i32f[i] & b_.i32f[i];
737 }
738 #endif
739
740 return simde__m128d_from_private(r_);
741 #endif
742 }
743 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
744 # define _mm_and_pd(a, b) simde_mm_and_pd(a, b)
745 #endif
746
747 SIMDE_FUNCTION_ATTRIBUTES
748 simde__m128i
simde_mm_and_si128(simde__m128i a,simde__m128i b)749 simde_mm_and_si128 (simde__m128i a, simde__m128i b) {
750 #if defined(SIMDE_X86_SSE2_NATIVE)
751 return _mm_and_si128(a, b);
752 #else
753 simde__m128i_private
754 r_,
755 a_ = simde__m128i_to_private(a),
756 b_ = simde__m128i_to_private(b);
757
758 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
759 r_.neon_i32 = vandq_s32(b_.neon_i32, a_.neon_i32);
760 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
761 r_.altivec_u32f = vec_and(a_.altivec_u32f, b_.altivec_u32f);
762 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
763 r_.i32f = a_.i32f & b_.i32f;
764 #else
765 SIMDE_VECTORIZE
766 for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
767 r_.i32f[i] = a_.i32f[i] & b_.i32f[i];
768 }
769 #endif
770
771 return simde__m128i_from_private(r_);
772 #endif
773 }
774 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
775 # define _mm_and_si128(a, b) simde_mm_and_si128(a, b)
776 #endif
777
778 SIMDE_FUNCTION_ATTRIBUTES
779 simde__m128d
simde_mm_andnot_pd(simde__m128d a,simde__m128d b)780 simde_mm_andnot_pd (simde__m128d a, simde__m128d b) {
781 #if defined(SIMDE_X86_SSE2_NATIVE)
782 return _mm_andnot_pd(a, b);
783 #else
784 simde__m128d_private
785 r_,
786 a_ = simde__m128d_to_private(a),
787 b_ = simde__m128d_to_private(b);
788
789 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
790 r_.neon_i32 = vbicq_s32(a_.neon_i32, b_.neon_i32);
791 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
792 r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128);
793 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
794 r_.altivec_i32f = vec_andc(a_.altivec_i32f, b_.altivec_i32f);
795 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
796 r_.i32f = ~a_.i32f & b_.i32f;
797 #else
798 SIMDE_VECTORIZE
799 for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
800 r_.u64[i] = ~a_.u64[i] & b_.u64[i];
801 }
802 #endif
803
804 return simde__m128d_from_private(r_);
805 #endif
806 }
807 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
808 # define _mm_andnot_pd(a, b) simde_mm_andnot_pd(a, b)
809 #endif
810
811 SIMDE_FUNCTION_ATTRIBUTES
812 simde__m128i
simde_mm_andnot_si128(simde__m128i a,simde__m128i b)813 simde_mm_andnot_si128 (simde__m128i a, simde__m128i b) {
814 #if defined(SIMDE_X86_SSE2_NATIVE)
815 return _mm_andnot_si128(a, b);
816 #else
817 simde__m128i_private
818 r_,
819 a_ = simde__m128i_to_private(a),
820 b_ = simde__m128i_to_private(b);
821
822 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
823 r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32);
824 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
825 r_.altivec_i32 = vec_andc(b_.altivec_i32, a_.altivec_i32);
826 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
827 r_.i32f = ~a_.i32f & b_.i32f;
828 #else
829 SIMDE_VECTORIZE
830 for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
831 r_.i32f[i] = ~(a_.i32f[i]) & b_.i32f[i];
832 }
833 #endif
834
835 return simde__m128i_from_private(r_);
836 #endif
837 }
838 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
839 # define _mm_andnot_si128(a, b) simde_mm_andnot_si128(a, b)
840 #endif
841
842 SIMDE_FUNCTION_ATTRIBUTES
843 simde__m128i
simde_mm_avg_epu8(simde__m128i a,simde__m128i b)844 simde_mm_avg_epu8 (simde__m128i a, simde__m128i b) {
845 #if defined(SIMDE_X86_SSE2_NATIVE)
846 return _mm_avg_epu8(a, b);
847 #else
848 simde__m128i_private
849 r_,
850 a_ = simde__m128i_to_private(a),
851 b_ = simde__m128i_to_private(b);
852
853 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
854 r_.neon_u8 = vrhaddq_u8(b_.neon_u8, a_.neon_u8);
855 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
856 r_.altivec_u8 = vec_avg(a_.altivec_u8, b_.altivec_u8);
857 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_)
858 uint16_t wa SIMDE_VECTOR(32);
859 uint16_t wb SIMDE_VECTOR(32);
860 uint16_t wr SIMDE_VECTOR(32);
861 SIMDE_CONVERT_VECTOR_(wa, a_.u8);
862 SIMDE_CONVERT_VECTOR_(wb, b_.u8);
863 wr = (wa + wb + 1) >> 1;
864 SIMDE_CONVERT_VECTOR_(r_.u8, wr);
865 #else
866 SIMDE_VECTORIZE
867 for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
868 r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1;
869 }
870 #endif
871
872 return simde__m128i_from_private(r_);
873 #endif
874 }
875 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
876 # define _mm_avg_epu8(a, b) simde_mm_avg_epu8(a, b)
877 #endif
878
879 SIMDE_FUNCTION_ATTRIBUTES
880 simde__m128i
simde_mm_avg_epu16(simde__m128i a,simde__m128i b)881 simde_mm_avg_epu16 (simde__m128i a, simde__m128i b) {
882 #if defined(SIMDE_X86_SSE2_NATIVE)
883 return _mm_avg_epu16(a, b);
884 #else
885 simde__m128i_private
886 r_,
887 a_ = simde__m128i_to_private(a),
888 b_ = simde__m128i_to_private(b);
889
890 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
891 r_.neon_u16 = vrhaddq_u16(b_.neon_u16, a_.neon_u16);
892 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
893 r_.altivec_u16 = vec_avg(a_.altivec_u16, b_.altivec_u16);
894 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_)
895 uint32_t wa SIMDE_VECTOR(32);
896 uint32_t wb SIMDE_VECTOR(32);
897 uint32_t wr SIMDE_VECTOR(32);
898 SIMDE_CONVERT_VECTOR_(wa, a_.u16);
899 SIMDE_CONVERT_VECTOR_(wb, b_.u16);
900 wr = (wa + wb + 1) >> 1;
901 SIMDE_CONVERT_VECTOR_(r_.u16, wr);
902 #else
903 SIMDE_VECTORIZE
904 for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
905 r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1;
906 }
907 #endif
908
909 return simde__m128i_from_private(r_);
910 #endif
911 }
912 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
913 # define _mm_avg_epu16(a, b) simde_mm_avg_epu16(a, b)
914 #endif
915
916 SIMDE_FUNCTION_ATTRIBUTES
917 simde__m128i
simde_mm_setzero_si128(void)918 simde_mm_setzero_si128 (void) {
919 #if defined(SIMDE_X86_SSE2_NATIVE)
920 return _mm_setzero_si128();
921 #else
922 simde__m128i_private r_;
923
924 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
925 r_.neon_i32 = vdupq_n_s32(0);
926 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
927 r_.altivec_i32 = vec_splats(HEDLEY_STATIC_CAST(signed int, 0));
928 #elif defined(SIMDE_VECTOR_SUBSCRIPT)
929 r_.i32 = __extension__ (__typeof__(r_.i32)) { 0, 0, 0, 0 };
930 #else
931 SIMDE_VECTORIZE
932 for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
933 r_.i32f[i] = 0;
934 }
935 #endif
936
937 return simde__m128i_from_private(r_);
938 #endif
939 }
940 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
941 # define _mm_setzero_si128() (simde_mm_setzero_si128())
942 #endif
943
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_bslli_si128 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  /* Byte-wise left shift of the whole 128-bit lane by imm8 bytes
   * (_mm_bslli_si128 / _mm_slli_si128), zero-filling from the right. */
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  /* Shifting in 16 or more bytes produces all zeros, matching the
   * hardware behavior for imm8 > 15. */
  if (HEDLEY_UNLIKELY((imm8 & ~15))) {
    return simde_mm_setzero_si128();
  }

#if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) && defined(SIMDE_ENDIAN_ORDER)
  /* vec_slo/vec_sro shift by octets (shift count in bits 121:124, hence
   * imm8 * 8); the direction flips with the machine's endianness. */
  r_.altivec_i8 =
#if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
    vec_slo
#else /* SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG */
    vec_sro
#endif
      (a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, imm8 * 8)));
#elif defined(SIMDE_HAVE_INT128_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) && 0
  /* NOTE(review): dead code — the trailing "&& 0" disables this branch,
   * and it references an undefined identifier `s`; left as-is. */
  r_.u128[0] = a_.u128[0] << s;
#else
  /* Portable fallback: start from all zeros, then copy each surviving
   * byte imm8 positions higher. */
  r_ = simde__m128i_to_private(simde_mm_setzero_si128());
  for (int i = imm8 ; i < HEDLEY_STATIC_CAST(int, sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
    r_.i8[i] = a_.i8[i - imm8];
  }
#endif

  return simde__m128i_from_private(r_);
}
/* Compile-time overrides of the function above when imm8 is a literal. */
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
#  define simde_mm_bslli_si128(a, imm8) _mm_slli_si128(a, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__)
#  define simde_mm_bslli_si128(a, imm8) \
  simde__m128i_from_neon_i8(((imm8) <= 0) ? simde__m128i_to_neon_i8(a) : (((imm8) > 15) ? (vdupq_n_s8(0)) : (vextq_s8(vdupq_n_s8(0), simde__m128i_to_neon_i8(a), 16 - (imm8)))))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
/* Shuffle a zero vector (indices 0-15) against `a` (indices 16-31);
 * index (16 - imm8 + k) & 31 selects either a zero byte or the shifted
 * source byte for result position k. */
#define simde_mm_bslli_si128(a, imm8) (__extension__ ({ \
    const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
    const simde__m128i_private simde__tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
    simde__m128i_private simde__tmp_r_; \
    if (HEDLEY_UNLIKELY(imm8 > 15)) { \
      simde__tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
    } else { \
      simde__tmp_r_.i8 = \
        SIMDE_SHUFFLE_VECTOR_(8, 16, \
          simde__tmp_z_.i8, \
          (simde__tmp_a_).i8, \
          HEDLEY_STATIC_CAST(int8_t, (16 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (17 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (18 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (19 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (20 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (21 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (22 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (23 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (24 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (25 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (26 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (27 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (28 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (29 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (30 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (31 - imm8) & 31)); \
    } \
    simde__m128i_from_private(simde__tmp_r_); }))
#endif
#define simde_mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_bslli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
#  define _mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
#endif
1016
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_bsrli_si128 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  /* Byte-wise right shift of the whole 128-bit lane by imm8 bytes
   * (_mm_bsrli_si128 / _mm_srli_si128), zero-filling from the left. */
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  /* Shifting out 16 or more bytes produces all zeros, matching the
   * hardware behavior for imm8 > 15. */
  if (HEDLEY_UNLIKELY((imm8 & ~15))) {
    return simde_mm_setzero_si128();
  }

#if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) && defined(SIMDE_ENDIAN_ORDER)
  /* vec_sro/vec_slo shift by octets; direction flips with endianness
   * (mirror image of bslli above). */
  r_.altivec_i8 =
#if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
    vec_sro
#else /* SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG */
    vec_slo
#endif
      (a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, imm8 * 8)));
#else
  /* Portable fallback: byte i comes from source byte i + imm8, or zero
   * once that index runs off the end. */
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
    const int e = HEDLEY_STATIC_CAST(int, i) + imm8;
    r_.i8[i] = (e < 16) ? a_.i8[e] : 0;
  }
#endif

  return simde__m128i_from_private(r_);
}
/* Compile-time overrides of the function above when imm8 is a literal. */
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
#  define simde_mm_bsrli_si128(a, imm8) _mm_srli_si128(a, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__)
#  define simde_mm_bsrli_si128(a, imm8) \
  simde__m128i_from_neon_i8(((imm8 < 0) || (imm8 > 15)) ? vdupq_n_s8(0) : (vextq_s8(simde__m128i_to_private(a).neon_i8, vdupq_n_s8(0), ((imm8 & 15) != 0) ? imm8 : (imm8 & 15))))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
/* Shuffle a zero vector (indices 0-15) against `a` (indices 16-31);
 * index (imm8 + 16 + k) & 31 selects the shifted source byte or a zero
 * byte for result position k. */
#define simde_mm_bsrli_si128(a, imm8) (__extension__ ({ \
    const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
    const simde__m128i_private simde__tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
    simde__m128i_private simde__tmp_r_ = simde__m128i_to_private(a); \
    if (HEDLEY_UNLIKELY(imm8 > 15)) { \
      simde__tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
    } else { \
      simde__tmp_r_.i8 = \
        SIMDE_SHUFFLE_VECTOR_(8, 16, \
          simde__tmp_z_.i8, \
          (simde__tmp_a_).i8, \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 16) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 17) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 18) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 19) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 20) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 21) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 22) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 23) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 24) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 25) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 26) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 27) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 28) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 29) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 30) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (imm8 + 31) & 31)); \
    } \
    simde__m128i_from_private(simde__tmp_r_); }))
#endif
#define simde_mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_bsrli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
#  define _mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
#endif
1088
1089 SIMDE_FUNCTION_ATTRIBUTES
1090 void
simde_mm_clflush(void const * p)1091 simde_mm_clflush (void const* p) {
1092 #if defined(SIMDE_X86_SSE2_NATIVE)
1093 _mm_clflush(p);
1094 #else
1095 (void) p;
1096 #endif
1097 }
1098 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1099 # define _mm_clflush(a, b) simde_mm_clflush()
1100 #endif
1101
1102 SIMDE_FUNCTION_ATTRIBUTES
1103 int
simde_mm_comieq_sd(simde__m128d a,simde__m128d b)1104 simde_mm_comieq_sd (simde__m128d a, simde__m128d b) {
1105 #if defined(SIMDE_X86_SSE2_NATIVE)
1106 return _mm_comieq_sd(a, b);
1107 #else
1108 simde__m128d_private
1109 a_ = simde__m128d_to_private(a),
1110 b_ = simde__m128d_to_private(b);
1111 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1112 return !!vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0);
1113 #else
1114 return a_.f64[0] == b_.f64[0];
1115 #endif
1116 #endif
1117 }
1118 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1119 # define _mm_comieq_sd(a, b) simde_mm_comieq_sd(a, b)
1120 #endif
1121
1122 SIMDE_FUNCTION_ATTRIBUTES
1123 int
simde_mm_comige_sd(simde__m128d a,simde__m128d b)1124 simde_mm_comige_sd (simde__m128d a, simde__m128d b) {
1125 #if defined(SIMDE_X86_SSE2_NATIVE)
1126 return _mm_comige_sd(a, b);
1127 #else
1128 simde__m128d_private
1129 a_ = simde__m128d_to_private(a),
1130 b_ = simde__m128d_to_private(b);
1131 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1132 return !!vgetq_lane_u64(vcgeq_f64(a_.neon_f64, b_.neon_f64), 0);
1133 #else
1134 return a_.f64[0] >= b_.f64[0];
1135 #endif
1136 #endif
1137 }
1138 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1139 # define _mm_comige_sd(a, b) simde_mm_comige_sd(a, b)
1140 #endif
1141
1142 SIMDE_FUNCTION_ATTRIBUTES
1143 int
simde_mm_comigt_sd(simde__m128d a,simde__m128d b)1144 simde_mm_comigt_sd (simde__m128d a, simde__m128d b) {
1145 #if defined(SIMDE_X86_SSE2_NATIVE)
1146 return _mm_comigt_sd(a, b);
1147 #else
1148 simde__m128d_private
1149 a_ = simde__m128d_to_private(a),
1150 b_ = simde__m128d_to_private(b);
1151 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1152 return !!vgetq_lane_u64(vcgtq_f64(a_.neon_f64, b_.neon_f64), 0);
1153 #else
1154 return a_.f64[0] > b_.f64[0];
1155 #endif
1156 #endif
1157 }
1158 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1159 # define _mm_comigt_sd(a, b) simde_mm_comigt_sd(a, b)
1160 #endif
1161
1162 SIMDE_FUNCTION_ATTRIBUTES
1163 int
simde_mm_comile_sd(simde__m128d a,simde__m128d b)1164 simde_mm_comile_sd (simde__m128d a, simde__m128d b) {
1165 #if defined(SIMDE_X86_SSE2_NATIVE)
1166 return _mm_comile_sd(a, b);
1167 #else
1168 simde__m128d_private
1169 a_ = simde__m128d_to_private(a),
1170 b_ = simde__m128d_to_private(b);
1171 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1172 return !!vgetq_lane_u64(vcleq_f64(a_.neon_f64, b_.neon_f64), 0);
1173 #else
1174 return a_.f64[0] <= b_.f64[0];
1175 #endif
1176 #endif
1177 }
1178 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1179 # define _mm_comile_sd(a, b) simde_mm_comile_sd(a, b)
1180 #endif
1181
1182 SIMDE_FUNCTION_ATTRIBUTES
1183 int
simde_mm_comilt_sd(simde__m128d a,simde__m128d b)1184 simde_mm_comilt_sd (simde__m128d a, simde__m128d b) {
1185 #if defined(SIMDE_X86_SSE2_NATIVE)
1186 return _mm_comilt_sd(a, b);
1187 #else
1188 simde__m128d_private
1189 a_ = simde__m128d_to_private(a),
1190 b_ = simde__m128d_to_private(b);
1191 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1192 return !!vgetq_lane_u64(vcltq_f64(a_.neon_f64, b_.neon_f64), 0);
1193 #else
1194 return a_.f64[0] < b_.f64[0];
1195 #endif
1196 #endif
1197 }
1198 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1199 # define _mm_comilt_sd(a, b) simde_mm_comilt_sd(a, b)
1200 #endif
1201
1202 SIMDE_FUNCTION_ATTRIBUTES
1203 int
simde_mm_comineq_sd(simde__m128d a,simde__m128d b)1204 simde_mm_comineq_sd (simde__m128d a, simde__m128d b) {
1205 #if defined(SIMDE_X86_SSE2_NATIVE)
1206 return _mm_comineq_sd(a, b);
1207 #else
1208 simde__m128d_private
1209 a_ = simde__m128d_to_private(a),
1210 b_ = simde__m128d_to_private(b);
1211 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1212 return !vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0);
1213 #else
1214 return a_.f64[0] != b_.f64[0];
1215 #endif
1216 #endif
1217 }
1218 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1219 # define _mm_comineq_sd(a, b) simde_mm_comineq_sd(a, b)
1220 #endif
1221
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_x_mm_copysign_pd(simde__m128d dest, simde__m128d src) {
  /* SIMDe-internal helper (x_ prefix, no SSE2 equivalent): lane-wise
   * copysign — each result lane has the magnitude of `dest` and the
   * sign of `src`. */
  simde__m128d_private
    r_,
    dest_ = simde__m128d_to_private(dest),
    src_ = simde__m128d_to_private(src);

#if defined(simde_math_copysign)
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
    r_.f64[i] = simde_math_copysign(dest_.f64[i], src_.f64[i]);
  }
#else
  /* Bit-twiddling fallback: build a sign-bit mask (0.0 ^ -0.0), take
   * src's sign bit and dest's magnitude bits.  Note this path returns
   * directly, so the final return below is unreachable (and dest_/src_
   * unused) in this configuration. */
  simde__m128d sgnbit = simde_mm_xor_pd(simde_mm_set1_pd(SIMDE_FLOAT64_C(0.0)), simde_mm_set1_pd(-SIMDE_FLOAT64_C(0.0)));
  return simde_mm_xor_pd(simde_mm_and_pd(sgnbit, src), simde_mm_andnot_pd(sgnbit, dest));
#endif

  return simde__m128d_from_private(r_);
}
1242
1243 SIMDE_FUNCTION_ATTRIBUTES
1244 simde__m128
simde_mm_castpd_ps(simde__m128d a)1245 simde_mm_castpd_ps (simde__m128d a) {
1246 #if defined(SIMDE_X86_SSE2_NATIVE)
1247 return _mm_castpd_ps(a);
1248 #else
1249 simde__m128 r;
1250 simde_memcpy(&r, &a, sizeof(a));
1251 return r;
1252 #endif
1253 }
1254 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1255 # define _mm_castpd_ps(a) simde_mm_castpd_ps(a)
1256 #endif
1257
1258 SIMDE_FUNCTION_ATTRIBUTES
1259 simde__m128i
simde_mm_castpd_si128(simde__m128d a)1260 simde_mm_castpd_si128 (simde__m128d a) {
1261 #if defined(SIMDE_X86_SSE2_NATIVE)
1262 return _mm_castpd_si128(a);
1263 #else
1264 simde__m128i r;
1265 simde_memcpy(&r, &a, sizeof(a));
1266 return r;
1267 #endif
1268 }
1269 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1270 # define _mm_castpd_si128(a) simde_mm_castpd_si128(a)
1271 #endif
1272
1273 SIMDE_FUNCTION_ATTRIBUTES
1274 simde__m128d
simde_mm_castps_pd(simde__m128 a)1275 simde_mm_castps_pd (simde__m128 a) {
1276 #if defined(SIMDE_X86_SSE2_NATIVE)
1277 return _mm_castps_pd(a);
1278 #else
1279 simde__m128d r;
1280 simde_memcpy(&r, &a, sizeof(a));
1281 return r;
1282 #endif
1283 }
1284 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1285 # define _mm_castps_pd(a) simde_mm_castps_pd(a)
1286 #endif
1287
1288 SIMDE_FUNCTION_ATTRIBUTES
1289 simde__m128i
simde_mm_castps_si128(simde__m128 a)1290 simde_mm_castps_si128 (simde__m128 a) {
1291 #if defined(SIMDE_X86_SSE2_NATIVE)
1292 return _mm_castps_si128(a);
1293 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1294 return simde__m128i_from_neon_i32(simde__m128_to_private(a).neon_i32);
1295 #else
1296 simde__m128i r;
1297 simde_memcpy(&r, &a, sizeof(a));
1298 return r;
1299 #endif
1300 }
1301 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1302 # define _mm_castps_si128(a) simde_mm_castps_si128(a)
1303 #endif
1304
1305 SIMDE_FUNCTION_ATTRIBUTES
1306 simde__m128d
simde_mm_castsi128_pd(simde__m128i a)1307 simde_mm_castsi128_pd (simde__m128i a) {
1308 #if defined(SIMDE_X86_SSE2_NATIVE)
1309 return _mm_castsi128_pd(a);
1310 #else
1311 simde__m128d r;
1312 simde_memcpy(&r, &a, sizeof(a));
1313 return r;
1314 #endif
1315 }
1316 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1317 # define _mm_castsi128_pd(a) simde_mm_castsi128_pd(a)
1318 #endif
1319
1320 SIMDE_FUNCTION_ATTRIBUTES
1321 simde__m128
simde_mm_castsi128_ps(simde__m128i a)1322 simde_mm_castsi128_ps (simde__m128i a) {
1323 #if defined(SIMDE_X86_SSE2_NATIVE)
1324 return _mm_castsi128_ps(a);
1325 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1326 return HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), a);
1327 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1328 return simde__m128_from_neon_i32(simde__m128i_to_private(a).neon_i32);
1329 #else
1330 simde__m128 r;
1331 simde_memcpy(&r, &a, sizeof(a));
1332 return r;
1333 #endif
1334 }
1335 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1336 # define _mm_castsi128_ps(a) simde_mm_castsi128_ps(a)
1337 #endif
1338
1339 SIMDE_FUNCTION_ATTRIBUTES
1340 simde__m128i
simde_mm_cmpeq_epi8(simde__m128i a,simde__m128i b)1341 simde_mm_cmpeq_epi8 (simde__m128i a, simde__m128i b) {
1342 #if defined(SIMDE_X86_SSE2_NATIVE)
1343 return _mm_cmpeq_epi8(a, b);
1344 #else
1345 simde__m128i_private
1346 r_,
1347 a_ = simde__m128i_to_private(a),
1348 b_ = simde__m128i_to_private(b);
1349
1350 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1351 r_.neon_u8 = vceqq_s8(b_.neon_i8, a_.neon_i8);
1352 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1353 r_.wasm_v128 = wasm_i8x16_eq(a_.wasm_v128, b_.wasm_v128);
1354 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1355 r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmpeq(a_.altivec_i8, b_.altivec_i8));
1356 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1357 r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 == b_.i8));
1358 #else
1359 SIMDE_VECTORIZE
1360 for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1361 r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
1362 }
1363 #endif
1364
1365 return simde__m128i_from_private(r_);
1366 #endif
1367 }
1368 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1369 # define _mm_cmpeq_epi8(a, b) simde_mm_cmpeq_epi8(a, b)
1370 #endif
1371
1372 SIMDE_FUNCTION_ATTRIBUTES
1373 simde__m128i
simde_mm_cmpeq_epi16(simde__m128i a,simde__m128i b)1374 simde_mm_cmpeq_epi16 (simde__m128i a, simde__m128i b) {
1375 #if defined(SIMDE_X86_SSE2_NATIVE)
1376 return _mm_cmpeq_epi16(a, b);
1377 #else
1378 simde__m128i_private
1379 r_,
1380 a_ = simde__m128i_to_private(a),
1381 b_ = simde__m128i_to_private(b);
1382
1383 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1384 r_.neon_u16 = vceqq_s16(b_.neon_i16, a_.neon_i16);
1385 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1386 r_.wasm_v128 = wasm_i16x8_eq(a_.wasm_v128, b_.wasm_v128);
1387 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1388 r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmpeq(a_.altivec_i16, b_.altivec_i16));
1389 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1390 r_.i16 = (a_.i16 == b_.i16);
1391 #else
1392 SIMDE_VECTORIZE
1393 for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1394 r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
1395 }
1396 #endif
1397
1398 return simde__m128i_from_private(r_);
1399 #endif
1400 }
1401 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1402 # define _mm_cmpeq_epi16(a, b) simde_mm_cmpeq_epi16(a, b)
1403 #endif
1404
1405 SIMDE_FUNCTION_ATTRIBUTES
1406 simde__m128i
simde_mm_cmpeq_epi32(simde__m128i a,simde__m128i b)1407 simde_mm_cmpeq_epi32 (simde__m128i a, simde__m128i b) {
1408 #if defined(SIMDE_X86_SSE2_NATIVE)
1409 return _mm_cmpeq_epi32(a, b);
1410 #else
1411 simde__m128i_private
1412 r_,
1413 a_ = simde__m128i_to_private(a),
1414 b_ = simde__m128i_to_private(b);
1415
1416 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1417 r_.neon_u32 = vceqq_s32(b_.neon_i32, a_.neon_i32);
1418 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1419 r_.wasm_v128 = wasm_i32x4_eq(a_.wasm_v128, b_.wasm_v128);
1420 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1421 r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmpeq(a_.altivec_i32, b_.altivec_i32));
1422 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1423 r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), a_.i32 == b_.i32);
1424 #else
1425 SIMDE_VECTORIZE
1426 for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1427 r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
1428 }
1429 #endif
1430
1431 return simde__m128i_from_private(r_);
1432 #endif
1433 }
1434 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1435 # define _mm_cmpeq_epi32(a, b) simde_mm_cmpeq_epi32(a, b)
1436 #endif
1437
1438 SIMDE_FUNCTION_ATTRIBUTES
1439 simde__m128d
simde_mm_cmpeq_pd(simde__m128d a,simde__m128d b)1440 simde_mm_cmpeq_pd (simde__m128d a, simde__m128d b) {
1441 #if defined(SIMDE_X86_SSE2_NATIVE)
1442 return _mm_cmpeq_pd(a, b);
1443 #else
1444 simde__m128d_private
1445 r_,
1446 a_ = simde__m128d_to_private(a),
1447 b_ = simde__m128d_to_private(b);
1448
1449 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1450 r_.neon_u64 = vceqq_s64(b_.neon_i64, a_.neon_i64);
1451 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1452 r_.wasm_v128 = wasm_f64x2_eq(a_.wasm_v128, b_.wasm_v128);
1453 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
1454 r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpeq(a_.altivec_f64, b_.altivec_f64));
1455 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1456 r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 == b_.f64));
1457 #else
1458 SIMDE_VECTORIZE
1459 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1460 r_.u64[i] = (a_.f64[i] == b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1461 }
1462 #endif
1463
1464 return simde__m128d_from_private(r_);
1465 #endif
1466 }
1467 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1468 # define _mm_cmpeq_pd(a, b) simde_mm_cmpeq_pd(a, b)
1469 #endif
1470
1471 SIMDE_FUNCTION_ATTRIBUTES
1472 simde__m128d
simde_mm_cmpeq_sd(simde__m128d a,simde__m128d b)1473 simde_mm_cmpeq_sd (simde__m128d a, simde__m128d b) {
1474 #if defined(SIMDE_X86_SSE2_NATIVE)
1475 return _mm_cmpeq_sd(a, b);
1476 #elif defined(SIMDE_ASSUME_VECTORIZATION)
1477 return simde_mm_move_sd(a, simde_mm_cmpeq_pd(a, b));
1478 #else
1479 simde__m128d_private
1480 r_,
1481 a_ = simde__m128d_to_private(a),
1482 b_ = simde__m128d_to_private(b);
1483
1484 r_.u64[0] = (a_.u64[0] == b_.u64[0]) ? ~UINT64_C(0) : 0;
1485 r_.u64[1] = a_.u64[1];
1486
1487 return simde__m128d_from_private(r_);
1488 #endif
1489 }
1490 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1491 # define _mm_cmpeq_sd(a, b) simde_mm_cmpeq_sd(a, b)
1492 #endif
1493
1494 SIMDE_FUNCTION_ATTRIBUTES
1495 simde__m128d
simde_mm_cmpneq_pd(simde__m128d a,simde__m128d b)1496 simde_mm_cmpneq_pd (simde__m128d a, simde__m128d b) {
1497 #if defined(SIMDE_X86_SSE2_NATIVE)
1498 return _mm_cmpneq_pd(a, b);
1499 #else
1500 simde__m128d_private
1501 r_,
1502 a_ = simde__m128d_to_private(a),
1503 b_ = simde__m128d_to_private(b);
1504
1505 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1506 r_.neon_u16 = vmvnq_u16(vceqq_s16(b_.neon_i16, a_.neon_i16));
1507 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1508 r_.wasm_v128 = wasm_f64x2_ne(a_.wasm_v128, b_.wasm_v128);
1509 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1510 r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 != b_.f64));
1511 #else
1512 SIMDE_VECTORIZE
1513 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1514 r_.u64[i] = (a_.f64[i] != b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1515 }
1516 #endif
1517
1518 return simde__m128d_from_private(r_);
1519 #endif
1520 }
1521 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1522 # define _mm_cmpneq_pd(a, b) simde_mm_cmpneq_pd(a, b)
1523 #endif
1524
1525 SIMDE_FUNCTION_ATTRIBUTES
1526 simde__m128d
simde_mm_cmpneq_sd(simde__m128d a,simde__m128d b)1527 simde_mm_cmpneq_sd (simde__m128d a, simde__m128d b) {
1528 #if defined(SIMDE_X86_SSE2_NATIVE)
1529 return _mm_cmpneq_sd(a, b);
1530 #elif defined(SIMDE_ASSUME_VECTORIZATION)
1531 return simde_mm_move_sd(a, simde_mm_cmpneq_pd(a, b));
1532 #else
1533 simde__m128d_private
1534 r_,
1535 a_ = simde__m128d_to_private(a),
1536 b_ = simde__m128d_to_private(b);
1537
1538 r_.u64[0] = (a_.f64[0] != b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1539 r_.u64[1] = a_.u64[1];
1540
1541
1542 return simde__m128d_from_private(r_);
1543 #endif
1544 }
1545 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1546 # define _mm_cmpneq_sd(a, b) simde_mm_cmpneq_sd(a, b)
1547 #endif
1548
1549 SIMDE_FUNCTION_ATTRIBUTES
1550 simde__m128i
simde_mm_cmplt_epi8(simde__m128i a,simde__m128i b)1551 simde_mm_cmplt_epi8 (simde__m128i a, simde__m128i b) {
1552 #if defined(SIMDE_X86_SSE2_NATIVE)
1553 return _mm_cmplt_epi8(a, b);
1554 #else
1555 simde__m128i_private
1556 r_,
1557 a_ = simde__m128i_to_private(a),
1558 b_ = simde__m128i_to_private(b);
1559
1560 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1561 r_.neon_u8 = vcltq_s8(a_.neon_i8, b_.neon_i8);
1562 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1563 r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char),vec_cmplt(a_.altivec_i8, b_.altivec_i8));
1564 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1565 r_.wasm_v128 = wasm_i8x16_lt(a_.wasm_v128, b_.wasm_v128);
1566 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1567 r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 < b_.i8));
1568 #else
1569 SIMDE_VECTORIZE
1570 for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1571 r_.i8[i] = (a_.i8[i] < b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
1572 }
1573 #endif
1574
1575 return simde__m128i_from_private(r_);
1576 #endif
1577 }
1578 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1579 # define _mm_cmplt_epi8(a, b) simde_mm_cmplt_epi8(a, b)
1580 #endif
1581
1582 SIMDE_FUNCTION_ATTRIBUTES
1583 simde__m128i
simde_mm_cmplt_epi16(simde__m128i a,simde__m128i b)1584 simde_mm_cmplt_epi16 (simde__m128i a, simde__m128i b) {
1585 #if defined(SIMDE_X86_SSE2_NATIVE)
1586 return _mm_cmplt_epi16(a, b);
1587 #else
1588 simde__m128i_private
1589 r_,
1590 a_ = simde__m128i_to_private(a),
1591 b_ = simde__m128i_to_private(b);
1592
1593 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1594 r_.neon_u16 = vcltq_s16(a_.neon_i16, b_.neon_i16);
1595 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1596 r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmplt(a_.altivec_i16, b_.altivec_i16));
1597 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1598 r_.wasm_v128 = wasm_i16x8_lt(a_.wasm_v128, b_.wasm_v128);
1599 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1600 r_.i16 = HEDLEY_STATIC_CAST(__typeof__(r_.i16), (a_.i16 < b_.i16));
1601 #else
1602 SIMDE_VECTORIZE
1603 for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1604 r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
1605 }
1606 #endif
1607
1608 return simde__m128i_from_private(r_);
1609 #endif
1610 }
1611 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1612 # define _mm_cmplt_epi16(a, b) simde_mm_cmplt_epi16(a, b)
1613 #endif
1614
1615 SIMDE_FUNCTION_ATTRIBUTES
1616 simde__m128i
simde_mm_cmplt_epi32(simde__m128i a,simde__m128i b)1617 simde_mm_cmplt_epi32 (simde__m128i a, simde__m128i b) {
1618 #if defined(SIMDE_X86_SSE2_NATIVE)
1619 return _mm_cmplt_epi32(a, b);
1620 #else
1621 simde__m128i_private
1622 r_,
1623 a_ = simde__m128i_to_private(a),
1624 b_ = simde__m128i_to_private(b);
1625
1626 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1627 r_.neon_u32 = vcltq_s32(a_.neon_i32, b_.neon_i32);
1628 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1629 r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmplt(a_.altivec_i32, b_.altivec_i32));
1630 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1631 r_.wasm_v128 = wasm_i32x4_lt(a_.wasm_v128, b_.wasm_v128);
1632 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1633 r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.i32 < b_.i32));
1634 #else
1635 SIMDE_VECTORIZE
1636 for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1637 r_.i32[i] = (a_.i32[i] < b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
1638 }
1639 #endif
1640
1641 return simde__m128i_from_private(r_);
1642 #endif
1643 }
1644 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1645 # define _mm_cmplt_epi32(a, b) simde_mm_cmplt_epi32(a, b)
1646 #endif
1647
1648 SIMDE_FUNCTION_ATTRIBUTES
1649 simde__m128d
simde_mm_cmplt_pd(simde__m128d a,simde__m128d b)1650 simde_mm_cmplt_pd (simde__m128d a, simde__m128d b) {
1651 #if defined(SIMDE_X86_SSE2_NATIVE)
1652 return _mm_cmplt_pd(a, b);
1653 #else
1654 simde__m128d_private
1655 r_,
1656 a_ = simde__m128d_to_private(a),
1657 b_ = simde__m128d_to_private(b);
1658
1659 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1660 r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 < b_.f64));
1661 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1662 r_.wasm_v128 = wasm_f64x2_lt(a_.wasm_v128, b_.wasm_v128);
1663 #else
1664 SIMDE_VECTORIZE
1665 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1666 r_.u64[i] = (a_.f64[i] < b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1667 }
1668 #endif
1669
1670 return simde__m128d_from_private(r_);
1671 #endif
1672 }
1673 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1674 # define _mm_cmplt_pd(a, b) simde_mm_cmplt_pd(a, b)
1675 #endif
1676
1677 SIMDE_FUNCTION_ATTRIBUTES
1678 simde__m128d
simde_mm_cmplt_sd(simde__m128d a,simde__m128d b)1679 simde_mm_cmplt_sd (simde__m128d a, simde__m128d b) {
1680 #if defined(SIMDE_X86_SSE2_NATIVE)
1681 return _mm_cmplt_sd(a, b);
1682 #elif defined(SIMDE_ASSUME_VECTORIZATION)
1683 return simde_mm_move_sd(a, simde_mm_cmplt_pd(a, b));
1684 #else
1685 simde__m128d_private
1686 r_,
1687 a_ = simde__m128d_to_private(a),
1688 b_ = simde__m128d_to_private(b);
1689
1690 r_.u64[0] = (a_.f64[0] < b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1691 r_.u64[1] = a_.u64[1];
1692
1693 return simde__m128d_from_private(r_);
1694 #endif
1695 }
1696 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1697 # define _mm_cmplt_sd(a, b) simde_mm_cmplt_sd(a, b)
1698 #endif
1699
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmple_pd (simde__m128d a, simde__m128d b) {
  /* Packed compare: each 64-bit lane is all-ones if a <= b (ordered;
   * NaN operands compare false), otherwise all-zeros. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmple_pd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  /* GCC-style vector comparison already yields -1/0 per lane. */
  r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 <= b_.f64));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  r_.wasm_v128 = wasm_f64x2_le(a_.wasm_v128, b_.wasm_v128);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmple(a_.altivec_f64, b_.altivec_f64));
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
    r_.u64[i] = (a_.f64[i] <= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
  }
#endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmple_pd(a, b) simde_mm_cmple_pd(a, b)
#endif
1730
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmple_sd (simde__m128d a, simde__m128d b) {
  /* Scalar compare: lane 0 is all-ones if a[0] <= b[0] (ordered),
   * otherwise all-zeros; lane 1 is copied from a. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmple_sd(a, b);
#elif defined(SIMDE_ASSUME_VECTORIZATION)
  /* Reuse the packed compare and merge the upper lane back from a. */
  return simde_mm_move_sd(a, simde_mm_cmple_pd(a, b));
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  r_.u64[0] = (a_.f64[0] <= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
  r_.u64[1] = a_.u64[1];

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmple_sd(a, b) simde_mm_cmple_sd(a, b)
#endif
1753
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmpgt_epi8 (simde__m128i a, simde__m128i b) {
  /* Packed signed 8-bit compare: each byte is 0xFF if a > b, else 0x00. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmpgt_epi8(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* vcgtq returns an unsigned all-ones/zero mask; stored via the union. */
  r_.neon_u8 = vcgtq_s8(a_.neon_i8, b_.neon_i8);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  r_.wasm_v128 = wasm_i8x16_gt(a_.wasm_v128, b_.wasm_v128);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmpgt(a_.altivec_i8, b_.altivec_i8));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 > b_.i8));
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
    r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmpgt_epi8(a, b) simde_mm_cmpgt_epi8(a, b)
#endif
1786
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmpgt_epi16 (simde__m128i a, simde__m128i b) {
  /* Packed signed 16-bit compare: each lane is 0xFFFF if a > b, else 0. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmpgt_epi16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* vcgtq returns an unsigned all-ones/zero mask; stored via the union. */
  r_.neon_u16 = vcgtq_s16(a_.neon_i16, b_.neon_i16);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  r_.wasm_v128 = wasm_i16x8_gt(a_.wasm_v128, b_.wasm_v128);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmpgt(a_.altivec_i16, b_.altivec_i16));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  r_.i16 = HEDLEY_STATIC_CAST(__typeof__(r_.i16), (a_.i16 > b_.i16));
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
    r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmpgt_epi16(a, b) simde_mm_cmpgt_epi16(a, b)
#endif
1819
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmpgt_epi32 (simde__m128i a, simde__m128i b) {
  /* Packed signed 32-bit compare: each lane is 0xFFFFFFFF if a > b, else 0. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmpgt_epi32(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* vcgtq returns an unsigned all-ones/zero mask; stored via the union. */
  r_.neon_u32 = vcgtq_s32(a_.neon_i32, b_.neon_i32);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  r_.wasm_v128 = wasm_i32x4_gt(a_.wasm_v128, b_.wasm_v128);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmpgt(a_.altivec_i32, b_.altivec_i32));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.i32 > b_.i32));
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
    r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmpgt_epi32(a, b) simde_mm_cmpgt_epi32(a, b)
#endif
1852
1853 SIMDE_FUNCTION_ATTRIBUTES
1854 simde__m128d
simde_mm_cmpgt_pd(simde__m128d a,simde__m128d b)1855 simde_mm_cmpgt_pd (simde__m128d a, simde__m128d b) {
1856 #if defined(SIMDE_X86_SSE2_NATIVE)
1857 return _mm_cmpgt_pd(a, b);
1858 #else
1859 simde__m128d_private
1860 r_,
1861 a_ = simde__m128d_to_private(a),
1862 b_ = simde__m128d_to_private(b);
1863
1864 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1865 r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 > b_.f64));
1866 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1867 r_.wasm_v128 = wasm_f64x2_gt(a_.wasm_v128, b_.wasm_v128);
1868 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1869 r_.altivec_f64 = HEDLEY_STATIC_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpgt(a_.altivec_f64, b_.altivec_f64));
1870 #else
1871 SIMDE_VECTORIZE
1872 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1873 r_.u64[i] = (a_.f64[i] > b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1874 }
1875 #endif
1876
1877 return simde__m128d_from_private(r_);
1878 #endif
1879 }
1880 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1881 # define _mm_cmpgt_pd(a, b) simde_mm_cmpgt_pd(a, b)
1882 #endif
1883
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpgt_sd (simde__m128d a, simde__m128d b) {
  /* Scalar compare: lane 0 is all-ones if a[0] > b[0] (ordered),
   * otherwise all-zeros; lane 1 is copied from a. */
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
  /* PGI's _mm_cmpgt_sd is avoided here (known-problematic). */
  return _mm_cmpgt_sd(a, b);
#elif defined(SIMDE_ASSUME_VECTORIZATION)
  return simde_mm_move_sd(a, simde_mm_cmpgt_pd(a, b));
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  r_.u64[0] = (a_.f64[0] > b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
  r_.u64[1] = a_.u64[1];

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmpgt_sd(a, b) simde_mm_cmpgt_sd(a, b)
#endif
1906
1907 SIMDE_FUNCTION_ATTRIBUTES
1908 simde__m128d
simde_mm_cmpge_pd(simde__m128d a,simde__m128d b)1909 simde_mm_cmpge_pd (simde__m128d a, simde__m128d b) {
1910 #if defined(SIMDE_X86_SSE2_NATIVE)
1911 return _mm_cmpge_pd(a, b);
1912 #else
1913 simde__m128d_private
1914 r_,
1915 a_ = simde__m128d_to_private(a),
1916 b_ = simde__m128d_to_private(b);
1917
1918 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1919 r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 >= b_.f64));
1920 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1921 r_.wasm_v128 = wasm_f64x2_ge(a_.wasm_v128, b_.wasm_v128);
1922 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1923 r_.altivec_f64 = HEDLEY_STATIC_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpge(a_.altivec_f64, b_.altivec_f64));
1924 #else
1925 SIMDE_VECTORIZE
1926 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1927 r_.u64[i] = (a_.f64[i] >= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1928 }
1929 #endif
1930
1931 return simde__m128d_from_private(r_);
1932 #endif
1933 }
1934 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1935 # define _mm_cmpge_pd(a, b) simde_mm_cmpge_pd(a, b)
1936 #endif
1937
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpge_sd (simde__m128d a, simde__m128d b) {
  /* Scalar compare: lane 0 is all-ones if a[0] >= b[0] (ordered),
   * otherwise all-zeros; lane 1 is copied from a. */
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
  /* PGI's _mm_cmpge_sd is avoided here (known-problematic). */
  return _mm_cmpge_sd(a, b);
#elif defined(SIMDE_ASSUME_VECTORIZATION)
  return simde_mm_move_sd(a, simde_mm_cmpge_pd(a, b));
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  r_.u64[0] = (a_.f64[0] >= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
  r_.u64[1] = a_.u64[1];

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmpge_sd(a, b) simde_mm_cmpge_sd(a, b)
#endif
1960
1961 SIMDE_FUNCTION_ATTRIBUTES
1962 simde__m128d
simde_mm_cmpnge_pd(simde__m128d a,simde__m128d b)1963 simde_mm_cmpnge_pd (simde__m128d a, simde__m128d b) {
1964 #if defined(SIMDE_X86_SSE2_NATIVE)
1965 return _mm_cmpnge_pd(a, b);
1966 #else
1967 return simde_mm_cmplt_pd(a, b);
1968 #endif
1969 }
1970 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1971 # define _mm_cmpnge_pd(a, b) simde_mm_cmpnge_pd(a, b)
1972 #endif
1973
1974 SIMDE_FUNCTION_ATTRIBUTES
1975 simde__m128d
simde_mm_cmpnge_sd(simde__m128d a,simde__m128d b)1976 simde_mm_cmpnge_sd (simde__m128d a, simde__m128d b) {
1977 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
1978 return _mm_cmpnge_sd(a, b);
1979 #else
1980 return simde_mm_cmplt_sd(a, b);
1981 #endif
1982 }
1983 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1984 # define _mm_cmpnge_sd(a, b) simde_mm_cmpnge_sd(a, b)
1985 #endif
1986
1987 SIMDE_FUNCTION_ATTRIBUTES
1988 simde__m128d
simde_mm_cmpnlt_pd(simde__m128d a,simde__m128d b)1989 simde_mm_cmpnlt_pd (simde__m128d a, simde__m128d b) {
1990 #if defined(SIMDE_X86_SSE2_NATIVE)
1991 return _mm_cmpnlt_pd(a, b);
1992 #else
1993 return simde_mm_cmpge_pd(a, b);
1994 #endif
1995 }
1996 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1997 # define _mm_cmpnlt_pd(a, b) simde_mm_cmpnlt_pd(a, b)
1998 #endif
1999
2000 SIMDE_FUNCTION_ATTRIBUTES
2001 simde__m128d
simde_mm_cmpnlt_sd(simde__m128d a,simde__m128d b)2002 simde_mm_cmpnlt_sd (simde__m128d a, simde__m128d b) {
2003 #if defined(SIMDE_X86_SSE2_NATIVE)
2004 return _mm_cmpnlt_sd(a, b);
2005 #else
2006 return simde_mm_cmpge_sd(a, b);
2007 #endif
2008 }
2009 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2010 # define _mm_cmpnlt_sd(a, b) simde_mm_cmpnlt_sd(a, b)
2011 #endif
2012
2013 SIMDE_FUNCTION_ATTRIBUTES
2014 simde__m128d
simde_mm_cmpnle_pd(simde__m128d a,simde__m128d b)2015 simde_mm_cmpnle_pd (simde__m128d a, simde__m128d b) {
2016 #if defined(SIMDE_X86_SSE2_NATIVE)
2017 return _mm_cmpnle_pd(a, b);
2018 #else
2019 return simde_mm_cmpgt_pd(a, b);
2020 #endif
2021 }
2022 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2023 # define _mm_cmpnle_pd(a, b) simde_mm_cmpnle_pd(a, b)
2024 #endif
2025
2026 SIMDE_FUNCTION_ATTRIBUTES
2027 simde__m128d
simde_mm_cmpnle_sd(simde__m128d a,simde__m128d b)2028 simde_mm_cmpnle_sd (simde__m128d a, simde__m128d b) {
2029 #if defined(SIMDE_X86_SSE2_NATIVE)
2030 return _mm_cmpnle_sd(a, b);
2031 #else
2032 return simde_mm_cmpgt_sd(a, b);
2033 #endif
2034 }
2035 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2036 # define _mm_cmpnle_sd(a, b) simde_mm_cmpnle_sd(a, b)
2037 #endif
2038
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpord_pd (simde__m128d a, simde__m128d b) {
  /* Ordered compare: each 64-bit lane is all-ones when *neither* operand
   * is NaN, otherwise all-zeros. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmpord_pd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

#if defined(simde_math_isnan)
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
    r_.u64[i] = (!simde_math_isnan(a_.f64[i]) && !simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0);
  }
#else
  /* No isnan available for this target: the portable path cannot be
   * implemented, so reaching here is a configuration error. */
  HEDLEY_UNREACHABLE();
#endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmpord_pd(a, b) simde_mm_cmpord_pd(a, b)
#endif
2065
2066 SIMDE_FUNCTION_ATTRIBUTES
2067 simde_float64
simde_mm_cvtsd_f64(simde__m128d a)2068 simde_mm_cvtsd_f64 (simde__m128d a) {
2069 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
2070 return _mm_cvtsd_f64(a);
2071 #else
2072 simde__m128d_private a_ = simde__m128d_to_private(a);
2073 return a_.f64[0];
2074 #endif
2075 }
2076 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2077 # define _mm_cvtsd_f64(a) simde_mm_cvtsd_f64(a)
2078 #endif
2079
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpord_sd (simde__m128d a, simde__m128d b) {
  /* Scalar ordered compare: lane 0 is all-ones when neither a[0] nor b[0]
   * is NaN, otherwise all-zeros; lane 1 is copied from a. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmpord_sd(a, b);
#elif defined(SIMDE_ASSUME_VECTORIZATION)
  return simde_mm_move_sd(a, simde_mm_cmpord_pd(a, b));
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

#if defined(simde_math_isnan)
  r_.u64[0] = (!simde_math_isnan(a_.f64[0]) && !simde_math_isnan(b_.f64[0])) ? ~UINT64_C(0) : UINT64_C(0);
  r_.u64[1] = a_.u64[1];
#else
  /* No isnan available for this target: configuration error. */
  HEDLEY_UNREACHABLE();
#endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmpord_sd(a, b) simde_mm_cmpord_sd(a, b)
#endif
2106
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpunord_pd (simde__m128d a, simde__m128d b) {
  /* Unordered compare: each 64-bit lane is all-ones when *either* operand
   * is NaN, otherwise all-zeros (the complement of cmpord_pd). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmpunord_pd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

#if defined(simde_math_isnan)
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
    r_.u64[i] = (simde_math_isnan(a_.f64[i]) || simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0);
  }
#else
  /* No isnan available for this target: configuration error. */
  HEDLEY_UNREACHABLE();
#endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmpunord_pd(a, b) simde_mm_cmpunord_pd(a, b)
#endif
2133
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpunord_sd (simde__m128d a, simde__m128d b) {
  /* Scalar unordered compare: lane 0 is all-ones when a[0] or b[0] is
   * NaN, otherwise all-zeros; lane 1 is copied from a. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmpunord_sd(a, b);
#elif defined(SIMDE_ASSUME_VECTORIZATION)
  return simde_mm_move_sd(a, simde_mm_cmpunord_pd(a, b));
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

#if defined(simde_math_isnan)
  r_.u64[0] = (simde_math_isnan(a_.f64[0]) || simde_math_isnan(b_.f64[0])) ? ~UINT64_C(0) : UINT64_C(0);
  r_.u64[1] = a_.u64[1];

#else
  /* No isnan available for this target: configuration error. */
  HEDLEY_UNREACHABLE();
#endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmpunord_sd(a, b) simde_mm_cmpunord_sd(a, b)
#endif
2161
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cvtepi32_pd (simde__m128i a) {
  /* Convert the two low 32-bit integers of a to double precision.  The
   * conversion is exact (every int32 is representable as a double). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cvtepi32_pd(a);
#else
  simde__m128d_private r_;
  simde__m128i_private a_ = simde__m128i_to_private(a);

#if defined(SIMDE_CONVERT_VECTOR_)
  /* Only the low half (m64_private[0]) of a participates. */
  SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].i32);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
    r_.f64[i] = (simde_float64) a_.i32[i];
  }
#endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtepi32_pd(a) simde_mm_cvtepi32_pd(a)
#endif
2186
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_cvtepi32_ps (simde__m128i a) {
  /* Convert four 32-bit integers to single precision (values outside the
   * 24-bit significand round per the current rounding mode). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cvtepi32_ps(a);
#else
  simde__m128_private r_;
  simde__m128i_private a_ = simde__m128i_to_private(a);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_f32 = vcvtq_f32_s32(a_.neon_i32);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  HEDLEY_DIAGNOSTIC_PUSH
  /* vec_ctf's second argument triggers a C11-extension warning on clang. */
#if HEDLEY_HAS_WARNING("-Wc11-extensions")
#pragma clang diagnostic ignored "-Wc11-extensions"
#endif
  r_.altivec_f32 = vec_ctf(a_.altivec_i32, 0);
  HEDLEY_DIAGNOSTIC_POP
#elif defined(SIMDE_CONVERT_VECTOR_)
  SIMDE_CONVERT_VECTOR_(r_.f32, a_.i32);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
    r_.f32[i] = (simde_float32) a_.i32[i];
  }
#endif

  return simde__m128_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtepi32_ps(a) simde_mm_cvtepi32_ps(a)
#endif
2220
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvtpd_epi32 (simde__m128d a) {
  /* Convert two doubles to 32-bit integers using the current rounding
   * mode (nearbyint); the upper 64 bits of the result are zeroed, as on
   * hardware. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cvtpd_epi32(a);
#else
  simde__m128i_private r_;
  simde__m128d_private a_ = simde__m128d_to_private(a);

  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
    r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_nearbyint(a_.f64[i]));
  }
  simde_memset(&(r_.m64_private[1]), 0, sizeof(r_.m64_private[1]));

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtpd_epi32(a) simde_mm_cvtpd_epi32(a)
#endif
2242
SIMDE_FUNCTION_ATTRIBUTES
simde__m64
simde_mm_cvtpd_pi32 (simde__m128d a) {
  /* Convert two doubles to two 32-bit integers (MMX result) using the
   * current rounding mode (nearbyint). */
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  return _mm_cvtpd_pi32(a);
#else
  simde__m64_private r_;
  simde__m128d_private a_ = simde__m128d_to_private(a);

  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
    r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, simde_math_nearbyint(a_.f64[i]));
  }

  return simde__m64_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtpd_pi32(a) simde_mm_cvtpd_pi32(a)
#endif
2263
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_cvtpd_ps (simde__m128d a) {
  /* Narrow two doubles to two floats in the low half of the result; the
   * upper two float lanes are zeroed, as on hardware. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cvtpd_ps(a);
#else
  simde__m128_private r_;
  simde__m128d_private a_ = simde__m128d_to_private(a);

#if defined(SIMDE_CONVERT_VECTOR_)
  SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, a_.f64);
  r_.m64_private[1] = simde__m64_to_private(simde_mm_setzero_si64());
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
    r_.f32[i] = (simde_float32) a_.f64[i];
  }
  simde_memset(&(r_.m64_private[1]), 0, sizeof(r_.m64_private[1]));
#endif

  return simde__m128_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtpd_ps(a) simde_mm_cvtpd_ps(a)
#endif
2290
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cvtpi32_pd (simde__m64 a) {
  /* Widen two 32-bit integers (MMX operand) to two doubles; exact. */
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  return _mm_cvtpi32_pd(a);
#else
  simde__m128d_private r_;
  simde__m64_private a_ = simde__m64_to_private(a);

#if defined(SIMDE_CONVERT_VECTOR_)
  SIMDE_CONVERT_VECTOR_(r_.f64, a_.i32);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
    r_.f64[i] = (simde_float64) a_.i32[i];
  }
#endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtpi32_pd(a) simde_mm_cvtpi32_pd(a)
#endif
2315
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvtps_epi32 (simde__m128 a) {
  /* Convert four floats to 32-bit integers, rounding to nearest-even (the
   * SSE default rounding mode). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cvtps_epi32(a);
#else
  simde__m128i_private r_;
  simde__m128_private a_ = simde__m128_to_private(a);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* The default rounding mode on SSE is 'round to even', which ArmV7
     does not support!  It is supported on ARMv8 however. */
#if defined(SIMDE_ARCH_AARCH64)
  r_.neon_i32 = vcvtnq_s32_f32(a_.neon_f32);
#else
  /* Emulate round-to-nearest-even on ARMv7: round half away from zero,
   * then select the round-to-even result for exact .5 ties. */
  uint32x4_t signmask = vdupq_n_u32(0x80000000);
  float32x4_t half = vbslq_f32(signmask, a_.neon_f32, vdupq_n_f32(0.5f)); /* +/- 0.5 */
  int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(a_.neon_f32, half)); /* round to integer: [a + 0.5] */
  int32x4_t r_trunc = vcvtq_s32_f32(a_.neon_f32); /* truncate to integer: [a] */
  int32x4_t plusone = vshrq_n_s32(vnegq_s32(r_trunc), 31); /* 1 or 0 */
  int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
  float32x4_t delta = vsubq_f32(a_.neon_f32, vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
  uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */
  r_.neon_i32 = vbslq_s32(is_delta_half, r_even, r_normal);
#endif
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  r_.altivec_i32 = vec_cts(vec_round(a_.altivec_f32), 0);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
    /* NOTE(review): roundf rounds halves away from zero, not to even —
     * matches the upstream portable fallback but can differ from native
     * SSE on exact .5 ties. */
    r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, simde_math_roundf(a_.f32[i]));
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtps_epi32(a) simde_mm_cvtps_epi32(a)
#endif
2356
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cvtps_pd (simde__m128 a) {
  /* Widen the two low floats of a to doubles; exact conversion. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cvtps_pd(a);
#else
  simde__m128d_private r_;
  simde__m128_private a_ = simde__m128_to_private(a);

#if defined(SIMDE_CONVERT_VECTOR_)
  /* Only the low half (m64_private[0]) of a participates. */
  SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].f32);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
    r_.f64[i] = a_.f32[i];
  }
#endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtps_pd(a) simde_mm_cvtps_pd(a)
#endif
2381
SIMDE_FUNCTION_ATTRIBUTES
int32_t
simde_mm_cvtsd_si32 (simde__m128d a) {
  /* Convert the low double of a to int32.  NOTE(review): the fallback
   * uses round (half away from zero), which can differ from native SSE
   * round-to-even on exact .5 ties. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cvtsd_si32(a);
#else
  simde__m128d_private a_ = simde__m128d_to_private(a);
  return SIMDE_CONVERT_FTOI(int32_t, simde_math_round(a_.f64[0]));
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtsd_si32(a) simde_mm_cvtsd_si32(a)
#endif
2395
SIMDE_FUNCTION_ATTRIBUTES
int64_t
simde_mm_cvtsd_si64 (simde__m128d a) {
  /* Convert the low double of a to int64 (the native intrinsic is only
   * available on 64-bit x86; PGI spells it _mm_cvtsd_si64x). */
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
#if defined(__PGI)
  return _mm_cvtsd_si64x(a);
#else
  return _mm_cvtsd_si64(a);
#endif
#else
  simde__m128d_private a_ = simde__m128d_to_private(a);
  return SIMDE_CONVERT_FTOI(int64_t, simde_math_round(a_.f64[0]));
#endif
}
#define simde_mm_cvtsd_si64x(a) simde_mm_cvtsd_si64(a)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtsd_si64(a) simde_mm_cvtsd_si64(a)
# define _mm_cvtsd_si64x(a) simde_mm_cvtsd_si64x(a)
#endif
2415
2416 SIMDE_FUNCTION_ATTRIBUTES
2417 simde__m128
simde_mm_cvtsd_ss(simde__m128 a,simde__m128d b)2418 simde_mm_cvtsd_ss (simde__m128 a, simde__m128d b) {
2419 #if defined(SIMDE_X86_SSE2_NATIVE)
2420 return _mm_cvtsd_ss(a, b);
2421 #else
2422 simde__m128_private
2423 r_,
2424 a_ = simde__m128_to_private(a);
2425 simde__m128d_private b_ = simde__m128d_to_private(b);
2426
2427 r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b_.f64[0]);
2428
2429 SIMDE_VECTORIZE
2430 for (size_t i = 1 ; i < (sizeof(r_) / sizeof(r_.i32[0])) ; i++) {
2431 r_.i32[i] = a_.i32[i];
2432 }
2433
2434 return simde__m128_from_private(r_);
2435 #endif
2436 }
2437 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2438 # define _mm_cvtsd_ss(a, b) simde_mm_cvtsd_ss(a, b)
2439 #endif
2440
SIMDE_FUNCTION_ATTRIBUTES
int32_t
simde_mm_cvtsi128_si32 (simde__m128i a) {
  /* Extract the low 32-bit integer lane of a. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cvtsi128_si32(a);
#else
  simde__m128i_private
    a_ = simde__m128i_to_private(a);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  return vgetq_lane_s32(a_.neon_i32, 0);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
#if defined(SIMDE_BUG_GCC_95227)
  /* Silence a spurious unused-variable warning (GCC bug 95227). */
  (void) a_;
#endif
  return vec_extract(a_.altivec_i32, 0);
#else
  return a_.i32[0];
#endif
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtsi128_si32(a) simde_mm_cvtsi128_si32(a)
#endif
2465
SIMDE_FUNCTION_ATTRIBUTES
int64_t
simde_mm_cvtsi128_si64 (simde__m128i a) {
  /* Extract the low 64-bit integer lane of a (native intrinsic is
   * 64-bit-x86-only; PGI spells it _mm_cvtsi128_si64x). */
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
#if defined(__PGI)
  return _mm_cvtsi128_si64x(a);
#else
  return _mm_cvtsi128_si64(a);
#endif
#else
  simde__m128i_private a_ = simde__m128i_to_private(a);
#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && !defined(HEDLEY_IBM_VERSION)
  return vec_extract(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed long long), a_.i64), 0);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  return vgetq_lane_s64(a_.neon_i64, 0);
#endif
  /* Generic fallback (unreachable when a platform branch above returned). */
  return a_.i64[0];
#endif
}
#define simde_mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64(a)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtsi128_si64(a) simde_mm_cvtsi128_si64(a)
# define _mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64x(a)
#endif
2490
2491 SIMDE_FUNCTION_ATTRIBUTES
2492 simde__m128d
simde_mm_cvtsi32_sd(simde__m128d a,int32_t b)2493 simde_mm_cvtsi32_sd (simde__m128d a, int32_t b) {
2494
2495 #if defined(SIMDE_X86_SSE2_NATIVE)
2496 return _mm_cvtsi32_sd(a, b);
2497 #else
2498 simde__m128d_private r_;
2499 simde__m128d_private a_ = simde__m128d_to_private(a);
2500
2501 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_AMD64)
2502 r_.neon_f64 = vsetq_lane_f64(HEDLEY_STATIC_CAST(float64_t, b), a_.neon_f64, 0);
2503 #else
2504 r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b);
2505 r_.i64[1] = a_.i64[1];
2506 #endif
2507
2508 return simde__m128d_from_private(r_);
2509 #endif
2510 }
2511 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2512 # define _mm_cvtsi32_sd(a, b) simde_mm_cvtsi32_sd(a, b)
2513 #endif
2514
2515 SIMDE_FUNCTION_ATTRIBUTES
2516 simde__m128i
simde_mm_cvtsi32_si128(int32_t a)2517 simde_mm_cvtsi32_si128 (int32_t a) {
2518 #if defined(SIMDE_X86_SSE2_NATIVE)
2519 return _mm_cvtsi32_si128(a);
2520 #else
2521 simde__m128i_private r_;
2522
2523 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2524 r_.neon_i32 = vsetq_lane_s32(a, vdupq_n_s32(0), 0);
2525 #else
2526 r_.i32[0] = a;
2527 r_.i32[1] = 0;
2528 r_.i32[2] = 0;
2529 r_.i32[3] = 0;
2530 #endif
2531
2532 return simde__m128i_from_private(r_);
2533 #endif
2534 }
2535 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2536 # define _mm_cvtsi32_si128(a) simde_mm_cvtsi32_si128(a)
2537 #endif
2538
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cvtsi64_sd (simde__m128d a, int64_t b) {
  /* Set lane 0 to (double) b (may round for |b| >= 2^53); lane 1 is
   * copied from a.  Native form requires 64-bit x86; PGI spells it
   * _mm_cvtsi64x_sd. */
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
#if !defined(__PGI)
  return _mm_cvtsi64_sd(a, b);
#else
  return _mm_cvtsi64x_sd(a, b);
#endif
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a);

#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  r_.neon_f64 = vsetq_lane_f64(HEDLEY_STATIC_CAST(float64_t, b), a_.neon_f64, 0);
#else
  r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b);
  r_.f64[1] = a_.f64[1];
#endif

  return simde__m128d_from_private(r_);
#endif
}
#define simde_mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64_sd(a, b)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtsi64_sd(a, b) simde_mm_cvtsi64_sd(a, b)
# define _mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64x_sd(a, b)
#endif
2568
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvtsi64_si128 (int64_t a) {
  /* Build a vector whose low 64-bit lane is a and the high lane is 0.
   * Native form requires 64-bit x86; PGI spells it _mm_cvtsi64x_si128. */
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
#if !defined(__PGI)
  return _mm_cvtsi64_si128(a);
#else
  return _mm_cvtsi64x_si128(a);
#endif
#else
  simde__m128i_private r_;

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_i64 = vsetq_lane_s64(a, vdupq_n_s64(0), 0);
#else
  r_.i64[0] = a;
  r_.i64[1] = 0;
#endif

  return simde__m128i_from_private(r_);
#endif
}
#define simde_mm_cvtsi64x_si128(a) simde_mm_cvtsi64_si128(a)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtsi64_si128(a) simde_mm_cvtsi64_si128(a)
# define _mm_cvtsi64x_si128(a) simde_mm_cvtsi64x_si128(a)
#endif
2596
2597 SIMDE_FUNCTION_ATTRIBUTES
2598 simde__m128d
simde_mm_cvtss_sd(simde__m128d a,simde__m128 b)2599 simde_mm_cvtss_sd (simde__m128d a, simde__m128 b) {
2600 #if defined(SIMDE_X86_SSE2_NATIVE)
2601 return _mm_cvtss_sd(a, b);
2602 #else
2603 simde__m128d_private
2604 a_ = simde__m128d_to_private(a);
2605 simde__m128_private b_ = simde__m128_to_private(b);
2606
2607 a_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b_.f32[0]);
2608
2609 return simde__m128d_from_private(a_);
2610 #endif
2611 }
2612 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2613 # define _mm_cvtss_sd(a, b) simde_mm_cvtss_sd(a, b)
2614 #endif
2615
2616 SIMDE_FUNCTION_ATTRIBUTES
2617 simde__m128i
simde_mm_cvttpd_epi32(simde__m128d a)2618 simde_mm_cvttpd_epi32 (simde__m128d a) {
2619 #if defined(SIMDE_X86_SSE2_NATIVE)
2620 return _mm_cvttpd_epi32(a);
2621 #else
2622 simde__m128i_private r_;
2623 simde__m128d_private a_ = simde__m128d_to_private(a);
2624
2625 for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
2626 r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f64[i]);
2627 }
2628 simde_memset(&(r_.m64_private[1]), 0, sizeof(r_.m64_private[1]));
2629
2630 return simde__m128i_from_private(r_);
2631 #endif
2632 }
2633 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2634 # define _mm_cvttpd_epi32(a) simde_mm_cvttpd_epi32(a)
2635 #endif
2636
/* Convert both doubles in a to 32-bit integers (truncating toward
 * zero), returned as an MMX-style __m64. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m64
simde_mm_cvttpd_pi32 (simde__m128d a) {
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  return _mm_cvttpd_pi32(a);
#else
  simde__m64_private r_;
  simde__m128d_private a_ = simde__m128d_to_private(a);

#if defined(SIMDE_CONVERT_VECTOR_)
  /* Whole-vector conversion via the compiler's __builtin_convertvector
   * (or equivalent); element count matches: two f64 -> two i32. */
  SIMDE_CONVERT_VECTOR_(r_.i32, a_.f64);
#else
  for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
    r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f64[i]);
  }
#endif

  return simde__m64_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvttpd_pi32(a) simde_mm_cvttpd_pi32(a)
#endif
2660
/* Convert four floats in a to 32-bit integers, truncating toward zero. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvttps_epi32 (simde__m128 a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cvttps_epi32(a);
#else
  simde__m128i_private r_;
  simde__m128_private a_ = simde__m128_to_private(a);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* NEON float->signed conversion rounds toward zero, matching the
   * truncating semantics of cvttps. */
  r_.neon_i32 = vcvtq_s32_f32(a_.neon_f32);
#elif defined(SIMDE_CONVERT_VECTOR_)
  SIMDE_CONVERT_VECTOR_(r_.i32, a_.f32);
#else
  for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
    r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f32[i]);
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvttps_epi32(a) simde_mm_cvttps_epi32(a)
#endif
2686
2687 SIMDE_FUNCTION_ATTRIBUTES
2688 int32_t
simde_mm_cvttsd_si32(simde__m128d a)2689 simde_mm_cvttsd_si32 (simde__m128d a) {
2690 #if defined(SIMDE_X86_SSE2_NATIVE)
2691 return _mm_cvttsd_si32(a);
2692 #else
2693 simde__m128d_private a_ = simde__m128d_to_private(a);
2694 return SIMDE_CONVERT_FTOI(int32_t, a_.f64[0]);
2695 #endif
2696 }
2697 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2698 # define _mm_cvttsd_si32(a) simde_mm_cvttsd_si32(a)
2699 #endif
2700
/* Truncating conversion of the low double lane to a signed 64-bit
 * integer.  Native path requires AMD64 (the instruction uses a 64-bit
 * GPR). */
SIMDE_FUNCTION_ATTRIBUTES
int64_t
simde_mm_cvttsd_si64 (simde__m128d a) {
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
#if !defined(__PGI)
  return _mm_cvttsd_si64(a);
#else
  /* PGI only provides the _si64x spelling of this intrinsic. */
  return _mm_cvttsd_si64x(a);
#endif
#else
  simde__m128d_private a_ = simde__m128d_to_private(a);
  return SIMDE_CONVERT_FTOI(int64_t, a_.f64[0]);
#endif
}
#define simde_mm_cvttsd_si64x(a) simde_mm_cvttsd_si64(a)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvttsd_si64(a) simde_mm_cvttsd_si64(a)
# define _mm_cvttsd_si64x(a) simde_mm_cvttsd_si64x(a)
#endif
2720
/* Lane-wise double-precision division: r[i] = a[i] / b[i]. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_div_pd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_div_pd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  /* GCC/Clang vector extensions: element-wise division in one op. */
  r_.f64 = a_.f64 / b_.f64;
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  r_.wasm_v128 = wasm_f64x2_div(a_.wasm_v128, b_.wasm_v128);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
    r_.f64[i] = a_.f64[i] / b_.f64[i];
  }
#endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_div_pd(a, b) simde_mm_div_pd(a, b)
#endif
2749
2750 SIMDE_FUNCTION_ATTRIBUTES
2751 simde__m128d
simde_mm_div_sd(simde__m128d a,simde__m128d b)2752 simde_mm_div_sd (simde__m128d a, simde__m128d b) {
2753 #if defined(SIMDE_X86_SSE2_NATIVE)
2754 return _mm_div_sd(a, b);
2755 #elif defined(SIMDE_ASSUME_VECTORIZATION)
2756 return simde_mm_move_sd(a, simde_mm_div_pd(a, b));
2757 #else
2758 simde__m128d_private
2759 r_,
2760 a_ = simde__m128d_to_private(a),
2761 b_ = simde__m128d_to_private(b);
2762
2763 r_.f64[0] = a_.f64[0] / b_.f64[0];
2764 r_.f64[1] = a_.f64[1];
2765
2766 return simde__m128d_from_private(r_);
2767 #endif
2768 }
2769 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2770 # define _mm_div_sd(a, b) simde_mm_div_sd(a, b)
2771 #endif
2772
/* Extract the imm8-th 16-bit lane of a, zero-extended to int32_t. */
SIMDE_FUNCTION_ATTRIBUTES
int32_t
simde_mm_extract_epi16 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 7) {
  uint16_t r;
  simde__m128i_private a_ = simde__m128i_to_private(a);

#if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
#if defined(SIMDE_BUG_GCC_95227)
  /* Workaround for GCC bug 95227: the casts silence spurious
   * unused-variable diagnostics. */
  (void) a_;
  (void) imm8;
#endif
  r = HEDLEY_STATIC_CAST(uint16_t, vec_extract(a_.altivec_i16, imm8));
#else
  /* Read through u16 so the lane is zero- (not sign-) extended. */
  r = a_.u16[imm8 & 7];
#endif

  return HEDLEY_STATIC_CAST(int32_t, r);
}
#if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,6,0))
# define simde_mm_extract_epi16(a, imm8) _mm_extract_epi16(a, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
# define simde_mm_extract_epi16(a, imm8) (HEDLEY_STATIC_CAST(int32_t, vgetq_lane_s16(simde__m128i_to_private(a).neon_i16, (imm8))) & (INT32_C(0x0000ffff)))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_extract_epi16(a, imm8) simde_mm_extract_epi16(a, imm8)
#endif
2800
2801 SIMDE_FUNCTION_ATTRIBUTES
2802 simde__m128i
simde_mm_insert_epi16(simde__m128i a,int16_t i,const int imm8)2803 simde_mm_insert_epi16 (simde__m128i a, int16_t i, const int imm8)
2804 SIMDE_REQUIRE_RANGE(imm8, 0, 7) {
2805 simde__m128i_private a_ = simde__m128i_to_private(a);
2806 a_.i16[imm8 & 7] = i;
2807 return simde__m128i_from_private(a_);
2808 }
2809 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
2810 # define simde_mm_insert_epi16(a, i, imm8) _mm_insert_epi16((a), (i), (imm8))
2811 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2812 # define simde_mm_insert_epi16(a, i, imm8) simde__m128i_from_neon_i16(vsetq_lane_s16((i), simde__m128i_to_neon_i16(a), (imm8)))
2813 #endif
2814 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2815 # define _mm_insert_epi16(a, i, imm8) simde_mm_insert_epi16(a, i, imm8)
2816 #endif
2817
/* Load two doubles from a 16-byte-aligned address. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_load_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) {
  simde_assert_aligned(16, mem_addr);

#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_load_pd(mem_addr);
#else
  simde__m128d_private r_;

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* A32 NEON has no f64 vectors; load the 16 bytes as four u32 lanes
   * into the same union storage. */
  r_.neon_u32 = vld1q_u32(HEDLEY_REINTERPRET_CAST(uint32_t const*, mem_addr));
#else
  /* Alignment was asserted above, so the aligned cast is safe. */
  r_ = *SIMDE_ALIGN_CAST(simde__m128d_private const*, mem_addr);
#endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_load_pd(mem_addr) simde_mm_load_pd(mem_addr)
#endif
2840
2841 SIMDE_FUNCTION_ATTRIBUTES
2842 simde__m128d
simde_mm_load_pd1(simde_float64 const * mem_addr)2843 simde_mm_load_pd1 (simde_float64 const* mem_addr) {
2844 #if defined(SIMDE_X86_SSE2_NATIVE)
2845 return _mm_load1_pd(mem_addr);
2846 #else
2847 simde__m128d_private r_;
2848
2849 r_.f64[0] = *mem_addr;
2850 r_.f64[1] = *mem_addr;
2851
2852 return simde__m128d_from_private(r_);
2853 #endif
2854 }
2855 #define simde_mm_load1_pd(mem_addr) simde_mm_load_pd1(mem_addr)
2856 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2857 # define _mm_load_pd1(mem_addr) simde_mm_load_pd1(mem_addr)
2858 # define _mm_load1_pd(mem_addr) simde_mm_load1_pd(mem_addr)
2859 #endif
2860
/* Load one double into lane 0 and zero lane 1. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_load_sd (simde_float64 const* mem_addr) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_load_sd(mem_addr);
#else
  simde__m128d_private r_;

#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  /* Zero vector with the loaded scalar inserted at lane 0. */
  r_.neon_f64 = vsetq_lane_f64(*mem_addr, vdupq_n_f64(0), 0);
#else
  r_.f64[0] = *mem_addr;
  /* Zero via the u64 view -- all-bits-zero, independent of FP format. */
  r_.u64[1] = UINT64_C(0);
#endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_load_sd(mem_addr) simde_mm_load_sd(mem_addr)
#endif
2882
/* Load 128 bits of integer data from a 16-byte-aligned address. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_load_si128 (simde__m128i const* mem_addr) {
  simde_assert_aligned(16, mem_addr);

#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_load_si128(HEDLEY_REINTERPRET_CAST(__m128i const*, mem_addr));
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  simde__m128i_private r_;

#if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  /* vec_ld requires 16-byte alignment, guaranteed by the assert above. */
  r_.altivec_i32 = vec_ld(0, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(int) const*, mem_addr));
#else
  r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr));
#endif

  return simde__m128i_from_private(r_);
#else
  /* Plain aligned copy through the simde__m128i type itself. */
  return *mem_addr;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_load_si128(mem_addr) simde_mm_load_si128(mem_addr)
#endif
2907
2908 SIMDE_FUNCTION_ATTRIBUTES
2909 simde__m128d
simde_mm_loadh_pd(simde__m128d a,simde_float64 const * mem_addr)2910 simde_mm_loadh_pd (simde__m128d a, simde_float64 const* mem_addr) {
2911 #if defined(SIMDE_X86_SSE2_NATIVE)
2912 return _mm_loadh_pd(a, mem_addr);
2913 #else
2914 simde__m128d_private
2915 r_,
2916 a_ = simde__m128d_to_private(a);
2917 simde_float64 t;
2918
2919 simde_memcpy(&t, mem_addr, sizeof(t));
2920 r_.f64[0] = a_.f64[0];
2921 r_.f64[1] = t;
2922
2923 return simde__m128d_from_private(r_);
2924 #endif
2925 }
2926 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2927 # define _mm_loadh_pd(a, mem_addr) simde_mm_loadh_pd(a, mem_addr)
2928 #endif
2929
2930 SIMDE_FUNCTION_ATTRIBUTES
2931 simde__m128i
simde_mm_loadl_epi64(simde__m128i const * mem_addr)2932 simde_mm_loadl_epi64 (simde__m128i const* mem_addr) {
2933 #if defined(SIMDE_X86_SSE2_NATIVE)
2934 return _mm_loadl_epi64(mem_addr);
2935 #else
2936 simde__m128i_private r_;
2937
2938 int64_t value;
2939 simde_memcpy(&value, mem_addr, sizeof(value));
2940
2941 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2942 r_.neon_i64 = vcombine_s64(vld1_s64(HEDLEY_REINTERPRET_CAST(int64_t const *, mem_addr)), vdup_n_s64(0));
2943 #else
2944 r_.i64[0] = value;
2945 r_.i64[1] = 0;
2946 #endif
2947
2948 return simde__m128i_from_private(r_);
2949 #endif
2950 }
2951 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2952 # define _mm_loadl_epi64(mem_addr) simde_mm_loadl_epi64(mem_addr)
2953 #endif
2954
2955 SIMDE_FUNCTION_ATTRIBUTES
2956 simde__m128d
simde_mm_loadl_pd(simde__m128d a,simde_float64 const * mem_addr)2957 simde_mm_loadl_pd (simde__m128d a, simde_float64 const* mem_addr) {
2958 #if defined(SIMDE_X86_SSE2_NATIVE)
2959 return _mm_loadl_pd(a, mem_addr);
2960 #else
2961 simde__m128d_private
2962 r_,
2963 a_ = simde__m128d_to_private(a);
2964
2965 r_.f64[0] = *mem_addr;
2966 r_.u64[1] = a_.u64[1];
2967
2968 return simde__m128d_from_private(r_);
2969 #endif
2970 }
2971 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2972 # define _mm_loadl_pd(a, mem_addr) simde_mm_loadl_pd(a, mem_addr)
2973 #endif
2974
2975 SIMDE_FUNCTION_ATTRIBUTES
2976 simde__m128d
simde_mm_loadr_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM (2)])2977 simde_mm_loadr_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) {
2978 simde_assert_aligned(16, mem_addr);
2979
2980 #if defined(SIMDE_X86_SSE2_NATIVE)
2981 return _mm_loadr_pd(mem_addr);
2982 #else
2983 simde__m128d_private r_;
2984
2985 r_.f64[0] = mem_addr[1];
2986 r_.f64[1] = mem_addr[0];
2987
2988 return simde__m128d_from_private(r_);
2989 #endif
2990 }
2991 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2992 # define _mm_loadr_pd(mem_addr) simde_mm_loadr_pd(mem_addr)
2993 #endif
2994
2995 SIMDE_FUNCTION_ATTRIBUTES
2996 simde__m128d
simde_mm_loadu_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM (2)])2997 simde_mm_loadu_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) {
2998 #if defined(SIMDE_X86_SSE2_NATIVE)
2999 return _mm_loadu_pd(mem_addr);
3000 #else
3001 simde__m128d_private r_;
3002
3003 simde_memcpy(&r_, mem_addr, sizeof(r_));
3004
3005 return simde__m128d_from_private(r_);
3006 #endif
3007 }
3008 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3009 # define _mm_loadu_pd(mem_addr) simde_mm_loadu_pd(mem_addr)
3010 #endif
3011
/* SIMDe extension: unaligned load of sixteen int8_t. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_loadu_epi8(int8_t const* mem_addr) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_loadu_si128(SIMDE_ALIGN_CAST(simde__m128i const*, mem_addr));
#else
  simde__m128i_private r_;

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_i8 = vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr));
#else
  /* memcpy is the portable unaligned, aliasing-safe load. */
  simde_memcpy(&r_, mem_addr, sizeof(r_));
#endif

  return simde__m128i_from_private(r_);
#endif
}
3029
/* SIMDe extension: unaligned load of eight int16_t. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_loadu_epi16(int16_t const* mem_addr) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_loadu_si128(SIMDE_ALIGN_CAST(simde__m128i const*, mem_addr));
#else
  simde__m128i_private r_;

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_i16 = vld1q_s16(HEDLEY_REINTERPRET_CAST(int16_t const*, mem_addr));
#else
  /* memcpy is the portable unaligned, aliasing-safe load. */
  simde_memcpy(&r_, mem_addr, sizeof(r_));
#endif

  return simde__m128i_from_private(r_);
#endif
}
3047
/* SIMDe extension: unaligned load of four int32_t. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_loadu_epi32(int32_t const* mem_addr) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_loadu_si128(SIMDE_ALIGN_CAST(simde__m128i const*, mem_addr));
#else
  simde__m128i_private r_;

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr));
#else
  /* memcpy is the portable unaligned, aliasing-safe load. */
  simde_memcpy(&r_, mem_addr, sizeof(r_));
#endif

  return simde__m128i_from_private(r_);
#endif
}
3065
/* SIMDe extension: unaligned load of two int64_t. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_loadu_epi64(int64_t const* mem_addr) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_loadu_si128(SIMDE_ALIGN_CAST(simde__m128i const*, mem_addr));
#else
  simde__m128i_private r_;

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_i64 = vld1q_s64(HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr));
#else
  /* memcpy is the portable unaligned, aliasing-safe load. */
  simde_memcpy(&r_, mem_addr, sizeof(r_));
#endif

  return simde__m128i_from_private(r_);
#endif
}
3083
/* Load 128 bits from a possibly-unaligned address. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_loadu_si128 (void const* mem_addr) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_loadu_si128(HEDLEY_STATIC_CAST(__m128i const*, mem_addr));
#else
  simde__m128i_private r_;

#if HEDLEY_GNUC_HAS_ATTRIBUTE(may_alias,3,3,0)
  HEDLEY_DIAGNOSTIC_PUSH
  SIMDE_DIAGNOSTIC_DISABLE_PACKED_
  /* A packed, may_alias wrapper struct lets GCC/Clang emit an
   * unaligned vector load without violating strict aliasing. */
  struct simde_mm_loadu_si128_s {
    __typeof__(r_) v;
  } __attribute__((__packed__, __may_alias__));
  r_ = HEDLEY_REINTERPRET_CAST(const struct simde_mm_loadu_si128_s *, mem_addr)->v;
  HEDLEY_DIAGNOSTIC_POP
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* Note that this is a lower priority than the struct above since
   * clang assumes mem_addr is aligned (since it is a __m128i*). */
  r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr));
#else
  simde_memcpy(&r_, mem_addr, sizeof(r_));
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_loadu_si128(mem_addr) simde_mm_loadu_si128(mem_addr)
#endif
3114
3115 SIMDE_FUNCTION_ATTRIBUTES
3116 simde__m128i
simde_mm_madd_epi16(simde__m128i a,simde__m128i b)3117 simde_mm_madd_epi16 (simde__m128i a, simde__m128i b) {
3118 #if defined(SIMDE_X86_SSE2_NATIVE)
3119 return _mm_madd_epi16(a, b);
3120 #else
3121 simde__m128i_private
3122 r_,
3123 a_ = simde__m128i_to_private(a),
3124 b_ = simde__m128i_to_private(b);
3125
3126 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3127 int32x4_t pl = vmull_s16(vget_low_s16(a_.neon_i16), vget_low_s16(b_.neon_i16));
3128 int32x4_t ph = vmull_s16(vget_high_s16(a_.neon_i16), vget_high_s16(b_.neon_i16));
3129 int32x2_t rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
3130 int32x2_t rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
3131 r_.neon_i32 = vcombine_s32(rl, rh);
3132 #else
3133 SIMDE_VECTORIZE
3134 for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i += 2) {
3135 r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) + (a_.i16[i + 1] * b_.i16[i + 1]);
3136 }
3137 #endif
3138
3139 return simde__m128i_from_private(r_);
3140 #endif
3141 }
3142 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3143 # define _mm_madd_epi16(a, b) simde_mm_madd_epi16(a, b)
3144 #endif
3145
3146 SIMDE_FUNCTION_ATTRIBUTES
3147 void
simde_mm_maskmoveu_si128(simde__m128i a,simde__m128i mask,int8_t mem_addr[HEDLEY_ARRAY_PARAM (16)])3148 simde_mm_maskmoveu_si128 (simde__m128i a, simde__m128i mask, int8_t mem_addr[HEDLEY_ARRAY_PARAM(16)]) {
3149 #if defined(SIMDE_X86_SSE2_NATIVE)
3150 _mm_maskmoveu_si128(a, mask, HEDLEY_REINTERPRET_CAST(char*, mem_addr));
3151 #else
3152 simde__m128i_private
3153 a_ = simde__m128i_to_private(a),
3154 mask_ = simde__m128i_to_private(mask);
3155
3156 for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++) {
3157 if (mask_.u8[i] & 0x80) {
3158 mem_addr[i] = a_.i8[i];
3159 }
3160 }
3161 #endif
3162 }
3163 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3164 # define _mm_maskmoveu_si128(a, mask, mem_addr) simde_mm_maskmoveu_si128((a), (mask), SIMDE_CHECKED_REINTERPRET_CAST(int8_t*, char*, (mem_addr)))
3165 #endif
3166
/* Build a 16-bit mask from the sign bit of each byte of a
 * (bit i of the result = high bit of byte i). */
SIMDE_FUNCTION_ATTRIBUTES
int32_t
simde_mm_movemask_epi8 (simde__m128i a) {
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__INTEL_COMPILER)
  /* ICC has trouble with _mm_movemask_epi8 at -O2 and above: */
  return _mm_movemask_epi8(a);
#else
  int32_t r = 0;
  simde__m128i_private a_ = simde__m128i_to_private(a);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  // Use increasingly wide shifts+adds to collect the sign bits
  // together.
  // Since the widening shifts would be rather confusing to follow in little endian, everything
  // will be illustrated in big endian order instead. This has a different result - the bits
  // would actually be reversed on a big endian machine.

  // Starting input (only half the elements are shown):
  // 89 ff 1d c0 00 10 99 33
  uint8x16_t input = a_.neon_u8;

  // Shift out everything but the sign bits with an unsigned shift right.
  //
  // Bytes of the vector:
  // 89 ff 1d c0 00 10 99 33
  //  \  \  \  \  \  \  \  \   high_bits = (uint16x4_t)(input >> 7)
  //  |  |  |  |  |  |  |  |
  // 01 01 00 01 00 00 01 00
  //
  // Bits of first important lane(s):
  // 10001001 (89)
  // \______
  //        |
  // 00000001 (01)
  uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));

  // Merge the even lanes together with a 16-bit unsigned shift right + add.
  // 'xx' represents garbage data which will be ignored in the final result.
  // In the important bytes, the add functions like a binary OR.
  //
  // 01 01 00 01 00 00 01 00
  //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
  //    \|    \|    \|    \|
  // xx 03 xx 01 xx 00 xx 02
  //
  // 00000001 00000001 (01 01)
  //   \_______ |
  //           \|
  // xxxxxxxx xxxxxx11 (xx 03)
  uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));

  // Repeat with a wider 32-bit shift + add.
  // xx 03 xx 01 xx 00 xx 02
  //     \____ |     \____ |   paired32 = (uint64x1_t)(paired16 + (paired16 >> 14))
  //          \|          \|
  // xx xx xx 0d xx xx xx 02
  //
  // 00000011 00000001 (03 01)
  //        \\_____ ||
  //         '----.\||
  // xxxxxxxx xxxx1101 (xx 0d)
  uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));

  // Last, an even wider 64-bit shift + add to get our result in the low 8 bit lanes.
  // xx xx xx 0d xx xx xx 02
  //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >> 28))
  //                      \|
  // xx xx xx xx xx xx xx d2
  //
  // 00001101 00000010 (0d 02)
  //     \   \___ |  |
  //      '---.  \|  |
  // xxxxxxxx 11010010 (xx d2)
  uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));

  // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
  // xx xx xx xx xx xx xx d2
  //                      ||  return paired64[0]
  //                      d2
  // Note: Little endian would return the correct value 4b (01001011) instead.
  r = vgetq_lane_u8(paired64, 0) | (HEDLEY_STATIC_CAST(int32_t, vgetq_lane_u8(paired64, 8)) << 8);
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(HEDLEY_IBM_VERSION) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
  /* vbpermq gathers one selected bit per byte; the permute vector
   * picks each byte's sign bit (bit 120 - 8*i of the quadword). */
  static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) perm = { 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0 };
  r = HEDLEY_STATIC_CAST(int32_t, vec_extract(vec_vbpermq(a_.altivec_u8, perm), 1));
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(HEDLEY_IBM_VERSION) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG)
  static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) perm = { 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0 };
  r = HEDLEY_STATIC_CAST(int32_t, vec_extract(vec_vbpermq(a_.altivec_u8, perm), 14));
#else
  /* Portable path: OR each byte's sign bit into its bit position. */
  SIMDE_VECTORIZE_REDUCTION(|:r)
  for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) {
    r |= (a_.u8[15 - i] >> 7) << (15 - i);
  }
#endif

  return r;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_movemask_epi8(a) simde_mm_movemask_epi8(a)
#endif
3267
3268 SIMDE_FUNCTION_ATTRIBUTES
3269 int32_t
simde_mm_movemask_pd(simde__m128d a)3270 simde_mm_movemask_pd (simde__m128d a) {
3271 #if defined(SIMDE_X86_SSE2_NATIVE)
3272 return _mm_movemask_pd(a);
3273 #else
3274 int32_t r = 0;
3275 simde__m128d_private a_ = simde__m128d_to_private(a);
3276
3277 SIMDE_VECTORIZE
3278 for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
3279 r |= (a_.u64[i] >> 63) << i;
3280 }
3281
3282 return r;
3283 #endif
3284 }
3285 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3286 # define _mm_movemask_pd(a) simde_mm_movemask_pd(a)
3287 #endif
3288
3289 SIMDE_FUNCTION_ATTRIBUTES
3290 simde__m64
simde_mm_movepi64_pi64(simde__m128i a)3291 simde_mm_movepi64_pi64 (simde__m128i a) {
3292 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
3293 return _mm_movepi64_pi64(a);
3294 #else
3295 simde__m64_private r_;
3296 simde__m128i_private a_ = simde__m128i_to_private(a);
3297
3298 r_.i64[0] = a_.i64[0];
3299
3300 return simde__m64_from_private(r_);
3301 #endif
3302 }
3303 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3304 # define _mm_movepi64_pi64(a) simde_mm_movepi64_pi64(a)
3305 #endif
3306
3307 SIMDE_FUNCTION_ATTRIBUTES
3308 simde__m128i
simde_mm_movpi64_epi64(simde__m64 a)3309 simde_mm_movpi64_epi64 (simde__m64 a) {
3310 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
3311 return _mm_movpi64_epi64(a);
3312 #else
3313 simde__m128i_private r_;
3314 simde__m64_private a_ = simde__m64_to_private(a);
3315
3316 r_.i64[0] = a_.i64[0];
3317 r_.i64[1] = 0;
3318
3319 return simde__m128i_from_private(r_);
3320 #endif
3321 }
3322 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3323 # define _mm_movpi64_epi64(a) simde_mm_movpi64_epi64(a)
3324 #endif
3325
/* Lane-wise minimum of signed 16-bit integers. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_min_epi16 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_min_epi16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_i16 = vminq_s16(a_.neon_i16, b_.neon_i16);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  r_.altivec_i16 = vec_min(a_.altivec_i16, b_.altivec_i16);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
    r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i];
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_min_epi16(a, b) simde_mm_min_epi16(a, b)
#endif
3354
/* Lane-wise minimum of unsigned 8-bit integers. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_min_epu8 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_min_epu8(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_u8 = vminq_u8(a_.neon_u8, b_.neon_u8);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  r_.altivec_u8 = vec_min(a_.altivec_u8, b_.altivec_u8);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
    r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i];
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_min_epu8(a, b) simde_mm_min_epu8(a, b)
#endif
3383
/* Lane-wise minimum of doubles.  On ties or NaN the ternary selects b,
 * matching the native minpd operand order. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_min_pd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_min_pd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

#if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
  /* NOTE(review): this guard is P8 while simde_mm_max_pd uses P7 for
   * the analogous vec_max -- confirm which minimum ISA level is
   * intended for f64 vec_min. */
  r_.altivec_f64 = vec_min(a_.altivec_f64, b_.altivec_f64);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
    r_.f64[i] = (a_.f64[i] < b_.f64[i]) ? a_.f64[i] : b_.f64[i];
  }
#endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_min_pd(a, b) simde_mm_min_pd(a, b)
#endif
3410
3411 SIMDE_FUNCTION_ATTRIBUTES
3412 simde__m128d
simde_mm_min_sd(simde__m128d a,simde__m128d b)3413 simde_mm_min_sd (simde__m128d a, simde__m128d b) {
3414 #if defined(SIMDE_X86_SSE2_NATIVE)
3415 return _mm_min_sd(a, b);
3416 #elif defined(SIMDE_ASSUME_VECTORIZATION)
3417 return simde_mm_move_sd(a, simde_mm_min_pd(a, b));
3418 #else
3419 simde__m128d_private
3420 r_,
3421 a_ = simde__m128d_to_private(a),
3422 b_ = simde__m128d_to_private(b);
3423
3424 r_.f64[0] = (a_.f64[0] < b_.f64[0]) ? a_.f64[0] : b_.f64[0];
3425 r_.f64[1] = a_.f64[1];
3426
3427 return simde__m128d_from_private(r_);
3428 #endif
3429 }
3430 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3431 # define _mm_min_sd(a, b) simde_mm_min_sd(a, b)
3432 #endif
3433
/* Lane-wise maximum of signed 16-bit integers. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_max_epi16 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_max_epi16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_i16 = vmaxq_s16(a_.neon_i16, b_.neon_i16);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  r_.altivec_i16 = vec_max(a_.altivec_i16, b_.altivec_i16);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
    r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? a_.i16[i] : b_.i16[i];
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_max_epi16(a, b) simde_mm_max_epi16(a, b)
#endif
3462
/* Lane-wise maximum of unsigned 8-bit integers. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_max_epu8 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_max_epu8(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_u8 = vmaxq_u8(a_.neon_u8, b_.neon_u8);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  r_.altivec_u8 = vec_max(a_.altivec_u8, b_.altivec_u8);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
    r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i];
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_max_epu8(a, b) simde_mm_max_epu8(a, b)
#endif
3491
/* Lane-wise maximum of doubles.  On ties or NaN the ternary selects b,
 * matching the native maxpd operand order. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_max_pd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_max_pd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
  r_.altivec_f64 = vec_max(a_.altivec_f64, b_.altivec_f64);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
    r_.f64[i] = (a_.f64[i] > b_.f64[i]) ? a_.f64[i] : b_.f64[i];
  }
#endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_max_pd(a, b) simde_mm_max_pd(a, b)
#endif
3518
3519 SIMDE_FUNCTION_ATTRIBUTES
3520 simde__m128d
simde_mm_max_sd(simde__m128d a,simde__m128d b)3521 simde_mm_max_sd (simde__m128d a, simde__m128d b) {
3522 #if defined(SIMDE_X86_SSE2_NATIVE)
3523 return _mm_max_sd(a, b);
3524 #elif defined(SIMDE_ASSUME_VECTORIZATION)
3525 return simde_mm_move_sd(a, simde_mm_max_pd(a, b));
3526 #else
3527 simde__m128d_private
3528 r_,
3529 a_ = simde__m128d_to_private(a),
3530 b_ = simde__m128d_to_private(b);
3531
3532 r_.f64[0] = (a_.f64[0] > b_.f64[0]) ? a_.f64[0] : b_.f64[0];
3533 r_.f64[1] = a_.f64[1];
3534
3535 return simde__m128d_from_private(r_);
3536 #endif
3537 }
3538 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3539 # define _mm_max_sd(a, b) simde_mm_max_sd(a, b)
3540 #endif
3541
/* Copy the low 64-bit lane of a and zero the high lane. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_move_epi64 (simde__m128i a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_move_epi64(a);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* Insert zero into lane 1 of the two-lane i64 vector. */
  r_.neon_i64 = vsetq_lane_s64(0, a_.neon_i64, 1);
#else
  r_.i64[0] = a_.i64[0];
  r_.i64[1] = 0;
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_move_epi64(a) simde_mm_move_epi64(a)
#endif
3565
/* Multiply the even-indexed unsigned 32-bit lanes of a and b (u32[0]
 * and u32[2]), producing two full 64-bit products. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_mul_epu32 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_mul_epu32(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* vmovn_u64 keeps the low 32 bits of each 64-bit lane, i.e. exactly
   * the even u32 elements; vmull then widens the products to 64 bits. */
  uint32x2_t a_lo = vmovn_u64(a_.neon_u64);
  uint32x2_t b_lo = vmovn_u64(b_.neon_u64);
  r_.neon_u64 = vmull_u32(a_lo, b_lo);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
    /* Widen before multiplying so the product cannot truncate. */
    r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[i * 2]) * HEDLEY_STATIC_CAST(uint64_t, b_.u32[i * 2]);
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_mul_epu32(a, b) simde_mm_mul_epu32(a, b)
#endif
3594
3595 SIMDE_FUNCTION_ATTRIBUTES
3596 simde__m128i
simde_x_mm_mul_epi64(simde__m128i a,simde__m128i b)3597 simde_x_mm_mul_epi64 (simde__m128i a, simde__m128i b) {
3598 simde__m128i_private
3599 r_,
3600 a_ = simde__m128i_to_private(a),
3601 b_ = simde__m128i_to_private(b);
3602
3603 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3604 r_.i64 = a_.i64 * b_.i64;
3605 #else
3606 SIMDE_VECTORIZE
3607 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
3608 r_.i64[i] = a_.i64[i] * b_.i64[i];
3609 }
3610 #endif
3611
3612 return simde__m128i_from_private(r_);
3613 }
3614
3615 SIMDE_FUNCTION_ATTRIBUTES
3616 simde__m128i
simde_x_mm_mod_epi64(simde__m128i a,simde__m128i b)3617 simde_x_mm_mod_epi64 (simde__m128i a, simde__m128i b) {
3618 simde__m128i_private
3619 r_,
3620 a_ = simde__m128i_to_private(a),
3621 b_ = simde__m128i_to_private(b);
3622
3623 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3624 r_.i64 = a_.i64 % b_.i64;
3625 #else
3626 SIMDE_VECTORIZE
3627 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
3628 r_.i64[i] = a_.i64[i] % b_.i64[i];
3629 }
3630 #endif
3631
3632 return simde__m128i_from_private(r_);
3633 }
3634
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
/* Lane-wise double-precision multiply. */
simde_mm_mul_pd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_mul_pd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.f64 = a_.f64 * b_.f64;
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_f64x2_mul(a_.wasm_v128, b_.wasm_v128);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
      r_.f64[i] = a_.f64[i] * b_.f64[i];
    }
  #endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_mul_pd(a, b) simde_mm_mul_pd(a, b)
#endif
3663
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
/* Scalar double multiply: lane 0 = a[0] * b[0], lane 1 copied from a. */
simde_mm_mul_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_mul_sd(a, b);
#elif defined(SIMDE_ASSUME_VECTORIZATION)
  /* Reuse the packed implementation, then keep only lane 0 of the result. */
  return simde_mm_move_sd(a, simde_mm_mul_pd(a, b));
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  r_.f64[0] = a_.f64[0] * b_.f64[0];
  r_.f64[1] = a_.f64[1];

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_mul_sd(a, b) simde_mm_mul_sd(a, b)
#endif
3686
SIMDE_FUNCTION_ATTRIBUTES
simde__m64
/* PMULUDQ on MMX registers: full 64-bit unsigned product of the low
 * 32-bit elements of a and b.  The __PGI guard works around a PGI
 * compiler issue with the native intrinsic. */
simde_mm_mul_su32 (simde__m64 a, simde__m64 b) {
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
  return _mm_mul_su32(a, b);
#else
  simde__m64_private
    r_,
    a_ = simde__m64_to_private(a),
    b_ = simde__m64_to_private(b);

  r_.u64[0] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[0]) * HEDLEY_STATIC_CAST(uint64_t, b_.u32[0]);

  return simde__m64_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_mul_su32(a, b) simde_mm_mul_su32(a, b)
#endif
3706
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* High half of the signed 16x16 -> 32-bit product, per lane (PMULHW). */
simde_mm_mulhi_epi16 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_mulhi_epi16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* Widening multiply both halves to 32-bit products, then vuzpq
     * de-interleaves 16-bit halves; val[1] collects the odd (upper)
     * 16 bits of every product. */
    int16x4_t a3210 = vget_low_s16(a_.neon_i16);
    int16x4_t b3210 = vget_low_s16(b_.neon_i16);
    int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
    int16x4_t a7654 = vget_high_s16(a_.neon_i16);
    int16x4_t b7654 = vget_high_s16(b_.neon_i16);
    int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
    uint16x8x2_t rv = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
    r_.neon_u16 = rv.val[1];
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
      /* Shift as unsigned: right-shifting a negative signed value is
       * implementation-defined, so the product is reinterpreted as
       * uint32_t before the >> 16. */
      r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (HEDLEY_STATIC_CAST(uint32_t, HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) >> 16));
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_mulhi_epi16(a, b) simde_mm_mulhi_epi16(a, b)
#endif
3740
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* High half of the unsigned 16x16 -> 32-bit product, per lane (PMULHUW).
 * The __PGI guard works around a PGI compiler issue with the native
 * intrinsic. */
simde_mm_mulhi_epu16 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
  return _mm_mulhi_epu16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* Same de-interleave trick as mulhi_epi16: vuzpq's val[1] holds the
     * upper 16 bits of each widened product. */
    uint16x4_t a3210 = vget_low_u16(a_.neon_u16);
    uint16x4_t b3210 = vget_low_u16(b_.neon_u16);
    uint32x4_t ab3210 = vmull_u16(a3210, b3210); /* 3333222211110000 */
    uint16x4_t a7654 = vget_high_u16(a_.neon_u16);
    uint16x4_t b7654 = vget_high_u16(b_.neon_u16);
    uint32x4_t ab7654 = vmull_u16(a7654, b7654); /* 7777666655554444 */
    uint16x8x2_t neon_r =
      vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
    r_.neon_u16 = neon_r.val[1];
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
      r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]) >> 16);
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_mulhi_epu16(a, b) simde_mm_mulhi_epu16(a, b)
#endif
3775
3776 SIMDE_FUNCTION_ATTRIBUTES
3777 simde__m128i
simde_mm_mullo_epi16(simde__m128i a,simde__m128i b)3778 simde_mm_mullo_epi16 (simde__m128i a, simde__m128i b) {
3779 #if defined(SIMDE_X86_SSE2_NATIVE)
3780 return _mm_mullo_epi16(a, b);
3781 #else
3782 simde__m128i_private
3783 r_,
3784 a_ = simde__m128i_to_private(a),
3785 b_ = simde__m128i_to_private(b);
3786
3787 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3788 r_.neon_i16 = vmulq_s16(a_.neon_i16, b_.neon_i16);
3789 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
3790 (void) a_;
3791 (void) b_;
3792 r_.altivec_i16 = vec_mul(a_.altivec_i16, b_.altivec_i16);
3793 #else
3794 SIMDE_VECTORIZE
3795 for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
3796 r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]));
3797 }
3798 #endif
3799
3800 return simde__m128i_from_private(r_);
3801 #endif
3802 }
3803 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3804 # define _mm_mullo_epi16(a, b) simde_mm_mullo_epi16(a, b)
3805 #endif
3806
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
/* Bitwise OR of the raw bits of two double vectors (ORPD).  The
 * fallback operates on the i32f integer view of the same storage. */
simde_mm_or_pd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_or_pd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.i32f = a_.i32f | b_.i32f;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
      r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
    }
  #endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_or_pd(a, b) simde_mm_or_pd(a, b)
#endif
3833
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Bitwise OR of two 128-bit integer vectors (POR). */
simde_mm_or_si128 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_or_si128(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i32 = vorrq_s32(a_.neon_i32, b_.neon_i32);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_i32 = vec_or(a_.altivec_i32, b_.altivec_i32);
  #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.i32f = a_.i32f | b_.i32f;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
      r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_or_si128(a, b) simde_mm_or_si128(a, b)
#endif
3864
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Pack 16 signed words into 16 signed bytes with saturation (PACKSSWB):
 * a fills the low 8 bytes, b the high 8. */
simde_mm_packs_epi16 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_packs_epi16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* vqmovn performs the saturating narrow for each half. */
    r_.neon_i8 = vcombine_s8(vqmovn_s16(a_.neon_i16), vqmovn_s16(b_.neon_i16));
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
      r_.i8[i]     = (a_.i16[i] > INT8_MAX) ? INT8_MAX : ((a_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, a_.i16[i]));
      r_.i8[i + 8] = (b_.i16[i] > INT8_MAX) ? INT8_MAX : ((b_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, b_.i16[i]));
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_packs_epi16(a, b) simde_mm_packs_epi16(a, b)
#endif
3892
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Pack 8 signed dwords into 8 signed words with saturation (PACKSSDW):
 * a fills the low 4 words, b the high 4. */
simde_mm_packs_epi32 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_packs_epi32(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i16 = vcombine_s16(vqmovn_s32(a_.neon_i32), vqmovn_s32(b_.neon_i32));
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_i16 = vec_packs(a_.altivec_i32, b_.altivec_i32);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.i16[i]     = (a_.i32[i] > INT16_MAX) ? INT16_MAX : ((a_.i32[i] < INT16_MIN) ? INT16_MIN : HEDLEY_STATIC_CAST(int16_t, a_.i32[i]));
      r_.i16[i + 4] = (b_.i32[i] > INT16_MAX) ? INT16_MAX : ((b_.i32[i] < INT16_MIN) ? INT16_MIN : HEDLEY_STATIC_CAST(int16_t, b_.i32[i]));
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_packs_epi32(a, b) simde_mm_packs_epi32(a, b)
#endif
3922
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Pack 16 signed words into 16 unsigned bytes with saturation (PACKUSWB):
 * negative inputs clamp to 0, values above 255 clamp to 255. */
simde_mm_packus_epi16 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_packus_epi16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* vqmovun: signed-to-unsigned saturating narrow. */
    r_.neon_u8 = vcombine_u8(vqmovun_s16(a_.neon_i16), vqmovun_s16(b_.neon_i16));
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_u8 = vec_packsu(a_.altivec_i16, b_.altivec_i16);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
      r_.u8[i]     = (a_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((a_.i16[i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, a_.i16[i]));
      r_.u8[i + 8] = (b_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((b_.i16[i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, b_.i16[i]));
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_packus_epi16(a, b) simde_mm_packus_epi16(a, b)
#endif
3952
SIMDE_FUNCTION_ATTRIBUTES
void
/* Spin-loop hint (PAUSE).  On non-x86 targets this is a no-op, which is
 * a correct (if less efficient) implementation of the hint. */
simde_mm_pause (void) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_pause();
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_pause() (simde_mm_pause())
#endif
3963
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Sum of absolute differences (PSADBW): for each 8-byte half, sum the
 * |a[i] - b[i]| of the 8 byte pairs into the corresponding 64-bit lane. */
simde_mm_sad_epu8 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_sad_epu8(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* vabdq+vpaddlq gives 8 pairwise 16-bit difference sums; the t[i]
     * subscripts rely on the GCC/clang vector-subscript extension. */
    uint16x8_t t = vpaddlq_u8(vabdq_u8(a_.neon_u8, b_.neon_u8));
    uint16_t r0 = t[0] + t[1] + t[2] + t[3];
    uint16_t r4 = t[4] + t[5] + t[6] + t[7];
    /* Place the two sums in lanes 0 and 4 (the low u16 of each u64). */
    uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
    r_.neon_u16 = vsetq_lane_u16(r4, r, 4);
  #else
    for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
      uint16_t tmp = 0;
      SIMDE_VECTORIZE_REDUCTION(+:tmp)
      for (size_t j = 0 ; j < ((sizeof(r_.u8) / sizeof(r_.u8[0])) / 2) ; j++) {
        const size_t e = j + (i * 8);
        /* Branchy abs-diff avoids unsigned underflow. */
        tmp += (a_.u8[e] > b_.u8[e]) ? (a_.u8[e] - b_.u8[e]) : (b_.u8[e] - a_.u8[e]);
      }
      r_.i64[i] = tmp;
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_sad_epu8(a, b) simde_mm_sad_epu8(a, b)
#endif
3999
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Build a vector of 16 signed bytes.  Arguments follow Intel's
 * convention: e15 is the most-significant (highest-index) lane, e0 the
 * least-significant; the storage order below is therefore reversed. */
simde_mm_set_epi8 (int8_t e15, int8_t e14, int8_t e13, int8_t e12,
                   int8_t e11, int8_t e10, int8_t  e9, int8_t  e8,
                   int8_t  e7, int8_t  e6, int8_t  e5, int8_t  e4,
                   int8_t  e3, int8_t  e2, int8_t  e1, int8_t  e0) {

#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_set_epi8(
    e15, e14, e13, e12, e11, e10,  e9,  e8,
     e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0);
#else
  simde__m128i_private r_;

  #if defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_i8x16_make(
       e0,  e1,  e2,  e3,  e4,  e5,  e6,  e7,
       e8,  e9, e10, e11, e12, e13, e14, e15);
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    SIMDE_ALIGN_AS(16, int8x16_t) int8_t data[16] = {
      e0,  e1,  e2,  e3,
      e4,  e5,  e6,  e7,
      e8,  e9,  e10, e11,
      e12, e13, e14, e15};
    r_.neon_i8 = vld1q_s8(data);
  #else
    r_.i8[ 0] =  e0;
    r_.i8[ 1] =  e1;
    r_.i8[ 2] =  e2;
    r_.i8[ 3] =  e3;
    r_.i8[ 4] =  e4;
    r_.i8[ 5] =  e5;
    r_.i8[ 6] =  e6;
    r_.i8[ 7] =  e7;
    r_.i8[ 8] =  e8;
    r_.i8[ 9] =  e9;
    r_.i8[10] = e10;
    r_.i8[11] = e11;
    r_.i8[12] = e12;
    r_.i8[13] = e13;
    r_.i8[14] = e14;
    r_.i8[15] = e15;
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)
#endif
4050
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Build a vector of 8 signed words; e7 is the most-significant lane. */
simde_mm_set_epi16 (int16_t e7, int16_t e6, int16_t e5, int16_t e4,
                    int16_t e3, int16_t e2, int16_t e1, int16_t e0) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
#else
  simde__m128i_private r_;

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    SIMDE_ALIGN_AS(16, int16x8_t) int16_t data[8] = { e0, e1, e2, e3, e4, e5, e6, e7 };
    r_.neon_i16 = vld1q_s16(data);
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_i16x8_make(e0, e1, e2, e3, e4, e5, e6, e7);
  #else
    r_.i16[0] = e0;
    r_.i16[1] = e1;
    r_.i16[2] = e2;
    r_.i16[3] = e3;
    r_.i16[4] = e4;
    r_.i16[5] = e5;
    r_.i16[6] = e6;
    r_.i16[7] = e7;
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0)
#endif
4082
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Build a vector of 4 signed dwords; e3 is the most-significant lane. */
simde_mm_set_epi32 (int32_t e3, int32_t e2, int32_t e1, int32_t e0) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_set_epi32(e3, e2, e1, e0);
#else
  simde__m128i_private r_;

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    SIMDE_ALIGN_AS(16, int32x4_t) int32_t data[4] = { e0, e1, e2, e3 };
    r_.neon_i32 = vld1q_s32(data);
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_i32x4_make(e0, e1, e2, e3);
  #else
    r_.i32[0] = e0;
    r_.i32[1] = e1;
    r_.i32[2] = e2;
    r_.i32[3] = e3;
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_set_epi32(e3, e2, e1, e0) simde_mm_set_epi32(e3, e2, e1, e0)
#endif
4109
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Build a 128-bit vector from two __m64 halves; e1 is the high half. */
simde_mm_set_epi64 (simde__m64 e1, simde__m64 e0) {
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  return _mm_set_epi64(e1, e0);
#else
  simde__m128i_private r_;

  r_.m64_private[0] = simde__m64_to_private(e0);
  r_.m64_private[1] = simde__m64_to_private(e1);

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_set_epi64(e1, e0) (simde_mm_set_epi64((e1), (e0)))
#endif
4127
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Build a vector of 2 signed qwords; e1 is the high lane.  The MSVC
 * version check excludes old compilers lacking _mm_set_epi64x. */
simde_mm_set_epi64x (int64_t e1, int64_t e0) {
#if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0))
  return _mm_set_epi64x(e1, e0);
#else
  simde__m128i_private r_;

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    SIMDE_ALIGN_AS(16, int64x2_t) int64_t data[2] = {e0, e1};
    r_.neon_i64 = vld1q_s64(data);
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_i64x2_make(e0, e1);
  #else
    r_.i64[0] = e0;
    r_.i64[1] = e1;
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_set_epi64x(e1, e0) simde_mm_set_epi64x(e1, e0)
#endif
4152
4153 SIMDE_FUNCTION_ATTRIBUTES
4154 simde__m128i
simde_x_mm_set_epu8(uint8_t e15,uint8_t e14,uint8_t e13,uint8_t e12,uint8_t e11,uint8_t e10,uint8_t e9,uint8_t e8,uint8_t e7,uint8_t e6,uint8_t e5,uint8_t e4,uint8_t e3,uint8_t e2,uint8_t e1,uint8_t e0)4155 simde_x_mm_set_epu8 (uint8_t e15, uint8_t e14, uint8_t e13, uint8_t e12,
4156 uint8_t e11, uint8_t e10, uint8_t e9, uint8_t e8,
4157 uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4,
4158 uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0) {
4159 #if defined(SIMDE_X86_SSE2_NATIVE)
4160 return _mm_set_epi8(
4161 HEDLEY_STATIC_CAST(char, e15), HEDLEY_STATIC_CAST(char, e14), HEDLEY_STATIC_CAST(char, e13), HEDLEY_STATIC_CAST(char, e12),
4162 HEDLEY_STATIC_CAST(char, e11), HEDLEY_STATIC_CAST(char, e10), HEDLEY_STATIC_CAST(char, e9), HEDLEY_STATIC_CAST(char, e8),
4163 HEDLEY_STATIC_CAST(char, e7), HEDLEY_STATIC_CAST(char, e6), HEDLEY_STATIC_CAST(char, e5), HEDLEY_STATIC_CAST(char, e4),
4164 HEDLEY_STATIC_CAST(char, e3), HEDLEY_STATIC_CAST(char, e2), HEDLEY_STATIC_CAST(char, e1), HEDLEY_STATIC_CAST(char, e0));
4165 #else
4166 simde__m128i_private r_;
4167
4168 r_.u8[ 0] = e0; r_.u8[ 1] = e1; r_.u8[ 2] = e2; r_.u8[ 3] = e3;
4169 r_.u8[ 4] = e4; r_.u8[ 5] = e5; r_.u8[ 6] = e6; r_.u8[ 7] = e7;
4170 r_.u8[ 8] = e8; r_.u8[ 9] = e9; r_.u8[10] = e10; r_.u8[11] = e11;
4171 r_.u8[12] = e12; r_.u8[13] = e13; r_.u8[14] = e14; r_.u8[15] = e15;
4172
4173 return simde__m128i_from_private(r_);
4174 #endif
4175 }
4176
4177 SIMDE_FUNCTION_ATTRIBUTES
4178 simde__m128i
simde_x_mm_set_epu16(uint16_t e7,uint16_t e6,uint16_t e5,uint16_t e4,uint16_t e3,uint16_t e2,uint16_t e1,uint16_t e0)4179 simde_x_mm_set_epu16 (uint16_t e7, uint16_t e6, uint16_t e5, uint16_t e4,
4180 uint16_t e3, uint16_t e2, uint16_t e1, uint16_t e0) {
4181 #if defined(SIMDE_X86_SSE2_NATIVE)
4182 return _mm_set_epi16(
4183 HEDLEY_STATIC_CAST(short, e7), HEDLEY_STATIC_CAST(short, e6), HEDLEY_STATIC_CAST(short, e5), HEDLEY_STATIC_CAST(short, e4),
4184 HEDLEY_STATIC_CAST(short, e3), HEDLEY_STATIC_CAST(short, e2), HEDLEY_STATIC_CAST(short, e1), HEDLEY_STATIC_CAST(short, e0));
4185 #else
4186 simde__m128i_private r_;
4187
4188 r_.u16[0] = e0; r_.u16[1] = e1; r_.u16[2] = e2; r_.u16[3] = e3;
4189 r_.u16[4] = e4; r_.u16[5] = e5; r_.u16[6] = e6; r_.u16[7] = e7;
4190
4191 return simde__m128i_from_private(r_);
4192 #endif
4193 }
4194
4195 SIMDE_FUNCTION_ATTRIBUTES
4196 simde__m128i
simde_x_mm_set_epu32(uint32_t e3,uint32_t e2,uint32_t e1,uint32_t e0)4197 simde_x_mm_set_epu32 (uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) {
4198 #if defined(SIMDE_X86_SSE2_NATIVE)
4199 return _mm_set_epi32(
4200 HEDLEY_STATIC_CAST(int, e3), HEDLEY_STATIC_CAST(int, e2), HEDLEY_STATIC_CAST(int, e1), HEDLEY_STATIC_CAST(int, e0));
4201 #else
4202 simde__m128i_private r_;
4203
4204 r_.u32[0] = e0;
4205 r_.u32[1] = e1;
4206 r_.u32[2] = e2;
4207 r_.u32[3] = e3;
4208
4209 return simde__m128i_from_private(r_);
4210 #endif
4211 }
4212
4213 SIMDE_FUNCTION_ATTRIBUTES
4214 simde__m128i
simde_x_mm_set_epu64x(uint64_t e1,uint64_t e0)4215 simde_x_mm_set_epu64x (uint64_t e1, uint64_t e0) {
4216 #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0))
4217 return _mm_set_epi64x(HEDLEY_STATIC_CAST(int64_t, e1), HEDLEY_STATIC_CAST(int64_t, e0));
4218 #else
4219 simde__m128i_private r_;
4220
4221 r_.u64[0] = e0;
4222 r_.u64[1] = e1;
4223
4224 return simde__m128i_from_private(r_);
4225 #endif
4226 }
4227
4228 SIMDE_FUNCTION_ATTRIBUTES
4229 simde__m128d
simde_mm_set_pd(simde_float64 e1,simde_float64 e0)4230 simde_mm_set_pd (simde_float64 e1, simde_float64 e0) {
4231 #if defined(SIMDE_X86_SSE2_NATIVE)
4232 return _mm_set_pd(e1, e0);
4233 #else
4234 simde__m128d_private r_;
4235
4236 #if defined(SIMDE_WASM_SIMD128_NATIVE)
4237 r_.wasm_v128 = wasm_f64x2_make(e0, e1);
4238 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4239 r_.wasm_v128 = wasm_f64x2_make(e0, e1);
4240 #else
4241 r_.f64[0] = e0;
4242 r_.f64[1] = e1;
4243 #endif
4244
4245 return simde__m128d_from_private(r_);
4246 #endif
4247 }
4248 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4249 # define _mm_set_pd(e1, e0) simde_mm_set_pd(e1, e0)
4250 #endif
4251
4252 SIMDE_FUNCTION_ATTRIBUTES
4253 simde__m128d
simde_mm_set_pd1(simde_float64 a)4254 simde_mm_set_pd1 (simde_float64 a) {
4255 #if defined(SIMDE_X86_SSE2_NATIVE)
4256 return _mm_set1_pd(a);
4257 #else
4258 simde__m128d_private r_;
4259
4260 r_.f64[0] = a;
4261 r_.f64[1] = a;
4262
4263 return simde__m128d_from_private(r_);
4264 #endif
4265 }
4266 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4267 # define _mm_set_pd1(a) simde_mm_set1_pd(a)
4268 #endif
4269
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
/* Set lane 0 to a and zero the upper lane. */
simde_mm_set_sd (simde_float64 a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_set_sd(a);
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  return vsetq_lane_f64(a, vdupq_n_f64(SIMDE_FLOAT64_C(0.0)), 0);
#else
  /* set_pd takes (high, low), so a lands in lane 0. */
  return simde_mm_set_pd(SIMDE_FLOAT64_C(0.0), a);

#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_set_sd(a) simde_mm_set_sd(a)
#endif
4285
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Broadcast a signed byte to all 16 lanes. */
simde_mm_set1_epi8 (int8_t a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_set1_epi8(a);
#else
  simde__m128i_private r_;

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i8 = vdupq_n_s8(a);
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_i8x16_splat(a);
  #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    r_.altivec_i8 = vec_splats(HEDLEY_STATIC_CAST(signed char, a));
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
      r_.i8[i] = a;
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_set1_epi8(a) simde_mm_set1_epi8(a)
#endif
4313
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Broadcast a signed word to all 8 lanes. */
simde_mm_set1_epi16 (int16_t a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_set1_epi16(a);
#else
  simde__m128i_private r_;

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i16 = vdupq_n_s16(a);
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_i16x8_splat(a);
  #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    r_.altivec_i16 = vec_splats(HEDLEY_STATIC_CAST(signed short, a));
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
      r_.i16[i] = a;
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_set1_epi16(a) simde_mm_set1_epi16(a)
#endif
4341
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Broadcast a signed dword to all 4 lanes. */
simde_mm_set1_epi32 (int32_t a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_set1_epi32(a);
#else
  simde__m128i_private r_;

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i32 = vdupq_n_s32(a);
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_i32x4_splat(a);
  #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    r_.altivec_i32 = vec_splats(HEDLEY_STATIC_CAST(signed int, a));
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.i32[i] = a;
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_set1_epi32(a) simde_mm_set1_epi32(a)
#endif
4369
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Broadcast a signed qword to both lanes.  The MSVC version check
 * excludes old compilers lacking _mm_set1_epi64x. */
simde_mm_set1_epi64x (int64_t a) {
#if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0))
  return _mm_set1_epi64x(a);
#else
  simde__m128i_private r_;

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i64 = vmovq_n_s64(a);
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_i64x2_splat(a);
  #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    r_.altivec_i64 = vec_splats(HEDLEY_STATIC_CAST(signed long long, a));
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
      r_.i64[i] = a;
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_set1_epi64x(a) simde_mm_set1_epi64x(a)
#endif
4397
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Broadcast the 64-bit value of an __m64 to both lanes. */
simde_mm_set1_epi64 (simde__m64 a) {
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  return _mm_set1_epi64(a);
#else
  simde__m64_private a_ = simde__m64_to_private(a);
  return simde_mm_set1_epi64x(a_.i64[0]);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_set1_epi64(a) simde_mm_set1_epi64(a)
#endif
4411
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* SIMDe extension: broadcast an unsigned byte to all 16 lanes; reuses
 * the signed splat (identical bit pattern) except on POWER. */
simde_x_mm_set1_epu8 (uint8_t value) {
#if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
  return simde__m128i_from_altivec_u8(vec_splats(HEDLEY_STATIC_CAST(unsigned char, value)));
#else
  return simde_mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, value));
#endif
}
4421
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* SIMDe extension: broadcast an unsigned word to all 8 lanes. */
simde_x_mm_set1_epu16 (uint16_t value) {
#if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
  return simde__m128i_from_altivec_u16(vec_splats(HEDLEY_STATIC_CAST(unsigned short, value)));
#else
  return simde_mm_set1_epi16(HEDLEY_STATIC_CAST(int16_t, value));
#endif
}
4431
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* SIMDe extension: broadcast an unsigned dword to all 4 lanes. */
simde_x_mm_set1_epu32 (uint32_t value) {
#if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
  return simde__m128i_from_altivec_u32(vec_splats(HEDLEY_STATIC_CAST(unsigned int, value)));
#else
  return simde_mm_set1_epi32(HEDLEY_STATIC_CAST(int32_t, value));
#endif
}
4441
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* SIMDe extension: broadcast an unsigned qword to both lanes. */
simde_x_mm_set1_epu64 (uint64_t value) {
#if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
  return simde__m128i_from_altivec_u64(vec_splats(HEDLEY_STATIC_CAST(unsigned long long, value)));
#else
  return simde_mm_set1_epi64x(HEDLEY_STATIC_CAST(int64_t, value));
#endif
}
4451
4452 SIMDE_FUNCTION_ATTRIBUTES
4453 simde__m128d
simde_mm_set1_pd(simde_float64 a)4454 simde_mm_set1_pd (simde_float64 a) {
4455 #if defined(SIMDE_X86_SSE2_NATIVE)
4456 return _mm_set1_pd(a);
4457 #else
4458 simde__m128d_private r_;
4459
4460 #if defined(SIMDE_WASM_SIMD128_NATIVE)
4461 r_.wasm_v128 = wasm_f64x2_splat(a);
4462 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4463 r_.altivec_f64 = vec_splats(HEDLEY_STATIC_CAST(double, a));
4464 #else
4465 SIMDE_VECTORIZE
4466 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
4467 r_.f64[i] = a;
4468 }
4469 #endif
4470
4471 return simde__m128d_from_private(r_);
4472 #endif
4473 }
4474 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4475 # define _mm_set1_pd(a) simde_mm_set1_pd(a)
4476 #endif
4477
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
/* Reversed-order set: the first argument becomes the least-significant
 * lane.  Implemented by forwarding to set_epi8 with arguments flipped. */
simde_mm_setr_epi8 (int8_t e15, int8_t e14, int8_t e13, int8_t e12,
                    int8_t e11, int8_t e10, int8_t  e9, int8_t  e8,
                    int8_t  e7, int8_t  e6, int8_t  e5, int8_t  e4,
                    int8_t  e3, int8_t  e2, int8_t  e1, int8_t  e0) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_setr_epi8(
    e15, e14, e13, e12, e11, e10,  e9,    e8,
    e7,    e6,  e5,  e4,  e3,  e2,  e1,  e0);
#else
  return simde_mm_set_epi8(
    e0, e1, e2, e3, e4, e5, e6, e7,
    e8, e9, e10, e11, e12, e13, e14, e15);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)
#endif
4497
/* Set 8 int16 lanes in reversed (memory) order; e7 lands in lane 0.
 * Portable path delegates to simde_mm_set_epi16 with arguments flipped. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_setr_epi16 (int16_t e7, int16_t e6, int16_t e5, int16_t e4,
    int16_t e3, int16_t e2, int16_t e1, int16_t e0) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
  #else
    return simde_mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0)
#endif
4511
/* Set 4 int32 lanes in reversed (memory) order; e3 lands in lane 0. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_setr_epi32 (int32_t e3, int32_t e2, int32_t e1, int32_t e0) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_setr_epi32(e3, e2, e1, e0);
  #else
    return simde_mm_set_epi32(e0, e1, e2, e3);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_setr_epi32(e3, e2, e1, e0) simde_mm_setr_epi32(e3, e2, e1, e0)
#endif
4524
/* Set the two 64-bit halves from simde__m64 values in reversed order;
 * e1 becomes the low half.  Native path also requires MMX since the
 * arguments are __m64. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_setr_epi64 (simde__m64 e1, simde__m64 e0) {
  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
    return _mm_setr_epi64(e1, e0);
  #else
    return simde_mm_set_epi64(e0, e1);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_setr_epi64(e1, e0) (simde_mm_setr_epi64((e1), (e0)))
#endif
4537
/* Set two doubles in reversed (memory) order; e1 becomes the low lane. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_setr_pd (simde_float64 e1, simde_float64 e0) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_setr_pd(e1, e0);
  #else
    return simde_mm_set_pd(e0, e1);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_setr_pd(e1, e0) simde_mm_setr_pd(e1, e0)
#endif
4550
/* All-zero double vector.  Portable path reinterprets the all-zero
 * integer vector (bit pattern of +0.0 is all zeros). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_setzero_pd (void) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_setzero_pd();
  #else
    return simde_mm_castsi128_pd(simde_mm_setzero_si128());
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_setzero_pd() simde_mm_setzero_pd()
#endif
4563
4564 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
4565 HEDLEY_DIAGNOSTIC_PUSH
4566 SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
4567 #endif
4568
/* "Undefined" contents vector.  When the uninitialized-variable warning
 * can be suppressed, r_ is deliberately left uninitialized (cheapest);
 * otherwise it is zeroed so no path reads indeterminate memory without
 * the diagnostic push above this function being active. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_undefined_pd (void) {
  simde__m128d_private r_;

  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
    r_.n = _mm_undefined_pd();
  #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
    r_ = simde__m128d_to_private(simde_mm_setzero_pd());
  #endif

  return simde__m128d_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_undefined_pd() simde_mm_undefined_pd()
#endif
4585
/* Integer-vector counterpart of simde_mm_undefined_pd: returns a vector
 * whose contents the caller must not rely on; zero-filled only when the
 * uninitialized-read diagnostic cannot be suppressed. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_undefined_si128 (void) {
  simde__m128i_private r_;

  #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
    r_.n = _mm_undefined_si128();
  #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
    r_ = simde__m128i_to_private(simde_mm_setzero_si128());
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_undefined_si128() (simde_mm_undefined_si128())
#endif
4602
4603 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
4604 HEDLEY_DIAGNOSTIC_POP
4605 #endif
4606
/* SIMDe extension (x_ prefix): vector with every bit set, viewed as
 * doubles.  Reinterprets the all-ones float vector. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_x_mm_setone_pd (void) {
  return simde_mm_castps_pd(simde_x_mm_setone_ps());
}
4612
/* SIMDe extension (x_ prefix): integer vector with every bit set. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_setone_si128 (void) {
  return simde_mm_castps_si128(simde_x_mm_setone_ps());
}
4618
/* PSHUFD: each output int32 lane i selects input lane (imm8 >> (2*i)) & 3.
 * The function body is the portable fallback; when SSE2 or a generic
 * shuffle builtin is available the macros below shadow the function so
 * imm8 can remain a compile-time constant. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_shuffle_epi32 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  /* Two selector bits per output lane. */
  for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
    r_.i32[i] = a_.i32[(imm8 >> (i * 2)) & 3];
  }

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#  define simde_mm_shuffle_epi32(a, imm8) _mm_shuffle_epi32((a), (imm8))
#elif defined(SIMDE_SHUFFLE_VECTOR_)
#  define simde_mm_shuffle_epi32(a, imm8) (__extension__ ({ \
      const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
      simde__m128i_from_private((simde__m128i_private) { .i32 = \
        SIMDE_SHUFFLE_VECTOR_(32, 16, \
          (simde__tmp_a_).i32, \
          (simde__tmp_a_).i32, \
          ((imm8)     ) & 3, \
          ((imm8) >> 2) & 3, \
          ((imm8) >> 4) & 3, \
          ((imm8) >> 6) & 3) }); }))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_shuffle_epi32(a, imm8) simde_mm_shuffle_epi32(a, imm8)
#endif
4650
/* SHUFPD: lane 0 comes from a (low or high half per imm8 bit 0),
 * lane 1 comes from b (per imm8 bit 1).  Native macro is skipped on PGI,
 * which miscompiles _mm_shuffle_pd. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_shuffle_pd (simde__m128d a, simde__m128d b, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 3) {
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  r_.f64[0] = ((imm8 & 1) == 0) ? a_.f64[0] : a_.f64[1];
  r_.f64[1] = ((imm8 & 2) == 0) ? b_.f64[0] : b_.f64[1];

  return simde__m128d_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
#  define simde_mm_shuffle_pd(a, b, imm8) _mm_shuffle_pd((a), (b), (imm8))
#elif defined(SIMDE_SHUFFLE_VECTOR_)
#  define simde_mm_shuffle_pd(a, b, imm8) (__extension__ ({ \
      simde__m128d_from_private((simde__m128d_private) { .f64 = \
        SIMDE_SHUFFLE_VECTOR_(64, 16, \
          simde__m128d_to_private(a).f64, \
          simde__m128d_to_private(b).f64, \
          (((imm8)     ) & 1), \
          (((imm8) >> 1) & 1) + 2) }); }))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_shuffle_pd(a, b, imm8) simde_mm_shuffle_pd(a, b, imm8)
#endif
4679
/* PSHUFHW: low four int16 lanes are copied through unchanged; each of
 * the high four lanes selects from the high half of a using two imm8
 * bits (selector offset by 4 into the upper half). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_shufflehi_epi16 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i++) {
    r_.i16[i] = a_.i16[i];
  }
  for (size_t i = ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
    r_.i16[i] = a_.i16[((imm8 >> ((i - 4) * 2)) & 3) + 4];
  }

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#  define simde_mm_shufflehi_epi16(a, imm8) _mm_shufflehi_epi16((a), (imm8))
#elif defined(SIMDE_SHUFFLE_VECTOR_)
#  define simde_mm_shufflehi_epi16(a, imm8) (__extension__ ({ \
      const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
      simde__m128i_from_private((simde__m128i_private) { .i16 = \
        SIMDE_SHUFFLE_VECTOR_(16, 16, \
          (simde__tmp_a_).i16, \
          (simde__tmp_a_).i16, \
          0, 1, 2, 3, \
          (((imm8)     ) & 3) + 4, \
          (((imm8) >> 2) & 3) + 4, \
          (((imm8) >> 4) & 3) + 4, \
          (((imm8) >> 6) & 3) + 4) }); }))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_shufflehi_epi16(a, imm8) simde_mm_shufflehi_epi16(a, imm8)
#endif
4716
/* PSHUFLW: mirror of shufflehi — the low four int16 lanes are shuffled
 * by imm8 (two selector bits each) while the high four lanes are copied
 * through unchanged. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_shufflelo_epi16 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  for (size_t i = 0 ; i < ((sizeof(r_.i16) / sizeof(r_.i16[0])) / 2) ; i++) {
    r_.i16[i] = a_.i16[((imm8 >> (i * 2)) & 3)];
  }
  SIMDE_VECTORIZE
  for (size_t i = ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
    r_.i16[i] = a_.i16[i];
  }

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#  define simde_mm_shufflelo_epi16(a, imm8) _mm_shufflelo_epi16((a), (imm8))
#elif defined(SIMDE_SHUFFLE_VECTOR_)
#  define simde_mm_shufflelo_epi16(a, imm8) (__extension__ ({ \
      const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
      simde__m128i_from_private((simde__m128i_private) { .i16 = \
        SIMDE_SHUFFLE_VECTOR_(16, 16, \
          (simde__tmp_a_).i16, \
          (simde__tmp_a_).i16, \
          (((imm8)     ) & 3), \
          (((imm8) >> 2) & 3), \
          (((imm8) >> 4) & 3), \
          (((imm8) >> 6) & 3), \
          4, 5, 6, 7) }); }))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_shufflelo_epi16(a, imm8) simde_mm_shufflelo_epi16(a, imm8)
#endif
4753
/* PSLLW (vector count): shift every uint16 lane left by the low 64 bits
 * of count.  Counts greater than 15 zero the whole result, matching the
 * hardware; checking count_.u64[0] first also keeps the later shift
 * amount in range (no UB). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sll_epi16 (simde__m128i a, simde__m128i count) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_sll_epi16(a, count);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      count_ = simde__m128i_to_private(count);

    if (count_.u64[0] > 15)
      return simde_mm_setzero_si128();

    #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
      r_.u16 = (a_.u16 << count_.u64[0]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
        r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (a_.u16[i] << count_.u64[0]));
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_sll_epi16(a, count) simde_mm_sll_epi16((a), (count))
#endif
4783
/* PSLLD (vector count): shift every uint32 lane left by the low 64 bits
 * of count; counts above 31 produce an all-zero result. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sll_epi32 (simde__m128i a, simde__m128i count) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_sll_epi32(a, count);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      count_ = simde__m128i_to_private(count);

    if (count_.u64[0] > 31)
      return simde_mm_setzero_si128();

    #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
      r_.u32 = (a_.u32 << count_.u64[0]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
        r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (a_.u32[i] << count_.u64[0]));
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_sll_epi32(a, count) (simde_mm_sll_epi32(a, (count)))
#endif
4813
/* PSLLQ (vector count): shift both uint64 lanes left; counts above 63
 * zero the result.  SIMDE_VECTORIZE is suppressed when GCC bug 94488
 * (bad vectorization of 64-bit shifts on some targets) applies. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sll_epi64 (simde__m128i a, simde__m128i count) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_sll_epi64(a, count);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      count_ = simde__m128i_to_private(count);

    if (count_.u64[0] > 63)
      return simde_mm_setzero_si128();

    /* count is known to fit in 6 bits here. */
    const int_fast16_t s = HEDLEY_STATIC_CAST(int_fast16_t, count_.u64[0]);
    #if !defined(SIMDE_BUG_GCC_94488)
      SIMDE_VECTORIZE
    #endif
    for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
      r_.u64[i] = a_.u64[i] << s;
    }

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_sll_epi64(a, count) (simde_mm_sll_epi64(a, (count)))
#endif
4842
/* SQRTPD: per-lane double-precision square root.  Falls back to the
 * scalar simde_math_sqrt wrapper; HEDLEY_UNREACHABLE marks builds where
 * no sqrt implementation was detected at configure time. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_sqrt_pd (simde__m128d a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_sqrt_pd(a);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vsqrtq_f64(a_.neon_f64);
    #elif defined(simde_math_sqrt)
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
        r_.f64[i] = simde_math_sqrt(a_.f64[i]);
      }
    #else
      HEDLEY_UNREACHABLE();
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_sqrt_pd(a) simde_mm_sqrt_pd(a)
#endif
4870
/* SQRTSD: lane 0 = sqrt(b[0]), lane 1 copied from a.  When the compiler
 * is assumed to auto-vectorize, reuse the full-vector sqrt + move_sd
 * combination instead of the scalar path. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_sqrt_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_sqrt_sd(a, b);
  #elif defined(SIMDE_ASSUME_VECTORIZATION)
    return simde_mm_move_sd(a, simde_mm_sqrt_pd(b));
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(simde_math_sqrt)
      r_.f64[0] = simde_math_sqrt(b_.f64[0]);
      r_.f64[1] = a_.f64[1];
    #else
      HEDLEY_UNREACHABLE();
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_sqrt_sd(a, b) simde_mm_sqrt_sd(a, b)
#endif
4897
4898 SIMDE_FUNCTION_ATTRIBUTES
4899 simde__m128i
simde_mm_srl_epi16(simde__m128i a,simde__m128i count)4900 simde_mm_srl_epi16 (simde__m128i a, simde__m128i count) {
4901 #if defined(SIMDE_X86_SSE2_NATIVE)
4902 return _mm_srl_epi16(a, count);
4903 #else
4904 simde__m128i_private
4905 r_,
4906 a_ = simde__m128i_to_private(a),
4907 count_ = simde__m128i_to_private(count);
4908
4909 const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 16 ? 16 : count_.i64[0]));
4910
4911 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4912 r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
4913 #else
4914 SIMDE_VECTORIZE
4915 for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
4916 r_.u16[i] = a_.u16[i] >> cnt;
4917 }
4918 #endif
4919
4920 return simde__m128i_from_private(r_);
4921 #endif
4922 }
4923 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4924 #define _mm_srl_epi16(a, count) (simde_mm_srl_epi16(a, (count)))
4925 #endif
4926
4927 SIMDE_FUNCTION_ATTRIBUTES
4928 simde__m128i
simde_mm_srl_epi32(simde__m128i a,simde__m128i count)4929 simde_mm_srl_epi32 (simde__m128i a, simde__m128i count) {
4930 #if defined(SIMDE_X86_SSE2_NATIVE)
4931 return _mm_srl_epi32(a, count);
4932 #else
4933 simde__m128i_private
4934 r_,
4935 a_ = simde__m128i_to_private(a),
4936 count_ = simde__m128i_to_private(count);
4937
4938 const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 32 ? 32 : count_.i64[0]));
4939
4940 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4941 r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt)));
4942 #else
4943 SIMDE_VECTORIZE
4944 for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
4945 r_.u32[i] = a_.u32[i] >> cnt;
4946 }
4947 #endif
4948
4949 return simde__m128i_from_private(r_);
4950 #endif
4951 }
4952 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4953 # define _mm_srl_epi32(a, count) (simde_mm_srl_epi32(a, (count)))
4954 #endif
4955
4956 SIMDE_FUNCTION_ATTRIBUTES
4957 simde__m128i
simde_mm_srl_epi64(simde__m128i a,simde__m128i count)4958 simde_mm_srl_epi64 (simde__m128i a, simde__m128i count) {
4959 #if defined(SIMDE_X86_SSE2_NATIVE)
4960 return _mm_srl_epi64(a, count);
4961 #else
4962 simde__m128i_private
4963 r_,
4964 a_ = simde__m128i_to_private(a),
4965 count_ = simde__m128i_to_private(count);
4966
4967 const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 64 ? 64 : count_.i64[0]));
4968
4969 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4970 r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, -cnt)));
4971 #else
4972 #if !defined(SIMDE_BUG_GCC_94488)
4973 SIMDE_VECTORIZE
4974 #endif
4975 for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
4976 r_.u64[i] = a_.u64[i] >> cnt;
4977 }
4978 #endif
4979
4980 return simde__m128i_from_private(r_);
4981 #endif
4982 }
4983 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4984 # define _mm_srl_epi64(a, count) (simde_mm_srl_epi64(a, (count)))
4985 #endif
4986
/* PSRAW (immediate): arithmetic right shift of every int16 lane.
 * Shift amounts >= 16 are clamped to 15, which fills each lane with its
 * sign bit — the hardware behavior. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srai_epi16 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  /* MSVC requires a range of (0, 255). */
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  /* Any imm8 with bits above the low 4 set clamps to 15. */
  const int cnt = (imm8 & ~15) ? 15 : imm8;

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) {
      r_.i16[i] = a_.i16[i] >> cnt;
    }
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#define simde_mm_srai_epi16(a, imm8) _mm_srai_epi16((a), (imm8))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#define _mm_srai_epi16(a, imm8) simde_mm_srai_epi16(a, imm8)
#endif
5015
/* PSRAD (immediate): arithmetic right shift of every int32 lane; shift
 * amounts >= 32 are clamped to 31 (sign-fill), matching the hardware. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srai_epi32 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  /* MSVC requires a range of (0, 255). */
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  /* Any imm8 with bits above the low 5 set clamps to 31. */
  const int cnt = (imm8 & ~31) ? 31 : imm8;

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(-cnt));
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i32[0])) ; i++) {
      r_.i32[i] = a_.i32[i] >> cnt;
    }
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#define simde_mm_srai_epi32(a, imm8) _mm_srai_epi32((a), (imm8))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#define _mm_srai_epi32(a, imm8) simde_mm_srai_epi32(a, imm8)
#endif
5044
5045 SIMDE_FUNCTION_ATTRIBUTES
5046 simde__m128i
simde_mm_sra_epi16(simde__m128i a,simde__m128i count)5047 simde_mm_sra_epi16 (simde__m128i a, simde__m128i count) {
5048 #if defined(SIMDE_X86_SSE2_NATIVE)
5049 return _mm_sra_epi16(a, count);
5050 #else
5051 simde__m128i_private
5052 r_,
5053 a_ = simde__m128i_to_private(a),
5054 count_ = simde__m128i_to_private(count);
5055
5056 const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 15 ? 15 : count_.i64[0]));
5057
5058 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5059 r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
5060 #else
5061 SIMDE_VECTORIZE
5062 for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
5063 r_.i16[i] = a_.i16[i] >> cnt;
5064 }
5065 #endif
5066
5067 return simde__m128i_from_private(r_);
5068 #endif
5069 }
5070 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5071 # define _mm_sra_epi16(a, count) (simde_mm_sra_epi16(a, count))
5072 #endif
5073
/* PSRAD (vector count): arithmetic right shift of every int32 lane by
 * the low 64 bits of count; counts above 31 clamp to 31 (sign-fill).
 * The native path is skipped where the GCC _mm_sra_epi32 bug applies.
 * Note the count is read via the unsigned u64 view so huge counts clamp
 * instead of going negative. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sra_epi32 (simde__m128i a, simde__m128i count) {
  #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_SRA_EPI32)
    return _mm_sra_epi32(a, count);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      count_ = simde__m128i_to_private(count);

    const int cnt = count_.u64[0] > 31 ? 31 : HEDLEY_STATIC_CAST(int, count_.u64[0]);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt)));
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        r_.i32[i] = a_.i32[i] >> cnt;
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_sra_epi32(a, count) (simde_mm_sra_epi32(a, (count)))
#endif
5102
/* PSLLW (immediate): shift every int16 lane left by imm8; shifts > 15
 * zero the result.  The function body is the portable fallback; the
 * macros below shadow it when a native or NEON/AltiVec form can consume
 * imm8 as a compile-time constant. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_slli_epi16 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  if (HEDLEY_UNLIKELY((imm8 > 15))) {
    return simde_mm_setzero_si128();
  }

  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
    r_.i16 = a_.i16 << (imm8 & 0xff);
  #else
    /* s is 0 when imm8 is out of range for the lane width (already
     * handled above); computed defensively for the scalar path. */
    const int s = (imm8 > HEDLEY_STATIC_CAST(int, sizeof(r_.i16[0]) * CHAR_BIT) - 1) ? 0 : imm8;
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
      r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << s);
    }
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#  define simde_mm_slli_epi16(a, imm8) _mm_slli_epi16(a, imm8)
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__)
#  define simde_mm_slli_epi16(a, imm8) \
     simde__m128i_from_neon_u16(vshlq_n_u16(simde__m128i_to_neon_u16(a), (imm8)))
   // The above is allowed by gcc/g++ 9 with -march=armv8-a, might work on A32V8 and elsewhere but needs testing
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) // clang can't handle the potential out of range use of imm8 even though that is handled
#  define simde_mm_slli_epi16(a, imm8) \
     ({ \
        simde__m128i ret; \
        if ((imm8) <= 0) { \
          ret = a; \
        } else if ((imm8) > 15) { \
          ret = simde_mm_setzero_si128(); \
        } else { \
          ret = simde__m128i_from_neon_i16( \
            vshlq_n_s16(simde__m128i_to_neon_i16(a), (imm8))); \
        } \
        ret; \
     })
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
#define simde_mm_slli_epi16(a, imm8) \
  ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sl(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8)))))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_slli_epi16(a, imm8) simde_mm_slli_epi16(a, imm8)
#endif
5154
/* PSLLD (immediate): shift every int32 lane left by imm8; shifts > 31
 * zero the result.  Macros below shadow the function for native,
 * NEON, and AltiVec builds so imm8 stays a compile-time constant. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_slli_epi32 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  if (HEDLEY_UNLIKELY((imm8 > 31))) {
    return simde_mm_setzero_si128();
  }
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
    r_.i32 = a_.i32 << imm8;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.i32[i] = a_.i32[i] << (imm8 & 0xff);
    }
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#  define simde_mm_slli_epi32(a, imm8) _mm_slli_epi32(a, imm8)
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__)
#  define simde_mm_slli_epi32(a, imm8) \
     simde__m128i_from_neon_u32(vshlq_n_u32(simde__m128i_to_neon_u32(a), (imm8)))
   // The above is allowed by gcc/g++ 9 with -march=armv8-a, might work on A32V8 and elsewhere but needs testing
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) // clang can't handle the potential out of range use of imm8 even though that is handled
#  define simde_mm_slli_epi32(a, imm8) \
     ({ \
        simde__m128i ret; \
        if ((imm8) <= 0) { \
          ret = a; \
        } else if ((imm8) > 31) { \
          ret = simde_mm_setzero_si128(); \
        } else { \
          ret = simde__m128i_from_neon_i32( \
            vshlq_n_s32(simde__m128i_to_neon_i32(a), (imm8))); \
        } \
        ret; \
     })
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
#define simde_mm_slli_epi32(a, imm8) \
  ({ \
     simde__m128i ret; \
     if ((imm8) <= 0) { \
       ret = a; \
     } else if ((imm8) > 31) { \
       ret = simde_mm_setzero_si128(); \
     } else { \
       ret = simde__m128i_from_altivec_i32( \
         vec_sl(simde__m128i_to_altivec_i32(a), \
           vec_splats(HEDLEY_STATIC_CAST(unsigned int, imm8)))); \
     } \
     ret; \
  })
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_slli_epi32(a, imm8) simde_mm_slli_epi32(a, imm8)
#endif
5216
/* PSLLQ (immediate): shift both int64 lanes left by imm8; shifts > 63
 * zero the result.  Macros below shadow the function for native and
 * NEON builds. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_slli_epi64 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  if (HEDLEY_UNLIKELY((imm8 > 63))) {
    return simde_mm_setzero_si128();
  }
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
    r_.i64 = a_.i64 << imm8;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
      r_.i64[i] = a_.i64[i] << (imm8 & 0xff);
    }
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#  define simde_mm_slli_epi64(a, imm8) _mm_slli_epi64(a, imm8)
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__)
#  define simde_mm_slli_epi64(a, imm8) \
     simde__m128i_from_neon_u64(vshlq_n_u64(simde__m128i_to_neon_u64(a), (imm8)))
   // The above is allowed by gcc/g++ 9 with -march=armv8-a, might work on A32V8 and elsewhere but needs testing
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) // clang can't handle the potential out of range use of imm8 even though that is handled
#  define simde_mm_slli_epi64(a, imm8) \
     ({ \
        simde__m128i ret; \
        if ((imm8) <= 0) { \
          ret = a; \
        } else if ((imm8) > 63) { \
          ret = simde_mm_setzero_si128(); \
        } else { \
          ret = simde__m128i_from_neon_i64( \
            vshlq_n_s64(simde__m128i_to_neon_i64(a), (imm8))); \
        } \
        ret; \
     })
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_slli_epi64(a, imm8) simde_mm_slli_epi64(a, imm8)
#endif
5263
/* PSRLW (immediate): logical right shift of every uint16 lane by imm8;
 * shifts > 15 zero the result.  Macros below shadow the function for
 * native, NEON, and AltiVec builds. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srli_epi16 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  if (HEDLEY_UNLIKELY((imm8 > 15))) {
    return simde_mm_setzero_si128();
  }
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
    r_.u16 = a_.u16 >> imm8;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
      r_.u16[i] = a_.u16[i] >> (imm8 & 0xff);
    }
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#  define simde_mm_srli_epi16(a, imm8) _mm_srli_epi16(a, imm8)
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__)
#  define simde_mm_srli_epi16(a, imm8) \
     simde__m128i_from_neon_u16(vshrq_n_u16(simde__m128i_to_neon_u16(a), imm8))
   // The above is allowed by gcc/g++ 9 with -march=armv8-a, might work on A32V8 and elsewhere but needs testing
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) // clang can't handle the potential out of range use of imm8 even though that is handled
#  define simde_mm_srli_epi16(a, imm8) \
     ({ \
        simde__m128i ret; \
        if ((imm8) <= 0) { \
          ret = a; \
        } else if ((imm8) > 15) { \
          ret = simde_mm_setzero_si128(); \
        } else { \
          ret = simde__m128i_from_neon_u16( \
            vshrq_n_u16(simde__m128i_to_neon_u16(a), (imm8))); \
        } \
        ret; \
     })
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
#define simde_mm_srli_epi16(a, imm8) \
  ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sr(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8)))))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_srli_epi16(a, imm8) simde_mm_srli_epi16(a, imm8)
#endif
5313
/* PSRLD (immediate): logical right shift of every uint32 lane by imm8;
 * shifts > 31 zero the result.  Macros below shadow the function for
 * native, NEON, and AltiVec builds. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srli_epi32 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  if (HEDLEY_UNLIKELY((imm8 > 31))) {
    return simde_mm_setzero_si128();
  }
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
    r_.u32 = a_.u32 >> (imm8 & 0xff);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.u32[i] = a_.u32[i] >> (imm8 & 0xff);
    }
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#  define simde_mm_srli_epi32(a, imm8) _mm_srli_epi32(a, imm8)
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__)
#  define simde_mm_srli_epi32(a, imm8) \
     simde__m128i_from_neon_u32(vshrq_n_u32(simde__m128i_to_neon_u32(a), imm8))
   // The above is allowed by gcc/g++ 9 with -march=armv8-a, might work on A32V8 and elsewhere but needs testing
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) // clang can't handle the potential out of range use of imm8 even though that is handled
#  define simde_mm_srli_epi32(a, imm8) \
     ({ \
        simde__m128i ret; \
        if ((imm8) <= 0) { \
          ret = a; \
        } else if ((imm8) > 31) { \
          ret = simde_mm_setzero_si128(); \
        } else { \
          ret = simde__m128i_from_neon_u32( \
            vshrq_n_u32(simde__m128i_to_neon_u32(a), (imm8))); \
        } \
        ret; \
     })
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
#  define simde_mm_srli_epi32(a, imm8) \
     ({ \
        simde__m128i ret; \
        if ((imm8) <= 0) { \
          ret = a; \
        } else if ((imm8) > 31) { \
          ret = simde_mm_setzero_si128(); \
        } else { \
          ret = simde__m128i_from_altivec_i32( \
            vec_sr(simde__m128i_to_altivec_i32(a), \
              vec_splats(HEDLEY_STATIC_CAST(unsigned int, imm8)))); \
        } \
        ret; \
     })
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_srli_epi32(a, imm8) simde_mm_srli_epi32(a, imm8)
#endif
5375
/* PSRLQ (immediate): logical right shift of both uint64 lanes by imm8;
 * any imm8 outside [0, 63] zeroes the result ((imm8 & 63) != imm8
 * catches both > 63 and negative values).  The NEON A64 macro special-
 * cases imm8 == 0 because vshrq_n_u64 requires a shift of 1..64. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srli_epi64 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  if (HEDLEY_UNLIKELY((imm8 & 63) != imm8))
    return simde_mm_setzero_si128();

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(-imm8));
  #else
    /* GCC bug 94488: vectorized 64-bit shifts are broken on some targets. */
    #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_94488)
      r_.u64 = a_.u64 >> imm8;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
        r_.u64[i] = a_.u64[i] >> imm8;
      }
    #endif
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#  define simde_mm_srli_epi64(a, imm8) _mm_srli_epi64(a, imm8)
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__)
#  define simde_mm_srli_epi64(a, imm8) \
     ((imm8 == 0) ? (a) : (simde__m128i_from_neon_u64(vshrq_n_u64(simde__m128i_to_neon_u64(a), imm8))))
   // The above is allowed by gcc/g++ 9 with -march=armv8-a, might work on A32V8 and elsewhere but needs testing
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) // clang can't handle the potential out of range use of imm8 even though that is handled
#  define simde_mm_srli_epi64(a, imm8) \
     ({ \
        simde__m128i ret; \
        if ((imm8) <= 0) { \
          ret = a; \
        } else if ((imm8) > 63) { \
          ret = simde_mm_setzero_si128(); \
        } else { \
          ret = simde__m128i_from_neon_u64( \
            vshrq_n_u64(simde__m128i_to_neon_u64(a), (imm8))); \
        } \
        ret; \
     })
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_srli_epi64(a, imm8) simde_mm_srli_epi64(a, imm8)
#endif
5426
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_store_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) {
  /* Store both doubles of 'a' to 16-byte-aligned memory (SSE2 _mm_store_pd). */
  simde_assert_aligned(16, mem_addr);

#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_store_pd(mem_addr, a);
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64);
#else
  /* memcpy avoids strict-aliasing problems with the vector type. */
  simde_memcpy(mem_addr, &a, sizeof(a));
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_store_pd(mem_addr, a) simde_mm_store_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
5443
5444 SIMDE_FUNCTION_ATTRIBUTES
5445 void
simde_mm_store1_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM (2)],simde__m128d a)5446 simde_mm_store1_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) {
5447 simde_assert_aligned(16, mem_addr);
5448
5449 #if defined(SIMDE_X86_SSE2_NATIVE)
5450 _mm_store1_pd(mem_addr, a);
5451 #else
5452 simde__m128d_private a_ = simde__m128d_to_private(a);
5453
5454 mem_addr[0] = a_.f64[0];
5455 mem_addr[1] = a_.f64[0];
5456 #endif
5457 }
5458 #define simde_mm_store_pd1(mem_addr, a) simde_mm_store1_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
5459 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5460 # define _mm_store1_pd(mem_addr, a) simde_mm_store1_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
5461 # define _mm_store_pd1(mem_addr, a) simde_mm_store_pd1(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
5462 #endif
5463
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_store_sd (simde_float64* mem_addr, simde__m128d a) {
  /* Store the low double of 'a' to (possibly unaligned) memory
   * (SSE2 _mm_store_sd). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_store_sd(mem_addr, a);
#else
  simde__m128d_private a_ = simde__m128d_to_private(a);

#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  /* Go through a scalar temporary; memcpy side-steps alignment and
   * aliasing concerns. */
  simde_float64 v = vgetq_lane_f64(a_.neon_f64, 0);
  simde_memcpy(mem_addr, &v, sizeof(simde_float64));
#else
  simde_float64 v = a_.f64[0];
  simde_memcpy(mem_addr, &v, sizeof(simde_float64));
#endif
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_store_sd(mem_addr, a) simde_mm_store_sd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
5484
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_store_si128 (simde__m128i* mem_addr, simde__m128i a) {
  /* Aligned 128-bit integer store (SSE2 _mm_store_si128). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_store_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
#else
  simde__m128i_private a_ = simde__m128i_to_private(a);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* Element type is irrelevant for a full-vector store; i32 view is used. */
  vst1q_s32(HEDLEY_REINTERPRET_CAST(int32_t*, mem_addr), a_.neon_i32);
#else
  /* SIMDE_ASSUME_ALIGNED lets the compiler exploit the 16-byte contract. */
  simde_memcpy(SIMDE_ASSUME_ALIGNED(16, mem_addr), &a_, sizeof(a_));
#endif
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_store_si128(mem_addr, a) simde_mm_store_si128(mem_addr, a)
#endif
5503
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storeh_pd (simde_float64* mem_addr, simde__m128d a) {
  /* Store the HIGH double of 'a' to memory (SSE2 _mm_storeh_pd). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_storeh_pd(mem_addr, a);
#else
  simde__m128d_private a_ = simde__m128d_to_private(a);

#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  *mem_addr = vgetq_lane_f64(a_.neon_f64, 1);
#else
  *mem_addr = a_.f64[1];
#endif
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_storeh_pd(mem_addr, a) simde_mm_storeh_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
5522
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storel_epi64 (simde__m128i* mem_addr, simde__m128i a) {
  /* Store the low 64 bits of 'a' to mem_addr (SSE2 _mm_storel_epi64). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_storel_epi64(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
#else
  simde__m128i_private a_ = simde__m128i_to_private(a);
  int64_t tmp;

  /* memcpy to prevent aliasing, tmp because we can't take the
   * address of a vector element. */

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  tmp = vgetq_lane_s64(a_.neon_i64, 0);
#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
  #if defined(SIMDE_BUG_GCC_95227)
    /* The cast works around SIMDE_BUG_GCC_95227. */
    (void) a_;
  #endif
  tmp = vec_extract(a_.altivec_i64, 0);
#else
  tmp = a_.i64[0];
#endif

  simde_memcpy(mem_addr, &tmp, sizeof(tmp));
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_storel_epi64(mem_addr, a) simde_mm_storel_epi64(mem_addr, a)
#endif
5552
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storel_pd (simde_float64* mem_addr, simde__m128d a) {
  /* Store the LOW double of 'a' to memory (SSE2 _mm_storel_pd). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_storel_pd(mem_addr, a);
#else
  simde__m128d_private a_ = simde__m128d_to_private(a);

  *mem_addr = a_.f64[0];
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_storel_pd(mem_addr, a) simde_mm_storel_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
5567
5568 SIMDE_FUNCTION_ATTRIBUTES
5569 void
simde_mm_storer_pd(simde_float64 mem_addr[2],simde__m128d a)5570 simde_mm_storer_pd (simde_float64 mem_addr[2], simde__m128d a) {
5571 simde_assert_aligned(16, mem_addr);
5572
5573 #if defined(SIMDE_X86_SSE2_NATIVE)
5574 _mm_storer_pd(mem_addr, a);
5575 #else
5576 simde__m128d_private a_ = simde__m128d_to_private(a);
5577
5578 mem_addr[0] = a_.f64[1];
5579 mem_addr[1] = a_.f64[0];
5580 #endif
5581 }
5582 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5583 # define _mm_storer_pd(mem_addr, a) simde_mm_storer_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
5584 #endif
5585
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storeu_pd (simde_float64* mem_addr, simde__m128d a) {
  /* Unaligned store of both doubles (SSE2 _mm_storeu_pd); memcpy makes
   * the unaligned access well-defined. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_storeu_pd(mem_addr, a);
#else
  simde_memcpy(mem_addr, &a, sizeof(a));
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_storeu_pd(mem_addr, a) simde_mm_storeu_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
5598
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storeu_si128 (simde__m128i* mem_addr, simde__m128i a) {
  /* Unaligned 128-bit integer store (SSE2 _mm_storeu_si128). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_storeu_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
#else
  simde__m128i_private a_ = simde__m128i_to_private(a);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  vst1q_s32(HEDLEY_REINTERPRET_CAST(int32_t*, mem_addr), a_.neon_i32);
#else
  simde_memcpy(mem_addr, &a_, sizeof(a_));
#endif
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_storeu_si128(mem_addr, a) simde_mm_storeu_si128(mem_addr, a)
#endif
5617
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_stream_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) {
  /* Non-temporal aligned store (SSE2 _mm_stream_pd).  The portable
   * fallback is a plain store — the cache-bypass hint is dropped. */
  simde_assert_aligned(16, mem_addr);

#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_stream_pd(mem_addr, a);
#else
  simde_memcpy(mem_addr, &a, sizeof(a));
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_stream_pd(mem_addr, a) simde_mm_stream_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
5632
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_stream_si128 (simde__m128i* mem_addr, simde__m128i a) {
  /* Non-temporal aligned 128-bit store (SSE2 _mm_stream_si128); the
   * non-native paths perform an ordinary store. */
  simde_assert_aligned(16, mem_addr);

#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_stream_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  vst1q_s64(HEDLEY_REINTERPRET_CAST(int64_t *, mem_addr), simde__m128i_to_neon_i64(a));
#else
  simde_memcpy(mem_addr, &a, sizeof(a));
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_stream_si128(mem_addr, a) simde_mm_stream_si128(mem_addr, a)
#endif
5649
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_stream_si32 (int32_t* mem_addr, int32_t a) {
  /* Non-temporal 32-bit store (SSE2 _mm_stream_si32); falls back to a
   * plain assignment when no native intrinsic is available. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_stream_si32(mem_addr, a);
#else
  *mem_addr = a;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_stream_si32(mem_addr, a) simde_mm_stream_si32(mem_addr, a)
#endif
5662
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_stream_si64 (int64_t* mem_addr, int64_t a) {
  /* Emulated _mm_stream_si64: always a plain 64-bit store here — no
   * native non-temporal path is used in this implementation. */
  *mem_addr = a;
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_stream_si64(mem_addr, a) simde_mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(int64_t*, __int64*, mem_addr), a)
#endif
5671
5672 SIMDE_FUNCTION_ATTRIBUTES
5673 simde__m128i
simde_mm_sub_epi8(simde__m128i a,simde__m128i b)5674 simde_mm_sub_epi8 (simde__m128i a, simde__m128i b) {
5675 #if defined(SIMDE_X86_SSE2_NATIVE)
5676 return _mm_sub_epi8(a, b);
5677 #else
5678 simde__m128i_private
5679 r_,
5680 a_ = simde__m128i_to_private(a),
5681 b_ = simde__m128i_to_private(b);
5682
5683 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5684 r_.neon_i8 = vsubq_s8(a_.neon_i8, b_.neon_i8);
5685 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
5686 r_.i8 = a_.i8 - b_.i8;
5687 #else
5688 SIMDE_VECTORIZE
5689 for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
5690 r_.i8[i] = a_.i8[i] - b_.i8[i];
5691 }
5692 #endif
5693
5694 return simde__m128i_from_private(r_);
5695 #endif
5696 }
5697 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5698 # define _mm_sub_epi8(a, b) simde_mm_sub_epi8(a, b)
5699 #endif
5700
5701 SIMDE_FUNCTION_ATTRIBUTES
5702 simde__m128i
simde_mm_sub_epi16(simde__m128i a,simde__m128i b)5703 simde_mm_sub_epi16 (simde__m128i a, simde__m128i b) {
5704 #if defined(SIMDE_X86_SSE2_NATIVE)
5705 return _mm_sub_epi16(a, b);
5706 #else
5707 simde__m128i_private
5708 r_,
5709 a_ = simde__m128i_to_private(a),
5710 b_ = simde__m128i_to_private(b);
5711
5712 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5713 r_.neon_i16 = vsubq_s16(a_.neon_i16, b_.neon_i16);
5714 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
5715 r_.i16 = a_.i16 - b_.i16;
5716 #else
5717 SIMDE_VECTORIZE
5718 for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
5719 r_.i16[i] = a_.i16[i] - b_.i16[i];
5720 }
5721 #endif
5722
5723 return simde__m128i_from_private(r_);
5724 #endif
5725 }
5726 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5727 # define _mm_sub_epi16(a, b) simde_mm_sub_epi16(a, b)
5728 #endif
5729
5730 SIMDE_FUNCTION_ATTRIBUTES
5731 simde__m128i
simde_mm_sub_epi32(simde__m128i a,simde__m128i b)5732 simde_mm_sub_epi32 (simde__m128i a, simde__m128i b) {
5733 #if defined(SIMDE_X86_SSE2_NATIVE)
5734 return _mm_sub_epi32(a, b);
5735 #else
5736 simde__m128i_private
5737 r_,
5738 a_ = simde__m128i_to_private(a),
5739 b_ = simde__m128i_to_private(b);
5740
5741 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5742 r_.neon_i32 = vsubq_s32(a_.neon_i32, b_.neon_i32);
5743 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
5744 r_.i32 = a_.i32 - b_.i32;
5745 #else
5746 SIMDE_VECTORIZE
5747 for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
5748 r_.i32[i] = a_.i32[i] - b_.i32[i];
5749 }
5750 #endif
5751
5752 return simde__m128i_from_private(r_);
5753 #endif
5754 }
5755 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5756 # define _mm_sub_epi32(a, b) simde_mm_sub_epi32(a, b)
5757 #endif
5758
5759 SIMDE_FUNCTION_ATTRIBUTES
5760 simde__m128i
simde_mm_sub_epi64(simde__m128i a,simde__m128i b)5761 simde_mm_sub_epi64 (simde__m128i a, simde__m128i b) {
5762 #if defined(SIMDE_X86_SSE2_NATIVE)
5763 return _mm_sub_epi64(a, b);
5764 #else
5765 simde__m128i_private
5766 r_,
5767 a_ = simde__m128i_to_private(a),
5768 b_ = simde__m128i_to_private(b);
5769
5770 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5771 r_.neon_i64 = vsubq_s64(a_.neon_i64, b_.neon_i64);
5772 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
5773 r_.i64 = a_.i64 - b_.i64;
5774 #else
5775 SIMDE_VECTORIZE
5776 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
5777 r_.i64[i] = a_.i64[i] - b_.i64[i];
5778 }
5779 #endif
5780
5781 return simde__m128i_from_private(r_);
5782 #endif
5783 }
5784 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5785 # define _mm_sub_epi64(a, b) simde_mm_sub_epi64(a, b)
5786 #endif
5787
5788 SIMDE_FUNCTION_ATTRIBUTES
5789 simde__m128i
simde_x_mm_sub_epu32(simde__m128i a,simde__m128i b)5790 simde_x_mm_sub_epu32 (simde__m128i a, simde__m128i b) {
5791 simde__m128i_private
5792 r_,
5793 a_ = simde__m128i_to_private(a),
5794 b_ = simde__m128i_to_private(b);
5795
5796 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
5797 r_.u32 = a_.u32 - b_.u32;
5798 #else
5799 SIMDE_VECTORIZE
5800 for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
5801 r_.u32[i] = a_.u32[i] - b_.u32[i];
5802 }
5803 #endif
5804
5805 return simde__m128i_from_private(r_);
5806 }
5807
5808 SIMDE_FUNCTION_ATTRIBUTES
5809 simde__m128d
simde_mm_sub_pd(simde__m128d a,simde__m128d b)5810 simde_mm_sub_pd (simde__m128d a, simde__m128d b) {
5811 #if defined(SIMDE_X86_SSE2_NATIVE)
5812 return _mm_sub_pd(a, b);
5813 #else
5814 simde__m128d_private
5815 r_,
5816 a_ = simde__m128d_to_private(a),
5817 b_ = simde__m128d_to_private(b);
5818
5819 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
5820 r_.f64 = a_.f64 - b_.f64;
5821 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5822 r_.wasm_v128 = wasm_f64x2_sub(a_.wasm_v128, b_.wasm_v128);
5823 #else
5824 SIMDE_VECTORIZE
5825 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
5826 r_.f64[i] = a_.f64[i] - b_.f64[i];
5827 }
5828 #endif
5829
5830 return simde__m128d_from_private(r_);
5831 #endif
5832 }
5833 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5834 # define _mm_sub_pd(a, b) simde_mm_sub_pd(a, b)
5835 #endif
5836
5837 SIMDE_FUNCTION_ATTRIBUTES
5838 simde__m128d
simde_mm_sub_sd(simde__m128d a,simde__m128d b)5839 simde_mm_sub_sd (simde__m128d a, simde__m128d b) {
5840 #if defined(SIMDE_X86_SSE2_NATIVE)
5841 return _mm_sub_sd(a, b);
5842 #elif defined(SIMDE_ASSUME_VECTORIZATION)
5843 return simde_mm_move_sd(a, simde_mm_sub_pd(a, b));
5844 #else
5845 simde__m128d_private
5846 r_,
5847 a_ = simde__m128d_to_private(a),
5848 b_ = simde__m128d_to_private(b);
5849
5850 r_.f64[0] = a_.f64[0] - b_.f64[0];
5851 r_.f64[1] = a_.f64[1];
5852
5853 return simde__m128d_from_private(r_);
5854 #endif
5855 }
5856 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5857 # define _mm_sub_sd(a, b) simde_mm_sub_sd(a, b)
5858 #endif
5859
5860 SIMDE_FUNCTION_ATTRIBUTES
5861 simde__m64
simde_mm_sub_si64(simde__m64 a,simde__m64 b)5862 simde_mm_sub_si64 (simde__m64 a, simde__m64 b) {
5863 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
5864 return _mm_sub_si64(a, b);
5865 #else
5866 simde__m64_private
5867 r_,
5868 a_ = simde__m64_to_private(a),
5869 b_ = simde__m64_to_private(b);
5870
5871 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
5872 r_.i64 = a_.i64 - b_.i64;
5873 #else
5874 r_.i64[0] = a_.i64[0] - b_.i64[0];
5875 #endif
5876
5877 return simde__m64_from_private(r_);
5878 #endif
5879 }
5880 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5881 # define _mm_sub_si64(a, b) simde_mm_sub_si64(a, b)
5882 #endif
5883
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_subs_epi8 (simde__m128i a, simde__m128i b) {
  /* Saturating signed 8-bit subtraction (SSE2 _mm_subs_epi8). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_subs_epi8(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_i8 = vqsubq_s8(a_.neon_i8, b_.neon_i8);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i8[0])) ; i++) {
    /* a - b underflows iff b > 0 && a < INT8_MIN + b, overflows iff
     * b < 0 && a > INT8_MAX + b; integer promotion keeps the checks
     * themselves free of overflow. */
    if (((b_.i8[i]) > 0 && (a_.i8[i]) < INT8_MIN + (b_.i8[i]))) {
      r_.i8[i] = INT8_MIN;
    } else if ((b_.i8[i]) < 0 && (a_.i8[i]) > INT8_MAX + (b_.i8[i])) {
      r_.i8[i] = INT8_MAX;
    } else {
      r_.i8[i] = (a_.i8[i]) - (b_.i8[i]);
    }
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_subs_epi8(a, b) simde_mm_subs_epi8(a, b)
#endif
5916
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_subs_epi16 (simde__m128i a, simde__m128i b) {
  /* Saturating signed 16-bit subtraction (SSE2 _mm_subs_epi16). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_subs_epi16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_i16 = vqsubq_s16(a_.neon_i16, b_.neon_i16);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) {
    /* Same saturation predicates as the epi8 variant, widened via
     * integer promotion so the comparisons cannot overflow. */
    if (((b_.i16[i]) > 0 && (a_.i16[i]) < INT16_MIN + (b_.i16[i]))) {
      r_.i16[i] = INT16_MIN;
    } else if ((b_.i16[i]) < 0 && (a_.i16[i]) > INT16_MAX + (b_.i16[i])) {
      r_.i16[i] = INT16_MAX;
    } else {
      r_.i16[i] = (a_.i16[i]) - (b_.i16[i]);
    }
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_subs_epi16(a, b) simde_mm_subs_epi16(a, b)
#endif
5949
5950 SIMDE_FUNCTION_ATTRIBUTES
5951 simde__m128i
simde_mm_subs_epu8(simde__m128i a,simde__m128i b)5952 simde_mm_subs_epu8 (simde__m128i a, simde__m128i b) {
5953 #if defined(SIMDE_X86_SSE2_NATIVE)
5954 return _mm_subs_epu8(a, b);
5955 #else
5956 simde__m128i_private
5957 r_,
5958 a_ = simde__m128i_to_private(a),
5959 b_ = simde__m128i_to_private(b);
5960
5961 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5962 r_.neon_u8 = vqsubq_u8(a_.neon_u8, b_.neon_u8);
5963 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
5964 r_.altivec_u8 = vec_subs(a_.altivec_u8, b_.altivec_u8);
5965 #else
5966 SIMDE_VECTORIZE
5967 for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i8[0])) ; i++) {
5968 const int32_t x = a_.u8[i] - b_.u8[i];
5969 if (x < 0) {
5970 r_.u8[i] = 0;
5971 } else if (x > UINT8_MAX) {
5972 r_.u8[i] = UINT8_MAX;
5973 } else {
5974 r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x);
5975 }
5976 }
5977 #endif
5978
5979 return simde__m128i_from_private(r_);
5980 #endif
5981 }
5982 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5983 # define _mm_subs_epu8(a, b) simde_mm_subs_epu8(a, b)
5984 #endif
5985
5986 SIMDE_FUNCTION_ATTRIBUTES
5987 simde__m128i
simde_mm_subs_epu16(simde__m128i a,simde__m128i b)5988 simde_mm_subs_epu16 (simde__m128i a, simde__m128i b) {
5989 #if defined(SIMDE_X86_SSE2_NATIVE)
5990 return _mm_subs_epu16(a, b);
5991 #else
5992 simde__m128i_private
5993 r_,
5994 a_ = simde__m128i_to_private(a),
5995 b_ = simde__m128i_to_private(b);
5996
5997 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5998 r_.neon_u16 = vqsubq_u16(a_.neon_u16, b_.neon_u16);
5999 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
6000 r_.altivec_u16 = vec_subs(a_.altivec_u16, b_.altivec_u16);
6001 #else
6002 SIMDE_VECTORIZE
6003 for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) {
6004 const int32_t x = a_.u16[i] - b_.u16[i];
6005 if (x < 0) {
6006 r_.u16[i] = 0;
6007 } else if (x > UINT16_MAX) {
6008 r_.u16[i] = UINT16_MAX;
6009 } else {
6010 r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x);
6011 }
6012 }
6013 #endif
6014
6015 return simde__m128i_from_private(r_);
6016 #endif
6017 }
6018 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6019 # define _mm_subs_epu16(a, b) simde_mm_subs_epu16(a, b)
6020 #endif
6021
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomieq_sd (simde__m128d a, simde__m128d b) {
  /* Quiet (unordered) compare of the low doubles for equality
   * (SSE2 _mm_ucomieq_sd). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_ucomieq_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
  int r;

#if defined(SIMDE_HAVE_FENV_H)
  /* Hold the FP environment so the comparison leaves no exception flags
   * behind; restore only if feholdexcept() succeeded. */
  fenv_t envp;
  int x = feholdexcept(&envp);
  r = a_.f64[0] == b_.f64[0];
  if (HEDLEY_LIKELY(x == 0))
    fesetenv(&envp);
#else
  r = a_.f64[0] == b_.f64[0];
#endif

  return r;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_ucomieq_sd(a, b) simde_mm_ucomieq_sd(a, b)
#endif
6049
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomige_sd (simde__m128d a, simde__m128d b) {
  /* Quiet (unordered) compare: low double of a >= low double of b
   * (SSE2 _mm_ucomige_sd). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_ucomige_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
  int r;

#if defined(SIMDE_HAVE_FENV_H)
  /* Hold/restore the FP environment to hide exception flags raised by
   * the comparison. */
  fenv_t envp;
  int x = feholdexcept(&envp);
  r = a_.f64[0] >= b_.f64[0];
  if (HEDLEY_LIKELY(x == 0))
    fesetenv(&envp);
#else
  r = a_.f64[0] >= b_.f64[0];
#endif

  return r;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_ucomige_sd(a, b) simde_mm_ucomige_sd(a, b)
#endif
6077
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomigt_sd (simde__m128d a, simde__m128d b) {
  /* Quiet (unordered) compare: low double of a > low double of b
   * (SSE2 _mm_ucomigt_sd). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_ucomigt_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
  int r;

#if defined(SIMDE_HAVE_FENV_H)
  /* Hold/restore the FP environment to hide exception flags raised by
   * the comparison. */
  fenv_t envp;
  int x = feholdexcept(&envp);
  r = a_.f64[0] > b_.f64[0];
  if (HEDLEY_LIKELY(x == 0))
    fesetenv(&envp);
#else
  r = a_.f64[0] > b_.f64[0];
#endif

  return r;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_ucomigt_sd(a, b) simde_mm_ucomigt_sd(a, b)
#endif
6105
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomile_sd (simde__m128d a, simde__m128d b) {
  /* Quiet (unordered) compare: low double of a <= low double of b
   * (SSE2 _mm_ucomile_sd). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_ucomile_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
  int r;

#if defined(SIMDE_HAVE_FENV_H)
  /* Hold/restore the FP environment to hide exception flags raised by
   * the comparison. */
  fenv_t envp;
  int x = feholdexcept(&envp);
  r = a_.f64[0] <= b_.f64[0];
  if (HEDLEY_LIKELY(x == 0))
    fesetenv(&envp);
#else
  r = a_.f64[0] <= b_.f64[0];
#endif

  return r;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_ucomile_sd(a, b) simde_mm_ucomile_sd(a, b)
#endif
6133
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomilt_sd (simde__m128d a, simde__m128d b) {
  /* Quiet (unordered) compare: low double of a < low double of b
   * (SSE2 _mm_ucomilt_sd). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_ucomilt_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
  int r;

#if defined(SIMDE_HAVE_FENV_H)
  /* Hold/restore the FP environment to hide exception flags raised by
   * the comparison. */
  fenv_t envp;
  int x = feholdexcept(&envp);
  r = a_.f64[0] < b_.f64[0];
  if (HEDLEY_LIKELY(x == 0))
    fesetenv(&envp);
#else
  r = a_.f64[0] < b_.f64[0];
#endif

  return r;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_ucomilt_sd(a, b) simde_mm_ucomilt_sd(a, b)
#endif
6161
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomineq_sd (simde__m128d a, simde__m128d b) {
  /* Quiet (unordered) compare of the low doubles for inequality
   * (SSE2 _mm_ucomineq_sd). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_ucomineq_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
  int r;

#if defined(SIMDE_HAVE_FENV_H)
  /* Hold/restore the FP environment to hide exception flags raised by
   * the comparison. */
  fenv_t envp;
  int x = feholdexcept(&envp);
  r = a_.f64[0] != b_.f64[0];
  if (HEDLEY_LIKELY(x == 0))
    fesetenv(&envp);
#else
  r = a_.f64[0] != b_.f64[0];
#endif

  return r;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_ucomineq_sd(a, b) simde_mm_ucomineq_sd(a, b)
#endif
6189
/* NOTE(review): this push/pop pair no longer brackets any code and looks
 * vestigial — confirm nothing relies on it before removing. */
#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
#endif

#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
HEDLEY_DIAGNOSTIC_POP
#endif
6198
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_lfence (void) {
  /* Load fence (SSE2 _mm_lfence); without SSE2 we reuse the SSE sfence
   * emulation as the barrier. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_lfence();
#else
  simde_mm_sfence();
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_lfence() simde_mm_lfence()
#endif
6211
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_mfence (void) {
  /* Full memory fence (SSE2 _mm_mfence); without SSE2 we reuse the SSE
   * sfence emulation as the barrier. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_mfence();
#else
  simde_mm_sfence();
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_mfence() simde_mm_mfence()
#endif
6224
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_unpackhi_epi8 (simde__m128i a, simde__m128i b) {
  /* Interleave the high 8 bytes of 'a' and 'b':
   * r = { a8, b8, a9, b9, ..., a15, b15 } (SSE2 _mm_unpackhi_epi8). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_unpackhi_epi8(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  r_.neon_i8 = vzip2q_s8(a_.neon_i8, b_.neon_i8);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* No vzip2q on ARMv7: zip the high halves and recombine. */
  int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(a_.neon_i16));
  int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(b_.neon_i16));
  int8x8x2_t result = vzip_s8(a1, b1);
  r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
  r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i8[0])) / 2) ; i++) {
    r_.i8[(i * 2)] = a_.i8[i + ((sizeof(r_) / sizeof(r_.i8[0])) / 2)];
    r_.i8[(i * 2) + 1] = b_.i8[i + ((sizeof(r_) / sizeof(r_.i8[0])) / 2)];
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_unpackhi_epi8(a, b) simde_mm_unpackhi_epi8(a, b)
#endif
6259
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_unpackhi_epi16 (simde__m128i a, simde__m128i b) {
  /* Interleave the high four 16-bit lanes of 'a' and 'b':
   * r = { a4, b4, a5, b5, a6, b6, a7, b7 } (SSE2 _mm_unpackhi_epi16). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_unpackhi_epi16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  r_.neon_i16 = vzip2q_s16(a_.neon_i16, b_.neon_i16);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* No vzip2q on ARMv7: zip the high halves and recombine. */
  int16x4_t a1 = vget_high_s16(a_.neon_i16);
  int16x4_t b1 = vget_high_s16(b_.neon_i16);
  int16x4x2_t result = vzip_s16(a1, b1);
  r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
  r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 4, 12, 5, 13, 6, 14, 7, 15);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i16[0])) / 2) ; i++) {
    r_.i16[(i * 2)] = a_.i16[i + ((sizeof(r_) / sizeof(r_.i16[0])) / 2)];
    r_.i16[(i * 2) + 1] = b_.i16[i + ((sizeof(r_) / sizeof(r_.i16[0])) / 2)];
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_unpackhi_epi16(a, b) simde_mm_unpackhi_epi16(a, b)
#endif
6294
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_unpackhi_epi32 (simde__m128i a, simde__m128i b) {
  /* Interleave the high two 32-bit lanes of 'a' and 'b':
   * r = { a2, b2, a3, b3 } (SSE2 _mm_unpackhi_epi32). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_unpackhi_epi32(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  r_.neon_i32 = vzip2q_s32(a_.neon_i32, b_.neon_i32);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* No vzip2q on ARMv7: zip the high halves and recombine. */
  int32x2_t a1 = vget_high_s32(a_.neon_i32);
  int32x2_t b1 = vget_high_s32(b_.neon_i32);
  int32x2x2_t result = vzip_s32(a1, b1);
  r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
  r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 2, 6, 3, 7);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i32[0])) / 2) ; i++) {
    r_.i32[(i * 2)] = a_.i32[i + ((sizeof(r_) / sizeof(r_.i32[0])) / 2)];
    r_.i32[(i * 2) + 1] = b_.i32[i + ((sizeof(r_) / sizeof(r_.i32[0])) / 2)];
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_unpackhi_epi32(a, b) simde_mm_unpackhi_epi32(a, b)
#endif
6329
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_unpackhi_epi64 (simde__m128i a, simde__m128i b) {
  /* Combine the high 64-bit lanes: r = { a1, b1 }
   * (SSE2 _mm_unpackhi_epi64). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_unpackhi_epi64(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  int64x1_t a_h = vget_high_s64(a_.neon_i64);
  int64x1_t b_h = vget_high_s64(b_.neon_i64);
  r_.neon_i64 = vcombine_s64(a_h, b_h);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
  r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 1, 3);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i64[0])) / 2) ; i++) {
    r_.i64[(i * 2)] = a_.i64[i + ((sizeof(r_) / sizeof(r_.i64[0])) / 2)];
    r_.i64[(i * 2) + 1] = b_.i64[i + ((sizeof(r_) / sizeof(r_.i64[0])) / 2)];
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_unpackhi_epi64(a, b) simde_mm_unpackhi_epi64(a, b)
#endif
6361
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_unpackhi_pd (simde__m128d a, simde__m128d b) {
  /* Combine the high doubles: r = { a1, b1 } (SSE2 _mm_unpackhi_pd). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_unpackhi_pd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

#if defined(SIMDE_SHUFFLE_VECTOR_)
  r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 1, 3);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2) ; i++) {
    r_.f64[(i * 2)] = a_.f64[i + ((sizeof(r_) / sizeof(r_.f64[0])) / 2)];
    r_.f64[(i * 2) + 1] = b_.f64[i + ((sizeof(r_) / sizeof(r_.f64[0])) / 2)];
  }
#endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_unpackhi_pd(a, b) simde_mm_unpackhi_pd(a, b)
#endif
6389
6390 SIMDE_FUNCTION_ATTRIBUTES
6391 simde__m128i
simde_mm_unpacklo_epi8(simde__m128i a,simde__m128i b)6392 simde_mm_unpacklo_epi8 (simde__m128i a, simde__m128i b) {
6393 #if defined(SIMDE_X86_SSE2_NATIVE)
6394 return _mm_unpacklo_epi8(a, b);
6395 #else
6396 simde__m128i_private
6397 r_,
6398 a_ = simde__m128i_to_private(a),
6399 b_ = simde__m128i_to_private(b);
6400
6401 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6402 r_.neon_i8 = vzip1q_s8(a_.neon_i8, b_.neon_i8);
6403 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6404 int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(a_.neon_i16));
6405 int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(b_.neon_i16));
6406 int8x8x2_t result = vzip_s8(a1, b1);
6407 r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]);
6408 #elif defined(SIMDE_SHUFFLE_VECTOR_)
6409 r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
6410 #else
6411 SIMDE_VECTORIZE
6412 for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i8[0])) / 2) ; i++) {
6413 r_.i8[(i * 2)] = a_.i8[i];
6414 r_.i8[(i * 2) + 1] = b_.i8[i];
6415 }
6416 #endif
6417
6418 return simde__m128i_from_private(r_);
6419 #endif
6420 }
6421 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6422 # define _mm_unpacklo_epi8(a, b) simde_mm_unpacklo_epi8(a, b)
6423 #endif
6424
6425 SIMDE_FUNCTION_ATTRIBUTES
6426 simde__m128i
simde_mm_unpacklo_epi16(simde__m128i a,simde__m128i b)6427 simde_mm_unpacklo_epi16 (simde__m128i a, simde__m128i b) {
6428 #if defined(SIMDE_X86_SSE2_NATIVE)
6429 return _mm_unpacklo_epi16(a, b);
6430 #else
6431 simde__m128i_private
6432 r_,
6433 a_ = simde__m128i_to_private(a),
6434 b_ = simde__m128i_to_private(b);
6435
6436 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6437 r_.neon_i16 = vzip1q_s16(a_.neon_i16, b_.neon_i16);
6438 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6439 int16x4_t a1 = vget_low_s16(a_.neon_i16);
6440 int16x4_t b1 = vget_low_s16(b_.neon_i16);
6441 int16x4x2_t result = vzip_s16(a1, b1);
6442 r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]);
6443 #elif defined(SIMDE_SHUFFLE_VECTOR_)
6444 r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 0, 8, 1, 9, 2, 10, 3, 11);
6445 #else
6446 SIMDE_VECTORIZE
6447 for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i16[0])) / 2) ; i++) {
6448 r_.i16[(i * 2)] = a_.i16[i];
6449 r_.i16[(i * 2) + 1] = b_.i16[i];
6450 }
6451 #endif
6452
6453 return simde__m128i_from_private(r_);
6454 #endif
6455 }
6456 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6457 # define _mm_unpacklo_epi16(a, b) simde_mm_unpacklo_epi16(a, b)
6458 #endif
6459
6460 SIMDE_FUNCTION_ATTRIBUTES
6461 simde__m128i
simde_mm_unpacklo_epi32(simde__m128i a,simde__m128i b)6462 simde_mm_unpacklo_epi32 (simde__m128i a, simde__m128i b) {
6463 #if defined(SIMDE_X86_SSE2_NATIVE)
6464 return _mm_unpacklo_epi32(a, b);
6465 #else
6466 simde__m128i_private
6467 r_,
6468 a_ = simde__m128i_to_private(a),
6469 b_ = simde__m128i_to_private(b);
6470
6471 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6472 r_.neon_i32 = vzip1q_s32(a_.neon_i32, b_.neon_i32);
6473 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6474 int32x2_t a1 = vget_low_s32(a_.neon_i32);
6475 int32x2_t b1 = vget_low_s32(b_.neon_i32);
6476 int32x2x2_t result = vzip_s32(a1, b1);
6477 r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]);
6478 #elif defined(SIMDE_SHUFFLE_VECTOR_)
6479 r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 0, 4, 1, 5);
6480 #else
6481 SIMDE_VECTORIZE
6482 for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i32[0])) / 2) ; i++) {
6483 r_.i32[(i * 2)] = a_.i32[i];
6484 r_.i32[(i * 2) + 1] = b_.i32[i];
6485 }
6486 #endif
6487
6488 return simde__m128i_from_private(r_);
6489 #endif
6490 }
6491 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6492 # define _mm_unpacklo_epi32(a, b) simde_mm_unpacklo_epi32(a, b)
6493 #endif
6494
6495 SIMDE_FUNCTION_ATTRIBUTES
6496 simde__m128i
simde_mm_unpacklo_epi64(simde__m128i a,simde__m128i b)6497 simde_mm_unpacklo_epi64 (simde__m128i a, simde__m128i b) {
6498 #if defined(SIMDE_X86_SSE2_NATIVE)
6499 return _mm_unpacklo_epi64(a, b);
6500 #else
6501 simde__m128i_private
6502 r_,
6503 a_ = simde__m128i_to_private(a),
6504 b_ = simde__m128i_to_private(b);
6505
6506 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6507 int64x1_t a_l = vget_low_s64(a_.i64);
6508 int64x1_t b_l = vget_low_s64(b_.i64);
6509 r_.neon_i64 = vcombine_s64(a_l, b_l);
6510 #elif defined(SIMDE_SHUFFLE_VECTOR_)
6511 r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 0, 2);
6512 #else
6513 SIMDE_VECTORIZE
6514 for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i64[0])) / 2) ; i++) {
6515 r_.i64[(i * 2)] = a_.i64[i];
6516 r_.i64[(i * 2) + 1] = b_.i64[i];
6517 }
6518 #endif
6519
6520 return simde__m128i_from_private(r_);
6521 #endif
6522 }
6523 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6524 # define _mm_unpacklo_epi64(a, b) simde_mm_unpacklo_epi64(a, b)
6525 #endif
6526
6527 SIMDE_FUNCTION_ATTRIBUTES
6528 simde__m128d
simde_mm_unpacklo_pd(simde__m128d a,simde__m128d b)6529 simde_mm_unpacklo_pd (simde__m128d a, simde__m128d b) {
6530 #if defined(SIMDE_X86_SSE2_NATIVE)
6531 return _mm_unpacklo_pd(a, b);
6532 #else
6533 simde__m128d_private
6534 r_,
6535 a_ = simde__m128d_to_private(a),
6536 b_ = simde__m128d_to_private(b);
6537
6538 #if defined(SIMDE_SHUFFLE_VECTOR_)
6539 r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 0, 2);
6540 #else
6541 SIMDE_VECTORIZE
6542 for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2) ; i++) {
6543 r_.f64[(i * 2)] = a_.f64[i];
6544 r_.f64[(i * 2) + 1] = b_.f64[i];
6545 }
6546 #endif
6547
6548 return simde__m128d_from_private(r_);
6549 #endif
6550 }
6551 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6552 # define _mm_unpacklo_pd(a, b) simde_mm_unpacklo_pd(a, b)
6553 #endif
6554
6555 SIMDE_FUNCTION_ATTRIBUTES
6556 simde__m128d
simde_mm_xor_pd(simde__m128d a,simde__m128d b)6557 simde_mm_xor_pd (simde__m128d a, simde__m128d b) {
6558 #if defined(SIMDE_X86_SSE2_NATIVE)
6559 return _mm_xor_pd(a, b);
6560 #else
6561 simde__m128d_private
6562 r_,
6563 a_ = simde__m128d_to_private(a),
6564 b_ = simde__m128d_to_private(b);
6565
6566 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6567 r_.i32f = a_.i32f ^ b_.i32f;
6568 #else
6569 SIMDE_VECTORIZE
6570 for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
6571 r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i];
6572 }
6573 #endif
6574
6575 return simde__m128d_from_private(r_);
6576 #endif
6577 }
6578 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6579 # define _mm_xor_pd(a, b) simde_mm_xor_pd(a, b)
6580 #endif
6581
6582 SIMDE_FUNCTION_ATTRIBUTES
6583 simde__m128d
simde_x_mm_negate_pd(simde__m128d a)6584 simde_x_mm_negate_pd(simde__m128d a) {
6585 #if defined(SIMDE_X86_SSE_NATIVE)
6586 return simde_mm_xor_pd(a, _mm_set1_pd(SIMDE_FLOAT64_C(-0.0)));
6587 #else
6588 simde__m128d_private
6589 r_,
6590 a_ = simde__m128d_to_private(a);
6591
6592 #if defined(SIMDE_POWER_ALTIVEC_P9_NATIVE)
6593 r_.altivec_f64 = vec_neg(a_.altivec_f64);
6594 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6595 r_.neon_f64 = vnegq_f64(a_.neon_f64);
6596 #elif defined(SIMDE_WASM_SIMD128d_NATIVE)
6597 r_.wasm_v128d = wasm_f64x2_neg(a_.wasm_v128d);
6598 #elif defined(SIMDE_VECTOR_OPS)
6599 r_.f64 = -a_.f64;
6600 #else
6601 SIMDE_VECTORIZE
6602 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
6603 r_.f64[i] = -a_.f64[i];
6604 }
6605 #endif
6606
6607 return simde__m128d_from_private(r_);
6608 #endif
6609 }
6610
6611 SIMDE_FUNCTION_ATTRIBUTES
6612 simde__m128i
simde_mm_xor_si128(simde__m128i a,simde__m128i b)6613 simde_mm_xor_si128 (simde__m128i a, simde__m128i b) {
6614 #if defined(SIMDE_X86_SSE2_NATIVE)
6615 return _mm_xor_si128(a, b);
6616 #else
6617 simde__m128i_private
6618 r_,
6619 a_ = simde__m128i_to_private(a),
6620 b_ = simde__m128i_to_private(b);
6621
6622 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6623 r_.neon_i32 = veorq_s32(a_.neon_i32, b_.neon_i32);
6624 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
6625 r_.altivec_i32 = vec_xor(a_.altivec_i32, b_.altivec_i32);
6626 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6627 r_.i32f = a_.i32f ^ b_.i32f;
6628 #else
6629 SIMDE_VECTORIZE
6630 for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
6631 r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i];
6632 }
6633 #endif
6634
6635 return simde__m128i_from_private(r_);
6636 #endif
6637 }
6638 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6639 # define _mm_xor_si128(a, b) simde_mm_xor_si128(a, b)
6640 #endif
6641
6642 SIMDE_FUNCTION_ATTRIBUTES
6643 simde__m128i
simde_x_mm_not_si128(simde__m128i a)6644 simde_x_mm_not_si128 (simde__m128i a) {
6645 simde__m128i_private
6646 r_,
6647 a_ = simde__m128i_to_private(a);
6648
6649 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6650 r_.neon_i32 = vmvnq_s32(a_.neon_i32);
6651 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6652 r_.i32f = ~(a_.i32f);
6653 #else
6654 SIMDE_VECTORIZE
6655 for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
6656 r_.i32f[i] = ~(a_.i32f[i]);
6657 }
6658 #endif
6659
6660 return simde__m128i_from_private(r_);
6661 }
6662
/* Build the 2-bit immediate ((x << 1) | y) used by the double-precision
 * shuffle intrinsics; mirrors the native _MM_SHUFFLE2 macro. */
#define SIMDE_MM_SHUFFLE2(x, y) (((x) << 1) | (y))
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _MM_SHUFFLE2(x, y) SIMDE_MM_SHUFFLE2(x, y)
#endif
6667
6668 SIMDE_END_DECLS_
6669
6670 HEDLEY_DIAGNOSTIC_POP
6671
6672 #endif /* !defined(SIMDE_X86_SSE2_H) */
6673