1 /* SPDX-License-Identifier: MIT
2 *
3 * Permission is hereby granted, free of charge, to any person
4 * obtaining a copy of this software and associated documentation
5 * files (the "Software"), to deal in the Software without
6 * restriction, including without limitation the rights to use, copy,
7 * modify, merge, publish, distribute, sublicense, and/or sell copies
8 * of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be
12 * included in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Copyright:
24 * 2017-2020 Evan Nemerson <evan@nemerson.com>
25 * 2015-2017 John W. Ratcliff <jratcliffscarab@gmail.com>
26 * 2015 Brandon Rowlett <browlett@nvidia.com>
27 * 2015 Ken Fast <kfast@gdeb.com>
28 * 2017 Hasindu Gamaarachchi <hasindu@unsw.edu.au>
29 * 2018 Jeff Daily <jeff.daily@amd.com>
30 */
31
32 #if !defined(SIMDE_X86_SSE2_H)
33 #define SIMDE_X86_SSE2_H
34
35 #include "sse.h"
36
37 HEDLEY_DIAGNOSTIC_PUSH
38 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
39 SIMDE_BEGIN_DECLS_
40
41 typedef union {
42 #if defined(SIMDE_VECTOR_SUBSCRIPT)
43 SIMDE_ALIGN(16) int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
44 SIMDE_ALIGN(16) int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
45 SIMDE_ALIGN(16) int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
46 SIMDE_ALIGN(16) int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
47 SIMDE_ALIGN(16) uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
48 SIMDE_ALIGN(16) uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
49 SIMDE_ALIGN(16) uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
50 SIMDE_ALIGN(16) uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
51 #if defined(SIMDE_HAVE_INT128_)
52 SIMDE_ALIGN(16) simde_int128 i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
53 SIMDE_ALIGN(16) simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
54 #endif
55 SIMDE_ALIGN(16) simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
56 SIMDE_ALIGN(16) simde_float64 f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
57
58 SIMDE_ALIGN(16) int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
59 SIMDE_ALIGN(16) uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
60 #else
61 SIMDE_ALIGN(16) int8_t i8[16];
62 SIMDE_ALIGN(16) int16_t i16[8];
63 SIMDE_ALIGN(16) int32_t i32[4];
64 SIMDE_ALIGN(16) int64_t i64[2];
65 SIMDE_ALIGN(16) uint8_t u8[16];
66 SIMDE_ALIGN(16) uint16_t u16[8];
67 SIMDE_ALIGN(16) uint32_t u32[4];
68 SIMDE_ALIGN(16) uint64_t u64[2];
69 #if defined(SIMDE_HAVE_INT128_)
70 SIMDE_ALIGN(16) simde_int128 i128[1];
71 SIMDE_ALIGN(16) simde_uint128 u128[1];
72 #endif
73 SIMDE_ALIGN(16) simde_float32 f32[4];
74 SIMDE_ALIGN(16) simde_float64 f64[2];
75
76 SIMDE_ALIGN(16) int_fast32_t i32f[16 / sizeof(int_fast32_t)];
77 SIMDE_ALIGN(16) uint_fast32_t u32f[16 / sizeof(uint_fast32_t)];
78 #endif
79
80 SIMDE_ALIGN(16) simde__m64_private m64_private[2];
81 SIMDE_ALIGN(16) simde__m64 m64[2];
82
83 #if defined(SIMDE_X86_SSE2_NATIVE)
84 SIMDE_ALIGN(16) __m128i n;
85 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
86 SIMDE_ALIGN(16) int8x16_t neon_i8;
87 SIMDE_ALIGN(16) int16x8_t neon_i16;
88 SIMDE_ALIGN(16) int32x4_t neon_i32;
89 SIMDE_ALIGN(16) int64x2_t neon_i64;
90 SIMDE_ALIGN(16) uint8x16_t neon_u8;
91 SIMDE_ALIGN(16) uint16x8_t neon_u16;
92 SIMDE_ALIGN(16) uint32x4_t neon_u32;
93 SIMDE_ALIGN(16) uint64x2_t neon_u64;
94 SIMDE_ALIGN(16) float32x4_t neon_f32;
95 #if defined(SIMDE_ARCH_AARCH64)
96 SIMDE_ALIGN(16) float64x2_t neon_f64;
97 #endif
98 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
99 SIMDE_ALIGN(16) v128_t wasm_v128;
100 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
101 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8;
102 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16;
103 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32;
104 #if defined(__UINT_FAST32_TYPE__)
105 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(__INT_FAST32_TYPE__) altivec_i32f;
106 #else
107 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32f;
108 #endif
109 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64;
110 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8;
111 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16;
112 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32;
113 #if defined(__UINT_FAST32_TYPE__)
114 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(__UINT_FAST32_TYPE__) altivec_u32f;
115 #else
116 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32f;
117 #endif
118 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64;
119 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32;
120 #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
121 SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64;
122 #endif
123 #endif
124 } simde__m128i_private;
125
/* Private (implementation) view of a 128-bit double-precision vector.
 * All members alias the same 16 bytes; the native member present depends
 * on the detected backend.  Mirrors simde__m128i_private above. */
typedef union {
#if defined(SIMDE_VECTOR_SUBSCRIPT)
  SIMDE_ALIGN(16) int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) simde_float64 f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
#else
  SIMDE_ALIGN(16) int8_t i8[16];
  SIMDE_ALIGN(16) int16_t i16[8];
  SIMDE_ALIGN(16) int32_t i32[4];
  SIMDE_ALIGN(16) int64_t i64[2];
  SIMDE_ALIGN(16) uint8_t u8[16];
  SIMDE_ALIGN(16) uint16_t u16[8];
  SIMDE_ALIGN(16) uint32_t u32[4];
  SIMDE_ALIGN(16) uint64_t u64[2];
  SIMDE_ALIGN(16) simde_float32 f32[4];
  SIMDE_ALIGN(16) simde_float64 f64[2];
  SIMDE_ALIGN(16) int_fast32_t i32f[16 / sizeof(int_fast32_t)];
  SIMDE_ALIGN(16) uint_fast32_t u32f[16 / sizeof(uint_fast32_t)];
#endif

  SIMDE_ALIGN(16) simde__m64_private m64_private[2];
  SIMDE_ALIGN(16) simde__m64 m64[2];

#if defined(SIMDE_X86_SSE2_NATIVE)
  SIMDE_ALIGN(16) __m128d n;
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  SIMDE_ALIGN(16) int8x16_t neon_i8;
  SIMDE_ALIGN(16) int16x8_t neon_i16;
  SIMDE_ALIGN(16) int32x4_t neon_i32;
  SIMDE_ALIGN(16) int64x2_t neon_i64;
  SIMDE_ALIGN(16) uint8x16_t neon_u8;
  SIMDE_ALIGN(16) uint16x8_t neon_u16;
  SIMDE_ALIGN(16) uint32x4_t neon_u32;
  SIMDE_ALIGN(16) uint64x2_t neon_u64;
  SIMDE_ALIGN(16) float32x4_t neon_f32;
  #if defined(SIMDE_ARCH_AARCH64)
  SIMDE_ALIGN(16) float64x2_t neon_f64;
  #endif
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  SIMDE_ALIGN(16) v128_t wasm_v128;
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8;
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16;
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32;
  /* Prefer the compiler-provided fast type when available. */
  #if defined(__INT_FAST32_TYPE__)
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(__INT_FAST32_TYPE__) altivec_i32f;
  #else
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32f;
  #endif
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64;
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8;
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16;
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32;
  #if defined(__UINT_FAST32_TYPE__)
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(__UINT_FAST32_TYPE__) altivec_u32f;
  #else
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32f;
  #endif
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64;
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32;
  #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64;
  #endif
#endif
} simde__m128d_private;
200
/* Select the public simde__m128i / simde__m128d representation: the real
 * native type when one exists, a GCC-style generic vector when vector
 * subscripting is available, otherwise the private union itself. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  typedef __m128i simde__m128i;
  typedef __m128d simde__m128d;
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
   typedef int64x2_t simde__m128i;
#  if defined(SIMDE_ARCH_AARCH64)
     /* float64x2_t only exists on AArch64. */
     typedef float64x2_t simde__m128d;
#  elif defined(SIMDE_VECTOR_SUBSCRIPT)
     typedef simde_float64 simde__m128d SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
#  else
     typedef simde__m128d_private simde__m128d;
#  endif
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
   typedef v128_t simde__m128i;
   typedef v128_t simde__m128d;
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  typedef SIMDE_POWER_ALTIVEC_VECTOR(float) simde__m128i;
  /* vector double requires VSX (POWER7). */
  #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
    typedef SIMDE_POWER_ALTIVEC_VECTOR(double) simde__m128d;
  #else
    typedef simde__m128d_private simde__m128d;
  #endif
#elif defined(SIMDE_VECTOR_SUBSCRIPT)
  typedef int64_t simde__m128i SIMDE_ALIGN(16) SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  typedef simde_float64 simde__m128d SIMDE_ALIGN(16) SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
#else
  typedef simde__m128i_private simde__m128i;
  typedef simde__m128d_private simde__m128d;
#endif

/* Expose the Intel names when native aliasing was requested. */
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  typedef simde__m128i __m128i;
  typedef simde__m128d __m128d;
#endif
235
/* Sanity checks: every representation chosen above must be exactly 16
 * bytes, and (when the compiler can tell us) 16-byte aligned, so the
 * public and private types can be freely memcpy'd into one another. */
HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i), "simde__m128i size incorrect");
HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i_private), "simde__m128i_private size incorrect");
HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d), "simde__m128d size incorrect");
HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d_private), "simde__m128d_private size incorrect");
#if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF)
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128i) == 16, "simde__m128i is not 16-byte aligned");
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128i_private) == 16, "simde__m128i_private is not 16-byte aligned");
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128d) == 16, "simde__m128d is not 16-byte aligned");
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128d_private) == 16, "simde__m128d_private is not 16-byte aligned");
#endif
246
247 SIMDE_FUNCTION_ATTRIBUTES
248 simde__m128i
simde__m128i_from_private(simde__m128i_private v)249 simde__m128i_from_private(simde__m128i_private v) {
250 simde__m128i r;
251 simde_memcpy(&r, &v, sizeof(r));
252 return r;
253 }
254
255 SIMDE_FUNCTION_ATTRIBUTES
256 simde__m128i_private
simde__m128i_to_private(simde__m128i v)257 simde__m128i_to_private(simde__m128i v) {
258 simde__m128i_private r;
259 simde_memcpy(&r, &v, sizeof(r));
260 return r;
261 }
262
263 SIMDE_FUNCTION_ATTRIBUTES
264 simde__m128d
simde__m128d_from_private(simde__m128d_private v)265 simde__m128d_from_private(simde__m128d_private v) {
266 simde__m128d r;
267 simde_memcpy(&r, &v, sizeof(r));
268 return r;
269 }
270
271 SIMDE_FUNCTION_ATTRIBUTES
272 simde__m128d_private
simde__m128d_to_private(simde__m128d v)273 simde__m128d_to_private(simde__m128d v) {
274 simde__m128d_private r;
275 simde_memcpy(&r, &v, sizeof(r));
276 return r;
277 }
278
/* Generate simde__m128i_{to,from}_<backend>_<lane> accessors for the
 * native vector types of the active backend.  On AltiVec, 64-bit integer
 * vectors only exist from POWER8 on, hence the extra guard for i64. */
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int8x16_t, neon, i8)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int16x8_t, neon, i16)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int32x4_t, neon, i32)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int64x2_t, neon, i64)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint8x16_t, neon, u8)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint16x8_t, neon, u16)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint32x4_t, neon, u32)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint64x2_t, neon, u64)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, float32x4_t, neon, f32)
  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, float64x2_t, neon, f64)
  #endif
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed char), altivec, i8)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed short), altivec, i16)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed int), altivec, i32)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), altivec, u32)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64)
  #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64)
  #endif
#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
304
/* Generate simde__m128d_{to,from}_<backend>_<lane> accessors, mirroring
 * the m128i block above.
 *
 * Fix: the guard around the AltiVec i64 conversion re-tested
 * SIMDE_POWER_ALTIVEC_P5_NATIVE inside a branch already guarded by P5
 * (always true), yet 64-bit integer AltiVec vectors require POWER8 —
 * use P8, consistent with the m128i section. */
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int8x16_t, neon, i8)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int16x8_t, neon, i16)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int32x4_t, neon, i32)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int64x2_t, neon, i64)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint8x16_t, neon, u8)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint16x8_t, neon, u16)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint32x4_t, neon, u32)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint64x2_t, neon, u64)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, float32x4_t, neon, f32)
  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, float64x2_t, neon, f64)
  #endif
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed char), altivec, i8)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed short), altivec, i16)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed int), altivec, i32)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), altivec, u32)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64)
  #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64)
  #endif

  #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
    #if defined(SIMDE_BUG_GCC_95782)
      /* GCC bug 95782 workaround: spell out the f64 accessors by hand. */
      SIMDE_FUNCTION_ATTRIBUTES
      SIMDE_POWER_ALTIVEC_VECTOR(double)
      simde__m128d_to_altivec_f64(simde__m128d value) {
        simde__m128d_private r_ = simde__m128d_to_private(value);
        return r_.altivec_f64;
      }

      SIMDE_FUNCTION_ATTRIBUTES
      simde__m128d
      simde__m128d_from_altivec_f64(SIMDE_POWER_ALTIVEC_VECTOR(double) value) {
        simde__m128d_private r_;
        r_.altivec_f64 = value;
        return simde__m128d_from_private(r_);
      }
    #else
      SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(double), altivec, f64)
    #endif
  #endif
#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
351
352 SIMDE_FUNCTION_ATTRIBUTES
353 simde__m128i
354 simde_mm_add_epi8 (simde__m128i a, simde__m128i b) {
355 #if defined(SIMDE_X86_SSE2_NATIVE)
356 return _mm_add_epi8(a, b);
357 #else
358 simde__m128i_private
359 r_,
360 a_ = simde__m128i_to_private(a),
361 b_ = simde__m128i_to_private(b);
362
363 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
364 r_.neon_i8 = vaddq_s8(a_.neon_i8, b_.neon_i8);
365 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
366 r_.altivec_i8 = vec_add(a_.altivec_i8, b_.altivec_i8);
367 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
368 r_.i8 = a_.i8 + b_.i8;
369 #else
370 SIMDE_VECTORIZE
371 for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
372 r_.i8[i] = a_.i8[i] + b_.i8[i];
373 }
374 #endif
375
376 return simde__m128i_from_private(r_);
377 #endif
378 }
379 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
380 # define _mm_add_epi8(a, b) simde_mm_add_epi8(a, b)
381 #endif
382
383 SIMDE_FUNCTION_ATTRIBUTES
384 simde__m128i
simde_mm_add_epi16(simde__m128i a,simde__m128i b)385 simde_mm_add_epi16 (simde__m128i a, simde__m128i b) {
386 #if defined(SIMDE_X86_SSE2_NATIVE)
387 return _mm_add_epi16(a, b);
388 #else
389 simde__m128i_private
390 r_,
391 a_ = simde__m128i_to_private(a),
392 b_ = simde__m128i_to_private(b);
393
394 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
395 r_.neon_i16 = vaddq_s16(a_.neon_i16, b_.neon_i16);
396 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
397 r_.altivec_i16 = vec_add(a_.altivec_i16, b_.altivec_i16);
398 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
399 r_.i16 = a_.i16 + b_.i16;
400 #else
401 SIMDE_VECTORIZE
402 for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
403 r_.i16[i] = a_.i16[i] + b_.i16[i];
404 }
405 #endif
406
407 return simde__m128i_from_private(r_);
408 #endif
409 }
410 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
411 # define _mm_add_epi16(a, b) simde_mm_add_epi16(a, b)
412 #endif
413
414 SIMDE_FUNCTION_ATTRIBUTES
415 simde__m128i
simde_mm_add_epi32(simde__m128i a,simde__m128i b)416 simde_mm_add_epi32 (simde__m128i a, simde__m128i b) {
417 #if defined(SIMDE_X86_SSE2_NATIVE)
418 return _mm_add_epi32(a, b);
419 #else
420 simde__m128i_private
421 r_,
422 a_ = simde__m128i_to_private(a),
423 b_ = simde__m128i_to_private(b);
424
425 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
426 r_.neon_i32 = vaddq_s32(a_.neon_i32, b_.neon_i32);
427 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
428 r_.altivec_i32 = vec_add(a_.altivec_i32, b_.altivec_i32);
429 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
430 r_.i32 = a_.i32 + b_.i32;
431 #else
432 SIMDE_VECTORIZE
433 for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
434 r_.i32[i] = a_.i32[i] + b_.i32[i];
435 }
436 #endif
437
438 return simde__m128i_from_private(r_);
439 #endif
440 }
441 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
442 # define _mm_add_epi32(a, b) simde_mm_add_epi32(a, b)
443 #endif
444
445 SIMDE_FUNCTION_ATTRIBUTES
446 simde__m128i
simde_mm_add_epi64(simde__m128i a,simde__m128i b)447 simde_mm_add_epi64 (simde__m128i a, simde__m128i b) {
448 #if defined(SIMDE_X86_SSE2_NATIVE)
449 return _mm_add_epi64(a, b);
450 #else
451 simde__m128i_private
452 r_,
453 a_ = simde__m128i_to_private(a),
454 b_ = simde__m128i_to_private(b);
455
456 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
457 r_.neon_i64 = vaddq_s64(a_.neon_i64, b_.neon_i64);
458 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
459 r_.altivec_i64 = vec_add(a_.altivec_i64, b_.altivec_i64);
460 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
461 r_.i64 = a_.i64 + b_.i64;
462 #else
463 SIMDE_VECTORIZE
464 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
465 r_.i64[i] = a_.i64[i] + b_.i64[i];
466 }
467 #endif
468
469 return simde__m128i_from_private(r_);
470 #endif
471 }
472 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
473 # define _mm_add_epi64(a, b) simde_mm_add_epi64(a, b)
474 #endif
475
476 SIMDE_FUNCTION_ATTRIBUTES
477 simde__m128d
simde_mm_add_pd(simde__m128d a,simde__m128d b)478 simde_mm_add_pd (simde__m128d a, simde__m128d b) {
479 #if defined(SIMDE_X86_SSE2_NATIVE)
480 return _mm_add_pd(a, b);
481 #else
482 simde__m128d_private
483 r_,
484 a_ = simde__m128d_to_private(a),
485 b_ = simde__m128d_to_private(b);
486
487 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
488 r_.neon_f64 = vaddq_f64(a_.neon_f64, b_.neon_f64);
489 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
490 r_.wasm_v128 = wasm_f64x2_add(a_.wasm_v128, b_.wasm_v128);
491 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
492 r_.altivec_f64 = vec_add(a_.altivec_f64, b_.altivec_f64);
493 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
494 r_.f64 = a_.f64 + b_.f64;
495 #else
496 SIMDE_VECTORIZE
497 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
498 r_.f64[i] = a_.f64[i] + b_.f64[i];
499 }
500 #endif
501
502 return simde__m128d_from_private(r_);
503 #endif
504 }
505 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
506 # define _mm_add_pd(a, b) simde_mm_add_pd(a, b)
507 #endif
508
/* _mm_move_sd: result is { b[0], a[1] } — lane 0 from b, lane 1 from a. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_move_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_move_sd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  /* Insert b's lane 0 into a. */
  r_.neon_f64 = vsetq_lane_f64(vgetq_lane_f64(b_.neon_f64, 0), a_.neon_f64, 0);
#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
  /* vec_perm byte mask over the 32-byte (a,b) concatenation:
   * bytes 16-23 = b's low double, bytes 8-15 = a's high double. */
  SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) m = {
    16, 17, 18, 19, 20, 21, 22, 23,
    8, 9, 10, 11, 12, 13, 14, 15
  };
  r_.altivec_f64 = vec_perm(a_.altivec_f64, b_.altivec_f64, m);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
  /* Indices address the concatenated (a,b): 2 = b[0], 1 = a[1]. */
  r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 2, 1);
#else
  r_.f64[0] = b_.f64[0];
  r_.f64[1] = a_.f64[1];
#endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_move_sd(a, b) simde_mm_move_sd(a, b)
#endif
541
542 SIMDE_FUNCTION_ATTRIBUTES
543 simde__m128d
simde_mm_add_sd(simde__m128d a,simde__m128d b)544 simde_mm_add_sd (simde__m128d a, simde__m128d b) {
545 #if defined(SIMDE_X86_SSE2_NATIVE)
546 return _mm_add_sd(a, b);
547 #else
548 simde__m128d_private
549 r_,
550 a_ = simde__m128d_to_private(a),
551 b_ = simde__m128d_to_private(b);
552
553 r_.f64[0] = a_.f64[0] + b_.f64[0];
554 r_.f64[1] = a_.f64[1];
555
556 #if defined(SIMDE_ASSUME_VECTORIZATION)
557 return simde_mm_move_sd(a, simde_mm_add_pd(a, b));
558 #else
559 r_.f64[0] = a_.f64[0] + b_.f64[0];
560 r_.f64[1] = a_.f64[1];
561 #endif
562
563 return simde__m128d_from_private(r_);
564 #endif
565 }
566 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
567 # define _mm_add_sd(a, b) simde_mm_add_sd(a, b)
568 #endif
569
570 SIMDE_FUNCTION_ATTRIBUTES
571 simde__m64
simde_mm_add_si64(simde__m64 a,simde__m64 b)572 simde_mm_add_si64 (simde__m64 a, simde__m64 b) {
573 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
574 return _mm_add_si64(a, b);
575 #else
576 simde__m64_private
577 r_,
578 a_ = simde__m64_to_private(a),
579 b_ = simde__m64_to_private(b);
580
581 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
582 r_.neon_i64 = vadd_s64(a_.neon_i64, b_.neon_i64);
583 #else
584 r_.i64[0] = a_.i64[0] + b_.i64[0];
585 #endif
586
587 return simde__m64_from_private(r_);
588 #endif
589 }
590 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
591 # define _mm_add_si64(a, b) simde_mm_add_si64(a, b)
592 #endif
593
594 SIMDE_FUNCTION_ATTRIBUTES
595 simde__m128i
simde_mm_adds_epi8(simde__m128i a,simde__m128i b)596 simde_mm_adds_epi8 (simde__m128i a, simde__m128i b) {
597 #if defined(SIMDE_X86_SSE2_NATIVE)
598 return _mm_adds_epi8(a, b);
599 #else
600 simde__m128i_private
601 r_,
602 a_ = simde__m128i_to_private(a),
603 b_ = simde__m128i_to_private(b);
604
605 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
606 r_.neon_i8 = vqaddq_s8(a_.neon_i8, b_.neon_i8);
607 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
608 r_.altivec_i8 = vec_adds(a_.altivec_i8, b_.altivec_i8);
609 #else
610 SIMDE_VECTORIZE
611 for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
612 const int32_t tmp =
613 HEDLEY_STATIC_CAST(int16_t, a_.i8[i]) +
614 HEDLEY_STATIC_CAST(int16_t, b_.i8[i]);
615 r_.i8[i] = HEDLEY_STATIC_CAST(int8_t, ((tmp < INT8_MAX) ? ((tmp > INT8_MIN) ? tmp : INT8_MIN) : INT8_MAX));
616 }
617 #endif
618
619 return simde__m128i_from_private(r_);
620 #endif
621 }
622 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
623 # define _mm_adds_epi8(a, b) simde_mm_adds_epi8(a, b)
624 #endif
625
626 SIMDE_FUNCTION_ATTRIBUTES
627 simde__m128i
simde_mm_adds_epi16(simde__m128i a,simde__m128i b)628 simde_mm_adds_epi16 (simde__m128i a, simde__m128i b) {
629 #if defined(SIMDE_X86_SSE2_NATIVE)
630 return _mm_adds_epi16(a, b);
631 #else
632 simde__m128i_private
633 r_,
634 a_ = simde__m128i_to_private(a),
635 b_ = simde__m128i_to_private(b);
636
637
638 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
639 r_.neon_i16 = vqaddq_s16(a_.neon_i16, b_.neon_i16);
640 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
641 r_.altivec_i16 = vec_adds(a_.altivec_i16, b_.altivec_i16);
642 #else
643 SIMDE_VECTORIZE
644 for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
645 const int32_t tmp =
646 HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) +
647 HEDLEY_STATIC_CAST(int32_t, b_.i16[i]);
648 r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, ((tmp < INT16_MAX) ? ((tmp > INT16_MIN) ? tmp : INT16_MIN) : INT16_MAX));
649 }
650 #endif
651
652 return simde__m128i_from_private(r_);
653 #endif
654 }
655 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
656 # define _mm_adds_epi16(a, b) simde_mm_adds_epi16(a, b)
657 #endif
658
659 SIMDE_FUNCTION_ATTRIBUTES
660 simde__m128i
simde_mm_adds_epu8(simde__m128i a,simde__m128i b)661 simde_mm_adds_epu8 (simde__m128i a, simde__m128i b) {
662 #if defined(SIMDE_X86_SSE2_NATIVE)
663 return _mm_adds_epu8(a, b);
664 #else
665 simde__m128i_private
666 r_,
667 a_ = simde__m128i_to_private(a),
668 b_ = simde__m128i_to_private(b);
669
670 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
671 r_.neon_u8 = vqaddq_u8(a_.neon_u8, b_.neon_u8);
672 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
673 r_.altivec_u8 = vec_adds(a_.altivec_u8, b_.altivec_u8);
674 #else
675 SIMDE_VECTORIZE
676 for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
677 r_.u8[i] = ((UINT8_MAX - a_.u8[i]) > b_.u8[i]) ? (a_.u8[i] + b_.u8[i]) : UINT8_MAX;
678 }
679 #endif
680
681 return simde__m128i_from_private(r_);
682 #endif
683 }
684 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
685 # define _mm_adds_epu8(a, b) simde_mm_adds_epu8(a, b)
686 #endif
687
688 SIMDE_FUNCTION_ATTRIBUTES
689 simde__m128i
simde_mm_adds_epu16(simde__m128i a,simde__m128i b)690 simde_mm_adds_epu16 (simde__m128i a, simde__m128i b) {
691 #if defined(SIMDE_X86_SSE2_NATIVE)
692 return _mm_adds_epu16(a, b);
693 #else
694 simde__m128i_private
695 r_,
696 a_ = simde__m128i_to_private(a),
697 b_ = simde__m128i_to_private(b);
698
699 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
700 r_.neon_u16 = vqaddq_u16(a_.neon_u16, b_.neon_u16);
701 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
702 r_.altivec_u16 = vec_adds(a_.altivec_u16, b_.altivec_u16);
703 #else
704 SIMDE_VECTORIZE
705 for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
706 r_.u16[i] = ((UINT16_MAX - a_.u16[i]) > b_.u16[i]) ? (a_.u16[i] + b_.u16[i]) : UINT16_MAX;
707 }
708 #endif
709
710 return simde__m128i_from_private(r_);
711 #endif
712 }
713 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
714 # define _mm_adds_epu16(a, b) simde_mm_adds_epu16(a, b)
715 #endif
716
717 SIMDE_FUNCTION_ATTRIBUTES
718 simde__m128d
simde_mm_and_pd(simde__m128d a,simde__m128d b)719 simde_mm_and_pd (simde__m128d a, simde__m128d b) {
720 #if defined(SIMDE_X86_SSE2_NATIVE)
721 return _mm_and_pd(a, b);
722 #else
723 simde__m128d_private
724 r_,
725 a_ = simde__m128d_to_private(a),
726 b_ = simde__m128d_to_private(b);
727
728 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
729 r_.neon_i32 = vandq_s32(a_.neon_i32, b_.neon_i32);
730 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
731 r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128);
732 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
733 r_.altivec_f64 = vec_and(a_.altivec_f64, b_.altivec_f64);
734 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
735 r_.i32f = a_.i32f & b_.i32f;
736 #else
737 SIMDE_VECTORIZE
738 for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
739 r_.i32f[i] = a_.i32f[i] & b_.i32f[i];
740 }
741 #endif
742
743 return simde__m128d_from_private(r_);
744 #endif
745 }
746 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
747 # define _mm_and_pd(a, b) simde_mm_and_pd(a, b)
748 #endif
749
750 SIMDE_FUNCTION_ATTRIBUTES
751 simde__m128i
simde_mm_and_si128(simde__m128i a,simde__m128i b)752 simde_mm_and_si128 (simde__m128i a, simde__m128i b) {
753 #if defined(SIMDE_X86_SSE2_NATIVE)
754 return _mm_and_si128(a, b);
755 #else
756 simde__m128i_private
757 r_,
758 a_ = simde__m128i_to_private(a),
759 b_ = simde__m128i_to_private(b);
760
761 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
762 r_.neon_i32 = vandq_s32(b_.neon_i32, a_.neon_i32);
763 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
764 r_.altivec_u32f = vec_and(a_.altivec_u32f, b_.altivec_u32f);
765 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
766 r_.i32f = a_.i32f & b_.i32f;
767 #else
768 SIMDE_VECTORIZE
769 for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
770 r_.i32f[i] = a_.i32f[i] & b_.i32f[i];
771 }
772 #endif
773
774 return simde__m128i_from_private(r_);
775 #endif
776 }
777 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
778 # define _mm_and_si128(a, b) simde_mm_and_si128(a, b)
779 #endif
780
781 SIMDE_FUNCTION_ATTRIBUTES
782 simde__m128d
simde_mm_andnot_pd(simde__m128d a,simde__m128d b)783 simde_mm_andnot_pd (simde__m128d a, simde__m128d b) {
784 #if defined(SIMDE_X86_SSE2_NATIVE)
785 return _mm_andnot_pd(a, b);
786 #else
787 simde__m128d_private
788 r_,
789 a_ = simde__m128d_to_private(a),
790 b_ = simde__m128d_to_private(b);
791
792 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
793 r_.neon_i32 = vbicq_s32(a_.neon_i32, b_.neon_i32);
794 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
795 r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128);
796 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
797 r_.altivec_i32f = vec_andc(a_.altivec_i32f, b_.altivec_i32f);
798 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
799 r_.i32f = ~a_.i32f & b_.i32f;
800 #else
801 SIMDE_VECTORIZE
802 for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
803 r_.u64[i] = ~a_.u64[i] & b_.u64[i];
804 }
805 #endif
806
807 return simde__m128d_from_private(r_);
808 #endif
809 }
810 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
811 # define _mm_andnot_pd(a, b) simde_mm_andnot_pd(a, b)
812 #endif
813
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_andnot_si128 (simde__m128i a, simde__m128i b) {
  /* Bitwise (~a) & b over the full 128-bit value. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_andnot_si128(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* vbicq_s32(x, y) == x & ~y, so (b, a) yields b & ~a == (~a) & b. */
  r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  /* vec_andc(x, y) == x & ~y; operands swapped for the same reason. */
  r_.altivec_i32 = vec_andc(b_.altivec_i32, a_.altivec_i32);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  r_.i32f = ~a_.i32f & b_.i32f;
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
    r_.i32f[i] = ~(a_.i32f[i]) & b_.i32f[i];
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_andnot_si128(a, b) simde_mm_andnot_si128(a, b)
#endif
844
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_avg_epu8 (simde__m128i a, simde__m128i b) {
  /* Rounding average of each unsigned 8-bit lane: (a + b + 1) >> 1. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_avg_epu8(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* vrhaddq_u8 is a rounding halving add, exactly matching pavgb. */
  r_.neon_u8 = vrhaddq_u8(b_.neon_u8, a_.neon_u8);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  r_.altivec_u8 = vec_avg(a_.altivec_u8, b_.altivec_u8);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_)
  /* Widen to 16 bits so (a + b + 1) cannot overflow, then narrow back. */
  uint16_t wa SIMDE_VECTOR(32);
  uint16_t wb SIMDE_VECTOR(32);
  uint16_t wr SIMDE_VECTOR(32);
  SIMDE_CONVERT_VECTOR_(wa, a_.u8);
  SIMDE_CONVERT_VECTOR_(wb, b_.u8);
  wr = (wa + wb + 1) >> 1;
  SIMDE_CONVERT_VECTOR_(r_.u8, wr);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
    /* Operands promote to int, so the intermediate sum cannot overflow. */
    r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1;
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_avg_epu8(a, b) simde_mm_avg_epu8(a, b)
#endif
881
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_avg_epu16 (simde__m128i a, simde__m128i b) {
  /* Rounding average of each unsigned 16-bit lane: (a + b + 1) >> 1. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_avg_epu16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* vrhaddq_u16 is a rounding halving add, exactly matching pavgw. */
  r_.neon_u16 = vrhaddq_u16(b_.neon_u16, a_.neon_u16);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  r_.altivec_u16 = vec_avg(a_.altivec_u16, b_.altivec_u16);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_)
  /* Widen to 32 bits so (a + b + 1) cannot overflow, then narrow back. */
  uint32_t wa SIMDE_VECTOR(32);
  uint32_t wb SIMDE_VECTOR(32);
  uint32_t wr SIMDE_VECTOR(32);
  SIMDE_CONVERT_VECTOR_(wa, a_.u16);
  SIMDE_CONVERT_VECTOR_(wb, b_.u16);
  wr = (wa + wb + 1) >> 1;
  SIMDE_CONVERT_VECTOR_(r_.u16, wr);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
    /* Operands promote to int, so the intermediate sum cannot overflow. */
    r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1;
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_avg_epu16(a, b) simde_mm_avg_epu16(a, b)
#endif
918
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_setzero_si128 (void) {
  /* Return a 128-bit vector with every bit cleared. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_setzero_si128();
#else
  simde__m128i_private r_;

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_i32 = vdupq_n_s32(0);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  r_.altivec_i32 = vec_splats(HEDLEY_STATIC_CAST(signed int, 0));
#elif defined(SIMDE_VECTOR_SUBSCRIPT)
  /* GCC/Clang vector-extension literal; __extension__ silences -Wpedantic. */
  r_.i32 = __extension__ (__typeof__(r_.i32)) { 0, 0, 0, 0 };
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
    r_.i32f[i] = 0;
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_setzero_si128() (simde_mm_setzero_si128())
#endif
946
947 SIMDE_FUNCTION_ATTRIBUTES
948 simde__m128i
simde_mm_bslli_si128(simde__m128i a,const int imm8)949 simde_mm_bslli_si128 (simde__m128i a, const int imm8)
950 SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
951 simde__m128i_private
952 r_,
953 a_ = simde__m128i_to_private(a);
954
955 if (HEDLEY_UNLIKELY((imm8 & ~15))) {
956 return simde_mm_setzero_si128();
957 }
958
959 #if defined(SIMDE_HAVE_INT128_) && defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) && 0
960 r_.u128[0] = a_.u128[0] << s;
961 #else
962 r_ = simde__m128i_to_private(simde_mm_setzero_si128());
963 for (int i = imm8 ; i < HEDLEY_STATIC_CAST(int, sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
964 r_.i8[i] = a_.i8[i - imm8];
965 }
966 #endif
967
968 return simde__m128i_from_private(r_);
969 }
970 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
971 # define simde_mm_bslli_si128(a, imm8) _mm_slli_si128(a, imm8)
972 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__)
973 # define simde_mm_bslli_si128(a, imm8) \
974 simde__m128i_from_neon_i8(((imm8) <= 0) ? simde__m128i_to_neon_i8(a) : (((imm8) > 15) ? (vdupq_n_s8(0)) : (vextq_s8(vdupq_n_s8(0), simde__m128i_to_neon_i8(a), 16 - (imm8)))))
975 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
976 #define simde_mm_bslli_si128(a, imm8) \
977 (__extension__ ({ \
978 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) simde_mm_bslli_si128_z_ = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \
979 simde__m128i_from_altivec_u8((imm8 < 16) ? vec_sld(simde__m128i_to_altivec_u8(a), simde_mm_bslli_si128_z_, imm8 & 15) : simde_mm_bslli_si128_z_); \
980 }))
981 #elif defined(SIMDE_SHUFFLE_VECTOR_)
982 #define simde_mm_bslli_si128(a, imm8) (__extension__ ({ \
983 const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
984 const simde__m128i_private simde__tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
985 simde__m128i_private simde__tmp_r_; \
986 if (HEDLEY_UNLIKELY(imm8 > 15)) { \
987 simde__tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
988 } else { \
989 simde__tmp_r_.i8 = \
990 SIMDE_SHUFFLE_VECTOR_(8, 16, \
991 simde__tmp_z_.i8, \
992 (simde__tmp_a_).i8, \
993 HEDLEY_STATIC_CAST(int8_t, (16 - imm8) & 31), \
994 HEDLEY_STATIC_CAST(int8_t, (17 - imm8) & 31), \
995 HEDLEY_STATIC_CAST(int8_t, (18 - imm8) & 31), \
996 HEDLEY_STATIC_CAST(int8_t, (19 - imm8) & 31), \
997 HEDLEY_STATIC_CAST(int8_t, (20 - imm8) & 31), \
998 HEDLEY_STATIC_CAST(int8_t, (21 - imm8) & 31), \
999 HEDLEY_STATIC_CAST(int8_t, (22 - imm8) & 31), \
1000 HEDLEY_STATIC_CAST(int8_t, (23 - imm8) & 31), \
1001 HEDLEY_STATIC_CAST(int8_t, (24 - imm8) & 31), \
1002 HEDLEY_STATIC_CAST(int8_t, (25 - imm8) & 31), \
1003 HEDLEY_STATIC_CAST(int8_t, (26 - imm8) & 31), \
1004 HEDLEY_STATIC_CAST(int8_t, (27 - imm8) & 31), \
1005 HEDLEY_STATIC_CAST(int8_t, (28 - imm8) & 31), \
1006 HEDLEY_STATIC_CAST(int8_t, (29 - imm8) & 31), \
1007 HEDLEY_STATIC_CAST(int8_t, (30 - imm8) & 31), \
1008 HEDLEY_STATIC_CAST(int8_t, (31 - imm8) & 31)); \
1009 } \
1010 simde__m128i_from_private(simde__tmp_r_); }))
1011 #endif
1012 #define simde_mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
1013 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1014 # define _mm_bslli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
1015 # define _mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
1016 #endif
1017
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_bsrli_si128 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  /* Shift the whole 128-bit value right by imm8 *bytes*; bytes shifted in
   * from the top are zero (matching psrldq).  This is the portable
   * fallback; the macros below override it on supported platforms. */
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
    /* Source index beyond the vector means the byte is zero-filled. */
    const int e = HEDLEY_STATIC_CAST(int, i) + imm8;
    r_.i8[i] = (e < 16) ? a_.i8[e] : 0;
  }

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
# define simde_mm_bsrli_si128(a, imm8) _mm_srli_si128(a, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__)
# define simde_mm_bsrli_si128(a, imm8) \
  simde__m128i_from_neon_i8(((imm8 < 0) || (imm8 > 15)) ? vdupq_n_s8(0) : (vextq_s8(simde__m128i_to_private(a).neon_i8, vdupq_n_s8(0), ((imm8 & 15) != 0) ? imm8 : (imm8 & 15))))
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
/* NOTE(review): the zero-vector temporary below reuses the bslli name
 * (simde_mm_bslli_si128_z_); harmless since macro locals are scoped to the
 * statement expression, but worth confirming against upstream. */
#define simde_mm_bsrli_si128(a, imm8) \
  (__extension__ ({ \
    SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) simde_mm_bslli_si128_z_ = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \
    simde__m128i_from_altivec_u8((imm8 < 16) ? vec_sro(simde__m128i_to_altivec_u8(a), vec_splats(HEDLEY_STATIC_CAST(unsigned char, imm8 * 8))) : simde_mm_bslli_si128_z_); \
  }))
#elif defined(SIMDE_SHUFFLE_VECTOR_)
#define simde_mm_bsrli_si128(a, imm8) (__extension__ ({ \
  const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
  const simde__m128i_private simde__tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
  simde__m128i_private simde__tmp_r_ = simde__m128i_to_private(a); \
  if (HEDLEY_UNLIKELY(imm8 > 15)) { \
    simde__tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
  } else { \
    simde__tmp_r_.i8 = \
      SIMDE_SHUFFLE_VECTOR_(8, 16, \
        simde__tmp_z_.i8, \
        (simde__tmp_a_).i8, \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 16) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 17) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 18) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 19) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 20) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 21) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 22) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 23) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 24) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 25) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 26) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 27) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 28) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 29) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 30) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 31) & 31)); \
  } \
  simde__m128i_from_private(simde__tmp_r_); }))
#endif
#define simde_mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_bsrli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
# define _mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
#endif
1081
1082 SIMDE_FUNCTION_ATTRIBUTES
1083 void
simde_mm_clflush(void const * p)1084 simde_mm_clflush (void const* p) {
1085 #if defined(SIMDE_X86_SSE2_NATIVE)
1086 _mm_clflush(p);
1087 #else
1088 (void) p;
1089 #endif
1090 }
1091 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1092 # define _mm_clflush(a, b) simde_mm_clflush()
1093 #endif
1094
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_comieq_sd (simde__m128d a, simde__m128d b) {
  /* Compare the low double of a and b for equality; returns 1 or 0.
   * In the portable fallback a NaN operand yields 0 (C `==` is false). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_comieq_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  /* vceqq_f64 produces an all-ones/all-zeros lane mask; !! maps it to 1/0. */
  return !!vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0);
#else
  return a_.f64[0] == b_.f64[0];
#endif
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_comieq_sd(a, b) simde_mm_comieq_sd(a, b)
#endif
1114
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_comige_sd (simde__m128d a, simde__m128d b) {
  /* Compare the low double: returns 1 if a >= b, else 0 (0 for NaN in the
   * portable fallback, since C `>=` is false for NaN). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_comige_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  return !!vgetq_lane_u64(vcgeq_f64(a_.neon_f64, b_.neon_f64), 0);
#else
  return a_.f64[0] >= b_.f64[0];
#endif
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_comige_sd(a, b) simde_mm_comige_sd(a, b)
#endif
1134
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_comigt_sd (simde__m128d a, simde__m128d b) {
  /* Compare the low double: returns 1 if a > b, else 0 (0 for NaN in the
   * portable fallback). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_comigt_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  return !!vgetq_lane_u64(vcgtq_f64(a_.neon_f64, b_.neon_f64), 0);
#else
  return a_.f64[0] > b_.f64[0];
#endif
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_comigt_sd(a, b) simde_mm_comigt_sd(a, b)
#endif
1154
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_comile_sd (simde__m128d a, simde__m128d b) {
  /* Compare the low double: returns 1 if a <= b, else 0 (0 for NaN in the
   * portable fallback). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_comile_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  return !!vgetq_lane_u64(vcleq_f64(a_.neon_f64, b_.neon_f64), 0);
#else
  return a_.f64[0] <= b_.f64[0];
#endif
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_comile_sd(a, b) simde_mm_comile_sd(a, b)
#endif
1174
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_comilt_sd (simde__m128d a, simde__m128d b) {
  /* Compare the low double: returns 1 if a < b, else 0 (0 for NaN in the
   * portable fallback). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_comilt_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  return !!vgetq_lane_u64(vcltq_f64(a_.neon_f64, b_.neon_f64), 0);
#else
  return a_.f64[0] < b_.f64[0];
#endif
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_comilt_sd(a, b) simde_mm_comilt_sd(a, b)
#endif
1194
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_comineq_sd (simde__m128d a, simde__m128d b) {
  /* Compare the low double: returns 1 if a != b, else 0.  A NaN operand
   * yields 1 in the portable fallback (C `!=` is true for NaN). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_comineq_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  /* Single `!` (not `!!`): invert the equality mask to get "not equal". */
  return !vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0);
#else
  return a_.f64[0] != b_.f64[0];
#endif
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_comineq_sd(a, b) simde_mm_comineq_sd(a, b)
#endif
1214
1215 SIMDE_FUNCTION_ATTRIBUTES
1216 simde__m128
simde_mm_castpd_ps(simde__m128d a)1217 simde_mm_castpd_ps (simde__m128d a) {
1218 #if defined(SIMDE_X86_SSE2_NATIVE)
1219 return _mm_castpd_ps(a);
1220 #else
1221 simde__m128 r;
1222 simde_memcpy(&r, &a, sizeof(a));
1223 return r;
1224 #endif
1225 }
1226 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1227 # define _mm_castpd_ps(a) simde_mm_castpd_ps(a)
1228 #endif
1229
1230 SIMDE_FUNCTION_ATTRIBUTES
1231 simde__m128i
simde_mm_castpd_si128(simde__m128d a)1232 simde_mm_castpd_si128 (simde__m128d a) {
1233 #if defined(SIMDE_X86_SSE2_NATIVE)
1234 return _mm_castpd_si128(a);
1235 #else
1236 simde__m128i r;
1237 simde_memcpy(&r, &a, sizeof(a));
1238 return r;
1239 #endif
1240 }
1241 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1242 # define _mm_castpd_si128(a) simde_mm_castpd_si128(a)
1243 #endif
1244
1245 SIMDE_FUNCTION_ATTRIBUTES
1246 simde__m128d
simde_mm_castps_pd(simde__m128 a)1247 simde_mm_castps_pd (simde__m128 a) {
1248 #if defined(SIMDE_X86_SSE2_NATIVE)
1249 return _mm_castps_pd(a);
1250 #else
1251 simde__m128d r;
1252 simde_memcpy(&r, &a, sizeof(a));
1253 return r;
1254 #endif
1255 }
1256 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1257 # define _mm_castps_pd(a) simde_mm_castps_pd(a)
1258 #endif
1259
1260 SIMDE_FUNCTION_ATTRIBUTES
1261 simde__m128i
simde_mm_castps_si128(simde__m128 a)1262 simde_mm_castps_si128 (simde__m128 a) {
1263 #if defined(SIMDE_X86_SSE2_NATIVE)
1264 return _mm_castps_si128(a);
1265 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1266 return simde__m128i_from_neon_i32(simde__m128_to_private(a).neon_i32);
1267 #else
1268 simde__m128i r;
1269 simde_memcpy(&r, &a, sizeof(a));
1270 return r;
1271 #endif
1272 }
1273 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1274 # define _mm_castps_si128(a) simde_mm_castps_si128(a)
1275 #endif
1276
1277 SIMDE_FUNCTION_ATTRIBUTES
1278 simde__m128d
simde_mm_castsi128_pd(simde__m128i a)1279 simde_mm_castsi128_pd (simde__m128i a) {
1280 #if defined(SIMDE_X86_SSE2_NATIVE)
1281 return _mm_castsi128_pd(a);
1282 #else
1283 simde__m128d r;
1284 simde_memcpy(&r, &a, sizeof(a));
1285 return r;
1286 #endif
1287 }
1288 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1289 # define _mm_castsi128_pd(a) simde_mm_castsi128_pd(a)
1290 #endif
1291
1292 SIMDE_FUNCTION_ATTRIBUTES
1293 simde__m128
simde_mm_castsi128_ps(simde__m128i a)1294 simde_mm_castsi128_ps (simde__m128i a) {
1295 #if defined(SIMDE_X86_SSE2_NATIVE)
1296 return _mm_castsi128_ps(a);
1297 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1298 return HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), a);
1299 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1300 return simde__m128_from_neon_i32(simde__m128i_to_private(a).neon_i32);
1301 #else
1302 simde__m128 r;
1303 simde_memcpy(&r, &a, sizeof(a));
1304 return r;
1305 #endif
1306 }
1307 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1308 # define _mm_castsi128_ps(a) simde_mm_castsi128_ps(a)
1309 #endif
1310
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmpeq_epi8 (simde__m128i a, simde__m128i b) {
  /* Lane-wise 8-bit equality: each lane becomes 0xFF when equal, 0x00
   * otherwise. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmpeq_epi8(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_u8 = vceqq_s8(b_.neon_i8, a_.neon_i8);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  r_.wasm_v128 = wasm_i8x16_eq(a_.wasm_v128, b_.wasm_v128);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmpeq(a_.altivec_i8, b_.altivec_i8));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  /* Vector-extension comparisons already produce 0/-1 per lane. */
  r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 == b_.i8));
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
    r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmpeq_epi8(a, b) simde_mm_cmpeq_epi8(a, b)
#endif
1343
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmpeq_epi16 (simde__m128i a, simde__m128i b) {
  /* Lane-wise 16-bit equality: each lane becomes 0xFFFF when equal,
   * 0x0000 otherwise. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmpeq_epi16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_u16 = vceqq_s16(b_.neon_i16, a_.neon_i16);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  r_.wasm_v128 = wasm_i16x8_eq(a_.wasm_v128, b_.wasm_v128);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmpeq(a_.altivec_i16, b_.altivec_i16));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  /* Vector-extension comparisons already produce 0/-1 per lane. */
  r_.i16 = (a_.i16 == b_.i16);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
    r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmpeq_epi16(a, b) simde_mm_cmpeq_epi16(a, b)
#endif
1376
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmpeq_epi32 (simde__m128i a, simde__m128i b) {
  /* Lane-wise 32-bit equality: each lane becomes all-ones when equal,
   * all-zeros otherwise. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmpeq_epi32(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_u32 = vceqq_s32(b_.neon_i32, a_.neon_i32);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  r_.wasm_v128 = wasm_i32x4_eq(a_.wasm_v128, b_.wasm_v128);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmpeq(a_.altivec_i32, b_.altivec_i32));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  /* Vector-extension comparisons already produce 0/-1 per lane. */
  r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), a_.i32 == b_.i32);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
    r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmpeq_epi32(a, b) simde_mm_cmpeq_epi32(a, b)
#endif
1409
1410 SIMDE_FUNCTION_ATTRIBUTES
1411 simde__m128d
simde_mm_cmpeq_pd(simde__m128d a,simde__m128d b)1412 simde_mm_cmpeq_pd (simde__m128d a, simde__m128d b) {
1413 #if defined(SIMDE_X86_SSE2_NATIVE)
1414 return _mm_cmpeq_pd(a, b);
1415 #else
1416 simde__m128d_private
1417 r_,
1418 a_ = simde__m128d_to_private(a),
1419 b_ = simde__m128d_to_private(b);
1420
1421 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1422 r_.neon_u64 = vceqq_s64(b_.neon_i64, a_.neon_i64);
1423 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1424 r_.wasm_v128 = wasm_f64x2_eq(a_.wasm_v128, b_.wasm_v128);
1425 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
1426 r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpeq(a_.altivec_f64, b_.altivec_f64));
1427 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1428 r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 == b_.f64));
1429 #else
1430 SIMDE_VECTORIZE
1431 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1432 r_.u64[i] = (a_.f64[i] == b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1433 }
1434 #endif
1435
1436 return simde__m128d_from_private(r_);
1437 #endif
1438 }
1439 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1440 # define _mm_cmpeq_pd(a, b) simde_mm_cmpeq_pd(a, b)
1441 #endif
1442
1443 SIMDE_FUNCTION_ATTRIBUTES
1444 simde__m128d
simde_mm_cmpeq_sd(simde__m128d a,simde__m128d b)1445 simde_mm_cmpeq_sd (simde__m128d a, simde__m128d b) {
1446 #if defined(SIMDE_X86_SSE2_NATIVE)
1447 return _mm_cmpeq_sd(a, b);
1448 #elif defined(SIMDE_ASSUME_VECTORIZATION)
1449 return simde_mm_move_sd(a, simde_mm_cmpeq_pd(a, b));
1450 #else
1451 simde__m128d_private
1452 r_,
1453 a_ = simde__m128d_to_private(a),
1454 b_ = simde__m128d_to_private(b);
1455
1456 r_.u64[0] = (a_.u64[0] == b_.u64[0]) ? ~UINT64_C(0) : 0;
1457 r_.u64[1] = a_.u64[1];
1458
1459 return simde__m128d_from_private(r_);
1460 #endif
1461 }
1462 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1463 # define _mm_cmpeq_sd(a, b) simde_mm_cmpeq_sd(a, b)
1464 #endif
1465
1466 SIMDE_FUNCTION_ATTRIBUTES
1467 simde__m128d
simde_mm_cmpneq_pd(simde__m128d a,simde__m128d b)1468 simde_mm_cmpneq_pd (simde__m128d a, simde__m128d b) {
1469 #if defined(SIMDE_X86_SSE2_NATIVE)
1470 return _mm_cmpneq_pd(a, b);
1471 #else
1472 simde__m128d_private
1473 r_,
1474 a_ = simde__m128d_to_private(a),
1475 b_ = simde__m128d_to_private(b);
1476
1477 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1478 r_.neon_u16 = vmvnq_u16(vceqq_s16(b_.neon_i16, a_.neon_i16));
1479 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1480 r_.wasm_v128 = wasm_f64x2_ne(a_.wasm_v128, b_.wasm_v128);
1481 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1482 r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 != b_.f64));
1483 #else
1484 SIMDE_VECTORIZE
1485 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1486 r_.u64[i] = (a_.f64[i] != b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1487 }
1488 #endif
1489
1490 return simde__m128d_from_private(r_);
1491 #endif
1492 }
1493 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1494 # define _mm_cmpneq_pd(a, b) simde_mm_cmpneq_pd(a, b)
1495 #endif
1496
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpneq_sd (simde__m128d a, simde__m128d b) {
  /* IEEE "not equal" on the low double only: lane 0 becomes all-ones when
   * a[0] != b[0] (true for NaN), lane 1 is copied from a unchanged. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmpneq_sd(a, b);
#elif defined(SIMDE_ASSUME_VECTORIZATION)
  return simde_mm_move_sd(a, simde_mm_cmpneq_pd(a, b));
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  r_.u64[0] = (a_.f64[0] != b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
  r_.u64[1] = a_.u64[1];

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmpneq_sd(a, b) simde_mm_cmpneq_sd(a, b)
#endif
1520
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmplt_epi8 (simde__m128i a, simde__m128i b) {
  /* Lane-wise signed 8-bit less-than: each lane becomes 0xFF when
   * a < b, 0x00 otherwise. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmplt_epi8(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_u8 = vcltq_s8(a_.neon_i8, b_.neon_i8);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char),vec_cmplt(a_.altivec_i8, b_.altivec_i8));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  r_.wasm_v128 = wasm_i8x16_lt(a_.wasm_v128, b_.wasm_v128);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  /* Vector-extension comparisons already produce 0/-1 per lane. */
  r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 < b_.i8));
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
    r_.i8[i] = (a_.i8[i] < b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmplt_epi8(a, b) simde_mm_cmplt_epi8(a, b)
#endif
1553
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmplt_epi16 (simde__m128i a, simde__m128i b) {
  /* Lane-wise signed 16-bit less-than: each lane becomes 0xFFFF when
   * a < b, 0x0000 otherwise. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmplt_epi16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_u16 = vcltq_s16(a_.neon_i16, b_.neon_i16);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmplt(a_.altivec_i16, b_.altivec_i16));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  r_.wasm_v128 = wasm_i16x8_lt(a_.wasm_v128, b_.wasm_v128);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  /* Vector-extension comparisons already produce 0/-1 per lane. */
  r_.i16 = HEDLEY_STATIC_CAST(__typeof__(r_.i16), (a_.i16 < b_.i16));
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
    r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmplt_epi16(a, b) simde_mm_cmplt_epi16(a, b)
#endif
1586
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmplt_epi32 (simde__m128i a, simde__m128i b) {
  /* Lane-wise signed 32-bit less-than: each lane becomes all-ones when
   * a < b, all-zeros otherwise. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmplt_epi32(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_u32 = vcltq_s32(a_.neon_i32, b_.neon_i32);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmplt(a_.altivec_i32, b_.altivec_i32));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  r_.wasm_v128 = wasm_i32x4_lt(a_.wasm_v128, b_.wasm_v128);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  /* Vector-extension comparisons already produce 0/-1 per lane. */
  r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.i32 < b_.i32));
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
    r_.i32[i] = (a_.i32[i] < b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmplt_epi32(a, b) simde_mm_cmplt_epi32(a, b)
#endif
1619
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmplt_pd (simde__m128d a, simde__m128d b) {
  /* Lane-wise IEEE less-than on doubles: each 64-bit lane becomes
   * all-ones when a < b (false for any NaN operand), all-zeros otherwise. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmplt_pd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  /* Vector-extension comparisons already produce 0/-1 per lane. */
  r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 < b_.f64));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  r_.wasm_v128 = wasm_f64x2_lt(a_.wasm_v128, b_.wasm_v128);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
    r_.u64[i] = (a_.f64[i] < b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
  }
#endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmplt_pd(a, b) simde_mm_cmplt_pd(a, b)
#endif
1648
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmplt_sd (simde__m128d a, simde__m128d b) {
  /* IEEE less-than on the low double only: lane 0 becomes all-ones when
   * a[0] < b[0] (false for NaN), lane 1 is copied from a unchanged. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmplt_sd(a, b);
#elif defined(SIMDE_ASSUME_VECTORIZATION)
  return simde_mm_move_sd(a, simde_mm_cmplt_pd(a, b));
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  r_.u64[0] = (a_.f64[0] < b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
  r_.u64[1] = a_.u64[1];

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmplt_sd(a, b) simde_mm_cmplt_sd(a, b)
#endif
1671
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmple_pd (simde__m128d a, simde__m128d b) {
  /* Lane-wise IEEE less-than-or-equal on doubles: each 64-bit lane becomes
   * all-ones when a <= b (false for any NaN operand), all-zeros otherwise. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmple_pd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  /* Vector-extension comparisons already produce 0/-1 per lane. */
  r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 <= b_.f64));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  r_.wasm_v128 = wasm_f64x2_le(a_.wasm_v128, b_.wasm_v128);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmple(a_.altivec_f64, b_.altivec_f64));
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
    r_.u64[i] = (a_.f64[i] <= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
  }
#endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmple_pd(a, b) simde_mm_cmple_pd(a, b)
#endif
1702
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmple_sd (simde__m128d a, simde__m128d b) {
  /* IEEE less-than-or-equal on the low double only: lane 0 becomes
   * all-ones when a[0] <= b[0] (false for NaN), lane 1 is copied from a
   * unchanged. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmple_sd(a, b);
#elif defined(SIMDE_ASSUME_VECTORIZATION)
  return simde_mm_move_sd(a, simde_mm_cmple_pd(a, b));
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  r_.u64[0] = (a_.f64[0] <= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
  r_.u64[1] = a_.u64[1];

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmple_sd(a, b) simde_mm_cmple_sd(a, b)
#endif
1725
/* Compare packed signed 8-bit integers for a > b.  Each byte of the
 * result is 0xFF when true, 0x00 otherwise. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmpgt_epi8 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmpgt_epi8(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* vcgtq_s8 yields an unsigned 0xFF/0x00 mask. */
    r_.neon_u8 = vcgtq_s8(a_.neon_i8, b_.neon_i8);
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_i8x16_gt(a_.wasm_v128, b_.wasm_v128);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmpgt(a_.altivec_i8, b_.altivec_i8));
  #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 > b_.i8));
  #else
    /* Portable scalar fallback. */
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
      r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmpgt_epi8(a, b) simde_mm_cmpgt_epi8(a, b)
#endif
1758
/* Compare packed signed 16-bit integers for a > b.  Each lane of the
 * result is all-ones when true, zero otherwise. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmpgt_epi16 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmpgt_epi16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_u16 = vcgtq_s16(a_.neon_i16, b_.neon_i16);
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_i16x8_gt(a_.wasm_v128, b_.wasm_v128);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmpgt(a_.altivec_i16, b_.altivec_i16));
  #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.i16 = HEDLEY_STATIC_CAST(__typeof__(r_.i16), (a_.i16 > b_.i16));
  #else
    /* Portable scalar fallback. */
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
      r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmpgt_epi16(a, b) simde_mm_cmpgt_epi16(a, b)
#endif
1791
/* Compare packed signed 32-bit integers for a > b.  Each lane of the
 * result is all-ones when true, zero otherwise. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmpgt_epi32 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmpgt_epi32(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_u32 = vcgtq_s32(a_.neon_i32, b_.neon_i32);
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_i32x4_gt(a_.wasm_v128, b_.wasm_v128);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmpgt(a_.altivec_i32, b_.altivec_i32));
  #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.i32 > b_.i32));
  #else
    /* Portable scalar fallback. */
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmpgt_epi32(a, b) simde_mm_cmpgt_epi32(a, b)
#endif
1824
1825 SIMDE_FUNCTION_ATTRIBUTES
1826 simde__m128d
simde_mm_cmpgt_pd(simde__m128d a,simde__m128d b)1827 simde_mm_cmpgt_pd (simde__m128d a, simde__m128d b) {
1828 #if defined(SIMDE_X86_SSE2_NATIVE)
1829 return _mm_cmpgt_pd(a, b);
1830 #else
1831 simde__m128d_private
1832 r_,
1833 a_ = simde__m128d_to_private(a),
1834 b_ = simde__m128d_to_private(b);
1835
1836 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1837 r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 > b_.f64));
1838 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1839 r_.wasm_v128 = wasm_f64x2_gt(a_.wasm_v128, b_.wasm_v128);
1840 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1841 r_.altivec_f64 = HEDLEY_STATIC_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpgt(a_.altivec_f64, b_.altivec_f64));
1842 #else
1843 SIMDE_VECTORIZE
1844 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1845 r_.u64[i] = (a_.f64[i] > b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1846 }
1847 #endif
1848
1849 return simde__m128d_from_private(r_);
1850 #endif
1851 }
1852 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1853 # define _mm_cmpgt_pd(a, b) simde_mm_cmpgt_pd(a, b)
1854 #endif
1855
/* Compare the low double lanes for a[0] > b[0]: lane 0 of the result
 * is an all-ones/all-zeros mask, lane 1 is copied from a. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpgt_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
  /* PGI is excluded here; it is routed to the portable paths instead. */
  return _mm_cmpgt_sd(a, b);
#elif defined(SIMDE_ASSUME_VECTORIZATION)
  return simde_mm_move_sd(a, simde_mm_cmpgt_pd(a, b));
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  r_.u64[0] = (a_.f64[0] > b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
  r_.u64[1] = a_.u64[1]; /* upper lane passes through from a */

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmpgt_sd(a, b) simde_mm_cmpgt_sd(a, b)
#endif
1878
1879 SIMDE_FUNCTION_ATTRIBUTES
1880 simde__m128d
simde_mm_cmpge_pd(simde__m128d a,simde__m128d b)1881 simde_mm_cmpge_pd (simde__m128d a, simde__m128d b) {
1882 #if defined(SIMDE_X86_SSE2_NATIVE)
1883 return _mm_cmpge_pd(a, b);
1884 #else
1885 simde__m128d_private
1886 r_,
1887 a_ = simde__m128d_to_private(a),
1888 b_ = simde__m128d_to_private(b);
1889
1890 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1891 r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 >= b_.f64));
1892 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1893 r_.wasm_v128 = wasm_f64x2_ge(a_.wasm_v128, b_.wasm_v128);
1894 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1895 r_.altivec_f64 = HEDLEY_STATIC_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpge(a_.altivec_f64, b_.altivec_f64));
1896 #else
1897 SIMDE_VECTORIZE
1898 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1899 r_.u64[i] = (a_.f64[i] >= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1900 }
1901 #endif
1902
1903 return simde__m128d_from_private(r_);
1904 #endif
1905 }
1906 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1907 # define _mm_cmpge_pd(a, b) simde_mm_cmpge_pd(a, b)
1908 #endif
1909
/* Compare the low double lanes for a[0] >= b[0]: lane 0 of the result
 * is an all-ones/all-zeros mask, lane 1 is copied from a. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpge_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
  /* PGI is excluded here; it is routed to the portable paths instead. */
  return _mm_cmpge_sd(a, b);
#elif defined(SIMDE_ASSUME_VECTORIZATION)
  return simde_mm_move_sd(a, simde_mm_cmpge_pd(a, b));
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  r_.u64[0] = (a_.f64[0] >= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
  r_.u64[1] = a_.u64[1]; /* upper lane passes through from a */

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmpge_sd(a, b) simde_mm_cmpge_sd(a, b)
#endif
1932
/* Packed "not greater-than-or-equal" compare.
 * NOTE(review): the fallback uses a < b, which differs from the native
 * CMPNGEPD when an operand is NaN (NGE is true for unordered inputs,
 * `<` is false) -- confirm whether NaN inputs matter to callers. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpnge_pd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmpnge_pd(a, b);
#else
  return simde_mm_cmplt_pd(a, b);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmpnge_pd(a, b) simde_mm_cmpnge_pd(a, b)
#endif
1945
/* Scalar (low-lane) "not greater-than-or-equal" compare; upper lane
 * handling is delegated to simde_mm_cmplt_sd.
 * NOTE(review): as with cmpnge_pd, the `<` fallback differs from the
 * native unordered semantics when an operand is NaN. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpnge_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
  return _mm_cmpnge_sd(a, b);
#else
  return simde_mm_cmplt_sd(a, b);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmpnge_sd(a, b) simde_mm_cmpnge_sd(a, b)
#endif
1958
/* Packed "not less-than" compare, implemented via the ordered inverse.
 * NOTE(review): `>=` is false for NaN, while the native NLT compare is
 * true for unordered inputs -- confirm if NaN inputs matter. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpnlt_pd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmpnlt_pd(a, b);
#else
  return simde_mm_cmpge_pd(a, b);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmpnlt_pd(a, b) simde_mm_cmpnlt_pd(a, b)
#endif
1971
/* Scalar (low-lane) "not less-than" compare; delegates to
 * simde_mm_cmpge_sd (same NaN caveat as cmpnlt_pd). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpnlt_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmpnlt_sd(a, b);
#else
  return simde_mm_cmpge_sd(a, b);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmpnlt_sd(a, b) simde_mm_cmpnlt_sd(a, b)
#endif
1984
/* Packed "not less-than-or-equal" compare, implemented via the ordered
 * inverse (same NaN caveat as cmpnlt_pd: `>` is false for NaN). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpnle_pd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmpnle_pd(a, b);
#else
  return simde_mm_cmpgt_pd(a, b);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmpnle_pd(a, b) simde_mm_cmpnle_pd(a, b)
#endif
1997
/* Scalar (low-lane) "not less-than-or-equal" compare; delegates to
 * simde_mm_cmpgt_sd (same NaN caveat as cmpnle_pd). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpnle_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmpnle_sd(a, b);
#else
  return simde_mm_cmpgt_sd(a, b);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmpnle_sd(a, b) simde_mm_cmpnle_sd(a, b)
#endif
2010
/* "Ordered" compare: each result lane is all-ones when neither input
 * lane is NaN, all-zeros otherwise. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpord_pd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmpord_pd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  #if defined(simde_math_isnan)
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
      r_.u64[i] = (!simde_math_isnan(a_.f64[i]) && !simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0);
    }
  #else
    /* No isnan available on this platform: no portable implementation. */
    HEDLEY_UNREACHABLE();
  #endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmpord_pd(a, b) simde_mm_cmpord_pd(a, b)
#endif
2037
/* Extract the low double lane of a as a scalar. */
SIMDE_FUNCTION_ATTRIBUTES
simde_float64
simde_mm_cvtsd_f64 (simde__m128d a) {
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
  return _mm_cvtsd_f64(a);
#else
  simde__m128d_private a_ = simde__m128d_to_private(a);
  return a_.f64[0];
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtsd_f64(a) simde_mm_cvtsd_f64(a)
#endif
2051
/* Scalar "ordered" compare: lane 0 is all-ones when neither low lane
 * is NaN, all-zeros otherwise; lane 1 is copied from a. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpord_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmpord_sd(a, b);
#elif defined(SIMDE_ASSUME_VECTORIZATION)
  return simde_mm_move_sd(a, simde_mm_cmpord_pd(a, b));
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  #if defined(simde_math_isnan)
    r_.u64[0] = (!simde_math_isnan(a_.f64[0]) && !simde_math_isnan(b_.f64[0])) ? ~UINT64_C(0) : UINT64_C(0);
    r_.u64[1] = a_.u64[1]; /* upper lane passes through from a */
  #else
    /* No isnan available on this platform: no portable implementation. */
    HEDLEY_UNREACHABLE();
  #endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmpord_sd(a, b) simde_mm_cmpord_sd(a, b)
#endif
2078
/* "Unordered" compare: each result lane is all-ones when either input
 * lane is NaN, all-zeros otherwise. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpunord_pd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmpunord_pd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  #if defined(simde_math_isnan)
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
      r_.u64[i] = (simde_math_isnan(a_.f64[i]) || simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0);
    }
  #else
    /* No isnan available on this platform: no portable implementation. */
    HEDLEY_UNREACHABLE();
  #endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmpunord_pd(a, b) simde_mm_cmpunord_pd(a, b)
#endif
2105
/* Scalar "unordered" compare: lane 0 is all-ones when either low lane
 * is NaN, all-zeros otherwise; lane 1 is copied from a. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpunord_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmpunord_sd(a, b);
#elif defined(SIMDE_ASSUME_VECTORIZATION)
  return simde_mm_move_sd(a, simde_mm_cmpunord_pd(a, b));
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  #if defined(simde_math_isnan)
    r_.u64[0] = (simde_math_isnan(a_.f64[0]) || simde_math_isnan(b_.f64[0])) ? ~UINT64_C(0) : UINT64_C(0);
    r_.u64[1] = a_.u64[1]; /* upper lane passes through from a */

  #else
    /* No isnan available on this platform: no portable implementation. */
    HEDLEY_UNREACHABLE();
  #endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cmpunord_sd(a, b) simde_mm_cmpunord_sd(a, b)
#endif
2133
/* Convert the two low 32-bit integers of a to packed doubles
 * (every int32 is exactly representable as a double). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cvtepi32_pd (simde__m128i a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cvtepi32_pd(a);
#else
  simde__m128d_private r_;
  simde__m128i_private a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_CONVERT_VECTOR_)
    /* Only the low 64-bit half (two int32 lanes) participates. */
    SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].i32);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
      r_.f64[i] = (simde_float64) a_.i32[i];
    }
  #endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtepi32_pd(a) simde_mm_cvtepi32_pd(a)
#endif
2158
/* Convert four packed 32-bit integers to packed single-precision
 * floats. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_cvtepi32_ps (simde__m128i a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cvtepi32_ps(a);
#else
  simde__m128_private r_;
  simde__m128i_private a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_f32 = vcvtq_f32_s32(a_.neon_i32);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    /* vec_ctf with a 0 scale factor is a plain int->float conversion;
     * clang warns about it under -Wc11-extensions, so suppress that. */
    HEDLEY_DIAGNOSTIC_PUSH
    #if HEDLEY_HAS_WARNING("-Wc11-extensions")
      #pragma clang diagnostic ignored "-Wc11-extensions"
    #endif
    r_.altivec_f32 = vec_ctf(a_.altivec_i32, 0);
    HEDLEY_DIAGNOSTIC_POP
  #elif defined(SIMDE_CONVERT_VECTOR_)
    SIMDE_CONVERT_VECTOR_(r_.f32, a_.i32);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
      r_.f32[i] = (simde_float32) a_.i32[i];
    }
  #endif

  return simde__m128_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtepi32_ps(a) simde_mm_cvtepi32_ps(a)
#endif
2192
/* Convert two packed doubles to two packed int32 lanes (rounded via
 * nearbyint, i.e. the current rounding mode); the upper 64 bits of the
 * result are zeroed. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvtpd_epi32 (simde__m128d a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cvtpd_epi32(a);
#else
  simde__m128i_private r_;
  simde__m128d_private a_ = simde__m128d_to_private(a);

  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
    r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_nearbyint(a_.f64[i]));
  }
  /* Zero the upper two lanes, matching the native intrinsic. */
  simde_memset(&(r_.m64_private[1]), 0, sizeof(r_.m64_private[1]));

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtpd_epi32(a) simde_mm_cvtpd_epi32(a)
#endif
2214
/* Convert two packed doubles to a 64-bit MMX vector of two int32
 * lanes (rounded via nearbyint). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m64
simde_mm_cvtpd_pi32 (simde__m128d a) {
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  return _mm_cvtpd_pi32(a);
#else
  simde__m64_private r_;
  simde__m128d_private a_ = simde__m128d_to_private(a);

  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
    r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, simde_math_nearbyint(a_.f64[i]));
  }

  return simde__m64_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtpd_pi32(a) simde_mm_cvtpd_pi32(a)
#endif
2235
/* Narrow two packed doubles to two packed floats in the low half of
 * the result; the upper two float lanes are zeroed. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_cvtpd_ps (simde__m128d a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cvtpd_ps(a);
#else
  simde__m128_private r_;
  simde__m128d_private a_ = simde__m128d_to_private(a);

  #if defined(SIMDE_CONVERT_VECTOR_)
    SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, a_.f64);
    r_.m64_private[1] = simde__m64_to_private(simde_mm_setzero_si64());
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
      r_.f32[i] = (simde_float32) a_.f64[i];
    }
    /* Zero the upper half, matching the native intrinsic. */
    simde_memset(&(r_.m64_private[1]), 0, sizeof(r_.m64_private[1]));
  #endif

  return simde__m128_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtpd_ps(a) simde_mm_cvtpd_ps(a)
#endif
2262
/* Convert a 64-bit MMX vector of two int32 lanes to two packed
 * doubles. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cvtpi32_pd (simde__m64 a) {
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  return _mm_cvtpi32_pd(a);
#else
  simde__m128d_private r_;
  simde__m64_private a_ = simde__m64_to_private(a);

  #if defined(SIMDE_CONVERT_VECTOR_)
    SIMDE_CONVERT_VECTOR_(r_.f64, a_.i32);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
      r_.f64[i] = (simde_float64) a_.i32[i];
    }
  #endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtpi32_pd(a) simde_mm_cvtpi32_pd(a)
#endif
2287
/* Convert four packed floats to four packed int32 lanes with rounding.
 * NOTE(review): the generic fallback uses roundf (ties away from
 * zero), while SSE's default mode is round-to-nearest-even -- results
 * can differ for exact halfway values; the ARMv7 path below explicitly
 * emulates round-to-even. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvtps_epi32 (simde__m128 a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cvtps_epi32(a);
#else
  simde__m128i_private r_;
  simde__m128_private a_ = simde__m128_to_private(a);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* The default rounding mode on SSE is 'round to even', which ArmV7
       does not support!  It is supported on ARMv8 however. */
    #if defined(SIMDE_ARCH_AARCH64)
      r_.neon_i32 = vcvtnq_s32_f32(a_.neon_f32);
    #else
      uint32x4_t signmask = vdupq_n_u32(0x80000000);
      float32x4_t half = vbslq_f32(signmask, a_.neon_f32, vdupq_n_f32(0.5f)); /* +/- 0.5 */
      int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(a_.neon_f32, half)); /* round to integer: [a + 0.5]*/
      int32x4_t r_trunc = vcvtq_s32_f32(a_.neon_f32); /* truncate to integer: [a] */
      int32x4_t plusone = vshrq_n_s32(vnegq_s32(r_trunc), 31); /* 1 or 0 */
      int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
      float32x4_t delta = vsubq_f32(a_.neon_f32, vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
      uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */
      r_.neon_i32 = vbslq_s32(is_delta_half, r_even, r_normal);
    #endif
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    /* Round first, then truncate-convert with a 0 scale factor. */
    r_.altivec_i32 = vec_cts(vec_round(a_.altivec_f32), 0);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, simde_math_roundf(a_.f32[i]));
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtps_epi32(a) simde_mm_cvtps_epi32(a)
#endif
2328
/* Widen the two low packed floats of a to two packed doubles
 * (lossless). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cvtps_pd (simde__m128 a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cvtps_pd(a);
#else
  simde__m128d_private r_;
  simde__m128_private a_ = simde__m128_to_private(a);

  #if defined(SIMDE_CONVERT_VECTOR_)
    /* Only the low 64-bit half (two float lanes) participates. */
    SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].f32);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
      r_.f64[i] = a_.f32[i];
    }
  #endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtps_pd(a) simde_mm_cvtps_pd(a)
#endif
2353
/* Convert the low double lane of a to int32 with rounding.
 * NOTE(review): simde_math_round rounds halfway cases away from zero,
 * while the native intrinsic honors the current MXCSR rounding mode
 * (round-to-nearest-even by default) -- halfway values can differ. */
SIMDE_FUNCTION_ATTRIBUTES
int32_t
simde_mm_cvtsd_si32 (simde__m128d a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cvtsd_si32(a);
#else
  simde__m128d_private a_ = simde__m128d_to_private(a);
  return SIMDE_CONVERT_FTOI(int32_t, simde_math_round(a_.f64[0]));
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtsd_si32(a) simde_mm_cvtsd_si32(a)
#endif
2367
/* Convert the low double lane of a to int64 with rounding (64-bit
 * native form exists only on AMD64; PGI spells it _mm_cvtsd_si64x).
 * Same halfway-rounding caveat as simde_mm_cvtsd_si32. */
SIMDE_FUNCTION_ATTRIBUTES
int64_t
simde_mm_cvtsd_si64 (simde__m128d a) {
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
  #if defined(__PGI)
    return _mm_cvtsd_si64x(a);
  #else
    return _mm_cvtsd_si64(a);
  #endif
#else
  simde__m128d_private a_ = simde__m128d_to_private(a);
  return SIMDE_CONVERT_FTOI(int64_t, simde_math_round(a_.f64[0]));
#endif
}
#define simde_mm_cvtsd_si64x(a) simde_mm_cvtsd_si64(a)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtsd_si64(a) simde_mm_cvtsd_si64(a)
# define _mm_cvtsd_si64x(a) simde_mm_cvtsd_si64x(a)
#endif
2387
/* Narrow the low double lane of b to a float in lane 0 of the result;
 * the remaining three lanes are copied from a. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_cvtsd_ss (simde__m128 a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cvtsd_ss(a, b);
#else
  simde__m128_private
    r_,
    a_ = simde__m128_to_private(a);
  simde__m128d_private b_ = simde__m128d_to_private(b);

  r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b_.f64[0]);

  /* Copy lanes 1..3 from a (as raw 32-bit values). */
  SIMDE_VECTORIZE
  for (size_t i = 1 ; i < (sizeof(r_) / sizeof(r_.i32[0])) ; i++) {
    r_.i32[i] = a_.i32[i];
  }

  return simde__m128_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtsd_ss(a, b) simde_mm_cvtsd_ss(a, b)
#endif
2412
/* Extract the low 32-bit integer lane of a. */
SIMDE_FUNCTION_ATTRIBUTES
int32_t
simde_mm_cvtsi128_si32 (simde__m128i a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cvtsi128_si32(a);
#else
  simde__m128i_private
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vgetq_lane_s32(a_.neon_i32, 0);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    #if defined(SIMDE_BUG_GCC_95227)
      /* Work around GCC bug 95227: suppress a spurious unused warning. */
      (void) a_;
    #endif
    return vec_extract(a_.altivec_i32, 0);
  #else
    return a_.i32[0];
  #endif
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtsi128_si32(a) simde_mm_cvtsi128_si32(a)
#endif
2437
/* Extract the low 64-bit integer lane of a (64-bit native form exists
 * only on AMD64; PGI spells it _mm_cvtsi128_si64x). */
SIMDE_FUNCTION_ATTRIBUTES
int64_t
simde_mm_cvtsi128_si64 (simde__m128i a) {
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
  #if defined(__PGI)
    return _mm_cvtsi128_si64x(a);
  #else
    return _mm_cvtsi128_si64(a);
  #endif
#else
  simde__m128i_private a_ = simde__m128i_to_private(a);
  #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && !defined(HEDLEY_IBM_VERSION)
    return vec_extract(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed long long), a_.i64), 0);
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vgetq_lane_s64(a_.neon_i64, 0);
  #endif
  /* Generic fallback (unreachable when a branch above returned). */
  return a_.i64[0];
#endif
}
#define simde_mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64(a)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtsi128_si64(a) simde_mm_cvtsi128_si64(a)
# define _mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64x(a)
#endif
2462
2463 SIMDE_FUNCTION_ATTRIBUTES
2464 simde__m128d
simde_mm_cvtsi32_sd(simde__m128d a,int32_t b)2465 simde_mm_cvtsi32_sd (simde__m128d a, int32_t b) {
2466
2467 #if defined(SIMDE_X86_SSE2_NATIVE)
2468 return _mm_cvtsi32_sd(a, b);
2469 #else
2470 simde__m128d_private r_;
2471 simde__m128d_private a_ = simde__m128d_to_private(a);
2472
2473 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_AMD64)
2474 r_.neon_f64 = vsetq_lane_f64(HEDLEY_STATIC_CAST(float64_t, b), a_.neon_f64, 0);
2475 #else
2476 r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b);
2477 r_.i64[1] = a_.i64[1];
2478 #endif
2479
2480 return simde__m128d_from_private(r_);
2481 #endif
2482 }
2483 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2484 # define _mm_cvtsi32_sd(a, b) simde_mm_cvtsi32_sd(a, b)
2485 #endif
2486
/* Build a vector with a in lane 0 and zeros in lanes 1..3. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvtsi32_si128 (int32_t a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cvtsi32_si128(a);
#else
  simde__m128i_private r_;

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i32 = vsetq_lane_s32(a, vdupq_n_s32(0), 0);
  #else
    r_.i32[0] = a;
    r_.i32[1] = 0;
    r_.i32[2] = 0;
    r_.i32[3] = 0;
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtsi32_si128(a) simde_mm_cvtsi32_si128(a)
#endif
2510
/* Convert b (int64) to double and place it in the low lane of a; the
 * upper lane is passed through unchanged (native form exists only on
 * AMD64; PGI spells it _mm_cvtsi64x_sd). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cvtsi64_sd (simde__m128d a, int64_t b) {
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
  #if !defined(__PGI)
    return _mm_cvtsi64_sd(a, b);
  #else
    return _mm_cvtsi64x_sd(a, b);
  #endif
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a);

  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    r_.neon_f64 = vsetq_lane_f64(HEDLEY_STATIC_CAST(float64_t, b), a_.neon_f64, 0);
  #else
    r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b);
    r_.f64[1] = a_.f64[1]; /* upper lane passes through from a */
  #endif

  return simde__m128d_from_private(r_);
#endif
}
#define simde_mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64_sd(a, b)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtsi64_sd(a, b) simde_mm_cvtsi64_sd(a, b)
# define _mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64x_sd(a, b)
#endif
2540
/* Build a vector with a in the low 64-bit lane and zero in the high
 * lane (native form exists only on AMD64; PGI uses the si64x name). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvtsi64_si128 (int64_t a) {
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
  #if !defined(__PGI)
    return _mm_cvtsi64_si128(a);
  #else
    return _mm_cvtsi64x_si128(a);
  #endif
#else
  simde__m128i_private r_;

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i64 = vsetq_lane_s64(a, vdupq_n_s64(0), 0);
  #else
    r_.i64[0] = a;
    r_.i64[1] = 0;
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#define simde_mm_cvtsi64x_si128(a) simde_mm_cvtsi64_si128(a)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtsi64_si128(a) simde_mm_cvtsi64_si128(a)
# define _mm_cvtsi64x_si128(a) simde_mm_cvtsi64x_si128(a)
#endif
2568
/* Widen the low float lane of b to a double in lane 0 of a; lane 1 of
 * a is unchanged (a_ is modified in place and returned). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cvtss_sd (simde__m128d a, simde__m128 b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cvtss_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a);
  simde__m128_private b_ = simde__m128_to_private(b);

  a_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b_.f32[0]);

  return simde__m128d_from_private(a_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvtss_sd(a, b) simde_mm_cvtss_sd(a, b)
#endif
2587
2588 SIMDE_FUNCTION_ATTRIBUTES
2589 simde__m128i
simde_mm_cvttpd_epi32(simde__m128d a)2590 simde_mm_cvttpd_epi32 (simde__m128d a) {
2591 #if defined(SIMDE_X86_SSE2_NATIVE)
2592 return _mm_cvttpd_epi32(a);
2593 #else
2594 simde__m128i_private r_;
2595 simde__m128d_private a_ = simde__m128d_to_private(a);
2596
2597 for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
2598 r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f64[i]);
2599 }
2600 simde_memset(&(r_.m64_private[1]), 0, sizeof(r_.m64_private[1]));
2601
2602 return simde__m128i_from_private(r_);
2603 #endif
2604 }
2605 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2606 # define _mm_cvttpd_epi32(a) simde_mm_cvttpd_epi32(a)
2607 #endif
2608
/* Convert two packed doubles to a 64-bit MMX vector of two int32
 * lanes with truncation (toward zero). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m64
simde_mm_cvttpd_pi32 (simde__m128d a) {
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  return _mm_cvttpd_pi32(a);
#else
  simde__m64_private r_;
  simde__m128d_private a_ = simde__m128d_to_private(a);

  #if defined(SIMDE_CONVERT_VECTOR_)
    /* Vector float->int conversion truncates, matching CVTTPD2PI. */
    SIMDE_CONVERT_VECTOR_(r_.i32, a_.f64);
  #else
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f64[i]);
    }
  #endif

  return simde__m64_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvttpd_pi32(a) simde_mm_cvttpd_pi32(a)
#endif
2632
/* Convert four packed floats to four packed int32 lanes with
 * truncation (toward zero). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvttps_epi32 (simde__m128 a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cvttps_epi32(a);
#else
  simde__m128i_private r_;
  simde__m128_private a_ = simde__m128_to_private(a);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* vcvtq_s32_f32 truncates, matching CVTTPS2DQ. */
    r_.neon_i32 = vcvtq_s32_f32(a_.neon_f32);
  #elif defined(SIMDE_CONVERT_VECTOR_)
    SIMDE_CONVERT_VECTOR_(r_.i32, a_.f32);
  #else
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f32[i]);
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _mm_cvttps_epi32(a) simde_mm_cvttps_epi32(a)
#endif
2658
/* cvttsd_si32: convert the low float64 lane of `a` to int32 with
 * truncation (round toward zero). */
SIMDE_FUNCTION_ATTRIBUTES
int32_t
simde_mm_cvttsd_si32 (simde__m128d a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cvttsd_si32(a);
#else
  simde__m128d_private a_ = simde__m128d_to_private(a);
  return SIMDE_CONVERT_FTOI(int32_t, a_.f64[0]);
#endif
}
2669 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2670 # define _mm_cvttsd_si32(a) simde_mm_cvttsd_si32(a)
2671 #endif
2672
/* cvttsd_si64: convert the low float64 lane of `a` to int64 with
 * truncation.  Native path requires AMD64 (the instruction only
 * exists in 64-bit mode); PGI spells the intrinsic _mm_cvttsd_si64x. */
SIMDE_FUNCTION_ATTRIBUTES
int64_t
simde_mm_cvttsd_si64 (simde__m128d a) {
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
  #if !defined(__PGI)
    return _mm_cvttsd_si64(a);
  #else
    return _mm_cvttsd_si64x(a);
  #endif
#else
  simde__m128d_private a_ = simde__m128d_to_private(a);
  return SIMDE_CONVERT_FTOI(int64_t, a_.f64[0]);
#endif
}
2687 #define simde_mm_cvttsd_si64x(a) simde_mm_cvttsd_si64(a)
2688 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2689 # define _mm_cvttsd_si64(a) simde_mm_cvttsd_si64(a)
2690 # define _mm_cvttsd_si64x(a) simde_mm_cvttsd_si64x(a)
2691 #endif
2692
/* div_pd: lane-wise float64 division, r[i] = a[i] / b[i]. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_div_pd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_div_pd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  /* GCC/Clang vector extensions: a single vector divide. */
  r_.f64 = a_.f64 / b_.f64;
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  r_.wasm_v128 = wasm_f64x2_div(a_.wasm_v128, b_.wasm_v128);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
    r_.f64[i] = a_.f64[i] / b_.f64[i];
  }
#endif

  return simde__m128d_from_private(r_);
#endif
}
2718 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2719 # define _mm_div_pd(a, b) simde_mm_div_pd(a, b)
2720 #endif
2721
/* div_sd: divide the low float64 lanes (a[0] / b[0]); the upper lane
 * is copied from `a`. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_div_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_div_sd(a, b);
#elif defined(SIMDE_ASSUME_VECTORIZATION)
  /* Reuse the full-width divide and merge in the upper lane of `a`. */
  return simde_mm_move_sd(a, simde_mm_div_pd(a, b));
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  r_.f64[0] = a_.f64[0] / b_.f64[0];
  r_.f64[1] = a_.f64[1];

  return simde__m128d_from_private(r_);
#endif
}
2741 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2742 # define _mm_div_sd(a, b) simde_mm_div_sd(a, b)
2743 #endif
2744
/* extract_epi16: return lane `imm8` (0-7) of `a` as a zero-extended
 * 16-bit value in an int32_t.  May be overridden by the native or
 * NEON macro definitions that follow this function. */
SIMDE_FUNCTION_ATTRIBUTES
int32_t
simde_mm_extract_epi16 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 7) {
  uint16_t r;
  simde__m128i_private a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    #if defined(SIMDE_BUG_GCC_95227)
      /* Work around GCC bug 95227: silence bogus unused warnings. */
      (void) a_;
      (void) imm8;
    #endif
    r = HEDLEY_STATIC_CAST(uint16_t, vec_extract(a_.altivec_i16, imm8));
  #else
    /* Read through u16 so the value is zero- (not sign-) extended. */
    r = a_.u16[imm8 & 7];
  #endif

  return HEDLEY_STATIC_CAST(int32_t, r);
}
2764 #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,6,0))
2765 # define simde_mm_extract_epi16(a, imm8) _mm_extract_epi16(a, imm8)
2766 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2767 # define simde_mm_extract_epi16(a, imm8) (HEDLEY_STATIC_CAST(int32_t, vgetq_lane_s16(simde__m128i_to_private(a).neon_i16, (imm8))) & (INT32_C(0x0000ffff)))
2768 #endif
2769 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2770 # define _mm_extract_epi16(a, imm8) simde_mm_extract_epi16(a, imm8)
2771 #endif
2772
/* insert_epi16: copy `a` with 16-bit lane `imm8` (0-7) replaced by `i`.
 * May be overridden by the native or NEON macros that follow. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_insert_epi16 (simde__m128i a, int16_t i, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 7) {
  simde__m128i_private a_ = simde__m128i_to_private(a);
  a_.i16[imm8 & 7] = i;
  return simde__m128i_from_private(a_);
}
2781 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
2782 # define simde_mm_insert_epi16(a, i, imm8) _mm_insert_epi16((a), (i), (imm8))
2783 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2784 # define simde_mm_insert_epi16(a, i, imm8) simde__m128i_from_neon_i16(vsetq_lane_s16((i), simde__m128i_to_neon_i16(a), (imm8)))
2785 #endif
2786 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2787 # define _mm_insert_epi16(a, i, imm8) simde_mm_insert_epi16(a, i, imm8)
2788 #endif
2789
/* load_pd: load two float64 values from a 16-byte-aligned address. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_load_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) {
  /* mem_addr must be 16-byte aligned, as with the native intrinsic. */
  simde_assert_aligned(16, mem_addr);

#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_load_pd(mem_addr);
#else
  simde__m128d_private r_;

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* A32 NEON has no f64 vectors; load the raw bits as u32x4. */
  r_.neon_u32 = vld1q_u32(HEDLEY_REINTERPRET_CAST(uint32_t const*, mem_addr));
#else
  r_ = *SIMDE_ALIGN_CAST(simde__m128d_private const*, mem_addr);
#endif

  return simde__m128d_from_private(r_);
#endif
}
2809 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2810 # define _mm_load_pd(mem_addr) simde_mm_load_pd(mem_addr)
2811 #endif
2812
2813 SIMDE_FUNCTION_ATTRIBUTES
2814 simde__m128d
simde_mm_load_pd1(simde_float64 const * mem_addr)2815 simde_mm_load_pd1 (simde_float64 const* mem_addr) {
2816 #if defined(SIMDE_X86_SSE2_NATIVE)
2817 return _mm_load1_pd(mem_addr);
2818 #else
2819 simde__m128d_private r_;
2820
2821 r_.f64[0] = *mem_addr;
2822 r_.f64[1] = *mem_addr;
2823
2824 return simde__m128d_from_private(r_);
2825 #endif
2826 }
2827 #define simde_mm_load1_pd(mem_addr) simde_mm_load_pd1(mem_addr)
2828 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2829 # define _mm_load_pd1(mem_addr) simde_mm_load_pd1(mem_addr)
2830 # define _mm_load1_pd(mem_addr) simde_mm_load1_pd(mem_addr)
2831 #endif
2832
/* load_sd: load one float64 into the low lane; upper lane is zeroed. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_load_sd (simde_float64 const* mem_addr) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_load_sd(mem_addr);
#else
  simde__m128d_private r_;

#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  r_.neon_f64 = vsetq_lane_f64(*mem_addr, vdupq_n_f64(0), 0);
#else
  r_.f64[0] = *mem_addr;
  /* Zero the upper lane via u64 so the write is an exact bit pattern. */
  r_.u64[1] = UINT64_C(0);
#endif

  return simde__m128d_from_private(r_);
#endif
}
2851 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2852 # define _mm_load_sd(mem_addr) simde_mm_load_sd(mem_addr)
2853 #endif
2854
/* load_si128: load 128 bits from a 16-byte-aligned address. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_load_si128 (simde__m128i const* mem_addr) {
  /* mem_addr must be 16-byte aligned, as with the native intrinsic. */
  simde_assert_aligned(16, mem_addr);

#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_load_si128(HEDLEY_REINTERPRET_CAST(__m128i const*, mem_addr));
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  simde__m128i_private r_;

  #if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_i32 = vec_ld(0, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(int) const*, mem_addr));
  #else
    r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr));
  #endif

  return simde__m128i_from_private(r_);
#else
  /* Plain aligned copy; mem_addr already has the right type. */
  return *mem_addr;
#endif
}
2876 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2877 # define _mm_load_si128(mem_addr) simde_mm_load_si128(mem_addr)
2878 #endif
2879
/* loadh_pd: keep the low lane of `a`, load the high lane from memory.
 * The memcpy avoids alignment/aliasing assumptions on mem_addr. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_loadh_pd (simde__m128d a, simde_float64 const* mem_addr) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_loadh_pd(a, mem_addr);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a);
  simde_float64 t;

  simde_memcpy(&t, mem_addr, sizeof(t));
  r_.f64[0] = a_.f64[0];
  r_.f64[1] = t;

  return simde__m128d_from_private(r_);
#endif
}
2898 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2899 # define _mm_loadh_pd(a, mem_addr) simde_mm_loadh_pd(a, mem_addr)
2900 #endif
2901
/* loadl_epi64: load 64 bits into the low lane; upper lane is zeroed.
 * The value is read with memcpy since mem_addr may be unaligned. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_loadl_epi64 (simde__m128i const* mem_addr) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_loadl_epi64(mem_addr);
#else
  simde__m128i_private r_;

  int64_t value;
  simde_memcpy(&value, mem_addr, sizeof(value));

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* NOTE(review): the NEON path re-loads directly from mem_addr and
   * leaves `value` unused on this branch. */
  r_.neon_i64 = vcombine_s64(vld1_s64(HEDLEY_REINTERPRET_CAST(int64_t const *, mem_addr)), vdup_n_s64(0));
#else
  r_.i64[0] = value;
  r_.i64[1] = 0;
#endif

  return simde__m128i_from_private(r_);
#endif
}
2923 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2924 # define _mm_loadl_epi64(mem_addr) simde_mm_loadl_epi64(mem_addr)
2925 #endif
2926
/* loadl_pd: load the low lane from memory, keep the high lane of `a`. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_loadl_pd (simde__m128d a, simde_float64 const* mem_addr) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_loadl_pd(a, mem_addr);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a);

  r_.f64[0] = *mem_addr;
  /* Copy the upper lane bit-exactly (preserves NaN payloads). */
  r_.u64[1] = a_.u64[1];

  return simde__m128d_from_private(r_);
#endif
}
2943 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2944 # define _mm_loadl_pd(a, mem_addr) simde_mm_loadl_pd(a, mem_addr)
2945 #endif
2946
2947 SIMDE_FUNCTION_ATTRIBUTES
2948 simde__m128d
simde_mm_loadr_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM (2)])2949 simde_mm_loadr_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) {
2950 simde_assert_aligned(16, mem_addr);
2951
2952 #if defined(SIMDE_X86_SSE2_NATIVE)
2953 return _mm_loadr_pd(mem_addr);
2954 #else
2955 simde__m128d_private r_;
2956
2957 r_.f64[0] = mem_addr[1];
2958 r_.f64[1] = mem_addr[0];
2959
2960 return simde__m128d_from_private(r_);
2961 #endif
2962 }
2963 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2964 # define _mm_loadr_pd(mem_addr) simde_mm_loadr_pd(mem_addr)
2965 #endif
2966
/* loadu_pd: load two float64 values from an unaligned address.
 * memcpy is the portable, aliasing-safe unaligned load. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_loadu_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_loadu_pd(mem_addr);
#else
  simde__m128d_private r_;

  simde_memcpy(&r_, mem_addr, sizeof(r_));

  return simde__m128d_from_private(r_);
#endif
}
2980 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2981 # define _mm_loadu_pd(mem_addr) simde_mm_loadu_pd(mem_addr)
2982 #endif
2983
/* SIMDe extension: unaligned 128-bit load from an int8_t pointer. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_loadu_epi8(int8_t const* mem_addr) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_loadu_si128(SIMDE_ALIGN_CAST(simde__m128i const*, mem_addr));
#else
  simde__m128i_private r_;

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_i8 = vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr));
#else
  simde_memcpy(&r_, mem_addr, sizeof(r_));
#endif

  return simde__m128i_from_private(r_);
#endif
}
3001
/* SIMDe extension: unaligned 128-bit load from an int16_t pointer. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_loadu_epi16(int16_t const* mem_addr) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_loadu_si128(SIMDE_ALIGN_CAST(simde__m128i const*, mem_addr));
#else
  simde__m128i_private r_;

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_i16 = vld1q_s16(HEDLEY_REINTERPRET_CAST(int16_t const*, mem_addr));
#else
  simde_memcpy(&r_, mem_addr, sizeof(r_));
#endif

  return simde__m128i_from_private(r_);
#endif
}
3019
/* SIMDe extension: unaligned 128-bit load from an int32_t pointer. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_loadu_epi32(int32_t const* mem_addr) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_loadu_si128(SIMDE_ALIGN_CAST(simde__m128i const*, mem_addr));
#else
  simde__m128i_private r_;

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr));
#else
  simde_memcpy(&r_, mem_addr, sizeof(r_));
#endif

  return simde__m128i_from_private(r_);
#endif
}
3037
/* SIMDe extension: unaligned 128-bit load from an int64_t pointer. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_loadu_epi64(int64_t const* mem_addr) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_loadu_si128(SIMDE_ALIGN_CAST(simde__m128i const*, mem_addr));
#else
  simde__m128i_private r_;

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_i64 = vld1q_s64(HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr));
#else
  simde_memcpy(&r_, mem_addr, sizeof(r_));
#endif

  return simde__m128i_from_private(r_);
#endif
}
3055
/* loadu_si128: unaligned 128-bit load.  Takes void* so callers don't
 * have to cast; the native path casts back to __m128i const*. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_loadu_si128 (void const* mem_addr) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_loadu_si128(HEDLEY_STATIC_CAST(__m128i const*, mem_addr));
#else
  simde__m128i_private r_;

#if HEDLEY_GNUC_HAS_ATTRIBUTE(may_alias,3,3,0)
  HEDLEY_DIAGNOSTIC_PUSH
  SIMDE_DIAGNOSTIC_DISABLE_PACKED_
  /* A packed, may_alias wrapper lets the compiler emit an unaligned
   * load without violating strict aliasing. */
  struct simde_mm_loadu_si128_s {
    __typeof__(r_) v;
  } __attribute__((__packed__, __may_alias__));
  r_ = HEDLEY_REINTERPRET_CAST(const struct simde_mm_loadu_si128_s *, mem_addr)->v;
  HEDLEY_DIAGNOSTIC_POP
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* Note that this is a lower priority than the struct above since
   * clang assumes mem_addr is aligned (since it is a __m128i*). */
  r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr));
#else
  simde_memcpy(&r_, mem_addr, sizeof(r_));
#endif

  return simde__m128i_from_private(r_);
#endif
}
3083 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3084 # define _mm_loadu_si128(mem_addr) simde_mm_loadu_si128(mem_addr)
3085 #endif
3086
3087 SIMDE_FUNCTION_ATTRIBUTES
3088 simde__m128i
simde_mm_madd_epi16(simde__m128i a,simde__m128i b)3089 simde_mm_madd_epi16 (simde__m128i a, simde__m128i b) {
3090 #if defined(SIMDE_X86_SSE2_NATIVE)
3091 return _mm_madd_epi16(a, b);
3092 #else
3093 simde__m128i_private
3094 r_,
3095 a_ = simde__m128i_to_private(a),
3096 b_ = simde__m128i_to_private(b);
3097
3098 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3099 int32x4_t pl = vmull_s16(vget_low_s16(a_.neon_i16), vget_low_s16(b_.neon_i16));
3100 int32x4_t ph = vmull_s16(vget_high_s16(a_.neon_i16), vget_high_s16(b_.neon_i16));
3101 int32x2_t rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
3102 int32x2_t rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
3103 r_.neon_i32 = vcombine_s32(rl, rh);
3104 #else
3105 SIMDE_VECTORIZE
3106 for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i += 2) {
3107 r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) + (a_.i16[i + 1] * b_.i16[i + 1]);
3108 }
3109 #endif
3110
3111 return simde__m128i_from_private(r_);
3112 #endif
3113 }
3114 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3115 # define _mm_madd_epi16(a, b) simde_mm_madd_epi16(a, b)
3116 #endif
3117
/* maskmoveu_si128: conditionally store bytes of `a` to mem_addr; a
 * byte is written only where the corresponding mask byte has its high
 * bit set.  Unselected destination bytes are left untouched. */
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_maskmoveu_si128 (simde__m128i a, simde__m128i mask, int8_t mem_addr[HEDLEY_ARRAY_PARAM(16)]) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_maskmoveu_si128(a, mask, HEDLEY_REINTERPRET_CAST(char*, mem_addr));
#else
  simde__m128i_private
    a_ = simde__m128i_to_private(a),
    mask_ = simde__m128i_to_private(mask);

  for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++) {
    if (mask_.u8[i] & 0x80) {
      mem_addr[i] = a_.i8[i];
    }
  }
#endif
}
3135 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3136 # define _mm_maskmoveu_si128(a, mask, mem_addr) simde_mm_maskmoveu_si128((a), (mask), SIMDE_CHECKED_REINTERPRET_CAST(int8_t*, char*, (mem_addr)))
3137 #endif
3138
/* movemask_epi8: collect the sign bit of each of the 16 bytes of `a`
 * into the low 16 bits of the result (bit i = sign of byte i). */
SIMDE_FUNCTION_ATTRIBUTES
int32_t
simde_mm_movemask_epi8 (simde__m128i a) {
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__INTEL_COMPILER)
  /* ICC has trouble with _mm_movemask_epi8 at -O2 and above: */
  return _mm_movemask_epi8(a);
#else
  int32_t r = 0;
  simde__m128i_private a_ = simde__m128i_to_private(a);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  // Use increasingly wide shifts+adds to collect the sign bits
  // together.
  // Since the widening shifts would be rather confusing to follow in little endian, everything
  // will be illustrated in big endian order instead. This has a different result - the bits
  // would actually be reversed on a big endian machine.

  // Starting input (only half the elements are shown):
  // 89 ff 1d c0 00 10 99 33
  uint8x16_t input = a_.neon_u8;

  // Shift out everything but the sign bits with an unsigned shift right.
  //
  // Bytes of the vector::
  // 89 ff 1d c0 00 10 99 33
  // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
  // |  |  |  |  |  |  |  |
  // 01 01 00 01 00 00 01 00
  //
  // Bits of first important lane(s):
  // 10001001 (89)
  // \______
  //        |
  // 00000001 (01)
  uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));

  // Merge the even lanes together with a 16-bit unsigned shift right + add.
  // 'xx' represents garbage data which will be ignored in the final result.
  // In the important bytes, the add functions like a binary OR.
  //
  // 01 01 00 01 00 00 01 00
  //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
  //    \|    \|    \|    \|
  // xx 03 xx 01 xx 00 xx 02
  //
  // 00000001 00000001 (01 01)
  //        \_______ |
  //                \|
  // xxxxxxxx xxxxxx11 (xx 03)
  uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));

  // Repeat with a wider 32-bit shift + add.
  // xx 03 xx 01 xx 00 xx 02
  //     \____ |     \____ |   paired32 = (uint64x1_t)(paired16 + (paired16 >> 14))
  //          \|          \|
  // xx xx xx 0d xx xx xx 02
  //
  // 00000011 00000001 (03 01)
  //        \\_____ ||
  //         '----.\||
  // xxxxxxxx xxxx1101 (xx 0d)
  uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));

  // Last, an even wider 64-bit shift + add to get our result in the low 8 bit lanes.
  // xx xx xx 0d xx xx xx 02
  //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >> 28))
  //                      \|
  // xx xx xx xx xx xx xx d2
  //
  // 00001101 00000010 (0d 02)
  //     \   \___ |  |
  //      '---.  \|  |
  // xxxxxxxx 11010010 (xx d2)
  uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));

  // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
  // xx xx xx xx xx xx xx d2
  //                      ||  return paired64[0]
  //                      d2
  // Note: Little endian would return the correct value 4b (01001011) instead.
  r = vgetq_lane_u8(paired64, 0) | (HEDLEY_STATIC_CAST(int32_t, vgetq_lane_u8(paired64, 8)) << 8);
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(HEDLEY_IBM_VERSION)
  /* vbpermq gathers the selected bit of each byte; `perm` picks each
   * byte's sign bit (bit offsets 120, 112, ... 0). */
  static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) perm = { 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0 };
  r = HEDLEY_STATIC_CAST(int32_t, vec_extract(vec_vbpermq(a_.altivec_u8, perm), 1));
#else
  /* Scalar fallback: OR each byte's sign bit into its bit position. */
  SIMDE_VECTORIZE_REDUCTION(|:r)
  for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) {
    r |= (a_.u8[15 - i] >> 7) << (15 - i);
  }
#endif

  return r;
#endif
}
3233 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3234 # define _mm_movemask_epi8(a) simde_mm_movemask_epi8(a)
3235 #endif
3236
/* movemask_pd: collect the sign bit of each float64 lane of `a` into
 * the low 2 bits of the result (bit i = sign of lane i). */
SIMDE_FUNCTION_ATTRIBUTES
int32_t
simde_mm_movemask_pd (simde__m128d a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_movemask_pd(a);
#else
  int32_t r = 0;
  simde__m128d_private a_ = simde__m128d_to_private(a);

  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
    /* u64 view: bit 63 is the IEEE-754 sign bit. */
    r |= (a_.u64[i] >> 63) << i;
  }

  return r;
#endif
}
3254 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3255 # define _mm_movemask_pd(a) simde_mm_movemask_pd(a)
3256 #endif
3257
/* movepi64_pi64: return the low 64 bits of `a` as an __m64. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m64
simde_mm_movepi64_pi64 (simde__m128i a) {
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  return _mm_movepi64_pi64(a);
#else
  simde__m64_private r_;
  simde__m128i_private a_ = simde__m128i_to_private(a);

  r_.i64[0] = a_.i64[0];

  return simde__m64_from_private(r_);
#endif
}
3272 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3273 # define _mm_movepi64_pi64(a) simde_mm_movepi64_pi64(a)
3274 #endif
3275
/* movpi64_epi64: widen an __m64 to an __m128i, zeroing the high lane. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_movpi64_epi64 (simde__m64 a) {
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  return _mm_movpi64_epi64(a);
#else
  simde__m128i_private r_;
  simde__m64_private a_ = simde__m64_to_private(a);

  r_.i64[0] = a_.i64[0];
  r_.i64[1] = 0;

  return simde__m128i_from_private(r_);
#endif
}
3291 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3292 # define _mm_movpi64_epi64(a) simde_mm_movpi64_epi64(a)
3293 #endif
3294
/* min_epi16: lane-wise signed 16-bit minimum. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_min_epi16 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_min_epi16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_i16 = vminq_s16(a_.neon_i16, b_.neon_i16);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  r_.altivec_i16 = vec_min(a_.altivec_i16, b_.altivec_i16);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
    r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i];
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
3320 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3321 # define _mm_min_epi16(a, b) simde_mm_min_epi16(a, b)
3322 #endif
3323
/* min_epu8: lane-wise unsigned 8-bit minimum. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_min_epu8 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_min_epu8(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_u8 = vminq_u8(a_.neon_u8, b_.neon_u8);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  r_.altivec_u8 = vec_min(a_.altivec_u8, b_.altivec_u8);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
    r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i];
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
3349 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3350 # define _mm_min_epu8(a, b) simde_mm_min_epu8(a, b)
3351 #endif
3352
/* min_pd: lane-wise float64 minimum.  The ternary matches the x86
 * minpd NaN convention: if a[i] < b[i] is false (including when
 * either operand is NaN) the second operand b[i] is returned. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_min_pd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_min_pd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

#if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
  r_.altivec_f64 = vec_min(a_.altivec_f64, b_.altivec_f64);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
    r_.f64[i] = (a_.f64[i] < b_.f64[i]) ? a_.f64[i] : b_.f64[i];
  }
#endif

  return simde__m128d_from_private(r_);
#endif
}
3376 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3377 # define _mm_min_pd(a, b) simde_mm_min_pd(a, b)
3378 #endif
3379
/* min_sd: minimum of the low float64 lanes; upper lane copied from
 * `a`.  NaN handling follows the ternary below (b wins on NaN). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_min_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_min_sd(a, b);
#elif defined(SIMDE_ASSUME_VECTORIZATION)
  return simde_mm_move_sd(a, simde_mm_min_pd(a, b));
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  r_.f64[0] = (a_.f64[0] < b_.f64[0]) ? a_.f64[0] : b_.f64[0];
  r_.f64[1] = a_.f64[1];

  return simde__m128d_from_private(r_);
#endif
}
3399 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3400 # define _mm_min_sd(a, b) simde_mm_min_sd(a, b)
3401 #endif
3402
/* max_epi16: lane-wise signed 16-bit maximum. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_max_epi16 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_max_epi16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_i16 = vmaxq_s16(a_.neon_i16, b_.neon_i16);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  r_.altivec_i16 = vec_max(a_.altivec_i16, b_.altivec_i16);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
    r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? a_.i16[i] : b_.i16[i];
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
3428 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3429 # define _mm_max_epi16(a, b) simde_mm_max_epi16(a, b)
3430 #endif
3431
/* max_epu8: lane-wise unsigned 8-bit maximum. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_max_epu8 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_max_epu8(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_u8 = vmaxq_u8(a_.neon_u8, b_.neon_u8);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  r_.altivec_u8 = vec_max(a_.altivec_u8, b_.altivec_u8);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
    r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i];
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
3457 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3458 # define _mm_max_epu8(a, b) simde_mm_max_epu8(a, b)
3459 #endif
3460
/* max_pd: lane-wise float64 maximum.  As with maxpd, if the compare
 * is false (including NaN operands) the second operand is returned. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_max_pd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_max_pd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
  r_.altivec_f64 = vec_max(a_.altivec_f64, b_.altivec_f64);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
    r_.f64[i] = (a_.f64[i] > b_.f64[i]) ? a_.f64[i] : b_.f64[i];
  }
#endif

  return simde__m128d_from_private(r_);
#endif
}
3484 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3485 # define _mm_max_pd(a, b) simde_mm_max_pd(a, b)
3486 #endif
3487
/* max_sd: maximum of the low float64 lanes; upper lane copied from
 * `a`.  NaN handling follows the ternary below (b wins on NaN). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_max_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_max_sd(a, b);
#elif defined(SIMDE_ASSUME_VECTORIZATION)
  return simde_mm_move_sd(a, simde_mm_max_pd(a, b));
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  r_.f64[0] = (a_.f64[0] > b_.f64[0]) ? a_.f64[0] : b_.f64[0];
  r_.f64[1] = a_.f64[1];

  return simde__m128d_from_private(r_);
#endif
}
3507 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3508 # define _mm_max_sd(a, b) simde_mm_max_sd(a, b)
3509 #endif
3510
/* move_epi64: copy the low 64 bits of `a`, zeroing the upper 64. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_move_epi64 (simde__m128i a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_move_epi64(a);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* Clear lane 1 in place. */
  r_.neon_i64 = vsetq_lane_s64(0, a_.neon_i64, 1);
#else
  r_.i64[0] = a_.i64[0];
  r_.i64[1] = 0;
#endif

  return simde__m128i_from_private(r_);
#endif
}
3531 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3532 # define _mm_move_epi64(a) simde_mm_move_epi64(a)
3533 #endif
3534
/* mul_epu32: multiply the even-indexed unsigned 32-bit lanes of `a`
 * and `b` (lanes 0 and 2), producing two unsigned 64-bit products. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_mul_epu32 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_mul_epu32(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* vmovn_u64 narrows each u64 lane to its low 32 bits, which are the
   * even-indexed u32 lanes; vmull widens the products back to u64. */
  uint32x2_t a_lo = vmovn_u64(a_.neon_u64);
  uint32x2_t b_lo = vmovn_u64(b_.neon_u64);
  r_.neon_u64 = vmull_u32(a_lo, b_lo);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
    /* Widen before multiplying to get the full 64-bit product. */
    r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[i * 2]) * HEDLEY_STATIC_CAST(uint64_t, b_.u32[i * 2]);
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
3560 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3561 # define _mm_mul_epu32(a, b) simde_mm_mul_epu32(a, b)
3562 #endif
3563
3564 SIMDE_FUNCTION_ATTRIBUTES
3565 simde__m128i
simde_x_mm_mul_epi64(simde__m128i a,simde__m128i b)3566 simde_x_mm_mul_epi64 (simde__m128i a, simde__m128i b) {
3567 simde__m128i_private
3568 r_,
3569 a_ = simde__m128i_to_private(a),
3570 b_ = simde__m128i_to_private(b);
3571
3572 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3573 r_.i64 = a_.i64 * b_.i64;
3574 #else
3575 SIMDE_VECTORIZE
3576 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
3577 r_.i64[i] = a_.i64[i] * b_.i64[i];
3578 }
3579 #endif
3580
3581 return simde__m128i_from_private(r_);
3582 }
3583
/* SIMDe extension (no SSE2 equivalent): lane-wise signed 64-bit remainder.
 * NOTE(review): like C's % operator, a zero divisor in any lane is undefined
 * behavior — callers must guarantee non-zero elements in b. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_mod_epi64 (simde__m128i a, simde__m128i b) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.i64 = a_.i64 % b_.i64;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
      r_.i64[i] = a_.i64[i] % b_.i64[i];
    }
  #endif

  return simde__m128i_from_private(r_);
}
3603
/* Lane-wise double-precision multiply (SSE2 _mm_mul_pd). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_mul_pd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_mul_pd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.f64 = a_.f64 * b_.f64;
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_f64x2_mul(a_.wasm_v128, b_.wasm_v128);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
      r_.f64[i] = a_.f64[i] * b_.f64[i];
    }
  #endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_mul_pd(a, b) simde_mm_mul_pd(a, b)
#endif
3632
/* Multiplies the low doubles of a and b; the upper lane is copied from a
 * (SSE2 _mm_mul_sd). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_mul_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_mul_sd(a, b);
#elif defined(SIMDE_ASSUME_VECTORIZATION)
  /* Full-width multiply, then restore a's upper lane via move_sd. */
  return simde_mm_move_sd(a, simde_mm_mul_pd(a, b));
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  r_.f64[0] = a_.f64[0] * b_.f64[0];
  r_.f64[1] = a_.f64[1];

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_mul_sd(a, b) simde_mm_mul_sd(a, b)
#endif
3655
/* Multiplies the low unsigned 32-bit elements of two __m64 values into an
 * unsigned 64-bit result (SSE2 _mm_mul_su32).  The native path is skipped
 * under PGI, which lacks this intrinsic. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m64
simde_mm_mul_su32 (simde__m64 a, simde__m64 b) {
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
  return _mm_mul_su32(a, b);
#else
  simde__m64_private
    r_,
    a_ = simde__m64_to_private(a),
    b_ = simde__m64_to_private(b);

  r_.u64[0] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[0]) * HEDLEY_STATIC_CAST(uint64_t, b_.u32[0]);

  return simde__m64_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_mul_su32(a, b) simde_mm_mul_su32(a, b)
#endif
3675
/* Signed 16-bit multiply returning the HIGH 16 bits of each 32-bit product
 * (SSE2 _mm_mulhi_epi16). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_mulhi_epi16 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_mulhi_epi16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* Widen-multiply each half, then use vuzpq to de-interleave the 16-bit
     * halves of the 32-bit products; val[1] holds the high halves. */
    int16x4_t a3210 = vget_low_s16(a_.neon_i16);
    int16x4_t b3210 = vget_low_s16(b_.neon_i16);
    int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
    int16x4_t a7654 = vget_high_s16(a_.neon_i16);
    int16x4_t b7654 = vget_high_s16(b_.neon_i16);
    int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
    uint16x8x2_t rv = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
    r_.neon_u16 = rv.val[1];
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
      /* Multiply as int32, shift the bits down as unsigned to avoid
       * implementation-defined signed right shift, keep the top half. */
      r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (HEDLEY_STATIC_CAST(uint32_t, HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) >> 16));
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_mulhi_epi16(a, b) simde_mm_mulhi_epi16(a, b)
#endif
3709
/* Unsigned 16-bit multiply returning the HIGH 16 bits of each 32-bit product
 * (SSE2 _mm_mulhi_epu16).  Native path skipped under PGI. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_mulhi_epu16 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
  return _mm_mulhi_epu16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* Same de-interleave trick as mulhi_epi16, with unsigned widening. */
    uint16x4_t a3210 = vget_low_u16(a_.neon_u16);
    uint16x4_t b3210 = vget_low_u16(b_.neon_u16);
    uint32x4_t ab3210 = vmull_u16(a3210, b3210); /* 3333222211110000 */
    uint16x4_t a7654 = vget_high_u16(a_.neon_u16);
    uint16x4_t b7654 = vget_high_u16(b_.neon_u16);
    uint32x4_t ab7654 = vmull_u16(a7654, b7654); /* 7777666655554444 */
    uint16x8x2_t neon_r =
      vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
    r_.neon_u16 = neon_r.val[1];
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
      r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]) >> 16);
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_mulhi_epu16(a, b) simde_mm_mulhi_epu16(a, b)
#endif
3744
3745 SIMDE_FUNCTION_ATTRIBUTES
3746 simde__m128i
simde_mm_mullo_epi16(simde__m128i a,simde__m128i b)3747 simde_mm_mullo_epi16 (simde__m128i a, simde__m128i b) {
3748 #if defined(SIMDE_X86_SSE2_NATIVE)
3749 return _mm_mullo_epi16(a, b);
3750 #else
3751 simde__m128i_private
3752 r_,
3753 a_ = simde__m128i_to_private(a),
3754 b_ = simde__m128i_to_private(b);
3755
3756 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3757 r_.neon_i16 = vmulq_s16(a_.neon_i16, b_.neon_i16);
3758 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
3759 (void) a_;
3760 (void) b_;
3761 r_.altivec_i16 = vec_mul(a_.altivec_i16, b_.altivec_i16);
3762 #else
3763 SIMDE_VECTORIZE
3764 for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
3765 r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]));
3766 }
3767 #endif
3768
3769 return simde__m128i_from_private(r_);
3770 #endif
3771 }
3772 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3773 # define _mm_mullo_epi16(a, b) simde_mm_mullo_epi16(a, b)
3774 #endif
3775
/* Bitwise OR of two double-precision vectors (SSE2 _mm_or_pd).  Operates on
 * the i32f integer view since OR has no floating-point meaning. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_or_pd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_or_pd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.i32f = a_.i32f | b_.i32f;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
      r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
    }
  #endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_or_pd(a, b) simde_mm_or_pd(a, b)
#endif
3802
/* Bitwise OR of two 128-bit integer vectors (SSE2 _mm_or_si128). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_or_si128 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_or_si128(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i32 = vorrq_s32(a_.neon_i32, b_.neon_i32);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_i32 = vec_or(a_.altivec_i32, b_.altivec_i32);
  #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.i32f = a_.i32f | b_.i32f;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
      r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_or_si128(a, b) simde_mm_or_si128(a, b)
#endif
3833
/* Packs 16 signed words (8 from a, then 8 from b) into 16 signed bytes with
 * saturation (SSE2 _mm_packs_epi16).  a fills r[0..7], b fills r[8..15]. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_packs_epi16 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_packs_epi16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i8 = vcombine_s8(vqmovn_s16(a_.neon_i16), vqmovn_s16(b_.neon_i16));
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
      /* Clamp each word to [INT8_MIN, INT8_MAX] before narrowing. */
      r_.i8[i]     = (a_.i16[i] > INT8_MAX) ? INT8_MAX : ((a_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, a_.i16[i]));
      r_.i8[i + 8] = (b_.i16[i] > INT8_MAX) ? INT8_MAX : ((b_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, b_.i16[i]));
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_packs_epi16(a, b) simde_mm_packs_epi16(a, b)
#endif
3861
/* Packs 8 signed dwords (4 from a, then 4 from b) into 8 signed words with
 * saturation (SSE2 _mm_packs_epi32). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_packs_epi32 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_packs_epi32(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i16 = vcombine_s16(vqmovn_s32(a_.neon_i32), vqmovn_s32(b_.neon_i32));
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_i16 = vec_packs(a_.altivec_i32, b_.altivec_i32);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      /* Clamp each dword to [INT16_MIN, INT16_MAX] before narrowing. */
      r_.i16[i]     = (a_.i32[i] > INT16_MAX) ? INT16_MAX : ((a_.i32[i] < INT16_MIN) ? INT16_MIN : HEDLEY_STATIC_CAST(int16_t, a_.i32[i]));
      r_.i16[i + 4] = (b_.i32[i] > INT16_MAX) ? INT16_MAX : ((b_.i32[i] < INT16_MIN) ? INT16_MIN : HEDLEY_STATIC_CAST(int16_t, b_.i32[i]));
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_packs_epi32(a, b) simde_mm_packs_epi32(a, b)
#endif
3891
/* Packs 16 signed words into 16 UNSIGNED bytes with saturation to [0, 255]
 * (SSE2 _mm_packus_epi16); negative inputs clamp to 0. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_packus_epi16 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_packus_epi16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* vqmovun: saturating signed-to-unsigned narrow. */
    r_.neon_u8 = vcombine_u8(vqmovun_s16(a_.neon_i16), vqmovun_s16(b_.neon_i16));
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_u8 = vec_packsu(a_.altivec_i16, b_.altivec_i16);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
      r_.u8[i]     = (a_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((a_.i16[i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, a_.i16[i]));
      r_.u8[i + 8] = (b_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((b_.i16[i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, b_.i16[i]));
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_packus_epi16(a, b) simde_mm_packus_epi16(a, b)
#endif
3921
/* Spin-loop hint (SSE2 _mm_pause).  A no-op on targets without the native
 * intrinsic — the hint is purely a performance aid, never a semantic one. */
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_pause (void) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_pause();
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_pause() (simde_mm_pause())
#endif
3932
/* Sum of absolute differences of unsigned bytes (SSE2 _mm_sad_epu8): each
 * 8-byte half is reduced to one 16-bit sum stored in the low word of the
 * corresponding 64-bit lane (upper bits zero).  Max sum is 8*255 = 2040, so
 * a uint16_t accumulator cannot overflow. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sad_epu8 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_sad_epu8(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* vabdq = |a - b| per byte; vpaddlq pairwise-widens to 16 bits. */
    uint16x8_t t = vpaddlq_u8(vabdq_u8(a_.neon_u8, b_.neon_u8));
    uint16_t r0 = t[0] + t[1] + t[2] + t[3];
    uint16_t r4 = t[4] + t[5] + t[6] + t[7];
    /* Place each half-sum in lane 0 / lane 4 of a zeroed vector. */
    uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
    r_.neon_u16 = vsetq_lane_u16(r4, r, 4);
  #else
    for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
      uint16_t tmp = 0;
      SIMDE_VECTORIZE_REDUCTION(+:tmp)
      for (size_t j = 0 ; j < ((sizeof(r_.u8) / sizeof(r_.u8[0])) / 2) ; j++) {
        const size_t e = j + (i * 8);
        /* Branchy |a - b| keeps the arithmetic in unsigned range. */
        tmp += (a_.u8[e] > b_.u8[e]) ? (a_.u8[e] - b_.u8[e]) : (b_.u8[e] - a_.u8[e]);
      }
      r_.i64[i] = tmp;
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_sad_epu8(a, b) simde_mm_sad_epu8(a, b)
#endif
3968
/* Builds a vector from 16 bytes; e15 is the MOST significant element
 * (SSE2 _mm_set_epi8 argument order — note fallbacks store e0 first
 * since element 0 is least significant). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set_epi8 (int8_t e15, int8_t e14, int8_t e13, int8_t e12,
       int8_t e11, int8_t e10, int8_t  e9, int8_t  e8,
       int8_t  e7, int8_t  e6, int8_t  e5, int8_t  e4,
       int8_t  e3, int8_t  e2, int8_t  e1, int8_t  e0) {

#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_set_epi8(
    e15, e14, e13, e12, e11, e10,  e9,  e8,
     e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0);
#else
  simde__m128i_private r_;

  #if defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_i8x16_make(
       e0,  e1,  e2,  e3,  e4,  e5,  e6,  e7,
       e8,  e9, e10, e11, e12, e13, e14, e15);
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    SIMDE_ALIGN_AS(16, int8x16_t) int8_t data[16] = {
      e0,  e1,  e2,  e3,
      e4,  e5,  e6,  e7,
      e8,  e9,  e10, e11,
      e12, e13, e14, e15};
    r_.neon_i8 = vld1q_s8(data);
  #else
    r_.i8[ 0] =  e0;
    r_.i8[ 1] =  e1;
    r_.i8[ 2] =  e2;
    r_.i8[ 3] =  e3;
    r_.i8[ 4] =  e4;
    r_.i8[ 5] =  e5;
    r_.i8[ 6] =  e6;
    r_.i8[ 7] =  e7;
    r_.i8[ 8] =  e8;
    r_.i8[ 9] =  e9;
    r_.i8[10] = e10;
    r_.i8[11] = e11;
    r_.i8[12] = e12;
    r_.i8[13] = e13;
    r_.i8[14] = e14;
    r_.i8[15] = e15;
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)
#endif
4019
/* Builds a vector from 8 words; e7 is the most significant element
 * (SSE2 _mm_set_epi16). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set_epi16 (int16_t e7, int16_t e6, int16_t e5, int16_t e4,
        int16_t e3, int16_t e2, int16_t e1, int16_t e0) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
#else
  simde__m128i_private r_;

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    SIMDE_ALIGN_AS(16, int16x8_t) int16_t data[8] = { e0, e1, e2, e3, e4, e5, e6, e7 };
    r_.neon_i16 = vld1q_s16(data);
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_i16x8_make(e0, e1, e2, e3, e4, e5, e6, e7);
  #else
    r_.i16[0] = e0;
    r_.i16[1] = e1;
    r_.i16[2] = e2;
    r_.i16[3] = e3;
    r_.i16[4] = e4;
    r_.i16[5] = e5;
    r_.i16[6] = e6;
    r_.i16[7] = e7;
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0)
#endif
4051
/* Builds a vector from 4 dwords; e3 is the most significant element
 * (SSE2 _mm_set_epi32). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set_epi32 (int32_t e3, int32_t e2, int32_t e1, int32_t e0) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_set_epi32(e3, e2, e1, e0);
#else
  simde__m128i_private r_;

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    SIMDE_ALIGN_AS(16, int32x4_t) int32_t data[4] = { e0, e1, e2, e3 };
    r_.neon_i32 = vld1q_s32(data);
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_i32x4_make(e0, e1, e2, e3);
  #else
    r_.i32[0] = e0;
    r_.i32[1] = e1;
    r_.i32[2] = e2;
    r_.i32[3] = e3;
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_set_epi32(e3, e2, e1, e0) simde_mm_set_epi32(e3, e2, e1, e0)
#endif
4078
/* Builds a vector from two __m64 halves; e1 is the upper half
 * (SSE2 _mm_set_epi64). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set_epi64 (simde__m64 e1, simde__m64 e0) {
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  return _mm_set_epi64(e1, e0);
#else
  simde__m128i_private r_;

  r_.m64_private[0] = simde__m64_to_private(e0);
  r_.m64_private[1] = simde__m64_to_private(e1);

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_set_epi64(e1, e0) (simde_mm_set_epi64((e1), (e0)))
#endif
4096
/* Builds a vector from two int64 values; e1 is the upper lane.  The native
 * intrinsic is missing from 32-bit MSVC before VS 2015, hence the version
 * check. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set_epi64x (int64_t e1, int64_t e0) {
#if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0))
  return _mm_set_epi64x(e1, e0);
#else
  simde__m128i_private r_;

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    SIMDE_ALIGN_AS(16, int64x2_t) int64_t data[2] = {e0, e1};
    r_.neon_i64 = vld1q_s64(data);
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_i64x2_make(e0, e1);
  #else
    r_.i64[0] = e0;
    r_.i64[1] = e1;
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_set_epi64x(e1, e0) simde_mm_set_epi64x(e1, e0)
#endif
4121
/* SIMDe extension: unsigned-byte variant of _mm_set_epi8.  The native path
 * casts to char because _mm_set_epi8 takes (signed) char arguments; the bit
 * pattern is preserved either way. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_set_epu8 (uint8_t e15, uint8_t e14, uint8_t e13, uint8_t e12,
         uint8_t e11, uint8_t e10, uint8_t  e9, uint8_t  e8,
         uint8_t  e7, uint8_t  e6, uint8_t  e5, uint8_t  e4,
         uint8_t  e3, uint8_t  e2, uint8_t  e1, uint8_t  e0) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_set_epi8(
    HEDLEY_STATIC_CAST(char, e15), HEDLEY_STATIC_CAST(char, e14), HEDLEY_STATIC_CAST(char, e13), HEDLEY_STATIC_CAST(char, e12),
    HEDLEY_STATIC_CAST(char, e11), HEDLEY_STATIC_CAST(char, e10), HEDLEY_STATIC_CAST(char,  e9), HEDLEY_STATIC_CAST(char,  e8),
    HEDLEY_STATIC_CAST(char,  e7), HEDLEY_STATIC_CAST(char,  e6), HEDLEY_STATIC_CAST(char,  e5), HEDLEY_STATIC_CAST(char,  e4),
    HEDLEY_STATIC_CAST(char,  e3), HEDLEY_STATIC_CAST(char,  e2), HEDLEY_STATIC_CAST(char,  e1), HEDLEY_STATIC_CAST(char,  e0));
#else
  simde__m128i_private r_;

  r_.u8[ 0] =  e0; r_.u8[ 1] =  e1; r_.u8[ 2] =  e2; r_.u8[ 3] =  e3;
  r_.u8[ 4] =  e4; r_.u8[ 5] =  e5; r_.u8[ 6] =  e6; r_.u8[ 7] =  e7;
  r_.u8[ 8] =  e8; r_.u8[ 9] =  e9; r_.u8[10] = e10; r_.u8[11] = e11;
  r_.u8[12] = e12; r_.u8[13] = e13; r_.u8[14] = e14; r_.u8[15] = e15;

  return simde__m128i_from_private(r_);
#endif
}
4145
/* SIMDe extension: unsigned-word variant of _mm_set_epi16 (bit pattern
 * preserved through the short casts on the native path). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_set_epu16 (uint16_t e7, uint16_t e6, uint16_t e5, uint16_t e4,
          uint16_t e3, uint16_t e2, uint16_t e1, uint16_t e0) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_set_epi16(
    HEDLEY_STATIC_CAST(short, e7), HEDLEY_STATIC_CAST(short, e6), HEDLEY_STATIC_CAST(short, e5), HEDLEY_STATIC_CAST(short, e4),
    HEDLEY_STATIC_CAST(short, e3), HEDLEY_STATIC_CAST(short, e2), HEDLEY_STATIC_CAST(short, e1), HEDLEY_STATIC_CAST(short, e0));
#else
  simde__m128i_private r_;

  r_.u16[0] = e0; r_.u16[1] = e1; r_.u16[2] = e2; r_.u16[3] = e3;
  r_.u16[4] = e4; r_.u16[5] = e5; r_.u16[6] = e6; r_.u16[7] = e7;

  return simde__m128i_from_private(r_);
#endif
}
4163
/* SIMDe extension: unsigned-dword variant of _mm_set_epi32. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_set_epu32 (uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_set_epi32(
    HEDLEY_STATIC_CAST(int, e3), HEDLEY_STATIC_CAST(int, e2), HEDLEY_STATIC_CAST(int, e1), HEDLEY_STATIC_CAST(int, e0));
#else
  simde__m128i_private r_;

  r_.u32[0] = e0;
  r_.u32[1] = e1;
  r_.u32[2] = e2;
  r_.u32[3] = e3;

  return simde__m128i_from_private(r_);
#endif
}
4181
/* SIMDe extension: unsigned-qword variant of _mm_set_epi64x; same MSVC
 * version gate as simde_mm_set_epi64x. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_set_epu64x (uint64_t e1, uint64_t e0) {
#if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0))
  return _mm_set_epi64x(HEDLEY_STATIC_CAST(int64_t, e1), HEDLEY_STATIC_CAST(int64_t, e0));
#else
  simde__m128i_private r_;

  r_.u64[0] = e0;
  r_.u64[1] = e1;

  return simde__m128i_from_private(r_);
#endif
}
4196
4197 SIMDE_FUNCTION_ATTRIBUTES
4198 simde__m128d
simde_mm_set_pd(simde_float64 e1,simde_float64 e0)4199 simde_mm_set_pd (simde_float64 e1, simde_float64 e0) {
4200 #if defined(SIMDE_X86_SSE2_NATIVE)
4201 return _mm_set_pd(e1, e0);
4202 #else
4203 simde__m128d_private r_;
4204
4205 #if defined(SIMDE_WASM_SIMD128_NATIVE)
4206 r_.wasm_v128 = wasm_f64x2_make(e0, e1);
4207 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4208 r_.wasm_v128 = wasm_f64x2_make(e0, e1);
4209 #else
4210 r_.f64[0] = e0;
4211 r_.f64[1] = e1;
4212 #endif
4213
4214 return simde__m128d_from_private(r_);
4215 #endif
4216 }
4217 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4218 # define _mm_set_pd(e1, e0) simde_mm_set_pd(e1, e0)
4219 #endif
4220
4221 SIMDE_FUNCTION_ATTRIBUTES
4222 simde__m128d
simde_mm_set_pd1(simde_float64 a)4223 simde_mm_set_pd1 (simde_float64 a) {
4224 #if defined(SIMDE_X86_SSE2_NATIVE)
4225 return _mm_set1_pd(a);
4226 #else
4227 simde__m128d_private r_;
4228
4229 r_.f64[0] = a;
4230 r_.f64[1] = a;
4231
4232 return simde__m128d_from_private(r_);
4233 #endif
4234 }
4235 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4236 # define _mm_set_pd1(a) simde_mm_set1_pd(a)
4237 #endif
4238
/* Sets the low lane to a and zeroes the upper lane (SSE2 _mm_set_sd). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_set_sd (simde_float64 a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_set_sd(a);
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  return vsetq_lane_f64(a, vdupq_n_f64(SIMDE_FLOAT64_C(0.0)), 0);
#else
  /* set_pd takes (high, low), so zero goes first. */
  return simde_mm_set_pd(SIMDE_FLOAT64_C(0.0), a);

#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_set_sd(a) simde_mm_set_sd(a)
#endif
4254
/* Broadcasts one byte to all 16 lanes (SSE2 _mm_set1_epi8). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set1_epi8 (int8_t a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_set1_epi8(a);
#else
  simde__m128i_private r_;

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i8 = vdupq_n_s8(a);
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_i8x16_splat(a);
  #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    r_.altivec_i8 = vec_splats(HEDLEY_STATIC_CAST(signed char, a));
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
      r_.i8[i] = a;
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_set1_epi8(a) simde_mm_set1_epi8(a)
#endif
4282
/* Broadcasts one word to all 8 lanes (SSE2 _mm_set1_epi16). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set1_epi16 (int16_t a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_set1_epi16(a);
#else
  simde__m128i_private r_;

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i16 = vdupq_n_s16(a);
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_i16x8_splat(a);
  #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    r_.altivec_i16 = vec_splats(HEDLEY_STATIC_CAST(signed short, a));
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
      r_.i16[i] = a;
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_set1_epi16(a) simde_mm_set1_epi16(a)
#endif
4310
/* Broadcasts one dword to all 4 lanes (SSE2 _mm_set1_epi32). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set1_epi32 (int32_t a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_set1_epi32(a);
#else
  simde__m128i_private r_;

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i32 = vdupq_n_s32(a);
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_i32x4_splat(a);
  #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    r_.altivec_i32 = vec_splats(HEDLEY_STATIC_CAST(signed int, a));
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.i32[i] = a;
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_set1_epi32(a) simde_mm_set1_epi32(a)
#endif
4338
/* Broadcasts one qword to both lanes (SSE2 _mm_set1_epi64x); same MSVC
 * version gate as simde_mm_set_epi64x. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set1_epi64x (int64_t a) {
#if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0))
  return _mm_set1_epi64x(a);
#else
  simde__m128i_private r_;

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i64 = vmovq_n_s64(a);
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_i64x2_splat(a);
  #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    r_.altivec_i64 = vec_splats(HEDLEY_STATIC_CAST(signed long long, a));
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
      r_.i64[i] = a;
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_set1_epi64x(a) simde_mm_set1_epi64x(a)
#endif
4366
/* Broadcasts an __m64 to both 64-bit lanes (SSE2 _mm_set1_epi64). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_set1_epi64 (simde__m64 a) {
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  return _mm_set1_epi64(a);
#else
  simde__m64_private a_ = simde__m64_to_private(a);
  return simde_mm_set1_epi64x(a_.i64[0]);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_set1_epi64(a) simde_mm_set1_epi64(a)
#endif
4380
/* SIMDe extension: broadcast an unsigned byte to all 16 lanes.  AltiVec gets
 * a direct splat; other targets reuse set1_epi8 (bit pattern preserved). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_set1_epu8 (uint8_t value) {
#if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
  return simde__m128i_from_altivec_u8(vec_splats(HEDLEY_STATIC_CAST(unsigned char, value)));
#else
  return simde_mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, value));
#endif
}
4390
/* SIMDe extension: broadcast an unsigned word to all 8 lanes. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_set1_epu16 (uint16_t value) {
#if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
  return simde__m128i_from_altivec_u16(vec_splats(HEDLEY_STATIC_CAST(unsigned short, value)));
#else
  return simde_mm_set1_epi16(HEDLEY_STATIC_CAST(int16_t, value));
#endif
}
4400
/* SIMDe extension: broadcast an unsigned dword to all 4 lanes. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_set1_epu32 (uint32_t value) {
#if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
  return simde__m128i_from_altivec_u32(vec_splats(HEDLEY_STATIC_CAST(unsigned int, value)));
#else
  return simde_mm_set1_epi32(HEDLEY_STATIC_CAST(int32_t, value));
#endif
}
4410
/* SIMDe extension: broadcast an unsigned qword to both lanes. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_set1_epu64 (uint64_t value) {
#if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
  return simde__m128i_from_altivec_u64(vec_splats(HEDLEY_STATIC_CAST(unsigned long long, value)));
#else
  return simde_mm_set1_epi64x(HEDLEY_STATIC_CAST(int64_t, value));
#endif
}
4420
4421 SIMDE_FUNCTION_ATTRIBUTES
4422 simde__m128d
simde_mm_set1_pd(simde_float64 a)4423 simde_mm_set1_pd (simde_float64 a) {
4424 #if defined(SIMDE_X86_SSE2_NATIVE)
4425 return _mm_set1_pd(a);
4426 #else
4427 simde__m128d_private r_;
4428
4429 #if defined(SIMDE_WASM_SIMD128_NATIVE)
4430 r_.wasm_v128 = wasm_f64x2_splat(a);
4431 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4432 r_.altivec_f64 = vec_splats(HEDLEY_STATIC_CAST(double, a));
4433 #else
4434 SIMDE_VECTORIZE
4435 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
4436 r_.f64[i] = a;
4437 }
4438 #endif
4439
4440 return simde__m128d_from_private(r_);
4441 #endif
4442 }
4443 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4444 # define _mm_set1_pd(a) simde_mm_set1_pd(a)
4445 #endif
4446
/* Reverse-order byte set (SSE2 _mm_setr_epi8): the FIRST argument becomes
 * element 0, so the fallback simply forwards to set_epi8 with the argument
 * list reversed. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_setr_epi8 (int8_t e15, int8_t e14, int8_t e13, int8_t e12,
        int8_t e11, int8_t e10, int8_t  e9, int8_t  e8,
        int8_t  e7, int8_t  e6, int8_t  e5, int8_t  e4,
        int8_t  e3, int8_t  e2, int8_t  e1, int8_t  e0) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_setr_epi8(
    e15, e14, e13, e12, e11, e10,  e9,    e8,
    e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0);
#else
  return simde_mm_set_epi8(
    e0, e1, e2, e3, e4, e5, e6, e7,
    e8, e9, e10, e11, e12, e13, e14, e15);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)
#endif
4466
/* Reverse-order word set (SSE2 _mm_setr_epi16): first argument → element 0. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_setr_epi16 (int16_t e7, int16_t e6, int16_t e5, int16_t e4,
         int16_t e3, int16_t e2, int16_t e1, int16_t e0) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
#else
  return simde_mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0)
#endif
4480
/* Reverse-order dword set (SSE2 _mm_setr_epi32): first argument → element 0. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_setr_epi32 (int32_t e3, int32_t e2, int32_t e1, int32_t e0) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_setr_epi32(e3, e2, e1, e0);
#else
  return simde_mm_set_epi32(e0, e1, e2, e3);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_setr_epi32(e3, e2, e1, e0) simde_mm_setr_epi32(e3, e2, e1, e0)
#endif
4493
/* Reverse-order __m64 set (SSE2 _mm_setr_epi64): e1 becomes the LOW half. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_setr_epi64 (simde__m64 e1, simde__m64 e0) {
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  return _mm_setr_epi64(e1, e0);
#else
  return simde_mm_set_epi64(e0, e1);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_setr_epi64(e1, e0) (simde_mm_setr_epi64((e1), (e0)))
#endif
4506
/* Sets the two double lanes in "reversed" (memory) order: e1 lands in
 * lane 0. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_setr_pd (simde_float64 e1, simde_float64 e0) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_setr_pd(e1, e0);
#else
  return simde_mm_set_pd(e0, e1);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_setr_pd(e1, e0) simde_mm_setr_pd(e1, e0)
#endif
4519
/* Returns a vector of two doubles with all bits zero.  The portable
 * path builds an all-zero integer vector and reinterprets the bits,
 * which yields +0.0 in both lanes. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_setzero_pd (void) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_setzero_pd();
#else
  return simde_mm_castsi128_pd(simde_mm_setzero_si128());
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_setzero_pd() simde_mm_setzero_pd()
#endif
4532
4533 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
4534 HEDLEY_DIAGNOSTIC_PUSH
4535 SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
4536 #endif
4537
/* Returns a vector with unspecified contents (the _mm_undefined_pd
 * contract).  When the compiler lets us suppress the uninitialized-read
 * diagnostic, r_ is deliberately left uninitialized; otherwise it is
 * zeroed so builds without the suppression stay warning-free and
 * deterministic. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_undefined_pd (void) {
  simde__m128d_private r_;

#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
  r_.n = _mm_undefined_pd();
#elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
  r_ = simde__m128d_to_private(simde_mm_setzero_pd());
#endif

  return simde__m128d_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_undefined_pd() simde_mm_undefined_pd()
#endif
4554
/* Integer counterpart of simde_mm_undefined_pd: unspecified contents,
 * left uninitialized only when the diagnostic suppression is available,
 * otherwise zero-filled. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_undefined_si128 (void) {
  simde__m128i_private r_;

#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
  r_.n = _mm_undefined_si128();
#elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
  r_ = simde__m128i_to_private(simde_mm_setzero_si128());
#endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_undefined_si128() (simde_mm_undefined_si128())
#endif
4571
4572 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
4573 HEDLEY_DIAGNOSTIC_POP
4574 #endif
4575
/* SIMDe extension (no Intel equivalent): all bits set, viewed as two
 * doubles.  Reuses the single-precision all-ones helper via a bit cast. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_x_mm_setone_pd (void) {
  return simde_mm_castps_pd(simde_x_mm_setone_ps());
}
4581
/* SIMDe extension (no Intel equivalent): 128-bit vector with every bit
 * set, obtained by bit-casting the all-ones float vector. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_setone_si128 (void) {
  return simde_mm_castps_si128(simde_x_mm_setone_ps());
}
4587
/* Permutes the four 32-bit lanes of `a`: each 2-bit field of imm8
 * selects the source lane for the corresponding result lane.  When a
 * native or generic shuffle is available the function is shadowed by a
 * macro below so imm8 stays a compile-time constant. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_shuffle_epi32 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
    /* bits (2i+1, 2i) of imm8 pick the source lane for result lane i */
    r_.i32[i] = a_.i32[(imm8 >> (i * 2)) & 3];
  }

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#  define simde_mm_shuffle_epi32(a, imm8) _mm_shuffle_epi32((a), (imm8))
#elif defined(SIMDE_SHUFFLE_VECTOR_)
#  define simde_mm_shuffle_epi32(a, imm8) (__extension__ ({ \
      const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
      simde__m128i_from_private((simde__m128i_private) { .i32 = \
        SIMDE_SHUFFLE_VECTOR_(32, 16, \
          (simde__tmp_a_).i32, \
          (simde__tmp_a_).i32, \
          ((imm8)     ) & 3, \
          ((imm8) >> 2) & 3, \
          ((imm8) >> 4) & 3, \
          ((imm8) >> 6) & 3) }); }))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_shuffle_epi32(a, imm8) simde_mm_shuffle_epi32(a, imm8)
#endif
4619
/* Selects result lane 0 from `a` and result lane 1 from `b`, each chosen
 * by one bit of imm8 (bit 0 -> which lane of a, bit 1 -> which lane of b).
 * The native macro is skipped for PGI, which miscompiles it. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_shuffle_pd (simde__m128d a, simde__m128d b, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 3) {
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  r_.f64[0] = ((imm8 & 1) == 0) ? a_.f64[0] : a_.f64[1];
  r_.f64[1] = ((imm8 & 2) == 0) ? b_.f64[0] : b_.f64[1];

  return simde__m128d_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
#  define simde_mm_shuffle_pd(a, b, imm8) _mm_shuffle_pd((a), (b), (imm8))
#elif defined(SIMDE_SHUFFLE_VECTOR_)
#  define simde_mm_shuffle_pd(a, b, imm8) (__extension__ ({ \
      simde__m128d_from_private((simde__m128d_private) { .f64 = \
        SIMDE_SHUFFLE_VECTOR_(64, 16, \
          simde__m128d_to_private(a).f64, \
          simde__m128d_to_private(b).f64, \
          (((imm8)     ) & 1), \
          (((imm8) >> 1) & 1) + 2) }); }))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_shuffle_pd(a, b, imm8) simde_mm_shuffle_pd(a, b, imm8)
#endif
4648
/* Copies the low four 16-bit lanes unchanged and permutes the high four
 * lanes: each 2-bit field of imm8 selects among lanes 4..7. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_shufflehi_epi16 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  /* lanes 0..3: pass-through */
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i++) {
    r_.i16[i] = a_.i16[i];
  }
  /* lanes 4..7: selected from the high half by imm8 */
  for (size_t i = ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
    r_.i16[i] = a_.i16[((imm8 >> ((i - 4) * 2)) & 3) + 4];
  }

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#  define simde_mm_shufflehi_epi16(a, imm8) _mm_shufflehi_epi16((a), (imm8))
#elif defined(SIMDE_SHUFFLE_VECTOR_)
#  define simde_mm_shufflehi_epi16(a, imm8) (__extension__ ({ \
      const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
      simde__m128i_from_private((simde__m128i_private) { .i16 = \
        SIMDE_SHUFFLE_VECTOR_(16, 16, \
          (simde__tmp_a_).i16, \
          (simde__tmp_a_).i16, \
          0, 1, 2, 3, \
          (((imm8)     ) & 3) + 4, \
          (((imm8) >> 2) & 3) + 4, \
          (((imm8) >> 4) & 3) + 4, \
          (((imm8) >> 6) & 3) + 4) }); }))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_shufflehi_epi16(a, imm8) simde_mm_shufflehi_epi16(a, imm8)
#endif
4685
/* Mirror of shufflehi: permutes the low four 16-bit lanes by the 2-bit
 * fields of imm8 and copies the high four lanes unchanged. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_shufflelo_epi16 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  /* lanes 0..3: selected from the low half by imm8 */
  for (size_t i = 0 ; i < ((sizeof(r_.i16) / sizeof(r_.i16[0])) / 2) ; i++) {
    r_.i16[i] = a_.i16[((imm8 >> (i * 2)) & 3)];
  }
  /* lanes 4..7: pass-through */
  SIMDE_VECTORIZE
  for (size_t i = ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
    r_.i16[i] = a_.i16[i];
  }

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#  define simde_mm_shufflelo_epi16(a, imm8) _mm_shufflelo_epi16((a), (imm8))
#elif defined(SIMDE_SHUFFLE_VECTOR_)
#  define simde_mm_shufflelo_epi16(a, imm8) (__extension__ ({ \
      const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
      simde__m128i_from_private((simde__m128i_private) { .i16 = \
        SIMDE_SHUFFLE_VECTOR_(16, 16, \
          (simde__tmp_a_).i16, \
          (simde__tmp_a_).i16, \
          (((imm8)     ) & 3), \
          (((imm8) >> 2) & 3), \
          (((imm8) >> 4) & 3), \
          (((imm8) >> 6) & 3), \
          4, 5, 6, 7) }); }))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_shufflelo_epi16(a, imm8) simde_mm_shufflelo_epi16(a, imm8)
#endif
4722
/* Logical left shift of each 16-bit lane by the count held in the low
 * 64 bits of `count`.  Counts greater than 15 zero the whole vector, so
 * the actual shift amount is always in range. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sll_epi16 (simde__m128i a, simde__m128i count) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_sll_epi16(a, count);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    count_ = simde__m128i_to_private(count);

  /* Per Intel semantics, any count >= 16 produces zero. */
  if (count_.u64[0] > 15)
    return simde_mm_setzero_si128();

#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  r_.u16 = (a_.u16 << count_.u64[0]);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
    r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (a_.u16[i] << count_.u64[0]));
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_sll_epi16(a, count) simde_mm_sll_epi16((a), (count))
#endif
4752
/* Logical left shift of each 32-bit lane by the count in the low 64 bits
 * of `count`; counts greater than 31 zero the vector. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sll_epi32 (simde__m128i a, simde__m128i count) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_sll_epi32(a, count);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    count_ = simde__m128i_to_private(count);

  /* Per Intel semantics, any count >= 32 produces zero. */
  if (count_.u64[0] > 31)
    return simde_mm_setzero_si128();

#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  r_.u32 = (a_.u32 << count_.u64[0]);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
    r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (a_.u32[i] << count_.u64[0]));
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_sll_epi32(a, count) (simde_mm_sll_epi32(a, (count)))
#endif
4782
/* Logical left shift of each 64-bit lane by the count in the low 64 bits
 * of `count`; counts greater than 63 zero the vector.  The vectorize
 * pragma is suppressed where GCC bug 94488 miscompiles 64-bit shifts. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sll_epi64 (simde__m128i a, simde__m128i count) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_sll_epi64(a, count);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    count_ = simde__m128i_to_private(count);

  /* Per Intel semantics, any count >= 64 produces zero. */
  if (count_.u64[0] > 63)
    return simde_mm_setzero_si128();

  const int_fast16_t s = HEDLEY_STATIC_CAST(int_fast16_t, count_.u64[0]);
#if !defined(SIMDE_BUG_GCC_94488)
  SIMDE_VECTORIZE
#endif
  for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
    r_.u64[i] = a_.u64[i] << s;
  }

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_sll_epi64(a, count) (simde_mm_sll_epi64(a, (count)))
#endif
4811
/* Square root of both double lanes.  Uses AArch64 NEON when available,
 * otherwise a scalar loop over simde_math_sqrt; if no sqrt is available
 * at all this configuration is unreachable by construction. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_sqrt_pd (simde__m128d a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_sqrt_pd(a);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a);

#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  r_.neon_f64 = vsqrtq_f64(a_.neon_f64);
#elif defined(simde_math_sqrt)
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
    r_.f64[i] = simde_math_sqrt(a_.f64[i]);
  }
#else
  HEDLEY_UNREACHABLE();
#endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_sqrt_pd(a) simde_mm_sqrt_pd(a)
#endif
4839
/* Lane 0 = sqrt(b[0]), lane 1 copied from a[1] — the scalar-sqrt SSE2
 * semantics.  When the compiler auto-vectorizes well, computing the full
 * sqrt_pd and merging via move_sd is preferred. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_sqrt_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_sqrt_sd(a, b);
#elif defined(SIMDE_ASSUME_VECTORIZATION)
  return simde_mm_move_sd(a, simde_mm_sqrt_pd(b));
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

#if defined(simde_math_sqrt)
  r_.f64[0] = simde_math_sqrt(b_.f64[0]);
  r_.f64[1] = a_.f64[1];
#else
  HEDLEY_UNREACHABLE();
#endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_sqrt_sd(a, b) simde_mm_sqrt_sd(a, b)
#endif
4866
4867 SIMDE_FUNCTION_ATTRIBUTES
4868 simde__m128i
simde_mm_srl_epi16(simde__m128i a,simde__m128i count)4869 simde_mm_srl_epi16 (simde__m128i a, simde__m128i count) {
4870 #if defined(SIMDE_X86_SSE2_NATIVE)
4871 return _mm_srl_epi16(a, count);
4872 #else
4873 simde__m128i_private
4874 r_,
4875 a_ = simde__m128i_to_private(a),
4876 count_ = simde__m128i_to_private(count);
4877
4878 const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 16 ? 16 : count_.i64[0]));
4879
4880 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4881 r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
4882 #else
4883 SIMDE_VECTORIZE
4884 for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
4885 r_.u16[i] = a_.u16[i] >> cnt;
4886 }
4887 #endif
4888
4889 return simde__m128i_from_private(r_);
4890 #endif
4891 }
4892 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4893 #define _mm_srl_epi16(a, count) (simde_mm_srl_epi16(a, (count)))
4894 #endif
4895
4896 SIMDE_FUNCTION_ATTRIBUTES
4897 simde__m128i
simde_mm_srl_epi32(simde__m128i a,simde__m128i count)4898 simde_mm_srl_epi32 (simde__m128i a, simde__m128i count) {
4899 #if defined(SIMDE_X86_SSE2_NATIVE)
4900 return _mm_srl_epi32(a, count);
4901 #else
4902 simde__m128i_private
4903 r_,
4904 a_ = simde__m128i_to_private(a),
4905 count_ = simde__m128i_to_private(count);
4906
4907 const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 32 ? 32 : count_.i64[0]));
4908
4909 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4910 r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt)));
4911 #else
4912 SIMDE_VECTORIZE
4913 for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
4914 r_.u32[i] = a_.u32[i] >> cnt;
4915 }
4916 #endif
4917
4918 return simde__m128i_from_private(r_);
4919 #endif
4920 }
4921 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4922 # define _mm_srl_epi32(a, count) (simde_mm_srl_epi32(a, (count)))
4923 #endif
4924
4925 SIMDE_FUNCTION_ATTRIBUTES
4926 simde__m128i
simde_mm_srl_epi64(simde__m128i a,simde__m128i count)4927 simde_mm_srl_epi64 (simde__m128i a, simde__m128i count) {
4928 #if defined(SIMDE_X86_SSE2_NATIVE)
4929 return _mm_srl_epi64(a, count);
4930 #else
4931 simde__m128i_private
4932 r_,
4933 a_ = simde__m128i_to_private(a),
4934 count_ = simde__m128i_to_private(count);
4935
4936 const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 64 ? 64 : count_.i64[0]));
4937
4938 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4939 r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, -cnt)));
4940 #else
4941 #if !defined(SIMDE_BUG_GCC_94488)
4942 SIMDE_VECTORIZE
4943 #endif
4944 for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
4945 r_.u64[i] = a_.u64[i] >> cnt;
4946 }
4947 #endif
4948
4949 return simde__m128i_from_private(r_);
4950 #endif
4951 }
4952 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4953 # define _mm_srl_epi64(a, count) (simde_mm_srl_epi64(a, (count)))
4954 #endif
4955
/* Arithmetic right shift of each 16-bit lane by immediate imm8; counts
 * of 16 or more saturate to 15 (every lane becomes its sign). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srai_epi16 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  /* MSVC requires a range of (0, 255). */
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  /* any bits above 15 set -> clamp the shift to 15 */
  const int cnt = (imm8 & ~15) ? 15 : imm8;

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) {
    r_.i16[i] = a_.i16[i] >> cnt;
  }
#endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#define simde_mm_srai_epi16(a, imm8) _mm_srai_epi16((a), (imm8))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#define _mm_srai_epi16(a, imm8) simde_mm_srai_epi16(a, imm8)
#endif
4984
/* Arithmetic right shift of each 32-bit lane by immediate imm8; counts
 * of 32 or more saturate to 31. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srai_epi32 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  /* MSVC requires a range of (0, 255). */
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  /* any bits above 31 set -> clamp the shift to 31 */
  const int cnt = (imm8 & ~31) ? 31 : imm8;

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(-cnt));
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i32[0])) ; i++) {
    r_.i32[i] = a_.i32[i] >> cnt;
  }
#endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#define simde_mm_srai_epi32(a, imm8) _mm_srai_epi32((a), (imm8))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#define _mm_srai_epi32(a, imm8) simde_mm_srai_epi32(a, imm8)
#endif
5013
5014 SIMDE_FUNCTION_ATTRIBUTES
5015 simde__m128i
simde_mm_sra_epi16(simde__m128i a,simde__m128i count)5016 simde_mm_sra_epi16 (simde__m128i a, simde__m128i count) {
5017 #if defined(SIMDE_X86_SSE2_NATIVE)
5018 return _mm_sra_epi16(a, count);
5019 #else
5020 simde__m128i_private
5021 r_,
5022 a_ = simde__m128i_to_private(a),
5023 count_ = simde__m128i_to_private(count);
5024
5025 const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 15 ? 15 : count_.i64[0]));
5026
5027 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5028 r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
5029 #else
5030 SIMDE_VECTORIZE
5031 for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
5032 r_.i16[i] = a_.i16[i] >> cnt;
5033 }
5034 #endif
5035
5036 return simde__m128i_from_private(r_);
5037 #endif
5038 }
5039 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5040 # define _mm_sra_epi16(a, count) (simde_mm_sra_epi16(a, count))
5041 #endif
5042
/* Arithmetic right shift of each 32-bit lane by the count in the low
 * 64 bits of `count`; counts of 32 or more saturate to 31 (all-sign).
 * The native path is skipped where GCC miscompiles _mm_sra_epi32. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sra_epi32 (simde__m128i a, simde__m128i count) {
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_SRA_EPI32)
  return _mm_sra_epi32(a, count);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    count_ = simde__m128i_to_private(count);

  /* unsigned compare: any out-of-range count (including "negative"
   * bit patterns) clamps to 31 */
  const int cnt = count_.u64[0] > 31 ? 31 : HEDLEY_STATIC_CAST(int, count_.u64[0]);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt)));
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
    r_.i32[i] = a_.i32[i] >> cnt;
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_sra_epi32(a, count) (simde_mm_sra_epi32(a, (count)))
#endif
5071
/* Left shift of each 16-bit lane by immediate imm8; imm8 > 15 yields
 * zero.  The function is shadowed by macros for native SSE2, NEON, and
 * AltiVec builds so imm8 remains a compile-time constant there. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_slli_epi16 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  if (HEDLEY_UNLIKELY((imm8 > 15))) {
    return simde_mm_setzero_si128();
  }

  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  r_.i16 = a_.i16 << (imm8 & 0xff);
#else
  /* defensive re-clamp; imm8 is already <= 15 here */
  const int s = (imm8 > HEDLEY_STATIC_CAST(int, sizeof(r_.i16[0]) * CHAR_BIT) - 1) ? 0 : imm8;
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
    r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << s);
  }
#endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#  define simde_mm_slli_epi16(a, imm8) _mm_slli_epi16(a, imm8)
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__)
#  define simde_mm_slli_epi16(a, imm8) \
  simde__m128i_from_neon_u16(vshlq_n_u16(simde__m128i_to_neon_u16(a), (imm8)))
// The above is allowed by gcc/g++ 9 with -march=armv8-a, might work on A32V8 and elsewhere but needs testing
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) // clang can't handle the potential out of range use of imm8 even though that is handled
#  define simde_mm_slli_epi16(a, imm8) \
  ({ \
    simde__m128i ret; \
    if ((imm8) <= 0) { \
      ret = a; \
    } else if ((imm8) > 15) { \
      ret = simde_mm_setzero_si128(); \
    } else { \
      ret = simde__m128i_from_neon_i16( \
        vshlq_n_s16(simde__m128i_to_neon_i16(a), (imm8))); \
    } \
    ret; \
  })
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
#define simde_mm_slli_epi16(a, imm8) \
  ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sl(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8)))))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_slli_epi16(a, imm8) simde_mm_slli_epi16(a, imm8)
#endif
5123
/* Left shift of each 32-bit lane by immediate imm8; imm8 > 31 yields
 * zero.  Shadowed by macros for native SSE2, NEON, and AltiVec builds. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_slli_epi32 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  if (HEDLEY_UNLIKELY((imm8 > 31))) {
    return simde_mm_setzero_si128();
  }
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  r_.i32 = a_.i32 << imm8;
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
    r_.i32[i] = a_.i32[i] << (imm8 & 0xff);
  }
#endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#  define simde_mm_slli_epi32(a, imm8) _mm_slli_epi32(a, imm8)
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__)
#  define simde_mm_slli_epi32(a, imm8) \
  simde__m128i_from_neon_u32(vshlq_n_u32(simde__m128i_to_neon_u32(a), (imm8)))
// The above is allowed by gcc/g++ 9 with -march=armv8-a, might work on A32V8 and elsewhere but needs testing
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) // clang can't handle the potential out of range use of imm8 even though that is handled
#  define simde_mm_slli_epi32(a, imm8) \
  ({ \
    simde__m128i ret; \
    if ((imm8) <= 0) { \
      ret = a; \
    } else if ((imm8) > 31) { \
      ret = simde_mm_setzero_si128(); \
    } else { \
      ret = simde__m128i_from_neon_i32( \
        vshlq_n_s32(simde__m128i_to_neon_i32(a), (imm8))); \
    } \
    ret; \
  })
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
#define simde_mm_slli_epi32(a, imm8) \
  ({ \
    simde__m128i ret; \
    if ((imm8) <= 0) { \
      ret = a; \
    } else if ((imm8) > 31) { \
      ret = simde_mm_setzero_si128(); \
    } else { \
      ret = simde__m128i_from_altivec_i32( \
        vec_sl(simde__m128i_to_altivec_i32(a), \
          vec_splats(HEDLEY_STATIC_CAST(unsigned int, imm8)))); \
    } \
    ret; \
  })
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_slli_epi32(a, imm8) simde_mm_slli_epi32(a, imm8)
#endif
5185
/* Left shift of each 64-bit lane by immediate imm8; imm8 > 63 yields
 * zero.  Shadowed by macros for native SSE2 and NEON builds. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_slli_epi64 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  if (HEDLEY_UNLIKELY((imm8 > 63))) {
    return simde_mm_setzero_si128();
  }
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  r_.i64 = a_.i64 << imm8;
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
    r_.i64[i] = a_.i64[i] << (imm8 & 0xff);
  }
#endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#  define simde_mm_slli_epi64(a, imm8) _mm_slli_epi64(a, imm8)
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__)
#  define simde_mm_slli_epi64(a, imm8) \
  simde__m128i_from_neon_u64(vshlq_n_u64(simde__m128i_to_neon_u64(a), (imm8)))
// The above is allowed by gcc/g++ 9 with -march=armv8-a, might work on A32V8 and elsewhere but needs testing
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) // clang can't handle the potential out of range use of imm8 even though that is handled
#  define simde_mm_slli_epi64(a, imm8) \
  ({ \
    simde__m128i ret; \
    if ((imm8) <= 0) { \
      ret = a; \
    } else if ((imm8) > 63) { \
      ret = simde_mm_setzero_si128(); \
    } else { \
      ret = simde__m128i_from_neon_i64( \
        vshlq_n_s64(simde__m128i_to_neon_i64(a), (imm8))); \
    } \
    ret; \
  })
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_slli_epi64(a, imm8) simde_mm_slli_epi64(a, imm8)
#endif
5232
/* Logical right shift of each 16-bit lane by immediate imm8; imm8 > 15
 * yields zero.  Shadowed by macros for native SSE2, NEON, and AltiVec. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srli_epi16 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  if (HEDLEY_UNLIKELY((imm8 > 15))) {
    return simde_mm_setzero_si128();
  }
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  r_.u16 = a_.u16 >> imm8;
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
    r_.u16[i] = a_.u16[i] >> (imm8 & 0xff);
  }
#endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#  define simde_mm_srli_epi16(a, imm8) _mm_srli_epi16(a, imm8)
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__)
#  define simde_mm_srli_epi16(a, imm8) \
  simde__m128i_from_neon_u16(vshrq_n_u16(simde__m128i_to_neon_u16(a), imm8))
// The above is allowed by gcc/g++ 9 with -march=armv8-a, might work on A32V8 and elsewhere but needs testing
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) // clang can't handle the potential out of range use of imm8 even though that is handled
#  define simde_mm_srli_epi16(a, imm8) \
  ({ \
    simde__m128i ret; \
    if ((imm8) <= 0) { \
      ret = a; \
    } else if ((imm8) > 15) { \
      ret = simde_mm_setzero_si128(); \
    } else { \
      ret = simde__m128i_from_neon_u16( \
        vshrq_n_u16(simde__m128i_to_neon_u16(a), (imm8))); \
    } \
    ret; \
  })
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
#define simde_mm_srli_epi16(a, imm8) \
  ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sr(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8)))))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_srli_epi16(a, imm8) simde_mm_srli_epi16(a, imm8)
#endif
5282
/* Logical right shift of each 32-bit lane by immediate imm8; imm8 > 31
 * yields zero.  Shadowed by macros for native SSE2, NEON, and AltiVec. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srli_epi32 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  if (HEDLEY_UNLIKELY((imm8 > 31))) {
    return simde_mm_setzero_si128();
  }
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  r_.u32 = a_.u32 >> (imm8 & 0xff);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
    r_.u32[i] = a_.u32[i] >> (imm8 & 0xff);
  }
#endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#  define simde_mm_srli_epi32(a, imm8) _mm_srli_epi32(a, imm8)
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__)
#  define simde_mm_srli_epi32(a, imm8) \
  simde__m128i_from_neon_u32(vshrq_n_u32(simde__m128i_to_neon_u32(a), imm8))
// The above is allowed by gcc/g++ 9 with -march=armv8-a, might work on A32V8 and elsewhere but needs testing
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) // clang can't handle the potential out of range use of imm8 even though that is handled
#  define simde_mm_srli_epi32(a, imm8) \
  ({ \
    simde__m128i ret; \
    if ((imm8) <= 0) { \
      ret = a; \
    } else if ((imm8) > 31) { \
      ret = simde_mm_setzero_si128(); \
    } else { \
      ret = simde__m128i_from_neon_u32( \
        vshrq_n_u32(simde__m128i_to_neon_u32(a), (imm8))); \
    } \
    ret; \
  })
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
#  define simde_mm_srli_epi32(a, imm8) \
  ({ \
    simde__m128i ret; \
    if ((imm8) <= 0) { \
      ret = a; \
    } else if ((imm8) > 31) { \
      ret = simde_mm_setzero_si128(); \
    } else { \
      ret = simde__m128i_from_altivec_i32( \
        vec_sr(simde__m128i_to_altivec_i32(a), \
          vec_splats(HEDLEY_STATIC_CAST(unsigned int, imm8)))); \
    } \
    ret; \
  })
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_srli_epi32(a, imm8) simde_mm_srli_epi32(a, imm8)
#endif
5344
/* Logical right shift of each 64-bit lane by immediate imm8; any imm8
 * outside 0..63 yields zero (the `(imm8 & 63) != imm8` test).  The
 * vector-extension path is avoided where GCC bug 94488 miscompiles
 * 64-bit shifts. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srli_epi64 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  if (HEDLEY_UNLIKELY((imm8 & 63) != imm8))
    return simde_mm_setzero_si128();

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(-imm8));
#else
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_94488)
  r_.u64 = a_.u64 >> imm8;
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
    r_.u64[i] = a_.u64[i] >> imm8;
  }
#endif
#endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#  define simde_mm_srli_epi64(a, imm8) _mm_srli_epi64(a, imm8)
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__)
#  define simde_mm_srli_epi64(a, imm8) \
  ((imm8 == 0) ? (a) : (simde__m128i_from_neon_u64(vshrq_n_u64(simde__m128i_to_neon_u64(a), imm8))))
// The above is allowed by gcc/g++ 9 with -march=armv8-a, might work on A32V8 and elsewhere but needs testing
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) // clang can't handle the potential out of range use of imm8 even though that is handled
#  define simde_mm_srli_epi64(a, imm8) \
  ({ \
    simde__m128i ret; \
    if ((imm8) <= 0) { \
      ret = a; \
    } else if ((imm8) > 63) { \
      ret = simde_mm_setzero_si128(); \
    } else { \
      ret = simde__m128i_from_neon_u64( \
        vshrq_n_u64(simde__m128i_to_neon_u64(a), (imm8))); \
    } \
    ret; \
  })
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_srli_epi64(a, imm8) simde_mm_srli_epi64(a, imm8)
#endif
5395
/* Stores both double lanes to 16-byte-aligned memory.  The portable
 * fallback memcpy()s the whole vector, which is well-defined regardless
 * of the union representation. */
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_store_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) {
  simde_assert_aligned(16, mem_addr);

#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_store_pd(mem_addr, a);
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64);
#else
  simde_memcpy(mem_addr, &a, sizeof(a));
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_store_pd(mem_addr, a) simde_mm_store_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
5412
5413 SIMDE_FUNCTION_ATTRIBUTES
5414 void
simde_mm_store1_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM (2)],simde__m128d a)5415 simde_mm_store1_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) {
5416 simde_assert_aligned(16, mem_addr);
5417
5418 #if defined(SIMDE_X86_SSE2_NATIVE)
5419 _mm_store1_pd(mem_addr, a);
5420 #else
5421 simde__m128d_private a_ = simde__m128d_to_private(a);
5422
5423 mem_addr[0] = a_.f64[0];
5424 mem_addr[1] = a_.f64[0];
5425 #endif
5426 }
5427 #define simde_mm_store_pd1(mem_addr, a) simde_mm_store1_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
5428 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5429 # define _mm_store1_pd(mem_addr, a) simde_mm_store1_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
5430 # define _mm_store_pd1(mem_addr, a) simde_mm_store_pd1(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
5431 #endif
5432
5433 SIMDE_FUNCTION_ATTRIBUTES
5434 void
simde_mm_store_sd(simde_float64 * mem_addr,simde__m128d a)5435 simde_mm_store_sd (simde_float64* mem_addr, simde__m128d a) {
5436 #if defined(SIMDE_X86_SSE2_NATIVE)
5437 _mm_store_sd(mem_addr, a);
5438 #else
5439 simde__m128d_private a_ = simde__m128d_to_private(a);
5440
5441 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5442 simde_float64 v = vgetq_lane_f64(a_.neon_f64, 0);
5443 simde_memcpy(mem_addr, &v, sizeof(simde_float64));
5444 #else
5445 simde_float64 v = a_.f64[0];
5446 simde_memcpy(mem_addr, &v, sizeof(simde_float64));
5447 #endif
5448 #endif
5449 }
5450 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5451 # define _mm_store_sd(mem_addr, a) simde_mm_store_sd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
5452 #endif
5453
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_store_si128 (simde__m128i* mem_addr, simde__m128i a) {
  /* Store all 128 bits of `a` to `mem_addr` (aligned store). */
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_store_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
#else
  simde__m128i_private a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* Element width is irrelevant for a full-register store; s32 works on
     * both A32 and A64. */
    vst1q_s32(HEDLEY_REINTERPRET_CAST(int32_t*, mem_addr), a_.neon_i32);
  #else
    simde_memcpy(SIMDE_ASSUME_ALIGNED(16, mem_addr), &a_, sizeof(a_));
  #endif
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_store_si128(mem_addr, a) simde_mm_store_si128(mem_addr, a)
#endif
5472
5473 SIMDE_FUNCTION_ATTRIBUTES
5474 void
simde_mm_storeh_pd(simde_float64 * mem_addr,simde__m128d a)5475 simde_mm_storeh_pd (simde_float64* mem_addr, simde__m128d a) {
5476 #if defined(SIMDE_X86_SSE2_NATIVE)
5477 _mm_storeh_pd(mem_addr, a);
5478 #else
5479 simde__m128d_private a_ = simde__m128d_to_private(a);
5480
5481 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5482 *mem_addr = vgetq_lane_f64(a_.neon_f64, 1);
5483 #else
5484 *mem_addr = a_.f64[1];
5485 #endif
5486 #endif
5487 }
5488 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5489 # define _mm_storeh_pd(mem_addr, a) simde_mm_storeh_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
5490 #endif
5491
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storel_epi64 (simde__m128i* mem_addr, simde__m128i a) {
  /* Store the low 64 bits of `a` to `mem_addr`; the upper 64 bits of the
   * destination are left untouched. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_storel_epi64(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
#else
  simde__m128i_private a_ = simde__m128i_to_private(a);
  int64_t tmp;

  /* memcpy to prevent aliasing, tmp because we can't take the
   * address of a vector element. */

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    tmp = vgetq_lane_s64(a_.neon_i64, 0);
  #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
    #if defined(SIMDE_BUG_GCC_95227)
      /* Dummy use to silence a spurious unused-variable warning; see the
       * referenced GCC bug. */
      (void) a_;
    #endif
    tmp = vec_extract(a_.altivec_i64, 0);
  #else
    tmp = a_.i64[0];
  #endif

  simde_memcpy(mem_addr, &tmp, sizeof(tmp));
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_storel_epi64(mem_addr, a) simde_mm_storel_epi64(mem_addr, a)
#endif
5521
5522 SIMDE_FUNCTION_ATTRIBUTES
5523 void
simde_mm_storel_pd(simde_float64 * mem_addr,simde__m128d a)5524 simde_mm_storel_pd (simde_float64* mem_addr, simde__m128d a) {
5525 #if defined(SIMDE_X86_SSE2_NATIVE)
5526 _mm_storel_pd(mem_addr, a);
5527 #else
5528 simde__m128d_private a_ = simde__m128d_to_private(a);
5529
5530 *mem_addr = a_.f64[0];
5531 #endif
5532 }
5533 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5534 # define _mm_storel_pd(mem_addr, a) simde_mm_storel_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
5535 #endif
5536
5537 SIMDE_FUNCTION_ATTRIBUTES
5538 void
simde_mm_storer_pd(simde_float64 mem_addr[2],simde__m128d a)5539 simde_mm_storer_pd (simde_float64 mem_addr[2], simde__m128d a) {
5540 simde_assert_aligned(16, mem_addr);
5541
5542 #if defined(SIMDE_X86_SSE2_NATIVE)
5543 _mm_storer_pd(mem_addr, a);
5544 #else
5545 simde__m128d_private a_ = simde__m128d_to_private(a);
5546
5547 mem_addr[0] = a_.f64[1];
5548 mem_addr[1] = a_.f64[0];
5549 #endif
5550 }
5551 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5552 # define _mm_storer_pd(mem_addr, a) simde_mm_storer_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
5553 #endif
5554
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storeu_pd (simde_float64* mem_addr, simde__m128d a) {
  /* Unaligned store of both double-precision lanes of `a`. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_storeu_pd(mem_addr, a);
#else
  simde_memcpy(mem_addr, &a, sizeof(a));
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_storeu_pd(mem_addr, a) simde_mm_storeu_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
5567
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storeu_si128 (simde__m128i* mem_addr, simde__m128i a) {
  /* Unaligned store of all 128 bits of `a`. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_storeu_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
#else
  simde__m128i_private a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* Element width is irrelevant for a full-register store. */
    vst1q_s32(HEDLEY_REINTERPRET_CAST(int32_t*, mem_addr), a_.neon_i32);
  #else
    simde_memcpy(mem_addr, &a_, sizeof(a_));
  #endif
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_storeu_si128(mem_addr, a) simde_mm_storeu_si128(mem_addr, a)
#endif
5586
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_stream_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) {
  /* Non-temporal store of `a`.  The cache hint only exists on real SSE2
   * hardware; portably this degrades to a plain aligned store. */
  simde_assert_aligned(16, mem_addr);

#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_stream_pd(mem_addr, a);
#else
  simde_memcpy(mem_addr, &a, sizeof(a));
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_stream_pd(mem_addr, a) simde_mm_stream_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
5601
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_stream_si128 (simde__m128i* mem_addr, simde__m128i a) {
  /* Non-temporal 128-bit store; portable fallbacks ignore the cache
   * hint and perform an ordinary aligned store. */
  simde_assert_aligned(16, mem_addr);

#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_stream_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  vst1q_s64(HEDLEY_REINTERPRET_CAST(int64_t *, mem_addr), simde__m128i_to_neon_i64(a));
#else
  simde_memcpy(mem_addr, &a, sizeof(a));
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_stream_si128(mem_addr, a) simde_mm_stream_si128(mem_addr, a)
#endif
5618
5619 SIMDE_FUNCTION_ATTRIBUTES
5620 void
simde_mm_stream_si32(int32_t * mem_addr,int32_t a)5621 simde_mm_stream_si32 (int32_t* mem_addr, int32_t a) {
5622 #if defined(SIMDE_X86_SSE2_NATIVE)
5623 _mm_stream_si32(mem_addr, a);
5624 #else
5625 *mem_addr = a;
5626 #endif
5627 }
5628 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5629 # define _mm_stream_si32(mem_addr, a) simde_mm_stream_si32(mem_addr, a)
5630 #endif
5631
5632 SIMDE_FUNCTION_ATTRIBUTES
5633 void
simde_mm_stream_si64(int64_t * mem_addr,int64_t a)5634 simde_mm_stream_si64 (int64_t* mem_addr, int64_t a) {
5635 *mem_addr = a;
5636 }
5637 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5638 # define _mm_stream_si64(mem_addr, a) simde_mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(int64_t*, __int64*, mem_addr), a)
5639 #endif
5640
5641 SIMDE_FUNCTION_ATTRIBUTES
5642 simde__m128i
simde_mm_sub_epi8(simde__m128i a,simde__m128i b)5643 simde_mm_sub_epi8 (simde__m128i a, simde__m128i b) {
5644 #if defined(SIMDE_X86_SSE2_NATIVE)
5645 return _mm_sub_epi8(a, b);
5646 #else
5647 simde__m128i_private
5648 r_,
5649 a_ = simde__m128i_to_private(a),
5650 b_ = simde__m128i_to_private(b);
5651
5652 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5653 r_.neon_i8 = vsubq_s8(a_.neon_i8, b_.neon_i8);
5654 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
5655 r_.i8 = a_.i8 - b_.i8;
5656 #else
5657 SIMDE_VECTORIZE
5658 for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
5659 r_.i8[i] = a_.i8[i] - b_.i8[i];
5660 }
5661 #endif
5662
5663 return simde__m128i_from_private(r_);
5664 #endif
5665 }
5666 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5667 # define _mm_sub_epi8(a, b) simde_mm_sub_epi8(a, b)
5668 #endif
5669
5670 SIMDE_FUNCTION_ATTRIBUTES
5671 simde__m128i
simde_mm_sub_epi16(simde__m128i a,simde__m128i b)5672 simde_mm_sub_epi16 (simde__m128i a, simde__m128i b) {
5673 #if defined(SIMDE_X86_SSE2_NATIVE)
5674 return _mm_sub_epi16(a, b);
5675 #else
5676 simde__m128i_private
5677 r_,
5678 a_ = simde__m128i_to_private(a),
5679 b_ = simde__m128i_to_private(b);
5680
5681 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5682 r_.neon_i16 = vsubq_s16(a_.neon_i16, b_.neon_i16);
5683 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
5684 r_.i16 = a_.i16 - b_.i16;
5685 #else
5686 SIMDE_VECTORIZE
5687 for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
5688 r_.i16[i] = a_.i16[i] - b_.i16[i];
5689 }
5690 #endif
5691
5692 return simde__m128i_from_private(r_);
5693 #endif
5694 }
5695 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5696 # define _mm_sub_epi16(a, b) simde_mm_sub_epi16(a, b)
5697 #endif
5698
5699 SIMDE_FUNCTION_ATTRIBUTES
5700 simde__m128i
simde_mm_sub_epi32(simde__m128i a,simde__m128i b)5701 simde_mm_sub_epi32 (simde__m128i a, simde__m128i b) {
5702 #if defined(SIMDE_X86_SSE2_NATIVE)
5703 return _mm_sub_epi32(a, b);
5704 #else
5705 simde__m128i_private
5706 r_,
5707 a_ = simde__m128i_to_private(a),
5708 b_ = simde__m128i_to_private(b);
5709
5710 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5711 r_.neon_i32 = vsubq_s32(a_.neon_i32, b_.neon_i32);
5712 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
5713 r_.i32 = a_.i32 - b_.i32;
5714 #else
5715 SIMDE_VECTORIZE
5716 for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
5717 r_.i32[i] = a_.i32[i] - b_.i32[i];
5718 }
5719 #endif
5720
5721 return simde__m128i_from_private(r_);
5722 #endif
5723 }
5724 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5725 # define _mm_sub_epi32(a, b) simde_mm_sub_epi32(a, b)
5726 #endif
5727
5728 SIMDE_FUNCTION_ATTRIBUTES
5729 simde__m128i
simde_mm_sub_epi64(simde__m128i a,simde__m128i b)5730 simde_mm_sub_epi64 (simde__m128i a, simde__m128i b) {
5731 #if defined(SIMDE_X86_SSE2_NATIVE)
5732 return _mm_sub_epi64(a, b);
5733 #else
5734 simde__m128i_private
5735 r_,
5736 a_ = simde__m128i_to_private(a),
5737 b_ = simde__m128i_to_private(b);
5738
5739 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5740 r_.neon_i64 = vsubq_s64(a_.neon_i64, b_.neon_i64);
5741 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
5742 r_.i64 = a_.i64 - b_.i64;
5743 #else
5744 SIMDE_VECTORIZE
5745 for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
5746 r_.i64[i] = a_.i64[i] - b_.i64[i];
5747 }
5748 #endif
5749
5750 return simde__m128i_from_private(r_);
5751 #endif
5752 }
5753 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5754 # define _mm_sub_epi64(a, b) simde_mm_sub_epi64(a, b)
5755 #endif
5756
5757 SIMDE_FUNCTION_ATTRIBUTES
5758 simde__m128i
simde_x_mm_sub_epu32(simde__m128i a,simde__m128i b)5759 simde_x_mm_sub_epu32 (simde__m128i a, simde__m128i b) {
5760 simde__m128i_private
5761 r_,
5762 a_ = simde__m128i_to_private(a),
5763 b_ = simde__m128i_to_private(b);
5764
5765 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
5766 r_.u32 = a_.u32 - b_.u32;
5767 #else
5768 SIMDE_VECTORIZE
5769 for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
5770 r_.u32[i] = a_.u32[i] - b_.u32[i];
5771 }
5772 #endif
5773
5774 return simde__m128i_from_private(r_);
5775 }
5776
5777 SIMDE_FUNCTION_ATTRIBUTES
5778 simde__m128d
simde_mm_sub_pd(simde__m128d a,simde__m128d b)5779 simde_mm_sub_pd (simde__m128d a, simde__m128d b) {
5780 #if defined(SIMDE_X86_SSE2_NATIVE)
5781 return _mm_sub_pd(a, b);
5782 #else
5783 simde__m128d_private
5784 r_,
5785 a_ = simde__m128d_to_private(a),
5786 b_ = simde__m128d_to_private(b);
5787
5788 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
5789 r_.f64 = a_.f64 - b_.f64;
5790 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5791 r_.wasm_v128 = wasm_f64x2_sub(a_.wasm_v128, b_.wasm_v128);
5792 #else
5793 SIMDE_VECTORIZE
5794 for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
5795 r_.f64[i] = a_.f64[i] - b_.f64[i];
5796 }
5797 #endif
5798
5799 return simde__m128d_from_private(r_);
5800 #endif
5801 }
5802 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5803 # define _mm_sub_pd(a, b) simde_mm_sub_pd(a, b)
5804 #endif
5805
5806 SIMDE_FUNCTION_ATTRIBUTES
5807 simde__m128d
simde_mm_sub_sd(simde__m128d a,simde__m128d b)5808 simde_mm_sub_sd (simde__m128d a, simde__m128d b) {
5809 #if defined(SIMDE_X86_SSE2_NATIVE)
5810 return _mm_sub_sd(a, b);
5811 #elif defined(SIMDE_ASSUME_VECTORIZATION)
5812 return simde_mm_move_sd(a, simde_mm_sub_pd(a, b));
5813 #else
5814 simde__m128d_private
5815 r_,
5816 a_ = simde__m128d_to_private(a),
5817 b_ = simde__m128d_to_private(b);
5818
5819 r_.f64[0] = a_.f64[0] - b_.f64[0];
5820 r_.f64[1] = a_.f64[1];
5821
5822 return simde__m128d_from_private(r_);
5823 #endif
5824 }
5825 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5826 # define _mm_sub_sd(a, b) simde_mm_sub_sd(a, b)
5827 #endif
5828
5829 SIMDE_FUNCTION_ATTRIBUTES
5830 simde__m64
simde_mm_sub_si64(simde__m64 a,simde__m64 b)5831 simde_mm_sub_si64 (simde__m64 a, simde__m64 b) {
5832 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
5833 return _mm_sub_si64(a, b);
5834 #else
5835 simde__m64_private
5836 r_,
5837 a_ = simde__m64_to_private(a),
5838 b_ = simde__m64_to_private(b);
5839
5840 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
5841 r_.i64 = a_.i64 - b_.i64;
5842 #else
5843 r_.i64[0] = a_.i64[0] - b_.i64[0];
5844 #endif
5845
5846 return simde__m64_from_private(r_);
5847 #endif
5848 }
5849 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5850 # define _mm_sub_si64(a, b) simde_mm_sub_si64(a, b)
5851 #endif
5852
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_subs_epi8 (simde__m128i a, simde__m128i b) {
  /* Lane-wise signed-saturating subtraction of sixteen 8-bit integers:
   * results below INT8_MIN clamp to INT8_MIN, above INT8_MAX to INT8_MAX. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_subs_epi8(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i8 = vqsubq_s8(a_.neon_i8, b_.neon_i8);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i8[0])) ; i++) {
      /* Overflow is detected *before* subtracting, since a - b itself
       * would be UB on signed overflow. */
      if (((b_.i8[i]) > 0 && (a_.i8[i]) < INT8_MIN + (b_.i8[i]))) {
        r_.i8[i] = INT8_MIN;
      } else if ((b_.i8[i]) < 0 && (a_.i8[i]) > INT8_MAX + (b_.i8[i])) {
        r_.i8[i] = INT8_MAX;
      } else {
        r_.i8[i] = (a_.i8[i]) - (b_.i8[i]);
      }
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_subs_epi8(a, b) simde_mm_subs_epi8(a, b)
#endif
5885
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_subs_epi16 (simde__m128i a, simde__m128i b) {
  /* Lane-wise signed-saturating subtraction of eight 16-bit integers:
   * results clamp to [INT16_MIN, INT16_MAX] instead of wrapping. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_subs_epi16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i16 = vqsubq_s16(a_.neon_i16, b_.neon_i16);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) {
      /* Overflow is detected *before* subtracting, since a - b itself
       * would be UB on signed overflow. */
      if (((b_.i16[i]) > 0 && (a_.i16[i]) < INT16_MIN + (b_.i16[i]))) {
        r_.i16[i] = INT16_MIN;
      } else if ((b_.i16[i]) < 0 && (a_.i16[i]) > INT16_MAX + (b_.i16[i])) {
        r_.i16[i] = INT16_MAX;
      } else {
        r_.i16[i] = (a_.i16[i]) - (b_.i16[i]);
      }
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_subs_epi16(a, b) simde_mm_subs_epi16(a, b)
#endif
5918
5919 SIMDE_FUNCTION_ATTRIBUTES
5920 simde__m128i
simde_mm_subs_epu8(simde__m128i a,simde__m128i b)5921 simde_mm_subs_epu8 (simde__m128i a, simde__m128i b) {
5922 #if defined(SIMDE_X86_SSE2_NATIVE)
5923 return _mm_subs_epu8(a, b);
5924 #else
5925 simde__m128i_private
5926 r_,
5927 a_ = simde__m128i_to_private(a),
5928 b_ = simde__m128i_to_private(b);
5929
5930 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5931 r_.neon_u8 = vqsubq_u8(a_.neon_u8, b_.neon_u8);
5932 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
5933 r_.altivec_u8 = vec_subs(a_.altivec_u8, b_.altivec_u8);
5934 #else
5935 SIMDE_VECTORIZE
5936 for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i8[0])) ; i++) {
5937 const int32_t x = a_.u8[i] - b_.u8[i];
5938 if (x < 0) {
5939 r_.u8[i] = 0;
5940 } else if (x > UINT8_MAX) {
5941 r_.u8[i] = UINT8_MAX;
5942 } else {
5943 r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x);
5944 }
5945 }
5946 #endif
5947
5948 return simde__m128i_from_private(r_);
5949 #endif
5950 }
5951 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5952 # define _mm_subs_epu8(a, b) simde_mm_subs_epu8(a, b)
5953 #endif
5954
5955 SIMDE_FUNCTION_ATTRIBUTES
5956 simde__m128i
simde_mm_subs_epu16(simde__m128i a,simde__m128i b)5957 simde_mm_subs_epu16 (simde__m128i a, simde__m128i b) {
5958 #if defined(SIMDE_X86_SSE2_NATIVE)
5959 return _mm_subs_epu16(a, b);
5960 #else
5961 simde__m128i_private
5962 r_,
5963 a_ = simde__m128i_to_private(a),
5964 b_ = simde__m128i_to_private(b);
5965
5966 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5967 r_.neon_u16 = vqsubq_u16(a_.neon_u16, b_.neon_u16);
5968 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
5969 r_.altivec_u16 = vec_subs(a_.altivec_u16, b_.altivec_u16);
5970 #else
5971 SIMDE_VECTORIZE
5972 for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) {
5973 const int32_t x = a_.u16[i] - b_.u16[i];
5974 if (x < 0) {
5975 r_.u16[i] = 0;
5976 } else if (x > UINT16_MAX) {
5977 r_.u16[i] = UINT16_MAX;
5978 } else {
5979 r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x);
5980 }
5981 }
5982 #endif
5983
5984 return simde__m128i_from_private(r_);
5985 #endif
5986 }
5987 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5988 # define _mm_subs_epu16(a, b) simde_mm_subs_epu16(a, b)
5989 #endif
5990
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomieq_sd (simde__m128d a, simde__m128d b) {
  /* Unordered compare of the low lanes for equality; returns 0 or 1.
   * The fenv dance saves the FP environment, compares, then restores it
   * so any exception flags raised by the comparison are discarded —
   * matching ucomisd's quiet (non-signaling) NaN behavior. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_ucomieq_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
  int r;

  #if defined(SIMDE_HAVE_FENV_H)
    fenv_t envp;
    int x = feholdexcept(&envp);
    r = a_.f64[0] == b_.f64[0];
    /* Only restore if the environment was successfully saved. */
    if (HEDLEY_LIKELY(x == 0))
      fesetenv(&envp);
  #else
    r = a_.f64[0] == b_.f64[0];
  #endif

  return r;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_ucomieq_sd(a, b) simde_mm_ucomieq_sd(a, b)
#endif
6018
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomige_sd (simde__m128d a, simde__m128d b) {
  /* Unordered compare of the low lanes: a >= b.  FP environment is
   * saved/restored so comparison-raised exception flags are discarded. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_ucomige_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
  int r;

  #if defined(SIMDE_HAVE_FENV_H)
    fenv_t envp;
    int x = feholdexcept(&envp);
    r = a_.f64[0] >= b_.f64[0];
    if (HEDLEY_LIKELY(x == 0))
      fesetenv(&envp);
  #else
    r = a_.f64[0] >= b_.f64[0];
  #endif

  return r;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_ucomige_sd(a, b) simde_mm_ucomige_sd(a, b)
#endif
6046
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomigt_sd (simde__m128d a, simde__m128d b) {
  /* Unordered compare of the low lanes: a > b.  FP environment is
   * saved/restored so comparison-raised exception flags are discarded. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_ucomigt_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
  int r;

  #if defined(SIMDE_HAVE_FENV_H)
    fenv_t envp;
    int x = feholdexcept(&envp);
    r = a_.f64[0] > b_.f64[0];
    if (HEDLEY_LIKELY(x == 0))
      fesetenv(&envp);
  #else
    r = a_.f64[0] > b_.f64[0];
  #endif

  return r;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_ucomigt_sd(a, b) simde_mm_ucomigt_sd(a, b)
#endif
6074
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomile_sd (simde__m128d a, simde__m128d b) {
  /* Unordered compare of the low lanes: a <= b.  FP environment is
   * saved/restored so comparison-raised exception flags are discarded. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_ucomile_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
  int r;

  #if defined(SIMDE_HAVE_FENV_H)
    fenv_t envp;
    int x = feholdexcept(&envp);
    r = a_.f64[0] <= b_.f64[0];
    if (HEDLEY_LIKELY(x == 0))
      fesetenv(&envp);
  #else
    r = a_.f64[0] <= b_.f64[0];
  #endif

  return r;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_ucomile_sd(a, b) simde_mm_ucomile_sd(a, b)
#endif
6102
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomilt_sd (simde__m128d a, simde__m128d b) {
  /* Unordered compare of the low lanes: a < b.  FP environment is
   * saved/restored so comparison-raised exception flags are discarded. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_ucomilt_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
  int r;

  #if defined(SIMDE_HAVE_FENV_H)
    fenv_t envp;
    int x = feholdexcept(&envp);
    r = a_.f64[0] < b_.f64[0];
    if (HEDLEY_LIKELY(x == 0))
      fesetenv(&envp);
  #else
    r = a_.f64[0] < b_.f64[0];
  #endif

  return r;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_ucomilt_sd(a, b) simde_mm_ucomilt_sd(a, b)
#endif
6130
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomineq_sd (simde__m128d a, simde__m128d b) {
  /* Unordered compare of the low lanes: a != b.  FP environment is
   * saved/restored so comparison-raised exception flags are discarded. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_ucomineq_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
  int r;

  #if defined(SIMDE_HAVE_FENV_H)
    fenv_t envp;
    int x = feholdexcept(&envp);
    r = a_.f64[0] != b_.f64[0];
    if (HEDLEY_LIKELY(x == 0))
      fesetenv(&envp);
  #else
    r = a_.f64[0] != b_.f64[0];
  #endif

  return r;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_ucomineq_sd(a, b) simde_mm_ucomineq_sd(a, b)
#endif
6158
6159 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
6160 HEDLEY_DIAGNOSTIC_PUSH
6161 SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
6162 #endif
6163
6164 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
6165 HEDLEY_DIAGNOSTIC_POP
6166 #endif
6167
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_lfence (void) {
  /* Load fence.  Without native SSE2 this falls back to the SSE
   * sfence emulation, which provides at least as strong an ordering
   * barrier portably. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_lfence();
#else
  simde_mm_sfence();
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_lfence() simde_mm_lfence()
#endif
6180
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_mfence (void) {
  /* Full memory fence.  Without native SSE2 this falls back to the SSE
   * sfence emulation. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_mfence();
#else
  simde_mm_sfence();
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_mfence() simde_mm_mfence()
#endif
6193
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_unpackhi_epi8 (simde__m128i a, simde__m128i b) {
  /* Interleave the high eight 8-bit lanes of `a` and `b`:
   * result = { a8, b8, a9, b9, ..., a15, b15 }. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_unpackhi_epi8(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    r_.neon_i8 = vzip2q_s8(a_.neon_i8, b_.neon_i8);
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* A32 has no vzip2q; zip the high halves and recombine. */
    int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(a_.neon_i16));
    int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(b_.neon_i16));
    int8x8x2_t result = vzip_s8(a1, b1);
    r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]);
  #elif defined(SIMDE_SHUFFLE_VECTOR_)
    r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i8[0])) / 2) ; i++) {
      r_.i8[(i * 2)]     = a_.i8[i + ((sizeof(r_) / sizeof(r_.i8[0])) / 2)];
      r_.i8[(i * 2) + 1] = b_.i8[i + ((sizeof(r_) / sizeof(r_.i8[0])) / 2)];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_unpackhi_epi8(a, b) simde_mm_unpackhi_epi8(a, b)
#endif
6228
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_unpackhi_epi16 (simde__m128i a, simde__m128i b) {
  /* Interleave the high four 16-bit lanes of `a` and `b`:
   * result = { a4, b4, a5, b5, a6, b6, a7, b7 }. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_unpackhi_epi16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    r_.neon_i16 = vzip2q_s16(a_.neon_i16, b_.neon_i16);
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* A32 has no vzip2q; zip the high halves and recombine. */
    int16x4_t a1 = vget_high_s16(a_.neon_i16);
    int16x4_t b1 = vget_high_s16(b_.neon_i16);
    int16x4x2_t result = vzip_s16(a1, b1);
    r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]);
  #elif defined(SIMDE_SHUFFLE_VECTOR_)
    r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 4, 12, 5, 13, 6, 14, 7, 15);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i16[0])) / 2) ; i++) {
      r_.i16[(i * 2)]     = a_.i16[i + ((sizeof(r_) / sizeof(r_.i16[0])) / 2)];
      r_.i16[(i * 2) + 1] = b_.i16[i + ((sizeof(r_) / sizeof(r_.i16[0])) / 2)];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_unpackhi_epi16(a, b) simde_mm_unpackhi_epi16(a, b)
#endif
6263
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_unpackhi_epi32 (simde__m128i a, simde__m128i b) {
  /* Interleave the high two 32-bit lanes of `a` and `b`:
   * result = { a2, b2, a3, b3 }. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_unpackhi_epi32(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    r_.neon_i32 = vzip2q_s32(a_.neon_i32, b_.neon_i32);
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* A32 has no vzip2q; zip the high halves and recombine. */
    int32x2_t a1 = vget_high_s32(a_.neon_i32);
    int32x2_t b1 = vget_high_s32(b_.neon_i32);
    int32x2x2_t result = vzip_s32(a1, b1);
    r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]);
  #elif defined(SIMDE_SHUFFLE_VECTOR_)
    r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 2, 6, 3, 7);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i32[0])) / 2) ; i++) {
      r_.i32[(i * 2)]     = a_.i32[i + ((sizeof(r_) / sizeof(r_.i32[0])) / 2)];
      r_.i32[(i * 2) + 1] = b_.i32[i + ((sizeof(r_) / sizeof(r_.i32[0])) / 2)];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_unpackhi_epi32(a, b) simde_mm_unpackhi_epi32(a, b)
#endif
6298
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_unpackhi_epi64 (simde__m128i a, simde__m128i b) {
  /* Combine the high 64-bit lanes of `a` and `b`: result = { a1, b1 }. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_unpackhi_epi64(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    int64x1_t a_h = vget_high_s64(a_.neon_i64);
    int64x1_t b_h = vget_high_s64(b_.neon_i64);
    r_.neon_i64 = vcombine_s64(a_h, b_h);
  #elif defined(SIMDE_SHUFFLE_VECTOR_)
    r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 1, 3);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i64[0])) / 2) ; i++) {
      r_.i64[(i * 2)]     = a_.i64[i + ((sizeof(r_) / sizeof(r_.i64[0])) / 2)];
      r_.i64[(i * 2) + 1] = b_.i64[i + ((sizeof(r_) / sizeof(r_.i64[0])) / 2)];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_unpackhi_epi64(a, b) simde_mm_unpackhi_epi64(a, b)
#endif
6330
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_unpackhi_pd (simde__m128d a, simde__m128d b) {
  /* Combine the high double-precision lanes of `a` and `b`:
   * result = { a1, b1 }. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_unpackhi_pd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  #if defined(SIMDE_SHUFFLE_VECTOR_)
    r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 1, 3);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2) ; i++) {
      r_.f64[(i * 2)]     = a_.f64[i + ((sizeof(r_) / sizeof(r_.f64[0])) / 2)];
      r_.f64[(i * 2) + 1] = b_.f64[i + ((sizeof(r_) / sizeof(r_.f64[0])) / 2)];
    }
  #endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_unpackhi_pd(a, b) simde_mm_unpackhi_pd(a, b)
#endif
6358
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_unpacklo_epi8 (simde__m128i a, simde__m128i b) {
  /* Interleave the low eight 8-bit lanes of `a` and `b`:
   * result = { a0, b0, a1, b1, ..., a7, b7 }. */
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_unpacklo_epi8(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    r_.neon_i8 = vzip1q_s8(a_.neon_i8, b_.neon_i8);
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* A32 has no vzip1q; zip the low halves and recombine. */
    int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(a_.neon_i16));
    int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(b_.neon_i16));
    int8x8x2_t result = vzip_s8(a1, b1);
    r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]);
  #elif defined(SIMDE_SHUFFLE_VECTOR_)
    r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i8[0])) / 2) ; i++) {
      r_.i8[(i * 2)]     = a_.i8[i];
      r_.i8[(i * 2) + 1] = b_.i8[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_unpacklo_epi8(a, b) simde_mm_unpacklo_epi8(a, b)
#endif
6393
6394 SIMDE_FUNCTION_ATTRIBUTES
6395 simde__m128i
simde_mm_unpacklo_epi16(simde__m128i a,simde__m128i b)6396 simde_mm_unpacklo_epi16 (simde__m128i a, simde__m128i b) {
6397 #if defined(SIMDE_X86_SSE2_NATIVE)
6398 return _mm_unpacklo_epi16(a, b);
6399 #else
6400 simde__m128i_private
6401 r_,
6402 a_ = simde__m128i_to_private(a),
6403 b_ = simde__m128i_to_private(b);
6404
6405 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6406 r_.neon_i16 = vzip1q_s16(a_.neon_i16, b_.neon_i16);
6407 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6408 int16x4_t a1 = vget_low_s16(a_.neon_i16);
6409 int16x4_t b1 = vget_low_s16(b_.neon_i16);
6410 int16x4x2_t result = vzip_s16(a1, b1);
6411 r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]);
6412 #elif defined(SIMDE_SHUFFLE_VECTOR_)
6413 r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 0, 8, 1, 9, 2, 10, 3, 11);
6414 #else
6415 SIMDE_VECTORIZE
6416 for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i16[0])) / 2) ; i++) {
6417 r_.i16[(i * 2)] = a_.i16[i];
6418 r_.i16[(i * 2) + 1] = b_.i16[i];
6419 }
6420 #endif
6421
6422 return simde__m128i_from_private(r_);
6423 #endif
6424 }
6425 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6426 # define _mm_unpacklo_epi16(a, b) simde_mm_unpacklo_epi16(a, b)
6427 #endif
6428
6429 SIMDE_FUNCTION_ATTRIBUTES
6430 simde__m128i
simde_mm_unpacklo_epi32(simde__m128i a,simde__m128i b)6431 simde_mm_unpacklo_epi32 (simde__m128i a, simde__m128i b) {
6432 #if defined(SIMDE_X86_SSE2_NATIVE)
6433 return _mm_unpacklo_epi32(a, b);
6434 #else
6435 simde__m128i_private
6436 r_,
6437 a_ = simde__m128i_to_private(a),
6438 b_ = simde__m128i_to_private(b);
6439
6440 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6441 r_.neon_i32 = vzip1q_s32(a_.neon_i32, b_.neon_i32);
6442 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6443 int32x2_t a1 = vget_low_s32(a_.neon_i32);
6444 int32x2_t b1 = vget_low_s32(b_.neon_i32);
6445 int32x2x2_t result = vzip_s32(a1, b1);
6446 r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]);
6447 #elif defined(SIMDE_SHUFFLE_VECTOR_)
6448 r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 0, 4, 1, 5);
6449 #else
6450 SIMDE_VECTORIZE
6451 for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i32[0])) / 2) ; i++) {
6452 r_.i32[(i * 2)] = a_.i32[i];
6453 r_.i32[(i * 2) + 1] = b_.i32[i];
6454 }
6455 #endif
6456
6457 return simde__m128i_from_private(r_);
6458 #endif
6459 }
6460 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6461 # define _mm_unpacklo_epi32(a, b) simde_mm_unpacklo_epi32(a, b)
6462 #endif
6463
6464 SIMDE_FUNCTION_ATTRIBUTES
6465 simde__m128i
simde_mm_unpacklo_epi64(simde__m128i a,simde__m128i b)6466 simde_mm_unpacklo_epi64 (simde__m128i a, simde__m128i b) {
6467 #if defined(SIMDE_X86_SSE2_NATIVE)
6468 return _mm_unpacklo_epi64(a, b);
6469 #else
6470 simde__m128i_private
6471 r_,
6472 a_ = simde__m128i_to_private(a),
6473 b_ = simde__m128i_to_private(b);
6474
6475 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6476 int64x1_t a_l = vget_low_s64(a_.i64);
6477 int64x1_t b_l = vget_low_s64(b_.i64);
6478 r_.neon_i64 = vcombine_s64(a_l, b_l);
6479 #elif defined(SIMDE_SHUFFLE_VECTOR_)
6480 r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 0, 2);
6481 #else
6482 SIMDE_VECTORIZE
6483 for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i64[0])) / 2) ; i++) {
6484 r_.i64[(i * 2)] = a_.i64[i];
6485 r_.i64[(i * 2) + 1] = b_.i64[i];
6486 }
6487 #endif
6488
6489 return simde__m128i_from_private(r_);
6490 #endif
6491 }
6492 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6493 # define _mm_unpacklo_epi64(a, b) simde_mm_unpacklo_epi64(a, b)
6494 #endif
6495
6496 SIMDE_FUNCTION_ATTRIBUTES
6497 simde__m128d
simde_mm_unpacklo_pd(simde__m128d a,simde__m128d b)6498 simde_mm_unpacklo_pd (simde__m128d a, simde__m128d b) {
6499 #if defined(SIMDE_X86_SSE2_NATIVE)
6500 return _mm_unpacklo_pd(a, b);
6501 #else
6502 simde__m128d_private
6503 r_,
6504 a_ = simde__m128d_to_private(a),
6505 b_ = simde__m128d_to_private(b);
6506
6507 #if defined(SIMDE_SHUFFLE_VECTOR_)
6508 r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 0, 2);
6509 #else
6510 SIMDE_VECTORIZE
6511 for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2) ; i++) {
6512 r_.f64[(i * 2)] = a_.f64[i];
6513 r_.f64[(i * 2) + 1] = b_.f64[i];
6514 }
6515 #endif
6516
6517 return simde__m128d_from_private(r_);
6518 #endif
6519 }
6520 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6521 # define _mm_unpacklo_pd(a, b) simde_mm_unpacklo_pd(a, b)
6522 #endif
6523
6524 SIMDE_FUNCTION_ATTRIBUTES
6525 simde__m128d
simde_mm_xor_pd(simde__m128d a,simde__m128d b)6526 simde_mm_xor_pd (simde__m128d a, simde__m128d b) {
6527 #if defined(SIMDE_X86_SSE2_NATIVE)
6528 return _mm_xor_pd(a, b);
6529 #else
6530 simde__m128d_private
6531 r_,
6532 a_ = simde__m128d_to_private(a),
6533 b_ = simde__m128d_to_private(b);
6534
6535 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6536 r_.i32f = a_.i32f ^ b_.i32f;
6537 #else
6538 SIMDE_VECTORIZE
6539 for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
6540 r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i];
6541 }
6542 #endif
6543
6544 return simde__m128d_from_private(r_);
6545 #endif
6546 }
6547 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6548 # define _mm_xor_pd(a, b) simde_mm_xor_pd(a, b)
6549 #endif
6550
6551 SIMDE_FUNCTION_ATTRIBUTES
6552 simde__m128i
simde_mm_xor_si128(simde__m128i a,simde__m128i b)6553 simde_mm_xor_si128 (simde__m128i a, simde__m128i b) {
6554 #if defined(SIMDE_X86_SSE2_NATIVE)
6555 return _mm_xor_si128(a, b);
6556 #else
6557 simde__m128i_private
6558 r_,
6559 a_ = simde__m128i_to_private(a),
6560 b_ = simde__m128i_to_private(b);
6561
6562 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6563 r_.neon_i32 = veorq_s32(a_.neon_i32, b_.neon_i32);
6564 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
6565 r_.altivec_i32 = vec_xor(a_.altivec_i32, b_.altivec_i32);
6566 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6567 r_.i32f = a_.i32f ^ b_.i32f;
6568 #else
6569 SIMDE_VECTORIZE
6570 for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
6571 r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i];
6572 }
6573 #endif
6574
6575 return simde__m128i_from_private(r_);
6576 #endif
6577 }
6578 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6579 # define _mm_xor_si128(a, b) simde_mm_xor_si128(a, b)
6580 #endif
6581
6582 SIMDE_FUNCTION_ATTRIBUTES
6583 simde__m128i
simde_x_mm_not_si128(simde__m128i a)6584 simde_x_mm_not_si128 (simde__m128i a) {
6585 simde__m128i_private
6586 r_,
6587 a_ = simde__m128i_to_private(a);
6588
6589 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6590 r_.neon_i32 = vmvnq_s32(a_.neon_i32);
6591 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6592 r_.i32f = ~(a_.i32f);
6593 #else
6594 SIMDE_VECTORIZE
6595 for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
6596 r_.i32f[i] = ~(a_.i32f[i]);
6597 }
6598 #endif
6599
6600 return simde__m128i_from_private(r_);
6601 }
6602
/* Build the 2-bit immediate used by _mm_shuffle_pd: bit 1 selects the
 * source element (0 or 1) for the upper lane, bit 0 for the lower lane. */
#define SIMDE_MM_SHUFFLE2(x, y) (((x) << 1) | (y))
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
# define _MM_SHUFFLE2(x, y) SIMDE_MM_SHUFFLE2(x, y)
#endif
6607
6608 SIMDE_END_DECLS_
6609
6610 HEDLEY_DIAGNOSTIC_POP
6611
6612 #endif /* !defined(SIMDE_X86_SSE2_H) */
6613