1 /* SPDX-License-Identifier: MIT
2  *
3  * Permission is hereby granted, free of charge, to any person
4  * obtaining a copy of this software and associated documentation
5  * files (the "Software"), to deal in the Software without
6  * restriction, including without limitation the rights to use, copy,
7  * modify, merge, publish, distribute, sublicense, and/or sell copies
8  * of the Software, and to permit persons to whom the Software is
9  * furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be
12  * included in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Copyright:
24  *   2017-2020 Evan Nemerson <evan@nemerson.com>
25  *   2015-2017 John W. Ratcliff <jratcliffscarab@gmail.com>
26  *   2015      Brandon Rowlett <browlett@nvidia.com>
27  *   2015      Ken Fast <kfast@gdeb.com>
28  *   2017      Hasindu Gamaarachchi <hasindu@unsw.edu.au>
29  *   2018      Jeff Daily <jeff.daily@amd.com>
30  */
31 
32 #if !defined(SIMDE_X86_SSE2_H)
33 #define SIMDE_X86_SSE2_H
34 
35 #include "sse.h"
36 
37 HEDLEY_DIAGNOSTIC_PUSH
38 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
39 SIMDE_BEGIN_DECLS_
40 
41 typedef union {
42   #if defined(SIMDE_VECTOR_SUBSCRIPT)
43     SIMDE_ALIGN_TO_16 int8_t          i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
44     SIMDE_ALIGN_TO_16 int16_t        i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
45     SIMDE_ALIGN_TO_16 int32_t        i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
46     SIMDE_ALIGN_TO_16 int64_t        i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
47     SIMDE_ALIGN_TO_16 uint8_t         u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
48     SIMDE_ALIGN_TO_16 uint16_t       u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
49     SIMDE_ALIGN_TO_16 uint32_t       u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
50     SIMDE_ALIGN_TO_16 uint64_t       u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
51     #if defined(SIMDE_HAVE_INT128_)
52     SIMDE_ALIGN_TO_16 simde_int128  i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
53     SIMDE_ALIGN_TO_16 simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
54     #endif
55     SIMDE_ALIGN_TO_16 simde_float32  f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
56     SIMDE_ALIGN_TO_16 simde_float64  f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
57 
58     SIMDE_ALIGN_TO_16 int_fast32_t  i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
59     SIMDE_ALIGN_TO_16 uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
60   #else
61     SIMDE_ALIGN_TO_16 int8_t         i8[16];
62     SIMDE_ALIGN_TO_16 int16_t        i16[8];
63     SIMDE_ALIGN_TO_16 int32_t        i32[4];
64     SIMDE_ALIGN_TO_16 int64_t        i64[2];
65     SIMDE_ALIGN_TO_16 uint8_t        u8[16];
66     SIMDE_ALIGN_TO_16 uint16_t       u16[8];
67     SIMDE_ALIGN_TO_16 uint32_t       u32[4];
68     SIMDE_ALIGN_TO_16 uint64_t       u64[2];
69     #if defined(SIMDE_HAVE_INT128_)
70     SIMDE_ALIGN_TO_16 simde_int128  i128[1];
71     SIMDE_ALIGN_TO_16 simde_uint128 u128[1];
72     #endif
73     SIMDE_ALIGN_TO_16 simde_float32  f32[4];
74     SIMDE_ALIGN_TO_16 simde_float64  f64[2];
75 
76     SIMDE_ALIGN_TO_16 int_fast32_t  i32f[16 / sizeof(int_fast32_t)];
77     SIMDE_ALIGN_TO_16 uint_fast32_t u32f[16 / sizeof(uint_fast32_t)];
78   #endif
79 
80     SIMDE_ALIGN_TO_16 simde__m64_private m64_private[2];
81     SIMDE_ALIGN_TO_16 simde__m64         m64[2];
82 
83   #if defined(SIMDE_X86_SSE2_NATIVE)
84     SIMDE_ALIGN_TO_16 __m128i        n;
85   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
86     SIMDE_ALIGN_TO_16 int8x16_t      neon_i8;
87     SIMDE_ALIGN_TO_16 int16x8_t      neon_i16;
88     SIMDE_ALIGN_TO_16 int32x4_t      neon_i32;
89     SIMDE_ALIGN_TO_16 int64x2_t      neon_i64;
90     SIMDE_ALIGN_TO_16 uint8x16_t     neon_u8;
91     SIMDE_ALIGN_TO_16 uint16x8_t     neon_u16;
92     SIMDE_ALIGN_TO_16 uint32x4_t     neon_u32;
93     SIMDE_ALIGN_TO_16 uint64x2_t     neon_u64;
94     #if defined(__ARM_FP16_FORMAT_IEEE)
95     SIMDE_ALIGN_TO_16 float16x8_t    neon_f16;
96     #endif
97     SIMDE_ALIGN_TO_16 float32x4_t    neon_f32;
98     #if defined(SIMDE_ARCH_AARCH64)
99     SIMDE_ALIGN_TO_16 float64x2_t    neon_f64;
100     #endif
101   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
102     SIMDE_ALIGN_TO_16 v128_t         wasm_v128;
103   #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
104     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char)          altivec_i8;
105     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short)         altivec_i16;
106     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int)           altivec_i32;
107     #if defined(__UINT_FAST32_TYPE__) && (defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE))
108       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(__INT_FAST32_TYPE__)  altivec_i32f;
109     #else
110       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int)           altivec_i32f;
111     #endif
112     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char)        altivec_u8;
113     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short)       altivec_u16;
114     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int)         altivec_u32;
115     #if defined(__UINT_FAST32_TYPE__) && (defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE))
116       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(__UINT_FAST32_TYPE__) altivec_u32f;
117     #else
118       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int)         altivec_u32f;
119     #endif
120       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float)                altivec_f32;
121     #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
122       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long)   altivec_i64;
123       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64;
124       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double)             altivec_f64;
125     #endif
126   #endif
127 } simde__m128i_private;
128 
129 typedef union {
130   #if defined(SIMDE_VECTOR_SUBSCRIPT)
131     SIMDE_ALIGN_TO_16 int8_t          i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
132     SIMDE_ALIGN_TO_16 int16_t        i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
133     SIMDE_ALIGN_TO_16 int32_t        i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
134     SIMDE_ALIGN_TO_16 int64_t        i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
135     SIMDE_ALIGN_TO_16 uint8_t         u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
136     SIMDE_ALIGN_TO_16 uint16_t       u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
137     SIMDE_ALIGN_TO_16 uint32_t       u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
138     SIMDE_ALIGN_TO_16 uint64_t       u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
139     SIMDE_ALIGN_TO_16 simde_float32  f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
140     SIMDE_ALIGN_TO_16 simde_float64  f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
141     SIMDE_ALIGN_TO_16 int_fast32_t  i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
142     SIMDE_ALIGN_TO_16 uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
143   #else
144     SIMDE_ALIGN_TO_16 int8_t         i8[16];
145     SIMDE_ALIGN_TO_16 int16_t        i16[8];
146     SIMDE_ALIGN_TO_16 int32_t        i32[4];
147     SIMDE_ALIGN_TO_16 int64_t        i64[2];
148     SIMDE_ALIGN_TO_16 uint8_t        u8[16];
149     SIMDE_ALIGN_TO_16 uint16_t       u16[8];
150     SIMDE_ALIGN_TO_16 uint32_t       u32[4];
151     SIMDE_ALIGN_TO_16 uint64_t       u64[2];
152     SIMDE_ALIGN_TO_16 simde_float32  f32[4];
153     SIMDE_ALIGN_TO_16 simde_float64  f64[2];
154     SIMDE_ALIGN_TO_16 int_fast32_t  i32f[16 / sizeof(int_fast32_t)];
155     SIMDE_ALIGN_TO_16 uint_fast32_t u32f[16 / sizeof(uint_fast32_t)];
156   #endif
157 
158     SIMDE_ALIGN_TO_16 simde__m64_private m64_private[2];
159     SIMDE_ALIGN_TO_16 simde__m64         m64[2];
160 
161   #if defined(SIMDE_X86_SSE2_NATIVE)
162     SIMDE_ALIGN_TO_16 __m128d        n;
163   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
164     SIMDE_ALIGN_TO_16 int8x16_t      neon_i8;
165     SIMDE_ALIGN_TO_16 int16x8_t      neon_i16;
166     SIMDE_ALIGN_TO_16 int32x4_t      neon_i32;
167     SIMDE_ALIGN_TO_16 int64x2_t      neon_i64;
168     SIMDE_ALIGN_TO_16 uint8x16_t     neon_u8;
169     SIMDE_ALIGN_TO_16 uint16x8_t     neon_u16;
170     SIMDE_ALIGN_TO_16 uint32x4_t     neon_u32;
171     SIMDE_ALIGN_TO_16 uint64x2_t     neon_u64;
172     SIMDE_ALIGN_TO_16 float32x4_t    neon_f32;
173     #if defined(SIMDE_ARCH_AARCH64)
174     SIMDE_ALIGN_TO_16 float64x2_t    neon_f64;
175     #endif
176   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
177     SIMDE_ALIGN_TO_16 v128_t         wasm_v128;
178   #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
179     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char)          altivec_i8;
180     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short)         altivec_i16;
181     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int)           altivec_i32;
182     #if defined(__INT_FAST32_TYPE__) && (defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE))
183       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(__INT_FAST32_TYPE__)  altivec_i32f;
184     #else
185       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int)           altivec_i32f;
186     #endif
187     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char)        altivec_u8;
188     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short)       altivec_u16;
189     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int)         altivec_u32;
190     #if defined(__UINT_FAST32_TYPE__) && (defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE))
191       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(__UINT_FAST32_TYPE__) altivec_u32f;
192     #else
193       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int)         altivec_u32f;
194     #endif
195     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float)                altivec_f32;
196     #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
197       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long)   altivec_i64;
198       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64;
199       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double)             altivec_f64;
200     #endif
201   #endif
202 } simde__m128d_private;
203 
204 #if defined(SIMDE_X86_SSE2_NATIVE)
205   typedef __m128i simde__m128i;
206   typedef __m128d simde__m128d;
207 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
208    typedef int64x2_t simde__m128i;
209 #  if defined(SIMDE_ARCH_AARCH64)
210      typedef float64x2_t simde__m128d;
211 #  elif defined(SIMDE_VECTOR_SUBSCRIPT)
212      typedef simde_float64 simde__m128d SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
213 #  else
214      typedef simde__m128d_private simde__m128d;
215 #  endif
216 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
217    typedef v128_t simde__m128i;
218    typedef v128_t simde__m128d;
219 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
220   typedef SIMDE_POWER_ALTIVEC_VECTOR(float) simde__m128i;
221   #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
222      typedef SIMDE_POWER_ALTIVEC_VECTOR(double) simde__m128d;
223   #else
224      typedef simde__m128d_private simde__m128d;
225   #endif
226 #elif defined(SIMDE_VECTOR_SUBSCRIPT)
227   typedef int64_t simde__m128i SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
228   typedef simde_float64 simde__m128d SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
229 #else
230   typedef simde__m128i_private simde__m128i;
231   typedef simde__m128d_private simde__m128d;
232 #endif
233 
234 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
235   typedef simde__m128i __m128i;
236   typedef simde__m128d __m128d;
237 #endif
238 
239 HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i), "simde__m128i size incorrect");
240 HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i_private), "simde__m128i_private size incorrect");
241 HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d), "simde__m128d size incorrect");
242 HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d_private), "simde__m128d_private size incorrect");
243 #if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF)
244 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128i) == 16, "simde__m128i is not 16-byte aligned");
245 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128i_private) == 16, "simde__m128i_private is not 16-byte aligned");
246 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128d) == 16, "simde__m128d is not 16-byte aligned");
247 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128d_private) == 16, "simde__m128d_private is not 16-byte aligned");
248 #endif
249 
250 SIMDE_FUNCTION_ATTRIBUTES
251 simde__m128i
simde__m128i_from_private(simde__m128i_private v)252 simde__m128i_from_private(simde__m128i_private v) {
253   simde__m128i r;
254   simde_memcpy(&r, &v, sizeof(r));
255   return r;
256 }
257 
258 SIMDE_FUNCTION_ATTRIBUTES
259 simde__m128i_private
simde__m128i_to_private(simde__m128i v)260 simde__m128i_to_private(simde__m128i v) {
261   simde__m128i_private r;
262   simde_memcpy(&r, &v, sizeof(r));
263   return r;
264 }
265 
266 SIMDE_FUNCTION_ATTRIBUTES
267 simde__m128d
simde__m128d_from_private(simde__m128d_private v)268 simde__m128d_from_private(simde__m128d_private v) {
269   simde__m128d r;
270   simde_memcpy(&r, &v, sizeof(r));
271   return r;
272 }
273 
274 SIMDE_FUNCTION_ATTRIBUTES
275 simde__m128d_private
simde__m128d_to_private(simde__m128d v)276 simde__m128d_to_private(simde__m128d v) {
277   simde__m128d_private r;
278   simde_memcpy(&r, &v, sizeof(r));
279   return r;
280 }
281 
282 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i,int8x16_t,neon,i8)283   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int8x16_t, neon, i8)
284   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int16x8_t, neon, i16)
285   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int32x4_t, neon, i32)
286   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int64x2_t, neon, i64)
287   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint8x16_t, neon, u8)
288   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint16x8_t, neon, u16)
289   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint32x4_t, neon, u32)
290   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint64x2_t, neon, u64)
291   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, float32x4_t, neon, f32)
292   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
293     SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, float64x2_t, neon, f64)
294   #endif
295 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
296   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed char), altivec, i8)
297   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed short), altivec, i16)
298   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed int), altivec, i32)
299   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8)
300   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16)
301   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), altivec, u32)
302   #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
303     SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64)
304     SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64)
305   #endif
306 #endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
307 
308 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
309   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int8x16_t, neon, i8)
310   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int16x8_t, neon, i16)
311   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int32x4_t, neon, i32)
312   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int64x2_t, neon, i64)
313   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint8x16_t, neon, u8)
314   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint16x8_t, neon, u16)
315   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint32x4_t, neon, u32)
316   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint64x2_t, neon, u64)
317   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, float32x4_t, neon, f32)
318   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
319     SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, float64x2_t, neon, f64)
320   #endif
321 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
322   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed char), altivec, i8)
323   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed short), altivec, i16)
324   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed int), altivec, i32)
325   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8)
326   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16)
327   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), altivec, u32)
328   #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
329     SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64)
330     SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64)
331     #if defined(SIMDE_BUG_GCC_95782)
332       SIMDE_FUNCTION_ATTRIBUTES
333       SIMDE_POWER_ALTIVEC_VECTOR(double)
334       simde__m128d_to_altivec_f64(simde__m128d value) {
335         simde__m128d_private r_ = simde__m128d_to_private(value);
336         return r_.altivec_f64;
337       }
338 
339       SIMDE_FUNCTION_ATTRIBUTES
340       simde__m128d
341       simde__m128d_from_altivec_f64(SIMDE_POWER_ALTIVEC_VECTOR(double) value) {
342         simde__m128d_private r_;
343         r_.altivec_f64 = value;
344         return simde__m128d_from_private(r_);
345       }
346     #else
347       SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(double), altivec, f64)
348     #endif
349   #endif
350 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
351   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v128_t, wasm, v128);
352   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v128_t, wasm, v128);
353 #endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
354 
355 SIMDE_FUNCTION_ATTRIBUTES
356 simde__m128d
357 simde_mm_set_pd (simde_float64 e1, simde_float64 e0) {
358   #if defined(SIMDE_X86_SSE2_NATIVE)
359     return _mm_set_pd(e1, e0);
360   #else
361     simde__m128d_private r_;
362 
363     #if defined(SIMDE_WASM_SIMD128_NATIVE)
364       r_.wasm_v128 = wasm_f64x2_make(e0, e1);
365     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
366       SIMDE_ALIGN_TO_16 simde_float64 data[2] = { e0, e1 };
367       r_.neon_f64 = vld1q_f64(data);
368     #else
369       r_.f64[0] = e0;
370       r_.f64[1] = e1;
371     #endif
372 
373     return simde__m128d_from_private(r_);
374   #endif
375 }
376 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
377   #define _mm_set_pd(e1, e0) simde_mm_set_pd(e1, e0)
378 #endif
379 
380 SIMDE_FUNCTION_ATTRIBUTES
381 simde__m128d
simde_mm_set1_pd(simde_float64 a)382 simde_mm_set1_pd (simde_float64 a) {
383   #if defined(SIMDE_X86_SSE2_NATIVE)
384     return _mm_set1_pd(a);
385   #else
386     simde__m128d_private r_;
387 
388     #if defined(SIMDE_WASM_SIMD128_NATIVE)
389       r_.wasm_v128 = wasm_f64x2_splat(a);
390     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
391       r_.neon_f64 = vdupq_n_f64(a);
392     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
393       r_.altivec_f64 = vec_splats(HEDLEY_STATIC_CAST(double, a));
394     #else
395       SIMDE_VECTORIZE
396       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
397         r_.f64[i] = a;
398       }
399     #endif
400 
401     return simde__m128d_from_private(r_);
402   #endif
403 }
404 #define simde_mm_set_pd1(a) simde_mm_set1_pd(a)
405 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
406   #define _mm_set1_pd(a) simde_mm_set1_pd(a)
407   #define _mm_set_pd1(a) simde_mm_set1_pd(a)
408 #endif
409 
410 SIMDE_FUNCTION_ATTRIBUTES
411 simde__m128d
simde_x_mm_abs_pd(simde__m128d a)412 simde_x_mm_abs_pd(simde__m128d a) {
413   #if defined(SIMDE_X86_SSE2_NATIVE)
414     simde_float64 mask_;
415     uint64_t u64_ = UINT64_C(0x7FFFFFFFFFFFFFFF);
416     simde_memcpy(&mask_, &u64_, sizeof(u64_));
417     return _mm_and_pd(_mm_set1_pd(mask_), a);
418   #else
419     simde__m128d_private
420       r_,
421       a_ = simde__m128d_to_private(a);
422 
423     #if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
424       r_.neon_f64 = vabsq_f64(a_.neon_f64);
425     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
426       r_.altivec_f64 = vec_abs(a_.altivec_f64);
427     #else
428       SIMDE_VECTORIZE
429       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
430         r_.f64[i] = simde_math_fabs(a_.f64[i]);
431       }
432     #endif
433 
434     return simde__m128d_from_private(r_);
435   #endif
436 }
437 
438 SIMDE_FUNCTION_ATTRIBUTES
439 simde__m128d
simde_x_mm_not_pd(simde__m128d a)440 simde_x_mm_not_pd(simde__m128d a) {
441   #if defined(SIMDE_X86_AVX512VL_NATIVE)
442     __m128i ai = _mm_castpd_si128(a);
443     return _mm_castsi128_pd(_mm_ternarylogic_epi64(ai, ai, ai, 0x55));
444   #else
445     simde__m128d_private
446       r_,
447       a_ = simde__m128d_to_private(a);
448 
449     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
450       r_.neon_i32 = vmvnq_s32(a_.neon_i32);
451     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
452       r_.altivec_f64 = vec_nor(a_.altivec_f64, a_.altivec_f64);
453     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
454       r_.altivec_i32 = vec_nor(a_.altivec_i32, a_.altivec_i32);
455     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
456       r_.wasm_v128 = wasm_v128_not(a_.wasm_v128);
457     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
458       r_.i32f = ~a_.i32f;
459     #else
460       SIMDE_VECTORIZE
461       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
462         r_.i32f[i] = ~(a_.i32f[i]);
463       }
464     #endif
465 
466     return simde__m128d_from_private(r_);
467   #endif
468 }
469 
470 SIMDE_FUNCTION_ATTRIBUTES
471 simde__m128d
simde_x_mm_select_pd(simde__m128d a,simde__m128d b,simde__m128d mask)472 simde_x_mm_select_pd(simde__m128d a, simde__m128d b, simde__m128d mask) {
473   /* This function is for when you want to blend two elements together
474    * according to a mask.  It is similar to _mm_blendv_pd, except that
475    * it is undefined whether the blend is based on the highest bit in
476    * each lane (like blendv) or just bitwise operations.  This allows
477    * us to implement the function efficiently everywhere.
478    *
479    * Basically, you promise that all the lanes in mask are either 0 or
480    * ~0. */
481   #if defined(SIMDE_X86_SSE4_1_NATIVE)
482     return _mm_blendv_pd(a, b, mask);
483   #else
484     simde__m128d_private
485       r_,
486       a_ = simde__m128d_to_private(a),
487       b_ = simde__m128d_to_private(b),
488       mask_ = simde__m128d_to_private(mask);
489 
490     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
491       r_.i64 = a_.i64 ^ ((a_.i64 ^ b_.i64) & mask_.i64);
492     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
493       r_.neon_i64 = vbslq_s64(mask_.neon_u64, b_.neon_i64, a_.neon_i64);
494     #else
495       SIMDE_VECTORIZE
496       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
497         r_.i64[i] = a_.i64[i] ^ ((a_.i64[i] ^ b_.i64[i]) & mask_.i64[i]);
498       }
499     #endif
500 
501     return simde__m128d_from_private(r_);
502   #endif
503 }
504 
505 SIMDE_FUNCTION_ATTRIBUTES
506 simde__m128i
simde_mm_add_epi8(simde__m128i a,simde__m128i b)507 simde_mm_add_epi8 (simde__m128i a, simde__m128i b) {
508   #if defined(SIMDE_X86_SSE2_NATIVE)
509     return _mm_add_epi8(a, b);
510   #else
511     simde__m128i_private
512       r_,
513       a_ = simde__m128i_to_private(a),
514       b_ = simde__m128i_to_private(b);
515 
516     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
517       r_.neon_i8 = vaddq_s8(a_.neon_i8, b_.neon_i8);
518     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
519       r_.altivec_i8 = vec_add(a_.altivec_i8, b_.altivec_i8);
520     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
521       r_.wasm_v128 = wasm_i8x16_add(a_.wasm_v128, b_.wasm_v128);
522     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
523       r_.i8 = a_.i8 + b_.i8;
524     #else
525       SIMDE_VECTORIZE
526       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
527         r_.i8[i] = a_.i8[i] + b_.i8[i];
528       }
529     #endif
530 
531     return simde__m128i_from_private(r_);
532   #endif
533 }
534 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
535   #define _mm_add_epi8(a, b) simde_mm_add_epi8(a, b)
536 #endif
537 
538 SIMDE_FUNCTION_ATTRIBUTES
539 simde__m128i
simde_mm_add_epi16(simde__m128i a,simde__m128i b)540 simde_mm_add_epi16 (simde__m128i a, simde__m128i b) {
541   #if defined(SIMDE_X86_SSE2_NATIVE)
542     return _mm_add_epi16(a, b);
543   #else
544     simde__m128i_private
545       r_,
546       a_ = simde__m128i_to_private(a),
547       b_ = simde__m128i_to_private(b);
548 
549     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
550       r_.neon_i16 = vaddq_s16(a_.neon_i16, b_.neon_i16);
551     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
552       r_.altivec_i16 = vec_add(a_.altivec_i16, b_.altivec_i16);
553     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
554       r_.wasm_v128 = wasm_i16x8_add(a_.wasm_v128, b_.wasm_v128);
555     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
556       r_.i16 = a_.i16 + b_.i16;
557     #else
558       SIMDE_VECTORIZE
559       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
560         r_.i16[i] = a_.i16[i] + b_.i16[i];
561       }
562     #endif
563 
564     return simde__m128i_from_private(r_);
565   #endif
566 }
567 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
568   #define _mm_add_epi16(a, b) simde_mm_add_epi16(a, b)
569 #endif
570 
571 SIMDE_FUNCTION_ATTRIBUTES
572 simde__m128i
simde_mm_add_epi32(simde__m128i a,simde__m128i b)573 simde_mm_add_epi32 (simde__m128i a, simde__m128i b) {
574   #if defined(SIMDE_X86_SSE2_NATIVE)
575     return _mm_add_epi32(a, b);
576   #else
577     simde__m128i_private
578       r_,
579       a_ = simde__m128i_to_private(a),
580       b_ = simde__m128i_to_private(b);
581 
582     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
583       r_.neon_i32 = vaddq_s32(a_.neon_i32, b_.neon_i32);
584     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
585       r_.altivec_i32 = vec_add(a_.altivec_i32, b_.altivec_i32);
586     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
587       r_.wasm_v128 = wasm_i32x4_add(a_.wasm_v128, b_.wasm_v128);
588     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
589       r_.i32 = a_.i32 + b_.i32;
590     #else
591       SIMDE_VECTORIZE
592       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
593         r_.i32[i] = a_.i32[i] + b_.i32[i];
594       }
595     #endif
596 
597     return simde__m128i_from_private(r_);
598   #endif
599 }
600 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
601   #define _mm_add_epi32(a, b) simde_mm_add_epi32(a, b)
602 #endif
603 
604 SIMDE_FUNCTION_ATTRIBUTES
605 simde__m128i
simde_mm_add_epi64(simde__m128i a,simde__m128i b)606 simde_mm_add_epi64 (simde__m128i a, simde__m128i b) {
607   #if defined(SIMDE_X86_SSE2_NATIVE)
608     return _mm_add_epi64(a, b);
609   #else
610     simde__m128i_private
611       r_,
612       a_ = simde__m128i_to_private(a),
613       b_ = simde__m128i_to_private(b);
614 
615     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
616       r_.neon_i64 = vaddq_s64(a_.neon_i64, b_.neon_i64);
617     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
618       r_.altivec_i64 = vec_add(a_.altivec_i64, b_.altivec_i64);
619     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
620       r_.wasm_v128 = wasm_i64x2_add(a_.wasm_v128, b_.wasm_v128);
621     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
622       r_.i64 = a_.i64 + b_.i64;
623     #else
624       SIMDE_VECTORIZE
625       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
626         r_.i64[i] = a_.i64[i] + b_.i64[i];
627       }
628     #endif
629 
630     return simde__m128i_from_private(r_);
631   #endif
632 }
633 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
634   #define _mm_add_epi64(a, b) simde_mm_add_epi64(a, b)
635 #endif
636 
637 SIMDE_FUNCTION_ATTRIBUTES
638 simde__m128d
simde_mm_add_pd(simde__m128d a,simde__m128d b)639 simde_mm_add_pd (simde__m128d a, simde__m128d b) {
640   #if defined(SIMDE_X86_SSE2_NATIVE)
641     return _mm_add_pd(a, b);
642   #else
643     simde__m128d_private
644       r_,
645       a_ = simde__m128d_to_private(a),
646       b_ = simde__m128d_to_private(b);
647 
648     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
649       r_.neon_f64 = vaddq_f64(a_.neon_f64, b_.neon_f64);
650     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
651       r_.wasm_v128 = wasm_f64x2_add(a_.wasm_v128, b_.wasm_v128);
652     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
653       r_.altivec_f64 = vec_add(a_.altivec_f64, b_.altivec_f64);
654     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
655       r_.wasm_v128 = wasm_f64x2_add(a_.wasm_v128, b_.wasm_v128);
656     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
657       r_.f64 = a_.f64 + b_.f64;
658     #else
659       SIMDE_VECTORIZE
660       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
661         r_.f64[i] = a_.f64[i] + b_.f64[i];
662       }
663     #endif
664 
665     return simde__m128d_from_private(r_);
666   #endif
667 }
668 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
669   #define _mm_add_pd(a, b) simde_mm_add_pd(a, b)
670 #endif
671 
672 SIMDE_FUNCTION_ATTRIBUTES
673 simde__m128d
simde_mm_move_sd(simde__m128d a,simde__m128d b)674 simde_mm_move_sd (simde__m128d a, simde__m128d b) {
675   #if defined(SIMDE_X86_SSE2_NATIVE)
676     return _mm_move_sd(a, b);
677   #else
678     simde__m128d_private
679       r_,
680       a_ = simde__m128d_to_private(a),
681       b_ = simde__m128d_to_private(b);
682 
683     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
684       r_.neon_f64 = vsetq_lane_f64(vgetq_lane_f64(b_.neon_f64, 0), a_.neon_f64, 0);
685     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
686       #if defined(HEDLEY_IBM_VERSION)
687         r_.altivec_f64 = vec_xxpermdi(a_.altivec_f64, b_.altivec_f64, 1);
688       #else
689         r_.altivec_f64 = vec_xxpermdi(b_.altivec_f64, a_.altivec_f64, 1);
690       #endif
691     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
692       r_.wasm_v128 = wasm_v64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 2, 1);
693     #elif defined(SIMDE_SHUFFLE_VECTOR_)
694       r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 2, 1);
695     #else
696       r_.f64[0] = b_.f64[0];
697       r_.f64[1] = a_.f64[1];
698     #endif
699 
700     return simde__m128d_from_private(r_);
701   #endif
702 }
703 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
704   #define _mm_move_sd(a, b) simde_mm_move_sd(a, b)
705 #endif
706 
707 SIMDE_FUNCTION_ATTRIBUTES
708 simde__m128d
simde_mm_add_sd(simde__m128d a,simde__m128d b)709 simde_mm_add_sd (simde__m128d a, simde__m128d b) {
710   #if defined(SIMDE_X86_SSE2_NATIVE)
711     return _mm_add_sd(a, b);
712   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
713     return simde_mm_move_sd(a, simde_mm_add_pd(a, b));
714   #else
715     simde__m128d_private
716       r_,
717       a_ = simde__m128d_to_private(a),
718       b_ = simde__m128d_to_private(b);
719 
720     r_.f64[0] = a_.f64[0] + b_.f64[0];
721     r_.f64[1] = a_.f64[1];
722 
723     return simde__m128d_from_private(r_);
724   #endif
725 }
726 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
727   #define _mm_add_sd(a, b) simde_mm_add_sd(a, b)
728 #endif
729 
730 SIMDE_FUNCTION_ATTRIBUTES
731 simde__m64
simde_mm_add_si64(simde__m64 a,simde__m64 b)732 simde_mm_add_si64 (simde__m64 a, simde__m64 b) {
733   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
734     return _mm_add_si64(a, b);
735   #else
736     simde__m64_private
737       r_,
738       a_ = simde__m64_to_private(a),
739       b_ = simde__m64_to_private(b);
740 
741     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
742       r_.neon_i64 = vadd_s64(a_.neon_i64, b_.neon_i64);
743     #else
744       r_.i64[0] = a_.i64[0] + b_.i64[0];
745     #endif
746 
747     return simde__m64_from_private(r_);
748   #endif
749 }
750 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
751   #define _mm_add_si64(a, b) simde_mm_add_si64(a, b)
752 #endif
753 
754 SIMDE_FUNCTION_ATTRIBUTES
755 simde__m128i
simde_mm_adds_epi8(simde__m128i a,simde__m128i b)756 simde_mm_adds_epi8 (simde__m128i a, simde__m128i b) {
757   #if defined(SIMDE_X86_SSE2_NATIVE)
758     return _mm_adds_epi8(a, b);
759   #else
760     simde__m128i_private
761       r_,
762       a_ = simde__m128i_to_private(a),
763       b_ = simde__m128i_to_private(b);
764 
765     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
766       r_.neon_i8 = vqaddq_s8(a_.neon_i8, b_.neon_i8);
767     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
768       r_.wasm_v128 = wasm_i8x16_add_saturate(a_.wasm_v128, b_.wasm_v128);
769     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
770       r_.altivec_i8 = vec_adds(a_.altivec_i8, b_.altivec_i8);
771     #else
772       SIMDE_VECTORIZE
773       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
774         const int_fast16_t tmp =
775           HEDLEY_STATIC_CAST(int_fast16_t, a_.i8[i]) +
776           HEDLEY_STATIC_CAST(int_fast16_t, b_.i8[i]);
777         r_.i8[i] = HEDLEY_STATIC_CAST(int8_t, ((tmp < INT8_MAX) ? ((tmp > INT8_MIN) ? tmp : INT8_MIN) : INT8_MAX));
778       }
779     #endif
780 
781     return simde__m128i_from_private(r_);
782   #endif
783 }
784 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
785   #define _mm_adds_epi8(a, b) simde_mm_adds_epi8(a, b)
786 #endif
787 
788 SIMDE_FUNCTION_ATTRIBUTES
789 simde__m128i
simde_mm_adds_epi16(simde__m128i a,simde__m128i b)790 simde_mm_adds_epi16 (simde__m128i a, simde__m128i b) {
791   #if defined(SIMDE_X86_SSE2_NATIVE)
792     return _mm_adds_epi16(a, b);
793   #else
794     simde__m128i_private
795       r_,
796       a_ = simde__m128i_to_private(a),
797       b_ = simde__m128i_to_private(b);
798 
799     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
800       r_.neon_i16 = vqaddq_s16(a_.neon_i16, b_.neon_i16);
801     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
802       r_.wasm_v128 = wasm_i16x8_add_saturate(a_.wasm_v128, b_.wasm_v128);
803     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
804       r_.altivec_i16 = vec_adds(a_.altivec_i16, b_.altivec_i16);
805     #else
806       SIMDE_VECTORIZE
807       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
808         const int_fast32_t tmp =
809           HEDLEY_STATIC_CAST(int_fast32_t, a_.i16[i]) +
810           HEDLEY_STATIC_CAST(int_fast32_t, b_.i16[i]);
811         r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, ((tmp < INT16_MAX) ? ((tmp > INT16_MIN) ? tmp : INT16_MIN) : INT16_MAX));
812       }
813     #endif
814 
815     return simde__m128i_from_private(r_);
816   #endif
817 }
818 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
819   #define _mm_adds_epi16(a, b) simde_mm_adds_epi16(a, b)
820 #endif
821 
822 SIMDE_FUNCTION_ATTRIBUTES
823 simde__m128i
simde_mm_adds_epu8(simde__m128i a,simde__m128i b)824 simde_mm_adds_epu8 (simde__m128i a, simde__m128i b) {
825   #if defined(SIMDE_X86_SSE2_NATIVE)
826     return _mm_adds_epu8(a, b);
827   #else
828     simde__m128i_private
829       r_,
830       a_ = simde__m128i_to_private(a),
831       b_ = simde__m128i_to_private(b);
832 
833     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
834       r_.neon_u8 = vqaddq_u8(a_.neon_u8, b_.neon_u8);
835     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
836       r_.wasm_v128 = wasm_u8x16_add_saturate(a_.wasm_v128, b_.wasm_v128);
837     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
838       r_.altivec_u8 = vec_adds(a_.altivec_u8, b_.altivec_u8);
839     #else
840       SIMDE_VECTORIZE
841       for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
842         r_.u8[i] = ((UINT8_MAX - a_.u8[i]) > b_.u8[i]) ? (a_.u8[i] + b_.u8[i]) : UINT8_MAX;
843       }
844     #endif
845 
846     return simde__m128i_from_private(r_);
847   #endif
848 }
849 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
850   #define _mm_adds_epu8(a, b) simde_mm_adds_epu8(a, b)
851 #endif
852 
853 SIMDE_FUNCTION_ATTRIBUTES
854 simde__m128i
simde_mm_adds_epu16(simde__m128i a,simde__m128i b)855 simde_mm_adds_epu16 (simde__m128i a, simde__m128i b) {
856   #if defined(SIMDE_X86_SSE2_NATIVE)
857     return _mm_adds_epu16(a, b);
858   #else
859     simde__m128i_private
860       r_,
861       a_ = simde__m128i_to_private(a),
862       b_ = simde__m128i_to_private(b);
863 
864     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
865       r_.neon_u16 = vqaddq_u16(a_.neon_u16, b_.neon_u16);
866     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
867       r_.wasm_v128 = wasm_u16x8_add_saturate(a_.wasm_v128, b_.wasm_v128);
868     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
869       r_.altivec_u16 = vec_adds(a_.altivec_u16, b_.altivec_u16);
870     #else
871       SIMDE_VECTORIZE
872       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
873         r_.u16[i] = ((UINT16_MAX - a_.u16[i]) > b_.u16[i]) ? (a_.u16[i] + b_.u16[i]) : UINT16_MAX;
874       }
875     #endif
876 
877     return simde__m128i_from_private(r_);
878   #endif
879 }
880 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
881   #define _mm_adds_epu16(a, b) simde_mm_adds_epu16(a, b)
882 #endif
883 
884 SIMDE_FUNCTION_ATTRIBUTES
885 simde__m128d
simde_mm_and_pd(simde__m128d a,simde__m128d b)886 simde_mm_and_pd (simde__m128d a, simde__m128d b) {
887   #if defined(SIMDE_X86_SSE2_NATIVE)
888     return _mm_and_pd(a, b);
889   #else
890     simde__m128d_private
891       r_,
892       a_ = simde__m128d_to_private(a),
893       b_ = simde__m128d_to_private(b);
894 
895     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
896       r_.neon_i32 = vandq_s32(a_.neon_i32, b_.neon_i32);
897     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
898       r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128);
899     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
900       r_.altivec_f64 = vec_and(a_.altivec_f64, b_.altivec_f64);
901     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
902       r_.i32f = a_.i32f & b_.i32f;
903     #else
904       SIMDE_VECTORIZE
905       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
906         r_.i32f[i] = a_.i32f[i] & b_.i32f[i];
907       }
908     #endif
909 
910     return simde__m128d_from_private(r_);
911   #endif
912 }
913 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
914   #define _mm_and_pd(a, b) simde_mm_and_pd(a, b)
915 #endif
916 
917 SIMDE_FUNCTION_ATTRIBUTES
918 simde__m128i
simde_mm_and_si128(simde__m128i a,simde__m128i b)919 simde_mm_and_si128 (simde__m128i a, simde__m128i b) {
920   #if defined(SIMDE_X86_SSE2_NATIVE)
921     return _mm_and_si128(a, b);
922   #else
923     simde__m128i_private
924       r_,
925       a_ = simde__m128i_to_private(a),
926       b_ = simde__m128i_to_private(b);
927 
928     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
929       r_.neon_i32 = vandq_s32(b_.neon_i32, a_.neon_i32);
930     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
931       r_.altivec_u32f = vec_and(a_.altivec_u32f, b_.altivec_u32f);
932     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
933       r_.i32f = a_.i32f & b_.i32f;
934     #else
935       SIMDE_VECTORIZE
936       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
937         r_.i32f[i] = a_.i32f[i] & b_.i32f[i];
938       }
939     #endif
940 
941     return simde__m128i_from_private(r_);
942   #endif
943 }
944 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
945   #define _mm_and_si128(a, b) simde_mm_and_si128(a, b)
946 #endif
947 
948 SIMDE_FUNCTION_ATTRIBUTES
949 simde__m128d
simde_mm_andnot_pd(simde__m128d a,simde__m128d b)950 simde_mm_andnot_pd (simde__m128d a, simde__m128d b) {
951   #if defined(SIMDE_X86_SSE2_NATIVE)
952     return _mm_andnot_pd(a, b);
953   #else
954     simde__m128d_private
955       r_,
956       a_ = simde__m128d_to_private(a),
957       b_ = simde__m128d_to_private(b);
958 
959     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
960       r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32);
961     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
962       r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128);
963     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
964       r_.altivec_f64 = vec_andc(b_.altivec_f64, a_.altivec_f64);
965     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
966       r_.altivec_i32f = vec_andc(b_.altivec_i32f, a_.altivec_i32f);
967     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
968       r_.i32f = ~a_.i32f & b_.i32f;
969     #else
970       SIMDE_VECTORIZE
971       for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
972         r_.u64[i] = ~a_.u64[i] & b_.u64[i];
973       }
974     #endif
975 
976     return simde__m128d_from_private(r_);
977   #endif
978 }
979 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
980   #define _mm_andnot_pd(a, b) simde_mm_andnot_pd(a, b)
981 #endif
982 
983 SIMDE_FUNCTION_ATTRIBUTES
984 simde__m128i
simde_mm_andnot_si128(simde__m128i a,simde__m128i b)985 simde_mm_andnot_si128 (simde__m128i a, simde__m128i b) {
986   #if defined(SIMDE_X86_SSE2_NATIVE)
987     return _mm_andnot_si128(a, b);
988   #else
989     simde__m128i_private
990       r_,
991       a_ = simde__m128i_to_private(a),
992       b_ = simde__m128i_to_private(b);
993 
994     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
995       r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32);
996     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
997       r_.altivec_i32 = vec_andc(b_.altivec_i32, a_.altivec_i32);
998     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
999       r_.i32f = ~a_.i32f & b_.i32f;
1000     #else
1001       SIMDE_VECTORIZE
1002       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
1003         r_.i32f[i] = ~(a_.i32f[i]) & b_.i32f[i];
1004       }
1005     #endif
1006 
1007     return simde__m128i_from_private(r_);
1008   #endif
1009 }
1010 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1011   #define _mm_andnot_si128(a, b) simde_mm_andnot_si128(a, b)
1012 #endif
1013 
1014 SIMDE_FUNCTION_ATTRIBUTES
1015 simde__m128d
simde_mm_xor_pd(simde__m128d a,simde__m128d b)1016 simde_mm_xor_pd (simde__m128d a, simde__m128d b) {
1017   #if defined(SIMDE_X86_SSE2_NATIVE)
1018     return _mm_xor_pd(a, b);
1019   #else
1020     simde__m128d_private
1021       r_,
1022       a_ = simde__m128d_to_private(a),
1023       b_ = simde__m128d_to_private(b);
1024 
1025     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1026       r_.i32f = a_.i32f ^ b_.i32f;
1027     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1028       r_.wasm_v128 = wasm_v128_xor(a_.wasm_v128, b_.wasm_v128);
1029     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1030       r_.neon_i64 = veorq_s64(a_.neon_i64, b_.neon_i64);
1031     #else
1032       SIMDE_VECTORIZE
1033       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
1034         r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i];
1035       }
1036     #endif
1037 
1038     return simde__m128d_from_private(r_);
1039   #endif
1040 }
1041 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1042   #define _mm_xor_pd(a, b) simde_mm_xor_pd(a, b)
1043 #endif
1044 
1045 SIMDE_FUNCTION_ATTRIBUTES
1046 simde__m128i
simde_mm_avg_epu8(simde__m128i a,simde__m128i b)1047 simde_mm_avg_epu8 (simde__m128i a, simde__m128i b) {
1048   #if defined(SIMDE_X86_SSE2_NATIVE)
1049     return _mm_avg_epu8(a, b);
1050   #else
1051     simde__m128i_private
1052       r_,
1053       a_ = simde__m128i_to_private(a),
1054       b_ = simde__m128i_to_private(b);
1055 
1056     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1057       r_.neon_u8 = vrhaddq_u8(b_.neon_u8, a_.neon_u8);
1058     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1059       r_.wasm_v128 = wasm_u8x16_avgr(a_.wasm_v128, b_.wasm_v128);
1060     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1061       r_.altivec_u8 = vec_avg(a_.altivec_u8, b_.altivec_u8);
1062     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_)
1063       uint16_t wa SIMDE_VECTOR(32);
1064       uint16_t wb SIMDE_VECTOR(32);
1065       uint16_t wr SIMDE_VECTOR(32);
1066       SIMDE_CONVERT_VECTOR_(wa, a_.u8);
1067       SIMDE_CONVERT_VECTOR_(wb, b_.u8);
1068       wr = (wa + wb + 1) >> 1;
1069       SIMDE_CONVERT_VECTOR_(r_.u8, wr);
1070     #else
1071       SIMDE_VECTORIZE
1072       for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
1073         r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1;
1074       }
1075     #endif
1076 
1077     return simde__m128i_from_private(r_);
1078   #endif
1079 }
1080 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1081   #define _mm_avg_epu8(a, b) simde_mm_avg_epu8(a, b)
1082 #endif
1083 
1084 SIMDE_FUNCTION_ATTRIBUTES
1085 simde__m128i
simde_mm_avg_epu16(simde__m128i a,simde__m128i b)1086 simde_mm_avg_epu16 (simde__m128i a, simde__m128i b) {
1087   #if defined(SIMDE_X86_SSE2_NATIVE)
1088     return _mm_avg_epu16(a, b);
1089   #else
1090     simde__m128i_private
1091       r_,
1092       a_ = simde__m128i_to_private(a),
1093       b_ = simde__m128i_to_private(b);
1094 
1095     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1096       r_.neon_u16 = vrhaddq_u16(b_.neon_u16, a_.neon_u16);
1097     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1098       r_.wasm_v128 = wasm_u16x8_avgr(a_.wasm_v128, b_.wasm_v128);
1099     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1100       r_.altivec_u16 = vec_avg(a_.altivec_u16, b_.altivec_u16);
1101     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_)
1102       uint32_t wa SIMDE_VECTOR(32);
1103       uint32_t wb SIMDE_VECTOR(32);
1104       uint32_t wr SIMDE_VECTOR(32);
1105       SIMDE_CONVERT_VECTOR_(wa, a_.u16);
1106       SIMDE_CONVERT_VECTOR_(wb, b_.u16);
1107       wr = (wa + wb + 1) >> 1;
1108       SIMDE_CONVERT_VECTOR_(r_.u16, wr);
1109     #else
1110       SIMDE_VECTORIZE
1111       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
1112         r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1;
1113       }
1114     #endif
1115 
1116     return simde__m128i_from_private(r_);
1117   #endif
1118 }
1119 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1120   #define _mm_avg_epu16(a, b) simde_mm_avg_epu16(a, b)
1121 #endif
1122 
1123 SIMDE_FUNCTION_ATTRIBUTES
1124 simde__m128i
simde_mm_setzero_si128(void)1125 simde_mm_setzero_si128 (void) {
1126   #if defined(SIMDE_X86_SSE2_NATIVE)
1127     return _mm_setzero_si128();
1128   #else
1129     simde__m128i_private r_;
1130 
1131     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1132       r_.neon_i32 = vdupq_n_s32(0);
1133     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1134       r_.altivec_i32 = vec_splats(HEDLEY_STATIC_CAST(signed int, 0));
1135     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1136       r_.wasm_v128 = wasm_i32x4_splat(INT32_C(0));
1137     #elif defined(SIMDE_VECTOR_SUBSCRIPT)
1138       r_.i32 = __extension__ (__typeof__(r_.i32)) { 0, 0, 0, 0 };
1139     #else
1140       SIMDE_VECTORIZE
1141       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
1142         r_.i32f[i] = 0;
1143       }
1144     #endif
1145 
1146     return simde__m128i_from_private(r_);
1147   #endif
1148 }
1149 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1150   #define _mm_setzero_si128() (simde_mm_setzero_si128())
1151 #endif
1152 
1153 SIMDE_FUNCTION_ATTRIBUTES
1154 simde__m128i
simde_mm_bslli_si128(simde__m128i a,const int imm8)1155 simde_mm_bslli_si128 (simde__m128i a, const int imm8)
1156     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
1157   simde__m128i_private
1158     r_,
1159     a_ = simde__m128i_to_private(a);
1160 
1161   if (HEDLEY_UNLIKELY((imm8 & ~15))) {
1162     return simde_mm_setzero_si128();
1163   }
1164 
1165   #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && defined(SIMDE_ENDIAN_ORDER)
1166     r_.altivec_i8 =
1167       #if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
1168         vec_slo
1169       #else /* SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG */
1170         vec_sro
1171       #endif
1172         (a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, imm8 * 8)));
1173   #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1174     r_.altivec_i8 = vec_srb(a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, (imm8 & 15) << 3)));
1175   #elif defined(SIMDE_HAVE_INT128_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
1176     r_.u128[0] = a_.u128[0] << (imm8 * 8);
1177   #else
1178     r_ = simde__m128i_to_private(simde_mm_setzero_si128());
1179     for (int i = imm8 ; i < HEDLEY_STATIC_CAST(int, sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1180       r_.i8[i] = a_.i8[i - imm8];
1181     }
1182   #endif
1183 
1184   return simde__m128i_from_private(r_);
1185 }
1186 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
1187   #define simde_mm_bslli_si128(a, imm8) _mm_slli_si128(a, imm8)
1188 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__)
1189   #define simde_mm_bslli_si128(a, imm8) \
1190   simde__m128i_from_neon_i8(((imm8) <= 0) ? simde__m128i_to_neon_i8(a) : (((imm8) > 15) ? (vdupq_n_s8(0)) : (vextq_s8(vdupq_n_s8(0), simde__m128i_to_neon_i8(a), 16 - (imm8)))))
1191 #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && !defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1192   #define simde_mm_bslli_si128(a, imm8) (__extension__ ({ \
1193     const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
1194     const simde__m128i_private simde__tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
1195     simde__m128i_private simde__tmp_r_; \
1196     if (HEDLEY_UNLIKELY(imm8 > 15)) { \
1197       simde__tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
1198     } else { \
1199       simde__tmp_r_.i8 = \
1200         SIMDE_SHUFFLE_VECTOR_(8, 16, \
1201           simde__tmp_z_.i8, \
1202           (simde__tmp_a_).i8, \
1203           HEDLEY_STATIC_CAST(int8_t, (16 - imm8) & 31), \
1204           HEDLEY_STATIC_CAST(int8_t, (17 - imm8) & 31), \
1205           HEDLEY_STATIC_CAST(int8_t, (18 - imm8) & 31), \
1206           HEDLEY_STATIC_CAST(int8_t, (19 - imm8) & 31), \
1207           HEDLEY_STATIC_CAST(int8_t, (20 - imm8) & 31), \
1208           HEDLEY_STATIC_CAST(int8_t, (21 - imm8) & 31), \
1209           HEDLEY_STATIC_CAST(int8_t, (22 - imm8) & 31), \
1210           HEDLEY_STATIC_CAST(int8_t, (23 - imm8) & 31), \
1211           HEDLEY_STATIC_CAST(int8_t, (24 - imm8) & 31), \
1212           HEDLEY_STATIC_CAST(int8_t, (25 - imm8) & 31), \
1213           HEDLEY_STATIC_CAST(int8_t, (26 - imm8) & 31), \
1214           HEDLEY_STATIC_CAST(int8_t, (27 - imm8) & 31), \
1215           HEDLEY_STATIC_CAST(int8_t, (28 - imm8) & 31), \
1216           HEDLEY_STATIC_CAST(int8_t, (29 - imm8) & 31), \
1217           HEDLEY_STATIC_CAST(int8_t, (30 - imm8) & 31), \
1218           HEDLEY_STATIC_CAST(int8_t, (31 - imm8) & 31)); \
1219     } \
1220     simde__m128i_from_private(simde__tmp_r_); }))
1221 #endif
1222 #define simde_mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
1223 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1224   #define _mm_bslli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
1225   #define _mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
1226 #endif
1227 
1228 SIMDE_FUNCTION_ATTRIBUTES
1229 simde__m128i
simde_mm_bsrli_si128(simde__m128i a,const int imm8)1230 simde_mm_bsrli_si128 (simde__m128i a, const int imm8)
1231     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
1232   simde__m128i_private
1233     r_,
1234     a_ = simde__m128i_to_private(a);
1235 
1236   if (HEDLEY_UNLIKELY((imm8 & ~15))) {
1237     return simde_mm_setzero_si128();
1238   }
1239 
1240   #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && defined(SIMDE_ENDIAN_ORDER)
1241     r_.altivec_i8 =
1242     #if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
1243       vec_sro
1244     #else /* SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG */
1245       vec_slo
1246     #endif
1247         (a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, imm8 * 8)));
1248   #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1249     r_.altivec_i8 = vec_slb(a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, (imm8 & 15) << 3)));
1250   #else
1251     SIMDE_VECTORIZE
1252     for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1253       const int e = HEDLEY_STATIC_CAST(int, i) + imm8;
1254       r_.i8[i] = (e < 16) ? a_.i8[e] : 0;
1255     }
1256   #endif
1257 
1258   return simde__m128i_from_private(r_);
1259 }
1260 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
1261   #define simde_mm_bsrli_si128(a, imm8) _mm_srli_si128(a, imm8)
1262 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__)
1263   #define simde_mm_bsrli_si128(a, imm8) \
1264   simde__m128i_from_neon_i8(((imm8 < 0) || (imm8 > 15)) ? vdupq_n_s8(0) : (vextq_s8(simde__m128i_to_private(a).neon_i8, vdupq_n_s8(0), ((imm8 & 15) != 0) ? imm8 : (imm8 & 15))))
1265 #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && !defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1266   #define simde_mm_bsrli_si128(a, imm8) (__extension__ ({ \
1267     const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
1268     const simde__m128i_private simde__tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
1269     simde__m128i_private simde__tmp_r_ = simde__m128i_to_private(a); \
1270     if (HEDLEY_UNLIKELY(imm8 > 15)) { \
1271       simde__tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
1272     } else { \
1273       simde__tmp_r_.i8 = \
1274       SIMDE_SHUFFLE_VECTOR_(8, 16, \
1275         simde__tmp_z_.i8, \
1276         (simde__tmp_a_).i8, \
1277         HEDLEY_STATIC_CAST(int8_t, (imm8 + 16) & 31), \
1278         HEDLEY_STATIC_CAST(int8_t, (imm8 + 17) & 31), \
1279         HEDLEY_STATIC_CAST(int8_t, (imm8 + 18) & 31), \
1280         HEDLEY_STATIC_CAST(int8_t, (imm8 + 19) & 31), \
1281         HEDLEY_STATIC_CAST(int8_t, (imm8 + 20) & 31), \
1282         HEDLEY_STATIC_CAST(int8_t, (imm8 + 21) & 31), \
1283         HEDLEY_STATIC_CAST(int8_t, (imm8 + 22) & 31), \
1284         HEDLEY_STATIC_CAST(int8_t, (imm8 + 23) & 31), \
1285         HEDLEY_STATIC_CAST(int8_t, (imm8 + 24) & 31), \
1286         HEDLEY_STATIC_CAST(int8_t, (imm8 + 25) & 31), \
1287         HEDLEY_STATIC_CAST(int8_t, (imm8 + 26) & 31), \
1288         HEDLEY_STATIC_CAST(int8_t, (imm8 + 27) & 31), \
1289         HEDLEY_STATIC_CAST(int8_t, (imm8 + 28) & 31), \
1290         HEDLEY_STATIC_CAST(int8_t, (imm8 + 29) & 31), \
1291         HEDLEY_STATIC_CAST(int8_t, (imm8 + 30) & 31), \
1292         HEDLEY_STATIC_CAST(int8_t, (imm8 + 31) & 31)); \
1293     } \
1294     simde__m128i_from_private(simde__tmp_r_); }))
1295 #endif
1296 #define simde_mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
1297 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1298   #define _mm_bsrli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
1299   #define _mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
1300 #endif
1301 
1302 SIMDE_FUNCTION_ATTRIBUTES
1303 void
simde_mm_clflush(void const * p)1304 simde_mm_clflush (void const* p) {
1305   #if defined(SIMDE_X86_SSE2_NATIVE)
1306     _mm_clflush(p);
1307   #else
1308     (void) p;
1309   #endif
1310 }
1311 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_clflush(p) simde_mm_clflush(p)
1313 #endif
1314 
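/* The simde_mm_comi*_sd functions compare the low double-precision lanes of
 * a and b and return the result as an int (0 or 1).  A minimal sketch
 * (hypothetical values, not from this file):
 *
 *   simde__m128d x = simde_mm_set_sd(1.0);
 *   simde__m128d y = simde_mm_set_sd(2.0);
 *   int lt = simde_mm_comilt_sd(x, y);   // 1, since 1.0 < 2.0
 */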
1315 SIMDE_FUNCTION_ATTRIBUTES
1316 int
simde_mm_comieq_sd (simde__m128d a, simde__m128d b) {
1318   #if defined(SIMDE_X86_SSE2_NATIVE)
1319     return _mm_comieq_sd(a, b);
1320   #else
1321     simde__m128d_private
1322       a_ = simde__m128d_to_private(a),
1323       b_ = simde__m128d_to_private(b);
1324     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1325       return !!vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0);
1326     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1327       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) == wasm_f64x2_extract_lane(b_.wasm_v128, 0);
1328     #else
1329       return a_.f64[0] == b_.f64[0];
1330     #endif
1331   #endif
1332 }
1333 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1334   #define _mm_comieq_sd(a, b) simde_mm_comieq_sd(a, b)
1335 #endif
1336 
1337 SIMDE_FUNCTION_ATTRIBUTES
1338 int
simde_mm_comige_sd (simde__m128d a, simde__m128d b) {
1340   #if defined(SIMDE_X86_SSE2_NATIVE)
1341     return _mm_comige_sd(a, b);
1342   #else
1343     simde__m128d_private
1344       a_ = simde__m128d_to_private(a),
1345       b_ = simde__m128d_to_private(b);
1346     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1347       return !!vgetq_lane_u64(vcgeq_f64(a_.neon_f64, b_.neon_f64), 0);
1348     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1349       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) >= wasm_f64x2_extract_lane(b_.wasm_v128, 0);
1350     #else
1351       return a_.f64[0] >= b_.f64[0];
1352     #endif
1353   #endif
1354 }
1355 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1356   #define _mm_comige_sd(a, b) simde_mm_comige_sd(a, b)
1357 #endif
1358 
1359 SIMDE_FUNCTION_ATTRIBUTES
1360 int
simde_mm_comigt_sd (simde__m128d a, simde__m128d b) {
1362   #if defined(SIMDE_X86_SSE2_NATIVE)
1363     return _mm_comigt_sd(a, b);
1364   #else
1365     simde__m128d_private
1366       a_ = simde__m128d_to_private(a),
1367       b_ = simde__m128d_to_private(b);
1368     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1369       return !!vgetq_lane_u64(vcgtq_f64(a_.neon_f64, b_.neon_f64), 0);
1370     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1371       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) > wasm_f64x2_extract_lane(b_.wasm_v128, 0);
1372     #else
1373       return a_.f64[0] > b_.f64[0];
1374     #endif
1375   #endif
1376 }
1377 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1378   #define _mm_comigt_sd(a, b) simde_mm_comigt_sd(a, b)
1379 #endif
1380 
1381 SIMDE_FUNCTION_ATTRIBUTES
1382 int
simde_mm_comile_sd (simde__m128d a, simde__m128d b) {
1384   #if defined(SIMDE_X86_SSE2_NATIVE)
1385     return _mm_comile_sd(a, b);
1386   #else
1387     simde__m128d_private
1388       a_ = simde__m128d_to_private(a),
1389       b_ = simde__m128d_to_private(b);
1390     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1391       return !!vgetq_lane_u64(vcleq_f64(a_.neon_f64, b_.neon_f64), 0);
1392     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1393       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) <= wasm_f64x2_extract_lane(b_.wasm_v128, 0);
1394     #else
1395       return a_.f64[0] <= b_.f64[0];
1396     #endif
1397   #endif
1398 }
1399 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1400   #define _mm_comile_sd(a, b) simde_mm_comile_sd(a, b)
1401 #endif
1402 
1403 SIMDE_FUNCTION_ATTRIBUTES
1404 int
simde_mm_comilt_sd (simde__m128d a, simde__m128d b) {
1406   #if defined(SIMDE_X86_SSE2_NATIVE)
1407     return _mm_comilt_sd(a, b);
1408   #else
1409     simde__m128d_private
1410       a_ = simde__m128d_to_private(a),
1411       b_ = simde__m128d_to_private(b);
1412     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1413       return !!vgetq_lane_u64(vcltq_f64(a_.neon_f64, b_.neon_f64), 0);
1414     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1415       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) < wasm_f64x2_extract_lane(b_.wasm_v128, 0);
1416     #else
1417       return a_.f64[0] < b_.f64[0];
1418     #endif
1419   #endif
1420 }
1421 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1422   #define _mm_comilt_sd(a, b) simde_mm_comilt_sd(a, b)
1423 #endif
1424 
1425 SIMDE_FUNCTION_ATTRIBUTES
1426 int
simde_mm_comineq_sd (simde__m128d a, simde__m128d b) {
1428   #if defined(SIMDE_X86_SSE2_NATIVE)
1429     return _mm_comineq_sd(a, b);
1430   #else
1431     simde__m128d_private
1432       a_ = simde__m128d_to_private(a),
1433       b_ = simde__m128d_to_private(b);
1434     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1435       return !vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0);
1436     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1437       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) != wasm_f64x2_extract_lane(b_.wasm_v128, 0);
1438     #else
1439       return a_.f64[0] != b_.f64[0];
1440     #endif
1441   #endif
1442 }
1443 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1444   #define _mm_comineq_sd(a, b) simde_mm_comineq_sd(a, b)
1445 #endif
1446 
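/* simde_x_mm_copysign_pd is a SIMDe-internal helper (no x86 equivalent): each
 * lane of the result is the magnitude of dest with the sign of the
 * corresponding lane of src, i.e. a vectorized copysign(). */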
1447 SIMDE_FUNCTION_ATTRIBUTES
1448 simde__m128d
simde_x_mm_copysign_pd(simde__m128d dest, simde__m128d src) {
1450   simde__m128d_private
1451     r_,
1452     dest_ = simde__m128d_to_private(dest),
1453     src_ = simde__m128d_to_private(src);
1454 
1455   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1456     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1457       uint64x2_t sign_pos = vreinterpretq_u64_f64(vdupq_n_f64(-SIMDE_FLOAT64_C(0.0)));
1458     #else
1459       simde_float64 dbl_nz = -SIMDE_FLOAT64_C(0.0);
1460       uint64_t u64_nz;
1461       simde_memcpy(&u64_nz, &dbl_nz, sizeof(u64_nz));
1462       uint64x2_t sign_pos = vdupq_n_u64(u64_nz);
1463     #endif
1464     r_.neon_u64 = vbslq_u64(sign_pos, src_.neon_u64, dest_.neon_u64);
1465   #elif defined(SIMDE_POWER_ALTIVEC_P9_NATIVE)
1466     #if !defined(HEDLEY_IBM_VERSION)
1467       r_.altivec_f64 = vec_cpsgn(dest_.altivec_f64, src_.altivec_f64);
1468     #else
1469       r_.altivec_f64 = vec_cpsgn(src_.altivec_f64, dest_.altivec_f64);
1470     #endif
1471   #elif defined(simde_math_copysign)
1472     SIMDE_VECTORIZE
1473     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1474       r_.f64[i] = simde_math_copysign(dest_.f64[i], src_.f64[i]);
1475     }
1476   #else
1477     simde__m128d sgnbit = simde_mm_set1_pd(-SIMDE_FLOAT64_C(0.0));
1478     return simde_mm_xor_pd(simde_mm_and_pd(sgnbit, src), simde_mm_andnot_pd(sgnbit, dest));
1479   #endif
1480 
1481   return simde__m128d_from_private(r_);
1482 }
1483 
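/* simde_x_mm_xorsign_pd is a SIMDe-internal helper: it flips the sign of each
 * lane of dest wherever the corresponding lane of src is negative, by XORing
 * src's sign bits into dest. */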
1484 SIMDE_FUNCTION_ATTRIBUTES
1485 simde__m128d
simde_x_mm_xorsign_pd(simde__m128d dest, simde__m128d src) {
1487   return simde_mm_xor_pd(simde_mm_and_pd(simde_mm_set1_pd(-0.0), src), dest);
1488 }
1489 
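/* The simde_mm_cast* functions below reinterpret the 128 bits of a vector as
 * another vector type without converting any values (the analogue of a
 * reinterpret_cast).  A minimal sketch (hypothetical values):
 *
 *   simde__m128i bits = simde_mm_set1_epi32(0x3f800000);
 *   simde__m128  ones = simde_mm_castsi128_ps(bits);   // every lane is 1.0f
 */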
1490 SIMDE_FUNCTION_ATTRIBUTES
1491 simde__m128
simde_mm_castpd_ps (simde__m128d a) {
1493   #if defined(SIMDE_X86_SSE2_NATIVE)
1494     return _mm_castpd_ps(a);
1495   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1496     return vreinterpretq_f32_f64(a);
1497   #else
1498     simde__m128 r;
1499     simde_memcpy(&r, &a, sizeof(a));
1500     return r;
1501   #endif
1502 }
1503 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1504   #define _mm_castpd_ps(a) simde_mm_castpd_ps(a)
1505 #endif
1506 
1507 SIMDE_FUNCTION_ATTRIBUTES
1508 simde__m128i
simde_mm_castpd_si128 (simde__m128d a) {
1510   #if defined(SIMDE_X86_SSE2_NATIVE)
1511     return _mm_castpd_si128(a);
1512   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1513     return vreinterpretq_s64_f64(a);
1514   #else
1515     simde__m128i r;
1516     simde_memcpy(&r, &a, sizeof(a));
1517     return r;
1518   #endif
1519 }
1520 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1521   #define _mm_castpd_si128(a) simde_mm_castpd_si128(a)
1522 #endif
1523 
1524 SIMDE_FUNCTION_ATTRIBUTES
1525 simde__m128d
simde_mm_castps_pd (simde__m128 a) {
1527   #if defined(SIMDE_X86_SSE2_NATIVE)
1528     return _mm_castps_pd(a);
1529   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1530     return vreinterpretq_f64_f32(a);
1531   #else
1532     simde__m128d r;
1533     simde_memcpy(&r, &a, sizeof(a));
1534     return r;
1535   #endif
1536 }
1537 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1538   #define _mm_castps_pd(a) simde_mm_castps_pd(a)
1539 #endif
1540 
1541 SIMDE_FUNCTION_ATTRIBUTES
1542 simde__m128i
simde_mm_castps_si128 (simde__m128 a) {
1544   #if defined(SIMDE_X86_SSE2_NATIVE)
1545     return _mm_castps_si128(a);
1546   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1547     return simde__m128i_from_neon_i32(simde__m128_to_private(a).neon_i32);
1548   #else
1549     simde__m128i r;
1550     simde_memcpy(&r, &a, sizeof(a));
1551     return r;
1552   #endif
1553 }
1554 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1555   #define _mm_castps_si128(a) simde_mm_castps_si128(a)
1556 #endif
1557 
1558 SIMDE_FUNCTION_ATTRIBUTES
1559 simde__m128d
simde_mm_castsi128_pd (simde__m128i a) {
1561   #if defined(SIMDE_X86_SSE2_NATIVE)
1562     return _mm_castsi128_pd(a);
1563   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1564     return vreinterpretq_f64_s64(a);
1565   #else
1566     simde__m128d r;
1567     simde_memcpy(&r, &a, sizeof(a));
1568     return r;
1569   #endif
1570 }
1571 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1572   #define _mm_castsi128_pd(a) simde_mm_castsi128_pd(a)
1573 #endif
1574 
1575 SIMDE_FUNCTION_ATTRIBUTES
1576 simde__m128
simde_mm_castsi128_ps (simde__m128i a) {
1578   #if defined(SIMDE_X86_SSE2_NATIVE)
1579     return _mm_castsi128_ps(a);
1580   #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1581     return HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), a);
1582   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1583     return simde__m128_from_neon_i32(simde__m128i_to_private(a).neon_i32);
1584   #else
1585     simde__m128 r;
1586     simde_memcpy(&r, &a, sizeof(a));
1587     return r;
1588   #endif
1589 }
1590 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1591   #define _mm_castsi128_ps(a) simde_mm_castsi128_ps(a)
1592 #endif
1593 
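/* simde_mm_cmpeq_epi8/epi16/epi32: lane-wise equality on packed integers.
 * Each result lane is all ones (e.g. 0xFF for epi8) where the inputs are
 * equal and all zeros otherwise, matching the x86 PCMPEQ* semantics. */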
1594 SIMDE_FUNCTION_ATTRIBUTES
1595 simde__m128i
simde_mm_cmpeq_epi8 (simde__m128i a, simde__m128i b) {
1597   #if defined(SIMDE_X86_SSE2_NATIVE)
1598     return _mm_cmpeq_epi8(a, b);
1599   #else
1600     simde__m128i_private
1601       r_,
1602       a_ = simde__m128i_to_private(a),
1603       b_ = simde__m128i_to_private(b);
1604 
1605     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1606       r_.neon_u8 = vceqq_s8(b_.neon_i8, a_.neon_i8);
1607     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1608       r_.wasm_v128 = wasm_i8x16_eq(a_.wasm_v128, b_.wasm_v128);
1609     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1610       r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmpeq(a_.altivec_i8, b_.altivec_i8));
1611     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1612       r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 == b_.i8));
1613     #else
1614       SIMDE_VECTORIZE
1615       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1616         r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
1617       }
1618     #endif
1619 
1620     return simde__m128i_from_private(r_);
1621   #endif
1622 }
1623 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1624   #define _mm_cmpeq_epi8(a, b) simde_mm_cmpeq_epi8(a, b)
1625 #endif
1626 
1627 SIMDE_FUNCTION_ATTRIBUTES
1628 simde__m128i
simde_mm_cmpeq_epi16 (simde__m128i a, simde__m128i b) {
1630   #if defined(SIMDE_X86_SSE2_NATIVE)
1631     return _mm_cmpeq_epi16(a, b);
1632   #else
1633     simde__m128i_private
1634       r_,
1635       a_ = simde__m128i_to_private(a),
1636       b_ = simde__m128i_to_private(b);
1637 
1638     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1639       r_.neon_u16 = vceqq_s16(b_.neon_i16, a_.neon_i16);
1640     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1641       r_.wasm_v128 = wasm_i16x8_eq(a_.wasm_v128, b_.wasm_v128);
1642     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1643       r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmpeq(a_.altivec_i16, b_.altivec_i16));
1644     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1645       r_.i16 = (a_.i16 == b_.i16);
1646     #else
1647       SIMDE_VECTORIZE
1648       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1649         r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
1650       }
1651     #endif
1652 
1653     return simde__m128i_from_private(r_);
1654   #endif
1655 }
1656 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1657   #define _mm_cmpeq_epi16(a, b) simde_mm_cmpeq_epi16(a, b)
1658 #endif
1659 
1660 SIMDE_FUNCTION_ATTRIBUTES
1661 simde__m128i
simde_mm_cmpeq_epi32 (simde__m128i a, simde__m128i b) {
1663   #if defined(SIMDE_X86_SSE2_NATIVE)
1664     return _mm_cmpeq_epi32(a, b);
1665   #else
1666     simde__m128i_private
1667       r_,
1668       a_ = simde__m128i_to_private(a),
1669       b_ = simde__m128i_to_private(b);
1670 
1671     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1672       r_.neon_u32 = vceqq_s32(b_.neon_i32, a_.neon_i32);
1673     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1674       r_.wasm_v128 = wasm_i32x4_eq(a_.wasm_v128, b_.wasm_v128);
1675     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1676       r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmpeq(a_.altivec_i32, b_.altivec_i32));
1677     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1678       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), a_.i32 == b_.i32);
1679     #else
1680       SIMDE_VECTORIZE
1681       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1682         r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
1683       }
1684     #endif
1685 
1686     return simde__m128i_from_private(r_);
1687   #endif
1688 }
1689 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1690   #define _mm_cmpeq_epi32(a, b) simde_mm_cmpeq_epi32(a, b)
1691 #endif
1692 
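/* The double-precision compares below use the same convention: a result lane
 * is ~UINT64_C(0) when the predicate holds and 0 otherwise.  The _sd variants
 * compare only the low lane and copy the high lane of a through unchanged. */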
1693 SIMDE_FUNCTION_ATTRIBUTES
1694 simde__m128d
simde_mm_cmpeq_pd (simde__m128d a, simde__m128d b) {
1696   #if defined(SIMDE_X86_SSE2_NATIVE)
1697     return _mm_cmpeq_pd(a, b);
1698   #else
1699     simde__m128d_private
1700       r_,
1701       a_ = simde__m128d_to_private(a),
1702       b_ = simde__m128d_to_private(b);
1703 
1704     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1705       r_.neon_u64 = vceqq_s64(b_.neon_i64, a_.neon_i64);
1706     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1707       r_.wasm_v128 = wasm_f64x2_eq(a_.wasm_v128, b_.wasm_v128);
1708     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1709       r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpeq(a_.altivec_f64, b_.altivec_f64));
1710     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1711       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 == b_.f64));
1712     #else
1713       SIMDE_VECTORIZE
1714       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1715         r_.u64[i] = (a_.f64[i] == b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1716       }
1717     #endif
1718 
1719     return simde__m128d_from_private(r_);
1720   #endif
1721 }
1722 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1723   #define _mm_cmpeq_pd(a, b) simde_mm_cmpeq_pd(a, b)
1724 #endif
1725 
1726 SIMDE_FUNCTION_ATTRIBUTES
1727 simde__m128d
simde_mm_cmpeq_sd (simde__m128d a, simde__m128d b) {
1729   #if defined(SIMDE_X86_SSE2_NATIVE)
1730     return _mm_cmpeq_sd(a, b);
1731   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1732     return simde_mm_move_sd(a, simde_mm_cmpeq_pd(a, b));
1733   #else
1734     simde__m128d_private
1735       r_,
1736       a_ = simde__m128d_to_private(a),
1737       b_ = simde__m128d_to_private(b);
1738 
1739     r_.u64[0] = (a_.u64[0] == b_.u64[0]) ? ~UINT64_C(0) : 0;
1740     r_.u64[1] = a_.u64[1];
1741 
1742     return simde__m128d_from_private(r_);
1743   #endif
1744 }
1745 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1746   #define _mm_cmpeq_sd(a, b) simde_mm_cmpeq_sd(a, b)
1747 #endif
1748 
1749 SIMDE_FUNCTION_ATTRIBUTES
1750 simde__m128d
simde_mm_cmpneq_pd (simde__m128d a, simde__m128d b) {
1752   #if defined(SIMDE_X86_SSE2_NATIVE)
1753     return _mm_cmpneq_pd(a, b);
1754   #else
1755     simde__m128d_private
1756       r_,
1757       a_ = simde__m128d_to_private(a),
1758       b_ = simde__m128d_to_private(b);
1759 
1760     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1761       r_.neon_u32 = vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(b_.neon_f64, a_.neon_f64)));
1762     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1763       r_.wasm_v128 = wasm_f64x2_ne(a_.wasm_v128, b_.wasm_v128);
1764     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1765       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 != b_.f64));
1766     #else
1767       SIMDE_VECTORIZE
1768       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1769         r_.u64[i] = (a_.f64[i] != b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1770       }
1771     #endif
1772 
1773     return simde__m128d_from_private(r_);
1774   #endif
1775 }
1776 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1777   #define _mm_cmpneq_pd(a, b) simde_mm_cmpneq_pd(a, b)
1778 #endif
1779 
1780 SIMDE_FUNCTION_ATTRIBUTES
1781 simde__m128d
simde_mm_cmpneq_sd (simde__m128d a, simde__m128d b) {
1783   #if defined(SIMDE_X86_SSE2_NATIVE)
1784     return _mm_cmpneq_sd(a, b);
1785   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1786     return simde_mm_move_sd(a, simde_mm_cmpneq_pd(a, b));
1787   #else
1788     simde__m128d_private
1789       r_,
1790       a_ = simde__m128d_to_private(a),
1791       b_ = simde__m128d_to_private(b);
1792 
1793     r_.u64[0] = (a_.f64[0] != b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1794     r_.u64[1] = a_.u64[1];
1795 
1797     return simde__m128d_from_private(r_);
1798   #endif
1799 }
1800 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1801   #define _mm_cmpneq_sd(a, b) simde_mm_cmpneq_sd(a, b)
1802 #endif
1803 
1804 SIMDE_FUNCTION_ATTRIBUTES
1805 simde__m128i
simde_mm_cmplt_epi8 (simde__m128i a, simde__m128i b) {
1807   #if defined(SIMDE_X86_SSE2_NATIVE)
1808     return _mm_cmplt_epi8(a, b);
1809   #else
1810     simde__m128i_private
1811       r_,
1812       a_ = simde__m128i_to_private(a),
1813       b_ = simde__m128i_to_private(b);
1814 
1815     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1816       r_.neon_u8 = vcltq_s8(a_.neon_i8, b_.neon_i8);
1817     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1818       r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char),vec_cmplt(a_.altivec_i8, b_.altivec_i8));
1819     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1820       r_.wasm_v128 = wasm_i8x16_lt(a_.wasm_v128, b_.wasm_v128);
1821     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1822       r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 < b_.i8));
1823     #else
1824       SIMDE_VECTORIZE
1825       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1826         r_.i8[i] = (a_.i8[i] < b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
1827       }
1828     #endif
1829 
1830     return simde__m128i_from_private(r_);
1831   #endif
1832 }
1833 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1834   #define _mm_cmplt_epi8(a, b) simde_mm_cmplt_epi8(a, b)
1835 #endif
1836 
1837 SIMDE_FUNCTION_ATTRIBUTES
1838 simde__m128i
simde_mm_cmplt_epi16 (simde__m128i a, simde__m128i b) {
1840   #if defined(SIMDE_X86_SSE2_NATIVE)
1841     return _mm_cmplt_epi16(a, b);
1842   #else
1843     simde__m128i_private
1844       r_,
1845       a_ = simde__m128i_to_private(a),
1846       b_ = simde__m128i_to_private(b);
1847 
1848     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1849       r_.neon_u16 = vcltq_s16(a_.neon_i16, b_.neon_i16);
1850     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1851       r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmplt(a_.altivec_i16, b_.altivec_i16));
1852     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1853       r_.wasm_v128 = wasm_i16x8_lt(a_.wasm_v128, b_.wasm_v128);
1854     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1855       r_.i16 = HEDLEY_STATIC_CAST(__typeof__(r_.i16), (a_.i16 < b_.i16));
1856     #else
1857       SIMDE_VECTORIZE
1858       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1859         r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
1860       }
1861     #endif
1862 
1863     return simde__m128i_from_private(r_);
1864   #endif
1865 }
1866 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1867   #define _mm_cmplt_epi16(a, b) simde_mm_cmplt_epi16(a, b)
1868 #endif
1869 
1870 SIMDE_FUNCTION_ATTRIBUTES
1871 simde__m128i
simde_mm_cmplt_epi32 (simde__m128i a, simde__m128i b) {
1873   #if defined(SIMDE_X86_SSE2_NATIVE)
1874     return _mm_cmplt_epi32(a, b);
1875   #else
1876     simde__m128i_private
1877       r_,
1878       a_ = simde__m128i_to_private(a),
1879       b_ = simde__m128i_to_private(b);
1880 
1881     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1882       r_.neon_u32 = vcltq_s32(a_.neon_i32, b_.neon_i32);
1883     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1884       r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmplt(a_.altivec_i32, b_.altivec_i32));
1885     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1886       r_.wasm_v128 = wasm_i32x4_lt(a_.wasm_v128, b_.wasm_v128);
1887     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1888       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.i32 < b_.i32));
1889     #else
1890       SIMDE_VECTORIZE
1891       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1892         r_.i32[i] = (a_.i32[i] < b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
1893       }
1894     #endif
1895 
1896     return simde__m128i_from_private(r_);
1897   #endif
1898 }
1899 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1900   #define _mm_cmplt_epi32(a, b) simde_mm_cmplt_epi32(a, b)
1901 #endif
1902 
1903 SIMDE_FUNCTION_ATTRIBUTES
1904 simde__m128d
simde_mm_cmplt_pd (simde__m128d a, simde__m128d b) {
1906   #if defined(SIMDE_X86_SSE2_NATIVE)
1907     return _mm_cmplt_pd(a, b);
1908   #else
1909     simde__m128d_private
1910       r_,
1911       a_ = simde__m128d_to_private(a),
1912       b_ = simde__m128d_to_private(b);
1913 
1914     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1915       r_.neon_u64 = vcltq_f64(a_.neon_f64, b_.neon_f64);
1916     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1917       r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmplt(a_.altivec_f64, b_.altivec_f64));
1918     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1919       r_.wasm_v128 = wasm_f64x2_lt(a_.wasm_v128, b_.wasm_v128);
1920     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1921       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 < b_.f64));
1922     #else
1923       SIMDE_VECTORIZE
1924       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1925         r_.u64[i] = (a_.f64[i] < b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1926       }
1927     #endif
1928 
1929     return simde__m128d_from_private(r_);
1930   #endif
1931 }
1932 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1933   #define _mm_cmplt_pd(a, b) simde_mm_cmplt_pd(a, b)
1934 #endif
1935 
1936 SIMDE_FUNCTION_ATTRIBUTES
1937 simde__m128d
simde_mm_cmplt_sd (simde__m128d a, simde__m128d b) {
1939   #if defined(SIMDE_X86_SSE2_NATIVE)
1940     return _mm_cmplt_sd(a, b);
1941   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1942     return simde_mm_move_sd(a, simde_mm_cmplt_pd(a, b));
1943   #else
1944     simde__m128d_private
1945       r_,
1946       a_ = simde__m128d_to_private(a),
1947       b_ = simde__m128d_to_private(b);
1948 
1949     r_.u64[0] = (a_.f64[0] < b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1950     r_.u64[1] = a_.u64[1];
1951 
1952     return simde__m128d_from_private(r_);
1953   #endif
1954 }
1955 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1956   #define _mm_cmplt_sd(a, b) simde_mm_cmplt_sd(a, b)
1957 #endif
1958 
1959 SIMDE_FUNCTION_ATTRIBUTES
1960 simde__m128d
simde_mm_cmple_pd (simde__m128d a, simde__m128d b) {
1962   #if defined(SIMDE_X86_SSE2_NATIVE)
1963     return _mm_cmple_pd(a, b);
1964   #else
1965     simde__m128d_private
1966       r_,
1967       a_ = simde__m128d_to_private(a),
1968       b_ = simde__m128d_to_private(b);
1969 
1970     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1971       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 <= b_.f64));
1972     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1973       r_.neon_u64 = vcleq_f64(a_.neon_f64, b_.neon_f64);
1974     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1975       r_.wasm_v128 = wasm_f64x2_le(a_.wasm_v128, b_.wasm_v128);
1976     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
1977       r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmple(a_.altivec_f64, b_.altivec_f64));
1978     #else
1979       SIMDE_VECTORIZE
1980       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1981         r_.u64[i] = (a_.f64[i] <= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1982       }
1983     #endif
1984 
1985     return simde__m128d_from_private(r_);
1986   #endif
1987 }
1988 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1989   #define _mm_cmple_pd(a, b) simde_mm_cmple_pd(a, b)
1990 #endif
1991 
1992 SIMDE_FUNCTION_ATTRIBUTES
1993 simde__m128d
simde_mm_cmple_sd (simde__m128d a, simde__m128d b) {
1995   #if defined(SIMDE_X86_SSE2_NATIVE)
1996     return _mm_cmple_sd(a, b);
1997   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1998     return simde_mm_move_sd(a, simde_mm_cmple_pd(a, b));
1999   #else
2000     simde__m128d_private
2001       r_,
2002       a_ = simde__m128d_to_private(a),
2003       b_ = simde__m128d_to_private(b);
2004 
2005     r_.u64[0] = (a_.f64[0] <= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
2006     r_.u64[1] = a_.u64[1];
2007 
2008     return simde__m128d_from_private(r_);
2009   #endif
2010 }
2011 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2012   #define _mm_cmple_sd(a, b) simde_mm_cmple_sd(a, b)
2013 #endif
2014 
2015 SIMDE_FUNCTION_ATTRIBUTES
2016 simde__m128i
simde_mm_cmpgt_epi8 (simde__m128i a, simde__m128i b) {
2018   #if defined(SIMDE_X86_SSE2_NATIVE)
2019     return _mm_cmpgt_epi8(a, b);
2020   #else
2021     simde__m128i_private
2022       r_,
2023       a_ = simde__m128i_to_private(a),
2024       b_ = simde__m128i_to_private(b);
2025 
2026     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2027       r_.neon_u8 = vcgtq_s8(a_.neon_i8, b_.neon_i8);
2028     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2029       r_.wasm_v128 = wasm_i8x16_gt(a_.wasm_v128, b_.wasm_v128);
2030     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
2031       r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmpgt(a_.altivec_i8, b_.altivec_i8));
2032     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2033       r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 > b_.i8));
2034     #else
2035       SIMDE_VECTORIZE
2036       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
2037         r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
2038       }
2039     #endif
2040 
2041     return simde__m128i_from_private(r_);
2042   #endif
2043 }
2044 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2045   #define _mm_cmpgt_epi8(a, b) simde_mm_cmpgt_epi8(a, b)
2046 #endif
2047 
2048 SIMDE_FUNCTION_ATTRIBUTES
2049 simde__m128i
simde_mm_cmpgt_epi16 (simde__m128i a, simde__m128i b) {
2051   #if defined(SIMDE_X86_SSE2_NATIVE)
2052     return _mm_cmpgt_epi16(a, b);
2053   #else
2054     simde__m128i_private
2055       r_,
2056       a_ = simde__m128i_to_private(a),
2057       b_ = simde__m128i_to_private(b);
2058 
2059     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2060       r_.neon_u16 = vcgtq_s16(a_.neon_i16, b_.neon_i16);
2061     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2062       r_.wasm_v128 = wasm_i16x8_gt(a_.wasm_v128, b_.wasm_v128);
2063     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
2064       r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmpgt(a_.altivec_i16, b_.altivec_i16));
2065     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2066       r_.i16 = HEDLEY_STATIC_CAST(__typeof__(r_.i16), (a_.i16 > b_.i16));
2067     #else
2068       SIMDE_VECTORIZE
2069       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
2070         r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
2071       }
2072     #endif
2073 
2074     return simde__m128i_from_private(r_);
2075   #endif
2076 }
2077 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2078   #define _mm_cmpgt_epi16(a, b) simde_mm_cmpgt_epi16(a, b)
2079 #endif
2080 
2081 SIMDE_FUNCTION_ATTRIBUTES
2082 simde__m128i
simde_mm_cmpgt_epi32 (simde__m128i a, simde__m128i b) {
2084   #if defined(SIMDE_X86_SSE2_NATIVE)
2085     return _mm_cmpgt_epi32(a, b);
2086   #else
2087     simde__m128i_private
2088       r_,
2089       a_ = simde__m128i_to_private(a),
2090       b_ = simde__m128i_to_private(b);
2091 
2092     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2093       r_.neon_u32 = vcgtq_s32(a_.neon_i32, b_.neon_i32);
2094     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2095       r_.wasm_v128 = wasm_i32x4_gt(a_.wasm_v128, b_.wasm_v128);
2096     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
2097       r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmpgt(a_.altivec_i32, b_.altivec_i32));
2098     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2099       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.i32 > b_.i32));
2100     #else
2101       SIMDE_VECTORIZE
2102       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
2103         r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
2104       }
2105     #endif
2106 
2107     return simde__m128i_from_private(r_);
2108   #endif
2109 }
2110 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2111   #define _mm_cmpgt_epi32(a, b) simde_mm_cmpgt_epi32(a, b)
2112 #endif
2113 
2114 SIMDE_FUNCTION_ATTRIBUTES
2115 simde__m128d
simde_mm_cmpgt_pd (simde__m128d a, simde__m128d b) {
2117   #if defined(SIMDE_X86_SSE2_NATIVE)
2118     return _mm_cmpgt_pd(a, b);
2119   #else
2120     simde__m128d_private
2121       r_,
2122       a_ = simde__m128d_to_private(a),
2123       b_ = simde__m128d_to_private(b);
2124 
2125     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2126       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 > b_.f64));
2127     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2128       r_.neon_u64 = vcgtq_f64(a_.neon_f64, b_.neon_f64);
2129     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2130       r_.wasm_v128 = wasm_f64x2_gt(a_.wasm_v128, b_.wasm_v128);
2131     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
2132       r_.altivec_f64 = HEDLEY_STATIC_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpgt(a_.altivec_f64, b_.altivec_f64));
2133     #else
2134       SIMDE_VECTORIZE
2135       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2136         r_.u64[i] = (a_.f64[i] > b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
2137       }
2138     #endif
2139 
2140     return simde__m128d_from_private(r_);
2141   #endif
2142 }
2143 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2144   #define _mm_cmpgt_pd(a, b) simde_mm_cmpgt_pd(a, b)
2145 #endif
2146 
2147 SIMDE_FUNCTION_ATTRIBUTES
2148 simde__m128d
simde_mm_cmpgt_sd (simde__m128d a, simde__m128d b) {
2150   #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
2151     return _mm_cmpgt_sd(a, b);
2152   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
2153     return simde_mm_move_sd(a, simde_mm_cmpgt_pd(a, b));
2154   #else
2155     simde__m128d_private
2156       r_,
2157       a_ = simde__m128d_to_private(a),
2158       b_ = simde__m128d_to_private(b);
2159 
2160     r_.u64[0] = (a_.f64[0] > b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
2161     r_.u64[1] = a_.u64[1];
2162 
2163     return simde__m128d_from_private(r_);
2164   #endif
2165 }
2166 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2167   #define _mm_cmpgt_sd(a, b) simde_mm_cmpgt_sd(a, b)
2168 #endif
2169 
2170 SIMDE_FUNCTION_ATTRIBUTES
2171 simde__m128d
simde_mm_cmpge_pd (simde__m128d a, simde__m128d b) {
2173   #if defined(SIMDE_X86_SSE2_NATIVE)
2174     return _mm_cmpge_pd(a, b);
2175   #else
2176     simde__m128d_private
2177       r_,
2178       a_ = simde__m128d_to_private(a),
2179       b_ = simde__m128d_to_private(b);
2180 
2181     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2182       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 >= b_.f64));
2183     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2184       r_.neon_u64 = vcgeq_f64(a_.neon_f64, b_.neon_f64);
2185     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2186       r_.wasm_v128 = wasm_f64x2_ge(a_.wasm_v128, b_.wasm_v128);
2187     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
2188       r_.altivec_f64 = HEDLEY_STATIC_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpge(a_.altivec_f64, b_.altivec_f64));
2189     #else
2190       SIMDE_VECTORIZE
2191       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2192         r_.u64[i] = (a_.f64[i] >= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
2193       }
2194     #endif
2195 
2196     return simde__m128d_from_private(r_);
2197   #endif
2198 }
2199 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2200   #define _mm_cmpge_pd(a, b) simde_mm_cmpge_pd(a, b)
2201 #endif
2202 
2203 SIMDE_FUNCTION_ATTRIBUTES
2204 simde__m128d
simde_mm_cmpge_sd (simde__m128d a, simde__m128d b) {
2206   #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
2207     return _mm_cmpge_sd(a, b);
2208   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
2209     return simde_mm_move_sd(a, simde_mm_cmpge_pd(a, b));
2210   #else
2211     simde__m128d_private
2212       r_,
2213       a_ = simde__m128d_to_private(a),
2214       b_ = simde__m128d_to_private(b);
2215 
2216     r_.u64[0] = (a_.f64[0] >= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
2217     r_.u64[1] = a_.u64[1];
2218 
2219     return simde__m128d_from_private(r_);
2220   #endif
2221 }
2222 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2223   #define _mm_cmpge_sd(a, b) simde_mm_cmpge_sd(a, b)
2224 #endif
2225 
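/* The cmpn*_pd/_sd ("not greater than", "not less than", ...) wrappers below
 * fall back to the inverse comparison when no native instruction is
 * available.  The two forms agree for ordinary values; they can differ only
 * when an operand is NaN, since the x86 "not" predicates report true for
 * unordered operands. */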
2226 SIMDE_FUNCTION_ATTRIBUTES
2227 simde__m128d
simde_mm_cmpngt_pd (simde__m128d a, simde__m128d b) {
2229   #if defined(SIMDE_X86_SSE2_NATIVE)
2230     return _mm_cmpngt_pd(a, b);
2231   #else
2232     return simde_mm_cmple_pd(a, b);
2233   #endif
2234 }
2235 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2236   #define _mm_cmpngt_pd(a, b) simde_mm_cmpngt_pd(a, b)
2237 #endif
2238 
2239 SIMDE_FUNCTION_ATTRIBUTES
2240 simde__m128d
simde_mm_cmpngt_sd (simde__m128d a, simde__m128d b) {
2242   #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
2243     return _mm_cmpngt_sd(a, b);
2244   #else
2245     return simde_mm_cmple_sd(a, b);
2246   #endif
2247 }
2248 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2249   #define _mm_cmpngt_sd(a, b) simde_mm_cmpngt_sd(a, b)
2250 #endif
2251 
2252 SIMDE_FUNCTION_ATTRIBUTES
2253 simde__m128d
simde_mm_cmpnge_pd (simde__m128d a, simde__m128d b) {
2255   #if defined(SIMDE_X86_SSE2_NATIVE)
2256     return _mm_cmpnge_pd(a, b);
2257   #else
2258     return simde_mm_cmplt_pd(a, b);
2259   #endif
2260 }
2261 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2262   #define _mm_cmpnge_pd(a, b) simde_mm_cmpnge_pd(a, b)
2263 #endif
2264 
2265 SIMDE_FUNCTION_ATTRIBUTES
2266 simde__m128d
simde_mm_cmpnge_sd (simde__m128d a, simde__m128d b) {
2268   #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
2269     return _mm_cmpnge_sd(a, b);
2270   #else
2271     return simde_mm_cmplt_sd(a, b);
2272   #endif
2273 }
2274 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2275   #define _mm_cmpnge_sd(a, b) simde_mm_cmpnge_sd(a, b)
2276 #endif
2277 
2278 SIMDE_FUNCTION_ATTRIBUTES
2279 simde__m128d
simde_mm_cmpnlt_pd (simde__m128d a, simde__m128d b) {
2281   #if defined(SIMDE_X86_SSE2_NATIVE)
2282     return _mm_cmpnlt_pd(a, b);
2283   #else
2284     return simde_mm_cmpge_pd(a, b);
2285   #endif
2286 }
2287 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2288   #define _mm_cmpnlt_pd(a, b) simde_mm_cmpnlt_pd(a, b)
2289 #endif
2290 
2291 SIMDE_FUNCTION_ATTRIBUTES
2292 simde__m128d
simde_mm_cmpnlt_sd (simde__m128d a, simde__m128d b) {
2294   #if defined(SIMDE_X86_SSE2_NATIVE)
2295     return _mm_cmpnlt_sd(a, b);
2296   #else
2297     return simde_mm_cmpge_sd(a, b);
2298   #endif
2299 }
2300 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2301   #define _mm_cmpnlt_sd(a, b) simde_mm_cmpnlt_sd(a, b)
2302 #endif
2303 
2304 SIMDE_FUNCTION_ATTRIBUTES
2305 simde__m128d
simde_mm_cmpnle_pd (simde__m128d a, simde__m128d b) {
2307   #if defined(SIMDE_X86_SSE2_NATIVE)
2308     return _mm_cmpnle_pd(a, b);
2309   #else
2310     return simde_mm_cmpgt_pd(a, b);
2311   #endif
2312 }
2313 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2314   #define _mm_cmpnle_pd(a, b) simde_mm_cmpnle_pd(a, b)
2315 #endif
2316 
2317 SIMDE_FUNCTION_ATTRIBUTES
2318 simde__m128d
simde_mm_cmpnle_sd (simde__m128d a, simde__m128d b) {
2320   #if defined(SIMDE_X86_SSE2_NATIVE)
2321     return _mm_cmpnle_sd(a, b);
2322   #else
2323     return simde_mm_cmpgt_sd(a, b);
2324   #endif
2325 }
2326 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2327   #define _mm_cmpnle_sd(a, b) simde_mm_cmpnle_sd(a, b)
2328 #endif
2329 
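/* simde_mm_cmpord_pd: a result lane is all ones when neither input lane is
 * NaN (the operands are "ordered").  simde_mm_cmpunord_pd further below is
 * the complement, flagging lanes where at least one input is NaN. */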
2330 SIMDE_FUNCTION_ATTRIBUTES
2331 simde__m128d
simde_mm_cmpord_pd (simde__m128d a, simde__m128d b) {
2333   #if defined(SIMDE_X86_SSE2_NATIVE)
2334     return _mm_cmpord_pd(a, b);
2335   #else
2336     simde__m128d_private
2337       r_,
2338       a_ = simde__m128d_to_private(a),
2339       b_ = simde__m128d_to_private(b);
2340 
2341     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      /* NEON has no ordered-compare builtin, so compare each operand with
         itself (a NaN never compares equal to itself) and AND the two masks
         to produce the final ordered result. */
2345       uint64x2_t ceqaa = vceqq_f64(a_.neon_f64, a_.neon_f64);
2346       uint64x2_t ceqbb = vceqq_f64(b_.neon_f64, b_.neon_f64);
2347       r_.neon_u64 = vandq_u64(ceqaa, ceqbb);
2348     #elif defined(simde_math_isnan)
2349       SIMDE_VECTORIZE
2350       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2351         r_.u64[i] = (!simde_math_isnan(a_.f64[i]) && !simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0);
2352       }
2353     #else
2354       HEDLEY_UNREACHABLE();
2355     #endif
2356 
2357     return simde__m128d_from_private(r_);
2358   #endif
2359 }
2360 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2361   #define _mm_cmpord_pd(a, b) simde_mm_cmpord_pd(a, b)
2362 #endif
2363 
2364 SIMDE_FUNCTION_ATTRIBUTES
2365 simde_float64
simde_mm_cvtsd_f64 (simde__m128d a) {
2367   #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
2368     return _mm_cvtsd_f64(a);
2369   #else
2370     simde__m128d_private a_ = simde__m128d_to_private(a);
2371     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2372       return HEDLEY_STATIC_CAST(simde_float64, vgetq_lane_f64(a_.neon_f64, 0));
2373     #else
2374       return a_.f64[0];
2375     #endif
2376   #endif
2377 }
2378 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2379   #define _mm_cvtsd_f64(a) simde_mm_cvtsd_f64(a)
2380 #endif
2381 
2382 SIMDE_FUNCTION_ATTRIBUTES
2383 simde__m128d
simde_mm_cmpord_sd (simde__m128d a, simde__m128d b) {
2385   #if defined(SIMDE_X86_SSE2_NATIVE)
2386     return _mm_cmpord_sd(a, b);
2387   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
2388     return simde_mm_move_sd(a, simde_mm_cmpord_pd(a, b));
2389   #else
2390     simde__m128d_private
2391       r_,
2392       a_ = simde__m128d_to_private(a),
2393       b_ = simde__m128d_to_private(b);
2394 
2395     #if defined(simde_math_isnan)
2396       r_.u64[0] = (!simde_math_isnan(a_.f64[0]) && !simde_math_isnan(b_.f64[0])) ? ~UINT64_C(0) : UINT64_C(0);
2397       r_.u64[1] = a_.u64[1];
2398     #else
2399       HEDLEY_UNREACHABLE();
2400     #endif
2401 
2402     return simde__m128d_from_private(r_);
2403   #endif
2404 }
2405 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2406   #define _mm_cmpord_sd(a, b) simde_mm_cmpord_sd(a, b)
2407 #endif
2408 
2409 SIMDE_FUNCTION_ATTRIBUTES
2410 simde__m128d
simde_mm_cmpunord_pd (simde__m128d a, simde__m128d b) {
2412   #if defined(SIMDE_X86_SSE2_NATIVE)
2413     return _mm_cmpunord_pd(a, b);
2414   #else
2415     simde__m128d_private
2416       r_,
2417       a_ = simde__m128d_to_private(a),
2418       b_ = simde__m128d_to_private(b);
2419 
2420     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2421       uint64x2_t ceqaa = vceqq_f64(a_.neon_f64, a_.neon_f64);
2422       uint64x2_t ceqbb = vceqq_f64(b_.neon_f64, b_.neon_f64);
2423       r_.neon_u64 = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(ceqaa, ceqbb))));
2424     #elif defined(simde_math_isnan)
2425       SIMDE_VECTORIZE
2426       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2427         r_.u64[i] = (simde_math_isnan(a_.f64[i]) || simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0);
2428       }
2429     #else
2430       HEDLEY_UNREACHABLE();
2431     #endif
2432 
2433     return simde__m128d_from_private(r_);
2434   #endif
2435 }
2436 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2437   #define _mm_cmpunord_pd(a, b) simde_mm_cmpunord_pd(a, b)
2438 #endif
2439 
2440 SIMDE_FUNCTION_ATTRIBUTES
2441 simde__m128d
simde_mm_cmpunord_sd (simde__m128d a, simde__m128d b) {
2443   #if defined(SIMDE_X86_SSE2_NATIVE)
2444     return _mm_cmpunord_sd(a, b);
2445   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
2446     return simde_mm_move_sd(a, simde_mm_cmpunord_pd(a, b));
2447   #else
2448     simde__m128d_private
2449       r_,
2450       a_ = simde__m128d_to_private(a),
2451       b_ = simde__m128d_to_private(b);
2452 
2453     #if defined(simde_math_isnan)
2454       r_.u64[0] = (simde_math_isnan(a_.f64[0]) || simde_math_isnan(b_.f64[0])) ? ~UINT64_C(0) : UINT64_C(0);
2455       r_.u64[1] = a_.u64[1];
2456     #else
2457       HEDLEY_UNREACHABLE();
2458     #endif
2459 
2460     return simde__m128d_from_private(r_);
2461   #endif
2462 }
2463 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2464   #define _mm_cmpunord_sd(a, b) simde_mm_cmpunord_sd(a, b)
2465 #endif
2466 
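/* simde_mm_cvtepi32_pd converts the two low signed 32-bit lanes of a to
 * double precision; every int32 value is exactly representable as a double,
 * so the conversion is lossless. */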
2467 SIMDE_FUNCTION_ATTRIBUTES
2468 simde__m128d
simde_mm_cvtepi32_pd (simde__m128i a) {
2470   #if defined(SIMDE_X86_SSE2_NATIVE)
2471     return _mm_cvtepi32_pd(a);
2472   #else
2473     simde__m128d_private r_;
2474     simde__m128i_private a_ = simde__m128i_to_private(a);
2475 
2476     #if defined(SIMDE_CONVERT_VECTOR_)
2477       SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].i32);
2478     #else
2479       SIMDE_VECTORIZE
2480       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2481         r_.f64[i] = (simde_float64) a_.i32[i];
2482       }
2483     #endif
2484 
2485     return simde__m128d_from_private(r_);
2486   #endif
2487 }
2488 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2489   #define _mm_cvtepi32_pd(a) simde_mm_cvtepi32_pd(a)
2490 #endif
2491 
2492 SIMDE_FUNCTION_ATTRIBUTES
2493 simde__m128
simde_mm_cvtepi32_ps (simde__m128i a) {
2495   #if defined(SIMDE_X86_SSE2_NATIVE)
2496     return _mm_cvtepi32_ps(a);
2497   #else
2498     simde__m128_private r_;
2499     simde__m128i_private a_ = simde__m128i_to_private(a);
2500 
2501     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2502       r_.neon_f32 = vcvtq_f32_s32(a_.neon_i32);
2503     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2504       r_.wasm_v128 = wasm_f32x4_convert_i32x4(a_.wasm_v128);
2505     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2506       HEDLEY_DIAGNOSTIC_PUSH
2507       #if HEDLEY_HAS_WARNING("-Wc11-extensions")
2508         #pragma clang diagnostic ignored "-Wc11-extensions"
2509       #endif
2510       r_.altivec_f32 = vec_ctf(a_.altivec_i32, 0);
2511       HEDLEY_DIAGNOSTIC_POP
2512     #elif defined(SIMDE_CONVERT_VECTOR_)
2513       SIMDE_CONVERT_VECTOR_(r_.f32, a_.i32);
2514     #else
2515       SIMDE_VECTORIZE
2516       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
2517         r_.f32[i] = (simde_float32) a_.i32[i];
2518       }
2519     #endif
2520 
2521     return simde__m128_from_private(r_);
2522   #endif
2523 }
2524 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2525   #define _mm_cvtepi32_ps(a) simde_mm_cvtepi32_ps(a)
2526 #endif
2527 
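/* simde_mm_cvtpd_pi32 / simde_mm_cvtpd_epi32 round both double lanes to
 * 32-bit integers.  Unless SIMDE_FAST_CONVERSION_RANGE is defined,
 * out-of-range inputs produce INT32_MIN, mirroring the x86 "integer
 * indefinite" result (0x80000000). */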
2528 SIMDE_FUNCTION_ATTRIBUTES
2529 simde__m64
simde_mm_cvtpd_pi32 (simde__m128d a) {
2531   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2532     return _mm_cvtpd_pi32(a);
2533   #else
2534     simde__m64_private r_;
2535     simde__m128d_private a_ = simde__m128d_to_private(a);
2536 
2537     SIMDE_VECTORIZE
2538     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
2539       simde_float64 v = simde_math_round(a_.f64[i]);
2540       #if defined(SIMDE_FAST_CONVERSION_RANGE)
2541         r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
2542       #else
2543         r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ?
2544           SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
2545       #endif
2546     }
2547 
2548     return simde__m64_from_private(r_);
2549   #endif
2550 }
2551 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2552   #define _mm_cvtpd_pi32(a) simde_mm_cvtpd_pi32(a)
2553 #endif
2554 
2555 SIMDE_FUNCTION_ATTRIBUTES
2556 simde__m128i
simde_mm_cvtpd_epi32 (simde__m128d a) {
2558   #if defined(SIMDE_X86_SSE2_NATIVE)
2559     return _mm_cvtpd_epi32(a);
2560   #else
2561     simde__m128i_private r_;
2562 
2563     r_.m64[0] = simde_mm_cvtpd_pi32(a);
2564     r_.m64[1] = simde_mm_setzero_si64();
2565 
2566     return simde__m128i_from_private(r_);
2567   #endif
2568 }
2569 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2570   #define _mm_cvtpd_epi32(a) simde_mm_cvtpd_epi32(a)
2571 #endif
2572 
2573 SIMDE_FUNCTION_ATTRIBUTES
2574 simde__m128
simde_mm_cvtpd_ps (simde__m128d a) {
2576   #if defined(SIMDE_X86_SSE2_NATIVE)
2577     return _mm_cvtpd_ps(a);
2578   #else
2579     simde__m128_private r_;
2580     simde__m128d_private a_ = simde__m128d_to_private(a);
2581 
2582     #if defined(SIMDE_CONVERT_VECTOR_)
2583       SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, a_.f64);
2584       r_.m64_private[1] = simde__m64_to_private(simde_mm_setzero_si64());
2585     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2586       r_.neon_f32 = vreinterpretq_f32_f64(vcombine_f64(vreinterpret_f64_f32(vcvtx_f32_f64(a_.neon_f64)), vdup_n_f64(0)));
2587     #else
2588       SIMDE_VECTORIZE
2589       for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
2590         r_.f32[i] = (simde_float32) a_.f64[i];
2591       }
2592       simde_memset(&(r_.m64_private[1]), 0, sizeof(r_.m64_private[1]));
2593     #endif
2594 
2595     return simde__m128_from_private(r_);
2596   #endif
2597 }
2598 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2599   #define _mm_cvtpd_ps(a) simde_mm_cvtpd_ps(a)
2600 #endif
2601 
2602 SIMDE_FUNCTION_ATTRIBUTES
2603 simde__m128d
simde_mm_cvtpi32_pd (simde__m64 a) {
2605   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2606     return _mm_cvtpi32_pd(a);
2607   #else
2608     simde__m128d_private r_;
2609     simde__m64_private a_ = simde__m64_to_private(a);
2610 
2611     #if defined(SIMDE_CONVERT_VECTOR_)
2612       SIMDE_CONVERT_VECTOR_(r_.f64, a_.i32);
2613     #else
2614       SIMDE_VECTORIZE
2615       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2616         r_.f64[i] = (simde_float64) a_.i32[i];
2617       }
2618     #endif
2619 
2620     return simde__m128d_from_private(r_);
2621   #endif
2622 }
2623 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2624   #define _mm_cvtpi32_pd(a) simde_mm_cvtpi32_pd(a)
2625 #endif
2626 
2627 SIMDE_FUNCTION_ATTRIBUTES
2628 simde__m128i
simde_mm_cvtps_epi32 (simde__m128 a) {
2630   #if defined(SIMDE_X86_SSE2_NATIVE)
2631     return _mm_cvtps_epi32(a);
2632   #else
2633     simde__m128i_private r_;
2634     simde__m128_private a_ = simde__m128_to_private(a);
2635 
2636     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE)
2637       r_.neon_i32 = vcvtnq_s32_f32(a_.neon_f32);
2638     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_ROUND_TIES) && !defined(SIMDE_BUG_GCC_95399)
2639       r_.neon_i32 = vcvtnq_s32_f32(a_.neon_f32);
2640     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_ROUND_TIES)
2641       HEDLEY_DIAGNOSTIC_PUSH
2642       SIMDE_DIAGNOSTIC_DISABLE_C11_EXTENSIONS_
2643       SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_
2644       r_.altivec_i32 = vec_cts(a_.altivec_f32, 1);
2645       HEDLEY_DIAGNOSTIC_POP
2646     #else
2647       a_ = simde__m128_to_private(simde_x_mm_round_ps(a, SIMDE_MM_FROUND_TO_NEAREST_INT, 1));
2648       SIMDE_VECTORIZE
2649       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
2650         simde_float32 v = simde_math_roundf(a_.f32[i]);
2651         #if defined(SIMDE_FAST_CONVERSION_RANGE)
2652           r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
2653         #else
2654           r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ?
2655             SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
2656         #endif
2657       }
2658     #endif
2659 
2660     return simde__m128i_from_private(r_);
2661   #endif
2662 }
2663 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2664   #define _mm_cvtps_epi32(a) simde_mm_cvtps_epi32(a)
2665 #endif
2666 
2667 SIMDE_FUNCTION_ATTRIBUTES
2668 simde__m128d
simde_mm_cvtps_pd (simde__m128 a) {
2670   #if defined(SIMDE_X86_SSE2_NATIVE)
2671     return _mm_cvtps_pd(a);
2672   #else
2673     simde__m128d_private r_;
2674     simde__m128_private a_ = simde__m128_to_private(a);
2675 
2676     #if defined(SIMDE_CONVERT_VECTOR_)
2677       SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].f32);
2678     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2679       r_.neon_f64 = vcvt_f64_f32(vget_low_f32(a_.neon_f32));
2680     #else
2681       SIMDE_VECTORIZE
2682       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2683         r_.f64[i] = a_.f32[i];
2684       }
2685     #endif
2686 
2687     return simde__m128d_from_private(r_);
2688   #endif
2689 }
2690 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2691   #define _mm_cvtps_pd(a) simde_mm_cvtps_pd(a)
2692 #endif
2693 
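/* simde_mm_cvtsd_si32/si64 round the low double lane of a to a 32- or 64-bit
 * integer; as with the packed conversions above, out-of-range 32-bit results
 * yield INT32_MIN unless SIMDE_FAST_CONVERSION_RANGE is defined. */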
2694 SIMDE_FUNCTION_ATTRIBUTES
2695 int32_t
simde_mm_cvtsd_si32 (simde__m128d a) {
2697   #if defined(SIMDE_X86_SSE2_NATIVE)
2698     return _mm_cvtsd_si32(a);
2699   #else
2700     simde__m128d_private a_ = simde__m128d_to_private(a);
2701 
2702     simde_float64 v = simde_math_round(a_.f64[0]);
2703     #if defined(SIMDE_FAST_CONVERSION_RANGE)
2704       return SIMDE_CONVERT_FTOI(int32_t, v);
2705     #else
2706       return ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ?
2707         SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
2708     #endif
2709   #endif
2710 }
2711 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2712   #define _mm_cvtsd_si32(a) simde_mm_cvtsd_si32(a)
2713 #endif
2714 
2715 SIMDE_FUNCTION_ATTRIBUTES
2716 int64_t
simde_mm_cvtsd_si64 (simde__m128d a) {
2718   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
2719     #if defined(__PGI)
2720       return _mm_cvtsd_si64x(a);
2721     #else
2722       return _mm_cvtsd_si64(a);
2723     #endif
2724   #else
2725     simde__m128d_private a_ = simde__m128d_to_private(a);
2726     return SIMDE_CONVERT_FTOI(int64_t, simde_math_round(a_.f64[0]));
2727   #endif
2728 }
2729 #define simde_mm_cvtsd_si64x(a) simde_mm_cvtsd_si64(a)
2730 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
2731   #define _mm_cvtsd_si64(a) simde_mm_cvtsd_si64(a)
2732   #define _mm_cvtsd_si64x(a) simde_mm_cvtsd_si64x(a)
2733 #endif
2734 
2735 SIMDE_FUNCTION_ATTRIBUTES
2736 simde__m128
simde_mm_cvtsd_ss (simde__m128 a, simde__m128d b) {
2738   #if defined(SIMDE_X86_SSE2_NATIVE)
2739     return _mm_cvtsd_ss(a, b);
2740   #else
2741     simde__m128_private
2742       r_,
2743       a_ = simde__m128_to_private(a);
2744     simde__m128d_private b_ = simde__m128d_to_private(b);
2745 
2746     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2747       r_.neon_f32 = vsetq_lane_f32(vcvtxd_f32_f64(vgetq_lane_f64(b_.neon_f64, 0)), a_.neon_f32, 0);
2748     #else
2749       r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b_.f64[0]);
2750 
2751       SIMDE_VECTORIZE
2752       for (size_t i = 1 ; i < (sizeof(r_) / sizeof(r_.i32[0])) ; i++) {
2753         r_.i32[i] = a_.i32[i];
2754       }
2755     #endif
2756     return simde__m128_from_private(r_);
2757   #endif
2758 }
2759 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2760   #define _mm_cvtsd_ss(a, b) simde_mm_cvtsd_ss(a, b)
2761 #endif
2762 
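/* simde_x_mm_cvtsi128_si16 (SIMDe-internal) and simde_mm_cvtsi128_si32/si64
 * below extract the lowest 16-, 32- or 64-bit lane of a vector.  A minimal
 * sketch (hypothetical value):
 *
 *   simde__m128i v  = simde_mm_set1_epi32(42);
 *   int32_t      lo = simde_mm_cvtsi128_si32(v);   // 42
 */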
2763 SIMDE_FUNCTION_ATTRIBUTES
2764 int16_t
simde_x_mm_cvtsi128_si16 (simde__m128i a) {
2766   simde__m128i_private
2767     a_ = simde__m128i_to_private(a);
2768 
2769   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2770     return vgetq_lane_s16(a_.neon_i16, 0);
2771   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2772     return HEDLEY_STATIC_CAST(int16_t, wasm_i16x8_extract_lane(a_.wasm_v128, 0));
2773   #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2774     #if defined(SIMDE_BUG_GCC_95227)
2775       (void) a_;
2776     #endif
2777     return vec_extract(a_.altivec_i16, 0);
2778   #else
2779     return a_.i16[0];
2780   #endif
2781 }
2782 
2783 SIMDE_FUNCTION_ATTRIBUTES
2784 int32_t
2785 simde_mm_cvtsi128_si32 (simde__m128i a) {
2786   #if defined(SIMDE_X86_SSE2_NATIVE)
2787     return _mm_cvtsi128_si32(a);
2788   #else
2789     simde__m128i_private
2790       a_ = simde__m128i_to_private(a);
2791 
2792     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2793       return vgetq_lane_s32(a_.neon_i32, 0);
2794     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2795       return HEDLEY_STATIC_CAST(int32_t, wasm_i32x4_extract_lane(a_.wasm_v128, 0));
2796     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2797       #if defined(SIMDE_BUG_GCC_95227)
2798         (void) a_;
2799       #endif
2800       return vec_extract(a_.altivec_i32, 0);
2801     #else
2802       return a_.i32[0];
2803     #endif
2804   #endif
2805 }
2806 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2807   #define _mm_cvtsi128_si32(a) simde_mm_cvtsi128_si32(a)
2808 #endif
2809 
2810 SIMDE_FUNCTION_ATTRIBUTES
2811 int64_t
2812 simde_mm_cvtsi128_si64 (simde__m128i a) {
2813   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
2814     #if defined(__PGI)
2815       return _mm_cvtsi128_si64x(a);
2816     #else
2817       return _mm_cvtsi128_si64(a);
2818     #endif
2819   #else
2820     simde__m128i_private a_ = simde__m128i_to_private(a);
2821   #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && !defined(HEDLEY_IBM_VERSION)
2822     return vec_extract(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed long long), a_.i64), 0);
2823   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2824     return vgetq_lane_s64(a_.neon_i64, 0);
2825   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2826     return HEDLEY_STATIC_CAST(int64_t, wasm_i64x2_extract_lane(a_.wasm_v128, 0));
2827   #endif
2828     return a_.i64[0];
2829   #endif
2830 }
2831 #define simde_mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64(a)
2832 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
2833   #define _mm_cvtsi128_si64(a) simde_mm_cvtsi128_si64(a)
2834   #define _mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64x(a)
2835 #endif
2836 
2837 SIMDE_FUNCTION_ATTRIBUTES
2838 simde__m128d
2839 simde_mm_cvtsi32_sd (simde__m128d a, int32_t b) {
2840   #if defined(SIMDE_X86_SSE2_NATIVE)
2841     return _mm_cvtsi32_sd(a, b);
2842   #else
2843     simde__m128d_private r_;
2844     simde__m128d_private a_ = simde__m128d_to_private(a);
2845 
2846     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2847       r_.neon_f64 = vsetq_lane_f64(HEDLEY_STATIC_CAST(float64_t, b), a_.neon_f64, 0);
2848     #else
2849       r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b);
2850       r_.i64[1] = a_.i64[1];
2851     #endif
2852 
2853     return simde__m128d_from_private(r_);
2854   #endif
2855 }
2856 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2857   #define _mm_cvtsi32_sd(a, b) simde_mm_cvtsi32_sd(a, b)
2858 #endif
2859 
2860 SIMDE_FUNCTION_ATTRIBUTES
2861 simde__m128i
2862 simde_x_mm_cvtsi16_si128 (int16_t a) {
2863   simde__m128i_private r_;
2864 
2865   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2866     r_.neon_i16 = vsetq_lane_s16(a, vdupq_n_s16(0), 0);
2867   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2868     r_.wasm_v128 = wasm_i16x8_make(a, 0, 0, 0, 0, 0, 0, 0);
2869   #else
2870     r_.i16[0] = a;
2871     r_.i16[1] = 0;
2872     r_.i16[2] = 0;
2873     r_.i16[3] = 0;
2874     r_.i16[4] = 0;
2875     r_.i16[5] = 0;
2876     r_.i16[6] = 0;
2877     r_.i16[7] = 0;
2878   #endif
2879 
2880   return simde__m128i_from_private(r_);
2881 }
2882 
2883 SIMDE_FUNCTION_ATTRIBUTES
2884 simde__m128i
2885 simde_mm_cvtsi32_si128 (int32_t a) {
2886   #if defined(SIMDE_X86_SSE2_NATIVE)
2887     return _mm_cvtsi32_si128(a);
2888   #else
2889     simde__m128i_private r_;
2890 
2891     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2892       r_.neon_i32 = vsetq_lane_s32(a, vdupq_n_s32(0), 0);
2893     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2894       r_.wasm_v128 = wasm_i32x4_make(a, 0, 0, 0);
2895     #else
2896       r_.i32[0] = a;
2897       r_.i32[1] = 0;
2898       r_.i32[2] = 0;
2899       r_.i32[3] = 0;
2900     #endif
2901 
2902     return simde__m128i_from_private(r_);
2903   #endif
2904 }
2905 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2906   #define _mm_cvtsi32_si128(a) simde_mm_cvtsi32_si128(a)
2907 #endif
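
/* Usage sketch (illustrative values only): the argument is placed in the
 * lowest 32-bit lane and the three remaining lanes are zeroed:
 *
 *   simde__m128i v = simde_mm_cvtsi32_si128(7);
 *   // i32 lanes, low to high: { 7, 0, 0, 0 }
 */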
2908 
2909 SIMDE_FUNCTION_ATTRIBUTES
2910 simde__m128d
2911 simde_mm_cvtsi64_sd (simde__m128d a, int64_t b) {
2912   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
2913     #if !defined(__PGI)
2914       return _mm_cvtsi64_sd(a, b);
2915     #else
2916       return _mm_cvtsi64x_sd(a, b);
2917     #endif
2918   #else
2919     simde__m128d_private
2920       r_,
2921       a_ = simde__m128d_to_private(a);
2922 
2923     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2924       r_.neon_f64 = vsetq_lane_f64(HEDLEY_STATIC_CAST(float64_t, b), a_.neon_f64, 0);
2925     #else
2926       r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b);
2927       r_.f64[1] = a_.f64[1];
2928     #endif
2929 
2930     return simde__m128d_from_private(r_);
2931   #endif
2932 }
2933 #define simde_mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64_sd(a, b)
2934 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
2935   #define _mm_cvtsi64_sd(a, b) simde_mm_cvtsi64_sd(a, b)
2936   #define _mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64x_sd(a, b)
2937 #endif
2938 
2939 SIMDE_FUNCTION_ATTRIBUTES
2940 simde__m128i
2941 simde_mm_cvtsi64_si128 (int64_t a) {
2942   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
2943     #if !defined(__PGI)
2944       return _mm_cvtsi64_si128(a);
2945     #else
2946       return _mm_cvtsi64x_si128(a);
2947     #endif
2948   #else
2949     simde__m128i_private r_;
2950 
2951     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2952       r_.neon_i64 = vsetq_lane_s64(a, vdupq_n_s64(0), 0);
2953     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2954       r_.wasm_v128 = wasm_i64x2_make(a, 0);
2955     #else
2956       r_.i64[0] = a;
2957       r_.i64[1] = 0;
2958     #endif
2959 
2960     return simde__m128i_from_private(r_);
2961   #endif
2962 }
2963 #define simde_mm_cvtsi64x_si128(a) simde_mm_cvtsi64_si128(a)
2964 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
2965   #define _mm_cvtsi64_si128(a) simde_mm_cvtsi64_si128(a)
2966   #define _mm_cvtsi64x_si128(a) simde_mm_cvtsi64x_si128(a)
2967 #endif
2968 
2969 SIMDE_FUNCTION_ATTRIBUTES
2970 simde__m128d
2971 simde_mm_cvtss_sd (simde__m128d a, simde__m128 b) {
2972   #if defined(SIMDE_X86_SSE2_NATIVE)
2973     return _mm_cvtss_sd(a, b);
2974   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2975     float64x2_t temp = vcvt_f64_f32(vset_lane_f32(vgetq_lane_f32(simde__m128_to_private(b).neon_f32, 0), vdup_n_f32(0), 0));
2976     return vsetq_lane_f64(vgetq_lane_f64(simde__m128d_to_private(a).neon_f64, 1), temp, 1);
2977   #else
2978     simde__m128d_private
2979       a_ = simde__m128d_to_private(a);
2980     simde__m128_private b_ = simde__m128_to_private(b);
2981 
2982     a_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b_.f32[0]);
2983 
2984     return simde__m128d_from_private(a_);
2985   #endif
2986 }
2987 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2988   #define _mm_cvtss_sd(a, b) simde_mm_cvtss_sd(a, b)
2989 #endif
2990 
2991 SIMDE_FUNCTION_ATTRIBUTES
2992 simde__m64
2993 simde_mm_cvttpd_pi32 (simde__m128d a) {
2994   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2995     return _mm_cvttpd_pi32(a);
2996   #else
2997     simde__m64_private r_;
2998     simde__m128d_private a_ = simde__m128d_to_private(a);
2999 
3000     #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE)
3001       SIMDE_CONVERT_VECTOR_(r_.i32, a_.f64);
3002     #else
3003       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
3004         simde_float64 v = a_.f64[i];
3005         #if defined(SIMDE_FAST_CONVERSION_RANGE)
3006           r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
3007         #else
3008           r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ?
3009             SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
3010         #endif
3011       }
3012     #endif
3013 
3014     return simde__m64_from_private(r_);
3015   #endif
3016 }
3017 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3018   #define _mm_cvttpd_pi32(a) simde_mm_cvttpd_pi32(a)
3019 #endif
3020 
3021 SIMDE_FUNCTION_ATTRIBUTES
3022 simde__m128i
3023 simde_mm_cvttpd_epi32 (simde__m128d a) {
3024   #if defined(SIMDE_X86_SSE2_NATIVE)
3025     return _mm_cvttpd_epi32(a);
3026   #else
3027     simde__m128i_private r_;
3028 
3029     r_.m64[0] = simde_mm_cvttpd_pi32(a);
3030     r_.m64[1] = simde_mm_setzero_si64();
3031 
3032     return simde__m128i_from_private(r_);
3033   #endif
3034 }
3035 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3036   #define _mm_cvttpd_epi32(a) simde_mm_cvttpd_epi32(a)
3037 #endif
3038 
3039 SIMDE_FUNCTION_ATTRIBUTES
3040 simde__m128i
3041 simde_mm_cvttps_epi32 (simde__m128 a) {
3042   #if defined(SIMDE_X86_SSE2_NATIVE)
3043     return _mm_cvttps_epi32(a);
3044   #else
3045     simde__m128i_private r_;
3046     simde__m128_private a_ = simde__m128_to_private(a);
3047 
3048     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE)
3049       r_.neon_i32 = vcvtq_s32_f32(a_.neon_f32);
3050     #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE)
3051       SIMDE_CONVERT_VECTOR_(r_.i32, a_.f32);
3052     #else
3053       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
3054         simde_float32 v = a_.f32[i];
3055         #if defined(SIMDE_FAST_CONVERSION_RANGE)
3056           r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
3057         #else
3058           r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ?
3059             SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
3060         #endif
3061       }
3062     #endif
3063 
3064     return simde__m128i_from_private(r_);
3065   #endif
3066 }
3067 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3068   #define _mm_cvttps_epi32(a) simde_mm_cvttps_epi32(a)
3069 #endif
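
/* Usage sketch (illustrative values only): the "tt" conversions truncate
 * toward zero, and (unless SIMDE_FAST_CONVERSION_RANGE is defined) inputs
 * outside the int32_t range map to INT32_MIN, mirroring the x86 "integer
 * indefinite" result:
 *
 *   simde__m128i v = simde_mm_cvttps_epi32(simde_mm_set_ps(3.9f, -2.7f, 1e20f, 0.5f));
 *   // i32 lanes, low to high: { 0, INT32_MIN, -2, 3 }
 */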
3070 
3071 SIMDE_FUNCTION_ATTRIBUTES
3072 int32_t
3073 simde_mm_cvttsd_si32 (simde__m128d a) {
3074   #if defined(SIMDE_X86_SSE2_NATIVE)
3075     return _mm_cvttsd_si32(a);
3076   #else
3077     simde__m128d_private a_ = simde__m128d_to_private(a);
3078     simde_float64 v = a_.f64[0];
3079     #if defined(SIMDE_FAST_CONVERSION_RANGE)
3080       return SIMDE_CONVERT_FTOI(int32_t, v);
3081     #else
3082       return ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ?
3083         SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
3084     #endif
3085   #endif
3086 }
3087 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3088   #define _mm_cvttsd_si32(a) simde_mm_cvttsd_si32(a)
3089 #endif
3090 
3091 SIMDE_FUNCTION_ATTRIBUTES
3092 int64_t
3093 simde_mm_cvttsd_si64 (simde__m128d a) {
3094   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
3095     #if !defined(__PGI)
3096       return _mm_cvttsd_si64(a);
3097     #else
3098       return _mm_cvttsd_si64x(a);
3099     #endif
3100   #else
3101     simde__m128d_private a_ = simde__m128d_to_private(a);
3102     return SIMDE_CONVERT_FTOI(int64_t, a_.f64[0]);
3103   #endif
3104 }
3105 #define simde_mm_cvttsd_si64x(a) simde_mm_cvttsd_si64(a)
3106 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
3107   #define _mm_cvttsd_si64(a) simde_mm_cvttsd_si64(a)
3108   #define _mm_cvttsd_si64x(a) simde_mm_cvttsd_si64x(a)
3109 #endif
3110 
3111 SIMDE_FUNCTION_ATTRIBUTES
3112 simde__m128d
3113 simde_mm_div_pd (simde__m128d a, simde__m128d b) {
3114   #if defined(SIMDE_X86_SSE2_NATIVE)
3115     return _mm_div_pd(a, b);
3116   #else
3117     simde__m128d_private
3118       r_,
3119       a_ = simde__m128d_to_private(a),
3120       b_ = simde__m128d_to_private(b);
3121 
3122     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3123       r_.f64 = a_.f64 / b_.f64;
3124     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3125       r_.neon_f64 = vdivq_f64(a_.neon_f64, b_.neon_f64);
3126     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3127       r_.wasm_v128 =  wasm_f64x2_div(a_.wasm_v128, b_.wasm_v128);
3128     #else
3129       SIMDE_VECTORIZE
3130       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
3131         r_.f64[i] = a_.f64[i] / b_.f64[i];
3132       }
3133     #endif
3134 
3135     return simde__m128d_from_private(r_);
3136   #endif
3137 }
3138 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3139   #define _mm_div_pd(a, b) simde_mm_div_pd(a, b)
3140 #endif
3141 
3142 SIMDE_FUNCTION_ATTRIBUTES
3143 simde__m128d
3144 simde_mm_div_sd (simde__m128d a, simde__m128d b) {
3145   #if defined(SIMDE_X86_SSE2_NATIVE)
3146     return _mm_div_sd(a, b);
3147   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
3148     return simde_mm_move_sd(a, simde_mm_div_pd(a, b));
3149   #else
3150     simde__m128d_private
3151       r_,
3152       a_ = simde__m128d_to_private(a),
3153       b_ = simde__m128d_to_private(b);
3154 
3155     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3156       float64x2_t temp = vdivq_f64(a_.neon_f64, b_.neon_f64);
3157       r_.neon_f64 = vsetq_lane_f64(vgetq_lane_f64(a_.neon_f64, 1), temp, 1);
3158     #else
3159       r_.f64[0] = a_.f64[0] / b_.f64[0];
3160       r_.f64[1] = a_.f64[1];
3161     #endif
3162 
3163     return simde__m128d_from_private(r_);
3164   #endif
3165 }
3166 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3167   #define _mm_div_sd(a, b) simde_mm_div_sd(a, b)
3168 #endif
3169 
3170 SIMDE_FUNCTION_ATTRIBUTES
3171 int32_t
3172 simde_mm_extract_epi16 (simde__m128i a, const int imm8)
3173     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7)  {
3174   uint16_t r;
3175   simde__m128i_private a_ = simde__m128i_to_private(a);
3176 
3177   #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3178     #if defined(SIMDE_BUG_GCC_95227)
3179       (void) a_;
3180       (void) imm8;
3181     #endif
3182     r = HEDLEY_STATIC_CAST(uint16_t, vec_extract(a_.altivec_i16, imm8));
3183   #else
3184     r = a_.u16[imm8 & 7];
3185   #endif
3186 
3187   return  HEDLEY_STATIC_CAST(int32_t, r);
3188 }
3189 #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,6,0))
3190   #define simde_mm_extract_epi16(a, imm8) _mm_extract_epi16(a, imm8)
3191 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3192   #define simde_mm_extract_epi16(a, imm8) (HEDLEY_STATIC_CAST(int32_t, vgetq_lane_s16(simde__m128i_to_private(a).neon_i16, (imm8))) & (INT32_C(0x0000ffff)))
3193 #endif
3194 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3195   #define _mm_extract_epi16(a, imm8) simde_mm_extract_epi16(a, imm8)
3196 #endif
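
/* Usage sketch (illustrative values only): the selected 16-bit lane is
 * returned zero-extended, so a negative lane comes back as its unsigned
 * 16-bit representation:
 *
 *   int32_t r = simde_mm_extract_epi16(simde_mm_set1_epi16(INT16_C(-1)), 3);
 *   // r == 0xFFFF, not -1
 */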
3197 
3198 SIMDE_FUNCTION_ATTRIBUTES
3199 simde__m128i
3200 simde_mm_insert_epi16 (simde__m128i a, int16_t i, const int imm8)
3201     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7)  {
3202   simde__m128i_private a_ = simde__m128i_to_private(a);
3203   a_.i16[imm8 & 7] = i;
3204   return simde__m128i_from_private(a_);
3205 }
3206 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
3207   #define simde_mm_insert_epi16(a, i, imm8) _mm_insert_epi16((a), (i), (imm8))
3208 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3209   #define simde_mm_insert_epi16(a, i, imm8) simde__m128i_from_neon_i16(vsetq_lane_s16((i), simde__m128i_to_neon_i16(a), (imm8)))
3210 #endif
3211 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3212   #define _mm_insert_epi16(a, i, imm8) simde_mm_insert_epi16(a, i, imm8)
3213 #endif
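
/* Usage sketch (illustrative values only): a copy of `a` is returned with
 * only the selected lane replaced:
 *
 *   simde__m128i v = simde_mm_insert_epi16(simde_mm_setzero_si128(), INT16_C(42), 5);
 *   // i16 lanes, low to high: { 0, 0, 0, 0, 0, 42, 0, 0 }
 */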
3214 
3215 SIMDE_FUNCTION_ATTRIBUTES
3216 simde__m128d
3217 simde_mm_load_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) {
3218   #if defined(SIMDE_X86_SSE2_NATIVE)
3219     return _mm_load_pd(mem_addr);
3220   #else
3221     simde__m128d_private r_;
3222 
3223     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3224       r_.neon_f64 = vld1q_f64(mem_addr);
3225     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3226       r_.neon_u32 = vld1q_u32(HEDLEY_REINTERPRET_CAST(uint32_t const*, mem_addr));
3227     #else
3228       simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128d), sizeof(r_));
3229     #endif
3230 
3231     return simde__m128d_from_private(r_);
3232   #endif
3233 }
3234 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3235   #define _mm_load_pd(mem_addr) simde_mm_load_pd(mem_addr)
3236 #endif
3237 
3238 SIMDE_FUNCTION_ATTRIBUTES
3239 simde__m128d
3240 simde_mm_load1_pd (simde_float64 const* mem_addr) {
3241   #if defined(SIMDE_X86_SSE2_NATIVE)
3242     return _mm_load1_pd(mem_addr);
3243   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3244     return simde__m128d_from_neon_f64(vld1q_dup_f64(mem_addr));
3245   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3246     return simde__m128d_from_wasm_v128(wasm_v64x2_load_splat(mem_addr));
3247   #else
3248     return simde_mm_set1_pd(*mem_addr);
3249   #endif
3250 }
3251 #define simde_mm_load_pd1(mem_addr) simde_mm_load1_pd(mem_addr)
3252 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3253   #define _mm_load_pd1(mem_addr) simde_mm_load1_pd(mem_addr)
3254   #define _mm_load1_pd(mem_addr) simde_mm_load1_pd(mem_addr)
3255 #endif
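
/* Usage sketch (illustrative values only): the double at mem_addr is
 * broadcast into both lanes:
 *
 *   simde_float64 x = 3.0;
 *   simde__m128d v = simde_mm_load1_pd(&x);   // f64 lanes: { 3.0, 3.0 }
 */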
3256 
3257 SIMDE_FUNCTION_ATTRIBUTES
3258 simde__m128d
3259 simde_mm_load_sd (simde_float64 const* mem_addr) {
3260   #if defined(SIMDE_X86_SSE2_NATIVE)
3261     return _mm_load_sd(mem_addr);
3262   #else
3263     simde__m128d_private r_;
3264 
3265     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3266       r_.neon_f64 = vsetq_lane_f64(*mem_addr, vdupq_n_f64(0), 0);
3267     #else
3268       r_.f64[0] = *mem_addr;
3269       r_.u64[1] = UINT64_C(0);
3270     #endif
3271 
3272     return simde__m128d_from_private(r_);
3273   #endif
3274 }
3275 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3276   #define _mm_load_sd(mem_addr) simde_mm_load_sd(mem_addr)
3277 #endif
3278 
3279 SIMDE_FUNCTION_ATTRIBUTES
3280 simde__m128i
3281 simde_mm_load_si128 (simde__m128i const* mem_addr) {
3282   #if defined(SIMDE_X86_SSE2_NATIVE)
3283     return _mm_load_si128(HEDLEY_REINTERPRET_CAST(__m128i const*, mem_addr));
3284   #else
3285     simde__m128i_private r_;
3286 
3287     #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3288       r_.altivec_i32 = vec_ld(0, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(int) const*, mem_addr));
3289     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3290       r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr));
3291     #else
3292       simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128i), sizeof(simde__m128i));
3293     #endif
3294 
3295     return simde__m128i_from_private(r_);
3296   #endif
3297 }
3298 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3299   #define _mm_load_si128(mem_addr) simde_mm_load_si128(mem_addr)
3300 #endif
3301 
3302 SIMDE_FUNCTION_ATTRIBUTES
3303 simde__m128d
3304 simde_mm_loadh_pd (simde__m128d a, simde_float64 const* mem_addr) {
3305   #if defined(SIMDE_X86_SSE2_NATIVE)
3306     return _mm_loadh_pd(a, mem_addr);
3307   #else
3308     simde__m128d_private
3309       r_,
3310       a_ = simde__m128d_to_private(a);
3311 
3312     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3313       r_.neon_f64 = vcombine_f64(vget_low_f64(a_.neon_f64), vld1_f64(HEDLEY_REINTERPRET_CAST(const float64_t*, mem_addr)));
3314     #else
3315       simde_float64 t;
3316 
3317       simde_memcpy(&t, mem_addr, sizeof(t));
3318       r_.f64[0] = a_.f64[0];
3319       r_.f64[1] = t;
3320     #endif
3321 
3322     return simde__m128d_from_private(r_);
3323   #endif
3324 }
3325 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3326   #define _mm_loadh_pd(a, mem_addr) simde_mm_loadh_pd(a, mem_addr)
3327 #endif
3328 
3329 SIMDE_FUNCTION_ATTRIBUTES
3330 simde__m128i
3331 simde_mm_loadl_epi64 (simde__m128i const* mem_addr) {
3332   #if defined(SIMDE_X86_SSE2_NATIVE)
3333     return _mm_loadl_epi64(mem_addr);
3334   #else
3335     simde__m128i_private r_;
3336 
3337     int64_t value;
3338     simde_memcpy(&value, mem_addr, sizeof(value));
3339 
3340     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3341       r_.neon_i64 = vcombine_s64(vld1_s64(HEDLEY_REINTERPRET_CAST(int64_t const *, mem_addr)), vdup_n_s64(0));
3342     #else
3343       r_.i64[0] = value;
3344       r_.i64[1] = 0;
3345     #endif
3346 
3347     return simde__m128i_from_private(r_);
3348   #endif
3349 }
3350 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3351   #define _mm_loadl_epi64(mem_addr) simde_mm_loadl_epi64(mem_addr)
3352 #endif
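
/* Usage sketch (illustrative values only): 64 bits are read from mem_addr
 * (no alignment requirement) into the low lane; the high lane is zeroed:
 *
 *   int64_t x = INT64_C(-1);
 *   simde__m128i v = simde_mm_loadl_epi64(HEDLEY_REINTERPRET_CAST(simde__m128i const*, &x));
 *   // i64 lanes, low to high: { -1, 0 }
 */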
3353 
3354 SIMDE_FUNCTION_ATTRIBUTES
3355 simde__m128d
3356 simde_mm_loadl_pd (simde__m128d a, simde_float64 const* mem_addr) {
3357   #if defined(SIMDE_X86_SSE2_NATIVE)
3358     return _mm_loadl_pd(a, mem_addr);
3359   #else
3360     simde__m128d_private
3361       r_,
3362       a_ = simde__m128d_to_private(a);
3363 
3364     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3365       r_.neon_f64 = vcombine_f64(vld1_f64(
3366         HEDLEY_REINTERPRET_CAST(const float64_t*, mem_addr)), vget_high_f64(a_.neon_f64));
3367     #else
3368       r_.f64[0] = *mem_addr;
3369       r_.u64[1] = a_.u64[1];
3370     #endif
3371 
3372     return simde__m128d_from_private(r_);
3373   #endif
3374 }
3375 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3376   #define _mm_loadl_pd(a, mem_addr) simde_mm_loadl_pd(a, mem_addr)
3377 #endif
3378 
3379 SIMDE_FUNCTION_ATTRIBUTES
3380 simde__m128d
3381 simde_mm_loadr_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) {
3382   #if defined(SIMDE_X86_SSE2_NATIVE)
3383     return _mm_loadr_pd(mem_addr);
3384   #else
3385     simde__m128d_private
3386       r_;
3387 
3388     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3389       r_.neon_f64 = vld1q_f64(mem_addr);
3390       r_.neon_f64 = vextq_f64(r_.neon_f64, r_.neon_f64, 1);
3391     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3392       r_.neon_i64 = vld1q_s64(HEDLEY_REINTERPRET_CAST(int64_t const *, mem_addr));
3393       r_.neon_i64 = vextq_s64(r_.neon_i64, r_.neon_i64, 1);
3394     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3395       v128_t tmp = wasm_v128_load(mem_addr);
3396       r_.wasm_v128 = wasm_v64x2_shuffle(tmp, tmp, 1, 0);
3397     #else
3398       r_.f64[0] = mem_addr[1];
3399       r_.f64[1] = mem_addr[0];
3400     #endif
3401 
3402     return simde__m128d_from_private(r_);
3403   #endif
3404 }
3405 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3406   #define _mm_loadr_pd(mem_addr) simde_mm_loadr_pd(mem_addr)
3407 #endif
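
/* Usage sketch (illustrative values only): the two doubles are loaded from a
 * 16-byte-aligned address in reversed order:
 *
 *   SIMDE_ALIGN_TO_16 simde_float64 p[2] = { 1.0, 2.0 };
 *   simde__m128d v = simde_mm_loadr_pd(p);   // f64 lanes: { 2.0, 1.0 }
 */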
3408 
3409 SIMDE_FUNCTION_ATTRIBUTES
3410 simde__m128d
3411 simde_mm_loadu_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) {
3412   #if defined(SIMDE_X86_SSE2_NATIVE)
3413     return _mm_loadu_pd(mem_addr);
3414   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3415     return vld1q_f64(mem_addr);
3416   #else
3417     simde__m128d_private r_;
3418 
3419     simde_memcpy(&r_, mem_addr, sizeof(r_));
3420 
3421     return simde__m128d_from_private(r_);
3422   #endif
3423 }
3424 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3425   #define _mm_loadu_pd(mem_addr) simde_mm_loadu_pd(mem_addr)
3426 #endif
3427 
3428 SIMDE_FUNCTION_ATTRIBUTES
3429 simde__m128i
3430 simde_mm_loadu_epi8(void const * mem_addr) {
3431   #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862)
3432     return _mm_loadu_epi8(mem_addr);
3433   #elif defined(SIMDE_X86_SSE2_NATIVE)
3434     return _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, mem_addr));
3435   #else
3436     simde__m128i_private r_;
3437 
3438     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3439       r_.neon_i8 = vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr));
3440     #else
3441       simde_memcpy(&r_, mem_addr, sizeof(r_));
3442     #endif
3443 
3444     return simde__m128i_from_private(r_);
3445   #endif
3446 }
3447 #define simde_x_mm_loadu_epi8(mem_addr) simde_mm_loadu_epi8(mem_addr)
3448 #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862)))
3449   #undef _mm_loadu_epi8
3450   #define _mm_loadu_epi8(a) simde_mm_loadu_epi8(a)
3451 #endif
3452 
3453 SIMDE_FUNCTION_ATTRIBUTES
3454 simde__m128i
3455 simde_mm_loadu_epi16(void const * mem_addr) {
3456   #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862)
3457     return _mm_loadu_epi16(mem_addr);
3458   #elif defined(SIMDE_X86_SSE2_NATIVE)
3459     return _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, mem_addr));
3460   #else
3461     simde__m128i_private r_;
3462 
3463     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3464       r_.neon_i16 = vreinterpretq_s16_s8(vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr)));
3465     #else
3466       simde_memcpy(&r_, mem_addr, sizeof(r_));
3467     #endif
3468 
3469     return simde__m128i_from_private(r_);
3470   #endif
3471 }
3472 #define simde_x_mm_loadu_epi16(mem_addr) simde_mm_loadu_epi16(mem_addr)
3473 #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862)))
3474   #undef _mm_loadu_epi16
3475   #define _mm_loadu_epi16(a) simde_mm_loadu_epi16(a)
3476 #endif
3477 
3478 SIMDE_FUNCTION_ATTRIBUTES
3479 simde__m128i
3480 simde_mm_loadu_epi32(void const * mem_addr) {
3481   #if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862)
3482     return _mm_loadu_epi32(mem_addr);
3483   #elif defined(SIMDE_X86_SSE2_NATIVE)
3484     return _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, mem_addr));
3485   #else
3486     simde__m128i_private r_;
3487 
3488     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3489       r_.neon_i32 = vreinterpretq_s32_s8(vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr)));
3490     #else
3491       simde_memcpy(&r_, mem_addr, sizeof(r_));
3492     #endif
3493 
3494     return simde__m128i_from_private(r_);
3495   #endif
3496 }
3497 #define simde_x_mm_loadu_epi32(mem_addr) simde_mm_loadu_epi32(mem_addr)
3498 #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862)))
3499   #undef _mm_loadu_epi32
3500   #define _mm_loadu_epi32(a) simde_mm_loadu_epi32(a)
3501 #endif
3502 
3503 SIMDE_FUNCTION_ATTRIBUTES
3504 simde__m128i
3505 simde_mm_loadu_epi64(void const * mem_addr) {
3506   #if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862)
3507     return _mm_loadu_epi64(mem_addr);
3508   #elif defined(SIMDE_X86_SSE2_NATIVE)
3509     return _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, mem_addr));
3510   #else
3511     simde__m128i_private r_;
3512 
3513     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3514       r_.neon_i64 = vreinterpretq_s64_s8(vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr)));
3515     #else
3516       simde_memcpy(&r_, mem_addr, sizeof(r_));
3517     #endif
3518 
3519     return simde__m128i_from_private(r_);
3520   #endif
3521 }
3522 #define simde_x_mm_loadu_epi64(mem_addr) simde_mm_loadu_epi64(mem_addr)
3523 #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862)))
3524   #undef _mm_loadu_epi64
3525   #define _mm_loadu_epi64(a) simde_mm_loadu_epi64(a)
3526 #endif
3527 
3528 SIMDE_FUNCTION_ATTRIBUTES
3529 simde__m128i
3530 simde_mm_loadu_si128 (void const* mem_addr) {
3531   #if defined(SIMDE_X86_SSE2_NATIVE)
3532     return _mm_loadu_si128(HEDLEY_STATIC_CAST(__m128i const*, mem_addr));
3533   #else
3534     simde__m128i_private r_;
3535 
3536     #if HEDLEY_GNUC_HAS_ATTRIBUTE(may_alias,3,3,0)
3537       HEDLEY_DIAGNOSTIC_PUSH
3538       SIMDE_DIAGNOSTIC_DISABLE_PACKED_
3539       struct simde_mm_loadu_si128_s {
3540         __typeof__(r_) v;
3541       } __attribute__((__packed__, __may_alias__));
3542       r_ = HEDLEY_REINTERPRET_CAST(const struct simde_mm_loadu_si128_s *, mem_addr)->v;
3543       HEDLEY_DIAGNOSTIC_POP
3544     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3545       r_.neon_i8 = vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr));
3546     #else
3547       simde_memcpy(&r_, mem_addr, sizeof(r_));
3548     #endif
3549 
3550     return simde__m128i_from_private(r_);
3551   #endif
3552 }
3553 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3554   #define _mm_loadu_si128(mem_addr) simde_mm_loadu_si128(mem_addr)
3555 #endif
3556 
3557 SIMDE_FUNCTION_ATTRIBUTES
3558 simde__m128i
3559 simde_mm_madd_epi16 (simde__m128i a, simde__m128i b) {
3560   #if defined(SIMDE_X86_SSE2_NATIVE)
3561     return _mm_madd_epi16(a, b);
3562   #else
3563     simde__m128i_private
3564       r_,
3565       a_ = simde__m128i_to_private(a),
3566       b_ = simde__m128i_to_private(b);
3567 
3568     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3569       int32x4_t pl = vmull_s16(vget_low_s16(a_.neon_i16),  vget_low_s16(b_.neon_i16));
3570       int32x4_t ph = vmull_high_s16(a_.neon_i16, b_.neon_i16);
3571       r_.neon_i32 = vpaddq_s32(pl, ph);
3572     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3573       int32x4_t pl = vmull_s16(vget_low_s16(a_.neon_i16),  vget_low_s16(b_.neon_i16));
3574       int32x4_t ph = vmull_s16(vget_high_s16(a_.neon_i16), vget_high_s16(b_.neon_i16));
3575       int32x2_t rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
3576       int32x2_t rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
3577       r_.neon_i32 = vcombine_s32(rl, rh);
3578     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
3579       static const SIMDE_POWER_ALTIVEC_VECTOR(int) tz = { 0, 0, 0, 0 };
3580       r_.altivec_i32 = vec_msum(a_.altivec_i16, b_.altivec_i16, tz);
3581     #else
3582       SIMDE_VECTORIZE
3583       for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i += 2) {
3584         r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) + (a_.i16[i + 1] * b_.i16[i + 1]);
3585       }
3586     #endif
3587 
3588     return simde__m128i_from_private(r_);
3589   #endif
3590 }
3591 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3592   #define _mm_madd_epi16(a, b) simde_mm_madd_epi16(a, b)
3593 #endif
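
/* Worked example (illustrative values only): corresponding signed 16-bit
 * lanes are multiplied to 32 bits and adjacent pairs of products are summed:
 *
 *   a.i16 = {  1,  2,  3,  4, ... }
 *   b.i16 = { 10, 20, 30, 40, ... }
 *   r.i32[0] = 1*10 + 2*20 = 50;  r.i32[1] = 3*30 + 4*40 = 250; ...
 */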
3594 
3595 SIMDE_FUNCTION_ATTRIBUTES
3596 void
3597 simde_mm_maskmoveu_si128 (simde__m128i a, simde__m128i mask, int8_t mem_addr[HEDLEY_ARRAY_PARAM(16)]) {
3598   #if defined(SIMDE_X86_SSE2_NATIVE)
3599     _mm_maskmoveu_si128(a, mask, HEDLEY_REINTERPRET_CAST(char*, mem_addr));
3600   #else
3601     simde__m128i_private
3602       a_ = simde__m128i_to_private(a),
3603       mask_ = simde__m128i_to_private(mask);
3604 
3605     for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++) {
3606       if (mask_.u8[i] & 0x80) {
3607         mem_addr[i] = a_.i8[i];
3608       }
3609     }
3610   #endif
3611 }
3612 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3613   #define _mm_maskmoveu_si128(a, mask, mem_addr) simde_mm_maskmoveu_si128((a), (mask), SIMDE_CHECKED_REINTERPRET_CAST(int8_t*, char*, (mem_addr)))
3614 #endif
3615 
3616 SIMDE_FUNCTION_ATTRIBUTES
3617 int32_t
3618 simde_mm_movemask_epi8 (simde__m128i a) {
3619   #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__INTEL_COMPILER)
3620     /* ICC has trouble with _mm_movemask_epi8 at -O2 and above: */
3621     return _mm_movemask_epi8(a);
3622   #else
3623     int32_t r = 0;
3624     simde__m128i_private a_ = simde__m128i_to_private(a);
3625 
3626     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3627       // Use increasingly wide shifts+adds to collect the sign bits
3628       // together.
3629       // Since the widening shifts would be rather confusing to follow in little endian, everything
3630       // will be illustrated in big endian order instead. This has a different result - the bits
3631       // would actually be reversed on a big endian machine.
3632 
3633       // Starting input (only half the elements are shown):
3634       // 89 ff 1d c0 00 10 99 33
3635       uint8x16_t input = a_.neon_u8;
3636 
3637       // Shift out everything but the sign bits with an unsigned shift right.
3638       //
3639       // Bytes of the vector::
3640       // Bytes of the vector:
3641       // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
3642       //  |  |  |  |  |  |  |  |
3643       // 01 01 00 01 00 00 01 00
3644       //
3645       // Bits of first important lane(s):
3646       // 10001001 (89)
3647       // \______
3648       //        |
3649       // 00000001 (01)
3650       uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
3651 
3652       // Merge the even lanes together with a 16-bit unsigned shift right + add.
3653       // 'xx' represents garbage data which will be ignored in the final result.
3654       // In the important bytes, the add functions like a binary OR.
3655       //
3656       // 01 01 00 01 00 00 01 00
3657       //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(high_bits + (high_bits >> 7))
3658       //    \|    \|    \|    \|
3659       // xx 03 xx 01 xx 00 xx 02
3660       //
3661       // 00000001 00000001 (01 01)
3662       //        \_______ |
3663       //                \|
3664       // xxxxxxxx xxxxxx11 (xx 03)
3665       uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
3666 
3667       // Repeat with a wider 32-bit shift + add.
3668       // xx 03 xx 01 xx 00 xx 02
3669       //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >> 14))
3670       //          \|          \|
3671       // xx xx xx 0d xx xx xx 02
3672       //
3673       // 00000011 00000001 (03 01)
3674       //        \\_____ ||
3675       //         '----.\||
3676       // xxxxxxxx xxxx1101 (xx 0d)
3677       uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
3678 
3679       // Last, an even wider 64-bit shift + add to get our result in the low 8 bit lanes.
3680       // xx xx xx 0d xx xx xx 02
3681       //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >> 28))
3682       //                      \|
3683       // xx xx xx xx xx xx xx d2
3684       //
3685       // 00001101 00000010 (0d 02)
3686       //     \   \___ |  |
3687       //      '---.  \|  |
3688       // xxxxxxxx 11010010 (xx d2)
3689       uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
3690 
3691       // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
3692       // xx xx xx xx xx xx xx d2
3693       //                      ||  return paired64[0]
3694       //                      d2
3695       // Note: Little endian would return the correct value 4b (01001011) instead.
3696       r = vgetq_lane_u8(paired64, 0) | (HEDLEY_STATIC_CAST(int32_t, vgetq_lane_u8(paired64, 8)) << 8);
3697     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(HEDLEY_IBM_VERSION) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
3698       static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) perm = { 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0 };
3699       r = HEDLEY_STATIC_CAST(int32_t, vec_extract(vec_vbpermq(a_.altivec_u8, perm), 1));
3700     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(HEDLEY_IBM_VERSION) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG)
3701       static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) perm = { 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0 };
3702       r = HEDLEY_STATIC_CAST(int32_t, vec_extract(vec_vbpermq(a_.altivec_u8, perm), 14));
3703     #else
3704       SIMDE_VECTORIZE_REDUCTION(|:r)
3705       for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) {
3706         r |= (a_.u8[15 - i] >> 7) << (15 - i);
3707       }
3708     #endif
3709 
3710     return r;
3711   #endif
3712 }
3713 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3714   #define _mm_movemask_epi8(a) simde_mm_movemask_epi8(a)
3715 #endif
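
/* Usage sketch (illustrative values only): bit i of the result is the most
 * significant bit of byte lane i:
 *
 *   simde__m128i v = simde_mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
 *                                      0, 0, 0, 0, 0, 0, INT8_MIN, -1);
 *   int32_t m = simde_mm_movemask_epi8(v);   // m == 0x3 (lanes 0 and 1 set)
 */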
3716 
3717 SIMDE_FUNCTION_ATTRIBUTES
3718 int32_t
3719 simde_mm_movemask_pd (simde__m128d a) {
3720   #if defined(SIMDE_X86_SSE2_NATIVE)
3721     return _mm_movemask_pd(a);
3722   #else
3723     int32_t r = 0;
3724     simde__m128d_private a_ = simde__m128d_to_private(a);
3725 
3726     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3727       static const int64_t shift_amount[] = { 0, 1 };
3728       const int64x2_t shift = vld1q_s64(shift_amount);
3729       uint64x2_t tmp = vshrq_n_u64(a_.neon_u64, 63);
3730       return HEDLEY_STATIC_CAST(int32_t, vaddvq_u64(vshlq_u64(tmp, shift)));
3731     #else
3732       SIMDE_VECTORIZE_REDUCTION(|:r)
3733       for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
3734         r |= (a_.u64[i] >> 63) << i;
3735       }
3736     #endif
3737 
3738     return r;
3739   #endif
3740 }
3741 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3742   #define _mm_movemask_pd(a) simde_mm_movemask_pd(a)
3743 #endif
3744 
3745 SIMDE_FUNCTION_ATTRIBUTES
3746 simde__m64
3747 simde_mm_movepi64_pi64 (simde__m128i a) {
3748   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
3749     return _mm_movepi64_pi64(a);
3750   #else
3751     simde__m64_private r_;
3752     simde__m128i_private a_ = simde__m128i_to_private(a);
3753 
3754     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3755       r_.neon_i64 = vget_low_s64(a_.neon_i64);
3756     #else
3757       r_.i64[0] = a_.i64[0];
3758     #endif
3759 
3760     return simde__m64_from_private(r_);
3761   #endif
3762 }
3763 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3764   #define _mm_movepi64_pi64(a) simde_mm_movepi64_pi64(a)
3765 #endif
3766 
3767 SIMDE_FUNCTION_ATTRIBUTES
3768 simde__m128i
3769 simde_mm_movpi64_epi64 (simde__m64 a) {
3770   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
3771     return _mm_movpi64_epi64(a);
3772   #else
3773     simde__m128i_private r_;
3774     simde__m64_private a_ = simde__m64_to_private(a);
3775 
3776     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3777       r_.neon_i64 = vcombine_s64(a_.neon_i64, vdup_n_s64(0));
3778     #else
3779       r_.i64[0] = a_.i64[0];
3780       r_.i64[1] = 0;
3781     #endif
3782 
3783     return simde__m128i_from_private(r_);
3784   #endif
3785 }
3786 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3787   #define _mm_movpi64_epi64(a) simde_mm_movpi64_epi64(a)
3788 #endif
3789 
3790 SIMDE_FUNCTION_ATTRIBUTES
3791 simde__m128i
3792 simde_mm_min_epi16 (simde__m128i a, simde__m128i b) {
3793   #if defined(SIMDE_X86_SSE2_NATIVE)
3794     return _mm_min_epi16(a, b);
3795   #else
3796     simde__m128i_private
3797       r_,
3798       a_ = simde__m128i_to_private(a),
3799       b_ = simde__m128i_to_private(b);
3800 
3801     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3802       r_.neon_i16 = vminq_s16(a_.neon_i16, b_.neon_i16);
3803     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3804       r_.wasm_v128 = wasm_i16x8_min(a_.wasm_v128, b_.wasm_v128);
3805     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
3806       r_.altivec_i16 = vec_min(a_.altivec_i16, b_.altivec_i16);
3807     #else
3808       SIMDE_VECTORIZE
3809       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
3810         r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i];
3811       }
3812     #endif
3813 
3814     return simde__m128i_from_private(r_);
3815   #endif
3816 }
3817 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3818   #define _mm_min_epi16(a, b) simde_mm_min_epi16(a, b)
3819 #endif
3820 
3821 SIMDE_FUNCTION_ATTRIBUTES
3822 simde__m128i
3823 simde_mm_min_epu8 (simde__m128i a, simde__m128i b) {
3824   #if defined(SIMDE_X86_SSE2_NATIVE)
3825     return _mm_min_epu8(a, b);
3826   #else
3827     simde__m128i_private
3828       r_,
3829       a_ = simde__m128i_to_private(a),
3830       b_ = simde__m128i_to_private(b);
3831 
3832     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3833       r_.neon_u8 = vminq_u8(a_.neon_u8, b_.neon_u8);
3834     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3835       r_.wasm_v128 = wasm_u8x16_min(a_.wasm_v128, b_.wasm_v128);
3836     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
3837       r_.altivec_u8 = vec_min(a_.altivec_u8, b_.altivec_u8);
3838     #else
3839       SIMDE_VECTORIZE
3840       for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
3841         r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i];
3842       }
3843     #endif
3844 
3845     return simde__m128i_from_private(r_);
3846   #endif
3847 }
3848 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3849   #define _mm_min_epu8(a, b) simde_mm_min_epu8(a, b)
3850 #endif
3851 
3852 SIMDE_FUNCTION_ATTRIBUTES
3853 simde__m128d
3854 simde_mm_min_pd (simde__m128d a, simde__m128d b) {
3855   #if defined(SIMDE_X86_SSE2_NATIVE)
3856     return _mm_min_pd(a, b);
3857   #else
3858     simde__m128d_private
3859       r_,
3860       a_ = simde__m128d_to_private(a),
3861       b_ = simde__m128d_to_private(b);
3862 
3863     #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
3864       r_.altivec_f64 = vec_min(a_.altivec_f64, b_.altivec_f64);
3865     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3866       r_.neon_f64 = vminq_f64(a_.neon_f64, b_.neon_f64);
3867     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3868       r_.wasm_v128 = wasm_f64x2_min(a_.wasm_v128, b_.wasm_v128);
3869     #else
3870       SIMDE_VECTORIZE
3871       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
3872         r_.f64[i] = (a_.f64[i] < b_.f64[i]) ? a_.f64[i] : b_.f64[i];
3873       }
3874     #endif
3875 
3876     return simde__m128d_from_private(r_);
3877   #endif
3878 }
3879 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3880   #define _mm_min_pd(a, b) simde_mm_min_pd(a, b)
3881 #endif
3882 
3883 SIMDE_FUNCTION_ATTRIBUTES
3884 simde__m128d
3885 simde_mm_min_sd (simde__m128d a, simde__m128d b) {
3886   #if defined(SIMDE_X86_SSE2_NATIVE)
3887     return _mm_min_sd(a, b);
3888   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
3889     return simde_mm_move_sd(a, simde_mm_min_pd(a, b));
3890   #else
3891     simde__m128d_private
3892       r_,
3893       a_ = simde__m128d_to_private(a),
3894       b_ = simde__m128d_to_private(b);
3895 
3896     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3897       float64x2_t temp = vminq_f64(a_.neon_f64, b_.neon_f64);
3898       r_.neon_f64 = vsetq_lane_f64(vgetq_lane_f64(a_.neon_f64, 1), temp, 1);
3899     #else
3900       r_.f64[0] = (a_.f64[0] < b_.f64[0]) ? a_.f64[0] : b_.f64[0];
3901       r_.f64[1] = a_.f64[1];
3902     #endif
3903 
3904     return simde__m128d_from_private(r_);
3905   #endif
3906 }
3907 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3908   #define _mm_min_sd(a, b) simde_mm_min_sd(a, b)
3909 #endif
3910 
3911 SIMDE_FUNCTION_ATTRIBUTES
3912 simde__m128i
3913 simde_mm_max_epi16 (simde__m128i a, simde__m128i b) {
3914   #if defined(SIMDE_X86_SSE2_NATIVE)
3915     return _mm_max_epi16(a, b);
3916   #else
3917     simde__m128i_private
3918       r_,
3919       a_ = simde__m128i_to_private(a),
3920       b_ = simde__m128i_to_private(b);
3921 
3922     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3923       r_.neon_i16 = vmaxq_s16(a_.neon_i16, b_.neon_i16);
3924     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3925       r_.wasm_v128 = wasm_i16x8_max(a_.wasm_v128, b_.wasm_v128);
3926     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
3927       r_.altivec_i16 = vec_max(a_.altivec_i16, b_.altivec_i16);
3928     #else
3929       SIMDE_VECTORIZE
3930       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
3931         r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? a_.i16[i] : b_.i16[i];
3932       }
3933     #endif
3934 
3935     return simde__m128i_from_private(r_);
3936   #endif
3937 }
3938 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3939   #define _mm_max_epi16(a, b) simde_mm_max_epi16(a, b)
3940 #endif
3941 
3942 SIMDE_FUNCTION_ATTRIBUTES
3943 simde__m128i
3944 simde_mm_max_epu8 (simde__m128i a, simde__m128i b) {
3945   #if defined(SIMDE_X86_SSE2_NATIVE)
3946     return _mm_max_epu8(a, b);
3947   #else
3948     simde__m128i_private
3949       r_,
3950       a_ = simde__m128i_to_private(a),
3951       b_ = simde__m128i_to_private(b);
3952 
3953     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3954       r_.neon_u8 = vmaxq_u8(a_.neon_u8, b_.neon_u8);
3955     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3956       r_.wasm_v128 = wasm_u8x16_max(a_.wasm_v128, b_.wasm_v128);
3957     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
3958       r_.altivec_u8 = vec_max(a_.altivec_u8, b_.altivec_u8);
3959     #else
3960       SIMDE_VECTORIZE
3961       for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
3962         r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i];
3963       }
3964     #endif
3965 
3966     return simde__m128i_from_private(r_);
3967   #endif
3968 }
3969 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3970   #define _mm_max_epu8(a, b) simde_mm_max_epu8(a, b)
3971 #endif
3972 
3973 SIMDE_FUNCTION_ATTRIBUTES
3974 simde__m128d
3975 simde_mm_max_pd (simde__m128d a, simde__m128d b) {
3976   #if defined(SIMDE_X86_SSE2_NATIVE)
3977     return _mm_max_pd(a, b);
3978   #else
3979     simde__m128d_private
3980       r_,
3981       a_ = simde__m128d_to_private(a),
3982       b_ = simde__m128d_to_private(b);
3983 
3984     #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
3985       r_.altivec_f64 = vec_max(a_.altivec_f64, b_.altivec_f64);
3986     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3987       r_.wasm_v128 = wasm_f64x2_max(a_.wasm_v128, b_.wasm_v128);
3988     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3989       r_.neon_f64 = vmaxq_f64(a_.neon_f64, b_.neon_f64);
3990     #else
3991       SIMDE_VECTORIZE
3992       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
3993         r_.f64[i] = (a_.f64[i] > b_.f64[i]) ? a_.f64[i] : b_.f64[i];
3994       }
3995     #endif
3996 
3997     return simde__m128d_from_private(r_);
3998   #endif
3999 }
4000 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4001   #define _mm_max_pd(a, b) simde_mm_max_pd(a, b)
4002 #endif
4003 
4004 SIMDE_FUNCTION_ATTRIBUTES
4005 simde__m128d
4006 simde_mm_max_sd (simde__m128d a, simde__m128d b) {
4007   #if defined(SIMDE_X86_SSE2_NATIVE)
4008     return _mm_max_sd(a, b);
4009   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
4010     return simde_mm_move_sd(a, simde_mm_max_pd(a, b));
4011   #else
4012     simde__m128d_private
4013       r_,
4014       a_ = simde__m128d_to_private(a),
4015       b_ = simde__m128d_to_private(b);
4016 
4017     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4018       float64x2_t temp = vmaxq_f64(a_.neon_f64, b_.neon_f64);
4019       r_.neon_f64 = vsetq_lane_f64(vgetq_lane_f64(a_.neon_f64, 1), temp, 1);
4020     #else
4021       r_.f64[0] = (a_.f64[0] > b_.f64[0]) ? a_.f64[0] : b_.f64[0];
4022       r_.f64[1] = a_.f64[1];
4023     #endif
4024 
4025     return simde__m128d_from_private(r_);
4026   #endif
4027 }
4028 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4029   #define _mm_max_sd(a, b) simde_mm_max_sd(a, b)
4030 #endif
4031 
4032 SIMDE_FUNCTION_ATTRIBUTES
4033 simde__m128i
4034 simde_mm_move_epi64 (simde__m128i a) {
4035   #if defined(SIMDE_X86_SSE2_NATIVE)
4036     return _mm_move_epi64(a);
4037   #else
4038     simde__m128i_private
4039       r_,
4040       a_ = simde__m128i_to_private(a);
4041 
4042     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4043       r_.neon_i64 = vsetq_lane_s64(0, a_.neon_i64, 1);
4044     #else
4045       r_.i64[0] = a_.i64[0];
4046       r_.i64[1] = 0;
4047     #endif
4048 
4049     return simde__m128i_from_private(r_);
4050   #endif
4051 }
4052 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4053   #define _mm_move_epi64(a) simde_mm_move_epi64(a)
4054 #endif
4055 
4056 SIMDE_FUNCTION_ATTRIBUTES
4057 simde__m128i
4058 simde_mm_mul_epu32 (simde__m128i a, simde__m128i b) {
4059   #if defined(SIMDE_X86_SSE2_NATIVE)
4060     return _mm_mul_epu32(a, b);
4061   #else
4062     simde__m128i_private
4063       r_,
4064       a_ = simde__m128i_to_private(a),
4065       b_ = simde__m128i_to_private(b);
4066 
4067     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4068       uint32x2_t a_lo = vmovn_u64(a_.neon_u64);
4069       uint32x2_t b_lo = vmovn_u64(b_.neon_u64);
4070       r_.neon_u64 = vmull_u32(a_lo, b_lo);
4071     #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
4072       __typeof__(a_.u32) z = { 0, };
4073       a_.u32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.u32, z, 0, 4, 2, 6);
4074       b_.u32 = SIMDE_SHUFFLE_VECTOR_(32, 16, b_.u32, z, 0, 4, 2, 6);
4075       r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), a_.u32) *
4076                HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), b_.u32);
4077     #else
4078       SIMDE_VECTORIZE
4079       for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
4080         r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[i * 2]) * HEDLEY_STATIC_CAST(uint64_t, b_.u32[i * 2]);
4081       }
4082     #endif
4083 
4084     return simde__m128i_from_private(r_);
4085   #endif
4086 }
4087 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4088   #define _mm_mul_epu32(a, b) simde_mm_mul_epu32(a, b)
4089 #endif
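
/* Worked example (illustrative values only): only the even-indexed unsigned
 * 32-bit lanes (0 and 2) are used; each pair yields a full 64-bit product:
 *
 *   a.u32 = { 0xFFFFFFFF, 9, 2, 9 }
 *   b.u32 = { 0xFFFFFFFF, 9, 3, 9 }
 *   r.u64 = { UINT64_C(0xFFFFFFFE00000001), 6 }
 */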
4090 
4091 SIMDE_FUNCTION_ATTRIBUTES
4092 simde__m128i
4093 simde_x_mm_mul_epi64 (simde__m128i a, simde__m128i b) {
4094   simde__m128i_private
4095     r_,
4096     a_ = simde__m128i_to_private(a),
4097     b_ = simde__m128i_to_private(b);
4098 
4099   #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
4100     r_.i64 = a_.i64 * b_.i64;
4101   /* Note: NEON/AArch64 has no 64-bit integer multiply intrinsic (there is
4102    * no vmulq_s64), so ARM falls through to the portable paths below. */
4103   #else
4104     SIMDE_VECTORIZE
4105     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
4106       r_.i64[i] = a_.i64[i] * b_.i64[i];
4107     }
4108   #endif
4109 
4110   return simde__m128i_from_private(r_);
4111 }
4112 
4113 SIMDE_FUNCTION_ATTRIBUTES
4114 simde__m128i
4115 simde_x_mm_mod_epi64 (simde__m128i a, simde__m128i b) {
4116   simde__m128i_private
4117     r_,
4118     a_ = simde__m128i_to_private(a),
4119     b_ = simde__m128i_to_private(b);
4120 
4121   #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
4122     r_.i64 = a_.i64 % b_.i64;
4123   #else
4124     SIMDE_VECTORIZE
4125     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
4126       r_.i64[i] = a_.i64[i] % b_.i64[i];
4127     }
4128   #endif
4129 
4130   return simde__m128i_from_private(r_);
4131 }
4132 
4133 SIMDE_FUNCTION_ATTRIBUTES
4134 simde__m128d
4135 simde_mm_mul_pd (simde__m128d a, simde__m128d b) {
4136   #if defined(SIMDE_X86_SSE2_NATIVE)
4137     return _mm_mul_pd(a, b);
4138   #else
4139     simde__m128d_private
4140       r_,
4141       a_ = simde__m128d_to_private(a),
4142       b_ = simde__m128d_to_private(b);
4143 
4144     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
4145       r_.f64 = a_.f64 * b_.f64;
4146     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4147       r_.neon_f64 = vmulq_f64(a_.neon_f64, b_.neon_f64);
4148     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4149       r_.wasm_v128 = wasm_f64x2_mul(a_.wasm_v128, b_.wasm_v128);
4150     #else
4151       SIMDE_VECTORIZE
4152       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
4153         r_.f64[i] = a_.f64[i] * b_.f64[i];
4154       }
4155     #endif
4156 
4157     return simde__m128d_from_private(r_);
4158   #endif
4159 }
4160 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4161   #define _mm_mul_pd(a, b) simde_mm_mul_pd(a, b)
4162 #endif
4163 
4164 SIMDE_FUNCTION_ATTRIBUTES
4165 simde__m128d
4166 simde_mm_mul_sd (simde__m128d a, simde__m128d b) {
4167   #if defined(SIMDE_X86_SSE2_NATIVE)
4168     return _mm_mul_sd(a, b);
4169   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
4170     return simde_mm_move_sd(a, simde_mm_mul_pd(a, b));
4171   #else
4172     simde__m128d_private
4173       r_,
4174       a_ = simde__m128d_to_private(a),
4175       b_ = simde__m128d_to_private(b);
4176 
4177     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4178       float64x2_t temp = vmulq_f64(a_.neon_f64, b_.neon_f64);
4179       r_.neon_f64 = vsetq_lane_f64(vgetq_lane_f64(a_.neon_f64, 1), temp, 1);
4180     #else
4181       r_.f64[0] = a_.f64[0] * b_.f64[0];
4182       r_.f64[1] = a_.f64[1];
4183     #endif
4184 
4185     return simde__m128d_from_private(r_);
4186   #endif
4187 }
4188 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4189   #define _mm_mul_sd(a, b) simde_mm_mul_sd(a, b)
4190 #endif
4191 
4192 SIMDE_FUNCTION_ATTRIBUTES
4193 simde__m64
4194 simde_mm_mul_su32 (simde__m64 a, simde__m64 b) {
4195   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
4196     return _mm_mul_su32(a, b);
4197   #else
4198     simde__m64_private
4199       r_,
4200       a_ = simde__m64_to_private(a),
4201       b_ = simde__m64_to_private(b);
4202 
4203     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4204       r_.u64[0] = vget_lane_u64(vget_low_u64(vmull_u32(vreinterpret_u32_s64(a_.neon_i64), vreinterpret_u32_s64(b_.neon_i64))), 0);
4205     #else
4206       r_.u64[0] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[0]) * HEDLEY_STATIC_CAST(uint64_t, b_.u32[0]);
4207     #endif
4208 
4209     return simde__m64_from_private(r_);
4210   #endif
4211 }
4212 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4213   #define _mm_mul_su32(a, b) simde_mm_mul_su32(a, b)
4214 #endif
4215 
4216 SIMDE_FUNCTION_ATTRIBUTES
4217 simde__m128i
4218 simde_mm_mulhi_epi16 (simde__m128i a, simde__m128i b) {
4219   #if defined(SIMDE_X86_SSE2_NATIVE)
4220     return _mm_mulhi_epi16(a, b);
4221   #else
4222     simde__m128i_private
4223       r_,
4224       a_ = simde__m128i_to_private(a),
4225       b_ = simde__m128i_to_private(b);
4226 
4227     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4228       int16x4_t a3210 = vget_low_s16(a_.neon_i16);
4229       int16x4_t b3210 = vget_low_s16(b_.neon_i16);
4230       int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
4231       #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4232         int32x4_t ab7654 = vmull_high_s16(a_.neon_i16, b_.neon_i16);
4233         r_.neon_i16 = vuzp2q_s16(vreinterpretq_s16_s32(ab3210), vreinterpretq_s16_s32(ab7654));
4234       #else
4235         int16x4_t a7654 = vget_high_s16(a_.neon_i16);
4236         int16x4_t b7654 = vget_high_s16(b_.neon_i16);
4237         int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
4238         uint16x8x2_t rv = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
4239         r_.neon_u16 = rv.val[1];
4240       #endif
4241     #else
4242       SIMDE_VECTORIZE
4243       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
4244         r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (HEDLEY_STATIC_CAST(uint32_t, HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) >> 16));
4245       }
4246     #endif
4247 
4248     return simde__m128i_from_private(r_);
4249   #endif
4250 }
4251 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4252   #define _mm_mulhi_epi16(a, b) simde_mm_mulhi_epi16(a, b)
4253 #endif
4254 
4255 SIMDE_FUNCTION_ATTRIBUTES
4256 simde__m128i
4257 simde_mm_mulhi_epu16 (simde__m128i a, simde__m128i b) {
4258   #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
4259     return _mm_mulhi_epu16(a, b);
4260   #else
4261     simde__m128i_private
4262       r_,
4263       a_ = simde__m128i_to_private(a),
4264       b_ = simde__m128i_to_private(b);
4265 
4266     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4267       uint16x4_t a3210 = vget_low_u16(a_.neon_u16);
4268       uint16x4_t b3210 = vget_low_u16(b_.neon_u16);
4269       uint32x4_t ab3210 = vmull_u16(a3210, b3210); /* 3333222211110000 */
4270       #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4271         uint32x4_t ab7654 = vmull_high_u16(a_.neon_u16, b_.neon_u16);
4272         r_.neon_u16 = vuzp2q_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
4273       #else
4274         uint16x4_t a7654 = vget_high_u16(a_.neon_u16);
4275         uint16x4_t b7654 = vget_high_u16(b_.neon_u16);
4276         uint32x4_t ab7654 = vmull_u16(a7654, b7654); /* 7777666655554444 */
4277         uint16x8x2_t neon_r = vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
4278         r_.neon_u16 = neon_r.val[1];
4279       #endif
4280     #else
4281       SIMDE_VECTORIZE
4282       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
4283         r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]) >> 16);
4284       }
4285     #endif
4286 
4287     return simde__m128i_from_private(r_);
4288   #endif
4289 }
4290 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4291   #define _mm_mulhi_epu16(a, b) simde_mm_mulhi_epu16(a, b)
4292 #endif
4293 
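/* simde_mm_mullo_epi16: per 16-bit lane, keep the low 16 bits of the
 * product.  The portable loop multiplies through uint32_t so that
 * overflowing signed products do not invoke undefined behaviour; the
 * truncated result is the same either way. */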
4294 SIMDE_FUNCTION_ATTRIBUTES
4295 simde__m128i
4296 simde_mm_mullo_epi16 (simde__m128i a, simde__m128i b) {
4297   #if defined(SIMDE_X86_SSE2_NATIVE)
4298     return _mm_mullo_epi16(a, b);
4299   #else
4300     simde__m128i_private
4301       r_,
4302       a_ = simde__m128i_to_private(a),
4303       b_ = simde__m128i_to_private(b);
4304 
4305     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4306       r_.neon_i16 = vmulq_s16(a_.neon_i16, b_.neon_i16);
4307     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
4308       (void) a_;
4309       (void) b_;
4310       r_.altivec_i16 = vec_mul(a_.altivec_i16, b_.altivec_i16);
4311     #else
4312       SIMDE_VECTORIZE
4313       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
4314         r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]));
4315       }
4316     #endif
4317 
4318     return simde__m128i_from_private(r_);
4319   #endif
4320 }
4321 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4322   #define _mm_mullo_epi16(a, b) simde_mm_mullo_epi16(a, b)
4323 #endif
4324 
4325 SIMDE_FUNCTION_ATTRIBUTES
4326 simde__m128d
4327 simde_mm_or_pd (simde__m128d a, simde__m128d b) {
4328   #if defined(SIMDE_X86_SSE2_NATIVE)
4329     return _mm_or_pd(a, b);
4330   #else
4331     simde__m128d_private
4332       r_,
4333       a_ = simde__m128d_to_private(a),
4334       b_ = simde__m128d_to_private(b);
4335 
4336     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
4337       r_.i32f = a_.i32f | b_.i32f;
4338     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4339       r_.wasm_v128 = wasm_v128_or(a_.wasm_v128, b_.wasm_v128);
4340     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4341       r_.neon_i64 = vorrq_s64(a_.neon_i64, b_.neon_i64);
4342     #else
4343       SIMDE_VECTORIZE
4344       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
4345         r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
4346       }
4347     #endif
4348 
4349     return simde__m128d_from_private(r_);
4350   #endif
4351 }
4352 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4353   #define _mm_or_pd(a, b) simde_mm_or_pd(a, b)
4354 #endif
4355 
4356 SIMDE_FUNCTION_ATTRIBUTES
4357 simde__m128i
4358 simde_mm_or_si128 (simde__m128i a, simde__m128i b) {
4359   #if defined(SIMDE_X86_SSE2_NATIVE)
4360     return _mm_or_si128(a, b);
4361   #else
4362     simde__m128i_private
4363       r_,
4364       a_ = simde__m128i_to_private(a),
4365       b_ = simde__m128i_to_private(b);
4366 
4367     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4368       r_.neon_i32 = vorrq_s32(a_.neon_i32, b_.neon_i32);
4369     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
4370       r_.altivec_i32 = vec_or(a_.altivec_i32, b_.altivec_i32);
4371     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
4372       r_.i32f = a_.i32f | b_.i32f;
4373     #else
4374       SIMDE_VECTORIZE
4375       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
4376         r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
4377       }
4378     #endif
4379 
4380     return simde__m128i_from_private(r_);
4381   #endif
4382 }
4383 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4384   #define _mm_or_si128(a, b) simde_mm_or_si128(a, b)
4385 #endif
4386 
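/* simde_mm_packs_epi16: narrow sixteen signed 16-bit values to signed
 * 8-bit with saturation; lanes from a fill the low 8 bytes and lanes
 * from b the high 8 bytes (e.g. 300 saturates to 127, -300 to -128). */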
4387 SIMDE_FUNCTION_ATTRIBUTES
4388 simde__m128i
4389 simde_mm_packs_epi16 (simde__m128i a, simde__m128i b) {
4390   #if defined(SIMDE_X86_SSE2_NATIVE)
4391     return _mm_packs_epi16(a, b);
4392   #else
4393     simde__m128i_private
4394       r_,
4395       a_ = simde__m128i_to_private(a),
4396       b_ = simde__m128i_to_private(b);
4397 
4398     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4399       r_.neon_i8 = vcombine_s8(vqmovn_s16(a_.neon_i16), vqmovn_s16(b_.neon_i16));
4400     #else
4401       SIMDE_VECTORIZE
4402       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
4403         r_.i8[i]     = (a_.i16[i] > INT8_MAX) ? INT8_MAX : ((a_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, a_.i16[i]));
4404         r_.i8[i + 8] = (b_.i16[i] > INT8_MAX) ? INT8_MAX : ((b_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, b_.i16[i]));
4405       }
4406     #endif
4407 
4408     return simde__m128i_from_private(r_);
4409   #endif
4410 }
4411 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4412   #define _mm_packs_epi16(a, b) simde_mm_packs_epi16(a, b)
4413 #endif
4414 
4415 SIMDE_FUNCTION_ATTRIBUTES
4416 simde__m128i
4417 simde_mm_packs_epi32 (simde__m128i a, simde__m128i b) {
4418   #if defined(SIMDE_X86_SSE2_NATIVE)
4419     return _mm_packs_epi32(a, b);
4420   #else
4421     simde__m128i_private
4422       r_,
4423       a_ = simde__m128i_to_private(a),
4424       b_ = simde__m128i_to_private(b);
4425 
4426     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4427       r_.neon_i16 = vcombine_s16(vqmovn_s32(a_.neon_i32), vqmovn_s32(b_.neon_i32));
4428     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
4429       r_.altivec_i16 = vec_packs(a_.altivec_i32, b_.altivec_i32);
4430     #else
4431       SIMDE_VECTORIZE
4432       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
4433         r_.i16[i]     = (a_.i32[i] > INT16_MAX) ? INT16_MAX : ((a_.i32[i] < INT16_MIN) ? INT16_MIN : HEDLEY_STATIC_CAST(int16_t, a_.i32[i]));
4434         r_.i16[i + 4] = (b_.i32[i] > INT16_MAX) ? INT16_MAX : ((b_.i32[i] < INT16_MIN) ? INT16_MIN : HEDLEY_STATIC_CAST(int16_t, b_.i32[i]));
4435       }
4436     #endif
4437 
4438     return simde__m128i_from_private(r_);
4439   #endif
4440 }
4441 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4442   #define _mm_packs_epi32(a, b) simde_mm_packs_epi32(a, b)
4443 #endif
4444 
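/* simde_mm_packus_epi16: like packs_epi16 but saturates to the unsigned
 * 8-bit range, so negative inputs clamp to 0 and values above 255 clamp
 * to 255. */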
4445 SIMDE_FUNCTION_ATTRIBUTES
4446 simde__m128i
4447 simde_mm_packus_epi16 (simde__m128i a, simde__m128i b) {
4448   #if defined(SIMDE_X86_SSE2_NATIVE)
4449     return _mm_packus_epi16(a, b);
4450   #else
4451     simde__m128i_private
4452       r_,
4453       a_ = simde__m128i_to_private(a),
4454       b_ = simde__m128i_to_private(b);
4455 
4456     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4457       r_.neon_u8 = vcombine_u8(vqmovun_s16(a_.neon_i16), vqmovun_s16(b_.neon_i16));
4458     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
4459       r_.altivec_u8 = vec_packsu(a_.altivec_i16, b_.altivec_i16);
4460     #else
4461       SIMDE_VECTORIZE
4462       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
4463         r_.u8[i]     = (a_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((a_.i16[i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, a_.i16[i]));
4464         r_.u8[i + 8] = (b_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((b_.i16[i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, b_.i16[i]));
4465       }
4466     #endif
4467 
4468     return simde__m128i_from_private(r_);
4469   #endif
4470 }
4471 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4472   #define _mm_packus_epi16(a, b) simde_mm_packus_epi16(a, b)
4473 #endif
4474 
4475 SIMDE_FUNCTION_ATTRIBUTES
4476 void
4477 simde_mm_pause (void) {
4478   #if defined(SIMDE_X86_SSE2_NATIVE)
4479     _mm_pause();
4480   #endif
4481 }
4482 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4483   #define _mm_pause() (simde_mm_pause())
4484 #endif
4485 
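/* simde_mm_sad_epu8: for each 8-byte half of the vectors, sum the
 * absolute differences of the unsigned byte lanes of a and b and store
 * the total (at most 8 * 255 = 2040) in the corresponding 64-bit lane. */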
4486 SIMDE_FUNCTION_ATTRIBUTES
4487 simde__m128i
4488 simde_mm_sad_epu8 (simde__m128i a, simde__m128i b) {
4489   #if defined(SIMDE_X86_SSE2_NATIVE)
4490     return _mm_sad_epu8(a, b);
4491   #else
4492     simde__m128i_private
4493       r_,
4494       a_ = simde__m128i_to_private(a),
4495       b_ = simde__m128i_to_private(b);
4496 
4497     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4498       const uint16x8_t t = vpaddlq_u8(vabdq_u8(a_.neon_u8, b_.neon_u8));
4499       r_.neon_u64 = vcombine_u64(
4500         vpaddl_u32(vpaddl_u16(vget_low_u16(t))),
4501         vpaddl_u32(vpaddl_u16(vget_high_u16(t))));
4502     #else
4503       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
4504         uint16_t tmp = 0;
4505         SIMDE_VECTORIZE_REDUCTION(+:tmp)
4506         for (size_t j = 0 ; j < ((sizeof(r_.u8) / sizeof(r_.u8[0])) / 2) ; j++) {
4507           const size_t e = j + (i * 8);
4508           tmp += (a_.u8[e] > b_.u8[e]) ? (a_.u8[e] - b_.u8[e]) : (b_.u8[e] - a_.u8[e]);
4509         }
4510         r_.i64[i] = tmp;
4511       }
4512     #endif
4513 
4514     return simde__m128i_from_private(r_);
4515   #endif
4516 }
4517 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4518   #define _mm_sad_epu8(a, b) simde_mm_sad_epu8(a, b)
4519 #endif
4520 
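/* The _mm_set_* constructors take their arguments from the most
 * significant element down to element 0, so e0 ends up in the lowest
 * lane of the vector; the non-native backends below therefore reverse
 * the argument order when filling the storage. */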
4521 SIMDE_FUNCTION_ATTRIBUTES
4522 simde__m128i
4523 simde_mm_set_epi8 (int8_t e15, int8_t e14, int8_t e13, int8_t e12,
4524        int8_t e11, int8_t e10, int8_t  e9, int8_t  e8,
4525        int8_t  e7, int8_t  e6, int8_t  e5, int8_t  e4,
4526        int8_t  e3, int8_t  e2, int8_t  e1, int8_t  e0) {
4527 
4528   #if defined(SIMDE_X86_SSE2_NATIVE)
4529     return _mm_set_epi8(
4530       e15, e14, e13, e12, e11, e10,  e9,  e8,
4531        e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0);
4532   #else
4533     simde__m128i_private r_;
4534 
4535     #if defined(SIMDE_WASM_SIMD128_NATIVE)
4536       r_.wasm_v128 = wasm_i8x16_make(
4537          e0,  e1,  e2,  e3,  e4,  e5,  e6,  e7,
4538          e8,  e9, e10, e11, e12, e13, e14, e15);
4539     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4540       SIMDE_ALIGN_LIKE_16(int8x16_t) int8_t data[16] = {
4541         e0,  e1,  e2,  e3,
4542         e4,  e5,  e6,  e7,
4543         e8,  e9,  e10, e11,
4544         e12, e13, e14, e15};
4545       r_.neon_i8 = vld1q_s8(data);
4546     #else
4547       r_.i8[ 0] =  e0;
4548       r_.i8[ 1] =  e1;
4549       r_.i8[ 2] =  e2;
4550       r_.i8[ 3] =  e3;
4551       r_.i8[ 4] =  e4;
4552       r_.i8[ 5] =  e5;
4553       r_.i8[ 6] =  e6;
4554       r_.i8[ 7] =  e7;
4555       r_.i8[ 8] =  e8;
4556       r_.i8[ 9] =  e9;
4557       r_.i8[10] = e10;
4558       r_.i8[11] = e11;
4559       r_.i8[12] = e12;
4560       r_.i8[13] = e13;
4561       r_.i8[14] = e14;
4562       r_.i8[15] = e15;
4563     #endif
4564 
4565     return simde__m128i_from_private(r_);
4566   #endif
4567 }
4568 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4569   #define _mm_set_epi8(e15, e14, e13, e12, e11, e10,  e9,  e8,  e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0) simde_mm_set_epi8(e15, e14, e13, e12, e11, e10,  e9,  e8,  e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0)
4570 #endif
4571 
4572 SIMDE_FUNCTION_ATTRIBUTES
4573 simde__m128i
4574 simde_mm_set_epi16 (int16_t e7, int16_t e6, int16_t e5, int16_t e4,
4575         int16_t e3, int16_t e2, int16_t e1, int16_t e0) {
4576   #if defined(SIMDE_X86_SSE2_NATIVE)
4577     return _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
4578   #else
4579     simde__m128i_private r_;
4580 
4581     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4582       SIMDE_ALIGN_LIKE_16(int16x8_t) int16_t data[8] = { e0, e1, e2, e3, e4, e5, e6, e7 };
4583       r_.neon_i16 = vld1q_s16(data);
4584     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4585       r_.wasm_v128 = wasm_i16x8_make(e0, e1, e2, e3, e4, e5, e6, e7);
4586     #else
4587       r_.i16[0] = e0;
4588       r_.i16[1] = e1;
4589       r_.i16[2] = e2;
4590       r_.i16[3] = e3;
4591       r_.i16[4] = e4;
4592       r_.i16[5] = e5;
4593       r_.i16[6] = e6;
4594       r_.i16[7] = e7;
4595     #endif
4596 
4597     return simde__m128i_from_private(r_);
4598   #endif
4599 }
4600 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4601   #define _mm_set_epi16(e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0) simde_mm_set_epi16(e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0)
4602 #endif
4603 
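/* simde_mm_loadu_si16: load one unaligned 16-bit value into the low lane
 * of an otherwise zeroed vector.  The fallback goes through memcpy,
 * which keeps the access free of alignment and strict-aliasing problems;
 * loadu_si32 and loadu_si64 below follow the same pattern. */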
4604 SIMDE_FUNCTION_ATTRIBUTES
4605 simde__m128i
4606 simde_mm_loadu_si16 (void const* mem_addr) {
4607   #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
4608       SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
4609       HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
4610       HEDLEY_INTEL_VERSION_CHECK(20,21,1))
4611     return _mm_loadu_si16(mem_addr);
4612   #else
4613     int16_t val;
4614     simde_memcpy(&val, mem_addr, sizeof(val));
4615     return simde_x_mm_cvtsi16_si128(val);
4616   #endif
4617 }
4618 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4619   #define _mm_loadu_si16(mem_addr) simde_mm_loadu_si16(mem_addr)
4620 #endif
4621 
4622 SIMDE_FUNCTION_ATTRIBUTES
4623 simde__m128i
4624 simde_mm_set_epi32 (int32_t e3, int32_t e2, int32_t e1, int32_t e0) {
4625   #if defined(SIMDE_X86_SSE2_NATIVE)
4626     return _mm_set_epi32(e3, e2, e1, e0);
4627   #else
4628     simde__m128i_private r_;
4629 
4630     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4631       SIMDE_ALIGN_LIKE_16(int32x4_t) int32_t data[4] = { e0, e1, e2, e3 };
4632       r_.neon_i32 = vld1q_s32(data);
4633     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4634       r_.wasm_v128 = wasm_i32x4_make(e0, e1, e2, e3);
4635     #else
4636       r_.i32[0] = e0;
4637       r_.i32[1] = e1;
4638       r_.i32[2] = e2;
4639       r_.i32[3] = e3;
4640     #endif
4641 
4642     return simde__m128i_from_private(r_);
4643   #endif
4644 }
4645 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4646   #define _mm_set_epi32(e3,  e2,  e1,  e0) simde_mm_set_epi32(e3,  e2,  e1,  e0)
4647 #endif
4648 
4649 SIMDE_FUNCTION_ATTRIBUTES
4650 simde__m128i
4651 simde_mm_loadu_si32 (void const* mem_addr) {
4652   #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
4653       SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
4654       HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
4655       HEDLEY_INTEL_VERSION_CHECK(20,21,1))
4656     return _mm_loadu_si32(mem_addr);
4657   #else
4658     int32_t val;
4659     simde_memcpy(&val, mem_addr, sizeof(val));
4660     return simde_mm_cvtsi32_si128(val);
4661   #endif
4662 }
4663 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4664   #define _mm_loadu_si32(mem_addr) simde_mm_loadu_si32(mem_addr)
4665 #endif
4666 
4667 SIMDE_FUNCTION_ATTRIBUTES
4668 simde__m128i
4669 simde_mm_set_epi64 (simde__m64 e1, simde__m64 e0) {
4670   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
4671     return _mm_set_epi64(e1, e0);
4672   #else
4673     simde__m128i_private r_;
4674 
4675     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4676       r_.neon_i64 = vcombine_s64(simde__m64_to_neon_i64(e0), simde__m64_to_neon_i64(e1));
4677     #else
4678       r_.m64[0] = e0;
4679       r_.m64[1] = e1;
4680     #endif
4681 
4682     return simde__m128i_from_private(r_);
4683   #endif
4684 }
4685 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4686   #define _mm_set_epi64(e1, e0) (simde_mm_set_epi64((e1), (e0)))
4687 #endif
4688 
4689 SIMDE_FUNCTION_ATTRIBUTES
4690 simde__m128i
4691 simde_mm_set_epi64x (int64_t e1, int64_t e0) {
4692   #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0))
4693     return _mm_set_epi64x(e1, e0);
4694   #else
4695     simde__m128i_private r_;
4696 
4697     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4698       SIMDE_ALIGN_LIKE_16(int64x2_t) int64_t data[2] = {e0, e1};
4699       r_.neon_i64 = vld1q_s64(data);
4700     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4701       r_.wasm_v128 = wasm_i64x2_make(e0, e1);
4702     #else
4703       r_.i64[0] = e0;
4704       r_.i64[1] = e1;
4705     #endif
4706 
4707     return simde__m128i_from_private(r_);
4708   #endif
4709 }
4710 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4711   #define _mm_set_epi64x(e1, e0) simde_mm_set_epi64x(e1, e0)
4712 #endif
4713 
4714 SIMDE_FUNCTION_ATTRIBUTES
4715 simde__m128i
4716 simde_mm_loadu_si64 (void const* mem_addr) {
4717   #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
4718       SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
4719       HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
4720       HEDLEY_INTEL_VERSION_CHECK(20,21,1))
4721     return _mm_loadu_si64(mem_addr);
4722   #else
4723     int64_t val;
4724     simde_memcpy(&val, mem_addr, sizeof(val));
4725     return simde_mm_cvtsi64_si128(val);
4726   #endif
4727 }
4728 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4729   #define _mm_loadu_si64(mem_addr) simde_mm_loadu_si64(mem_addr)
4730 #endif
4731 
4732 SIMDE_FUNCTION_ATTRIBUTES
4733 simde__m128i
4734 simde_x_mm_set_epu8 (uint8_t e15, uint8_t e14, uint8_t e13, uint8_t e12,
4735          uint8_t e11, uint8_t e10, uint8_t  e9, uint8_t  e8,
4736          uint8_t  e7, uint8_t  e6, uint8_t  e5, uint8_t  e4,
4737          uint8_t  e3, uint8_t  e2, uint8_t  e1, uint8_t  e0) {
4738   #if defined(SIMDE_X86_SSE2_NATIVE)
4739     return _mm_set_epi8(
4740       HEDLEY_STATIC_CAST(char, e15), HEDLEY_STATIC_CAST(char, e14), HEDLEY_STATIC_CAST(char, e13), HEDLEY_STATIC_CAST(char, e12),
4741       HEDLEY_STATIC_CAST(char, e11), HEDLEY_STATIC_CAST(char, e10), HEDLEY_STATIC_CAST(char,  e9), HEDLEY_STATIC_CAST(char,  e8),
4742       HEDLEY_STATIC_CAST(char,  e7), HEDLEY_STATIC_CAST(char,  e6), HEDLEY_STATIC_CAST(char,  e5), HEDLEY_STATIC_CAST(char,  e4),
4743       HEDLEY_STATIC_CAST(char,  e3), HEDLEY_STATIC_CAST(char,  e2), HEDLEY_STATIC_CAST(char,  e1), HEDLEY_STATIC_CAST(char,  e0));
4744   #else
4745     simde__m128i_private r_;
4746 
4747     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4748       SIMDE_ALIGN_LIKE_16(uint8x16_t) uint8_t data[16] = {
4749         e0,  e1,  e2,  e3,
4750         e4,  e5,  e6,  e7,
4751         e8,  e9,  e10, e11,
4752         e12, e13, e14, e15};
4753       r_.neon_u8 = vld1q_u8(data);
4754     #else
4755       r_.u8[ 0] =  e0; r_.u8[ 1] =  e1; r_.u8[ 2] =  e2; r_.u8[ 3] =  e3;
4756       r_.u8[ 4] =  e4; r_.u8[ 5] =  e5; r_.u8[ 6] =  e6; r_.u8[ 7] =  e7;
4757       r_.u8[ 8] =  e8; r_.u8[ 9] =  e9; r_.u8[10] = e10; r_.u8[11] = e11;
4758       r_.u8[12] = e12; r_.u8[13] = e13; r_.u8[14] = e14; r_.u8[15] = e15;
4759     #endif
4760 
4761     return simde__m128i_from_private(r_);
4762   #endif
4763 }
4764 
4765 SIMDE_FUNCTION_ATTRIBUTES
4766 simde__m128i
4767 simde_x_mm_set_epu16 (uint16_t e7, uint16_t e6, uint16_t e5, uint16_t e4,
4768           uint16_t e3, uint16_t e2, uint16_t e1, uint16_t e0) {
4769   #if defined(SIMDE_X86_SSE2_NATIVE)
4770     return _mm_set_epi16(
4771       HEDLEY_STATIC_CAST(short,  e7), HEDLEY_STATIC_CAST(short,  e6), HEDLEY_STATIC_CAST(short,  e5), HEDLEY_STATIC_CAST(short,  e4),
4772       HEDLEY_STATIC_CAST(short,  e3), HEDLEY_STATIC_CAST(short,  e2), HEDLEY_STATIC_CAST(short,  e1), HEDLEY_STATIC_CAST(short,  e0));
4773   #else
4774     simde__m128i_private r_;
4775 
4776     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4777       SIMDE_ALIGN_LIKE_16(uint16x8_t) uint16_t data[8] = { e0, e1, e2, e3, e4, e5, e6, e7 };
4778       r_.neon_u16 = vld1q_u16(data);
4779     #else
4780       r_.u16[0] = e0; r_.u16[1] = e1; r_.u16[2] = e2; r_.u16[3] = e3;
4781       r_.u16[4] = e4; r_.u16[5] = e5; r_.u16[6] = e6; r_.u16[7] = e7;
4782     #endif
4783 
4784     return simde__m128i_from_private(r_);
4785   #endif
4786 }
4787 
4788 SIMDE_FUNCTION_ATTRIBUTES
4789 simde__m128i
4790 simde_x_mm_set_epu32 (uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) {
4791   #if defined(SIMDE_X86_SSE2_NATIVE)
4792     return _mm_set_epi32(
4793       HEDLEY_STATIC_CAST(int,  e3), HEDLEY_STATIC_CAST(int,  e2), HEDLEY_STATIC_CAST(int,  e1), HEDLEY_STATIC_CAST(int,  e0));
4794   #else
4795     simde__m128i_private r_;
4796 
4797     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4798       SIMDE_ALIGN_LIKE_16(uint32x4_t) uint32_t data[4] = { e0, e1, e2, e3 };
4799       r_.neon_u32 = vld1q_u32(data);
4800     #else
4801       r_.u32[0] = e0;
4802       r_.u32[1] = e1;
4803       r_.u32[2] = e2;
4804       r_.u32[3] = e3;
4805     #endif
4806 
4807     return simde__m128i_from_private(r_);
4808   #endif
4809 }
4810 
4811 SIMDE_FUNCTION_ATTRIBUTES
4812 simde__m128i
4813 simde_x_mm_set_epu64x (uint64_t e1, uint64_t e0) {
4814   #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0))
4815     return _mm_set_epi64x(HEDLEY_STATIC_CAST(int64_t,  e1), HEDLEY_STATIC_CAST(int64_t,  e0));
4816   #else
4817     simde__m128i_private r_;
4818 
4819     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4820       SIMDE_ALIGN_LIKE_16(uint64x2_t) uint64_t data[2] = {e0, e1};
4821       r_.neon_u64 = vld1q_u64(data);
4822     #else
4823       r_.u64[0] = e0;
4824       r_.u64[1] = e1;
4825     #endif
4826 
4827     return simde__m128i_from_private(r_);
4828   #endif
4829 }
4830 
4831 SIMDE_FUNCTION_ATTRIBUTES
4832 simde__m128d
4833 simde_mm_set_sd (simde_float64 a) {
4834   #if defined(SIMDE_X86_SSE2_NATIVE)
4835     return _mm_set_sd(a);
4836   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4837     return vsetq_lane_f64(a, vdupq_n_f64(SIMDE_FLOAT64_C(0.0)), 0);
4838   #else
4839     return simde_mm_set_pd(SIMDE_FLOAT64_C(0.0), a);
4840   #endif
4841 }
4842 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4843   #define _mm_set_sd(a) simde_mm_set_sd(a)
4844 #endif
4845 
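/* The _mm_set1_* helpers broadcast a single scalar to every lane, e.g.
 * simde_mm_set1_epi16(3) yields a vector of eight 3s.  Each backend maps
 * this to its native splat/dup operation where one is available. */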
4846 SIMDE_FUNCTION_ATTRIBUTES
4847 simde__m128i
4848 simde_mm_set1_epi8 (int8_t a) {
4849   #if defined(SIMDE_X86_SSE2_NATIVE)
4850     return _mm_set1_epi8(a);
4851   #else
4852     simde__m128i_private r_;
4853 
4854     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4855       r_.neon_i8 = vdupq_n_s8(a);
4856     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4857       r_.wasm_v128 = wasm_i8x16_splat(a);
4858     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
4859       r_.altivec_i8 = vec_splats(HEDLEY_STATIC_CAST(signed char, a));
4860     #else
4861       SIMDE_VECTORIZE
4862       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
4863         r_.i8[i] = a;
4864       }
4865     #endif
4866 
4867     return simde__m128i_from_private(r_);
4868   #endif
4869 }
4870 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4871   #define _mm_set1_epi8(a) simde_mm_set1_epi8(a)
4872 #endif
4873 
4874 SIMDE_FUNCTION_ATTRIBUTES
4875 simde__m128i
4876 simde_mm_set1_epi16 (int16_t a) {
4877   #if defined(SIMDE_X86_SSE2_NATIVE)
4878     return _mm_set1_epi16(a);
4879   #else
4880     simde__m128i_private r_;
4881 
4882     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4883       r_.neon_i16 = vdupq_n_s16(a);
4884     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4885       r_.wasm_v128 = wasm_i16x8_splat(a);
4886     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
4887       r_.altivec_i16 = vec_splats(HEDLEY_STATIC_CAST(signed short, a));
4888     #else
4889       SIMDE_VECTORIZE
4890       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
4891         r_.i16[i] = a;
4892       }
4893     #endif
4894 
4895     return simde__m128i_from_private(r_);
4896   #endif
4897 }
4898 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4899   #define _mm_set1_epi16(a) simde_mm_set1_epi16(a)
4900 #endif
4901 
4902 SIMDE_FUNCTION_ATTRIBUTES
4903 simde__m128i
4904 simde_mm_set1_epi32 (int32_t a) {
4905   #if defined(SIMDE_X86_SSE2_NATIVE)
4906     return _mm_set1_epi32(a);
4907   #else
4908     simde__m128i_private r_;
4909 
4910     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4911       r_.neon_i32 = vdupq_n_s32(a);
4912     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4913       r_.wasm_v128 = wasm_i32x4_splat(a);
4914     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
4915       r_.altivec_i32 = vec_splats(HEDLEY_STATIC_CAST(signed int, a));
4916     #else
4917       SIMDE_VECTORIZE
4918       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
4919         r_.i32[i] = a;
4920       }
4921     #endif
4922 
4923     return simde__m128i_from_private(r_);
4924   #endif
4925 }
4926 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4927   #define _mm_set1_epi32(a) simde_mm_set1_epi32(a)
4928 #endif
4929 
4930 SIMDE_FUNCTION_ATTRIBUTES
4931 simde__m128i
4932 simde_mm_set1_epi64x (int64_t a) {
4933   #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0))
4934     return _mm_set1_epi64x(a);
4935   #else
4936     simde__m128i_private r_;
4937 
4938     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4939       r_.neon_i64 = vdupq_n_s64(a);
4940     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4941       r_.wasm_v128 = wasm_i64x2_splat(a);
4942     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
4943       r_.altivec_i64 = vec_splats(HEDLEY_STATIC_CAST(signed long long, a));
4944     #else
4945       SIMDE_VECTORIZE
4946       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
4947         r_.i64[i] = a;
4948       }
4949     #endif
4950 
4951     return simde__m128i_from_private(r_);
4952   #endif
4953 }
4954 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4955   #define _mm_set1_epi64x(a) simde_mm_set1_epi64x(a)
4956 #endif
4957 
4958 SIMDE_FUNCTION_ATTRIBUTES
4959 simde__m128i
4960 simde_mm_set1_epi64 (simde__m64 a) {
4961   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
4962     return _mm_set1_epi64(a);
4963   #else
4964     simde__m64_private a_ = simde__m64_to_private(a);
4965     return simde_mm_set1_epi64x(a_.i64[0]);
4966   #endif
4967 }
4968 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4969   #define _mm_set1_epi64(a) simde_mm_set1_epi64(a)
4970 #endif
4971 
4972 SIMDE_FUNCTION_ATTRIBUTES
4973 simde__m128i
4974 simde_x_mm_set1_epu8 (uint8_t value) {
4975   #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4976     return simde__m128i_from_altivec_u8(vec_splats(HEDLEY_STATIC_CAST(unsigned char, value)));
4977   #else
4978     return simde_mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, value));
4979   #endif
4980 }
4981 
4982 SIMDE_FUNCTION_ATTRIBUTES
4983 simde__m128i
4984 simde_x_mm_set1_epu16 (uint16_t value) {
4985   #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4986     return simde__m128i_from_altivec_u16(vec_splats(HEDLEY_STATIC_CAST(unsigned short, value)));
4987   #else
4988     return simde_mm_set1_epi16(HEDLEY_STATIC_CAST(int16_t, value));
4989   #endif
4990 }
4991 
4992 SIMDE_FUNCTION_ATTRIBUTES
4993 simde__m128i
4994 simde_x_mm_set1_epu32 (uint32_t value) {
4995   #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4996     return simde__m128i_from_altivec_u32(vec_splats(HEDLEY_STATIC_CAST(unsigned int, value)));
4997   #else
4998     return simde_mm_set1_epi32(HEDLEY_STATIC_CAST(int32_t, value));
4999   #endif
5000 }
5001 
5002 SIMDE_FUNCTION_ATTRIBUTES
5003 simde__m128i
5004 simde_x_mm_set1_epu64 (uint64_t value) {
5005   #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
5006     return simde__m128i_from_altivec_u64(vec_splats(HEDLEY_STATIC_CAST(unsigned long long, value)));
5007   #else
5008     return simde_mm_set1_epi64x(HEDLEY_STATIC_CAST(int64_t, value));
5009   #endif
5010 }
5011 
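/* The _mm_setr_* variants are the same constructors with the argument
 * order reversed (the first argument becomes element 0), so they simply
 * forward to the corresponding _mm_set_* with the arguments flipped. */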
5012 SIMDE_FUNCTION_ATTRIBUTES
5013 simde__m128i
5014 simde_mm_setr_epi8 (int8_t e15, int8_t e14, int8_t e13, int8_t e12,
5015         int8_t e11, int8_t e10, int8_t  e9, int8_t  e8,
5016         int8_t  e7, int8_t  e6, int8_t  e5, int8_t  e4,
5017         int8_t  e3, int8_t  e2, int8_t  e1, int8_t  e0) {
5018   #if defined(SIMDE_X86_SSE2_NATIVE)
5019     return _mm_setr_epi8(
5020       e15, e14, e13, e12, e11, e10,  e9,    e8,
5021       e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0);
5022   #else
5023     return simde_mm_set_epi8(
5024       e0, e1, e2, e3, e4, e5, e6, e7,
5025       e8, e9, e10, e11, e12, e13, e14, e15);
5026   #endif
5027 }
5028 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5029   #define _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)
5030 #endif
5031 
5032 SIMDE_FUNCTION_ATTRIBUTES
5033 simde__m128i
5034 simde_mm_setr_epi16 (int16_t e7, int16_t e6, int16_t e5, int16_t e4,
5035          int16_t e3, int16_t e2, int16_t e1, int16_t e0) {
5036   #if defined(SIMDE_X86_SSE2_NATIVE)
5037     return _mm_setr_epi16(e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0);
5038   #else
5039     return simde_mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7);
5040   #endif
5041 }
5042 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5043   #define _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0)
5044 #endif
5045 
5046 SIMDE_FUNCTION_ATTRIBUTES
5047 simde__m128i
5048 simde_mm_setr_epi32 (int32_t e3, int32_t e2, int32_t e1, int32_t e0) {
5049   #if defined(SIMDE_X86_SSE2_NATIVE)
5050     return _mm_setr_epi32(e3, e2, e1, e0);
5051   #else
5052     return simde_mm_set_epi32(e0, e1, e2, e3);
5053   #endif
5054 }
5055 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5056   #define _mm_setr_epi32(e3, e2, e1, e0) simde_mm_setr_epi32(e3, e2, e1, e0)
5057 #endif
5058 
5059 SIMDE_FUNCTION_ATTRIBUTES
5060 simde__m128i
5061 simde_mm_setr_epi64 (simde__m64 e1, simde__m64 e0) {
5062   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
5063     return _mm_setr_epi64(e1, e0);
5064   #else
5065     return simde_mm_set_epi64(e0, e1);
5066   #endif
5067 }
5068 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5069   #define _mm_setr_epi64(e1, e0) (simde_mm_setr_epi64((e1), (e0)))
5070 #endif
5071 
5072 SIMDE_FUNCTION_ATTRIBUTES
5073 simde__m128d
5074 simde_mm_setr_pd (simde_float64 e1, simde_float64 e0) {
5075   #if defined(SIMDE_X86_SSE2_NATIVE)
5076     return _mm_setr_pd(e1, e0);
5077   #else
5078     return simde_mm_set_pd(e0, e1);
5079   #endif
5080 }
5081 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5082   #define _mm_setr_pd(e1, e0) simde_mm_setr_pd(e1, e0)
5083 #endif
5084 
5085 SIMDE_FUNCTION_ATTRIBUTES
5086 simde__m128d
5087 simde_mm_setzero_pd (void) {
5088   #if defined(SIMDE_X86_SSE2_NATIVE)
5089     return _mm_setzero_pd();
5090   #else
5091     return simde_mm_castsi128_pd(simde_mm_setzero_si128());
5092   #endif
5093 }
5094 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5095   #define _mm_setzero_pd() simde_mm_setzero_pd()
5096 #endif
5097 
5098 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
5099 HEDLEY_DIAGNOSTIC_PUSH
5100 SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
5101 #endif
5102 
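/* _mm_undefined_* nominally return a vector with unspecified contents.
 * When the native intrinsic is unavailable the fallback zero-initialises
 * the result, unless uninitialised-variable diagnostics are being
 * suppressed, in which case the value is deliberately left alone. */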
5103 SIMDE_FUNCTION_ATTRIBUTES
5104 simde__m128d
5105 simde_mm_undefined_pd (void) {
5106   simde__m128d_private r_;
5107 
5108   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
5109     r_.n = _mm_undefined_pd();
5110   #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
5111     r_ = simde__m128d_to_private(simde_mm_setzero_pd());
5112   #endif
5113 
5114   return simde__m128d_from_private(r_);
5115 }
5116 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5117   #define _mm_undefined_pd() simde_mm_undefined_pd()
5118 #endif
5119 
5120 SIMDE_FUNCTION_ATTRIBUTES
5121 simde__m128i
5122 simde_mm_undefined_si128 (void) {
5123   simde__m128i_private r_;
5124 
5125   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
5126     r_.n = _mm_undefined_si128();
5127   #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
5128     r_ = simde__m128i_to_private(simde_mm_setzero_si128());
5129   #endif
5130 
5131   return simde__m128i_from_private(r_);
5132 }
5133 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5134   #define _mm_undefined_si128() (simde_mm_undefined_si128())
5135 #endif
5136 
5137 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
5138 HEDLEY_DIAGNOSTIC_POP
5139 #endif
5140 
5141 SIMDE_FUNCTION_ATTRIBUTES
5142 simde__m128d
5143 simde_x_mm_setone_pd (void) {
5144   return simde_mm_castps_pd(simde_x_mm_setone_ps());
5145 }
5146 
5147 SIMDE_FUNCTION_ATTRIBUTES
5148 simde__m128i
5149 simde_x_mm_setone_si128 (void) {
5150   return simde_mm_castps_si128(simde_x_mm_setone_ps());
5151 }
5152 
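/* simde_mm_shuffle_epi32: imm8 packs four 2-bit source indices, one per
 * destination lane (bits 1:0 select the source of lane 0, bits 3:2 lane
 * 1, and so on).  For example 0xE4 (== _MM_SHUFFLE(3, 2, 1, 0)) is the
 * identity shuffle and 0x1B reverses the four 32-bit lanes. */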
5153 SIMDE_FUNCTION_ATTRIBUTES
5154 simde__m128i
5155 simde_mm_shuffle_epi32 (simde__m128i a, const int imm8)
5156     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
5157   simde__m128i_private
5158     r_,
5159     a_ = simde__m128i_to_private(a);
5160 
5161   for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
5162     r_.i32[i] = a_.i32[(imm8 >> (i * 2)) & 3];
5163   }
5164 
5165   return simde__m128i_from_private(r_);
5166 }
5167 #if defined(SIMDE_X86_SSE2_NATIVE)
5168   #define simde_mm_shuffle_epi32(a, imm8) _mm_shuffle_epi32((a), (imm8))
5169 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5170   #define simde_mm_shuffle_epi32(a, imm8)                                   \
5171     __extension__({                                                         \
5172         int32x4_t ret;                                                      \
5173         ret = vmovq_n_s32(                                                  \
5174             vgetq_lane_s32(vreinterpretq_s32_s64(a), (imm8) & (0x3)));     \
5175         ret = vsetq_lane_s32(                                               \
5176             vgetq_lane_s32(vreinterpretq_s32_s64(a), ((imm8) >> 2) & 0x3), \
5177             ret, 1);                                                        \
5178         ret = vsetq_lane_s32(                                               \
5179             vgetq_lane_s32(vreinterpretq_s32_s64(a), ((imm8) >> 4) & 0x3), \
5180             ret, 2);                                                        \
5181         ret = vsetq_lane_s32(                                               \
5182             vgetq_lane_s32(vreinterpretq_s32_s64(a), ((imm8) >> 6) & 0x3), \
5183             ret, 3);                                                        \
5184         vreinterpretq_s64_s32(ret);                                       \
5185     })
5186 #elif defined(SIMDE_SHUFFLE_VECTOR_)
5187   #define simde_mm_shuffle_epi32(a, imm8) (__extension__ ({ \
5188       const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
5189       simde__m128i_from_private((simde__m128i_private) { .i32 = \
5190         SIMDE_SHUFFLE_VECTOR_(32, 16, \
5191           (simde__tmp_a_).i32, \
5192           (simde__tmp_a_).i32, \
5193           ((imm8)     ) & 3, \
5194           ((imm8) >> 2) & 3, \
5195           ((imm8) >> 4) & 3, \
5196           ((imm8) >> 6) & 3) }); }))
5197 #endif
5198 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5199   #define _mm_shuffle_epi32(a, imm8) simde_mm_shuffle_epi32(a, imm8)
5200 #endif
5201 
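/* simde_mm_shuffle_pd: bit 0 of imm8 picks which double of a becomes the
 * low result lane and bit 1 picks which double of b becomes the high
 * result lane. */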
5202 SIMDE_FUNCTION_ATTRIBUTES
5203 simde__m128d
5204 simde_mm_shuffle_pd (simde__m128d a, simde__m128d b, const int imm8)
5205     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3)  {
5206   simde__m128d_private
5207     r_,
5208     a_ = simde__m128d_to_private(a),
5209     b_ = simde__m128d_to_private(b);
5210 
5211   r_.f64[0] = ((imm8 & 1) == 0) ? a_.f64[0] : a_.f64[1];
5212   r_.f64[1] = ((imm8 & 2) == 0) ? b_.f64[0] : b_.f64[1];
5213 
5214   return simde__m128d_from_private(r_);
5215 }
5216 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
5217   #define simde_mm_shuffle_pd(a, b, imm8) _mm_shuffle_pd((a), (b), (imm8))
5218 #elif defined(SIMDE_SHUFFLE_VECTOR_)
5219   #define simde_mm_shuffle_pd(a, b, imm8) (__extension__ ({ \
5220       simde__m128d_from_private((simde__m128d_private) { .f64 = \
5221         SIMDE_SHUFFLE_VECTOR_(64, 16, \
5222           simde__m128d_to_private(a).f64, \
5223           simde__m128d_to_private(b).f64, \
5224           (((imm8)     ) & 1), \
5225           (((imm8) >> 1) & 1) + 2) }); }))
5226 #endif
5227 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5228   #define _mm_shuffle_pd(a, b, imm8) simde_mm_shuffle_pd(a, b, imm8)
5229 #endif
5230 
5231 SIMDE_FUNCTION_ATTRIBUTES
5232 simde__m128i
5233 simde_mm_shufflehi_epi16 (simde__m128i a, const int imm8)
5234     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
5235   simde__m128i_private
5236     r_,
5237     a_ = simde__m128i_to_private(a);
5238 
5239   SIMDE_VECTORIZE
5240   for (size_t i = 0 ; i < ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i++) {
5241     r_.i16[i] = a_.i16[i];
5242   }
5243   for (size_t i = ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
5244     r_.i16[i] = a_.i16[((imm8 >> ((i - 4) * 2)) & 3) + 4];
5245   }
5246 
5247   return simde__m128i_from_private(r_);
5248 }
5249 #if defined(SIMDE_X86_SSE2_NATIVE)
5250   #define simde_mm_shufflehi_epi16(a, imm8) _mm_shufflehi_epi16((a), (imm8))
5251 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5252   #define simde_mm_shufflehi_epi16(a, imm8) \
5253     __extension__({                                                            \
5254         int16x8_t ret = vreinterpretq_s16_s64(a);                            \
5255         int16x4_t highBits = vget_high_s16(ret);                               \
5256         ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm8) & (0x3)), ret, 4);  \
5257         ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm8) >> 2) & 0x3), ret, \
5258                              5);                                               \
5259         ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm8) >> 4) & 0x3), ret, \
5260                              6);                                               \
5261         ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm8) >> 6) & 0x3), ret, \
5262                              7);                                               \
5263         vreinterpretq_s64_s16(ret);                                          \
5264     })
5265 #elif defined(SIMDE_SHUFFLE_VECTOR_)
5266   #define simde_mm_shufflehi_epi16(a, imm8) (__extension__ ({ \
5267       const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
5268       simde__m128i_from_private((simde__m128i_private) { .i16 = \
5269         SIMDE_SHUFFLE_VECTOR_(16, 16, \
5270           (simde__tmp_a_).i16, \
5271           (simde__tmp_a_).i16, \
5272           0, 1, 2, 3, \
5273           (((imm8)     ) & 3) + 4, \
5274           (((imm8) >> 2) & 3) + 4, \
5275           (((imm8) >> 4) & 3) + 4, \
5276           (((imm8) >> 6) & 3) + 4) }); }))
5277 #endif
5278 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5279   #define _mm_shufflehi_epi16(a, imm8) simde_mm_shufflehi_epi16(a, imm8)
5280 #endif
5281 
5282 SIMDE_FUNCTION_ATTRIBUTES
5283 simde__m128i
5284 simde_mm_shufflelo_epi16 (simde__m128i a, const int imm8)
5285     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
5286   simde__m128i_private
5287     r_,
5288     a_ = simde__m128i_to_private(a);
5289 
5290   for (size_t i = 0 ; i < ((sizeof(r_.i16) / sizeof(r_.i16[0])) / 2) ; i++) {
5291     r_.i16[i] = a_.i16[((imm8 >> (i * 2)) & 3)];
5292   }
5293   SIMDE_VECTORIZE
5294   for (size_t i = ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
5295     r_.i16[i] = a_.i16[i];
5296   }
5297 
5298   return simde__m128i_from_private(r_);
5299 }
5300 #if defined(SIMDE_X86_SSE2_NATIVE)
5301   #define simde_mm_shufflelo_epi16(a, imm8) _mm_shufflelo_epi16((a), (imm8))
5302 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5303   #define simde_mm_shufflelo_epi16(a, imm8)                                  \
5304     __extension__({                                                           \
5305         int16x8_t ret = vreinterpretq_s16_s64(a);                           \
5306         int16x4_t lowBits = vget_low_s16(ret);                                \
5307         ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm8) & (0x3)), ret, 0);  \
5308         ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm8) >> 2) & 0x3), ret, \
5309                              1);                                              \
5310         ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm8) >> 4) & 0x3), ret, \
5311                              2);                                              \
5312         ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm8) >> 6) & 0x3), ret, \
5313                              3);                                              \
5314         vreinterpretq_s64_s16(ret);                                         \
5315     })
5316 #elif defined(SIMDE_SHUFFLE_VECTOR_)
5317   #define simde_mm_shufflelo_epi16(a, imm8) (__extension__ ({ \
5318       const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
5319       simde__m128i_from_private((simde__m128i_private) { .i16 = \
5320         SIMDE_SHUFFLE_VECTOR_(16, 16, \
5321           (simde__tmp_a_).i16, \
5322           (simde__tmp_a_).i16, \
5323           (((imm8)     ) & 3), \
5324           (((imm8) >> 2) & 3), \
5325           (((imm8) >> 4) & 3), \
5326           (((imm8) >> 6) & 3), \
5327           4, 5, 6, 7) }); }))
5328 #endif
5329 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5330   #define _mm_shufflelo_epi16(a, imm8) simde_mm_shufflelo_epi16(a, imm8)
5331 #endif
5332 
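/* The _mm_sll_* family shifts every lane left by the count held in the
 * low 64 bits of the second operand; a count of lane-width or more
 * yields all zeros, which is why the helpers below bail out early with
 * setzero. */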
5333 SIMDE_FUNCTION_ATTRIBUTES
5334 simde__m128i
5335 simde_mm_sll_epi16 (simde__m128i a, simde__m128i count) {
5336   #if defined(SIMDE_X86_SSE2_NATIVE)
5337     return _mm_sll_epi16(a, count);
5338   #else
5339     simde__m128i_private
5340       r_,
5341       a_ = simde__m128i_to_private(a),
5342       count_ = simde__m128i_to_private(count);
5343 
5344     if (count_.u64[0] > 15)
5345       return simde_mm_setzero_si128();
5346 
5347     #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
5348       r_.u16 = (a_.u16 << count_.u64[0]);
5349     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5350       r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, count_.u64[0])));
5351     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5352       r_.wasm_v128 = ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 16) ? wasm_i16x8_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_extract_lane(count_.wasm_v128, 0))) : wasm_i16x8_const(0,0,0,0,0,0,0,0));
5353     #else
5354       SIMDE_VECTORIZE
5355       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
5356         r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (a_.u16[i] << count_.u64[0]));
5357       }
5358     #endif
5359 
5360     return simde__m128i_from_private(r_);
5361   #endif
5362 }
5363 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5364   #define _mm_sll_epi16(a, count) simde_mm_sll_epi16((a), (count))
5365 #endif
5366 
5367 SIMDE_FUNCTION_ATTRIBUTES
5368 simde__m128i
5369 simde_mm_sll_epi32 (simde__m128i a, simde__m128i count) {
5370   #if defined(SIMDE_X86_SSE2_NATIVE)
5371     return _mm_sll_epi32(a, count);
5372   #else
5373     simde__m128i_private
5374       r_,
5375       a_ = simde__m128i_to_private(a),
5376       count_ = simde__m128i_to_private(count);
5377 
5378     if (count_.u64[0] > 31)
5379       return simde_mm_setzero_si128();
5380 
5381     #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
5382       r_.u32 = (a_.u32 << count_.u64[0]);
5383     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5384       r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, count_.u64[0])));
5385     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5386       r_.wasm_v128 = ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 32) ? wasm_i32x4_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_extract_lane(count_.wasm_v128, 0))) : wasm_i32x4_const(0,0,0,0));
5387     #else
5388       SIMDE_VECTORIZE
5389       for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
5390         r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (a_.u32[i] << count_.u64[0]));
5391       }
5392     #endif
5393 
5394     return simde__m128i_from_private(r_);
5395   #endif
5396 }
5397 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5398   #define _mm_sll_epi32(a, count) (simde_mm_sll_epi32(a, (count)))
5399 #endif
5400 
5401 SIMDE_FUNCTION_ATTRIBUTES
5402 simde__m128i
5403 simde_mm_sll_epi64 (simde__m128i a, simde__m128i count) {
5404   #if defined(SIMDE_X86_SSE2_NATIVE)
5405     return _mm_sll_epi64(a, count);
5406   #else
5407     simde__m128i_private
5408       r_,
5409       a_ = simde__m128i_to_private(a),
5410       count_ = simde__m128i_to_private(count);
5411 
5412     if (count_.u64[0] > 63)
5413       return simde_mm_setzero_si128();
5414 
5415     const int_fast16_t s = HEDLEY_STATIC_CAST(int_fast16_t, count_.u64[0]);
5416     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5417       r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, s)));
5418     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5419       r_.wasm_v128 = (s < 64) ? wasm_i64x2_shl(a_.wasm_v128, s) : wasm_i64x2_const(0,0);
5420     #else
5421       #if !defined(SIMDE_BUG_GCC_94488)
5422         SIMDE_VECTORIZE
5423       #endif
5424       for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
5425         r_.u64[i] = a_.u64[i] << s;
5426       }
5427     #endif
5428 
5429     return simde__m128i_from_private(r_);
5430   #endif
5431 }
5432 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5433   #define _mm_sll_epi64(a, count) (simde_mm_sll_epi64(a, (count)))
5434 #endif
5435 
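/* simde_mm_sqrt_pd takes the square root of both double lanes;
 * simde_mm_sqrt_sd below applies it only to the low lane of b and copies
 * the high lane from a.  The scalar fallback relies on simde_math_sqrt
 * being available. */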
5436 SIMDE_FUNCTION_ATTRIBUTES
5437 simde__m128d
5438 simde_mm_sqrt_pd (simde__m128d a) {
5439   #if defined(SIMDE_X86_SSE2_NATIVE)
5440     return _mm_sqrt_pd(a);
5441   #else
5442     simde__m128d_private
5443       r_,
5444       a_ = simde__m128d_to_private(a);
5445 
5446     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5447       r_.neon_f64 = vsqrtq_f64(a_.neon_f64);
5448     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5449       r_.wasm_v128 = wasm_f64x2_sqrt(a_.wasm_v128);
5450     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
5451       r_.altivec_f64 = vec_sqrt(a_.altivec_f64);
5452     #elif defined(simde_math_sqrt)
5453       SIMDE_VECTORIZE
5454       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
5455         r_.f64[i] = simde_math_sqrt(a_.f64[i]);
5456       }
5457     #else
5458       HEDLEY_UNREACHABLE();
5459     #endif
5460 
5461     return simde__m128d_from_private(r_);
5462   #endif
5463 }
5464 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5465   #define _mm_sqrt_pd(a) simde_mm_sqrt_pd(a)
5466 #endif
5467 
5468 SIMDE_FUNCTION_ATTRIBUTES
5469 simde__m128d
5470 simde_mm_sqrt_sd (simde__m128d a, simde__m128d b) {
5471   #if defined(SIMDE_X86_SSE2_NATIVE)
5472     return _mm_sqrt_sd(a, b);
5473   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
5474     return simde_mm_move_sd(a, simde_mm_sqrt_pd(b));
5475   #else
5476     simde__m128d_private
5477       r_,
5478       a_ = simde__m128d_to_private(a),
5479       b_ = simde__m128d_to_private(b);
5480 
5481     #if defined(simde_math_sqrt)
5482       r_.f64[0] = simde_math_sqrt(b_.f64[0]);
5483       r_.f64[1] = a_.f64[1];
5484     #else
5485       HEDLEY_UNREACHABLE();
5486     #endif
5487 
5488     return simde__m128d_from_private(r_);
5489   #endif
5490 }
5491 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5492   #define _mm_sqrt_sd(a, b) simde_mm_sqrt_sd(a, b)
5493 #endif
5494 
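/* The _mm_srl_* family is the logical right shift counterpart: the count
 * again comes from the low 64 bits of the second operand and counts of
 * lane-width or more produce zero, so the count is clamped before the
 * per-lane shifts below. */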
5495 SIMDE_FUNCTION_ATTRIBUTES
5496 simde__m128i
5497 simde_mm_srl_epi16 (simde__m128i a, simde__m128i count) {
5498   #if defined(SIMDE_X86_SSE2_NATIVE)
5499     return _mm_srl_epi16(a, count);
5500   #else
5501     simde__m128i_private
5502       r_,
5503       a_ = simde__m128i_to_private(a),
5504       count_ = simde__m128i_to_private(count);
5505 
5506     const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 16 ? 16 : count_.i64[0]));
5507 
5508     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5509       r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
5510     #else
5511       SIMDE_VECTORIZE
5512       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
5513         r_.u16[i] = a_.u16[i] >> cnt;
5514       }
5515     #endif
5516 
5517     return simde__m128i_from_private(r_);
5518   #endif
5519 }
5520 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5521   #define _mm_srl_epi16(a, count) (simde_mm_srl_epi16(a, (count)))
5522 #endif
5523 
5524 SIMDE_FUNCTION_ATTRIBUTES
5525 simde__m128i
5526 simde_mm_srl_epi32 (simde__m128i a, simde__m128i count) {
5527   #if defined(SIMDE_X86_SSE2_NATIVE)
5528     return _mm_srl_epi32(a, count);
5529   #else
5530     simde__m128i_private
5531       r_,
5532       a_ = simde__m128i_to_private(a),
5533       count_ = simde__m128i_to_private(count);
5534 
5535     const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 32 ? 32 : count_.i64[0]));
5536 
5537     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5538       r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt)));
5539     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5540       r_.wasm_v128 = wasm_u32x4_shr(a_.wasm_v128, cnt);
5541     #else
5542       SIMDE_VECTORIZE
5543       for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
5544         r_.u32[i] = (cnt < 32) ? (a_.u32[i] >> cnt) : 0; /* shifting a 32-bit lane by 32 would be undefined */
5545       }
5546     #endif
5547 
5548     return simde__m128i_from_private(r_);
5549   #endif
5550 }
5551 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5552   #define _mm_srl_epi32(a, count) (simde_mm_srl_epi32(a, (count)))
5553 #endif
5554 
5555 SIMDE_FUNCTION_ATTRIBUTES
5556 simde__m128i
5557 simde_mm_srl_epi64 (simde__m128i a, simde__m128i count) {
5558   #if defined(SIMDE_X86_SSE2_NATIVE)
5559     return _mm_srl_epi64(a, count);
5560   #else
5561     simde__m128i_private
5562       r_,
5563       a_ = simde__m128i_to_private(a),
5564       count_ = simde__m128i_to_private(count);
5565 
5566     const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 64 ? 64 : count_.i64[0]));
5567 
5568     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5569       r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, -cnt)));
5570     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5571       r_.wasm_v128 = wasm_u64x2_shr(a_.wasm_v128, cnt);
5572     #else
5573       #if !defined(SIMDE_BUG_GCC_94488)
5574         SIMDE_VECTORIZE
5575       #endif
5576       for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
5577         r_.u64[i] = (cnt < 64) ? (a_.u64[i] >> cnt) : 0; /* shifting a 64-bit lane by 64 would be undefined */
5578       }
5579     #endif
5580 
5581     return simde__m128i_from_private(r_);
5582   #endif
5583 }
5584 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5585   #define _mm_srl_epi64(a, count) (simde_mm_srl_epi64(a, (count)))
5586 #endif
5587 
5588 SIMDE_FUNCTION_ATTRIBUTES
5589 simde__m128i
5590 simde_mm_srai_epi16 (simde__m128i a, const int imm8)
5591     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
5592   /* MSVC requires a range of (0, 255). */
5593   simde__m128i_private
5594     r_,
5595     a_ = simde__m128i_to_private(a);
5596 
5597   const int cnt = (imm8 & ~15) ? 15 : imm8;
5598 
5599   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5600     r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
5601   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5602     r_.wasm_v128 = wasm_i16x8_shr(a_.wasm_v128, cnt);
5603   #else
5604     SIMDE_VECTORIZE
5605     for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) {
5606       r_.i16[i] = a_.i16[i] >> cnt;
5607     }
5608   #endif
5609 
5610   return simde__m128i_from_private(r_);
5611 }
5612 #if defined(SIMDE_X86_SSE2_NATIVE)
5613   #define simde_mm_srai_epi16(a, imm8) _mm_srai_epi16((a), (imm8))
5614 #endif
5615 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5616   #define _mm_srai_epi16(a, imm8) simde_mm_srai_epi16(a, imm8)
5617 #endif
5618 
5619 SIMDE_FUNCTION_ATTRIBUTES
5620 simde__m128i
5621 simde_mm_srai_epi32 (simde__m128i a, const int imm8)
5622     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
5623   /* MSVC requires the immediate to be in the range [0, 255]. */
5624   simde__m128i_private
5625     r_,
5626     a_ = simde__m128i_to_private(a);
5627 
5628   const int cnt = (imm8 & ~31) ? 31 : imm8;
5629 
5630   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5631     r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(-cnt));
5632   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5633     r_.wasm_v128 = wasm_i32x4_shr(a_.wasm_v128, cnt);
5634   #else
5635     SIMDE_VECTORIZE
5636     for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i32[0])) ; i++) {
5637       r_.i32[i] = a_.i32[i] >> cnt;
5638     }
5639   #endif
5640 
5641   return simde__m128i_from_private(r_);
5642 }
5643 #if defined(SIMDE_X86_SSE2_NATIVE)
5644   #define simde_mm_srai_epi32(a, imm8) _mm_srai_epi32((a), (imm8))
5645 #endif
5646 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5647   #define _mm_srai_epi32(a, imm8) simde_mm_srai_epi32(a, imm8)
5648 #endif
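/* simde_mm_srai_epi16/epi32 perform an arithmetic right shift: the sign bit
 * is replicated into the vacated positions, and the count is clamped to the
 * lane width minus one (that is what `(imm8 & ~15) ? 15 : imm8` does), so a
 * large immediate yields 0 or -1 per lane rather than always 0.
 * Example with illustrative values: an i32 lane holding -16 shifted right by
 * 2 becomes -4, whereas the same bits (0xFFFFFFF0) shifted logically by 2
 * would become 0x3FFFFFFC.
 */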
5649 
5650 SIMDE_FUNCTION_ATTRIBUTES
5651 simde__m128i
5652 simde_mm_sra_epi16 (simde__m128i a, simde__m128i count) {
5653   #if defined(SIMDE_X86_SSE2_NATIVE)
5654     return _mm_sra_epi16(a, count);
5655   #else
5656     simde__m128i_private
5657       r_,
5658       a_ = simde__m128i_to_private(a),
5659       count_ = simde__m128i_to_private(count);
5660 
5661     const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 15 ? 15 : count_.i64[0]));
5662 
5663     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5664       r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
5665     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5666       r_.wasm_v128 = wasm_i16x8_shr(a_.wasm_v128, cnt);
5667     #else
5668       SIMDE_VECTORIZE
5669       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
5670         r_.i16[i] = a_.i16[i] >> cnt;
5671       }
5672     #endif
5673 
5674     return simde__m128i_from_private(r_);
5675   #endif
5676 }
5677 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5678   #define _mm_sra_epi16(a, count) (simde_mm_sra_epi16(a, count))
5679 #endif
5680 
5681 SIMDE_FUNCTION_ATTRIBUTES
5682 simde__m128i
5683 simde_mm_sra_epi32 (simde__m128i a, simde__m128i count) {
5684   #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_SRA_EPI32)
5685     return _mm_sra_epi32(a, count);
5686   #else
5687     simde__m128i_private
5688       r_,
5689       a_ = simde__m128i_to_private(a),
5690       count_ = simde__m128i_to_private(count);
5691 
5692     const int cnt = count_.u64[0] > 31 ? 31 : HEDLEY_STATIC_CAST(int, count_.u64[0]);
5693 
5694     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5695       r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt)));
5696     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5697       r_.wasm_v128 = wasm_i32x4_shr(a_.wasm_v128, cnt);
5698     #else
5699       SIMDE_VECTORIZE
5700       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
5701         r_.i32[i] = a_.i32[i] >> cnt;
5702       }
5703     #endif
5704 
5705     return simde__m128i_from_private(r_);
5706   #endif
5707 }
5708 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5709   #define _mm_sra_epi32(a, count) (simde_mm_sra_epi32(a, (count)))
5710 #endif
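/* The register-count forms simde_mm_sra_epi16/epi32 behave like the
 * immediate forms above, except that the count comes from the low 64 bits
 * of `count` and is saturated to 15/31; unlike the logical shifts, counts
 * beyond the lane width therefore produce 0 or -1 (the sign) in each lane
 * instead of 0.
 */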
5711 
5712 SIMDE_FUNCTION_ATTRIBUTES
5713 simde__m128i
5714 simde_mm_slli_epi16 (simde__m128i a, const int imm8)
5715     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
5716   if (HEDLEY_UNLIKELY((imm8 > 15))) {
5717     return simde_mm_setzero_si128();
5718   }
5719 
5720   simde__m128i_private
5721     r_,
5722     a_ = simde__m128i_to_private(a);
5723 
5724   #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
5725     r_.i16 = a_.i16 << SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8 & 0xff);
5726   #else
5727     const int s = (imm8 > HEDLEY_STATIC_CAST(int, sizeof(r_.i16[0]) * CHAR_BIT) - 1) ? 0 : imm8;
5728     SIMDE_VECTORIZE
5729     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
5730       r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << s);
5731     }
5732   #endif
5733 
5734   return simde__m128i_from_private(r_);
5735 }
5736 #if defined(SIMDE_X86_SSE2_NATIVE)
5737   #define simde_mm_slli_epi16(a, imm8) _mm_slli_epi16(a, imm8)
5738 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5739   #define simde_mm_slli_epi16(a, imm8) \
5740      (__extension__ ({ \
5741         simde__m128i ret; \
5742         if ((imm8) <= 0) { \
5743             ret = a; \
5744         } else if ((imm8) > 15) { \
5745             ret = simde_mm_setzero_si128(); \
5746         } else { \
5747             ret = simde__m128i_from_neon_i16( \
5748                 vshlq_n_s16(simde__m128i_to_neon_i16(a), ((imm8) & 15))); \
5749         } \
5750         ret; \
5751     }))
5752 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5753   #define simde_mm_slli_epi16(a, imm8) \
5754     ((imm8 < 16) ? wasm_i16x8_shl(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i16x8_const(0,0,0,0,0,0,0,0))
5755 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
5756   #define simde_mm_slli_epi16(a, imm8) \
5757     ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sl(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8)))))
5758 #endif
5759 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5760   #define _mm_slli_epi16(a, imm8) simde_mm_slli_epi16(a, imm8)
5761 #endif
5762 
5763 SIMDE_FUNCTION_ATTRIBUTES
5764 simde__m128i
5765 simde_mm_slli_epi32 (simde__m128i a, const int imm8)
5766     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
5767   if (HEDLEY_UNLIKELY((imm8 > 31))) {
5768     return simde_mm_setzero_si128();
5769   }
5770   simde__m128i_private
5771     r_,
5772     a_ = simde__m128i_to_private(a);
5773 
5774   #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
5775     r_.i32 = a_.i32 << imm8;
5776   #else
5777     SIMDE_VECTORIZE
5778     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
5779       r_.i32[i] = a_.i32[i] << (imm8 & 0xff);
5780     }
5781   #endif
5782 
5783   return simde__m128i_from_private(r_);
5784 }
5785 #if defined(SIMDE_X86_SSE2_NATIVE)
5786   #define simde_mm_slli_epi32(a, imm8) _mm_slli_epi32(a, imm8)
5787 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5788   #define simde_mm_slli_epi32(a, imm8) \
5789      (__extension__ ({ \
5790        simde__m128i ret; \
5791        if ((imm8) <= 0) { \
5792          ret = a; \
5793        } else if ((imm8) > 31) { \
5794          ret = simde_mm_setzero_si128(); \
5795        } else { \
5796          ret = simde__m128i_from_neon_i32( \
5797            vshlq_n_s32(simde__m128i_to_neon_i32(a), ((imm8) & 31))); \
5798        } \
5799        ret; \
5800     }))
5801 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5802   #define simde_mm_slli_epi32(a, imm8) \
5803     ((imm8 < 32) ? wasm_i32x4_shl(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i32x4_const(0,0,0,0))
5804 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
5805   #define simde_mm_slli_epi32(a, imm8) \
5806      (__extension__ ({ \
5807        simde__m128i ret; \
5808        if ((imm8) <= 0) { \
5809          ret = a; \
5810        } else if ((imm8) > 31) { \
5811          ret = simde_mm_setzero_si128(); \
5812        } else { \
5813          ret = simde__m128i_from_altivec_i32( \
5814            vec_sl(simde__m128i_to_altivec_i32(a), \
5815              vec_splats(HEDLEY_STATIC_CAST(unsigned int, (imm8) & 31)))); \
5816        } \
5817        ret; \
5818      }))
5819 #endif
5820 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5821   #define _mm_slli_epi32(a, imm8) simde_mm_slli_epi32(a, imm8)
5822 #endif
5823 
5824 SIMDE_FUNCTION_ATTRIBUTES
5825 simde__m128i
5826 simde_mm_slli_epi64 (simde__m128i a, const int imm8)
5827     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
5828   if (HEDLEY_UNLIKELY((imm8 > 63))) {
5829     return simde_mm_setzero_si128();
5830   }
5831   simde__m128i_private
5832     r_,
5833     a_ = simde__m128i_to_private(a);
5834 
5835   #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
5836     r_.i64 = a_.i64 << imm8;
5837   #else
5838     SIMDE_VECTORIZE
5839     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
5840       r_.i64[i] = a_.i64[i] << (imm8 & 0xff);
5841     }
5842   #endif
5843 
5844   return simde__m128i_from_private(r_);
5845 }
5846 #if defined(SIMDE_X86_SSE2_NATIVE)
5847   #define simde_mm_slli_epi64(a, imm8) _mm_slli_epi64(a, imm8)
5848 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5849   #define simde_mm_slli_epi64(a, imm8) \
5850      (__extension__ ({ \
5851         simde__m128i ret; \
5852         if ((imm8) <= 0) { \
5853             ret = a; \
5854         } else if ((imm8) > 63) { \
5855             ret = simde_mm_setzero_si128(); \
5856         } else { \
5857             ret = simde__m128i_from_neon_i64( \
5858                 vshlq_n_s64(simde__m128i_to_neon_i64(a), ((imm8) & 63))); \
5859         } \
5860         ret; \
5861     }))
5862 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5863   #define simde_mm_slli_epi64(a, imm8) \
5864     ((imm8 < 64) ? wasm_i64x2_shl(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i64x2_const(0,0))
5865 #endif
5866 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5867   #define _mm_slli_epi64(a, imm8) simde_mm_slli_epi64(a, imm8)
5868 #endif
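/* simde_mm_slli_epi16/epi32/epi64 shift each lane left by an immediate;
 * counts greater than the lane width minus one give an all-zero result
 * (handled by the early return in each function). When the immediate is a
 * literal, the macro forms above dispatch straight to the platform shift
 * intrinsics instead of the generic function.
 * Illustrative sketch (hypothetical values):
 *
 *   simde__m128i r = simde_mm_slli_epi32(simde_mm_set1_epi32(3), 4);
 *   // each 32-bit lane of r is 48
 */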
5869 
5870 SIMDE_FUNCTION_ATTRIBUTES
5871 simde__m128i
5872 simde_mm_srli_epi16 (simde__m128i a, const int imm8)
5873     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
5874   if (HEDLEY_UNLIKELY((imm8 > 15))) {
5875     return simde_mm_setzero_si128();
5876   }
5877   simde__m128i_private
5878     r_,
5879     a_ = simde__m128i_to_private(a);
5880 
5881   #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
5882     r_.u16 = a_.u16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8);
5883   #else
5884     SIMDE_VECTORIZE
5885     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
5886       r_.u16[i] = a_.u16[i] >> (imm8 & 0xff);
5887     }
5888   #endif
5889 
5890   return simde__m128i_from_private(r_);
5891 }
5892 #if defined(SIMDE_X86_SSE2_NATIVE)
5893   #define simde_mm_srli_epi16(a, imm8) _mm_srli_epi16(a, imm8)
5894 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5895   #define simde_mm_srli_epi16(a, imm8) \
5896      (__extension__ ({ \
5897         simde__m128i ret; \
5898         if ((imm8) <= 0) { \
5899             ret = a; \
5900         } else if ((imm8) > 15) { \
5901             ret = simde_mm_setzero_si128(); \
5902         } else { \
5903             ret = simde__m128i_from_neon_u16( \
5904                 vshrq_n_u16(simde__m128i_to_neon_u16(a), (((imm8) & 15) | (((imm8) & 15) == 0)))); \
5905         } \
5906         ret; \
5907     }))
5908 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5909   #define simde_mm_srli_epi16(a, imm8) \
5910     ((imm8 < 16) ? wasm_u16x8_shr(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i16x8_const(0,0,0,0,0,0,0,0))
5911 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
5912   #define simde_mm_srli_epi16(a, imm8) \
5913     ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sr(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8)))))
5914 #endif
5915 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5916   #define _mm_srli_epi16(a, imm8) simde_mm_srli_epi16(a, imm8)
5917 #endif
5918 
5919 SIMDE_FUNCTION_ATTRIBUTES
5920 simde__m128i
5921 simde_mm_srli_epi32 (simde__m128i a, const int imm8)
5922     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
5923   if (HEDLEY_UNLIKELY((imm8 > 31))) {
5924     return simde_mm_setzero_si128();
5925   }
5926   simde__m128i_private
5927     r_,
5928     a_ = simde__m128i_to_private(a);
5929 
5930   #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
5931     r_.u32 = a_.u32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8 & 0xff);
5932   #else
5933     SIMDE_VECTORIZE
5934     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
5935       r_.u32[i] = a_.u32[i] >> (imm8 & 0xff);
5936     }
5937   #endif
5938 
5939   return simde__m128i_from_private(r_);
5940 }
5941 #if defined(SIMDE_X86_SSE2_NATIVE)
5942   #define simde_mm_srli_epi32(a, imm8) _mm_srli_epi32(a, imm8)
5943 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5944   #define simde_mm_srli_epi32(a, imm8) \
5945     (__extension__ ({ \
5946         simde__m128i ret; \
5947         if ((imm8) <= 0) { \
5948             ret = a; \
5949         } else if ((imm8) > 31) { \
5950             ret = simde_mm_setzero_si128(); \
5951         } else { \
5952             ret = simde__m128i_from_neon_u32( \
5953               vshrq_n_u32(simde__m128i_to_neon_u32(a), (((imm8) & 31) | (((imm8) & 31) == 0)))); \
5954         } \
5955         ret; \
5956     }))
5957 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5958   #define simde_mm_srli_epi32(a, imm8) \
5959     ((imm8 < 32) ? wasm_u32x4_shr(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i32x4_const(0,0,0,0))
5960 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
5961   #define simde_mm_srli_epi32(a, imm8) \
5962     (__extension__ ({ \
5963         simde__m128i ret; \
5964         if ((imm8) <= 0) { \
5965             ret = a; \
5966         } else if ((imm8) > 31) { \
5967             ret = simde_mm_setzero_si128(); \
5968         } else { \
5969             ret = simde__m128i_from_altivec_i32( \
5970               vec_sr(simde__m128i_to_altivec_i32(a), \
5971                 vec_splats(HEDLEY_STATIC_CAST(unsigned int, (imm8) & 31)))); \
5972         } \
5973         ret; \
5974     }))
5975 #endif
5976 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5977   #define _mm_srli_epi32(a, imm8) simde_mm_srli_epi32(a, imm8)
5978 #endif
5979 
5980 SIMDE_FUNCTION_ATTRIBUTES
5981 simde__m128i
5982 simde_mm_srli_epi64 (simde__m128i a, const int imm8)
5983     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
5984   simde__m128i_private
5985     r_,
5986     a_ = simde__m128i_to_private(a);
5987 
5988   if (HEDLEY_UNLIKELY((imm8 & 63) != imm8))
5989     return simde_mm_setzero_si128();
5990 
5991   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5992     r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(-imm8));
5993   #else
5994     #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_94488)
5995       r_.u64 = a_.u64 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8);
5996     #else
5997       SIMDE_VECTORIZE
5998       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
5999         r_.u64[i] = a_.u64[i] >> imm8;
6000       }
6001     #endif
6002   #endif
6003 
6004   return simde__m128i_from_private(r_);
6005 }
6006 #if defined(SIMDE_X86_SSE2_NATIVE)
6007   #define simde_mm_srli_epi64(a, imm8) _mm_srli_epi64(a, imm8)
6008 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6009   #define simde_mm_srli_epi64(a, imm8) \
6010     (__extension__ ({ \
6011         simde__m128i ret; \
6012         if ((imm8) <= 0) { \
6013             ret = a; \
6014         } else if ((imm8) > 63) { \
6015             ret = simde_mm_setzero_si128(); \
6016         } else { \
6017             ret = simde__m128i_from_neon_u64( \
6018               vshrq_n_u64(simde__m128i_to_neon_u64(a), (((imm8) & 63) | (((imm8) & 63) == 0)))); \
6019         } \
6020         ret; \
6021     }))
6022 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
6023   #define simde_mm_srli_epi64(a, imm8) \
6024     ((imm8 < 64) ? wasm_u64x2_shr(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i64x2_const(0,0))
6025 #endif
6026 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6027   #define _mm_srli_epi64(a, imm8) simde_mm_srli_epi64(a, imm8)
6028 #endif
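/* simde_mm_srli_epi16/epi32/epi64 are the logical (zero-filling) immediate
 * shifts; the lanes are reinterpreted as unsigned so zeros shift in from the
 * left. In the NEON macros, the expression
 * `(((imm8) & 15) | (((imm8) & 15) == 0))` only keeps the immediate inside
 * the [1, lane width] range that vshrq_n_* accepts; that branch is never
 * reached with imm8 == 0 because the `(imm8) <= 0` case returns `a`
 * unchanged, so the adjustment exists purely to keep the dead branch
 * compilable.
 */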
6029 
6030 SIMDE_FUNCTION_ATTRIBUTES
6031 void
6032 simde_mm_store_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) {
6033   #if defined(SIMDE_X86_SSE2_NATIVE)
6034     _mm_store_pd(mem_addr, a);
6035   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6036     vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64);
6037   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6038     vst1q_s64(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), simde__m128d_to_private(a).neon_i64);
6039   #else
6040     simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128d), &a, sizeof(a));
6041   #endif
6042 }
6043 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6044   #define _mm_store_pd(mem_addr, a) simde_mm_store_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
6045 #endif
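/* simde_mm_store_pd expects a 16-byte-aligned destination, which is why the
 * portable fallback can use SIMDE_ALIGN_ASSUME_LIKE before the memcpy.
 * Illustrative usage (the buffer name is hypothetical):
 *
 *   SIMDE_ALIGN_TO_16 simde_float64 buf[2];
 *   simde_mm_store_pd(buf, simde_mm_set_pd(2.0, 1.0));  // buf == { 1.0, 2.0 }
 */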
6046 
6047 SIMDE_FUNCTION_ATTRIBUTES
6048 void
6049 simde_mm_store1_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) {
6050   #if defined(SIMDE_X86_SSE2_NATIVE)
6051     _mm_store1_pd(mem_addr, a);
6052   #else
6053     simde__m128d_private a_ = simde__m128d_to_private(a);
6054 
6055     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6056       vst1q_f64(mem_addr, vdupq_laneq_f64(a_.neon_f64, 0));
6057     #else
6058       mem_addr[0] = a_.f64[0];
6059       mem_addr[1] = a_.f64[0];
6060     #endif
6061   #endif
6062 }
6063 #define simde_mm_store_pd1(mem_addr, a) simde_mm_store1_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
6064 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6065   #define _mm_store1_pd(mem_addr, a) simde_mm_store1_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
6066   #define _mm_store_pd1(mem_addr, a) simde_mm_store_pd1(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
6067 #endif
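/* simde_mm_store1_pd (and its _mm_store_pd1 alias) broadcasts the low lane:
 * both mem_addr[0] and mem_addr[1] receive a.f64[0], as the scalar fallback
 * above makes explicit.
 */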
6068 
6069 SIMDE_FUNCTION_ATTRIBUTES
6070 void
6071 simde_mm_store_sd (simde_float64* mem_addr, simde__m128d a) {
6072   #if defined(SIMDE_X86_SSE2_NATIVE)
6073     _mm_store_sd(mem_addr, a);
6074   #else
6075     simde__m128d_private a_ = simde__m128d_to_private(a);
6076 
6077     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6078       const simde_float64 v = vgetq_lane_f64(a_.neon_f64, 0);
6079       simde_memcpy(mem_addr, &v, sizeof(v));
6080     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6081       const int64_t v = vgetq_lane_s64(a_.neon_i64, 0);
6082       simde_memcpy(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), &v, sizeof(v));
6083     #else
6084       simde_float64 v = a_.f64[0];
6085       simde_memcpy(mem_addr, &v, sizeof(simde_float64));
6086     #endif
6087   #endif
6088 }
6089 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6090   #define _mm_store_sd(mem_addr, a) simde_mm_store_sd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
6091 #endif
6092 
6093 SIMDE_FUNCTION_ATTRIBUTES
6094 void
6095 simde_mm_store_si128 (simde__m128i* mem_addr, simde__m128i a) {
6096   #if defined(SIMDE_X86_SSE2_NATIVE)
6097     _mm_store_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
6098   #else
6099     simde__m128i_private a_ = simde__m128i_to_private(a);
6100 
6101     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6102       vst1q_s32(HEDLEY_REINTERPRET_CAST(int32_t*, mem_addr), a_.neon_i32);
6103     #else
6104       simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128i), &a_, sizeof(a_));
6105     #endif
6106   #endif
6107 }
6108 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6109   #define _mm_store_si128(mem_addr, a) simde_mm_store_si128(mem_addr, a)
6110 #endif
6111 
6112 SIMDE_FUNCTION_ATTRIBUTES
6113 void
6114 simde_mm_storeh_pd (simde_float64* mem_addr, simde__m128d a) {
6115   #if defined(SIMDE_X86_SSE2_NATIVE)
6116     _mm_storeh_pd(mem_addr, a);
6117   #else
6118     simde__m128d_private a_ = simde__m128d_to_private(a);
6119 
6120     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6121       *mem_addr = vgetq_lane_f64(a_.neon_f64, 1);
6122     #else
6123       *mem_addr = a_.f64[1];
6124     #endif
6125   #endif
6126 }
6127 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6128   #define _mm_storeh_pd(mem_addr, a) simde_mm_storeh_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
6129 #endif
6130 
6131 SIMDE_FUNCTION_ATTRIBUTES
6132 void
6133 simde_mm_storel_epi64 (simde__m128i* mem_addr, simde__m128i a) {
6134   #if defined(SIMDE_X86_SSE2_NATIVE)
6135     _mm_storel_epi64(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
6136   #else
6137     simde__m128i_private a_ = simde__m128i_to_private(a);
6138     int64_t tmp;
6139 
6140     /* memcpy to prevent aliasing, tmp because we can't take the
6141      * address of a vector element. */
6142 
6143     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6144       tmp = vgetq_lane_s64(a_.neon_i64, 0);
6145     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
6146       #if defined(SIMDE_BUG_GCC_95227)
6147         (void) a_;
6148       #endif
6149       tmp = vec_extract(a_.altivec_i64, 0);
6150     #else
6151       tmp = a_.i64[0];
6152     #endif
6153 
6154     simde_memcpy(mem_addr, &tmp, sizeof(tmp));
6155   #endif
6156 }
6157 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6158   #define _mm_storel_epi64(mem_addr, a) simde_mm_storel_epi64(mem_addr, a)
6159 #endif
6160 
6161 SIMDE_FUNCTION_ATTRIBUTES
6162 void
6163 simde_mm_storel_pd (simde_float64* mem_addr, simde__m128d a) {
6164   #if defined(SIMDE_X86_SSE2_NATIVE)
6165     _mm_storel_pd(mem_addr, a);
6166   #else
6167     simde__m128d_private a_ = simde__m128d_to_private(a);
6168 
6169     simde_float64 tmp;
6170     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6171       tmp = vgetq_lane_f64(a_.neon_f64, 0);
6172     #else
6173       tmp = a_.f64[0];
6174     #endif
6175     simde_memcpy(mem_addr, &tmp, sizeof(tmp));
6176   #endif
6177 }
6178 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6179   #define _mm_storel_pd(mem_addr, a) simde_mm_storel_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
6180 #endif
6181 
6182 SIMDE_FUNCTION_ATTRIBUTES
6183 void
6184 simde_mm_storer_pd (simde_float64 mem_addr[2], simde__m128d a) {
6185   #if defined(SIMDE_X86_SSE2_NATIVE)
6186     _mm_storer_pd(mem_addr, a);
6187   #else
6188     simde__m128d_private a_ = simde__m128d_to_private(a);
6189 
6190     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6191       vst1q_s64(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), vextq_s64(a_.neon_i64, a_.neon_i64, 1));
6192     #elif defined(SIMDE_SHUFFLE_VECTOR_)
6193       a_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, a_.f64, 1, 0);
6194       simde_mm_store_pd(mem_addr, simde__m128d_from_private(a_));
6195     #else
6196       mem_addr[0] = a_.f64[1];
6197       mem_addr[1] = a_.f64[0];
6198     #endif
6199   #endif
6200 }
6201 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6202   #define _mm_storer_pd(mem_addr, a) simde_mm_storer_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
6203 #endif
6204 
6205 SIMDE_FUNCTION_ATTRIBUTES
6206 void
6207 simde_mm_storeu_pd (simde_float64* mem_addr, simde__m128d a) {
6208   #if defined(SIMDE_X86_SSE2_NATIVE)
6209     _mm_storeu_pd(mem_addr, a);
6210   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6211     vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64);
6212   #else
6213     simde_memcpy(mem_addr, &a, sizeof(a));
6214   #endif
6215 }
6216 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6217   #define _mm_storeu_pd(mem_addr, a) simde_mm_storeu_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
6218 #endif
6219 
6220 SIMDE_FUNCTION_ATTRIBUTES
6221 void
6222 simde_mm_storeu_si128 (void* mem_addr, simde__m128i a) {
6223   #if defined(SIMDE_X86_SSE2_NATIVE)
6224     _mm_storeu_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
6225   #else
6226     simde_memcpy(mem_addr, &a, sizeof(a));
6227   #endif
6228 }
6229 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6230   #define _mm_storeu_si128(mem_addr, a) simde_mm_storeu_si128(mem_addr, a)
6231 #endif
6232 
6233 SIMDE_FUNCTION_ATTRIBUTES
6234 void
6235 simde_mm_storeu_si16 (void* mem_addr, simde__m128i a) {
6236   #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
6237       SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
6238       HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
6239       HEDLEY_INTEL_VERSION_CHECK(20,21,1))
6240     _mm_storeu_si16(mem_addr, a);
6241   #else
6242     int16_t val = simde_x_mm_cvtsi128_si16(a);
6243     simde_memcpy(mem_addr, &val, sizeof(val));
6244   #endif
6245 }
6246 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6247   #define _mm_storeu_si16(mem_addr, a) simde_mm_storeu_si16(mem_addr, a)
6248 #endif
6249 
6250 SIMDE_FUNCTION_ATTRIBUTES
6251 void
6252 simde_mm_storeu_si32 (void* mem_addr, simde__m128i a) {
6253   #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
6254       SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
6255       HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
6256       HEDLEY_INTEL_VERSION_CHECK(20,21,1))
6257     _mm_storeu_si32(mem_addr, a);
6258   #else
6259     int32_t val = simde_mm_cvtsi128_si32(a);
6260     simde_memcpy(mem_addr, &val, sizeof(val));
6261   #endif
6262 }
6263 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6264   #define _mm_storeu_si32(mem_addr, a) simde_mm_storeu_si32(mem_addr, a)
6265 #endif
6266 
6267 SIMDE_FUNCTION_ATTRIBUTES
6268 void
6269 simde_mm_storeu_si64 (void* mem_addr, simde__m128i a) {
6270   #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
6271       SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
6272       HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
6273       HEDLEY_INTEL_VERSION_CHECK(20,21,1))
6274     _mm_storeu_si64(mem_addr, a);
6275   #else
6276     int64_t val = simde_mm_cvtsi128_si64(a);
6277     simde_memcpy(mem_addr, &val, sizeof(val));
6278   #endif
6279 }
6280 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6281   #define _mm_storeu_si64(mem_addr, a) simde_mm_storeu_si64(mem_addr, a)
6282 #endif
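/* simde_mm_storeu_si16/si32/si64 write only the low 16/32/64 bits of the
 * vector to an unaligned address. The native intrinsics are relatively
 * recent additions (hence the Clang 8 / GCC 11 / ICC version gates above);
 * the fallback extracts the low lane as a scalar and memcpy's it, which has
 * the same observable effect.
 */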
6283 
6284 SIMDE_FUNCTION_ATTRIBUTES
6285 void
6286 simde_mm_stream_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) {
6287   #if defined(SIMDE_X86_SSE2_NATIVE)
6288     _mm_stream_pd(mem_addr, a);
6289   #else
6290     simde_memcpy(mem_addr, &a, sizeof(a));
6291   #endif
6292 }
6293 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6294   #define _mm_stream_pd(mem_addr, a) simde_mm_stream_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
6295 #endif
6296 
6297 SIMDE_FUNCTION_ATTRIBUTES
6298 void
6299 simde_mm_stream_si128 (simde__m128i* mem_addr, simde__m128i a) {
6300   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
6301     _mm_stream_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
6302   #else
6303     simde_memcpy(mem_addr, &a, sizeof(a));
6304   #endif
6305 }
6306 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6307   #define _mm_stream_si128(mem_addr, a) simde_mm_stream_si128(mem_addr, a)
6308 #endif
6309 
6310 SIMDE_FUNCTION_ATTRIBUTES
6311 void
6312 simde_mm_stream_si32 (int32_t* mem_addr, int32_t a) {
6313   #if defined(SIMDE_X86_SSE2_NATIVE)
6314     _mm_stream_si32(mem_addr, a);
6315   #else
6316     *mem_addr = a;
6317   #endif
6318 }
6319 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6320   #define _mm_stream_si32(mem_addr, a) simde_mm_stream_si32(mem_addr, a)
6321 #endif
6322 
6323 SIMDE_FUNCTION_ATTRIBUTES
6324 void
6325 simde_mm_stream_si64 (int64_t* mem_addr, int64_t a) {
6326   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(HEDLEY_MSVC_VERSION)
6327     _mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(long long int*, int64_t*, mem_addr), a);
6328   #else
6329     *mem_addr = a;
6330   #endif
6331 }
6332 #define simde_mm_stream_si64x(mem_addr, a) simde_mm_stream_si64(mem_addr, a)
6333 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
6334   #define _mm_stream_si64(mem_addr, a) simde_mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(int64_t*, __int64*, mem_addr), a)
6335   #define _mm_stream_si64x(mem_addr, a) simde_mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(int64_t*, __int64*, mem_addr), a)
6336 #endif
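/* The _mm_stream_* stores are non-temporal hints on x86 (they bypass the
 * cache); correctness does not depend on the hint, so the portable fallbacks
 * above simply perform ordinary stores or memcpy. Code that relies on the
 * hint for performance should not expect it outside the
 * SIMDE_X86_SSE2_NATIVE paths.
 */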
6337 
6338 SIMDE_FUNCTION_ATTRIBUTES
6339 simde__m128i
6340 simde_mm_sub_epi8 (simde__m128i a, simde__m128i b) {
6341   #if defined(SIMDE_X86_SSE2_NATIVE)
6342     return _mm_sub_epi8(a, b);
6343   #else
6344     simde__m128i_private
6345       r_,
6346       a_ = simde__m128i_to_private(a),
6347       b_ = simde__m128i_to_private(b);
6348 
6349     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6350       r_.neon_i8 = vsubq_s8(a_.neon_i8, b_.neon_i8);
6351     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6352       r_.i8 = a_.i8 - b_.i8;
6353     #else
6354       SIMDE_VECTORIZE
6355       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
6356         r_.i8[i] = a_.i8[i] - b_.i8[i];
6357       }
6358     #endif
6359 
6360     return simde__m128i_from_private(r_);
6361   #endif
6362 }
6363 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6364   #define _mm_sub_epi8(a, b) simde_mm_sub_epi8(a, b)
6365 #endif
6366 
6367 SIMDE_FUNCTION_ATTRIBUTES
6368 simde__m128i
6369 simde_mm_sub_epi16 (simde__m128i a, simde__m128i b) {
6370   #if defined(SIMDE_X86_SSE2_NATIVE)
6371     return _mm_sub_epi16(a, b);
6372   #else
6373     simde__m128i_private
6374       r_,
6375       a_ = simde__m128i_to_private(a),
6376       b_ = simde__m128i_to_private(b);
6377 
6378     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6379       r_.neon_i16 = vsubq_s16(a_.neon_i16, b_.neon_i16);
6380     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6381       r_.i16 = a_.i16 - b_.i16;
6382     #else
6383       SIMDE_VECTORIZE
6384       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
6385         r_.i16[i] = a_.i16[i] - b_.i16[i];
6386       }
6387     #endif
6388 
6389     return simde__m128i_from_private(r_);
6390   #endif
6391 }
6392 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6393   #define _mm_sub_epi16(a, b) simde_mm_sub_epi16(a, b)
6394 #endif
6395 
6396 SIMDE_FUNCTION_ATTRIBUTES
6397 simde__m128i
6398 simde_mm_sub_epi32 (simde__m128i a, simde__m128i b) {
6399   #if defined(SIMDE_X86_SSE2_NATIVE)
6400     return _mm_sub_epi32(a, b);
6401   #else
6402     simde__m128i_private
6403       r_,
6404       a_ = simde__m128i_to_private(a),
6405       b_ = simde__m128i_to_private(b);
6406 
6407     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6408       r_.neon_i32 = vsubq_s32(a_.neon_i32, b_.neon_i32);
6409     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6410       r_.i32 = a_.i32 - b_.i32;
6411     #else
6412       SIMDE_VECTORIZE
6413       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
6414         r_.i32[i] = a_.i32[i] - b_.i32[i];
6415       }
6416     #endif
6417 
6418     return simde__m128i_from_private(r_);
6419   #endif
6420 }
6421 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6422   #define _mm_sub_epi32(a, b) simde_mm_sub_epi32(a, b)
6423 #endif
6424 
6425 SIMDE_FUNCTION_ATTRIBUTES
6426 simde__m128i
6427 simde_mm_sub_epi64 (simde__m128i a, simde__m128i b) {
6428   #if defined(SIMDE_X86_SSE2_NATIVE)
6429     return _mm_sub_epi64(a, b);
6430   #else
6431     simde__m128i_private
6432       r_,
6433       a_ = simde__m128i_to_private(a),
6434       b_ = simde__m128i_to_private(b);
6435 
6436     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6437       r_.neon_i64 = vsubq_s64(a_.neon_i64, b_.neon_i64);
6438     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6439       r_.i64 = a_.i64 - b_.i64;
6440     #else
6441       SIMDE_VECTORIZE
6442       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
6443         r_.i64[i] = a_.i64[i] - b_.i64[i];
6444       }
6445     #endif
6446 
6447     return simde__m128i_from_private(r_);
6448   #endif
6449 }
6450 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6451   #define _mm_sub_epi64(a, b) simde_mm_sub_epi64(a, b)
6452 #endif
6453 
6454 SIMDE_FUNCTION_ATTRIBUTES
6455 simde__m128i
6456 simde_x_mm_sub_epu32 (simde__m128i a, simde__m128i b) {
6457   simde__m128i_private
6458     r_,
6459     a_ = simde__m128i_to_private(a),
6460     b_ = simde__m128i_to_private(b);
6461 
6462   #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6463     r_.u32 = a_.u32 - b_.u32;
6464   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6465     r_.neon_u32 = vsubq_u32(a_.neon_u32, b_.neon_u32);
6466   #else
6467     SIMDE_VECTORIZE
6468     for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
6469       r_.u32[i] = a_.u32[i] - b_.u32[i];
6470     }
6471   #endif
6472 
6473   return simde__m128i_from_private(r_);
6474 }
6475 
6476 SIMDE_FUNCTION_ATTRIBUTES
6477 simde__m128d
6478 simde_mm_sub_pd (simde__m128d a, simde__m128d b) {
6479   #if defined(SIMDE_X86_SSE2_NATIVE)
6480     return _mm_sub_pd(a, b);
6481   #else
6482     simde__m128d_private
6483       r_,
6484       a_ = simde__m128d_to_private(a),
6485       b_ = simde__m128d_to_private(b);
6486 
6487     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6488       r_.f64 = a_.f64 - b_.f64;
6489     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6490       r_.neon_f64 = vsubq_f64(a_.neon_f64, b_.neon_f64);
6491     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
6492       r_.wasm_v128 = wasm_f64x2_sub(a_.wasm_v128, b_.wasm_v128);
6493     #else
6494       SIMDE_VECTORIZE
6495       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
6496         r_.f64[i] = a_.f64[i] - b_.f64[i];
6497       }
6498     #endif
6499 
6500     return simde__m128d_from_private(r_);
6501   #endif
6502 }
6503 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6504   #define _mm_sub_pd(a, b) simde_mm_sub_pd(a, b)
6505 #endif
6506 
6507 SIMDE_FUNCTION_ATTRIBUTES
6508 simde__m128d
6509 simde_mm_sub_sd (simde__m128d a, simde__m128d b) {
6510   #if defined(SIMDE_X86_SSE2_NATIVE)
6511     return _mm_sub_sd(a, b);
6512   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
6513     return simde_mm_move_sd(a, simde_mm_sub_pd(a, b));
6514   #else
6515     simde__m128d_private
6516       r_,
6517       a_ = simde__m128d_to_private(a),
6518       b_ = simde__m128d_to_private(b);
6519 
6520     r_.f64[0] = a_.f64[0] - b_.f64[0];
6521     r_.f64[1] = a_.f64[1];
6522 
6523     return simde__m128d_from_private(r_);
6524   #endif
6525 }
6526 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6527   #define _mm_sub_sd(a, b) simde_mm_sub_sd(a, b)
6528 #endif
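/* simde_mm_sub_sd is a scalar operation: only the low f64 lane is
 * subtracted, and the upper lane is passed through from `a` (the
 * SIMDE_NATURAL_VECTOR_SIZE path expresses this as move_sd over a full
 * sub_pd). Illustrative values:
 *
 *   simde_mm_sub_sd(simde_mm_set_pd(8.0, 5.0), simde_mm_set_pd(2.0, 1.0))
 *   // -> { 4.0, 8.0 }  (lane 0 = 5.0 - 1.0, lane 1 copied from a)
 */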
6529 
6530 SIMDE_FUNCTION_ATTRIBUTES
6531 simde__m64
6532 simde_mm_sub_si64 (simde__m64 a, simde__m64 b) {
6533   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
6534     return _mm_sub_si64(a, b);
6535   #else
6536     simde__m64_private
6537       r_,
6538       a_ = simde__m64_to_private(a),
6539       b_ = simde__m64_to_private(b);
6540 
6541     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6542       r_.i64 = a_.i64 - b_.i64;
6543     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6544       r_.neon_i64 = vsub_s64(a_.neon_i64, b_.neon_i64);
6545     #else
6546       r_.i64[0] = a_.i64[0] - b_.i64[0];
6547     #endif
6548 
6549     return simde__m64_from_private(r_);
6550   #endif
6551 }
6552 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6553   #define _mm_sub_si64(a, b) simde_mm_sub_si64(a, b)
6554 #endif
6555 
6556 SIMDE_FUNCTION_ATTRIBUTES
6557 simde__m128i
6558 simde_mm_subs_epi8 (simde__m128i a, simde__m128i b) {
6559   #if defined(SIMDE_X86_SSE2_NATIVE)
6560     return _mm_subs_epi8(a, b);
6561   #else
6562     simde__m128i_private
6563       r_,
6564       a_ = simde__m128i_to_private(a),
6565       b_ = simde__m128i_to_private(b);
6566 
6567     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6568       r_.neon_i8 = vqsubq_s8(a_.neon_i8, b_.neon_i8);
6569     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
6570       r_.wasm_v128 = wasm_i8x16_sub_saturate(a_.wasm_v128, b_.wasm_v128);
6571     #else
6572       SIMDE_VECTORIZE
6573       for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i8[0])) ; i++) {
6574         if (((b_.i8[i]) > 0 && (a_.i8[i]) < INT8_MIN + (b_.i8[i]))) {
6575           r_.i8[i] = INT8_MIN;
6576         } else if ((b_.i8[i]) < 0 && (a_.i8[i]) > INT8_MAX + (b_.i8[i])) {
6577           r_.i8[i] = INT8_MAX;
6578         } else {
6579           r_.i8[i] = (a_.i8[i]) - (b_.i8[i]);
6580         }
6581       }
6582     #endif
6583 
6584     return simde__m128i_from_private(r_);
6585   #endif
6586 }
6587 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6588   #define _mm_subs_epi8(a, b) simde_mm_subs_epi8(a, b)
6589 #endif
6590 
6591 SIMDE_FUNCTION_ATTRIBUTES
6592 simde__m128i
6593 simde_mm_subs_epi16 (simde__m128i a, simde__m128i b) {
6594   #if defined(SIMDE_X86_SSE2_NATIVE)
6595     return _mm_subs_epi16(a, b);
6596   #else
6597     simde__m128i_private
6598       r_,
6599       a_ = simde__m128i_to_private(a),
6600       b_ = simde__m128i_to_private(b);
6601 
6602     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6603       r_.neon_i16 = vqsubq_s16(a_.neon_i16, b_.neon_i16);
6604     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
6605       r_.wasm_v128 = wasm_i16x8_sub_saturate(a_.wasm_v128, b_.wasm_v128);
6606     #else
6607       SIMDE_VECTORIZE
6608       for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) {
6609         if (((b_.i16[i]) > 0 && (a_.i16[i]) < INT16_MIN + (b_.i16[i]))) {
6610           r_.i16[i] = INT16_MIN;
6611         } else if ((b_.i16[i]) < 0 && (a_.i16[i]) > INT16_MAX + (b_.i16[i])) {
6612           r_.i16[i] = INT16_MAX;
6613         } else {
6614           r_.i16[i] = (a_.i16[i]) - (b_.i16[i]);
6615         }
6616       }
6617     #endif
6618 
6619     return simde__m128i_from_private(r_);
6620   #endif
6621 }
6622 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6623   #define _mm_subs_epi16(a, b) simde_mm_subs_epi16(a, b)
6624 #endif
6625 
6626 SIMDE_FUNCTION_ATTRIBUTES
6627 simde__m128i
6628 simde_mm_subs_epu8 (simde__m128i a, simde__m128i b) {
6629   #if defined(SIMDE_X86_SSE2_NATIVE)
6630     return _mm_subs_epu8(a, b);
6631   #else
6632     simde__m128i_private
6633       r_,
6634       a_ = simde__m128i_to_private(a),
6635       b_ = simde__m128i_to_private(b);
6636 
6637     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6638       r_.neon_u8 = vqsubq_u8(a_.neon_u8, b_.neon_u8);
6639     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
6640       r_.wasm_v128 = wasm_u8x16_sub_saturate(a_.wasm_v128, b_.wasm_v128);
6641     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
6642       r_.altivec_u8 = vec_subs(a_.altivec_u8, b_.altivec_u8);
6643     #else
6644       SIMDE_VECTORIZE
6645       for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i8[0])) ; i++) {
6646         const int32_t x = a_.u8[i] - b_.u8[i];
6647         if (x < 0) {
6648           r_.u8[i] = 0;
6649         } else if (x > UINT8_MAX) {
6650           r_.u8[i] = UINT8_MAX;
6651         } else {
6652           r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x);
6653         }
6654       }
6655     #endif
6656 
6657     return simde__m128i_from_private(r_);
6658   #endif
6659 }
6660 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6661   #define _mm_subs_epu8(a, b) simde_mm_subs_epu8(a, b)
6662 #endif
6663 
6664 SIMDE_FUNCTION_ATTRIBUTES
6665 simde__m128i
6666 simde_mm_subs_epu16 (simde__m128i a, simde__m128i b) {
6667   #if defined(SIMDE_X86_SSE2_NATIVE)
6668     return _mm_subs_epu16(a, b);
6669   #else
6670     simde__m128i_private
6671       r_,
6672       a_ = simde__m128i_to_private(a),
6673       b_ = simde__m128i_to_private(b);
6674 
6675     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6676       r_.neon_u16 = vqsubq_u16(a_.neon_u16, b_.neon_u16);
6677     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
6678       r_.wasm_v128 = wasm_u16x8_sub_saturate(a_.wasm_v128, b_.wasm_v128);
6679     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
6680       r_.altivec_u16 = vec_subs(a_.altivec_u16, b_.altivec_u16);
6681     #else
6682       SIMDE_VECTORIZE
6683       for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) {
6684         const int32_t x = a_.u16[i] - b_.u16[i];
6685         if (x < 0) {
6686           r_.u16[i] = 0;
6687         } else if (x > UINT16_MAX) {
6688           r_.u16[i] = UINT16_MAX;
6689         } else {
6690           r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x);
6691         }
6692       }
6693     #endif
6694 
6695     return simde__m128i_from_private(r_);
6696   #endif
6697 }
6698 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6699   #define _mm_subs_epu16(a, b) simde_mm_subs_epu16(a, b)
6700 #endif
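/* The subs_* variants saturate instead of wrapping: simde_mm_subs_epu8/epu16
 * clamp at 0, simde_mm_subs_epi8/epi16 clamp at the signed minimum/maximum,
 * whereas the plain sub_* functions above wrap modulo the lane width.
 * Illustrative values: with unsigned 8-bit lanes, 10 - 20 is 0 after
 * saturation but 246 with simde_mm_sub_epi8.
 */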
6701 
6702 SIMDE_FUNCTION_ATTRIBUTES
6703 int
6704 simde_mm_ucomieq_sd (simde__m128d a, simde__m128d b) {
6705   #if defined(SIMDE_X86_SSE2_NATIVE)
6706     return _mm_ucomieq_sd(a, b);
6707   #else
6708     simde__m128d_private
6709       a_ = simde__m128d_to_private(a),
6710       b_ = simde__m128d_to_private(b);
6711     int r;
6712 
6713     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6714       uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
6715       uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
6716       uint64x2_t a_or_b_nan = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(a_not_nan, b_not_nan))));
6717       uint64x2_t a_eq_b = vceqq_f64(a_.neon_f64, b_.neon_f64);
6718       r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_eq_b), 0) != 0);
6719     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
6720       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) == wasm_f64x2_extract_lane(b_.wasm_v128, 0);
6721     #elif defined(SIMDE_HAVE_FENV_H)
6722       fenv_t envp;
6723       int x = feholdexcept(&envp);
6724       r =  a_.f64[0] == b_.f64[0];
6725       if (HEDLEY_LIKELY(x == 0))
6726         fesetenv(&envp);
6727     #else
6728       r =  a_.f64[0] == b_.f64[0];
6729     #endif
6730 
6731     return r;
6732   #endif
6733 }
6734 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6735   #define _mm_ucomieq_sd(a, b) simde_mm_ucomieq_sd(a, b)
6736 #endif
6737 
6738 SIMDE_FUNCTION_ATTRIBUTES
6739 int
6740 simde_mm_ucomige_sd (simde__m128d a, simde__m128d b) {
6741   #if defined(SIMDE_X86_SSE2_NATIVE)
6742     return _mm_ucomige_sd(a, b);
6743   #else
6744     simde__m128d_private
6745       a_ = simde__m128d_to_private(a),
6746       b_ = simde__m128d_to_private(b);
6747     int r;
6748 
6749     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6750       uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
6751       uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
6752       uint64x2_t a_and_b_not_nan = vandq_u64(a_not_nan, b_not_nan);
6753       uint64x2_t a_ge_b = vcgeq_f64(a_.neon_f64, b_.neon_f64);
6754       r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_ge_b), 0) != 0);
6755     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
6756       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) >= wasm_f64x2_extract_lane(b_.wasm_v128, 0);
6757     #elif defined(SIMDE_HAVE_FENV_H)
6758       fenv_t envp;
6759       int x = feholdexcept(&envp);
6760       r = a_.f64[0] >= b_.f64[0];
6761       if (HEDLEY_LIKELY(x == 0))
6762         fesetenv(&envp);
6763     #else
6764       r = a_.f64[0] >= b_.f64[0];
6765     #endif
6766 
6767     return r;
6768   #endif
6769 }
6770 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6771   #define _mm_ucomige_sd(a, b) simde_mm_ucomige_sd(a, b)
6772 #endif
6773 
6774 SIMDE_FUNCTION_ATTRIBUTES
6775 int
6776 simde_mm_ucomigt_sd (simde__m128d a, simde__m128d b) {
6777   #if defined(SIMDE_X86_SSE2_NATIVE)
6778     return _mm_ucomigt_sd(a, b);
6779   #else
6780     simde__m128d_private
6781       a_ = simde__m128d_to_private(a),
6782       b_ = simde__m128d_to_private(b);
6783     int r;
6784 
6785     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6786       uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
6787       uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
6788       uint64x2_t a_and_b_not_nan = vandq_u64(a_not_nan, b_not_nan);
6789       uint64x2_t a_gt_b = vcgtq_f64(a_.neon_f64, b_.neon_f64);
6790       r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_gt_b), 0) != 0);
6791     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
6792       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) > wasm_f64x2_extract_lane(b_.wasm_v128, 0);
6793     #elif defined(SIMDE_HAVE_FENV_H)
6794       fenv_t envp;
6795       int x = feholdexcept(&envp);
6796       r = a_.f64[0] > b_.f64[0];
6797       if (HEDLEY_LIKELY(x == 0))
6798         fesetenv(&envp);
6799     #else
6800       r = a_.f64[0] > b_.f64[0];
6801     #endif
6802 
6803     return r;
6804   #endif
6805 }
6806 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6807   #define _mm_ucomigt_sd(a, b) simde_mm_ucomigt_sd(a, b)
6808 #endif
6809 
6810 SIMDE_FUNCTION_ATTRIBUTES
6811 int
6812 simde_mm_ucomile_sd (simde__m128d a, simde__m128d b) {
6813   #if defined(SIMDE_X86_SSE2_NATIVE)
6814     return _mm_ucomile_sd(a, b);
6815   #else
6816     simde__m128d_private
6817       a_ = simde__m128d_to_private(a),
6818       b_ = simde__m128d_to_private(b);
6819     int r;
6820 
6821     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6822       uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
6823       uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
6824       uint64x2_t a_or_b_nan = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(a_not_nan, b_not_nan))));
6825       uint64x2_t a_le_b = vcleq_f64(a_.neon_f64, b_.neon_f64);
6826       r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_le_b), 0) != 0);
6827     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
6828       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) <= wasm_f64x2_extract_lane(b_.wasm_v128, 0);
6829     #elif defined(SIMDE_HAVE_FENV_H)
6830       fenv_t envp;
6831       int x = feholdexcept(&envp);
6832       r = a_.f64[0] <= b_.f64[0];
6833       if (HEDLEY_LIKELY(x == 0))
6834         fesetenv(&envp);
6835     #else
6836       r = a_.f64[0] <= b_.f64[0];
6837     #endif
6838 
6839     return r;
6840   #endif
6841 }
6842 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6843   #define _mm_ucomile_sd(a, b) simde_mm_ucomile_sd(a, b)
6844 #endif
6845 
6846 SIMDE_FUNCTION_ATTRIBUTES
6847 int
6848 simde_mm_ucomilt_sd (simde__m128d a, simde__m128d b) {
6849   #if defined(SIMDE_X86_SSE2_NATIVE)
6850     return _mm_ucomilt_sd(a, b);
6851   #else
6852     simde__m128d_private
6853       a_ = simde__m128d_to_private(a),
6854       b_ = simde__m128d_to_private(b);
6855     int r;
6856 
6857     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6858       uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
6859       uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
6860       uint64x2_t a_or_b_nan = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(a_not_nan, b_not_nan))));
6861       uint64x2_t a_lt_b = vcltq_f64(a_.neon_f64, b_.neon_f64);
6862       r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_lt_b), 0) != 0);
6863     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
6864       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) < wasm_f64x2_extract_lane(b_.wasm_v128, 0);
6865     #elif defined(SIMDE_HAVE_FENV_H)
6866       fenv_t envp;
6867       int x = feholdexcept(&envp);
6868       r = a_.f64[0] < b_.f64[0];
6869       if (HEDLEY_LIKELY(x == 0))
6870         fesetenv(&envp);
6871     #else
6872       r = a_.f64[0] < b_.f64[0];
6873     #endif
6874 
6875     return r;
6876   #endif
6877 }
6878 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6879   #define _mm_ucomilt_sd(a, b) simde_mm_ucomilt_sd(a, b)
6880 #endif
6881 
6882 SIMDE_FUNCTION_ATTRIBUTES
6883 int
6884 simde_mm_ucomineq_sd (simde__m128d a, simde__m128d b) {
6885   #if defined(SIMDE_X86_SSE2_NATIVE)
6886     return _mm_ucomineq_sd(a, b);
6887   #else
6888     simde__m128d_private
6889       a_ = simde__m128d_to_private(a),
6890       b_ = simde__m128d_to_private(b);
6891     int r;
6892 
6893     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6894       uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
6895       uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
6896       uint64x2_t a_and_b_not_nan = vandq_u64(a_not_nan, b_not_nan);
6897       uint64x2_t a_neq_b = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(a_.neon_f64, b_.neon_f64))));
6898       r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_neq_b), 0) != 0);
6899     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
6900       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) != wasm_f64x2_extract_lane(b_.wasm_v128, 0);
6901     #elif defined(SIMDE_HAVE_FENV_H)
6902       fenv_t envp;
6903       int x = feholdexcept(&envp);
6904       r = a_.f64[0] != b_.f64[0];
6905       if (HEDLEY_LIKELY(x == 0))
6906         fesetenv(&envp);
6907     #else
6908       r = a_.f64[0] != b_.f64[0];
6909     #endif
6910 
6911     return r;
6912   #endif
6913 }
6914 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6915   #define _mm_ucomineq_sd(a, b) simde_mm_ucomineq_sd(a, b)
6916 #endif
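/* The ucomi*_sd helpers mirror the x86 UCOMISD flag mapping: when either low
 * lane is NaN, the "unordered" outcome makes eq, lt and le return 1 while
 * gt, ge and neq return 0, which is exactly what the NEON not-NaN masks
 * above reproduce. The <fenv.h> fallback saves and restores the
 * floating-point environment so that a NaN operand does not leave FE_INVALID
 * set, matching the quiet (non-signalling) behaviour of the intrinsic.
 */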
6917 
6918 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
6919   HEDLEY_DIAGNOSTIC_PUSH
6920   SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
6921 #endif
6922 
6923 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
6924   HEDLEY_DIAGNOSTIC_POP
6925 #endif
6926 
6927 SIMDE_FUNCTION_ATTRIBUTES
6928 void
6929 simde_mm_lfence (void) {
6930   #if defined(SIMDE_X86_SSE2_NATIVE)
6931     _mm_lfence();
6932   #else
6933     simde_mm_sfence();
6934   #endif
6935 }
6936 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6937   #define _mm_lfence() simde_mm_lfence()
6938 #endif
6939 
6940 SIMDE_FUNCTION_ATTRIBUTES
6941 void
6942 simde_mm_mfence (void) {
6943   #if defined(SIMDE_X86_SSE2_NATIVE)
6944     _mm_mfence();
6945   #else
6946     simde_mm_sfence();
6947   #endif
6948 }
6949 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6950   #define _mm_mfence() simde_mm_mfence()
6951 #endif
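/* In the portable paths simde_mm_lfence and simde_mm_mfence both fall back
 * to simde_mm_sfence, so all three fences map to the same barrier whenever
 * the native SSE2 intrinsics are unavailable.
 */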
6952 
6953 SIMDE_FUNCTION_ATTRIBUTES
6954 simde__m128i
6955 simde_mm_unpackhi_epi8 (simde__m128i a, simde__m128i b) {
6956   #if defined(SIMDE_X86_SSE2_NATIVE)
6957     return _mm_unpackhi_epi8(a, b);
6958   #else
6959     simde__m128i_private
6960       r_,
6961       a_ = simde__m128i_to_private(a),
6962       b_ = simde__m128i_to_private(b);
6963 
6964     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6965       r_.neon_i8 = vzip2q_s8(a_.neon_i8, b_.neon_i8);
6966     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6967       int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(a_.neon_i16));
6968       int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(b_.neon_i16));
6969       int8x8x2_t result = vzip_s8(a1, b1);
6970       r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]);
6971     #elif defined(SIMDE_SHUFFLE_VECTOR_)
6972       r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
6973     #else
6974       SIMDE_VECTORIZE
6975       for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i8[0])) / 2) ; i++) {
6976         r_.i8[(i * 2)]     = a_.i8[i + ((sizeof(r_) / sizeof(r_.i8[0])) / 2)];
6977         r_.i8[(i * 2) + 1] = b_.i8[i + ((sizeof(r_) / sizeof(r_.i8[0])) / 2)];
6978       }
6979     #endif
6980 
6981     return simde__m128i_from_private(r_);
6982   #endif
6983 }
6984 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6985   #define _mm_unpackhi_epi8(a, b) simde_mm_unpackhi_epi8(a, b)
6986 #endif
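/* The unpackhi_* family interleaves the high halves of its two inputs. For
 * bytes, with a = { a0 .. a15 } and b = { b0 .. b15 } (element 0 lowest),
 * the result is
 * { a8, b8, a9, b9, a10, b10, a11, b11, a12, b12, a13, b13, a14, b14, a15, b15 },
 * which is the index pattern used by the SIMDE_SHUFFLE_VECTOR_ path above.
 */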
6987 
6988 SIMDE_FUNCTION_ATTRIBUTES
6989 simde__m128i
6990 simde_mm_unpackhi_epi16 (simde__m128i a, simde__m128i b) {
6991   #if defined(SIMDE_X86_SSE2_NATIVE)
6992     return _mm_unpackhi_epi16(a, b);
6993   #else
6994     simde__m128i_private
6995       r_,
6996       a_ = simde__m128i_to_private(a),
6997       b_ = simde__m128i_to_private(b);
6998 
6999     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7000       r_.neon_i16 = vzip2q_s16(a_.neon_i16, b_.neon_i16);
7001     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7002       int16x4_t a1 = vget_high_s16(a_.neon_i16);
7003       int16x4_t b1 = vget_high_s16(b_.neon_i16);
7004       int16x4x2_t result = vzip_s16(a1, b1);
7005       r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]);
7006     #elif defined(SIMDE_SHUFFLE_VECTOR_)
7007       r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 4, 12, 5, 13, 6, 14, 7, 15);
7008     #else
7009       SIMDE_VECTORIZE
7010       for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i16[0])) / 2) ; i++) {
7011         r_.i16[(i * 2)]     = a_.i16[i + ((sizeof(r_) / sizeof(r_.i16[0])) / 2)];
7012         r_.i16[(i * 2) + 1] = b_.i16[i + ((sizeof(r_) / sizeof(r_.i16[0])) / 2)];
7013       }
7014     #endif
7015 
7016     return simde__m128i_from_private(r_);
7017   #endif
7018 }
7019 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7020   #define _mm_unpackhi_epi16(a, b) simde_mm_unpackhi_epi16(a, b)
7021 #endif
7022 
7023 SIMDE_FUNCTION_ATTRIBUTES
7024 simde__m128i
7025 simde_mm_unpackhi_epi32 (simde__m128i a, simde__m128i b) {
7026   #if defined(SIMDE_X86_SSE2_NATIVE)
7027     return _mm_unpackhi_epi32(a, b);
7028   #else
7029     simde__m128i_private
7030       r_,
7031       a_ = simde__m128i_to_private(a),
7032       b_ = simde__m128i_to_private(b);
7033 
7034     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7035       r_.neon_i32 = vzip2q_s32(a_.neon_i32, b_.neon_i32);
7036     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7037       int32x2_t a1 = vget_high_s32(a_.neon_i32);
7038       int32x2_t b1 = vget_high_s32(b_.neon_i32);
7039       int32x2x2_t result = vzip_s32(a1, b1);
7040       r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]);
7041     #elif defined(SIMDE_SHUFFLE_VECTOR_)
7042       r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 2, 6, 3, 7);
7043     #else
7044       SIMDE_VECTORIZE
7045       for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i32[0])) / 2) ; i++) {
7046         r_.i32[(i * 2)]     = a_.i32[i + ((sizeof(r_) / sizeof(r_.i32[0])) / 2)];
7047         r_.i32[(i * 2) + 1] = b_.i32[i + ((sizeof(r_) / sizeof(r_.i32[0])) / 2)];
7048       }
7049     #endif
7050 
7051     return simde__m128i_from_private(r_);
7052   #endif
7053 }
7054 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7055   #define _mm_unpackhi_epi32(a, b) simde_mm_unpackhi_epi32(a, b)
7056 #endif
7057 
7058 SIMDE_FUNCTION_ATTRIBUTES
7059 simde__m128i
7060 simde_mm_unpackhi_epi64 (simde__m128i a, simde__m128i b) {
7061   #if defined(SIMDE_X86_SSE2_NATIVE)
7062     return _mm_unpackhi_epi64(a, b);
7063   #else
7064     simde__m128i_private
7065       r_,
7066       a_ = simde__m128i_to_private(a),
7067       b_ = simde__m128i_to_private(b);
7068 
7069     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7070       int64x1_t a_h = vget_high_s64(a_.neon_i64);
7071       int64x1_t b_h = vget_high_s64(b_.neon_i64);
7072       r_.neon_i64 = vcombine_s64(a_h, b_h);
7073     #elif defined(SIMDE_SHUFFLE_VECTOR_)
7074       r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 1, 3);
7075     #else
7076       SIMDE_VECTORIZE
7077       for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i64[0])) / 2) ; i++) {
7078         r_.i64[(i * 2)]     = a_.i64[i + ((sizeof(r_) / sizeof(r_.i64[0])) / 2)];
7079         r_.i64[(i * 2) + 1] = b_.i64[i + ((sizeof(r_) / sizeof(r_.i64[0])) / 2)];
7080       }
7081     #endif
7082 
7083     return simde__m128i_from_private(r_);
7084   #endif
7085 }
7086 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7087   #define _mm_unpackhi_epi64(a, b) simde_mm_unpackhi_epi64(a, b)
7088 #endif
7089 
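/* Interleave the upper double-precision elements of a and b:
 * r = { a[1], b[1] }. */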
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_unpackhi_pd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_unpackhi_pd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      float64x1_t a_h = vget_high_f64(a_.f64);
      float64x1_t b_h = vget_high_f64(b_.f64);
      r_.neon_f64 = vcombine_f64(a_h, b_h);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_v64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 1, 3);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2) ; i++) {
        r_.f64[(i * 2)]     = a_.f64[i + ((sizeof(r_) / sizeof(r_.f64[0])) / 2)];
        r_.f64[(i * 2) + 1] = b_.f64[i + ((sizeof(r_) / sizeof(r_.f64[0])) / 2)];
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_unpackhi_pd(a, b) simde_mm_unpackhi_pd(a, b)
#endif

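/* Interleave the lower eight 8-bit integers of a and b:
 * r = { a[0], b[0], a[1], b[1], ..., a[7], b[7] }. */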
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_unpacklo_epi8 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_unpacklo_epi8(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_i8 = vzip1q_s8(a_.neon_i8, b_.neon_i8);
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(a_.neon_i16));
      int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(b_.neon_i16));
      int8x8x2_t result = vzip_s8(a1, b1);
      r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i8[0])) / 2) ; i++) {
        r_.i8[(i * 2)]     = a_.i8[i];
        r_.i8[(i * 2) + 1] = b_.i8[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_unpacklo_epi8(a, b) simde_mm_unpacklo_epi8(a, b)
#endif

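/* Interleave the lower four 16-bit integers of a and b:
 * r = { a[0], b[0], a[1], b[1], a[2], b[2], a[3], b[3] }. */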
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_unpacklo_epi16 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_unpacklo_epi16(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_i16 = vzip1q_s16(a_.neon_i16, b_.neon_i16);
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      int16x4_t a1 = vget_low_s16(a_.neon_i16);
      int16x4_t b1 = vget_low_s16(b_.neon_i16);
      int16x4x2_t result = vzip_s16(a1, b1);
      r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 0, 8, 1, 9, 2, 10, 3, 11);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i16[0])) / 2) ; i++) {
        r_.i16[(i * 2)]     = a_.i16[i];
        r_.i16[(i * 2) + 1] = b_.i16[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_unpacklo_epi16(a, b) simde_mm_unpacklo_epi16(a, b)
#endif

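/* Interleave the lower two 32-bit integers of a and b:
 * r = { a[0], b[0], a[1], b[1] }. */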
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_unpacklo_epi32 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_unpacklo_epi32(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_i32 = vzip1q_s32(a_.neon_i32, b_.neon_i32);
    #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      int32x2_t a1 = vget_low_s32(a_.neon_i32);
      int32x2_t b1 = vget_low_s32(b_.neon_i32);
      int32x2x2_t result = vzip_s32(a1, b1);
      r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 0, 4, 1, 5);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i32[0])) / 2) ; i++) {
        r_.i32[(i * 2)]     = a_.i32[i];
        r_.i32[(i * 2) + 1] = b_.i32[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_unpacklo_epi32(a, b) simde_mm_unpacklo_epi32(a, b)
#endif

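/* Interleave the lower 64-bit integers of a and b:
 * r = { a[0], b[0] }. */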
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_unpacklo_epi64 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_unpacklo_epi64(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      int64x1_t a_l = vget_low_s64(a_.neon_i64);
      int64x1_t b_l = vget_low_s64(b_.neon_i64);
      r_.neon_i64 = vcombine_s64(a_l, b_l);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 0, 2);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i64[0])) / 2) ; i++) {
        r_.i64[(i * 2)]     = a_.i64[i];
        r_.i64[(i * 2) + 1] = b_.i64[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_unpacklo_epi64(a, b) simde_mm_unpacklo_epi64(a, b)
#endif

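/* Interleave the lower double-precision elements of a and b:
 * r = { a[0], b[0] }. */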
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_unpacklo_pd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_unpacklo_pd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      float64x1_t a_l = vget_low_f64(a_.f64);
      float64x1_t b_l = vget_low_f64(b_.f64);
      r_.neon_f64 = vcombine_f64(a_l, b_l);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 0, 2);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2) ; i++) {
        r_.f64[(i * 2)]     = a_.f64[i];
        r_.f64[(i * 2) + 1] = b_.f64[i];
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_unpacklo_pd(a, b) simde_mm_unpacklo_pd(a, b)
#endif

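/* SIMDe extension (no direct SSE2 intrinsic): negate each
 * double-precision element of a, i.e. r[i] = -a[i].  The native
 * path flips the sign bits by XORing with -0.0. */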
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_x_mm_negate_pd(simde__m128d a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return simde_mm_xor_pd(a, _mm_set1_pd(SIMDE_FLOAT64_C(-0.0)));
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a);

    #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && \
        (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,1,0))
      r_.altivec_f64 = vec_neg(a_.altivec_f64);
    #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vnegq_f64(a_.neon_f64);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_f64x2_neg(a_.wasm_v128);
    #elif defined(SIMDE_VECTOR_NEGATE)
      r_.f64 = -a_.f64;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
        r_.f64[i] = -a_.f64[i];
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}

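/* Bitwise XOR of the 128-bit values a and b. */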
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_xor_si128 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_xor_si128(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i32 = veorq_s32(a_.neon_i32, b_.neon_i32);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i32 = vec_xor(a_.altivec_i32, b_.altivec_i32);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32f = a_.i32f ^ b_.i32f;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
        r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_xor_si128(a, b) simde_mm_xor_si128(a, b)
#endif

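/* SIMDe extension (no direct SSE2 intrinsic): bitwise NOT of the
 * 128-bit value a.  On AVX-512VL targets this is a single
 * ternary-logic instruction (truth table 0x55 == NOT a). */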
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_not_si128 (simde__m128i a) {
  #if defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm_ternarylogic_epi32(a, a, a, 0x55);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i32 = vmvnq_s32(a_.neon_i32);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i32 = vec_nor(a_.altivec_i32, a_.altivec_i32);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_v128_not(a_.wasm_v128);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32f = ~a_.i32f;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
        r_.i32f[i] = ~(a_.i32f[i]);
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}

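/* Portable counterpart of _MM_SHUFFLE2: packs two lane selectors into
 * the 2-bit immediate used by the pd shuffle intrinsics.  For example,
 * simde_mm_shuffle_pd(a, b, SIMDE_MM_SHUFFLE2(1, 0)) yields { a[0], b[1] }. */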
#define SIMDE_MM_SHUFFLE2(x, y) (((x) << 1) | (y))
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _MM_SHUFFLE2(x, y) SIMDE_MM_SHUFFLE2(x, y)
#endif

SIMDE_END_DECLS_

HEDLEY_DIAGNOSTIC_POP

#endif /* !defined(SIMDE_X86_SSE2_H) */