1 /* SPDX-License-Identifier: MIT
2  *
3  * Permission is hereby granted, free of charge, to any person
4  * obtaining a copy of this software and associated documentation
5  * files (the "Software"), to deal in the Software without
6  * restriction, including without limitation the rights to use, copy,
7  * modify, merge, publish, distribute, sublicense, and/or sell copies
8  * of the Software, and to permit persons to whom the Software is
9  * furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be
12  * included in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Copyright:
24  *   2017-2020 Evan Nemerson <evan@nemerson.com>
25  *   2015-2017 John W. Ratcliff <jratcliffscarab@gmail.com>
26  *   2015      Brandon Rowlett <browlett@nvidia.com>
27  *   2015      Ken Fast <kfast@gdeb.com>
28  *   2017      Hasindu Gamaarachchi <hasindu@unsw.edu.au>
29  *   2018      Jeff Daily <jeff.daily@amd.com>
30  */
31 
32 #if !defined(SIMDE_X86_SSE2_H)
33 #define SIMDE_X86_SSE2_H
34 
35 #include "sse.h"
36 
37 HEDLEY_DIAGNOSTIC_PUSH
38 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
39 SIMDE_BEGIN_DECLS_
40 
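/* Private (implementation) view of a 128-bit integer register.  The union
 * lets each backend (SSE2, NEON, WASM SIMD128, AltiVec) access the same 16
 * bytes through its own native vector type, or through plain arrays when no
 * native type is available. */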
41 typedef union {
42   #if defined(SIMDE_VECTOR_SUBSCRIPT)
43     SIMDE_ALIGN_TO_16 int8_t          i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
44     SIMDE_ALIGN_TO_16 int16_t        i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
45     SIMDE_ALIGN_TO_16 int32_t        i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
46     SIMDE_ALIGN_TO_16 int64_t        i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
47     SIMDE_ALIGN_TO_16 uint8_t         u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
48     SIMDE_ALIGN_TO_16 uint16_t       u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
49     SIMDE_ALIGN_TO_16 uint32_t       u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
50     SIMDE_ALIGN_TO_16 uint64_t       u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
51     #if defined(SIMDE_HAVE_INT128_)
52     SIMDE_ALIGN_TO_16 simde_int128  i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
53     SIMDE_ALIGN_TO_16 simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
54     #endif
55     SIMDE_ALIGN_TO_16 simde_float32  f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
56     SIMDE_ALIGN_TO_16 simde_float64  f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
57 
58     SIMDE_ALIGN_TO_16 int_fast32_t  i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
59     SIMDE_ALIGN_TO_16 uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
60   #else
61     SIMDE_ALIGN_TO_16 int8_t         i8[16];
62     SIMDE_ALIGN_TO_16 int16_t        i16[8];
63     SIMDE_ALIGN_TO_16 int32_t        i32[4];
64     SIMDE_ALIGN_TO_16 int64_t        i64[2];
65     SIMDE_ALIGN_TO_16 uint8_t        u8[16];
66     SIMDE_ALIGN_TO_16 uint16_t       u16[8];
67     SIMDE_ALIGN_TO_16 uint32_t       u32[4];
68     SIMDE_ALIGN_TO_16 uint64_t       u64[2];
69     #if defined(SIMDE_HAVE_INT128_)
70     SIMDE_ALIGN_TO_16 simde_int128  i128[1];
71     SIMDE_ALIGN_TO_16 simde_uint128 u128[1];
72     #endif
73     SIMDE_ALIGN_TO_16 simde_float32  f32[4];
74     SIMDE_ALIGN_TO_16 simde_float64  f64[2];
75 
76     SIMDE_ALIGN_TO_16 int_fast32_t  i32f[16 / sizeof(int_fast32_t)];
77     SIMDE_ALIGN_TO_16 uint_fast32_t u32f[16 / sizeof(uint_fast32_t)];
78   #endif
79 
80     SIMDE_ALIGN_TO_16 simde__m64_private m64_private[2];
81     SIMDE_ALIGN_TO_16 simde__m64         m64[2];
82 
83   #if defined(SIMDE_X86_SSE2_NATIVE)
84     SIMDE_ALIGN_TO_16 __m128i        n;
85   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
86     SIMDE_ALIGN_TO_16 int8x16_t      neon_i8;
87     SIMDE_ALIGN_TO_16 int16x8_t      neon_i16;
88     SIMDE_ALIGN_TO_16 int32x4_t      neon_i32;
89     SIMDE_ALIGN_TO_16 int64x2_t      neon_i64;
90     SIMDE_ALIGN_TO_16 uint8x16_t     neon_u8;
91     SIMDE_ALIGN_TO_16 uint16x8_t     neon_u16;
92     SIMDE_ALIGN_TO_16 uint32x4_t     neon_u32;
93     SIMDE_ALIGN_TO_16 uint64x2_t     neon_u64;
94     SIMDE_ALIGN_TO_16 float32x4_t    neon_f32;
95     #if defined(SIMDE_ARCH_AARCH64)
96     SIMDE_ALIGN_TO_16 float64x2_t    neon_f64;
97     #endif
98   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
99     SIMDE_ALIGN_TO_16 v128_t         wasm_v128;
100   #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
101     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char)          altivec_i8;
102     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short)         altivec_i16;
103     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int)           altivec_i32;
    #if defined(__INT_FAST32_TYPE__) && defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
105     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(__INT_FAST32_TYPE__)  altivec_i32f;
106     #else
107     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int)           altivec_i32f;
108     #endif
109     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char)        altivec_u8;
110     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short)       altivec_u16;
111     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int)         altivec_u32;
112     #if defined(__UINT_FAST32_TYPE__) && defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
113     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(__UINT_FAST32_TYPE__) altivec_u32f;
114     #else
115     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int)         altivec_u32f;
116     #endif
117     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float)                altivec_f32;
118     #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
119       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long)   altivec_i64;
120       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64;
121       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double)             altivec_f64;
122     #endif
123   #endif
124 } simde__m128i_private;
125 
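/* Same idea as simde__m128i_private, but for the double-precision (__m128d)
 * view of the register. */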
126 typedef union {
127   #if defined(SIMDE_VECTOR_SUBSCRIPT)
128     SIMDE_ALIGN_TO_16 int8_t          i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
129     SIMDE_ALIGN_TO_16 int16_t        i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
130     SIMDE_ALIGN_TO_16 int32_t        i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
131     SIMDE_ALIGN_TO_16 int64_t        i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
132     SIMDE_ALIGN_TO_16 uint8_t         u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
133     SIMDE_ALIGN_TO_16 uint16_t       u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
134     SIMDE_ALIGN_TO_16 uint32_t       u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
135     SIMDE_ALIGN_TO_16 uint64_t       u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
136     SIMDE_ALIGN_TO_16 simde_float32  f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
137     SIMDE_ALIGN_TO_16 simde_float64  f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
138     SIMDE_ALIGN_TO_16 int_fast32_t  i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
139     SIMDE_ALIGN_TO_16 uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
140   #else
141     SIMDE_ALIGN_TO_16 int8_t         i8[16];
142     SIMDE_ALIGN_TO_16 int16_t        i16[8];
143     SIMDE_ALIGN_TO_16 int32_t        i32[4];
144     SIMDE_ALIGN_TO_16 int64_t        i64[2];
145     SIMDE_ALIGN_TO_16 uint8_t        u8[16];
146     SIMDE_ALIGN_TO_16 uint16_t       u16[8];
147     SIMDE_ALIGN_TO_16 uint32_t       u32[4];
148     SIMDE_ALIGN_TO_16 uint64_t       u64[2];
149     SIMDE_ALIGN_TO_16 simde_float32  f32[4];
150     SIMDE_ALIGN_TO_16 simde_float64  f64[2];
151     SIMDE_ALIGN_TO_16 int_fast32_t  i32f[16 / sizeof(int_fast32_t)];
152     SIMDE_ALIGN_TO_16 uint_fast32_t u32f[16 / sizeof(uint_fast32_t)];
153   #endif
154 
155     SIMDE_ALIGN_TO_16 simde__m64_private m64_private[2];
156     SIMDE_ALIGN_TO_16 simde__m64         m64[2];
157 
158   #if defined(SIMDE_X86_SSE2_NATIVE)
159     SIMDE_ALIGN_TO_16 __m128d        n;
160   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
161     SIMDE_ALIGN_TO_16 int8x16_t      neon_i8;
162     SIMDE_ALIGN_TO_16 int16x8_t      neon_i16;
163     SIMDE_ALIGN_TO_16 int32x4_t      neon_i32;
164     SIMDE_ALIGN_TO_16 int64x2_t      neon_i64;
165     SIMDE_ALIGN_TO_16 uint8x16_t     neon_u8;
166     SIMDE_ALIGN_TO_16 uint16x8_t     neon_u16;
167     SIMDE_ALIGN_TO_16 uint32x4_t     neon_u32;
168     SIMDE_ALIGN_TO_16 uint64x2_t     neon_u64;
169     SIMDE_ALIGN_TO_16 float32x4_t    neon_f32;
170     #if defined(SIMDE_ARCH_AARCH64)
171     SIMDE_ALIGN_TO_16 float64x2_t    neon_f64;
172     #endif
173   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
174     SIMDE_ALIGN_TO_16 v128_t         wasm_v128;
175   #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
176     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char)          altivec_i8;
177     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short)         altivec_i16;
178     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int)           altivec_i32;
179     #if defined(__INT_FAST32_TYPE__) && defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
180     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(__INT_FAST32_TYPE__)  altivec_i32f;
181     #else
182     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int)           altivec_i32f;
183     #endif
184     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char)        altivec_u8;
185     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short)       altivec_u16;
186     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int)         altivec_u32;
187     #if defined(__UINT_FAST32_TYPE__) && defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
188     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(__UINT_FAST32_TYPE__) altivec_u32f;
189     #else
190     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int)         altivec_u32f;
191     #endif
192     SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float)                altivec_f32;
193     #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
194       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long)   altivec_i64;
195       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64;
196       SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double)             altivec_f64;
197     #endif
198   #endif
199 } simde__m128d_private;
200 
201 #if defined(SIMDE_X86_SSE2_NATIVE)
202   typedef __m128i simde__m128i;
203   typedef __m128d simde__m128d;
204 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
205    typedef int64x2_t simde__m128i;
206 #  if defined(SIMDE_ARCH_AARCH64)
207      typedef float64x2_t simde__m128d;
208 #  elif defined(SIMDE_VECTOR_SUBSCRIPT)
209      typedef simde_float64 simde__m128d SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
210 #  else
211      typedef simde__m128d_private simde__m128d;
212 #  endif
213 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
214    typedef v128_t simde__m128i;
215    typedef v128_t simde__m128d;
216 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
217   typedef SIMDE_POWER_ALTIVEC_VECTOR(float) simde__m128i;
218   #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
219      typedef SIMDE_POWER_ALTIVEC_VECTOR(double) simde__m128d;
220   #else
221      typedef simde__m128d_private simde__m128d;
222   #endif
223 #elif defined(SIMDE_VECTOR_SUBSCRIPT)
224   typedef int64_t simde__m128i SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
225   typedef simde_float64 simde__m128d SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
226 #else
227   typedef simde__m128i_private simde__m128i;
228   typedef simde__m128d_private simde__m128d;
229 #endif
230 
231 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
232   typedef simde__m128i __m128i;
233   typedef simde__m128d __m128d;
234 #endif
235 
236 HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i), "simde__m128i size incorrect");
237 HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i_private), "simde__m128i_private size incorrect");
238 HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d), "simde__m128d size incorrect");
239 HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d_private), "simde__m128d_private size incorrect");
240 #if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF)
241 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128i) == 16, "simde__m128i is not 16-byte aligned");
242 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128i_private) == 16, "simde__m128i_private is not 16-byte aligned");
243 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128d) == 16, "simde__m128d is not 16-byte aligned");
244 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128d_private) == 16, "simde__m128d_private is not 16-byte aligned");
245 #endif
246 
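/* Convert between the public wrapper types and the private unions.  A memcpy
 * keeps the conversion well-defined no matter which native type the wrapper
 * happens to be on a given platform; compilers typically optimize it away. */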
247 SIMDE_FUNCTION_ATTRIBUTES
248 simde__m128i
simde__m128i_from_private(simde__m128i_private v) {
250   simde__m128i r;
251   simde_memcpy(&r, &v, sizeof(r));
252   return r;
253 }
254 
255 SIMDE_FUNCTION_ATTRIBUTES
256 simde__m128i_private
simde__m128i_to_private(simde__m128i v) {
258   simde__m128i_private r;
259   simde_memcpy(&r, &v, sizeof(r));
260   return r;
261 }
262 
263 SIMDE_FUNCTION_ATTRIBUTES
264 simde__m128d
simde__m128d_from_private(simde__m128d_private v) {
266   simde__m128d r;
267   simde_memcpy(&r, &v, sizeof(r));
268   return r;
269 }
270 
271 SIMDE_FUNCTION_ATTRIBUTES
272 simde__m128d_private
simde__m128d_to_private(simde__m128d v) {
274   simde__m128d_private r;
275   simde_memcpy(&r, &v, sizeof(r));
276   return r;
277 }
278 
279 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int8x16_t, neon, i8)
281   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int16x8_t, neon, i16)
282   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int32x4_t, neon, i32)
283   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int64x2_t, neon, i64)
284   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint8x16_t, neon, u8)
285   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint16x8_t, neon, u16)
286   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint32x4_t, neon, u32)
287   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint64x2_t, neon, u64)
288   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, float32x4_t, neon, f32)
289   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
290     SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, float64x2_t, neon, f64)
291   #endif
292 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
293   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed char), altivec, i8)
294   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed short), altivec, i16)
295   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed int), altivec, i32)
296   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8)
297   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16)
298   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), altivec, u32)
299   #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
300     SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64)
301     SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64)
302   #endif
303 #endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
304 
305 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
306   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int8x16_t, neon, i8)
307   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int16x8_t, neon, i16)
308   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int32x4_t, neon, i32)
309   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int64x2_t, neon, i64)
310   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint8x16_t, neon, u8)
311   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint16x8_t, neon, u16)
312   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint32x4_t, neon, u32)
313   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint64x2_t, neon, u64)
314   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, float32x4_t, neon, f32)
315   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
316     SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, float64x2_t, neon, f64)
317   #endif
318 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
319   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed char), altivec, i8)
320   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed short), altivec, i16)
321   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed int), altivec, i32)
322   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8)
323   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16)
324   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), altivec, u32)
325   #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
326     SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64)
327     SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64)
328     #if defined(SIMDE_BUG_GCC_95782)
329       SIMDE_FUNCTION_ATTRIBUTES
330       SIMDE_POWER_ALTIVEC_VECTOR(double)
331       simde__m128d_to_altivec_f64(simde__m128d value) {
332         simde__m128d_private r_ = simde__m128d_to_private(value);
333         return r_.altivec_f64;
334       }
335 
336       SIMDE_FUNCTION_ATTRIBUTES
337       simde__m128d
338       simde__m128d_from_altivec_f64(SIMDE_POWER_ALTIVEC_VECTOR(double) value) {
339         simde__m128d_private r_;
340         r_.altivec_f64 = value;
341         return simde__m128d_from_private(r_);
342       }
343     #else
344       SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(double), altivec, f64)
345     #endif
346   #endif
347 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
348   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v128_t, wasm, v128);
349   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v128_t, wasm, v128);
350 #endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
351 
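/* Note the argument order: as with the native intrinsic, e1 is the high
 * (index 1) element and e0 is the low (index 0) element. */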
352 SIMDE_FUNCTION_ATTRIBUTES
353 simde__m128d
354 simde_mm_set_pd (simde_float64 e1, simde_float64 e0) {
355   #if defined(SIMDE_X86_SSE2_NATIVE)
356     return _mm_set_pd(e1, e0);
357   #else
358     simde__m128d_private r_;
359 
360     #if defined(SIMDE_WASM_SIMD128_NATIVE)
361       r_.wasm_v128 = wasm_f64x2_make(e0, e1);
362     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
363       SIMDE_ALIGN_TO_16 simde_float64 data[2] = { e0, e1 };
364       r_.neon_f64 = vld1q_f64(data);
365     #else
366       r_.f64[0] = e0;
367       r_.f64[1] = e1;
368     #endif
369 
370     return simde__m128d_from_private(r_);
371   #endif
372 }
373 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
374   #define _mm_set_pd(e1, e0) simde_mm_set_pd(e1, e0)
375 #endif
376 
377 SIMDE_FUNCTION_ATTRIBUTES
378 simde__m128d
simde_mm_set1_pd (simde_float64 a) {
380   #if defined(SIMDE_X86_SSE2_NATIVE)
381     return _mm_set1_pd(a);
382   #else
383     simde__m128d_private r_;
384 
385     #if defined(SIMDE_WASM_SIMD128_NATIVE)
386       r_.wasm_v128 = wasm_f64x2_splat(a);
387     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
388       r_.neon_f64 = vdupq_n_f64(a);
389     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
390       r_.altivec_f64 = vec_splats(HEDLEY_STATIC_CAST(double, a));
391     #else
392       SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
394         r_.f64[i] = a;
395       }
396     #endif
397 
398     return simde__m128d_from_private(r_);
399   #endif
400 }
401 #define simde_mm_set_pd1(a) simde_mm_set1_pd(a)
402 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
403   #define _mm_set1_pd(a) simde_mm_set1_pd(a)
404   #define _mm_set_pd1(a) simde_mm_set1_pd(a)
405 #endif
406 
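/* Lane-wise absolute value of the two doubles (not part of SSE2; the
 * simde_x_ prefix marks SIMDe-internal helpers). */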
407 SIMDE_FUNCTION_ATTRIBUTES
408 simde__m128d
simde_x_mm_abs_pd(simde__m128d a) {
410   #if defined(SIMDE_X86_AVX512F_NATIVE) && \
411         (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,4,0))
412     return _mm512_castpd512_pd128(_mm512_abs_pd(_mm512_castpd128_pd512(a)));
413   #else
414     simde__m128d_private
415       r_,
416       a_ = simde__m128d_to_private(a);
417 
    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vabsq_f64(a_.neon_f64);
    #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
      r_.altivec_f64 = vec_abs(a_.altivec_f64);
422     #else
423       SIMDE_VECTORIZE
424       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
425         r_.f64[i] = simde_math_fabs(a_.f64[i]);
426       }
427     #endif
428 
429     return simde__m128d_from_private(r_);
430   #endif
431 }
432 
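/* Bitwise NOT of all 128 bits, exposed as a double vector. */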
433 SIMDE_FUNCTION_ATTRIBUTES
434 simde__m128d
simde_x_mm_not_pd(simde__m128d a) {
436   #if defined(SIMDE_X86_AVX512VL_NATIVE)
437     __m128i ai = _mm_castpd_si128(a);
438     return _mm_castsi128_pd(_mm_ternarylogic_epi64(ai, ai, ai, 0x55));
439   #else
440     simde__m128d_private
441       r_,
442       a_ = simde__m128d_to_private(a);
443 
444     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
445       r_.neon_i32 = vmvnq_s32(a_.neon_i32);
446     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
447       r_.altivec_f64 = vec_nor(a_.altivec_f64, a_.altivec_f64);
448     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
449       r_.altivec_i32 = vec_nor(a_.altivec_i32, a_.altivec_i32);
450     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
451       r_.wasm_v128 = wasm_v128_not(a_.wasm_v128);
452     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
453       r_.i32f = ~a_.i32f;
454     #else
455       SIMDE_VECTORIZE
456       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
457         r_.i32f[i] = ~(a_.i32f[i]);
458       }
459     #endif
460 
461     return simde__m128d_from_private(r_);
462   #endif
463 }
464 
465 SIMDE_FUNCTION_ATTRIBUTES
466 simde__m128d
simde_x_mm_select_pd(simde__m128d a, simde__m128d b, simde__m128d mask) {
468   /* This function is for when you want to blend two elements together
469    * according to a mask.  It is similar to _mm_blendv_pd, except that
470    * it is undefined whether the blend is based on the highest bit in
471    * each lane (like blendv) or just bitwise operations.  This allows
472    * us to implement the function efficiently everywhere.
473    *
474    * Basically, you promise that all the lanes in mask are either 0 or
475    * ~0. */
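  /* For example, a mask whose lanes are { all-ones, all-zeros } yields
   * { b[0], a[1] }. */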
476   #if defined(SIMDE_X86_SSE4_1_NATIVE)
477     return _mm_blendv_pd(a, b, mask);
478   #else
479     simde__m128d_private
480       r_,
481       a_ = simde__m128d_to_private(a),
482       b_ = simde__m128d_to_private(b),
483       mask_ = simde__m128d_to_private(mask);
484 
485     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
486       r_.i64 = a_.i64 ^ ((a_.i64 ^ b_.i64) & mask_.i64);
487     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
488       r_.neon_i64 = vbslq_s64(mask_.neon_u64, b_.neon_i64, a_.neon_i64);
489     #else
490       SIMDE_VECTORIZE
491       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
492         r_.i64[i] = a_.i64[i] ^ ((a_.i64[i] ^ b_.i64[i]) & mask_.i64[i]);
493       }
494     #endif
495 
496     return simde__m128d_from_private(r_);
497   #endif
498 }
499 
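/* The _mm_add_epi* family performs element-wise wrapping (modular) addition;
 * overflow simply wraps around, matching the x86 paddb/paddw/paddd/paddq
 * instructions. */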
500 SIMDE_FUNCTION_ATTRIBUTES
501 simde__m128i
simde_mm_add_epi8 (simde__m128i a, simde__m128i b) {
503   #if defined(SIMDE_X86_SSE2_NATIVE)
504     return _mm_add_epi8(a, b);
505   #else
506     simde__m128i_private
507       r_,
508       a_ = simde__m128i_to_private(a),
509       b_ = simde__m128i_to_private(b);
510 
511     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
512       r_.neon_i8 = vaddq_s8(a_.neon_i8, b_.neon_i8);
513     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
514       r_.altivec_i8 = vec_add(a_.altivec_i8, b_.altivec_i8);
515     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
516       r_.wasm_v128 = wasm_i8x16_add(a_.wasm_v128, b_.wasm_v128);
517     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
518       r_.i8 = a_.i8 + b_.i8;
519     #else
520       SIMDE_VECTORIZE
521       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
522         r_.i8[i] = a_.i8[i] + b_.i8[i];
523       }
524     #endif
525 
526     return simde__m128i_from_private(r_);
527   #endif
528 }
529 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
530   #define _mm_add_epi8(a, b) simde_mm_add_epi8(a, b)
531 #endif
532 
533 SIMDE_FUNCTION_ATTRIBUTES
534 simde__m128i
simde_mm_add_epi16 (simde__m128i a, simde__m128i b) {
536   #if defined(SIMDE_X86_SSE2_NATIVE)
537     return _mm_add_epi16(a, b);
538   #else
539     simde__m128i_private
540       r_,
541       a_ = simde__m128i_to_private(a),
542       b_ = simde__m128i_to_private(b);
543 
544     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
545       r_.neon_i16 = vaddq_s16(a_.neon_i16, b_.neon_i16);
546     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
547       r_.altivec_i16 = vec_add(a_.altivec_i16, b_.altivec_i16);
548     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
549       r_.wasm_v128 = wasm_i16x8_add(a_.wasm_v128, b_.wasm_v128);
550     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
551       r_.i16 = a_.i16 + b_.i16;
552     #else
553       SIMDE_VECTORIZE
554       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
555         r_.i16[i] = a_.i16[i] + b_.i16[i];
556       }
557     #endif
558 
559     return simde__m128i_from_private(r_);
560   #endif
561 }
562 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
563   #define _mm_add_epi16(a, b) simde_mm_add_epi16(a, b)
564 #endif
565 
566 SIMDE_FUNCTION_ATTRIBUTES
567 simde__m128i
simde_mm_add_epi32 (simde__m128i a, simde__m128i b) {
569   #if defined(SIMDE_X86_SSE2_NATIVE)
570     return _mm_add_epi32(a, b);
571   #else
572     simde__m128i_private
573       r_,
574       a_ = simde__m128i_to_private(a),
575       b_ = simde__m128i_to_private(b);
576 
577     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
578       r_.neon_i32 = vaddq_s32(a_.neon_i32, b_.neon_i32);
579     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
580       r_.altivec_i32 = vec_add(a_.altivec_i32, b_.altivec_i32);
581     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
582       r_.wasm_v128 = wasm_i32x4_add(a_.wasm_v128, b_.wasm_v128);
583     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
584       r_.i32 = a_.i32 + b_.i32;
585     #else
586       SIMDE_VECTORIZE
587       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
588         r_.i32[i] = a_.i32[i] + b_.i32[i];
589       }
590     #endif
591 
592     return simde__m128i_from_private(r_);
593   #endif
594 }
595 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
596   #define _mm_add_epi32(a, b) simde_mm_add_epi32(a, b)
597 #endif
598 
599 SIMDE_FUNCTION_ATTRIBUTES
600 simde__m128i
simde_mm_add_epi64 (simde__m128i a, simde__m128i b) {
602   #if defined(SIMDE_X86_SSE2_NATIVE)
603     return _mm_add_epi64(a, b);
604   #else
605     simde__m128i_private
606       r_,
607       a_ = simde__m128i_to_private(a),
608       b_ = simde__m128i_to_private(b);
609 
610     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
611       r_.neon_i64 = vaddq_s64(a_.neon_i64, b_.neon_i64);
612     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
613       r_.altivec_i64 = vec_add(a_.altivec_i64, b_.altivec_i64);
614     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
615       r_.wasm_v128 = wasm_i64x2_add(a_.wasm_v128, b_.wasm_v128);
616     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
617       r_.i64 = a_.i64 + b_.i64;
618     #else
619       SIMDE_VECTORIZE
620       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
621         r_.i64[i] = a_.i64[i] + b_.i64[i];
622       }
623     #endif
624 
625     return simde__m128i_from_private(r_);
626   #endif
627 }
628 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
629   #define _mm_add_epi64(a, b) simde_mm_add_epi64(a, b)
630 #endif
631 
632 SIMDE_FUNCTION_ATTRIBUTES
633 simde__m128d
simde_mm_add_pd (simde__m128d a, simde__m128d b) {
635   #if defined(SIMDE_X86_SSE2_NATIVE)
636     return _mm_add_pd(a, b);
637   #else
638     simde__m128d_private
639       r_,
640       a_ = simde__m128d_to_private(a),
641       b_ = simde__m128d_to_private(b);
642 
643     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
644       r_.neon_f64 = vaddq_f64(a_.neon_f64, b_.neon_f64);
645     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
646       r_.wasm_v128 = wasm_f64x2_add(a_.wasm_v128, b_.wasm_v128);
647     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
648       r_.altivec_f64 = vec_add(a_.altivec_f64, b_.altivec_f64);
651     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
652       r_.f64 = a_.f64 + b_.f64;
653     #else
654       SIMDE_VECTORIZE
655       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
656         r_.f64[i] = a_.f64[i] + b_.f64[i];
657       }
658     #endif
659 
660     return simde__m128d_from_private(r_);
661   #endif
662 }
663 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
664   #define _mm_add_pd(a, b) simde_mm_add_pd(a, b)
665 #endif
666 
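/* _mm_move_sd returns { b[0], a[1] }: the low double from b, the high double
 * from a. */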
667 SIMDE_FUNCTION_ATTRIBUTES
668 simde__m128d
simde_mm_move_sd (simde__m128d a, simde__m128d b) {
670   #if defined(SIMDE_X86_SSE2_NATIVE)
671     return _mm_move_sd(a, b);
672   #else
673     simde__m128d_private
674       r_,
675       a_ = simde__m128d_to_private(a),
676       b_ = simde__m128d_to_private(b);
677 
678     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
679       r_.neon_f64 = vsetq_lane_f64(vgetq_lane_f64(b_.neon_f64, 0), a_.neon_f64, 0);
680     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
681       #if defined(HEDLEY_IBM_VERSION)
682         r_.altivec_f64 = vec_xxpermdi(a_.altivec_f64, b_.altivec_f64, 1);
683       #else
684         r_.altivec_f64 = vec_xxpermdi(b_.altivec_f64, a_.altivec_f64, 1);
685       #endif
686     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
687       r_.wasm_v128 = wasm_v64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 2, 1);
688     #elif defined(SIMDE_SHUFFLE_VECTOR_)
689       r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 2, 1);
690     #else
691       r_.f64[0] = b_.f64[0];
692       r_.f64[1] = a_.f64[1];
693     #endif
694 
695     return simde__m128d_from_private(r_);
696   #endif
697 }
698 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
699   #define _mm_move_sd(a, b) simde_mm_move_sd(a, b)
700 #endif
701 
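/* Scalar add: only the low lane is summed; the high lane is copied from a. */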
702 SIMDE_FUNCTION_ATTRIBUTES
703 simde__m128d
simde_mm_add_sd (simde__m128d a, simde__m128d b) {
705   #if defined(SIMDE_X86_SSE2_NATIVE)
706     return _mm_add_sd(a, b);
707   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
708     return simde_mm_move_sd(a, simde_mm_add_pd(a, b));
709   #else
710     simde__m128d_private
711       r_,
712       a_ = simde__m128d_to_private(a),
713       b_ = simde__m128d_to_private(b);
714 
715     r_.f64[0] = a_.f64[0] + b_.f64[0];
716     r_.f64[1] = a_.f64[1];
717 
718     return simde__m128d_from_private(r_);
719   #endif
720 }
721 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
722   #define _mm_add_sd(a, b) simde_mm_add_sd(a, b)
723 #endif
724 
725 SIMDE_FUNCTION_ATTRIBUTES
726 simde__m64
simde_mm_add_si64 (simde__m64 a, simde__m64 b) {
728   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
729     return _mm_add_si64(a, b);
730   #else
731     simde__m64_private
732       r_,
733       a_ = simde__m64_to_private(a),
734       b_ = simde__m64_to_private(b);
735 
736     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
737       r_.neon_i64 = vadd_s64(a_.neon_i64, b_.neon_i64);
738     #else
739       r_.i64[0] = a_.i64[0] + b_.i64[0];
740     #endif
741 
742     return simde__m64_from_private(r_);
743   #endif
744 }
745 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
746   #define _mm_add_si64(a, b) simde_mm_add_si64(a, b)
747 #endif
748 
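/* The _mm_adds_* family performs saturating addition: signed results are
 * clamped to [INT8_MIN, INT8_MAX] or [INT16_MIN, INT16_MAX], unsigned results
 * to [0, UINT8_MAX] or [0, UINT16_MAX], instead of wrapping. */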
749 SIMDE_FUNCTION_ATTRIBUTES
750 simde__m128i
simde_mm_adds_epi8 (simde__m128i a, simde__m128i b) {
752   #if defined(SIMDE_X86_SSE2_NATIVE)
753     return _mm_adds_epi8(a, b);
754   #else
755     simde__m128i_private
756       r_,
757       a_ = simde__m128i_to_private(a),
758       b_ = simde__m128i_to_private(b);
759 
760     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
761       r_.neon_i8 = vqaddq_s8(a_.neon_i8, b_.neon_i8);
762     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
763       r_.wasm_v128 = wasm_i8x16_add_saturate(a_.wasm_v128, b_.wasm_v128);
764     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
765       r_.altivec_i8 = vec_adds(a_.altivec_i8, b_.altivec_i8);
766     #else
767       SIMDE_VECTORIZE
768       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
769         const int_fast16_t tmp =
770           HEDLEY_STATIC_CAST(int_fast16_t, a_.i8[i]) +
771           HEDLEY_STATIC_CAST(int_fast16_t, b_.i8[i]);
772         r_.i8[i] = HEDLEY_STATIC_CAST(int8_t, ((tmp < INT8_MAX) ? ((tmp > INT8_MIN) ? tmp : INT8_MIN) : INT8_MAX));
773       }
774     #endif
775 
776     return simde__m128i_from_private(r_);
777   #endif
778 }
779 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
780   #define _mm_adds_epi8(a, b) simde_mm_adds_epi8(a, b)
781 #endif
782 
783 SIMDE_FUNCTION_ATTRIBUTES
784 simde__m128i
simde_mm_adds_epi16 (simde__m128i a, simde__m128i b) {
786   #if defined(SIMDE_X86_SSE2_NATIVE)
787     return _mm_adds_epi16(a, b);
788   #else
789     simde__m128i_private
790       r_,
791       a_ = simde__m128i_to_private(a),
792       b_ = simde__m128i_to_private(b);
793 
794     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
795       r_.neon_i16 = vqaddq_s16(a_.neon_i16, b_.neon_i16);
796     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
797       r_.wasm_v128 = wasm_i16x8_add_saturate(a_.wasm_v128, b_.wasm_v128);
798     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
799       r_.altivec_i16 = vec_adds(a_.altivec_i16, b_.altivec_i16);
800     #else
801       SIMDE_VECTORIZE
802       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
803         const int_fast32_t tmp =
804           HEDLEY_STATIC_CAST(int_fast32_t, a_.i16[i]) +
805           HEDLEY_STATIC_CAST(int_fast32_t, b_.i16[i]);
806         r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, ((tmp < INT16_MAX) ? ((tmp > INT16_MIN) ? tmp : INT16_MIN) : INT16_MAX));
807       }
808     #endif
809 
810     return simde__m128i_from_private(r_);
811   #endif
812 }
813 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
814   #define _mm_adds_epi16(a, b) simde_mm_adds_epi16(a, b)
815 #endif
816 
817 SIMDE_FUNCTION_ATTRIBUTES
818 simde__m128i
simde_mm_adds_epu8 (simde__m128i a, simde__m128i b) {
820   #if defined(SIMDE_X86_SSE2_NATIVE)
821     return _mm_adds_epu8(a, b);
822   #else
823     simde__m128i_private
824       r_,
825       a_ = simde__m128i_to_private(a),
826       b_ = simde__m128i_to_private(b);
827 
828     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
829       r_.neon_u8 = vqaddq_u8(a_.neon_u8, b_.neon_u8);
830     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
831       r_.wasm_v128 = wasm_u8x16_add_saturate(a_.wasm_v128, b_.wasm_v128);
832     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
833       r_.altivec_u8 = vec_adds(a_.altivec_u8, b_.altivec_u8);
834     #else
835       SIMDE_VECTORIZE
836       for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
837         r_.u8[i] = ((UINT8_MAX - a_.u8[i]) > b_.u8[i]) ? (a_.u8[i] + b_.u8[i]) : UINT8_MAX;
838       }
839     #endif
840 
841     return simde__m128i_from_private(r_);
842   #endif
843 }
844 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
845   #define _mm_adds_epu8(a, b) simde_mm_adds_epu8(a, b)
846 #endif
847 
848 SIMDE_FUNCTION_ATTRIBUTES
849 simde__m128i
simde_mm_adds_epu16 (simde__m128i a, simde__m128i b) {
851   #if defined(SIMDE_X86_SSE2_NATIVE)
852     return _mm_adds_epu16(a, b);
853   #else
854     simde__m128i_private
855       r_,
856       a_ = simde__m128i_to_private(a),
857       b_ = simde__m128i_to_private(b);
858 
859     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
860       r_.neon_u16 = vqaddq_u16(a_.neon_u16, b_.neon_u16);
861     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
862       r_.wasm_v128 = wasm_u16x8_add_saturate(a_.wasm_v128, b_.wasm_v128);
863     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
864       r_.altivec_u16 = vec_adds(a_.altivec_u16, b_.altivec_u16);
865     #else
866       SIMDE_VECTORIZE
867       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
868         r_.u16[i] = ((UINT16_MAX - a_.u16[i]) > b_.u16[i]) ? (a_.u16[i] + b_.u16[i]) : UINT16_MAX;
869       }
870     #endif
871 
872     return simde__m128i_from_private(r_);
873   #endif
874 }
875 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
876   #define _mm_adds_epu16(a, b) simde_mm_adds_epu16(a, b)
877 #endif
878 
879 SIMDE_FUNCTION_ATTRIBUTES
880 simde__m128d
simde_mm_and_pd (simde__m128d a, simde__m128d b) {
882   #if defined(SIMDE_X86_SSE2_NATIVE)
883     return _mm_and_pd(a, b);
884   #else
885     simde__m128d_private
886       r_,
887       a_ = simde__m128d_to_private(a),
888       b_ = simde__m128d_to_private(b);
889 
890     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
891       r_.neon_i32 = vandq_s32(a_.neon_i32, b_.neon_i32);
892     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
893       r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128);
894     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
895       r_.altivec_f64 = vec_and(a_.altivec_f64, b_.altivec_f64);
896     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
897       r_.i32f = a_.i32f & b_.i32f;
898     #else
899       SIMDE_VECTORIZE
900       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
901         r_.i32f[i] = a_.i32f[i] & b_.i32f[i];
902       }
903     #endif
904 
905     return simde__m128d_from_private(r_);
906   #endif
907 }
908 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
909   #define _mm_and_pd(a, b) simde_mm_and_pd(a, b)
910 #endif
911 
912 SIMDE_FUNCTION_ATTRIBUTES
913 simde__m128i
simde_mm_and_si128 (simde__m128i a, simde__m128i b) {
915   #if defined(SIMDE_X86_SSE2_NATIVE)
916     return _mm_and_si128(a, b);
917   #else
918     simde__m128i_private
919       r_,
920       a_ = simde__m128i_to_private(a),
921       b_ = simde__m128i_to_private(b);
922 
923     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
924       r_.neon_i32 = vandq_s32(b_.neon_i32, a_.neon_i32);
925     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
926       r_.altivec_u32f = vec_and(a_.altivec_u32f, b_.altivec_u32f);
927     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
928       r_.i32f = a_.i32f & b_.i32f;
929     #else
930       SIMDE_VECTORIZE
931       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
932         r_.i32f[i] = a_.i32f[i] & b_.i32f[i];
933       }
934     #endif
935 
936     return simde__m128i_from_private(r_);
937   #endif
938 }
939 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
940   #define _mm_and_si128(a, b) simde_mm_and_si128(a, b)
941 #endif
942 
943 SIMDE_FUNCTION_ATTRIBUTES
944 simde__m128d
simde_mm_andnot_pd (simde__m128d a, simde__m128d b) {
946   #if defined(SIMDE_X86_SSE2_NATIVE)
947     return _mm_andnot_pd(a, b);
948   #else
949     simde__m128d_private
950       r_,
951       a_ = simde__m128d_to_private(a),
952       b_ = simde__m128d_to_private(b);
953 
954     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
955       r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32);
956     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
957       r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128);
958     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
959       r_.altivec_f64 = vec_andc(b_.altivec_f64, a_.altivec_f64);
960     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
961       r_.altivec_i32f = vec_andc(b_.altivec_i32f, a_.altivec_i32f);
962     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
963       r_.i32f = ~a_.i32f & b_.i32f;
964     #else
965       SIMDE_VECTORIZE
966       for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
967         r_.u64[i] = ~a_.u64[i] & b_.u64[i];
968       }
969     #endif
970 
971     return simde__m128d_from_private(r_);
972   #endif
973 }
974 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
975   #define _mm_andnot_pd(a, b) simde_mm_andnot_pd(a, b)
976 #endif
977 
978 SIMDE_FUNCTION_ATTRIBUTES
979 simde__m128i
simde_mm_andnot_si128 (simde__m128i a, simde__m128i b) {
981   #if defined(SIMDE_X86_SSE2_NATIVE)
982     return _mm_andnot_si128(a, b);
983   #else
984     simde__m128i_private
985       r_,
986       a_ = simde__m128i_to_private(a),
987       b_ = simde__m128i_to_private(b);
988 
989     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
990       r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32);
991     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
992       r_.altivec_i32 = vec_andc(b_.altivec_i32, a_.altivec_i32);
993     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
994       r_.i32f = ~a_.i32f & b_.i32f;
995     #else
996       SIMDE_VECTORIZE
997       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
998         r_.i32f[i] = ~(a_.i32f[i]) & b_.i32f[i];
999       }
1000     #endif
1001 
1002     return simde__m128i_from_private(r_);
1003   #endif
1004 }
1005 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1006   #define _mm_andnot_si128(a, b) simde_mm_andnot_si128(a, b)
1007 #endif
1008 
1009 SIMDE_FUNCTION_ATTRIBUTES
1010 simde__m128d
simde_mm_xor_pd (simde__m128d a, simde__m128d b) {
1012   #if defined(SIMDE_X86_SSE2_NATIVE)
1013     return _mm_xor_pd(a, b);
1014   #else
1015     simde__m128d_private
1016       r_,
1017       a_ = simde__m128d_to_private(a),
1018       b_ = simde__m128d_to_private(b);
1019 
1020     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1021       r_.i32f = a_.i32f ^ b_.i32f;
1022     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1023       r_.wasm_v128 = wasm_v128_xor(a_.wasm_v128, b_.wasm_v128);
1024     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1025       r_.neon_i64 = veorq_s64(a_.neon_i64, b_.neon_i64);
1026     #else
1027       SIMDE_VECTORIZE
1028       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
1029         r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i];
1030       }
1031     #endif
1032 
1033     return simde__m128d_from_private(r_);
1034   #endif
1035 }
1036 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1037   #define _mm_xor_pd(a, b) simde_mm_xor_pd(a, b)
1038 #endif
1039 
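/* _mm_avg_epu* compute the rounded average (a + b + 1) >> 1 in widened
 * arithmetic, so the +1 cannot overflow the element type. */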
1040 SIMDE_FUNCTION_ATTRIBUTES
1041 simde__m128i
simde_mm_avg_epu8 (simde__m128i a, simde__m128i b) {
1043   #if defined(SIMDE_X86_SSE2_NATIVE)
1044     return _mm_avg_epu8(a, b);
1045   #else
1046     simde__m128i_private
1047       r_,
1048       a_ = simde__m128i_to_private(a),
1049       b_ = simde__m128i_to_private(b);
1050 
1051     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1052       r_.neon_u8 = vrhaddq_u8(b_.neon_u8, a_.neon_u8);
1053     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1054       r_.wasm_v128 = wasm_u8x16_avgr(a_.wasm_v128, b_.wasm_v128);
1055     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1056       r_.altivec_u8 = vec_avg(a_.altivec_u8, b_.altivec_u8);
1057     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_)
1058       uint16_t wa SIMDE_VECTOR(32);
1059       uint16_t wb SIMDE_VECTOR(32);
1060       uint16_t wr SIMDE_VECTOR(32);
1061       SIMDE_CONVERT_VECTOR_(wa, a_.u8);
1062       SIMDE_CONVERT_VECTOR_(wb, b_.u8);
1063       wr = (wa + wb + 1) >> 1;
1064       SIMDE_CONVERT_VECTOR_(r_.u8, wr);
1065     #else
1066       SIMDE_VECTORIZE
1067       for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
1068         r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1;
1069       }
1070     #endif
1071 
1072     return simde__m128i_from_private(r_);
1073   #endif
1074 }
1075 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1076   #define _mm_avg_epu8(a, b) simde_mm_avg_epu8(a, b)
1077 #endif
1078 
1079 SIMDE_FUNCTION_ATTRIBUTES
1080 simde__m128i
simde_mm_avg_epu16 (simde__m128i a, simde__m128i b) {
1082   #if defined(SIMDE_X86_SSE2_NATIVE)
1083     return _mm_avg_epu16(a, b);
1084   #else
1085     simde__m128i_private
1086       r_,
1087       a_ = simde__m128i_to_private(a),
1088       b_ = simde__m128i_to_private(b);
1089 
1090     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1091       r_.neon_u16 = vrhaddq_u16(b_.neon_u16, a_.neon_u16);
1092     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1093       r_.wasm_v128 = wasm_u16x8_avgr(a_.wasm_v128, b_.wasm_v128);
1094     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1095       r_.altivec_u16 = vec_avg(a_.altivec_u16, b_.altivec_u16);
1096     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_)
1097       uint32_t wa SIMDE_VECTOR(32);
1098       uint32_t wb SIMDE_VECTOR(32);
1099       uint32_t wr SIMDE_VECTOR(32);
1100       SIMDE_CONVERT_VECTOR_(wa, a_.u16);
1101       SIMDE_CONVERT_VECTOR_(wb, b_.u16);
1102       wr = (wa + wb + 1) >> 1;
1103       SIMDE_CONVERT_VECTOR_(r_.u16, wr);
1104     #else
1105       SIMDE_VECTORIZE
1106       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
1107         r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1;
1108       }
1109     #endif
1110 
1111     return simde__m128i_from_private(r_);
1112   #endif
1113 }
1114 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1115   #define _mm_avg_epu16(a, b) simde_mm_avg_epu16(a, b)
1116 #endif
1117 
1118 SIMDE_FUNCTION_ATTRIBUTES
1119 simde__m128i
simde_mm_setzero_si128 (void) {
1121   #if defined(SIMDE_X86_SSE2_NATIVE)
1122     return _mm_setzero_si128();
1123   #else
1124     simde__m128i_private r_;
1125 
1126     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1127       r_.neon_i32 = vdupq_n_s32(0);
1128     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1129       r_.altivec_i32 = vec_splats(HEDLEY_STATIC_CAST(signed int, 0));
1130     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1131       r_.wasm_v128 = wasm_i32x4_splat(INT32_C(0));
1132     #elif defined(SIMDE_VECTOR_SUBSCRIPT)
1133       r_.i32 = __extension__ (__typeof__(r_.i32)) { 0, 0, 0, 0 };
1134     #else
1135       SIMDE_VECTORIZE
1136       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
1137         r_.i32f[i] = 0;
1138       }
1139     #endif
1140 
1141     return simde__m128i_from_private(r_);
1142   #endif
1143 }
1144 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1145   #define _mm_setzero_si128() (simde_mm_setzero_si128())
1146 #endif
1147 
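/* Byte-wise shift of the whole 128-bit value: bslli shifts left (toward the
 * most-significant byte) by imm8 bytes, filling with zeros; any imm8 greater
 * than 15 yields zero. */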
1148 SIMDE_FUNCTION_ATTRIBUTES
1149 simde__m128i
simde_mm_bslli_si128 (simde__m128i a, const int imm8)
1151     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
1152   simde__m128i_private
1153     r_,
1154     a_ = simde__m128i_to_private(a);
1155 
1156   if (HEDLEY_UNLIKELY((imm8 & ~15))) {
1157     return simde_mm_setzero_si128();
1158   }
1159 
1160   #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && defined(SIMDE_ENDIAN_ORDER)
1161     r_.altivec_i8 =
1162       #if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
1163         vec_slo
1164       #else /* SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG */
1165         vec_sro
1166       #endif
1167         (a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, imm8 * 8)));
1168   #elif defined(SIMDE_HAVE_INT128_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
1169     r_.u128[0] = a_.u128[0] << (imm8 * 8);
1170   #else
1171     r_ = simde__m128i_to_private(simde_mm_setzero_si128());
1172     for (int i = imm8 ; i < HEDLEY_STATIC_CAST(int, sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1173       r_.i8[i] = a_.i8[i - imm8];
1174     }
1175   #endif
1176 
1177   return simde__m128i_from_private(r_);
1178 }
1179 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
1180   #define simde_mm_bslli_si128(a, imm8) _mm_slli_si128(a, imm8)
1181 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__)
1182   #define simde_mm_bslli_si128(a, imm8) \
1183   simde__m128i_from_neon_i8(((imm8) <= 0) ? simde__m128i_to_neon_i8(a) : (((imm8) > 15) ? (vdupq_n_s8(0)) : (vextq_s8(vdupq_n_s8(0), simde__m128i_to_neon_i8(a), 16 - (imm8)))))
1184 #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1185   #define simde_mm_bslli_si128(a, imm8) (__extension__ ({ \
1186     const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
1187     const simde__m128i_private simde__tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
1188     simde__m128i_private simde__tmp_r_; \
1189     if (HEDLEY_UNLIKELY(imm8 > 15)) { \
1190       simde__tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
1191     } else { \
1192       simde__tmp_r_.i8 = \
1193         SIMDE_SHUFFLE_VECTOR_(8, 16, \
1194           simde__tmp_z_.i8, \
1195           (simde__tmp_a_).i8, \
1196           HEDLEY_STATIC_CAST(int8_t, (16 - imm8) & 31), \
1197           HEDLEY_STATIC_CAST(int8_t, (17 - imm8) & 31), \
1198           HEDLEY_STATIC_CAST(int8_t, (18 - imm8) & 31), \
1199           HEDLEY_STATIC_CAST(int8_t, (19 - imm8) & 31), \
1200           HEDLEY_STATIC_CAST(int8_t, (20 - imm8) & 31), \
1201           HEDLEY_STATIC_CAST(int8_t, (21 - imm8) & 31), \
1202           HEDLEY_STATIC_CAST(int8_t, (22 - imm8) & 31), \
1203           HEDLEY_STATIC_CAST(int8_t, (23 - imm8) & 31), \
1204           HEDLEY_STATIC_CAST(int8_t, (24 - imm8) & 31), \
1205           HEDLEY_STATIC_CAST(int8_t, (25 - imm8) & 31), \
1206           HEDLEY_STATIC_CAST(int8_t, (26 - imm8) & 31), \
1207           HEDLEY_STATIC_CAST(int8_t, (27 - imm8) & 31), \
1208           HEDLEY_STATIC_CAST(int8_t, (28 - imm8) & 31), \
1209           HEDLEY_STATIC_CAST(int8_t, (29 - imm8) & 31), \
1210           HEDLEY_STATIC_CAST(int8_t, (30 - imm8) & 31), \
1211           HEDLEY_STATIC_CAST(int8_t, (31 - imm8) & 31)); \
1212     } \
1213     simde__m128i_from_private(simde__tmp_r_); }))
1214 #endif
1215 #define simde_mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
1216 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1217   #define _mm_bslli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
1218   #define _mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
1219 #endif
1220 
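/* bsrli is the byte-wise right-shift counterpart: bytes move toward the
 * least-significant end and zeros fill in from the top. */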
1221 SIMDE_FUNCTION_ATTRIBUTES
1222 simde__m128i
simde_mm_bsrli_si128 (simde__m128i a, const int imm8)
1224     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
1225   simde__m128i_private
1226     r_,
1227     a_ = simde__m128i_to_private(a);
1228 
1229   if (HEDLEY_UNLIKELY((imm8 & ~15))) {
1230     return simde_mm_setzero_si128();
1231   }
1232 
1233   #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && defined(SIMDE_ENDIAN_ORDER)
1234     r_.altivec_i8 =
1235     #if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
1236       vec_sro
1237     #else /* SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG */
1238       vec_slo
1239     #endif
1240         (a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, imm8 * 8)));
1241   #else
1242     SIMDE_VECTORIZE
1243     for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1244       const int e = HEDLEY_STATIC_CAST(int, i) + imm8;
1245       r_.i8[i] = (e < 16) ? a_.i8[e] : 0;
1246     }
1247   #endif
1248 
1249   return simde__m128i_from_private(r_);
1250 }
1251 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
1252   #define simde_mm_bsrli_si128(a, imm8) _mm_srli_si128(a, imm8)
1253 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__)
1254   #define simde_mm_bsrli_si128(a, imm8) \
1255   simde__m128i_from_neon_i8(((imm8 < 0) || (imm8 > 15)) ? vdupq_n_s8(0) : (vextq_s8(simde__m128i_to_private(a).neon_i8, vdupq_n_s8(0), ((imm8 & 15) != 0) ? imm8 : (imm8 & 15))))
1256 #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1257   #define simde_mm_bsrli_si128(a, imm8) (__extension__ ({ \
1258     const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
1259     const simde__m128i_private simde__tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
1260     simde__m128i_private simde__tmp_r_ = simde__m128i_to_private(a); \
1261     if (HEDLEY_UNLIKELY(imm8 > 15)) { \
1262       simde__tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
1263     } else { \
1264       simde__tmp_r_.i8 = \
1265       SIMDE_SHUFFLE_VECTOR_(8, 16, \
1266         simde__tmp_z_.i8, \
1267         (simde__tmp_a_).i8, \
1268         HEDLEY_STATIC_CAST(int8_t, (imm8 + 16) & 31), \
1269         HEDLEY_STATIC_CAST(int8_t, (imm8 + 17) & 31), \
1270         HEDLEY_STATIC_CAST(int8_t, (imm8 + 18) & 31), \
1271         HEDLEY_STATIC_CAST(int8_t, (imm8 + 19) & 31), \
1272         HEDLEY_STATIC_CAST(int8_t, (imm8 + 20) & 31), \
1273         HEDLEY_STATIC_CAST(int8_t, (imm8 + 21) & 31), \
1274         HEDLEY_STATIC_CAST(int8_t, (imm8 + 22) & 31), \
1275         HEDLEY_STATIC_CAST(int8_t, (imm8 + 23) & 31), \
1276         HEDLEY_STATIC_CAST(int8_t, (imm8 + 24) & 31), \
1277         HEDLEY_STATIC_CAST(int8_t, (imm8 + 25) & 31), \
1278         HEDLEY_STATIC_CAST(int8_t, (imm8 + 26) & 31), \
1279         HEDLEY_STATIC_CAST(int8_t, (imm8 + 27) & 31), \
1280         HEDLEY_STATIC_CAST(int8_t, (imm8 + 28) & 31), \
1281         HEDLEY_STATIC_CAST(int8_t, (imm8 + 29) & 31), \
1282         HEDLEY_STATIC_CAST(int8_t, (imm8 + 30) & 31), \
1283         HEDLEY_STATIC_CAST(int8_t, (imm8 + 31) & 31)); \
1284     } \
1285     simde__m128i_from_private(simde__tmp_r_); }))
1286 #endif
1287 #define simde_mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
1288 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1289   #define _mm_bsrli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
1290   #define _mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
1291 #endif
1292 
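/* Flushes the cache line containing p on x86; there is no portable
 * equivalent, so the fallback is a no-op. */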
1293 SIMDE_FUNCTION_ATTRIBUTES
1294 void
simde_mm_clflush (void const* p) {
1296   #if defined(SIMDE_X86_SSE2_NATIVE)
1297     _mm_clflush(p);
1298   #else
1299     (void) p;
1300   #endif
1301 }
1302 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_clflush(p) simde_mm_clflush(p)
1304 #endif
1305 
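/* The _mm_comi*_sd family compares only the low double of each argument and
 * returns 0 or 1. */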
1306 SIMDE_FUNCTION_ATTRIBUTES
1307 int
simde_mm_comieq_sd (simde__m128d a, simde__m128d b) {
1309   #if defined(SIMDE_X86_SSE2_NATIVE)
1310     return _mm_comieq_sd(a, b);
1311   #else
1312     simde__m128d_private
1313       a_ = simde__m128d_to_private(a),
1314       b_ = simde__m128d_to_private(b);
1315     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1316       return !!vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0);
1317     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1318       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) == wasm_f64x2_extract_lane(b_.wasm_v128, 0);
1319     #else
1320       return a_.f64[0] == b_.f64[0];
1321     #endif
1322   #endif
1323 }
1324 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1325   #define _mm_comieq_sd(a, b) simde_mm_comieq_sd(a, b)
1326 #endif
1327 
1328 SIMDE_FUNCTION_ATTRIBUTES
1329 int
simde_mm_comige_sd (simde__m128d a, simde__m128d b) {
1331   #if defined(SIMDE_X86_SSE2_NATIVE)
1332     return _mm_comige_sd(a, b);
1333   #else
1334     simde__m128d_private
1335       a_ = simde__m128d_to_private(a),
1336       b_ = simde__m128d_to_private(b);
1337     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1338       return !!vgetq_lane_u64(vcgeq_f64(a_.neon_f64, b_.neon_f64), 0);
1339     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1340       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) >= wasm_f64x2_extract_lane(b_.wasm_v128, 0);
1341     #else
1342       return a_.f64[0] >= b_.f64[0];
1343     #endif
1344   #endif
1345 }
1346 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1347   #define _mm_comige_sd(a, b) simde_mm_comige_sd(a, b)
1348 #endif
1349 
1350 SIMDE_FUNCTION_ATTRIBUTES
1351 int
1352 simde_mm_comigt_sd (simde__m128d a, simde__m128d b) {
1353   #if defined(SIMDE_X86_SSE2_NATIVE)
1354     return _mm_comigt_sd(a, b);
1355   #else
1356     simde__m128d_private
1357       a_ = simde__m128d_to_private(a),
1358       b_ = simde__m128d_to_private(b);
1359     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1360       return !!vgetq_lane_u64(vcgtq_f64(a_.neon_f64, b_.neon_f64), 0);
1361     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1362       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) > wasm_f64x2_extract_lane(b_.wasm_v128, 0);
1363     #else
1364       return a_.f64[0] > b_.f64[0];
1365     #endif
1366   #endif
1367 }
1368 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1369   #define _mm_comigt_sd(a, b) simde_mm_comigt_sd(a, b)
1370 #endif
1371 
1372 SIMDE_FUNCTION_ATTRIBUTES
1373 int
1374 simde_mm_comile_sd (simde__m128d a, simde__m128d b) {
1375   #if defined(SIMDE_X86_SSE2_NATIVE)
1376     return _mm_comile_sd(a, b);
1377   #else
1378     simde__m128d_private
1379       a_ = simde__m128d_to_private(a),
1380       b_ = simde__m128d_to_private(b);
1381     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1382       return !!vgetq_lane_u64(vcleq_f64(a_.neon_f64, b_.neon_f64), 0);
1383     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1384       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) <= wasm_f64x2_extract_lane(b_.wasm_v128, 0);
1385     #else
1386       return a_.f64[0] <= b_.f64[0];
1387     #endif
1388   #endif
1389 }
1390 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1391   #define _mm_comile_sd(a, b) simde_mm_comile_sd(a, b)
1392 #endif
1393 
1394 SIMDE_FUNCTION_ATTRIBUTES
1395 int
1396 simde_mm_comilt_sd (simde__m128d a, simde__m128d b) {
1397   #if defined(SIMDE_X86_SSE2_NATIVE)
1398     return _mm_comilt_sd(a, b);
1399   #else
1400     simde__m128d_private
1401       a_ = simde__m128d_to_private(a),
1402       b_ = simde__m128d_to_private(b);
1403     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1404       return !!vgetq_lane_u64(vcltq_f64(a_.neon_f64, b_.neon_f64), 0);
1405     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1406       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) < wasm_f64x2_extract_lane(b_.wasm_v128, 0);
1407     #else
1408       return a_.f64[0] < b_.f64[0];
1409     #endif
1410   #endif
1411 }
1412 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1413   #define _mm_comilt_sd(a, b) simde_mm_comilt_sd(a, b)
1414 #endif
1415 
1416 SIMDE_FUNCTION_ATTRIBUTES
1417 int
1418 simde_mm_comineq_sd (simde__m128d a, simde__m128d b) {
1419   #if defined(SIMDE_X86_SSE2_NATIVE)
1420     return _mm_comineq_sd(a, b);
1421   #else
1422     simde__m128d_private
1423       a_ = simde__m128d_to_private(a),
1424       b_ = simde__m128d_to_private(b);
1425     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1426       return !vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0);
1427     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1428       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) != wasm_f64x2_extract_lane(b_.wasm_v128, 0);
1429     #else
1430       return a_.f64[0] != b_.f64[0];
1431     #endif
1432   #endif
1433 }
1434 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1435   #define _mm_comineq_sd(a, b) simde_mm_comineq_sd(a, b)
1436 #endif
1437 
1438 SIMDE_FUNCTION_ATTRIBUTES
1439 simde__m128d
1440 simde_x_mm_copysign_pd(simde__m128d dest, simde__m128d src) {
1441   simde__m128d_private
1442     r_,
1443     dest_ = simde__m128d_to_private(dest),
1444     src_ = simde__m128d_to_private(src);
1445 
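  /* Every branch below implements copysign(): each lane of the result
   * takes its magnitude from dest and its sign bit from the
   * corresponding lane of src. */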
1446   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1447     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1448       uint64x2_t sign_pos = vreinterpretq_u64_f64(vdupq_n_f64(-SIMDE_FLOAT64_C(0.0)));
1449     #else
1450       simde_float64 dbl_nz = -SIMDE_FLOAT64_C(0.0);
1451       uint64_t u64_nz;
1452       simde_memcpy(&u64_nz, &dbl_nz, sizeof(u64_nz));
1453       uint64x2_t sign_pos = vdupq_n_u64(u64_nz);
1454     #endif
1455     r_.neon_u64 = vbslq_u64(sign_pos, src_.neon_u64, dest_.neon_u64);
1456   #elif defined(SIMDE_POWER_ALTIVEC_P9_NATIVE)
1457     #if !defined(HEDLEY_IBM_VERSION)
1458       r_.altivec_f64 = vec_cpsgn(dest_.altivec_f64, src_.altivec_f64);
1459     #else
1460       r_.altivec_f64 = vec_cpsgn(src_.altivec_f64, dest_.altivec_f64);
1461     #endif
1462   #elif defined(simde_math_copysign)
1463     SIMDE_VECTORIZE
1464     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1465       r_.f64[i] = simde_math_copysign(dest_.f64[i], src_.f64[i]);
1466     }
1467   #else
1468     simde__m128d sgnbit = simde_mm_set1_pd(-SIMDE_FLOAT64_C(0.0));
1469     return simde_mm_xor_pd(simde_mm_and_pd(sgnbit, src), simde_mm_andnot_pd(sgnbit, dest));
1470   #endif
1471 
1472   return simde__m128d_from_private(r_);
1473 }
1474 
1475 SIMDE_FUNCTION_ATTRIBUTES
1476 simde__m128d
1477 simde_x_mm_xorsign_pd(simde__m128d dest, simde__m128d src) {
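  /* xorsign: XOR src's sign bit into dest, i.e. negate each lane of dest
   * whose corresponding lane of src is negative (dest * sign(src) without
   * a multiply). */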
1478   return simde_mm_xor_pd(simde_mm_and_pd(simde_mm_set1_pd(-0.0), src), dest);
1479 }
1480 
1481 SIMDE_FUNCTION_ATTRIBUTES
1482 simde__m128
1483 simde_mm_castpd_ps (simde__m128d a) {
1484   #if defined(SIMDE_X86_SSE2_NATIVE)
1485     return _mm_castpd_ps(a);
1486   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1487     return vreinterpretq_f32_f64(a);
1488   #else
1489     simde__m128 r;
1490     simde_memcpy(&r, &a, sizeof(a));
1491     return r;
1492   #endif
1493 }
1494 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1495   #define _mm_castpd_ps(a) simde_mm_castpd_ps(a)
1496 #endif
1497 
1498 SIMDE_FUNCTION_ATTRIBUTES
1499 simde__m128i
1500 simde_mm_castpd_si128 (simde__m128d a) {
1501   #if defined(SIMDE_X86_SSE2_NATIVE)
1502     return _mm_castpd_si128(a);
1503   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1504     return vreinterpretq_s64_f64(a);
1505   #else
1506     simde__m128i r;
1507     simde_memcpy(&r, &a, sizeof(a));
1508     return r;
1509   #endif
1510 }
1511 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1512   #define _mm_castpd_si128(a) simde_mm_castpd_si128(a)
1513 #endif
1514 
1515 SIMDE_FUNCTION_ATTRIBUTES
1516 simde__m128d
1517 simde_mm_castps_pd (simde__m128 a) {
1518   #if defined(SIMDE_X86_SSE2_NATIVE)
1519     return _mm_castps_pd(a);
1520   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1521     return vreinterpretq_f64_f32(a);
1522   #else
1523     simde__m128d r;
1524     simde_memcpy(&r, &a, sizeof(a));
1525     return r;
1526   #endif
1527 }
1528 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1529   #define _mm_castps_pd(a) simde_mm_castps_pd(a)
1530 #endif
1531 
1532 SIMDE_FUNCTION_ATTRIBUTES
1533 simde__m128i
1534 simde_mm_castps_si128 (simde__m128 a) {
1535   #if defined(SIMDE_X86_SSE2_NATIVE)
1536     return _mm_castps_si128(a);
1537   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1538     return simde__m128i_from_neon_i32(simde__m128_to_private(a).neon_i32);
1539   #else
1540     simde__m128i r;
1541     simde_memcpy(&r, &a, sizeof(a));
1542     return r;
1543   #endif
1544 }
1545 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1546   #define _mm_castps_si128(a) simde_mm_castps_si128(a)
1547 #endif
1548 
1549 SIMDE_FUNCTION_ATTRIBUTES
1550 simde__m128d
1551 simde_mm_castsi128_pd (simde__m128i a) {
1552   #if defined(SIMDE_X86_SSE2_NATIVE)
1553     return _mm_castsi128_pd(a);
1554   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1555     return vreinterpretq_f64_s64(a);
1556   #else
1557     simde__m128d r;
1558     simde_memcpy(&r, &a, sizeof(a));
1559     return r;
1560   #endif
1561 }
1562 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1563   #define _mm_castsi128_pd(a) simde_mm_castsi128_pd(a)
1564 #endif
1565 
1566 SIMDE_FUNCTION_ATTRIBUTES
1567 simde__m128
1568 simde_mm_castsi128_ps (simde__m128i a) {
1569   #if defined(SIMDE_X86_SSE2_NATIVE)
1570     return _mm_castsi128_ps(a);
1571   #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1572     return HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), a);
1573   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1574     return simde__m128_from_neon_i32(simde__m128i_to_private(a).neon_i32);
1575   #else
1576     simde__m128 r;
1577     simde_memcpy(&r, &a, sizeof(a));
1578     return r;
1579   #endif
1580 }
1581 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1582   #define _mm_castsi128_ps(a) simde_mm_castsi128_ps(a)
1583 #endif
1584 
1585 SIMDE_FUNCTION_ATTRIBUTES
1586 simde__m128i
1587 simde_mm_cmpeq_epi8 (simde__m128i a, simde__m128i b) {
1588   #if defined(SIMDE_X86_SSE2_NATIVE)
1589     return _mm_cmpeq_epi8(a, b);
1590   #else
1591     simde__m128i_private
1592       r_,
1593       a_ = simde__m128i_to_private(a),
1594       b_ = simde__m128i_to_private(b);
1595 
1596     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1597       r_.neon_u8 = vceqq_s8(b_.neon_i8, a_.neon_i8);
1598     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1599       r_.wasm_v128 = wasm_i8x16_eq(a_.wasm_v128, b_.wasm_v128);
1600     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1601       r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmpeq(a_.altivec_i8, b_.altivec_i8));
1602     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1603       r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 == b_.i8));
1604     #else
1605       SIMDE_VECTORIZE
1606       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1607         r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
1608       }
1609     #endif
1610 
1611     return simde__m128i_from_private(r_);
1612   #endif
1613 }
1614 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1615   #define _mm_cmpeq_epi8(a, b) simde_mm_cmpeq_epi8(a, b)
1616 #endif
1617 
1618 SIMDE_FUNCTION_ATTRIBUTES
1619 simde__m128i
1620 simde_mm_cmpeq_epi16 (simde__m128i a, simde__m128i b) {
1621   #if defined(SIMDE_X86_SSE2_NATIVE)
1622     return _mm_cmpeq_epi16(a, b);
1623   #else
1624     simde__m128i_private
1625       r_,
1626       a_ = simde__m128i_to_private(a),
1627       b_ = simde__m128i_to_private(b);
1628 
1629     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1630       r_.neon_u16 = vceqq_s16(b_.neon_i16, a_.neon_i16);
1631     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1632       r_.wasm_v128 = wasm_i16x8_eq(a_.wasm_v128, b_.wasm_v128);
1633     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1634       r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmpeq(a_.altivec_i16, b_.altivec_i16));
1635     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1636       r_.i16 = HEDLEY_STATIC_CAST(__typeof__(r_.i16), (a_.i16 == b_.i16));
1637     #else
1638       SIMDE_VECTORIZE
1639       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1640         r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
1641       }
1642     #endif
1643 
1644     return simde__m128i_from_private(r_);
1645   #endif
1646 }
1647 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1648   #define _mm_cmpeq_epi16(a, b) simde_mm_cmpeq_epi16(a, b)
1649 #endif
1650 
1651 SIMDE_FUNCTION_ATTRIBUTES
1652 simde__m128i
1653 simde_mm_cmpeq_epi32 (simde__m128i a, simde__m128i b) {
1654   #if defined(SIMDE_X86_SSE2_NATIVE)
1655     return _mm_cmpeq_epi32(a, b);
1656   #else
1657     simde__m128i_private
1658       r_,
1659       a_ = simde__m128i_to_private(a),
1660       b_ = simde__m128i_to_private(b);
1661 
1662     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1663       r_.neon_u32 = vceqq_s32(b_.neon_i32, a_.neon_i32);
1664     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1665       r_.wasm_v128 = wasm_i32x4_eq(a_.wasm_v128, b_.wasm_v128);
1666     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1667       r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmpeq(a_.altivec_i32, b_.altivec_i32));
1668     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1669       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), a_.i32 == b_.i32);
1670     #else
1671       SIMDE_VECTORIZE
1672       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1673         r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
1674       }
1675     #endif
1676 
1677     return simde__m128i_from_private(r_);
1678   #endif
1679 }
1680 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1681   #define _mm_cmpeq_epi32(a, b) simde_mm_cmpeq_epi32(a, b)
1682 #endif
1683 
1684 SIMDE_FUNCTION_ATTRIBUTES
1685 simde__m128d
1686 simde_mm_cmpeq_pd (simde__m128d a, simde__m128d b) {
1687   #if defined(SIMDE_X86_SSE2_NATIVE)
1688     return _mm_cmpeq_pd(a, b);
1689   #else
1690     simde__m128d_private
1691       r_,
1692       a_ = simde__m128d_to_private(a),
1693       b_ = simde__m128d_to_private(b);
1694 
1695     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1696       r_.neon_u64 = vceqq_f64(b_.neon_f64, a_.neon_f64);
1697     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1698       r_.wasm_v128 = wasm_f64x2_eq(a_.wasm_v128, b_.wasm_v128);
1699     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
1700       r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpeq(a_.altivec_f64, b_.altivec_f64));
1701     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1702       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 == b_.f64));
1703     #else
1704       SIMDE_VECTORIZE
1705       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1706         r_.u64[i] = (a_.f64[i] == b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1707       }
1708     #endif
1709 
1710     return simde__m128d_from_private(r_);
1711   #endif
1712 }
1713 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1714   #define _mm_cmpeq_pd(a, b) simde_mm_cmpeq_pd(a, b)
1715 #endif
1716 
1717 SIMDE_FUNCTION_ATTRIBUTES
1718 simde__m128d
1719 simde_mm_cmpeq_sd (simde__m128d a, simde__m128d b) {
1720   #if defined(SIMDE_X86_SSE2_NATIVE)
1721     return _mm_cmpeq_sd(a, b);
1722   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1723     return simde_mm_move_sd(a, simde_mm_cmpeq_pd(a, b));
1724   #else
1725     simde__m128d_private
1726       r_,
1727       a_ = simde__m128d_to_private(a),
1728       b_ = simde__m128d_to_private(b);
1729 
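    /* Scalar fallback: only lane 0 is compared; lane 1 is passed through
     * unchanged from a, matching the *_sd semantics. */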
1730     r_.u64[0] = (a_.f64[0] == b_.f64[0]) ? ~UINT64_C(0) : 0;
1731     r_.u64[1] = a_.u64[1];
1732 
1733     return simde__m128d_from_private(r_);
1734   #endif
1735 }
1736 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1737   #define _mm_cmpeq_sd(a, b) simde_mm_cmpeq_sd(a, b)
1738 #endif
1739 
1740 SIMDE_FUNCTION_ATTRIBUTES
1741 simde__m128d
1742 simde_mm_cmpneq_pd (simde__m128d a, simde__m128d b) {
1743   #if defined(SIMDE_X86_SSE2_NATIVE)
1744     return _mm_cmpneq_pd(a, b);
1745   #else
1746     simde__m128d_private
1747       r_,
1748       a_ = simde__m128d_to_private(a),
1749       b_ = simde__m128d_to_private(b);
1750 
1751     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1752       r_.neon_u32 = vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(b_.neon_f64, a_.neon_f64)));
1753     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1754       r_.wasm_v128 = wasm_f64x2_ne(a_.wasm_v128, b_.wasm_v128);
1755     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1756       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 != b_.f64));
1757     #else
1758       SIMDE_VECTORIZE
1759       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1760         r_.u64[i] = (a_.f64[i] != b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1761       }
1762     #endif
1763 
1764     return simde__m128d_from_private(r_);
1765   #endif
1766 }
1767 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1768   #define _mm_cmpneq_pd(a, b) simde_mm_cmpneq_pd(a, b)
1769 #endif
1770 
1771 SIMDE_FUNCTION_ATTRIBUTES
1772 simde__m128d
1773 simde_mm_cmpneq_sd (simde__m128d a, simde__m128d b) {
1774   #if defined(SIMDE_X86_SSE2_NATIVE)
1775     return _mm_cmpneq_sd(a, b);
1776   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1777     return simde_mm_move_sd(a, simde_mm_cmpneq_pd(a, b));
1778   #else
1779     simde__m128d_private
1780       r_,
1781       a_ = simde__m128d_to_private(a),
1782       b_ = simde__m128d_to_private(b);
1783 
1784     r_.u64[0] = (a_.f64[0] != b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1785     r_.u64[1] = a_.u64[1];
1786 
1787 
1788     return simde__m128d_from_private(r_);
1789   #endif
1790 }
1791 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1792   #define _mm_cmpneq_sd(a, b) simde_mm_cmpneq_sd(a, b)
1793 #endif
1794 
1795 SIMDE_FUNCTION_ATTRIBUTES
1796 simde__m128i
1797 simde_mm_cmplt_epi8 (simde__m128i a, simde__m128i b) {
1798   #if defined(SIMDE_X86_SSE2_NATIVE)
1799     return _mm_cmplt_epi8(a, b);
1800   #else
1801     simde__m128i_private
1802       r_,
1803       a_ = simde__m128i_to_private(a),
1804       b_ = simde__m128i_to_private(b);
1805 
1806     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1807       r_.neon_u8 = vcltq_s8(a_.neon_i8, b_.neon_i8);
1808     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1809       r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char),vec_cmplt(a_.altivec_i8, b_.altivec_i8));
1810     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1811       r_.wasm_v128 = wasm_i8x16_lt(a_.wasm_v128, b_.wasm_v128);
1812     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1813       r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 < b_.i8));
1814     #else
1815       SIMDE_VECTORIZE
1816       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1817         r_.i8[i] = (a_.i8[i] < b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
1818       }
1819     #endif
1820 
1821     return simde__m128i_from_private(r_);
1822   #endif
1823 }
1824 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1825   #define _mm_cmplt_epi8(a, b) simde_mm_cmplt_epi8(a, b)
1826 #endif
1827 
1828 SIMDE_FUNCTION_ATTRIBUTES
1829 simde__m128i
1830 simde_mm_cmplt_epi16 (simde__m128i a, simde__m128i b) {
1831   #if defined(SIMDE_X86_SSE2_NATIVE)
1832     return _mm_cmplt_epi16(a, b);
1833   #else
1834     simde__m128i_private
1835       r_,
1836       a_ = simde__m128i_to_private(a),
1837       b_ = simde__m128i_to_private(b);
1838 
1839     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1840       r_.neon_u16 = vcltq_s16(a_.neon_i16, b_.neon_i16);
1841     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1842       r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmplt(a_.altivec_i16, b_.altivec_i16));
1843     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1844       r_.wasm_v128 = wasm_i16x8_lt(a_.wasm_v128, b_.wasm_v128);
1845     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1846       r_.i16 = HEDLEY_STATIC_CAST(__typeof__(r_.i16), (a_.i16 < b_.i16));
1847     #else
1848       SIMDE_VECTORIZE
1849       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1850         r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
1851       }
1852     #endif
1853 
1854     return simde__m128i_from_private(r_);
1855   #endif
1856 }
1857 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1858   #define _mm_cmplt_epi16(a, b) simde_mm_cmplt_epi16(a, b)
1859 #endif
1860 
1861 SIMDE_FUNCTION_ATTRIBUTES
1862 simde__m128i
1863 simde_mm_cmplt_epi32 (simde__m128i a, simde__m128i b) {
1864   #if defined(SIMDE_X86_SSE2_NATIVE)
1865     return _mm_cmplt_epi32(a, b);
1866   #else
1867     simde__m128i_private
1868       r_,
1869       a_ = simde__m128i_to_private(a),
1870       b_ = simde__m128i_to_private(b);
1871 
1872     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1873       r_.neon_u32 = vcltq_s32(a_.neon_i32, b_.neon_i32);
1874     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1875       r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmplt(a_.altivec_i32, b_.altivec_i32));
1876     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1877       r_.wasm_v128 = wasm_i32x4_lt(a_.wasm_v128, b_.wasm_v128);
1878     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1879       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.i32 < b_.i32));
1880     #else
1881       SIMDE_VECTORIZE
1882       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1883         r_.i32[i] = (a_.i32[i] < b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
1884       }
1885     #endif
1886 
1887     return simde__m128i_from_private(r_);
1888   #endif
1889 }
1890 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1891   #define _mm_cmplt_epi32(a, b) simde_mm_cmplt_epi32(a, b)
1892 #endif
1893 
1894 SIMDE_FUNCTION_ATTRIBUTES
1895 simde__m128d
1896 simde_mm_cmplt_pd (simde__m128d a, simde__m128d b) {
1897   #if defined(SIMDE_X86_SSE2_NATIVE)
1898     return _mm_cmplt_pd(a, b);
1899   #else
1900     simde__m128d_private
1901       r_,
1902       a_ = simde__m128d_to_private(a),
1903       b_ = simde__m128d_to_private(b);
1904 
1905     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1906       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 < b_.f64));
1907     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1908       r_.neon_u64 = vcltq_f64(a_.neon_f64, b_.neon_f64);
1909     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1910       r_.wasm_v128 = wasm_f64x2_lt(a_.wasm_v128, b_.wasm_v128);
1911     #else
1912       SIMDE_VECTORIZE
1913       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1914         r_.u64[i] = (a_.f64[i] < b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1915       }
1916     #endif
1917 
1918     return simde__m128d_from_private(r_);
1919   #endif
1920 }
1921 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1922   #define _mm_cmplt_pd(a, b) simde_mm_cmplt_pd(a, b)
1923 #endif
1924 
1925 SIMDE_FUNCTION_ATTRIBUTES
1926 simde__m128d
1927 simde_mm_cmplt_sd (simde__m128d a, simde__m128d b) {
1928   #if defined(SIMDE_X86_SSE2_NATIVE)
1929     return _mm_cmplt_sd(a, b);
1930   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1931     return simde_mm_move_sd(a, simde_mm_cmplt_pd(a, b));
1932   #else
1933     simde__m128d_private
1934       r_,
1935       a_ = simde__m128d_to_private(a),
1936       b_ = simde__m128d_to_private(b);
1937 
1938     r_.u64[0] = (a_.f64[0] < b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1939     r_.u64[1] = a_.u64[1];
1940 
1941     return simde__m128d_from_private(r_);
1942   #endif
1943 }
1944 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1945   #define _mm_cmplt_sd(a, b) simde_mm_cmplt_sd(a, b)
1946 #endif
1947 
1948 SIMDE_FUNCTION_ATTRIBUTES
1949 simde__m128d
1950 simde_mm_cmple_pd (simde__m128d a, simde__m128d b) {
1951   #if defined(SIMDE_X86_SSE2_NATIVE)
1952     return _mm_cmple_pd(a, b);
1953   #else
1954     simde__m128d_private
1955       r_,
1956       a_ = simde__m128d_to_private(a),
1957       b_ = simde__m128d_to_private(b);
1958 
1959     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1960       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 <= b_.f64));
1961     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1962       r_.neon_u64 = vcleq_f64(a_.neon_f64, b_.neon_f64);
1963     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1964       r_.wasm_v128 = wasm_f64x2_le(a_.wasm_v128, b_.wasm_v128);
1965     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1966       r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmple(a_.altivec_f64, b_.altivec_f64));
1967     #else
1968       SIMDE_VECTORIZE
1969       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1970         r_.u64[i] = (a_.f64[i] <= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1971       }
1972     #endif
1973 
1974     return simde__m128d_from_private(r_);
1975   #endif
1976 }
1977 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1978   #define _mm_cmple_pd(a, b) simde_mm_cmple_pd(a, b)
1979 #endif
1980 
1981 SIMDE_FUNCTION_ATTRIBUTES
1982 simde__m128d
1983 simde_mm_cmple_sd (simde__m128d a, simde__m128d b) {
1984   #if defined(SIMDE_X86_SSE2_NATIVE)
1985     return _mm_cmple_sd(a, b);
1986   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1987     return simde_mm_move_sd(a, simde_mm_cmple_pd(a, b));
1988   #else
1989     simde__m128d_private
1990       r_,
1991       a_ = simde__m128d_to_private(a),
1992       b_ = simde__m128d_to_private(b);
1993 
1994     r_.u64[0] = (a_.f64[0] <= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1995     r_.u64[1] = a_.u64[1];
1996 
1997     return simde__m128d_from_private(r_);
1998   #endif
1999 }
2000 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2001   #define _mm_cmple_sd(a, b) simde_mm_cmple_sd(a, b)
2002 #endif
2003 
2004 SIMDE_FUNCTION_ATTRIBUTES
2005 simde__m128i
2006 simde_mm_cmpgt_epi8 (simde__m128i a, simde__m128i b) {
2007   #if defined(SIMDE_X86_SSE2_NATIVE)
2008     return _mm_cmpgt_epi8(a, b);
2009   #else
2010     simde__m128i_private
2011       r_,
2012       a_ = simde__m128i_to_private(a),
2013       b_ = simde__m128i_to_private(b);
2014 
2015     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2016       r_.neon_u8 = vcgtq_s8(a_.neon_i8, b_.neon_i8);
2017     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2018       r_.wasm_v128 = wasm_i8x16_gt(a_.wasm_v128, b_.wasm_v128);
2019     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2020       r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmpgt(a_.altivec_i8, b_.altivec_i8));
2021     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2022       r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 > b_.i8));
2023     #else
2024       SIMDE_VECTORIZE
2025       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
2026         r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
2027       }
2028     #endif
2029 
2030     return simde__m128i_from_private(r_);
2031   #endif
2032 }
2033 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2034   #define _mm_cmpgt_epi8(a, b) simde_mm_cmpgt_epi8(a, b)
2035 #endif
2036 
2037 SIMDE_FUNCTION_ATTRIBUTES
2038 simde__m128i
2039 simde_mm_cmpgt_epi16 (simde__m128i a, simde__m128i b) {
2040   #if defined(SIMDE_X86_SSE2_NATIVE)
2041     return _mm_cmpgt_epi16(a, b);
2042   #else
2043     simde__m128i_private
2044       r_,
2045       a_ = simde__m128i_to_private(a),
2046       b_ = simde__m128i_to_private(b);
2047 
2048     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2049       r_.neon_u16 = vcgtq_s16(a_.neon_i16, b_.neon_i16);
2050     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2051       r_.wasm_v128 = wasm_i16x8_gt(a_.wasm_v128, b_.wasm_v128);
2052     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2053       r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmpgt(a_.altivec_i16, b_.altivec_i16));
2054     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2055       r_.i16 = HEDLEY_STATIC_CAST(__typeof__(r_.i16), (a_.i16 > b_.i16));
2056     #else
2057       SIMDE_VECTORIZE
2058       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
2059         r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
2060       }
2061     #endif
2062 
2063     return simde__m128i_from_private(r_);
2064   #endif
2065 }
2066 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2067   #define _mm_cmpgt_epi16(a, b) simde_mm_cmpgt_epi16(a, b)
2068 #endif
2069 
2070 SIMDE_FUNCTION_ATTRIBUTES
2071 simde__m128i
2072 simde_mm_cmpgt_epi32 (simde__m128i a, simde__m128i b) {
2073   #if defined(SIMDE_X86_SSE2_NATIVE)
2074     return _mm_cmpgt_epi32(a, b);
2075   #else
2076     simde__m128i_private
2077       r_,
2078       a_ = simde__m128i_to_private(a),
2079       b_ = simde__m128i_to_private(b);
2080 
2081     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2082       r_.neon_u32 = vcgtq_s32(a_.neon_i32, b_.neon_i32);
2083     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2084       r_.wasm_v128 = wasm_i32x4_gt(a_.wasm_v128, b_.wasm_v128);
2085     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2086       r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmpgt(a_.altivec_i32, b_.altivec_i32));
2087     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2088       r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.i32 > b_.i32));
2089     #else
2090       SIMDE_VECTORIZE
2091       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
2092         r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
2093       }
2094     #endif
2095 
2096     return simde__m128i_from_private(r_);
2097   #endif
2098 }
2099 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2100   #define _mm_cmpgt_epi32(a, b) simde_mm_cmpgt_epi32(a, b)
2101 #endif
2102 
2103 SIMDE_FUNCTION_ATTRIBUTES
2104 simde__m128d
2105 simde_mm_cmpgt_pd (simde__m128d a, simde__m128d b) {
2106   #if defined(SIMDE_X86_SSE2_NATIVE)
2107     return _mm_cmpgt_pd(a, b);
2108   #else
2109     simde__m128d_private
2110       r_,
2111       a_ = simde__m128d_to_private(a),
2112       b_ = simde__m128d_to_private(b);
2113 
2114     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2115       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 > b_.f64));
2116     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2117       r_.neon_u64 = vcgtq_f64(a_.neon_f64, b_.neon_f64);
2118     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2119       r_.wasm_v128 = wasm_f64x2_gt(a_.wasm_v128, b_.wasm_v128);
2120     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2121       r_.altivec_f64 = HEDLEY_STATIC_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpgt(a_.altivec_f64, b_.altivec_f64));
2122     #else
2123       SIMDE_VECTORIZE
2124       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2125         r_.u64[i] = (a_.f64[i] > b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
2126       }
2127     #endif
2128 
2129     return simde__m128d_from_private(r_);
2130   #endif
2131 }
2132 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2133   #define _mm_cmpgt_pd(a, b) simde_mm_cmpgt_pd(a, b)
2134 #endif
2135 
2136 SIMDE_FUNCTION_ATTRIBUTES
2137 simde__m128d
2138 simde_mm_cmpgt_sd (simde__m128d a, simde__m128d b) {
2139   #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
2140     return _mm_cmpgt_sd(a, b);
2141   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
2142     return simde_mm_move_sd(a, simde_mm_cmpgt_pd(a, b));
2143   #else
2144     simde__m128d_private
2145       r_,
2146       a_ = simde__m128d_to_private(a),
2147       b_ = simde__m128d_to_private(b);
2148 
2149     r_.u64[0] = (a_.f64[0] > b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
2150     r_.u64[1] = a_.u64[1];
2151 
2152     return simde__m128d_from_private(r_);
2153   #endif
2154 }
2155 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2156   #define _mm_cmpgt_sd(a, b) simde_mm_cmpgt_sd(a, b)
2157 #endif
2158 
2159 SIMDE_FUNCTION_ATTRIBUTES
2160 simde__m128d
2161 simde_mm_cmpge_pd (simde__m128d a, simde__m128d b) {
2162   #if defined(SIMDE_X86_SSE2_NATIVE)
2163     return _mm_cmpge_pd(a, b);
2164   #else
2165     simde__m128d_private
2166       r_,
2167       a_ = simde__m128d_to_private(a),
2168       b_ = simde__m128d_to_private(b);
2169 
2170     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2171       r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 >= b_.f64));
2172     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2173       r_.neon_u64 = vcgeq_f64(a_.neon_f64, b_.neon_f64);
2174     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2175       r_.wasm_v128 = wasm_f64x2_ge(a_.wasm_v128, b_.wasm_v128);
2176     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2177       r_.altivec_f64 = HEDLEY_STATIC_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpge(a_.altivec_f64, b_.altivec_f64));
2178     #else
2179       SIMDE_VECTORIZE
2180       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2181         r_.u64[i] = (a_.f64[i] >= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
2182       }
2183     #endif
2184 
2185     return simde__m128d_from_private(r_);
2186   #endif
2187 }
2188 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2189   #define _mm_cmpge_pd(a, b) simde_mm_cmpge_pd(a, b)
2190 #endif
2191 
2192 SIMDE_FUNCTION_ATTRIBUTES
2193 simde__m128d
2194 simde_mm_cmpge_sd (simde__m128d a, simde__m128d b) {
2195   #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
2196     return _mm_cmpge_sd(a, b);
2197   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
2198     return simde_mm_move_sd(a, simde_mm_cmpge_pd(a, b));
2199   #else
2200     simde__m128d_private
2201       r_,
2202       a_ = simde__m128d_to_private(a),
2203       b_ = simde__m128d_to_private(b);
2204 
2205     r_.u64[0] = (a_.f64[0] >= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
2206     r_.u64[1] = a_.u64[1];
2207 
2208     return simde__m128d_from_private(r_);
2209   #endif
2210 }
2211 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2212   #define _mm_cmpge_sd(a, b) simde_mm_cmpge_sd(a, b)
2213 #endif
2214 
2215 SIMDE_FUNCTION_ATTRIBUTES
2216 simde__m128d
2217 simde_mm_cmpngt_pd (simde__m128d a, simde__m128d b) {
2218   #if defined(SIMDE_X86_SSE2_NATIVE)
2219     return _mm_cmpngt_pd(a, b);
2220   #else
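    /* Approximation: the native "not greater than" predicate is true for
     * unordered (NaN) inputs, while cmple is false for them, so the
     * fallback differs only when a lane contains NaN. */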
2221     return simde_mm_cmple_pd(a, b);
2222   #endif
2223 }
2224 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2225   #define _mm_cmpngt_pd(a, b) simde_mm_cmpngt_pd(a, b)
2226 #endif
2227 
2228 SIMDE_FUNCTION_ATTRIBUTES
2229 simde__m128d
2230 simde_mm_cmpngt_sd (simde__m128d a, simde__m128d b) {
2231   #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
2232     return _mm_cmpngt_sd(a, b);
2233   #else
2234     return simde_mm_cmple_sd(a, b);
2235   #endif
2236 }
2237 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2238   #define _mm_cmpngt_sd(a, b) simde_mm_cmpngt_sd(a, b)
2239 #endif
2240 
2241 SIMDE_FUNCTION_ATTRIBUTES
2242 simde__m128d
2243 simde_mm_cmpnge_pd (simde__m128d a, simde__m128d b) {
2244   #if defined(SIMDE_X86_SSE2_NATIVE)
2245     return _mm_cmpnge_pd(a, b);
2246   #else
2247     return simde_mm_cmplt_pd(a, b);
2248   #endif
2249 }
2250 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2251   #define _mm_cmpnge_pd(a, b) simde_mm_cmpnge_pd(a, b)
2252 #endif
2253 
2254 SIMDE_FUNCTION_ATTRIBUTES
2255 simde__m128d
2256 simde_mm_cmpnge_sd (simde__m128d a, simde__m128d b) {
2257   #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
2258     return _mm_cmpnge_sd(a, b);
2259   #else
2260     return simde_mm_cmplt_sd(a, b);
2261   #endif
2262 }
2263 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2264   #define _mm_cmpnge_sd(a, b) simde_mm_cmpnge_sd(a, b)
2265 #endif
2266 
2267 SIMDE_FUNCTION_ATTRIBUTES
2268 simde__m128d
2269 simde_mm_cmpnlt_pd (simde__m128d a, simde__m128d b) {
2270   #if defined(SIMDE_X86_SSE2_NATIVE)
2271     return _mm_cmpnlt_pd(a, b);
2272   #else
2273     return simde_mm_cmpge_pd(a, b);
2274   #endif
2275 }
2276 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2277   #define _mm_cmpnlt_pd(a, b) simde_mm_cmpnlt_pd(a, b)
2278 #endif
2279 
2280 SIMDE_FUNCTION_ATTRIBUTES
2281 simde__m128d
2282 simde_mm_cmpnlt_sd (simde__m128d a, simde__m128d b) {
2283   #if defined(SIMDE_X86_SSE2_NATIVE)
2284     return _mm_cmpnlt_sd(a, b);
2285   #else
2286     return simde_mm_cmpge_sd(a, b);
2287   #endif
2288 }
2289 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2290   #define _mm_cmpnlt_sd(a, b) simde_mm_cmpnlt_sd(a, b)
2291 #endif
2292 
2293 SIMDE_FUNCTION_ATTRIBUTES
2294 simde__m128d
2295 simde_mm_cmpnle_pd (simde__m128d a, simde__m128d b) {
2296   #if defined(SIMDE_X86_SSE2_NATIVE)
2297     return _mm_cmpnle_pd(a, b);
2298   #else
2299     return simde_mm_cmpgt_pd(a, b);
2300   #endif
2301 }
2302 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2303   #define _mm_cmpnle_pd(a, b) simde_mm_cmpnle_pd(a, b)
2304 #endif
2305 
2306 SIMDE_FUNCTION_ATTRIBUTES
2307 simde__m128d
2308 simde_mm_cmpnle_sd (simde__m128d a, simde__m128d b) {
2309   #if defined(SIMDE_X86_SSE2_NATIVE)
2310     return _mm_cmpnle_sd(a, b);
2311   #else
2312     return simde_mm_cmpgt_sd(a, b);
2313   #endif
2314 }
2315 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2316   #define _mm_cmpnle_sd(a, b) simde_mm_cmpnle_sd(a, b)
2317 #endif
2318 
2319 SIMDE_FUNCTION_ATTRIBUTES
2320 simde__m128d
2321 simde_mm_cmpord_pd (simde__m128d a, simde__m128d b) {
2322   #if defined(SIMDE_X86_SSE2_NATIVE)
2323     return _mm_cmpord_pd(a, b);
2324   #else
2325     simde__m128d_private
2326       r_,
2327       a_ = simde__m128d_to_private(a),
2328       b_ = simde__m128d_to_private(b);
2329 
2330     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2331       /* Note: NEON has no ordered-compare builtin, so we compare
2332          a == a and b == b to detect NaN lanes and then AND the
2333          results to produce the final mask. */
2334       uint64x2_t ceqaa = vceqq_f64(a_.neon_f64, a_.neon_f64);
2335       uint64x2_t ceqbb = vceqq_f64(b_.neon_f64, b_.neon_f64);
2336       r_.neon_u64 = vandq_u64(ceqaa, ceqbb);
2337     #elif defined(simde_math_isnan)
2338       SIMDE_VECTORIZE
2339       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2340         r_.u64[i] = (!simde_math_isnan(a_.f64[i]) && !simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0);
2341       }
2342     #else
2343       HEDLEY_UNREACHABLE();
2344     #endif
2345 
2346     return simde__m128d_from_private(r_);
2347   #endif
2348 }
2349 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2350   #define _mm_cmpord_pd(a, b) simde_mm_cmpord_pd(a, b)
2351 #endif
2352 
2353 SIMDE_FUNCTION_ATTRIBUTES
2354 simde_float64
2355 simde_mm_cvtsd_f64 (simde__m128d a) {
2356   #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
2357     return _mm_cvtsd_f64(a);
2358   #else
2359     simde__m128d_private a_ = simde__m128d_to_private(a);
2360     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2361       return HEDLEY_STATIC_CAST(simde_float64, vgetq_lane_f64(a_.neon_f64, 0));
2362     #else
2363       return a_.f64[0];
2364     #endif
2365   #endif
2366 }
2367 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2368   #define _mm_cvtsd_f64(a) simde_mm_cvtsd_f64(a)
2369 #endif
2370 
2371 SIMDE_FUNCTION_ATTRIBUTES
2372 simde__m128d
2373 simde_mm_cmpord_sd (simde__m128d a, simde__m128d b) {
2374   #if defined(SIMDE_X86_SSE2_NATIVE)
2375     return _mm_cmpord_sd(a, b);
2376   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
2377     return simde_mm_move_sd(a, simde_mm_cmpord_pd(a, b));
2378   #else
2379     simde__m128d_private
2380       r_,
2381       a_ = simde__m128d_to_private(a),
2382       b_ = simde__m128d_to_private(b);
2383 
2384     #if defined(simde_math_isnan)
2385       r_.u64[0] = (!simde_math_isnan(a_.f64[0]) && !simde_math_isnan(b_.f64[0])) ? ~UINT64_C(0) : UINT64_C(0);
2386       r_.u64[1] = a_.u64[1];
2387     #else
2388       HEDLEY_UNREACHABLE();
2389     #endif
2390 
2391     return simde__m128d_from_private(r_);
2392   #endif
2393 }
2394 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2395   #define _mm_cmpord_sd(a, b) simde_mm_cmpord_sd(a, b)
2396 #endif
2397 
2398 SIMDE_FUNCTION_ATTRIBUTES
2399 simde__m128d
2400 simde_mm_cmpunord_pd (simde__m128d a, simde__m128d b) {
2401   #if defined(SIMDE_X86_SSE2_NATIVE)
2402     return _mm_cmpunord_pd(a, b);
2403   #else
2404     simde__m128d_private
2405       r_,
2406       a_ = simde__m128d_to_private(a),
2407       b_ = simde__m128d_to_private(b);
2408 
2409     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
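      /* Unordered means at least one operand is NaN: compute the ordered
       * mask (a == a AND b == b) and invert it. */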
2410       uint64x2_t ceqaa = vceqq_f64(a_.neon_f64, a_.neon_f64);
2411       uint64x2_t ceqbb = vceqq_f64(b_.neon_f64, b_.neon_f64);
2412       r_.neon_u64 = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(ceqaa, ceqbb))));
2413     #elif defined(simde_math_isnan)
2414       SIMDE_VECTORIZE
2415       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2416         r_.u64[i] = (simde_math_isnan(a_.f64[i]) || simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0);
2417       }
2418     #else
2419       HEDLEY_UNREACHABLE();
2420     #endif
2421 
2422     return simde__m128d_from_private(r_);
2423   #endif
2424 }
2425 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2426   #define _mm_cmpunord_pd(a, b) simde_mm_cmpunord_pd(a, b)
2427 #endif
2428 
2429 SIMDE_FUNCTION_ATTRIBUTES
2430 simde__m128d
2431 simde_mm_cmpunord_sd (simde__m128d a, simde__m128d b) {
2432   #if defined(SIMDE_X86_SSE2_NATIVE)
2433     return _mm_cmpunord_sd(a, b);
2434   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
2435     return simde_mm_move_sd(a, simde_mm_cmpunord_pd(a, b));
2436   #else
2437     simde__m128d_private
2438       r_,
2439       a_ = simde__m128d_to_private(a),
2440       b_ = simde__m128d_to_private(b);
2441 
2442     #if defined(simde_math_isnan)
2443       r_.u64[0] = (simde_math_isnan(a_.f64[0]) || simde_math_isnan(b_.f64[0])) ? ~UINT64_C(0) : UINT64_C(0);
2444       r_.u64[1] = a_.u64[1];
2445     #else
2446       HEDLEY_UNREACHABLE();
2447     #endif
2448 
2449     return simde__m128d_from_private(r_);
2450   #endif
2451 }
2452 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2453   #define _mm_cmpunord_sd(a, b) simde_mm_cmpunord_sd(a, b)
2454 #endif
2455 
2456 SIMDE_FUNCTION_ATTRIBUTES
2457 simde__m128d
2458 simde_mm_cvtepi32_pd (simde__m128i a) {
2459   #if defined(SIMDE_X86_SSE2_NATIVE)
2460     return _mm_cvtepi32_pd(a);
2461   #else
2462     simde__m128d_private r_;
2463     simde__m128i_private a_ = simde__m128i_to_private(a);
2464 
2465     #if defined(SIMDE_CONVERT_VECTOR_)
2466       SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].i32);
2467     #else
2468       SIMDE_VECTORIZE
2469       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2470         r_.f64[i] = (simde_float64) a_.i32[i];
2471       }
2472     #endif
2473 
2474     return simde__m128d_from_private(r_);
2475   #endif
2476 }
2477 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2478   #define _mm_cvtepi32_pd(a) simde_mm_cvtepi32_pd(a)
2479 #endif
2480 
2481 SIMDE_FUNCTION_ATTRIBUTES
2482 simde__m128
2483 simde_mm_cvtepi32_ps (simde__m128i a) {
2484   #if defined(SIMDE_X86_SSE2_NATIVE)
2485     return _mm_cvtepi32_ps(a);
2486   #else
2487     simde__m128_private r_;
2488     simde__m128i_private a_ = simde__m128i_to_private(a);
2489 
2490     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2491       r_.neon_f32 = vcvtq_f32_s32(a_.neon_i32);
2492     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2493       r_.wasm_v128 = wasm_f32x4_convert_i32x4(a_.wasm_v128);
2494     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2495       HEDLEY_DIAGNOSTIC_PUSH
2496       #if HEDLEY_HAS_WARNING("-Wc11-extensions")
2497         #pragma clang diagnostic ignored "-Wc11-extensions"
2498       #endif
2499       r_.altivec_f32 = vec_ctf(a_.altivec_i32, 0);
2500       HEDLEY_DIAGNOSTIC_POP
2501     #elif defined(SIMDE_CONVERT_VECTOR_)
2502       SIMDE_CONVERT_VECTOR_(r_.f32, a_.i32);
2503     #else
2504       SIMDE_VECTORIZE
2505       for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
2506         r_.f32[i] = (simde_float32) a_.i32[i];
2507       }
2508     #endif
2509 
2510     return simde__m128_from_private(r_);
2511   #endif
2512 }
2513 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2514   #define _mm_cvtepi32_ps(a) simde_mm_cvtepi32_ps(a)
2515 #endif
2516 
2517 SIMDE_FUNCTION_ATTRIBUTES
2518 simde__m64
2519 simde_mm_cvtpd_pi32 (simde__m128d a) {
2520   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2521     return _mm_cvtpd_pi32(a);
2522   #else
2523     simde__m64_private r_;
2524     simde__m128d_private a_ = simde__m128d_to_private(a);
2525 
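    /* Round each double to the nearest integer; out-of-range values fall
     * back to INT32_MIN (the x86 "integer indefinite" result) unless
     * SIMDE_FAST_CONVERSION_RANGE skips the range check. */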
2526     SIMDE_VECTORIZE
2527     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
2528       simde_float64 v = simde_math_round(a_.f64[i]);
2529       #if defined(SIMDE_FAST_CONVERSION_RANGE)
2530         r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
2531       #else
2532         r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ?
2533           SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
2534       #endif
2535     }
2536 
2537     return simde__m64_from_private(r_);
2538   #endif
2539 }
2540 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2541   #define _mm_cvtpd_pi32(a) simde_mm_cvtpd_pi32(a)
2542 #endif
2543 
2544 SIMDE_FUNCTION_ATTRIBUTES
2545 simde__m128i
2546 simde_mm_cvtpd_epi32 (simde__m128d a) {
2547   #if defined(SIMDE_X86_SSE2_NATIVE)
2548     return _mm_cvtpd_epi32(a);
2549   #else
2550     simde__m128i_private r_;
2551 
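    /* Convert the two doubles into the low 64 bits and zero the upper
     * half, matching CVTPD2DQ. */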
2552     r_.m64[0] = simde_mm_cvtpd_pi32(a);
2553     r_.m64[1] = simde_mm_setzero_si64();
2554 
2555     return simde__m128i_from_private(r_);
2556   #endif
2557 }
2558 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2559   #define _mm_cvtpd_epi32(a) simde_mm_cvtpd_epi32(a)
2560 #endif
2561 
2562 SIMDE_FUNCTION_ATTRIBUTES
2563 simde__m128
2564 simde_mm_cvtpd_ps (simde__m128d a) {
2565   #if defined(SIMDE_X86_SSE2_NATIVE)
2566     return _mm_cvtpd_ps(a);
2567   #else
2568     simde__m128_private r_;
2569     simde__m128d_private a_ = simde__m128d_to_private(a);
2570 
2571     #if defined(SIMDE_CONVERT_VECTOR_)
2572       SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, a_.f64);
2573       r_.m64_private[1] = simde__m64_to_private(simde_mm_setzero_si64());
2574     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2575       r_.neon_f32 = vreinterpretq_f32_f64(vcombine_f64(vreinterpret_f64_f32(vcvtx_f32_f64(a_.neon_f64)), vdup_n_f64(0)));
2576     #else
2577       SIMDE_VECTORIZE
2578       for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
2579         r_.f32[i] = (simde_float32) a_.f64[i];
2580       }
2581       simde_memset(&(r_.m64_private[1]), 0, sizeof(r_.m64_private[1]));
2582     #endif
2583 
2584     return simde__m128_from_private(r_);
2585   #endif
2586 }
2587 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2588   #define _mm_cvtpd_ps(a) simde_mm_cvtpd_ps(a)
2589 #endif
2590 
2591 SIMDE_FUNCTION_ATTRIBUTES
2592 simde__m128d
2593 simde_mm_cvtpi32_pd (simde__m64 a) {
2594   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2595     return _mm_cvtpi32_pd(a);
2596   #else
2597     simde__m128d_private r_;
2598     simde__m64_private a_ = simde__m64_to_private(a);
2599 
2600     #if defined(SIMDE_CONVERT_VECTOR_)
2601       SIMDE_CONVERT_VECTOR_(r_.f64, a_.i32);
2602     #else
2603       SIMDE_VECTORIZE
2604       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2605         r_.f64[i] = (simde_float64) a_.i32[i];
2606       }
2607     #endif
2608 
2609     return simde__m128d_from_private(r_);
2610   #endif
2611 }
2612 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2613   #define _mm_cvtpi32_pd(a) simde_mm_cvtpi32_pd(a)
2614 #endif
2615 
2616 SIMDE_FUNCTION_ATTRIBUTES
2617 simde__m128i
2618 simde_mm_cvtps_epi32 (simde__m128 a) {
2619   #if defined(SIMDE_X86_SSE2_NATIVE)
2620     return _mm_cvtps_epi32(a);
2621   #else
2622     simde__m128i_private r_;
2623     simde__m128_private a_ = simde__m128_to_private(a);
2624 
2625     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE)
2626       r_.neon_i32 = vcvtnq_s32_f32(a_.neon_f32);
2627     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_ROUND_TIES)
2628       r_.neon_i32 = vcvtnq_s32_f32(a_.neon_f32);
2629     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_ROUND_TIES)
2630       HEDLEY_DIAGNOSTIC_PUSH
2631       SIMDE_DIAGNOSTIC_DISABLE_C11_EXTENSIONS_
2632       SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_
2633       r_.altivec_i32 = vec_cts(a_.altivec_f32, 1);
2634       HEDLEY_DIAGNOSTIC_POP
2635     #else
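      /* Portable path: round to the nearest integer first, then convert
       * lane by lane with an explicit range check. */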
2636       a_ = simde__m128_to_private(simde_x_mm_round_ps(a, SIMDE_MM_FROUND_TO_NEAREST_INT, 1));
2637       SIMDE_VECTORIZE
2638       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
2639         simde_float32 v = simde_math_roundf(a_.f32[i]);
2640         #if defined(SIMDE_FAST_CONVERSION_RANGE)
2641           r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
2642         #else
2643           r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ?
2644             SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
2645         #endif
2646       }
2647     #endif
2648 
2649     return simde__m128i_from_private(r_);
2650   #endif
2651 }
2652 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2653   #define _mm_cvtps_epi32(a) simde_mm_cvtps_epi32(a)
2654 #endif
2655 
2656 SIMDE_FUNCTION_ATTRIBUTES
2657 simde__m128d
2658 simde_mm_cvtps_pd (simde__m128 a) {
2659   #if defined(SIMDE_X86_SSE2_NATIVE)
2660     return _mm_cvtps_pd(a);
2661   #else
2662     simde__m128d_private r_;
2663     simde__m128_private a_ = simde__m128_to_private(a);
2664 
2665     #if defined(SIMDE_CONVERT_VECTOR_)
2666       SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].f32);
2667     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2668       r_.neon_f64 = vcvt_f64_f32(vget_low_f32(a_.neon_f32));
2669     #else
2670       SIMDE_VECTORIZE
2671       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2672         r_.f64[i] = a_.f32[i];
2673       }
2674     #endif
2675 
2676     return simde__m128d_from_private(r_);
2677   #endif
2678 }
2679 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2680   #define _mm_cvtps_pd(a) simde_mm_cvtps_pd(a)
2681 #endif
2682 
2683 SIMDE_FUNCTION_ATTRIBUTES
2684 int32_t
2685 simde_mm_cvtsd_si32 (simde__m128d a) {
2686   #if defined(SIMDE_X86_SSE2_NATIVE)
2687     return _mm_cvtsd_si32(a);
2688   #else
2689     simde__m128d_private a_ = simde__m128d_to_private(a);
2690 
2691     simde_float64 v = simde_math_round(a_.f64[0]);
2692     #if defined(SIMDE_FAST_CONVERSION_RANGE)
2693       return SIMDE_CONVERT_FTOI(int32_t, v);
2694     #else
2695       return ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ?
2696         SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
2697     #endif
2698   #endif
2699 }
2700 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2701   #define _mm_cvtsd_si32(a) simde_mm_cvtsd_si32(a)
2702 #endif
2703 
2704 SIMDE_FUNCTION_ATTRIBUTES
2705 int64_t
2706 simde_mm_cvtsd_si64 (simde__m128d a) {
2707   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
2708     #if defined(__PGI)
2709       return _mm_cvtsd_si64x(a);
2710     #else
2711       return _mm_cvtsd_si64(a);
2712     #endif
2713   #else
2714     simde__m128d_private a_ = simde__m128d_to_private(a);
2715     return SIMDE_CONVERT_FTOI(int64_t, simde_math_round(a_.f64[0]));
2716   #endif
2717 }
2718 #define simde_mm_cvtsd_si64x(a) simde_mm_cvtsd_si64(a)
2719 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2720   #define _mm_cvtsd_si64(a) simde_mm_cvtsd_si64(a)
2721   #define _mm_cvtsd_si64x(a) simde_mm_cvtsd_si64x(a)
2722 #endif
2723 
2724 SIMDE_FUNCTION_ATTRIBUTES
2725 simde__m128
2726 simde_mm_cvtsd_ss (simde__m128 a, simde__m128d b) {
2727   #if defined(SIMDE_X86_SSE2_NATIVE)
2728     return _mm_cvtsd_ss(a, b);
2729   #else
2730     simde__m128_private
2731       r_,
2732       a_ = simde__m128_to_private(a);
2733     simde__m128d_private b_ = simde__m128d_to_private(b);
2734 
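    /* Lane 0 of the result becomes (float) b[0]; the remaining lanes are
     * copied from a. */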
2735     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2736       r_.neon_f32 = vsetq_lane_f32(vcvtxd_f32_f64(vgetq_lane_f64(b_.neon_f64, 0)), a_.neon_f32, 0);
2737     #else
2738       r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b_.f64[0]);
2739 
2740       SIMDE_VECTORIZE
2741       for (size_t i = 1 ; i < (sizeof(r_) / sizeof(r_.i32[0])) ; i++) {
2742         r_.i32[i] = a_.i32[i];
2743       }
2744     #endif
2745     return simde__m128_from_private(r_);
2746   #endif
2747 }
2748 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2749   #define _mm_cvtsd_ss(a, b) simde_mm_cvtsd_ss(a, b)
2750 #endif
2751 
2752 SIMDE_FUNCTION_ATTRIBUTES
2753 int16_t
2754 simde_x_mm_cvtsi128_si16 (simde__m128i a) {
2755   simde__m128i_private
2756     a_ = simde__m128i_to_private(a);
2757 
2758   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2759     return vgetq_lane_s16(a_.neon_i16, 0);
2760   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2761     return HEDLEY_STATIC_CAST(int16_t, wasm_i16x8_extract_lane(a_.wasm_v128, 0));
2762   #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2763     #if defined(SIMDE_BUG_GCC_95227)
2764       (void) a_;
2765     #endif
2766     return vec_extract(a_.altivec_i16, 0);
2767   #else
2768     return a_.i16[0];
2769   #endif
2770 }
2771 
2772 SIMDE_FUNCTION_ATTRIBUTES
2773 int32_t
2774 simde_mm_cvtsi128_si32 (simde__m128i a) {
2775   #if defined(SIMDE_X86_SSE2_NATIVE)
2776     return _mm_cvtsi128_si32(a);
2777   #else
2778     simde__m128i_private
2779       a_ = simde__m128i_to_private(a);
2780 
2781     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2782       return vgetq_lane_s32(a_.neon_i32, 0);
2783     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2784       return HEDLEY_STATIC_CAST(int32_t, wasm_i32x4_extract_lane(a_.wasm_v128, 0));
2785     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2786       #if defined(SIMDE_BUG_GCC_95227)
2787         (void) a_;
2788       #endif
2789       return vec_extract(a_.altivec_i32, 0);
2790     #else
2791       return a_.i32[0];
2792     #endif
2793   #endif
2794 }
2795 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2796   #define _mm_cvtsi128_si32(a) simde_mm_cvtsi128_si32(a)
2797 #endif
2798 
2799 SIMDE_FUNCTION_ATTRIBUTES
2800 int64_t
2801 simde_mm_cvtsi128_si64 (simde__m128i a) {
2802   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
2803     #if defined(__PGI)
2804       return _mm_cvtsi128_si64x(a);
2805     #else
2806       return _mm_cvtsi128_si64(a);
2807     #endif
2808   #else
2809     simde__m128i_private a_ = simde__m128i_to_private(a);
2810   #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && !defined(HEDLEY_IBM_VERSION)
2811     return vec_extract(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed long long), a_.i64), 0);
2812   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2813     return vgetq_lane_s64(a_.neon_i64, 0);
2814   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2815     return HEDLEY_STATIC_CAST(int64_t, wasm_i64x2_extract_lane(a_.wasm_v128, 0));
2816   #endif
2817     return a_.i64[0];
2818   #endif
2819 }
2820 #define simde_mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64(a)
2821 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2822   #define _mm_cvtsi128_si64(a) simde_mm_cvtsi128_si64(a)
2823   #define _mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64x(a)
2824 #endif
2825 
2826 SIMDE_FUNCTION_ATTRIBUTES
2827 simde__m128d
2828 simde_mm_cvtsi32_sd (simde__m128d a, int32_t b) {
2829   #if defined(SIMDE_X86_SSE2_NATIVE)
2830     return _mm_cvtsi32_sd(a, b);
2831   #else
2832     simde__m128d_private r_;
2833     simde__m128d_private a_ = simde__m128d_to_private(a);
2834 
2835     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2836       r_.neon_f64 = vsetq_lane_f64(HEDLEY_STATIC_CAST(float64_t, b), a_.neon_f64, 0);
2837     #else
2838       r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b);
2839       r_.i64[1] = a_.i64[1];
2840     #endif
2841 
2842     return simde__m128d_from_private(r_);
2843   #endif
2844 }
2845 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2846   #define _mm_cvtsi32_sd(a, b) simde_mm_cvtsi32_sd(a, b)
2847 #endif
2848 
2849 SIMDE_FUNCTION_ATTRIBUTES
2850 simde__m128i
2851 simde_x_mm_cvtsi16_si128 (int16_t a) {
2852   simde__m128i_private r_;
2853 
2854   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2855     r_.neon_i16 = vsetq_lane_s16(a, vdupq_n_s16(0), 0);
2856   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2857     r_.wasm_v128 = wasm_i16x8_make(a, 0, 0, 0, 0, 0, 0, 0);
2858   #else
2859     r_.i16[0] = a;
2860     r_.i16[1] = 0;
2861     r_.i16[2] = 0;
2862     r_.i16[3] = 0;
2863     r_.i16[4] = 0;
2864     r_.i16[5] = 0;
2865     r_.i16[6] = 0;
2866     r_.i16[7] = 0;
2867   #endif
2868 
2869   return simde__m128i_from_private(r_);
2870 }
2871 
2872 SIMDE_FUNCTION_ATTRIBUTES
2873 simde__m128i
2874 simde_mm_cvtsi32_si128 (int32_t a) {
2875   #if defined(SIMDE_X86_SSE2_NATIVE)
2876     return _mm_cvtsi32_si128(a);
2877   #else
2878     simde__m128i_private r_;
2879 
2880     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2881       r_.neon_i32 = vsetq_lane_s32(a, vdupq_n_s32(0), 0);
2882     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2883       r_.wasm_v128 = wasm_i32x4_make(a, 0, 0, 0);
2884     #else
2885       r_.i32[0] = a;
2886       r_.i32[1] = 0;
2887       r_.i32[2] = 0;
2888       r_.i32[3] = 0;
2889     #endif
2890 
2891     return simde__m128i_from_private(r_);
2892   #endif
2893 }
2894 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2895   #define _mm_cvtsi32_si128(a) simde_mm_cvtsi32_si128(a)
2896 #endif
2897 
2898 SIMDE_FUNCTION_ATTRIBUTES
2899 simde__m128d
2900 simde_mm_cvtsi64_sd (simde__m128d a, int64_t b) {
2901   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
2902     #if !defined(__PGI)
2903       return _mm_cvtsi64_sd(a, b);
2904     #else
2905       return _mm_cvtsi64x_sd(a, b);
2906     #endif
2907   #else
2908     simde__m128d_private
2909       r_,
2910       a_ = simde__m128d_to_private(a);
2911 
2912     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2913       r_.neon_f64 = vsetq_lane_f64(HEDLEY_STATIC_CAST(float64_t, b), a_.neon_f64, 0);
2914     #else
2915       r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b);
2916       r_.f64[1] = a_.f64[1];
2917     #endif
2918 
2919     return simde__m128d_from_private(r_);
2920   #endif
2921 }
2922 #define simde_mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64_sd(a, b)
2923 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2924   #define _mm_cvtsi64_sd(a, b) simde_mm_cvtsi64_sd(a, b)
2925   #define _mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64x_sd(a, b)
2926 #endif
2927 
2928 SIMDE_FUNCTION_ATTRIBUTES
2929 simde__m128i
2930 simde_mm_cvtsi64_si128 (int64_t a) {
2931   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
2932     #if !defined(__PGI)
2933       return _mm_cvtsi64_si128(a);
2934     #else
2935       return _mm_cvtsi64x_si128(a);
2936     #endif
2937   #else
2938     simde__m128i_private r_;
2939 
2940     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2941       r_.neon_i64 = vsetq_lane_s64(a, vdupq_n_s64(0), 0);
2942     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2943       r_.wasm_v128 = wasm_i64x2_make(a, 0);
2944     #else
2945       r_.i64[0] = a;
2946       r_.i64[1] = 0;
2947     #endif
2948 
2949     return simde__m128i_from_private(r_);
2950   #endif
2951 }
2952 #define simde_mm_cvtsi64x_si128(a) simde_mm_cvtsi64_si128(a)
2953 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2954   #define _mm_cvtsi64_si128(a) simde_mm_cvtsi64_si128(a)
2955   #define _mm_cvtsi64x_si128(a) simde_mm_cvtsi64x_si128(a)
2956 #endif
2957 
2958 SIMDE_FUNCTION_ATTRIBUTES
2959 simde__m128d
2960 simde_mm_cvtss_sd (simde__m128d a, simde__m128 b) {
2961   #if defined(SIMDE_X86_SSE2_NATIVE)
2962     return _mm_cvtss_sd(a, b);
2963   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2964     float64x2_t temp = vcvt_f64_f32(vset_lane_f32(vgetq_lane_f32(simde__m128_to_private(b).neon_f32, 0), vdup_n_f32(0), 0));
2965     return vsetq_lane_f64(vgetq_lane_f64(simde__m128d_to_private(a).neon_f64, 1), temp, 1);
2966   #else
2967     simde__m128d_private
2968       a_ = simde__m128d_to_private(a);
2969     simde__m128_private b_ = simde__m128_to_private(b);
2970 
2971     a_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b_.f32[0]);
2972 
2973     return simde__m128d_from_private(a_);
2974   #endif
2975 }
2976 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2977   #define _mm_cvtss_sd(a, b) simde_mm_cvtss_sd(a, b)
2978 #endif
2979 
2980 SIMDE_FUNCTION_ATTRIBUTES
2981 simde__m64
2982 simde_mm_cvttpd_pi32 (simde__m128d a) {
2983   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2984     return _mm_cvttpd_pi32(a);
2985   #else
2986     simde__m64_private r_;
2987     simde__m128d_private a_ = simde__m128d_to_private(a);
2988 
2989     #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE)
2990       SIMDE_CONVERT_VECTOR_(r_.i32, a_.f64);
2991     #else
2992       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
2993         simde_float64 v = a_.f64[i];
2994         #if defined(SIMDE_FAST_CONVERSION_RANGE)
2995           r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
2996         #else
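               /* Out-of-range and NaN inputs produce INT32_MIN here, matching the x86 "integer indefinite" result (0x80000000). */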
2997           r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ?
2998             SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
2999         #endif
3000       }
3001     #endif
3002 
3003     return simde__m64_from_private(r_);
3004   #endif
3005 }
3006 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3007   #define _mm_cvttpd_pi32(a) simde_mm_cvttpd_pi32(a)
3008 #endif
3009 
3010 SIMDE_FUNCTION_ATTRIBUTES
3011 simde__m128i
3012 simde_mm_cvttpd_epi32 (simde__m128d a) {
3013   #if defined(SIMDE_X86_SSE2_NATIVE)
3014     return _mm_cvttpd_epi32(a);
3015   #else
3016     simde__m128i_private r_;
3017 
3018     r_.m64[0] = simde_mm_cvttpd_pi32(a);
3019     r_.m64[1] = simde_mm_setzero_si64();
3020 
3021     return simde__m128i_from_private(r_);
3022   #endif
3023 }
3024 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3025   #define _mm_cvttpd_epi32(a) simde_mm_cvttpd_epi32(a)
3026 #endif
3027 
3028 SIMDE_FUNCTION_ATTRIBUTES
3029 simde__m128i
3030 simde_mm_cvttps_epi32 (simde__m128 a) {
3031   #if defined(SIMDE_X86_SSE2_NATIVE)
3032     return _mm_cvttps_epi32(a);
3033   #else
3034     simde__m128i_private r_;
3035     simde__m128_private a_ = simde__m128_to_private(a);
3036 
3037     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE)
3038       r_.neon_i32 = vcvtq_s32_f32(a_.neon_f32);
3039     #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE)
3040       SIMDE_CONVERT_VECTOR_(r_.i32, a_.f32);
3041     #else
3042       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
3043         simde_float32 v = a_.f32[i];
3044         #if defined(SIMDE_FAST_CONVERSION_RANGE)
3045           r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
3046         #else
3047           r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ?
3048             SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
3049         #endif
3050       }
3051     #endif
3052 
3053     return simde__m128i_from_private(r_);
3054   #endif
3055 }
3056 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3057   #define _mm_cvttps_epi32(a) simde_mm_cvttps_epi32(a)
3058 #endif
3059 
3060 SIMDE_FUNCTION_ATTRIBUTES
3061 int32_t
3062 simde_mm_cvttsd_si32 (simde__m128d a) {
3063   #if defined(SIMDE_X86_SSE2_NATIVE)
3064     return _mm_cvttsd_si32(a);
3065   #else
3066     simde__m128d_private a_ = simde__m128d_to_private(a);
3067     simde_float64 v = a_.f64[0];
3068     #if defined(SIMDE_FAST_CONVERSION_RANGE)
3069       return SIMDE_CONVERT_FTOI(int32_t, v);
3070     #else
3071       return ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ?
3072         SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
3073     #endif
3074   #endif
3075 }
3076 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3077   #define _mm_cvttsd_si32(a) simde_mm_cvttsd_si32(a)
3078 #endif
3079 
3080 SIMDE_FUNCTION_ATTRIBUTES
3081 int64_t
3082 simde_mm_cvttsd_si64 (simde__m128d a) {
3083   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
3084     #if !defined(__PGI)
3085       return _mm_cvttsd_si64(a);
3086     #else
3087       return _mm_cvttsd_si64x(a);
3088     #endif
3089   #else
3090     simde__m128d_private a_ = simde__m128d_to_private(a);
3091     return SIMDE_CONVERT_FTOI(int64_t, a_.f64[0]);
3092   #endif
3093 }
3094 #define simde_mm_cvttsd_si64x(a) simde_mm_cvttsd_si64(a)
3095 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3096   #define _mm_cvttsd_si64(a) simde_mm_cvttsd_si64(a)
3097   #define _mm_cvttsd_si64x(a) simde_mm_cvttsd_si64x(a)
3098 #endif
3099 
3100 SIMDE_FUNCTION_ATTRIBUTES
3101 simde__m128d
3102 simde_mm_div_pd (simde__m128d a, simde__m128d b) {
3103   #if defined(SIMDE_X86_SSE2_NATIVE)
3104     return _mm_div_pd(a, b);
3105   #else
3106     simde__m128d_private
3107       r_,
3108       a_ = simde__m128d_to_private(a),
3109       b_ = simde__m128d_to_private(b);
3110 
3111     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3112       r_.f64 = a_.f64 / b_.f64;
3113     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3114       r_.neon_f64 = vdivq_f64(a_.neon_f64, b_.neon_f64);
3115     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3116       r_.wasm_v128 =  wasm_f64x2_div(a_.wasm_v128, b_.wasm_v128);
3117     #else
3118       SIMDE_VECTORIZE
3119       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
3120         r_.f64[i] = a_.f64[i] / b_.f64[i];
3121       }
3122     #endif
3123 
3124     return simde__m128d_from_private(r_);
3125   #endif
3126 }
3127 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3128   #define _mm_div_pd(a, b) simde_mm_div_pd(a, b)
3129 #endif
3130 
3131 SIMDE_FUNCTION_ATTRIBUTES
3132 simde__m128d
3133 simde_mm_div_sd (simde__m128d a, simde__m128d b) {
3134   #if defined(SIMDE_X86_SSE2_NATIVE)
3135     return _mm_div_sd(a, b);
3136   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
3137     return simde_mm_move_sd(a, simde_mm_div_pd(a, b));
3138   #else
3139     simde__m128d_private
3140       r_,
3141       a_ = simde__m128d_to_private(a),
3142       b_ = simde__m128d_to_private(b);
3143 
3144     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3145       float64x2_t temp = vdivq_f64(a_.neon_f64, b_.neon_f64);
3146       r_.neon_f64 = vsetq_lane_f64(vgetq_lane_f64(a_.neon_f64, 1), temp, 1);
3147     #else
3148       r_.f64[0] = a_.f64[0] / b_.f64[0];
3149       r_.f64[1] = a_.f64[1];
3150     #endif
3151 
3152     return simde__m128d_from_private(r_);
3153   #endif
3154 }
3155 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3156   #define _mm_div_sd(a, b) simde_mm_div_sd(a, b)
3157 #endif
3158 
3159 SIMDE_FUNCTION_ATTRIBUTES
3160 int32_t
3161 simde_mm_extract_epi16 (simde__m128i a, const int imm8)
3162     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7)  {
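       /* The selected lane is read as an unsigned value and zero-extended into the int32_t result, as the native PEXTRW does. */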
3163   uint16_t r;
3164   simde__m128i_private a_ = simde__m128i_to_private(a);
3165 
3166   #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3167     #if defined(SIMDE_BUG_GCC_95227)
3168       (void) a_;
3169       (void) imm8;
3170     #endif
3171     r = HEDLEY_STATIC_CAST(uint16_t, vec_extract(a_.altivec_i16, imm8));
3172   #else
3173     r = a_.u16[imm8 & 7];
3174   #endif
3175 
3176   return  HEDLEY_STATIC_CAST(int32_t, r);
3177 }
3178 #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,6,0))
3179   #define simde_mm_extract_epi16(a, imm8) _mm_extract_epi16(a, imm8)
3180 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3181   #define simde_mm_extract_epi16(a, imm8) (HEDLEY_STATIC_CAST(int32_t, vgetq_lane_s16(simde__m128i_to_private(a).neon_i16, (imm8))) & (INT32_C(0x0000ffff)))
3182 #endif
3183 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3184   #define _mm_extract_epi16(a, imm8) simde_mm_extract_epi16(a, imm8)
3185 #endif
3186 
3187 SIMDE_FUNCTION_ATTRIBUTES
3188 simde__m128i
3189 simde_mm_insert_epi16 (simde__m128i a, int16_t i, const int imm8)
3190     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7)  {
3191   simde__m128i_private a_ = simde__m128i_to_private(a);
3192   a_.i16[imm8 & 7] = i;
3193   return simde__m128i_from_private(a_);
3194 }
3195 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
3196   #define simde_mm_insert_epi16(a, i, imm8) _mm_insert_epi16((a), (i), (imm8))
3197 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3198   #define simde_mm_insert_epi16(a, i, imm8) simde__m128i_from_neon_i16(vsetq_lane_s16((i), simde__m128i_to_neon_i16(a), (imm8)))
3199 #endif
3200 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3201   #define _mm_insert_epi16(a, i, imm8) simde_mm_insert_epi16(a, i, imm8)
3202 #endif
3203 
3204 SIMDE_FUNCTION_ATTRIBUTES
3205 simde__m128d
3206 simde_mm_load_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) {
3207   #if defined(SIMDE_X86_SSE2_NATIVE)
3208     return _mm_load_pd(mem_addr);
3209   #else
3210     simde__m128d_private r_;
3211 
3212     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3213       r_.neon_f64 = vld1q_f64(mem_addr);
3214     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3215       r_.neon_u32 = vld1q_u32(HEDLEY_REINTERPRET_CAST(uint32_t const*, mem_addr));
3216     #else
3217       simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128d), sizeof(r_));
3218     #endif
3219 
3220     return simde__m128d_from_private(r_);
3221   #endif
3222 }
3223 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3224   #define _mm_load_pd(mem_addr) simde_mm_load_pd(mem_addr)
3225 #endif
3226 
3227 SIMDE_FUNCTION_ATTRIBUTES
3228 simde__m128d
3229 simde_mm_load1_pd (simde_float64 const* mem_addr) {
3230   #if defined(SIMDE_X86_SSE2_NATIVE)
3231     return _mm_load1_pd(mem_addr);
3232   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3233     return simde__m128d_from_neon_f64(vld1q_dup_f64(mem_addr));
3234   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3235     return simde__m128d_from_wasm_v128(wasm_v64x2_load_splat(mem_addr));
3236   #else
3237     return simde_mm_set1_pd(*mem_addr);
3238   #endif
3239 }
3240 #define simde_mm_load_pd1(mem_addr) simde_mm_load1_pd(mem_addr)
3241 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3242   #define _mm_load_pd1(mem_addr) simde_mm_load1_pd(mem_addr)
3243   #define _mm_load1_pd(mem_addr) simde_mm_load1_pd(mem_addr)
3244 #endif
3245 
3246 SIMDE_FUNCTION_ATTRIBUTES
3247 simde__m128d
3248 simde_mm_load_sd (simde_float64 const* mem_addr) {
3249   #if defined(SIMDE_X86_SSE2_NATIVE)
3250     return _mm_load_sd(mem_addr);
3251   #else
3252     simde__m128d_private r_;
3253 
3254     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3255       r_.neon_f64 = vsetq_lane_f64(*mem_addr, vdupq_n_f64(0), 0);
3256     #else
3257       r_.f64[0] = *mem_addr;
3258       r_.u64[1] = UINT64_C(0);
3259     #endif
3260 
3261     return simde__m128d_from_private(r_);
3262   #endif
3263 }
3264 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3265   #define _mm_load_sd(mem_addr) simde_mm_load_sd(mem_addr)
3266 #endif
3267 
3268 SIMDE_FUNCTION_ATTRIBUTES
3269 simde__m128i
3270 simde_mm_load_si128 (simde__m128i const* mem_addr) {
3271   #if defined(SIMDE_X86_SSE2_NATIVE)
3272     return _mm_load_si128(HEDLEY_REINTERPRET_CAST(__m128i const*, mem_addr));
3273   #else
3274     simde__m128i_private r_;
3275 
3276     #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3277       r_.altivec_i32 = vec_ld(0, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(int) const*, mem_addr));
3278     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3279       r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr));
3280     #else
3281       simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128i), sizeof(simde__m128i));
3282     #endif
3283 
3284     return simde__m128i_from_private(r_);
3285   #endif
3286 }
3287 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3288   #define _mm_load_si128(mem_addr) simde_mm_load_si128(mem_addr)
3289 #endif
3290 
3291 SIMDE_FUNCTION_ATTRIBUTES
3292 simde__m128d
3293 simde_mm_loadh_pd (simde__m128d a, simde_float64 const* mem_addr) {
3294   #if defined(SIMDE_X86_SSE2_NATIVE)
3295     return _mm_loadh_pd(a, mem_addr);
3296   #else
3297     simde__m128d_private
3298       r_,
3299       a_ = simde__m128d_to_private(a);
3300 
3301     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3302       r_.neon_f64 = vcombine_f64(vget_low_f64(a_.neon_f64), vld1_f64(HEDLEY_REINTERPRET_CAST(const float64_t*, mem_addr)));
3303     #else
3304       simde_float64 t;
3305 
3306       simde_memcpy(&t, mem_addr, sizeof(t));
3307       r_.f64[0] = a_.f64[0];
3308       r_.f64[1] = t;
3309     #endif
3310 
3311     return simde__m128d_from_private(r_);
3312   #endif
3313 }
3314 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3315   #define _mm_loadh_pd(a, mem_addr) simde_mm_loadh_pd(a, mem_addr)
3316 #endif
3317 
3318 SIMDE_FUNCTION_ATTRIBUTES
3319 simde__m128i
3320 simde_mm_loadl_epi64 (simde__m128i const* mem_addr) {
3321   #if defined(SIMDE_X86_SSE2_NATIVE)
3322     return _mm_loadl_epi64(mem_addr);
3323   #else
3324     simde__m128i_private r_;
3325 
3326     int64_t value;
3327     simde_memcpy(&value, mem_addr, sizeof(value));
3328 
3329     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3330       r_.neon_i64 = vcombine_s64(vld1_s64(HEDLEY_REINTERPRET_CAST(int64_t const *, mem_addr)), vdup_n_s64(0));
3331     #else
3332       r_.i64[0] = value;
3333       r_.i64[1] = 0;
3334     #endif
3335 
3336     return simde__m128i_from_private(r_);
3337   #endif
3338 }
3339 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3340   #define _mm_loadl_epi64(mem_addr) simde_mm_loadl_epi64(mem_addr)
3341 #endif
3342 
3343 SIMDE_FUNCTION_ATTRIBUTES
3344 simde__m128d
3345 simde_mm_loadl_pd (simde__m128d a, simde_float64 const* mem_addr) {
3346   #if defined(SIMDE_X86_SSE2_NATIVE)
3347     return _mm_loadl_pd(a, mem_addr);
3348   #else
3349     simde__m128d_private
3350       r_,
3351       a_ = simde__m128d_to_private(a);
3352 
3353     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3354       r_.neon_f64 = vcombine_f64(vld1_f64(
3355         HEDLEY_REINTERPRET_CAST(const float64_t*, mem_addr)), vget_high_f64(a_.neon_f64));
3356     #else
3357       r_.f64[0] = *mem_addr;
3358       r_.u64[1] = a_.u64[1];
3359     #endif
3360 
3361     return simde__m128d_from_private(r_);
3362   #endif
3363 }
3364 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3365   #define _mm_loadl_pd(a, mem_addr) simde_mm_loadl_pd(a, mem_addr)
3366 #endif
3367 
3368 SIMDE_FUNCTION_ATTRIBUTES
3369 simde__m128d
3370 simde_mm_loadr_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) {
3371   #if defined(SIMDE_X86_SSE2_NATIVE)
3372     return _mm_loadr_pd(mem_addr);
3373   #else
3374     simde__m128d_private
3375       r_;
3376 
3377     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
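           /* Load both doubles, then rotate the two lanes to swap their order. */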
3378       r_.neon_f64 = vld1q_f64(mem_addr);
3379       r_.neon_f64 = vextq_f64(r_.neon_f64, r_.neon_f64, 1);
3380     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3381       r_.neon_i64 = vld1q_s64(HEDLEY_REINTERPRET_CAST(int64_t const *, mem_addr));
3382       r_.neon_i64 = vextq_s64(r_.neon_i64, r_.neon_i64, 1);
3383     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3384       v128_t tmp = wasm_v128_load(mem_addr);
3385       r_.wasm_v128 = wasm_v64x2_shuffle(tmp, tmp, 1, 0);
3386     #else
3387       r_.f64[0] = mem_addr[1];
3388       r_.f64[1] = mem_addr[0];
3389     #endif
3390 
3391     return simde__m128d_from_private(r_);
3392   #endif
3393 }
3394 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3395   #define _mm_loadr_pd(mem_addr) simde_mm_loadr_pd(mem_addr)
3396 #endif
3397 
3398 SIMDE_FUNCTION_ATTRIBUTES
3399 simde__m128d
3400 simde_mm_loadu_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) {
3401   #if defined(SIMDE_X86_SSE2_NATIVE)
3402     return _mm_loadu_pd(mem_addr);
3403   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3404     return vld1q_f64(mem_addr);
3405   #else
3406     simde__m128d_private r_;
3407 
3408     simde_memcpy(&r_, mem_addr, sizeof(r_));
3409 
3410     return simde__m128d_from_private(r_);
3411   #endif
3412 }
3413 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3414   #define _mm_loadu_pd(mem_addr) simde_mm_loadu_pd(mem_addr)
3415 #endif
3416 
3417 SIMDE_FUNCTION_ATTRIBUTES
3418 simde__m128i
3419 simde_x_mm_loadu_epi8(int8_t const* mem_addr) {
3420   #if defined(SIMDE_X86_SSE2_NATIVE)
3421     return _mm_loadu_si128(SIMDE_ALIGN_CAST(simde__m128i const*, mem_addr));
3422   #else
3423     simde__m128i_private r_;
3424 
3425     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3426       r_.neon_i8 = vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr));
3427     #else
3428       simde_memcpy(&r_, mem_addr, sizeof(r_));
3429     #endif
3430 
3431     return simde__m128i_from_private(r_);
3432   #endif
3433 }
3434 
3435 SIMDE_FUNCTION_ATTRIBUTES
3436 simde__m128i
3437 simde_x_mm_loadu_epi16(int16_t const* mem_addr) {
3438   #if defined(SIMDE_X86_SSE2_NATIVE)
3439     return _mm_loadu_si128(SIMDE_ALIGN_CAST(simde__m128i const*, mem_addr));
3440   #else
3441     simde__m128i_private r_;
3442 
3443     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3444       r_.neon_i16 = vld1q_s16(HEDLEY_REINTERPRET_CAST(int16_t const*, mem_addr));
3445     #else
3446       simde_memcpy(&r_, mem_addr, sizeof(r_));
3447     #endif
3448 
3449     return simde__m128i_from_private(r_);
3450   #endif
3451 }
3452 
3453 SIMDE_FUNCTION_ATTRIBUTES
3454 simde__m128i
3455 simde_x_mm_loadu_epi32(int32_t const* mem_addr) {
3456   #if defined(SIMDE_X86_SSE2_NATIVE)
3457     return _mm_loadu_si128(SIMDE_ALIGN_CAST(simde__m128i const*, mem_addr));
3458   #else
3459     simde__m128i_private r_;
3460 
3461     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3462       r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr));
3463     #else
3464       simde_memcpy(&r_, mem_addr, sizeof(r_));
3465     #endif
3466 
3467     return simde__m128i_from_private(r_);
3468   #endif
3469 }
3470 
3471 SIMDE_FUNCTION_ATTRIBUTES
3472 simde__m128i
3473 simde_x_mm_loadu_epi64(int64_t const* mem_addr) {
3474   #if defined(SIMDE_X86_SSE2_NATIVE)
3475     return _mm_loadu_si128(SIMDE_ALIGN_CAST(simde__m128i const*, mem_addr));
3476   #else
3477     simde__m128i_private r_;
3478 
3479     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3480       r_.neon_i64 = vld1q_s64(HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr));
3481     #else
3482       simde_memcpy(&r_, mem_addr, sizeof(r_));
3483     #endif
3484 
3485     return simde__m128i_from_private(r_);
3486   #endif
3487 }
3488 
3489 SIMDE_FUNCTION_ATTRIBUTES
3490 simde__m128i
3491 simde_mm_loadu_si128 (void const* mem_addr) {
3492   #if defined(SIMDE_X86_SSE2_NATIVE)
3493     return _mm_loadu_si128(HEDLEY_STATIC_CAST(__m128i const*, mem_addr));
3494   #else
3495     simde__m128i_private r_;
3496 
3497     #if HEDLEY_GNUC_HAS_ATTRIBUTE(may_alias,3,3,0)
3498       HEDLEY_DIAGNOSTIC_PUSH
3499       SIMDE_DIAGNOSTIC_DISABLE_PACKED_
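           /* Wrapping the vector in a packed, may_alias struct lets the compiler emit an unaligned load without undefined behaviour. */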
3500       struct simde_mm_loadu_si128_s {
3501         __typeof__(r_) v;
3502       } __attribute__((__packed__, __may_alias__));
3503       r_ = HEDLEY_REINTERPRET_CAST(const struct simde_mm_loadu_si128_s *, mem_addr)->v;
3504       HEDLEY_DIAGNOSTIC_POP
3505     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3506       /* Note that this is a lower priority than the struct above since
3507        * clang assumes mem_addr is aligned (since it is a __m128i*). */
3508       r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr));
3509     #else
3510       simde_memcpy(&r_, mem_addr, sizeof(r_));
3511     #endif
3512 
3513     return simde__m128i_from_private(r_);
3514   #endif
3515 }
3516 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3517   #define _mm_loadu_si128(mem_addr) simde_mm_loadu_si128(mem_addr)
3518 #endif
3519 
3520 SIMDE_FUNCTION_ATTRIBUTES
3521 simde__m128i
3522 simde_mm_madd_epi16 (simde__m128i a, simde__m128i b) {
3523   #if defined(SIMDE_X86_SSE2_NATIVE)
3524     return _mm_madd_epi16(a, b);
3525   #else
3526     simde__m128i_private
3527       r_,
3528       a_ = simde__m128i_to_private(a),
3529       b_ = simde__m128i_to_private(b);
3530 
3531     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
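           /* Widening multiplies of the low and high halves, then a pairwise add of adjacent 32-bit products. */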
3532       int32x4_t pl = vmull_s16(vget_low_s16(a_.neon_i16),  vget_low_s16(b_.neon_i16));
3533       int32x4_t ph = vmull_high_s16(a_.neon_i16, b_.neon_i16);
3534       r_.neon_i32 = vpaddq_s32(pl, ph);
3535     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3536       int32x4_t pl = vmull_s16(vget_low_s16(a_.neon_i16),  vget_low_s16(b_.neon_i16));
3537       int32x4_t ph = vmull_s16(vget_high_s16(a_.neon_i16), vget_high_s16(b_.neon_i16));
3538       int32x2_t rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
3539       int32x2_t rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
3540       r_.neon_i32 = vcombine_s32(rl, rh);
3541     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
3542       static const SIMDE_POWER_ALTIVEC_VECTOR(int) tz = { 0, 0, 0, 0 };
3543       r_.altivec_i32 = vec_msum(a_.altivec_i16, b_.altivec_i16, tz);
3544     #else
3545       SIMDE_VECTORIZE
3546       for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i += 2) {
3547         r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) + (a_.i16[i + 1] * b_.i16[i + 1]);
3548       }
3549     #endif
3550 
3551     return simde__m128i_from_private(r_);
3552   #endif
3553 }
3554 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3555   #define _mm_madd_epi16(a, b) simde_mm_madd_epi16(a, b)
3556 #endif
3557 
3558 SIMDE_FUNCTION_ATTRIBUTES
3559 void
3560 simde_mm_maskmoveu_si128 (simde__m128i a, simde__m128i mask, int8_t mem_addr[HEDLEY_ARRAY_PARAM(16)]) {
3561   #if defined(SIMDE_X86_SSE2_NATIVE)
3562     _mm_maskmoveu_si128(a, mask, HEDLEY_REINTERPRET_CAST(char*, mem_addr));
3563   #else
3564     simde__m128i_private
3565       a_ = simde__m128i_to_private(a),
3566       mask_ = simde__m128i_to_private(mask);
3567 
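         /* Store byte i only when the most significant bit of the corresponding mask byte is set. */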
3568     for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++) {
3569       if (mask_.u8[i] & 0x80) {
3570         mem_addr[i] = a_.i8[i];
3571       }
3572     }
3573   #endif
3574 }
3575 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3576   #define _mm_maskmoveu_si128(a, mask, mem_addr) simde_mm_maskmoveu_si128((a), (mask), SIMDE_CHECKED_REINTERPRET_CAST(int8_t*, char*, (mem_addr)))
3577 #endif
3578 
3579 SIMDE_FUNCTION_ATTRIBUTES
3580 int32_t
3581 simde_mm_movemask_epi8 (simde__m128i a) {
3582   #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__INTEL_COMPILER)
3583     /* ICC has trouble with _mm_movemask_epi8 at -O2 and above: */
3584     return _mm_movemask_epi8(a);
3585   #else
3586     int32_t r = 0;
3587     simde__m128i_private a_ = simde__m128i_to_private(a);
3588 
3589     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
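           /* Keep only the sign bits, shift lane i's bit into bit position i (the xr table holds the per-lane shift amounts),
            * then horizontally add each half to assemble the 16-bit mask. */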
3590       uint8x16_t input = a_.neon_u8;
3591       const int8_t xr[16] = {-7, -6, -5, -4, -3, -2, -1, 0, -7, -6, -5, -4, -3, -2, -1, 0};
3592       const uint8x16_t mask_and = vdupq_n_u8(0x80);
3593       const int8x16_t mask_shift = vld1q_s8(xr);
3594       const uint8x16_t mask_result =
3595         vshlq_u8(vandq_u8(input, mask_and), mask_shift);
3596       uint8x8_t lo = vget_low_u8(mask_result);
3597       uint8x8_t hi = vget_high_u8(mask_result);
3598       r = vaddv_u8(lo) + (vaddv_u8(hi) << 8);
3599     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3600       // Use increasingly wide shifts+adds to collect the sign bits
3601       // together.
3602       // Since the widening shifts would be rather confusing to follow in little endian, everything
3603       // will be illustrated in big endian order instead. This has a different result - the bits
3604       // would actually be reversed on a big endian machine.
3605 
3606       // Starting input (only half the elements are shown):
3607       // 89 ff 1d c0 00 10 99 33
3608       uint8x16_t input = a_.neon_u8;
3609 
3610       // Shift out everything but the sign bits with an unsigned shift right.
3611       //
3612       // Bytes of the vector::
3613       // 89 ff 1d c0 00 10 99 33
3614       // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
3615       //  |  |  |  |  |  |  |  |
3616       // 01 01 00 01 00 00 01 00
3617       //
3618       // Bits of first important lane(s):
3619       // 10001001 (89)
3620       // \______
3621       //        |
3622       // 00000001 (01)
3623       uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
3624 
3625       // Merge the even lanes together with a 16-bit unsigned shift right + add.
3626       // 'xx' represents garbage data which will be ignored in the final result.
3627       // In the important bytes, the add functions like a binary OR.
3628       //
3629       // 01 01 00 01 00 00 01 00
3630       //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
3631       //    \|    \|    \|    \|
3632       // xx 03 xx 01 xx 00 xx 02
3633       //
3634       // 00000001 00000001 (01 01)
3635       //        \_______ |
3636       //                \|
3637       // xxxxxxxx xxxxxx11 (xx 03)
3638       uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
3639 
3640       // Repeat with a wider 32-bit shift + add.
3641       // xx 03 xx 01 xx 00 xx 02
3642       //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >> 14))
3643       //          \|          \|
3644       // xx xx xx 0d xx xx xx 02
3645       //
3646       // 00000011 00000001 (03 01)
3647       //        \\_____ ||
3648       //         '----.\||
3649       // xxxxxxxx xxxx1101 (xx 0d)
3650       uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
3651 
3652       // Last, an even wider 64-bit shift + add to get our result in the low 8 bit lanes.
3653       // xx xx xx 0d xx xx xx 02
3654       //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >> 28))
3655       //                      \|
3656       // xx xx xx xx xx xx xx d2
3657       //
3658       // 00001101 00000010 (0d 02)
3659       //     \   \___ |  |
3660       //      '---.  \|  |
3661       // xxxxxxxx 11010010 (xx d2)
3662       uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
3663 
3664       // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
3665       // xx xx xx xx xx xx xx d2
3666       //                      ||  return paired64[0]
3667       //                      d2
3668       // Note: Little endian would return the correct value 4b (01001011) instead.
3669       r = vgetq_lane_u8(paired64, 0) | (HEDLEY_STATIC_CAST(int32_t, vgetq_lane_u8(paired64, 8)) << 8);
3670     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(HEDLEY_IBM_VERSION) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
3671       static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) perm = { 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0 };
3672       r = HEDLEY_STATIC_CAST(int32_t, vec_extract(vec_vbpermq(a_.altivec_u8, perm), 1));
3673     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(HEDLEY_IBM_VERSION) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG)
3674       static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) perm = { 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0 };
3675       r = HEDLEY_STATIC_CAST(int32_t, vec_extract(vec_vbpermq(a_.altivec_u8, perm), 14));
3676     #else
3677       SIMDE_VECTORIZE_REDUCTION(|:r)
3678       for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) {
3679         r |= (a_.u8[15 - i] >> 7) << (15 - i);
3680       }
3681     #endif
3682 
3683     return r;
3684   #endif
3685 }
3686 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3687   #define _mm_movemask_epi8(a) simde_mm_movemask_epi8(a)
3688 #endif
3689 
3690 SIMDE_FUNCTION_ATTRIBUTES
3691 int32_t
3692 simde_mm_movemask_pd (simde__m128d a) {
3693   #if defined(SIMDE_X86_SSE2_NATIVE)
3694     return _mm_movemask_pd(a);
3695   #else
3696     int32_t r = 0;
3697     simde__m128d_private a_ = simde__m128d_to_private(a);
3698 
3699     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
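           /* Shift each sign bit down to bit 0, move lane 1's bit up to bit 1, then horizontally add the two lanes. */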
3700       static const int64_t shift_amount[] = { 0, 1 };
3701       const int64x2_t shift = vld1q_s64(shift_amount);
3702       uint64x2_t tmp = vshrq_n_u64(a_.neon_u64, 63);
3703       return HEDLEY_STATIC_CAST(int32_t, vaddvq_u64(vshlq_u64(tmp, shift)));
3704     #else
3705       SIMDE_VECTORIZE_REDUCTION(|:r)
3706       for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
3707         r |= (a_.u64[i] >> 63) << i;
3708       }
3709     #endif
3710 
3711     return r;
3712   #endif
3713 }
3714 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3715   #define _mm_movemask_pd(a) simde_mm_movemask_pd(a)
3716 #endif
3717 
3718 SIMDE_FUNCTION_ATTRIBUTES
3719 simde__m64
3720 simde_mm_movepi64_pi64 (simde__m128i a) {
3721   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
3722     return _mm_movepi64_pi64(a);
3723   #else
3724     simde__m64_private r_;
3725     simde__m128i_private a_ = simde__m128i_to_private(a);
3726 
3727     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3728       r_.neon_i64 = vget_low_s64(a_.neon_i64);
3729     #else
3730       r_.i64[0] = a_.i64[0];
3731     #endif
3732 
3733     return simde__m64_from_private(r_);
3734   #endif
3735 }
3736 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3737   #define _mm_movepi64_pi64(a) simde_mm_movepi64_pi64(a)
3738 #endif
3739 
3740 SIMDE_FUNCTION_ATTRIBUTES
3741 simde__m128i
3742 simde_mm_movpi64_epi64 (simde__m64 a) {
3743   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
3744     return _mm_movpi64_epi64(a);
3745   #else
3746     simde__m128i_private r_;
3747     simde__m64_private a_ = simde__m64_to_private(a);
3748 
3749     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3750       r_.neon_i64 = vcombine_s64(a_.neon_i64, vdup_n_s64(0));
3751     #else
3752       r_.i64[0] = a_.i64[0];
3753       r_.i64[1] = 0;
3754     #endif
3755 
3756     return simde__m128i_from_private(r_);
3757   #endif
3758 }
3759 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3760   #define _mm_movpi64_epi64(a) simde_mm_movpi64_epi64(a)
3761 #endif
3762 
3763 SIMDE_FUNCTION_ATTRIBUTES
3764 simde__m128i
3765 simde_mm_min_epi16 (simde__m128i a, simde__m128i b) {
3766   #if defined(SIMDE_X86_SSE2_NATIVE)
3767     return _mm_min_epi16(a, b);
3768   #else
3769     simde__m128i_private
3770       r_,
3771       a_ = simde__m128i_to_private(a),
3772       b_ = simde__m128i_to_private(b);
3773 
3774     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3775       r_.neon_i16 = vminq_s16(a_.neon_i16, b_.neon_i16);
3776     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3777       r_.wasm_v128 = wasm_i16x8_min(a_.wasm_v128, b_.wasm_v128);
3778     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3779       r_.altivec_i16 = vec_min(a_.altivec_i16, b_.altivec_i16);
3780     #else
3781       SIMDE_VECTORIZE
3782       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
3783         r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i];
3784       }
3785     #endif
3786 
3787     return simde__m128i_from_private(r_);
3788   #endif
3789 }
3790 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3791   #define _mm_min_epi16(a, b) simde_mm_min_epi16(a, b)
3792 #endif
3793 
3794 SIMDE_FUNCTION_ATTRIBUTES
3795 simde__m128i
3796 simde_mm_min_epu8 (simde__m128i a, simde__m128i b) {
3797   #if defined(SIMDE_X86_SSE2_NATIVE)
3798     return _mm_min_epu8(a, b);
3799   #else
3800     simde__m128i_private
3801       r_,
3802       a_ = simde__m128i_to_private(a),
3803       b_ = simde__m128i_to_private(b);
3804 
3805     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3806       r_.neon_u8 = vminq_u8(a_.neon_u8, b_.neon_u8);
3807     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3808       r_.wasm_v128 = wasm_u8x16_min(a_.wasm_v128, b_.wasm_v128);
3809     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3810       r_.altivec_u8 = vec_min(a_.altivec_u8, b_.altivec_u8);
3811     #else
3812       SIMDE_VECTORIZE
3813       for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
3814         r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i];
3815       }
3816     #endif
3817 
3818     return simde__m128i_from_private(r_);
3819   #endif
3820 }
3821 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3822   #define _mm_min_epu8(a, b) simde_mm_min_epu8(a, b)
3823 #endif
3824 
3825 SIMDE_FUNCTION_ATTRIBUTES
3826 simde__m128d
3827 simde_mm_min_pd (simde__m128d a, simde__m128d b) {
3828   #if defined(SIMDE_X86_SSE2_NATIVE)
3829     return _mm_min_pd(a, b);
3830   #else
3831     simde__m128d_private
3832       r_,
3833       a_ = simde__m128d_to_private(a),
3834       b_ = simde__m128d_to_private(b);
3835 
3836     #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
3837       r_.altivec_f64 = vec_min(a_.altivec_f64, b_.altivec_f64);
3838     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3839       r_.neon_f64 = vminq_f64(a_.neon_f64, b_.neon_f64);
3840     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3841       r_.wasm_v128 = wasm_f64x2_min(a_.wasm_v128, b_.wasm_v128);
3842     #else
3843       SIMDE_VECTORIZE
3844       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
3845         r_.f64[i] = (a_.f64[i] < b_.f64[i]) ? a_.f64[i] : b_.f64[i];
3846       }
3847     #endif
3848 
3849     return simde__m128d_from_private(r_);
3850   #endif
3851 }
3852 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3853   #define _mm_min_pd(a, b) simde_mm_min_pd(a, b)
3854 #endif
3855 
3856 SIMDE_FUNCTION_ATTRIBUTES
3857 simde__m128d
3858 simde_mm_min_sd (simde__m128d a, simde__m128d b) {
3859   #if defined(SIMDE_X86_SSE2_NATIVE)
3860     return _mm_min_sd(a, b);
3861   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
3862     return simde_mm_move_sd(a, simde_mm_min_pd(a, b));
3863   #else
3864     simde__m128d_private
3865       r_,
3866       a_ = simde__m128d_to_private(a),
3867       b_ = simde__m128d_to_private(b);
3868 
3869     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3870       float64x2_t temp = vminq_f64(a_.neon_f64, b_.neon_f64);
3871       r_.neon_f64 = vsetq_lane_f64(vgetq_lane_f64(a_.neon_f64, 1), temp, 1);
3872     #else
3873       r_.f64[0] = (a_.f64[0] < b_.f64[0]) ? a_.f64[0] : b_.f64[0];
3874       r_.f64[1] = a_.f64[1];
3875     #endif
3876 
3877     return simde__m128d_from_private(r_);
3878   #endif
3879 }
3880 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3881   #define _mm_min_sd(a, b) simde_mm_min_sd(a, b)
3882 #endif
3883 
3884 SIMDE_FUNCTION_ATTRIBUTES
3885 simde__m128i
3886 simde_mm_max_epi16 (simde__m128i a, simde__m128i b) {
3887   #if defined(SIMDE_X86_SSE2_NATIVE)
3888     return _mm_max_epi16(a, b);
3889   #else
3890     simde__m128i_private
3891       r_,
3892       a_ = simde__m128i_to_private(a),
3893       b_ = simde__m128i_to_private(b);
3894 
3895     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3896       r_.neon_i16 = vmaxq_s16(a_.neon_i16, b_.neon_i16);
3897     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3898       r_.wasm_v128 = wasm_i16x8_max(a_.wasm_v128, b_.wasm_v128);
3899     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3900       r_.altivec_i16 = vec_max(a_.altivec_i16, b_.altivec_i16);
3901     #else
3902       SIMDE_VECTORIZE
3903       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
3904         r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? a_.i16[i] : b_.i16[i];
3905       }
3906     #endif
3907 
3908     return simde__m128i_from_private(r_);
3909   #endif
3910 }
3911 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3912   #define _mm_max_epi16(a, b) simde_mm_max_epi16(a, b)
3913 #endif
3914 
3915 SIMDE_FUNCTION_ATTRIBUTES
3916 simde__m128i
3917 simde_mm_max_epu8 (simde__m128i a, simde__m128i b) {
3918   #if defined(SIMDE_X86_SSE2_NATIVE)
3919     return _mm_max_epu8(a, b);
3920   #else
3921     simde__m128i_private
3922       r_,
3923       a_ = simde__m128i_to_private(a),
3924       b_ = simde__m128i_to_private(b);
3925 
3926     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3927       r_.neon_u8 = vmaxq_u8(a_.neon_u8, b_.neon_u8);
3928     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3929       r_.wasm_v128 = wasm_u8x16_max(a_.wasm_v128, b_.wasm_v128);
3930     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3931       r_.altivec_u8 = vec_max(a_.altivec_u8, b_.altivec_u8);
3932     #else
3933       SIMDE_VECTORIZE
3934       for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
3935         r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i];
3936       }
3937     #endif
3938 
3939     return simde__m128i_from_private(r_);
3940   #endif
3941 }
3942 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3943   #define _mm_max_epu8(a, b) simde_mm_max_epu8(a, b)
3944 #endif
3945 
3946 SIMDE_FUNCTION_ATTRIBUTES
3947 simde__m128d
3948 simde_mm_max_pd (simde__m128d a, simde__m128d b) {
3949   #if defined(SIMDE_X86_SSE2_NATIVE)
3950     return _mm_max_pd(a, b);
3951   #else
3952     simde__m128d_private
3953       r_,
3954       a_ = simde__m128d_to_private(a),
3955       b_ = simde__m128d_to_private(b);
3956 
3957     #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
3958       r_.altivec_f64 = vec_max(a_.altivec_f64, b_.altivec_f64);
3959     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3960       r_.wasm_v128 = wasm_f64x2_max(a_.wasm_v128, b_.wasm_v128);
3961     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3962       r_.neon_f64 = vmaxq_f64(a_.neon_f64, b_.neon_f64);
3963     #else
3964       SIMDE_VECTORIZE
3965       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
3966         r_.f64[i] = (a_.f64[i] > b_.f64[i]) ? a_.f64[i] : b_.f64[i];
3967       }
3968     #endif
3969 
3970     return simde__m128d_from_private(r_);
3971   #endif
3972 }
3973 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3974   #define _mm_max_pd(a, b) simde_mm_max_pd(a, b)
3975 #endif
3976 
3977 SIMDE_FUNCTION_ATTRIBUTES
3978 simde__m128d
3979 simde_mm_max_sd (simde__m128d a, simde__m128d b) {
3980   #if defined(SIMDE_X86_SSE2_NATIVE)
3981     return _mm_max_sd(a, b);
3982   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
3983     return simde_mm_move_sd(a, simde_mm_max_pd(a, b));
3984   #else
3985     simde__m128d_private
3986       r_,
3987       a_ = simde__m128d_to_private(a),
3988       b_ = simde__m128d_to_private(b);
3989 
3990     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3991       float64x2_t temp = vmaxq_f64(a_.neon_f64, b_.neon_f64);
3992       r_.neon_f64 = vsetq_lane_f64(vgetq_lane_f64(a_.neon_f64, 1), temp, 1);
3993     #else
3994       r_.f64[0] = (a_.f64[0] > b_.f64[0]) ? a_.f64[0] : b_.f64[0];
3995       r_.f64[1] = a_.f64[1];
3996     #endif
3997 
3998     return simde__m128d_from_private(r_);
3999   #endif
4000 }
4001 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4002   #define _mm_max_sd(a, b) simde_mm_max_sd(a, b)
4003 #endif
4004 
4005 SIMDE_FUNCTION_ATTRIBUTES
4006 simde__m128i
4007 simde_mm_move_epi64 (simde__m128i a) {
4008   #if defined(SIMDE_X86_SSE2_NATIVE)
4009     return _mm_move_epi64(a);
4010   #else
4011     simde__m128i_private
4012       r_,
4013       a_ = simde__m128i_to_private(a);
4014 
4015     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4016       r_.neon_i64 = vsetq_lane_s64(0, a_.neon_i64, 1);
4017     #else
4018       r_.i64[0] = a_.i64[0];
4019       r_.i64[1] = 0;
4020     #endif
4021 
4022     return simde__m128i_from_private(r_);
4023   #endif
4024 }
4025 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4026   #define _mm_move_epi64(a) simde_mm_move_epi64(a)
4027 #endif
4028 
4029 SIMDE_FUNCTION_ATTRIBUTES
4030 simde__m128i
4031 simde_mm_mul_epu32 (simde__m128i a, simde__m128i b) {
4032   #if defined(SIMDE_X86_SSE2_NATIVE)
4033     return _mm_mul_epu32(a, b);
4034   #else
4035     simde__m128i_private
4036       r_,
4037       a_ = simde__m128i_to_private(a),
4038       b_ = simde__m128i_to_private(b);
4039 
4040     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
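           /* vmovn_u64 keeps the low 32 bits of each 64-bit lane (the even 32-bit elements); vmull_u32 then produces the full 64-bit products. */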
4041       uint32x2_t a_lo = vmovn_u64(a_.neon_u64);
4042       uint32x2_t b_lo = vmovn_u64(b_.neon_u64);
4043       r_.neon_u64 = vmull_u32(a_lo, b_lo);
4044     #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
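           /* Zero the odd 32-bit lanes so each 64-bit element is a zero-extended even lane, then multiply as 64-bit integers. */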
4045       __typeof__(a_.u32) z = { 0, };
4046       a_.u32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.u32, z, 0, 4, 2, 6);
4047       b_.u32 = SIMDE_SHUFFLE_VECTOR_(32, 16, b_.u32, z, 0, 4, 2, 6);
4048       r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), a_.u32) *
4049                HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), b_.u32);
4050     #else
4051       SIMDE_VECTORIZE
4052       for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
4053         r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[i * 2]) * HEDLEY_STATIC_CAST(uint64_t, b_.u32[i * 2]);
4054       }
4055     #endif
4056 
4057     return simde__m128i_from_private(r_);
4058   #endif
4059 }
4060 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4061   #define _mm_mul_epu32(a, b) simde_mm_mul_epu32(a, b)
4062 #endif
4063 
4064 SIMDE_FUNCTION_ATTRIBUTES
4065 simde__m128i
4066 simde_x_mm_mul_epi64 (simde__m128i a, simde__m128i b) {
4067   simde__m128i_private
4068     r_,
4069     a_ = simde__m128i_to_private(a),
4070     b_ = simde__m128i_to_private(b);
4071 
4072   #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
4073     r_.i64 = a_.i64 * b_.i64;
4076   #else
4077     SIMDE_VECTORIZE
4078     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
4079       r_.i64[i] = a_.i64[i] * b_.i64[i];
4080     }
4081   #endif
4082 
4083   return simde__m128i_from_private(r_);
4084 }
4085 
4086 SIMDE_FUNCTION_ATTRIBUTES
4087 simde__m128i
4088 simde_x_mm_mod_epi64 (simde__m128i a, simde__m128i b) {
4089   simde__m128i_private
4090     r_,
4091     a_ = simde__m128i_to_private(a),
4092     b_ = simde__m128i_to_private(b);
4093 
4094   #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
4095     r_.i64 = a_.i64 % b_.i64;
4096   #else
4097     SIMDE_VECTORIZE
4098     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
4099       r_.i64[i] = a_.i64[i] % b_.i64[i];
4100     }
4101   #endif
4102 
4103   return simde__m128i_from_private(r_);
4104 }
4105 
4106 SIMDE_FUNCTION_ATTRIBUTES
4107 simde__m128d
4108 simde_mm_mul_pd (simde__m128d a, simde__m128d b) {
4109   #if defined(SIMDE_X86_SSE2_NATIVE)
4110     return _mm_mul_pd(a, b);
4111   #else
4112     simde__m128d_private
4113       r_,
4114       a_ = simde__m128d_to_private(a),
4115       b_ = simde__m128d_to_private(b);
4116 
4117     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
4118       r_.f64 = a_.f64 * b_.f64;
4119     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4120       r_.neon_f64 = vmulq_f64(a_.neon_f64, b_.neon_f64);
4121     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4122       r_.wasm_v128 = wasm_f64x2_mul(a_.wasm_v128, b_.wasm_v128);
4123     #else
4124       SIMDE_VECTORIZE
4125       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
4126         r_.f64[i] = a_.f64[i] * b_.f64[i];
4127       }
4128     #endif
4129 
4130     return simde__m128d_from_private(r_);
4131   #endif
4132 }
4133 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4134   #define _mm_mul_pd(a, b) simde_mm_mul_pd(a, b)
4135 #endif
4136 
4137 SIMDE_FUNCTION_ATTRIBUTES
4138 simde__m128d
4139 simde_mm_mul_sd (simde__m128d a, simde__m128d b) {
4140   #if defined(SIMDE_X86_SSE2_NATIVE)
4141     return _mm_mul_sd(a, b);
4142   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
4143     return simde_mm_move_sd(a, simde_mm_mul_pd(a, b));
4144   #else
4145     simde__m128d_private
4146       r_,
4147       a_ = simde__m128d_to_private(a),
4148       b_ = simde__m128d_to_private(b);
4149 
4150     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4151       float64x2_t temp = vmulq_f64(a_.neon_f64, b_.neon_f64);
4152       r_.neon_f64 = vsetq_lane_f64(vgetq_lane_f64(a_.neon_f64, 1), temp, 1);
4153     #else
4154       r_.f64[0] = a_.f64[0] * b_.f64[0];
4155       r_.f64[1] = a_.f64[1];
4156     #endif
4157 
4158     return simde__m128d_from_private(r_);
4159   #endif
4160 }
4161 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4162   #define _mm_mul_sd(a, b) simde_mm_mul_sd(a, b)
4163 #endif
4164 
4165 SIMDE_FUNCTION_ATTRIBUTES
4166 simde__m64
4167 simde_mm_mul_su32 (simde__m64 a, simde__m64 b) {
4168   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
4169     return _mm_mul_su32(a, b);
4170   #else
4171     simde__m64_private
4172       r_,
4173       a_ = simde__m64_to_private(a),
4174       b_ = simde__m64_to_private(b);
4175 
4176     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4177       r_.u64[0] = vget_lane_u64(vget_low_u64(vmull_u32(vreinterpret_u32_s64(a_.neon_i64), vreinterpret_u32_s64(b_.neon_i64))), 0);
4178     #else
4179       r_.u64[0] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[0]) * HEDLEY_STATIC_CAST(uint64_t, b_.u32[0]);
4180     #endif
4181 
4182     return simde__m64_from_private(r_);
4183   #endif
4184 }
4185 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4186   #define _mm_mul_su32(a, b) simde_mm_mul_su32(a, b)
4187 #endif
4188 
4189 SIMDE_FUNCTION_ATTRIBUTES
4190 simde__m128i
4191 simde_mm_mulhi_epi16 (simde__m128i a, simde__m128i b) {
4192   #if defined(SIMDE_X86_SSE2_NATIVE)
4193     return _mm_mulhi_epi16(a, b);
4194   #else
4195     simde__m128i_private
4196       r_,
4197       a_ = simde__m128i_to_private(a),
4198       b_ = simde__m128i_to_private(b);
4199 
4200     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4201       int16x4_t a3210 = vget_low_s16(a_.neon_i16);
4202       int16x4_t b3210 = vget_low_s16(b_.neon_i16);
4203       int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
4204       #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4205         int32x4_t ab7654 = vmull_high_s16(a_.neon_i16, b_.neon_i16);
4206         r_.neon_i16 = vuzp2q_s16(vreinterpretq_s16_s32(ab3210), vreinterpretq_s16_s32(ab7654));
4207       #else
4208         int16x4_t a7654 = vget_high_s16(a_.neon_i16);
4209         int16x4_t b7654 = vget_high_s16(b_.neon_i16);
4210         int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
4211         uint16x8x2_t rv = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
4212         r_.neon_u16 = rv.val[1];
4213       #endif
4214     #else
4215       SIMDE_VECTORIZE
4216       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
4217         r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (HEDLEY_STATIC_CAST(uint32_t, HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) >> 16));
4218       }
4219     #endif
4220 
4221     return simde__m128i_from_private(r_);
4222   #endif
4223 }
4224 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4225   #define _mm_mulhi_epi16(a, b) simde_mm_mulhi_epi16(a, b)
4226 #endif
4227 
4228 SIMDE_FUNCTION_ATTRIBUTES
4229 simde__m128i
4230 simde_mm_mulhi_epu16 (simde__m128i a, simde__m128i b) {
4231   #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
4232     return _mm_mulhi_epu16(a, b);
4233   #else
4234     simde__m128i_private
4235       r_,
4236       a_ = simde__m128i_to_private(a),
4237       b_ = simde__m128i_to_private(b);
4238 
4239     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4240       uint16x4_t a3210 = vget_low_u16(a_.neon_u16);
4241       uint16x4_t b3210 = vget_low_u16(b_.neon_u16);
4242       uint32x4_t ab3210 = vmull_u16(a3210, b3210); /* 3333222211110000 */
4243       #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4244         uint32x4_t ab7654 = vmull_high_u16(a_.neon_u16, b_.neon_u16);
4245         r_.neon_u16 = vuzp2q_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
4246       #else
4247         uint16x4_t a7654 = vget_high_u16(a_.neon_u16);
4248         uint16x4_t b7654 = vget_high_u16(b_.neon_u16);
4249         uint32x4_t ab7654 = vmull_u16(a7654, b7654); /* 7777666655554444 */
4250         uint16x8x2_t neon_r = vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
4251         r_.neon_u16 = neon_r.val[1];
4252       #endif
4253     #else
4254       SIMDE_VECTORIZE
4255       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
4256         r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]) >> 16);
4257       }
4258     #endif
4259 
4260     return simde__m128i_from_private(r_);
4261   #endif
4262 }
4263 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4264   #define _mm_mulhi_epu16(a, b) simde_mm_mulhi_epu16(a, b)
4265 #endif
4266 
4267 SIMDE_FUNCTION_ATTRIBUTES
4268 simde__m128i
4269 simde_mm_mullo_epi16 (simde__m128i a, simde__m128i b) {
4270   #if defined(SIMDE_X86_SSE2_NATIVE)
4271     return _mm_mullo_epi16(a, b);
4272   #else
4273     simde__m128i_private
4274       r_,
4275       a_ = simde__m128i_to_private(a),
4276       b_ = simde__m128i_to_private(b);
4277 
4278     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4279       r_.neon_i16 = vmulq_s16(a_.neon_i16, b_.neon_i16);
4280     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
4281       (void) a_;
4282       (void) b_;
4283       r_.altivec_i16 = vec_mul(a_.altivec_i16, b_.altivec_i16);
4284     #else
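           /* Multiply in unsigned arithmetic to avoid signed-overflow undefined behaviour; only the low 16 bits of each product are kept. */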
4285       SIMDE_VECTORIZE
4286       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
4287         r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]));
4288       }
4289     #endif
4290 
4291     return simde__m128i_from_private(r_);
4292   #endif
4293 }
4294 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4295   #define _mm_mullo_epi16(a, b) simde_mm_mullo_epi16(a, b)
4296 #endif
4297 
4298 SIMDE_FUNCTION_ATTRIBUTES
4299 simde__m128d
4300 simde_mm_or_pd (simde__m128d a, simde__m128d b) {
4301   #if defined(SIMDE_X86_SSE2_NATIVE)
4302     return _mm_or_pd(a, b);
4303   #else
4304     simde__m128d_private
4305       r_,
4306       a_ = simde__m128d_to_private(a),
4307       b_ = simde__m128d_to_private(b);
4308 
4309     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
4310       r_.i32f = a_.i32f | b_.i32f;
4311     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4312       r_.wasm_v128 = wasm_v128_or(a_.wasm_v128, b_.wasm_v128);
4313     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4314       r_.neon_i64 = vorrq_s64(a_.neon_i64, b_.neon_i64);
4315     #else
4316       SIMDE_VECTORIZE
4317       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
4318         r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
4319       }
4320     #endif
4321 
4322     return simde__m128d_from_private(r_);
4323   #endif
4324 }
4325 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4326   #define _mm_or_pd(a, b) simde_mm_or_pd(a, b)
4327 #endif
4328 
4329 SIMDE_FUNCTION_ATTRIBUTES
4330 simde__m128i
4331 simde_mm_or_si128 (simde__m128i a, simde__m128i b) {
4332   #if defined(SIMDE_X86_SSE2_NATIVE)
4333     return _mm_or_si128(a, b);
4334   #else
4335     simde__m128i_private
4336       r_,
4337       a_ = simde__m128i_to_private(a),
4338       b_ = simde__m128i_to_private(b);
4339 
4340     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4341       r_.neon_i32 = vorrq_s32(a_.neon_i32, b_.neon_i32);
4342     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
4343       r_.altivec_i32 = vec_or(a_.altivec_i32, b_.altivec_i32);
4344     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
4345       r_.i32f = a_.i32f | b_.i32f;
4346     #else
4347       SIMDE_VECTORIZE
4348       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
4349         r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
4350       }
4351     #endif
4352 
4353     return simde__m128i_from_private(r_);
4354   #endif
4355 }
4356 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4357   #define _mm_or_si128(a, b) simde_mm_or_si128(a, b)
4358 #endif
4359 
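/* _mm_packs_epi16: narrow the 16-bit lanes to 8 bits with signed saturation;
 * a provides the low eight result bytes, b the high eight. */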
4360 SIMDE_FUNCTION_ATTRIBUTES
4361 simde__m128i
4362 simde_mm_packs_epi16 (simde__m128i a, simde__m128i b) {
4363   #if defined(SIMDE_X86_SSE2_NATIVE)
4364     return _mm_packs_epi16(a, b);
4365   #else
4366     simde__m128i_private
4367       r_,
4368       a_ = simde__m128i_to_private(a),
4369       b_ = simde__m128i_to_private(b);
4370 
4371     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4372       r_.neon_i8 = vcombine_s8(vqmovn_s16(a_.neon_i16), vqmovn_s16(b_.neon_i16));
4373     #else
4374       SIMDE_VECTORIZE
4375       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
4376         r_.i8[i]     = (a_.i16[i] > INT8_MAX) ? INT8_MAX : ((a_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, a_.i16[i]));
4377         r_.i8[i + 8] = (b_.i16[i] > INT8_MAX) ? INT8_MAX : ((b_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, b_.i16[i]));
4378       }
4379     #endif
4380 
4381     return simde__m128i_from_private(r_);
4382   #endif
4383 }
4384 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4385   #define _mm_packs_epi16(a, b) simde_mm_packs_epi16(a, b)
4386 #endif
4387 
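/* _mm_packs_epi32: narrow the 32-bit lanes to 16 bits with signed saturation;
 * a provides the low four result lanes, b the high four. */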
4388 SIMDE_FUNCTION_ATTRIBUTES
4389 simde__m128i
4390 simde_mm_packs_epi32 (simde__m128i a, simde__m128i b) {
4391   #if defined(SIMDE_X86_SSE2_NATIVE)
4392     return _mm_packs_epi32(a, b);
4393   #else
4394     simde__m128i_private
4395       r_,
4396       a_ = simde__m128i_to_private(a),
4397       b_ = simde__m128i_to_private(b);
4398 
4399     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4400       r_.neon_i16 = vcombine_s16(vqmovn_s32(a_.neon_i32), vqmovn_s32(b_.neon_i32));
4401     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
4402       r_.altivec_i16 = vec_packs(a_.altivec_i32, b_.altivec_i32);
4403     #else
4404       SIMDE_VECTORIZE
4405       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
4406         r_.i16[i]     = (a_.i32[i] > INT16_MAX) ? INT16_MAX : ((a_.i32[i] < INT16_MIN) ? INT16_MIN : HEDLEY_STATIC_CAST(int16_t, a_.i32[i]));
4407         r_.i16[i + 4] = (b_.i32[i] > INT16_MAX) ? INT16_MAX : ((b_.i32[i] < INT16_MIN) ? INT16_MIN : HEDLEY_STATIC_CAST(int16_t, b_.i32[i]));
4408       }
4409     #endif
4410 
4411     return simde__m128i_from_private(r_);
4412   #endif
4413 }
4414 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4415   #define _mm_packs_epi32(a, b) simde_mm_packs_epi32(a, b)
4416 #endif
4417 
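/* _mm_packus_epi16: narrow signed 16-bit lanes to unsigned 8 bits with
 * saturation (negative values clamp to 0, values above 255 clamp to 255). */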
4418 SIMDE_FUNCTION_ATTRIBUTES
4419 simde__m128i
4420 simde_mm_packus_epi16 (simde__m128i a, simde__m128i b) {
4421   #if defined(SIMDE_X86_SSE2_NATIVE)
4422     return _mm_packus_epi16(a, b);
4423   #else
4424     simde__m128i_private
4425       r_,
4426       a_ = simde__m128i_to_private(a),
4427       b_ = simde__m128i_to_private(b);
4428 
4429     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4430       r_.neon_u8 = vcombine_u8(vqmovun_s16(a_.neon_i16), vqmovun_s16(b_.neon_i16));
4431     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
4432       r_.altivec_u8 = vec_packsu(a_.altivec_i16, b_.altivec_i16);
4433     #else
4434       SIMDE_VECTORIZE
4435       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
4436         r_.u8[i]     = (a_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((a_.i16[i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, a_.i16[i]));
4437         r_.u8[i + 8] = (b_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((b_.i16[i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, b_.i16[i]));
4438       }
4439     #endif
4440 
4441     return simde__m128i_from_private(r_);
4442   #endif
4443 }
4444 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4445   #define _mm_packus_epi16(a, b) simde_mm_packus_epi16(a, b)
4446 #endif
4447 
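/* _mm_pause: spin-wait hint used in busy-wait loops.  There is no portable
 * equivalent, so on non-x86 targets this is a no-op. */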
4448 SIMDE_FUNCTION_ATTRIBUTES
4449 void
4450 simde_mm_pause (void) {
4451   #if defined(SIMDE_X86_SSE2_NATIVE)
4452     _mm_pause();
4453   #endif
4454 }
4455 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4456   #define _mm_pause() (simde_mm_pause())
4457 #endif
4458 
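/* _mm_sad_epu8: sum of absolute differences of the unsigned 8-bit lanes.
 * Each 8-byte half is reduced to a single 16-bit sum stored in the low bits
 * of the corresponding 64-bit result lane. */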
4459 SIMDE_FUNCTION_ATTRIBUTES
4460 simde__m128i
4461 simde_mm_sad_epu8 (simde__m128i a, simde__m128i b) {
4462   #if defined(SIMDE_X86_SSE2_NATIVE)
4463     return _mm_sad_epu8(a, b);
4464   #else
4465     simde__m128i_private
4466       r_,
4467       a_ = simde__m128i_to_private(a),
4468       b_ = simde__m128i_to_private(b);
4469 
4470     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4471       const uint16x8_t t = vpaddlq_u8(vabdq_u8(a_.neon_u8, b_.neon_u8));
4472       r_.neon_u64 = vcombine_u64(
4473         vpaddl_u32(vpaddl_u16(vget_low_u16(t))),
4474         vpaddl_u32(vpaddl_u16(vget_high_u16(t))));
4475     #else
4476       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
4477         uint16_t tmp = 0;
4478         SIMDE_VECTORIZE_REDUCTION(+:tmp)
4479         for (size_t j = 0 ; j < ((sizeof(r_.u8) / sizeof(r_.u8[0])) / 2) ; j++) {
4480           const size_t e = j + (i * 8);
4481           tmp += (a_.u8[e] > b_.u8[e]) ? (a_.u8[e] - b_.u8[e]) : (b_.u8[e] - a_.u8[e]);
4482         }
4483         r_.i64[i] = tmp;
4484       }
4485     #endif
4486 
4487     return simde__m128i_from_private(r_);
4488   #endif
4489 }
4490 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4491   #define _mm_sad_epu8(a, b) simde_mm_sad_epu8(a, b)
4492 #endif
4493 
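/* The set functions take their arguments from the most significant element
 * down to e0, but e0 lands in the lowest lane, matching Intel's declarations. */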
4494 SIMDE_FUNCTION_ATTRIBUTES
4495 simde__m128i
4496 simde_mm_set_epi8 (int8_t e15, int8_t e14, int8_t e13, int8_t e12,
4497        int8_t e11, int8_t e10, int8_t  e9, int8_t  e8,
4498        int8_t  e7, int8_t  e6, int8_t  e5, int8_t  e4,
4499        int8_t  e3, int8_t  e2, int8_t  e1, int8_t  e0) {
4500 
4501   #if defined(SIMDE_X86_SSE2_NATIVE)
4502     return _mm_set_epi8(
4503       e15, e14, e13, e12, e11, e10,  e9,  e8,
4504        e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0);
4505   #else
4506     simde__m128i_private r_;
4507 
4508     #if defined(SIMDE_WASM_SIMD128_NATIVE)
4509       r_.wasm_v128 = wasm_i8x16_make(
4510          e0,  e1,  e2,  e3,  e4,  e5,  e6,  e7,
4511          e8,  e9, e10, e11, e12, e13, e14, e15);
4512     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4513       SIMDE_ALIGN_LIKE_16(int8x16_t) int8_t data[16] = {
4514         e0,  e1,  e2,  e3,
4515         e4,  e5,  e6,  e7,
4516         e8,  e9,  e10, e11,
4517         e12, e13, e14, e15};
4518       r_.neon_i8 = vld1q_s8(data);
4519     #else
4520       r_.i8[ 0] =  e0;
4521       r_.i8[ 1] =  e1;
4522       r_.i8[ 2] =  e2;
4523       r_.i8[ 3] =  e3;
4524       r_.i8[ 4] =  e4;
4525       r_.i8[ 5] =  e5;
4526       r_.i8[ 6] =  e6;
4527       r_.i8[ 7] =  e7;
4528       r_.i8[ 8] =  e8;
4529       r_.i8[ 9] =  e9;
4530       r_.i8[10] = e10;
4531       r_.i8[11] = e11;
4532       r_.i8[12] = e12;
4533       r_.i8[13] = e13;
4534       r_.i8[14] = e14;
4535       r_.i8[15] = e15;
4536     #endif
4537 
4538     return simde__m128i_from_private(r_);
4539   #endif
4540 }
4541 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4542   #define _mm_set_epi8(e15, e14, e13, e12, e11, e10,  e9,  e8,  e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0) simde_mm_set_epi8(e15, e14, e13, e12, e11, e10,  e9,  e8,  e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0)
4543 #endif
4544 
4545 SIMDE_FUNCTION_ATTRIBUTES
4546 simde__m128i
4547 simde_mm_set_epi16 (int16_t e7, int16_t e6, int16_t e5, int16_t e4,
4548         int16_t e3, int16_t e2, int16_t e1, int16_t e0) {
4549   #if defined(SIMDE_X86_SSE2_NATIVE)
4550     return _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
4551   #else
4552     simde__m128i_private r_;
4553 
4554     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4555       SIMDE_ALIGN_LIKE_16(int16x8_t) int16_t data[8] = { e0, e1, e2, e3, e4, e5, e6, e7 };
4556       r_.neon_i16 = vld1q_s16(data);
4557     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4558       r_.wasm_v128 = wasm_i16x8_make(e0, e1, e2, e3, e4, e5, e6, e7);
4559     #else
4560       r_.i16[0] = e0;
4561       r_.i16[1] = e1;
4562       r_.i16[2] = e2;
4563       r_.i16[3] = e3;
4564       r_.i16[4] = e4;
4565       r_.i16[5] = e5;
4566       r_.i16[6] = e6;
4567       r_.i16[7] = e7;
4568     #endif
4569 
4570     return simde__m128i_from_private(r_);
4571   #endif
4572 }
4573 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4574   #define _mm_set_epi16(e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0) simde_mm_set_epi16(e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0)
4575 #endif
4576 
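/* _mm_loadu_si16: load one unaligned 16-bit value into the low lane of an
 * otherwise-zero vector.  The fallback goes through memcpy so the load is
 * valid for any alignment and does not violate strict aliasing. */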
4577 SIMDE_FUNCTION_ATTRIBUTES
4578 simde__m128i
4579 simde_mm_loadu_si16 (void const* mem_addr) {
4580   #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
4581       SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
4582       HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
4583       HEDLEY_INTEL_VERSION_CHECK(20,21,1))
4584     return _mm_loadu_si16(mem_addr);
4585   #else
4586     int16_t val;
4587     simde_memcpy(&val, mem_addr, sizeof(val));
4588     return simde_x_mm_cvtsi16_si128(val);
4589   #endif
4590 }
4591 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4592   #define _mm_loadu_si16(mem_addr) simde_mm_loadu_si16(mem_addr)
4593 #endif
4594 
4595 SIMDE_FUNCTION_ATTRIBUTES
4596 simde__m128i
4597 simde_mm_set_epi32 (int32_t e3, int32_t e2, int32_t e1, int32_t e0) {
4598   #if defined(SIMDE_X86_SSE2_NATIVE)
4599     return _mm_set_epi32(e3, e2, e1, e0);
4600   #else
4601     simde__m128i_private r_;
4602 
4603     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4604       SIMDE_ALIGN_LIKE_16(int32x4_t) int32_t data[4] = { e0, e1, e2, e3 };
4605       r_.neon_i32 = vld1q_s32(data);
4606     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4607       r_.wasm_v128 = wasm_i32x4_make(e0, e1, e2, e3);
4608     #else
4609       r_.i32[0] = e0;
4610       r_.i32[1] = e1;
4611       r_.i32[2] = e2;
4612       r_.i32[3] = e3;
4613     #endif
4614 
4615     return simde__m128i_from_private(r_);
4616   #endif
4617 }
4618 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4619   #define _mm_set_epi32(e3,  e2,  e1,  e0) simde_mm_set_epi32(e3,  e2,  e1,  e0)
4620 #endif
4621 
4622 SIMDE_FUNCTION_ATTRIBUTES
4623 simde__m128i
4624 simde_mm_loadu_si32 (void const* mem_addr) {
4625   #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
4626       SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
4627       HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
4628       HEDLEY_INTEL_VERSION_CHECK(20,21,1))
4629     return _mm_loadu_si32(mem_addr);
4630   #else
4631     int32_t val;
4632     simde_memcpy(&val, mem_addr, sizeof(val));
4633     return simde_mm_cvtsi32_si128(val);
4634   #endif
4635 }
4636 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4637   #define _mm_loadu_si32(mem_addr) simde_mm_loadu_si32(mem_addr)
4638 #endif
4639 
4640 SIMDE_FUNCTION_ATTRIBUTES
4641 simde__m128i
4642 simde_mm_set_epi64 (simde__m64 e1, simde__m64 e0) {
4643   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
4644     return _mm_set_epi64(e1, e0);
4645   #else
4646     simde__m128i_private r_;
4647 
4648     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4649       r_.neon_i64 = vcombine_s64(simde__m64_to_neon_i64(e0), simde__m64_to_neon_i64(e1));
4650     #else
4651       r_.m64[0] = e0;
4652       r_.m64[1] = e1;
4653     #endif
4654 
4655     return simde__m128i_from_private(r_);
4656   #endif
4657 }
4658 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4659   #define _mm_set_epi64(e1, e0) (simde_mm_set_epi64((e1), (e0)))
4660 #endif
4661 
4662 SIMDE_FUNCTION_ATTRIBUTES
4663 simde__m128i
4664 simde_mm_set_epi64x (int64_t e1, int64_t e0) {
4665   #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0))
4666     return _mm_set_epi64x(e1, e0);
4667   #else
4668     simde__m128i_private r_;
4669 
4670     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4671       SIMDE_ALIGN_LIKE_16(int64x2_t) int64_t data[2] = {e0, e1};
4672       r_.neon_i64 = vld1q_s64(data);
4673     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4674       r_.wasm_v128 = wasm_i64x2_make(e0, e1);
4675     #else
4676       r_.i64[0] = e0;
4677       r_.i64[1] = e1;
4678     #endif
4679 
4680     return simde__m128i_from_private(r_);
4681   #endif
4682 }
4683 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4684   #define _mm_set_epi64x(e1, e0) simde_mm_set_epi64x(e1, e0)
4685 #endif
4686 
4687 SIMDE_FUNCTION_ATTRIBUTES
4688 simde__m128i
4689 simde_mm_loadu_si64 (void const* mem_addr) {
4690   #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
4691       SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
4692       HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
4693       HEDLEY_INTEL_VERSION_CHECK(20,21,1))
4694     return _mm_loadu_si64(mem_addr);
4695   #else
4696     int64_t val;
4697     simde_memcpy(&val, mem_addr, sizeof(val));
4698     return simde_mm_cvtsi64_si128(val);
4699   #endif
4700 }
4701 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4702   #define _mm_loadu_si64(mem_addr) simde_mm_loadu_si64(mem_addr)
4703 #endif
4704 
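/* The simde_x_mm_set_* functions below are SIMDe-specific helpers (there is
 * no corresponding _mm_* intrinsic); they build vectors from unsigned
 * element values. */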
4705 SIMDE_FUNCTION_ATTRIBUTES
4706 simde__m128i
4707 simde_x_mm_set_epu8 (uint8_t e15, uint8_t e14, uint8_t e13, uint8_t e12,
4708          uint8_t e11, uint8_t e10, uint8_t  e9, uint8_t  e8,
4709          uint8_t  e7, uint8_t  e6, uint8_t  e5, uint8_t  e4,
4710          uint8_t  e3, uint8_t  e2, uint8_t  e1, uint8_t  e0) {
4711   #if defined(SIMDE_X86_SSE2_NATIVE)
4712     return _mm_set_epi8(
4713       HEDLEY_STATIC_CAST(char, e15), HEDLEY_STATIC_CAST(char, e14), HEDLEY_STATIC_CAST(char, e13), HEDLEY_STATIC_CAST(char, e12),
4714       HEDLEY_STATIC_CAST(char, e11), HEDLEY_STATIC_CAST(char, e10), HEDLEY_STATIC_CAST(char,  e9), HEDLEY_STATIC_CAST(char,  e8),
4715       HEDLEY_STATIC_CAST(char,  e7), HEDLEY_STATIC_CAST(char,  e6), HEDLEY_STATIC_CAST(char,  e5), HEDLEY_STATIC_CAST(char,  e4),
4716       HEDLEY_STATIC_CAST(char,  e3), HEDLEY_STATIC_CAST(char,  e2), HEDLEY_STATIC_CAST(char,  e1), HEDLEY_STATIC_CAST(char,  e0));
4717   #else
4718     simde__m128i_private r_;
4719 
4720     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4721       SIMDE_ALIGN_LIKE_16(uint8x16_t) uint8_t data[16] = {
4722         e0,  e1,  e2,  e3,
4723         e4,  e5,  e6,  e7,
4724         e8,  e9,  e10, e11,
4725         e12, e13, e14, e15};
4726       r_.neon_u8 = vld1q_u8(data);
4727     #else
4728       r_.u8[ 0] =  e0; r_.u8[ 1] =  e1; r_.u8[ 2] =  e2; r_.u8[ 3] =  e3;
4729       r_.u8[ 4] =  e4; r_.u8[ 5] =  e5; r_.u8[ 6] =  e6; r_.u8[ 7] =  e7;
4730       r_.u8[ 8] =  e8; r_.u8[ 9] =  e9; r_.u8[10] = e10; r_.u8[11] = e11;
4731       r_.u8[12] = e12; r_.u8[13] = e13; r_.u8[14] = e14; r_.u8[15] = e15;
4732     #endif
4733 
4734     return simde__m128i_from_private(r_);
4735   #endif
4736 }
4737 
4738 SIMDE_FUNCTION_ATTRIBUTES
4739 simde__m128i
4740 simde_x_mm_set_epu16 (uint16_t e7, uint16_t e6, uint16_t e5, uint16_t e4,
4741           uint16_t e3, uint16_t e2, uint16_t e1, uint16_t e0) {
4742   #if defined(SIMDE_X86_SSE2_NATIVE)
4743     return _mm_set_epi16(
4744       HEDLEY_STATIC_CAST(short,  e7), HEDLEY_STATIC_CAST(short,  e6), HEDLEY_STATIC_CAST(short,  e5), HEDLEY_STATIC_CAST(short,  e4),
4745       HEDLEY_STATIC_CAST(short,  e3), HEDLEY_STATIC_CAST(short,  e2), HEDLEY_STATIC_CAST(short,  e1), HEDLEY_STATIC_CAST(short,  e0));
4746   #else
4747     simde__m128i_private r_;
4748 
4749     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4750       SIMDE_ALIGN_LIKE_16(uint16x8_t) uint16_t data[8] = { e0, e1, e2, e3, e4, e5, e6, e7 };
4751       r_.neon_u16 = vld1q_u16(data);
4752     #else
4753       r_.u16[0] = e0; r_.u16[1] = e1; r_.u16[2] = e2; r_.u16[3] = e3;
4754       r_.u16[4] = e4; r_.u16[5] = e5; r_.u16[6] = e6; r_.u16[7] = e7;
4755     #endif
4756 
4757     return simde__m128i_from_private(r_);
4758   #endif
4759 }
4760 
4761 SIMDE_FUNCTION_ATTRIBUTES
4762 simde__m128i
4763 simde_x_mm_set_epu32 (uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) {
4764   #if defined(SIMDE_X86_SSE2_NATIVE)
4765     return _mm_set_epi32(
4766       HEDLEY_STATIC_CAST(int,  e3), HEDLEY_STATIC_CAST(int,  e2), HEDLEY_STATIC_CAST(int,  e1), HEDLEY_STATIC_CAST(int,  e0));
4767   #else
4768     simde__m128i_private r_;
4769 
4770     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4771       SIMDE_ALIGN_LIKE_16(uint32x4_t) uint32_t data[4] = { e0, e1, e2, e3 };
4772       r_.neon_u32 = vld1q_u32(data);
4773     #else
4774       r_.u32[0] = e0;
4775       r_.u32[1] = e1;
4776       r_.u32[2] = e2;
4777       r_.u32[3] = e3;
4778     #endif
4779 
4780     return simde__m128i_from_private(r_);
4781   #endif
4782 }
4783 
4784 SIMDE_FUNCTION_ATTRIBUTES
4785 simde__m128i
4786 simde_x_mm_set_epu64x (uint64_t e1, uint64_t e0) {
4787   #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0))
4788     return _mm_set_epi64x(HEDLEY_STATIC_CAST(int64_t,  e1), HEDLEY_STATIC_CAST(int64_t,  e0));
4789   #else
4790     simde__m128i_private r_;
4791 
4792     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4793       SIMDE_ALIGN_LIKE_16(uint64x2_t) uint64_t data[2] = {e0, e1};
4794       r_.neon_u64 = vld1q_u64(data);
4795     #else
4796       r_.u64[0] = e0;
4797       r_.u64[1] = e1;
4798     #endif
4799 
4800     return simde__m128i_from_private(r_);
4801   #endif
4802 }
4803 
4804 SIMDE_FUNCTION_ATTRIBUTES
4805 simde__m128d
4806 simde_mm_set_sd (simde_float64 a) {
4807   #if defined(SIMDE_X86_SSE2_NATIVE)
4808     return _mm_set_sd(a);
4809   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4810     return vsetq_lane_f64(a, vdupq_n_f64(SIMDE_FLOAT64_C(0.0)), 0);
4811   #else
4812     return simde_mm_set_pd(SIMDE_FLOAT64_C(0.0), a);
4813   #endif
4814 }
4815 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4816   #define _mm_set_sd(a) simde_mm_set_sd(a)
4817 #endif
4818 
4819 SIMDE_FUNCTION_ATTRIBUTES
4820 simde__m128i
4821 simde_mm_set1_epi8 (int8_t a) {
4822   #if defined(SIMDE_X86_SSE2_NATIVE)
4823     return _mm_set1_epi8(a);
4824   #else
4825     simde__m128i_private r_;
4826 
4827     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4828       r_.neon_i8 = vdupq_n_s8(a);
4829     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4830       r_.wasm_v128 = wasm_i8x16_splat(a);
4831     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4832       r_.altivec_i8 = vec_splats(HEDLEY_STATIC_CAST(signed char, a));
4833     #else
4834       SIMDE_VECTORIZE
4835       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
4836         r_.i8[i] = a;
4837       }
4838     #endif
4839 
4840     return simde__m128i_from_private(r_);
4841   #endif
4842 }
4843 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4844   #define _mm_set1_epi8(a) simde_mm_set1_epi8(a)
4845 #endif
4846 
4847 SIMDE_FUNCTION_ATTRIBUTES
4848 simde__m128i
4849 simde_mm_set1_epi16 (int16_t a) {
4850   #if defined(SIMDE_X86_SSE2_NATIVE)
4851     return _mm_set1_epi16(a);
4852   #else
4853     simde__m128i_private r_;
4854 
4855     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4856       r_.neon_i16 = vdupq_n_s16(a);
4857     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4858       r_.wasm_v128 = wasm_i16x8_splat(a);
4859     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4860       r_.altivec_i16 = vec_splats(HEDLEY_STATIC_CAST(signed short, a));
4861     #else
4862       SIMDE_VECTORIZE
4863       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
4864         r_.i16[i] = a;
4865       }
4866     #endif
4867 
4868     return simde__m128i_from_private(r_);
4869   #endif
4870 }
4871 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4872   #define _mm_set1_epi16(a) simde_mm_set1_epi16(a)
4873 #endif
4874 
4875 SIMDE_FUNCTION_ATTRIBUTES
4876 simde__m128i
4877 simde_mm_set1_epi32 (int32_t a) {
4878   #if defined(SIMDE_X86_SSE2_NATIVE)
4879     return _mm_set1_epi32(a);
4880   #else
4881     simde__m128i_private r_;
4882 
4883     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4884       r_.neon_i32 = vdupq_n_s32(a);
4885     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4886       r_.wasm_v128 = wasm_i32x4_splat(a);
4887     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4888       r_.altivec_i32 = vec_splats(HEDLEY_STATIC_CAST(signed int, a));
4889     #else
4890       SIMDE_VECTORIZE
4891       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
4892         r_.i32[i] = a;
4893       }
4894     #endif
4895 
4896     return simde__m128i_from_private(r_);
4897   #endif
4898 }
4899 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4900   #define _mm_set1_epi32(a) simde_mm_set1_epi32(a)
4901 #endif
4902 
4903 SIMDE_FUNCTION_ATTRIBUTES
4904 simde__m128i
4905 simde_mm_set1_epi64x (int64_t a) {
4906   #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0))
4907     return _mm_set1_epi64x(a);
4908   #else
4909     simde__m128i_private r_;
4910 
4911     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4912       r_.neon_i64 = vdupq_n_s64(a);
4913     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4914       r_.wasm_v128 = wasm_i64x2_splat(a);
4915     #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4916       r_.altivec_i64 = vec_splats(HEDLEY_STATIC_CAST(signed long long, a));
4917     #else
4918       SIMDE_VECTORIZE
4919       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
4920         r_.i64[i] = a;
4921       }
4922     #endif
4923 
4924     return simde__m128i_from_private(r_);
4925   #endif
4926 }
4927 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4928   #define _mm_set1_epi64x(a) simde_mm_set1_epi64x(a)
4929 #endif
4930 
4931 SIMDE_FUNCTION_ATTRIBUTES
4932 simde__m128i
4933 simde_mm_set1_epi64 (simde__m64 a) {
4934   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
4935     return _mm_set1_epi64(a);
4936   #else
4937     simde__m64_private a_ = simde__m64_to_private(a);
4938     return simde_mm_set1_epi64x(a_.i64[0]);
4939   #endif
4940 }
4941 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4942   #define _mm_set1_epi64(a) simde_mm_set1_epi64(a)
4943 #endif
4944 
4945 SIMDE_FUNCTION_ATTRIBUTES
4946 simde__m128i
4947 simde_x_mm_set1_epu8 (uint8_t value) {
4948   #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4949     return simde__m128i_from_altivec_u8(vec_splats(HEDLEY_STATIC_CAST(unsigned char, value)));
4950   #else
4951     return simde_mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, value));
4952   #endif
4953 }
4954 
4955 SIMDE_FUNCTION_ATTRIBUTES
4956 simde__m128i
4957 simde_x_mm_set1_epu16 (uint16_t value) {
4958   #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4959     return simde__m128i_from_altivec_u16(vec_splats(HEDLEY_STATIC_CAST(unsigned short, value)));
4960   #else
4961     return simde_mm_set1_epi16(HEDLEY_STATIC_CAST(int16_t, value));
4962   #endif
4963 }
4964 
4965 SIMDE_FUNCTION_ATTRIBUTES
4966 simde__m128i
4967 simde_x_mm_set1_epu32 (uint32_t value) {
4968   #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4969     return simde__m128i_from_altivec_u32(vec_splats(HEDLEY_STATIC_CAST(unsigned int, value)));
4970   #else
4971     return simde_mm_set1_epi32(HEDLEY_STATIC_CAST(int32_t, value));
4972   #endif
4973 }
4974 
4975 SIMDE_FUNCTION_ATTRIBUTES
4976 simde__m128i
4977 simde_x_mm_set1_epu64 (uint64_t value) {
4978   #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4979     return simde__m128i_from_altivec_u64(vec_splats(HEDLEY_STATIC_CAST(unsigned long long, value)));
4980   #else
4981     return simde_mm_set1_epi64x(HEDLEY_STATIC_CAST(int64_t, value));
4982   #endif
4983 }
4984 
4985 SIMDE_FUNCTION_ATTRIBUTES
4986 simde__m128i
4987 simde_mm_setr_epi8 (int8_t e15, int8_t e14, int8_t e13, int8_t e12,
4988         int8_t e11, int8_t e10, int8_t  e9, int8_t  e8,
4989         int8_t  e7, int8_t  e6, int8_t  e5, int8_t  e4,
4990         int8_t  e3, int8_t  e2, int8_t  e1, int8_t  e0) {
4991   #if defined(SIMDE_X86_SSE2_NATIVE)
4992     return _mm_setr_epi8(
4993       e15, e14, e13, e12, e11, e10,  e9,  e8,
4994        e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0);
4995   #else
4996     return simde_mm_set_epi8(
4997       e0, e1, e2, e3, e4, e5, e6, e7,
4998       e8, e9, e10, e11, e12, e13, e14, e15);
4999   #endif
5000 }
5001 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5002   #define _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)
5003 #endif
5004 
5005 SIMDE_FUNCTION_ATTRIBUTES
5006 simde__m128i
5007 simde_mm_setr_epi16 (int16_t e7, int16_t e6, int16_t e5, int16_t e4,
5008          int16_t e3, int16_t e2, int16_t e1, int16_t e0) {
5009   #if defined(SIMDE_X86_SSE2_NATIVE)
5010     return _mm_setr_epi16(e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0);
5011   #else
5012     return simde_mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7);
5013   #endif
5014 }
5015 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5016   #define _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0)
5017 #endif
5018 
5019 SIMDE_FUNCTION_ATTRIBUTES
5020 simde__m128i
5021 simde_mm_setr_epi32 (int32_t e3, int32_t e2, int32_t e1, int32_t e0) {
5022   #if defined(SIMDE_X86_SSE2_NATIVE)
5023     return _mm_setr_epi32(e3, e2, e1, e0);
5024   #else
5025     return simde_mm_set_epi32(e0, e1, e2, e3);
5026   #endif
5027 }
5028 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5029   #define _mm_setr_epi32(e3, e2, e1, e0) simde_mm_setr_epi32(e3, e2, e1, e0)
5030 #endif
5031 
5032 SIMDE_FUNCTION_ATTRIBUTES
5033 simde__m128i
5034 simde_mm_setr_epi64 (simde__m64 e1, simde__m64 e0) {
5035   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
5036     return _mm_setr_epi64(e1, e0);
5037   #else
5038     return simde_mm_set_epi64(e0, e1);
5039   #endif
5040 }
5041 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5042   #define _mm_setr_epi64(e1, e0) (simde_mm_setr_epi64((e1), (e0)))
5043 #endif
5044 
5045 SIMDE_FUNCTION_ATTRIBUTES
5046 simde__m128d
5047 simde_mm_setr_pd (simde_float64 e1, simde_float64 e0) {
5048   #if defined(SIMDE_X86_SSE2_NATIVE)
5049     return _mm_setr_pd(e1, e0);
5050   #else
5051     return simde_mm_set_pd(e0, e1);
5052   #endif
5053 }
5054 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5055   #define _mm_setr_pd(e1, e0) simde_mm_setr_pd(e1, e0)
5056 #endif
5057 
5058 SIMDE_FUNCTION_ATTRIBUTES
5059 simde__m128d
5060 simde_mm_setzero_pd (void) {
5061   #if defined(SIMDE_X86_SSE2_NATIVE)
5062     return _mm_setzero_pd();
5063   #else
5064     return simde_mm_castsi128_pd(simde_mm_setzero_si128());
5065   #endif
5066 }
5067 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5068   #define _mm_setzero_pd() simde_mm_setzero_pd()
5069 #endif
5070 
5071 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
5072 HEDLEY_DIAGNOSTIC_PUSH
5073 SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
5074 #endif
5075 
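/* _mm_undefined_pd / _mm_undefined_si128 return a vector whose contents are
 * unspecified.  When the uninitialized-variable diagnostic cannot be
 * suppressed locally, the fallback zero-initializes instead. */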
5076 SIMDE_FUNCTION_ATTRIBUTES
5077 simde__m128d
5078 simde_mm_undefined_pd (void) {
5079   simde__m128d_private r_;
5080 
5081   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
5082     r_.n = _mm_undefined_pd();
5083   #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
5084     r_ = simde__m128d_to_private(simde_mm_setzero_pd());
5085   #endif
5086 
5087   return simde__m128d_from_private(r_);
5088 }
5089 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5090   #define _mm_undefined_pd() simde_mm_undefined_pd()
5091 #endif
5092 
5093 SIMDE_FUNCTION_ATTRIBUTES
5094 simde__m128i
5095 simde_mm_undefined_si128 (void) {
5096   simde__m128i_private r_;
5097 
5098   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
5099     r_.n = _mm_undefined_si128();
5100   #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
5101     r_ = simde__m128i_to_private(simde_mm_setzero_si128());
5102   #endif
5103 
5104   return simde__m128i_from_private(r_);
5105 }
5106 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5107   #define _mm_undefined_si128() (simde_mm_undefined_si128())
5108 #endif
5109 
5110 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
5111 HEDLEY_DIAGNOSTIC_POP
5112 #endif
5113 
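/* simde_x_mm_setone_pd / simde_x_mm_setone_si128: SIMDe helpers returning a
 * vector with every bit set, implemented by casting the single-precision
 * helper. */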
5114 SIMDE_FUNCTION_ATTRIBUTES
5115 simde__m128d
5116 simde_x_mm_setone_pd (void) {
5117   return simde_mm_castps_pd(simde_x_mm_setone_ps());
5118 }
5119 
5120 SIMDE_FUNCTION_ATTRIBUTES
5121 simde__m128i
5122 simde_x_mm_setone_si128 (void) {
5123   return simde_mm_castps_si128(simde_x_mm_setone_ps());
5124 }
5125 
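/* _mm_shuffle_epi32: result lane i is selected from a by the 2-bit field
 * ((imm8 >> (2 * i)) & 3). */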
5126 SIMDE_FUNCTION_ATTRIBUTES
5127 simde__m128i
5128 simde_mm_shuffle_epi32 (simde__m128i a, const int imm8)
5129     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
5130   simde__m128i_private
5131     r_,
5132     a_ = simde__m128i_to_private(a);
5133 
5134   for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
5135     r_.i32[i] = a_.i32[(imm8 >> (i * 2)) & 3];
5136   }
5137 
5138   return simde__m128i_from_private(r_);
5139 }
5140 #if defined(SIMDE_X86_SSE2_NATIVE)
5141   #define simde_mm_shuffle_epi32(a, imm8) _mm_shuffle_epi32((a), (imm8))
5142 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5143   #define simde_mm_shuffle_epi32(a, imm8)                                   \
5144     __extension__({                                                         \
5145         int32x4_t ret;                                                      \
5146         ret = vmovq_n_s32(                                                  \
5147             vgetq_lane_s32(vreinterpretq_s32_s64(a), (imm8) & (0x3)));     \
5148         ret = vsetq_lane_s32(                                               \
5149             vgetq_lane_s32(vreinterpretq_s32_s64(a), ((imm8) >> 2) & 0x3), \
5150             ret, 1);                                                        \
5151         ret = vsetq_lane_s32(                                               \
5152             vgetq_lane_s32(vreinterpretq_s32_s64(a), ((imm8) >> 4) & 0x3), \
5153             ret, 2);                                                        \
5154         ret = vsetq_lane_s32(                                               \
5155             vgetq_lane_s32(vreinterpretq_s32_s64(a), ((imm8) >> 6) & 0x3), \
5156             ret, 3);                                                        \
5157         vreinterpretq_s64_s32(ret);                                       \
5158     })
5159 #elif defined(SIMDE_SHUFFLE_VECTOR_)
5160   #define simde_mm_shuffle_epi32(a, imm8) (__extension__ ({ \
5161       const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
5162       simde__m128i_from_private((simde__m128i_private) { .i32 = \
5163         SIMDE_SHUFFLE_VECTOR_(32, 16, \
5164           (simde__tmp_a_).i32, \
5165           (simde__tmp_a_).i32, \
5166           ((imm8)     ) & 3, \
5167           ((imm8) >> 2) & 3, \
5168           ((imm8) >> 4) & 3, \
5169           ((imm8) >> 6) & 3) }); }))
5170 #endif
5171 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5172   #define _mm_shuffle_epi32(a, imm8) simde_mm_shuffle_epi32(a, imm8)
5173 #endif
5174 
5175 SIMDE_FUNCTION_ATTRIBUTES
5176 simde__m128d
5177 simde_mm_shuffle_pd (simde__m128d a, simde__m128d b, const int imm8)
5178     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3)  {
5179   simde__m128d_private
5180     r_,
5181     a_ = simde__m128d_to_private(a),
5182     b_ = simde__m128d_to_private(b);
5183 
5184   r_.f64[0] = ((imm8 & 1) == 0) ? a_.f64[0] : a_.f64[1];
5185   r_.f64[1] = ((imm8 & 2) == 0) ? b_.f64[0] : b_.f64[1];
5186 
5187   return simde__m128d_from_private(r_);
5188 }
5189 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
5190   #define simde_mm_shuffle_pd(a, b, imm8) _mm_shuffle_pd((a), (b), (imm8))
5191 #elif defined(SIMDE_SHUFFLE_VECTOR_)
5192   #define simde_mm_shuffle_pd(a, b, imm8) (__extension__ ({ \
5193       simde__m128d_from_private((simde__m128d_private) { .f64 = \
5194         SIMDE_SHUFFLE_VECTOR_(64, 16, \
5195           simde__m128d_to_private(a).f64, \
5196           simde__m128d_to_private(b).f64, \
5197           (((imm8)     ) & 1), \
5198           (((imm8) >> 1) & 1) + 2) }); }))
5199 #endif
5200 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5201   #define _mm_shuffle_pd(a, b, imm8) simde_mm_shuffle_pd(a, b, imm8)
5202 #endif
5203 
5204 SIMDE_FUNCTION_ATTRIBUTES
5205 simde__m128i
5206 simde_mm_shufflehi_epi16 (simde__m128i a, const int imm8)
5207     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
5208   simde__m128i_private
5209     r_,
5210     a_ = simde__m128i_to_private(a);
5211 
5212   SIMDE_VECTORIZE
5213   for (size_t i = 0 ; i < ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i++) {
5214     r_.i16[i] = a_.i16[i];
5215   }
5216   for (size_t i = ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
5217     r_.i16[i] = a_.i16[((imm8 >> ((i - 4) * 2)) & 3) + 4];
5218   }
5219 
5220   return simde__m128i_from_private(r_);
5221 }
5222 #if defined(SIMDE_X86_SSE2_NATIVE)
5223   #define simde_mm_shufflehi_epi16(a, imm8) _mm_shufflehi_epi16((a), (imm8))
5224 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5225   #define simde_mm_shufflehi_epi16(a, imm8) \
5226     __extension__({                                                            \
5227         int16x8_t ret = vreinterpretq_s16_s64(a);                            \
5228         int16x4_t highBits = vget_high_s16(ret);                               \
5229         ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm8) & (0x3)), ret, 4);  \
5230         ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm8) >> 2) & 0x3), ret, \
5231                              5);                                               \
5232         ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm8) >> 4) & 0x3), ret, \
5233                              6);                                               \
5234         ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm8) >> 6) & 0x3), ret, \
5235                              7);                                               \
5236         vreinterpretq_s64_s16(ret);                                          \
5237     })
5238 #elif defined(SIMDE_SHUFFLE_VECTOR_)
5239   #define simde_mm_shufflehi_epi16(a, imm8) (__extension__ ({ \
5240       const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
5241       simde__m128i_from_private((simde__m128i_private) { .i16 = \
5242         SIMDE_SHUFFLE_VECTOR_(16, 16, \
5243           (simde__tmp_a_).i16, \
5244           (simde__tmp_a_).i16, \
5245           0, 1, 2, 3, \
5246           (((imm8)     ) & 3) + 4, \
5247           (((imm8) >> 2) & 3) + 4, \
5248           (((imm8) >> 4) & 3) + 4, \
5249           (((imm8) >> 6) & 3) + 4) }); }))
5250 #endif
5251 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5252   #define _mm_shufflehi_epi16(a, imm8) simde_mm_shufflehi_epi16(a, imm8)
5253 #endif
5254 
5255 SIMDE_FUNCTION_ATTRIBUTES
5256 simde__m128i
5257 simde_mm_shufflelo_epi16 (simde__m128i a, const int imm8)
5258     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
5259   simde__m128i_private
5260     r_,
5261     a_ = simde__m128i_to_private(a);
5262 
5263   for (size_t i = 0 ; i < ((sizeof(r_.i16) / sizeof(r_.i16[0])) / 2) ; i++) {
5264     r_.i16[i] = a_.i16[((imm8 >> (i * 2)) & 3)];
5265   }
5266   SIMDE_VECTORIZE
5267   for (size_t i = ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
5268     r_.i16[i] = a_.i16[i];
5269   }
5270 
5271   return simde__m128i_from_private(r_);
5272 }
5273 #if defined(SIMDE_X86_SSE2_NATIVE)
5274   #define simde_mm_shufflelo_epi16(a, imm8) _mm_shufflelo_epi16((a), (imm8))
5275 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5276   #define simde_mm_shufflelo_epi16(a, imm8)                                  \
5277     __extension__({                                                           \
5278         int16x8_t ret = vreinterpretq_s16_s64(a);                           \
5279         int16x4_t lowBits = vget_low_s16(ret);                                \
5280         ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm8) & (0x3)), ret, 0);  \
5281         ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm8) >> 2) & 0x3), ret, \
5282                              1);                                              \
5283         ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm8) >> 4) & 0x3), ret, \
5284                              2);                                              \
5285         ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm8) >> 6) & 0x3), ret, \
5286                              3);                                              \
5287         vreinterpretq_s64_s16(ret);                                         \
5288     })
5289 #elif defined(SIMDE_SHUFFLE_VECTOR_)
5290   #define simde_mm_shufflelo_epi16(a, imm8) (__extension__ ({ \
5291       const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
5292       simde__m128i_from_private((simde__m128i_private) { .i16 = \
5293         SIMDE_SHUFFLE_VECTOR_(16, 16, \
5294           (simde__tmp_a_).i16, \
5295           (simde__tmp_a_).i16, \
5296           (((imm8)     ) & 3), \
5297           (((imm8) >> 2) & 3), \
5298           (((imm8) >> 4) & 3), \
5299           (((imm8) >> 6) & 3), \
5300           4, 5, 6, 7) }); }))
5301 #endif
5302 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5303   #define _mm_shufflelo_epi16(a, imm8) simde_mm_shufflelo_epi16(a, imm8)
5304 #endif
5305 
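/* _mm_sll_epi16/_epi32/_epi64: logical left shift, with the shift count taken
 * from the low 64 bits of `count`.  Counts of the element width or more
 * produce an all-zero result. */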
5306 SIMDE_FUNCTION_ATTRIBUTES
5307 simde__m128i
5308 simde_mm_sll_epi16 (simde__m128i a, simde__m128i count) {
5309   #if defined(SIMDE_X86_SSE2_NATIVE)
5310     return _mm_sll_epi16(a, count);
5311   #else
5312     simde__m128i_private
5313       r_,
5314       a_ = simde__m128i_to_private(a),
5315       count_ = simde__m128i_to_private(count);
5316 
5317     if (count_.u64[0] > 15)
5318       return simde_mm_setzero_si128();
5319 
5320     #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
5321       r_.u16 = (a_.u16 << count_.u64[0]);
5322     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5323       r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, count_.u64[0])));
5324     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5325       r_.wasm_v128 = ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 16) ? wasm_i16x8_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_extract_lane(count_.wasm_v128, 0))) : wasm_i16x8_const(0,0,0,0,0,0,0,0));
5326     #else
5327       SIMDE_VECTORIZE
5328       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
5329         r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (a_.u16[i] << count_.u64[0]));
5330       }
5331     #endif
5332 
5333     return simde__m128i_from_private(r_);
5334   #endif
5335 }
5336 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5337   #define _mm_sll_epi16(a, count) simde_mm_sll_epi16((a), (count))
5338 #endif
5339 
5340 SIMDE_FUNCTION_ATTRIBUTES
5341 simde__m128i
5342 simde_mm_sll_epi32 (simde__m128i a, simde__m128i count) {
5343   #if defined(SIMDE_X86_SSE2_NATIVE)
5344     return _mm_sll_epi32(a, count);
5345   #else
5346     simde__m128i_private
5347       r_,
5348       a_ = simde__m128i_to_private(a),
5349       count_ = simde__m128i_to_private(count);
5350 
5351     if (count_.u64[0] > 31)
5352       return simde_mm_setzero_si128();
5353 
5354     #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
5355       r_.u32 = (a_.u32 << count_.u64[0]);
5356     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5357       r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, count_.u64[0])));
5358     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5359       r_.wasm_v128 = ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 32) ? wasm_i32x4_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_extract_lane(count_.wasm_v128, 0))) : wasm_i32x4_const(0,0,0,0));
5360     #else
5361       SIMDE_VECTORIZE
5362       for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
5363         r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (a_.u32[i] << count_.u64[0]));
5364       }
5365     #endif
5366 
5367     return simde__m128i_from_private(r_);
5368   #endif
5369 }
5370 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5371   #define _mm_sll_epi32(a, count) (simde_mm_sll_epi32(a, (count)))
5372 #endif
5373 
5374 SIMDE_FUNCTION_ATTRIBUTES
5375 simde__m128i
5376 simde_mm_sll_epi64 (simde__m128i a, simde__m128i count) {
5377   #if defined(SIMDE_X86_SSE2_NATIVE)
5378     return _mm_sll_epi64(a, count);
5379   #else
5380     simde__m128i_private
5381       r_,
5382       a_ = simde__m128i_to_private(a),
5383       count_ = simde__m128i_to_private(count);
5384 
5385     if (count_.u64[0] > 63)
5386       return simde_mm_setzero_si128();
5387 
5388     const int_fast16_t s = HEDLEY_STATIC_CAST(int_fast16_t, count_.u64[0]);
5389     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5390       r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, s)));
5391     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5392       r_.wasm_v128 = (s < 64) ? wasm_i64x2_shl(a_.wasm_v128, s) : wasm_i64x2_const(0,0);
5393     #else
5394       #if !defined(SIMDE_BUG_GCC_94488)
5395         SIMDE_VECTORIZE
5396       #endif
5397       for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
5398         r_.u64[i] = a_.u64[i] << s;
5399       }
5400     #endif
5401 
5402     return simde__m128i_from_private(r_);
5403   #endif
5404 }
5405 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5406   #define _mm_sll_epi64(a, count) (simde_mm_sll_epi64(a, (count)))
5407 #endif
5408 
5409 SIMDE_FUNCTION_ATTRIBUTES
5410 simde__m128d
5411 simde_mm_sqrt_pd (simde__m128d a) {
5412   #if defined(SIMDE_X86_SSE2_NATIVE)
5413     return _mm_sqrt_pd(a);
5414   #else
5415     simde__m128d_private
5416       r_,
5417       a_ = simde__m128d_to_private(a);
5418 
5419     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5420       r_.neon_f64 = vsqrtq_f64(a_.neon_f64);
5421     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5422       r_.wasm_v128 = wasm_f64x2_sqrt(a_.wasm_v128);
5423     #elif defined(simde_math_sqrt)
5424       SIMDE_VECTORIZE
5425       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
5426         r_.f64[i] = simde_math_sqrt(a_.f64[i]);
5427       }
5428     #else
5429       HEDLEY_UNREACHABLE();
5430     #endif
5431 
5432     return simde__m128d_from_private(r_);
5433   #endif
5434 }
5435 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5436   #define _mm_sqrt_pd(a) simde_mm_sqrt_pd(a)
5437 #endif
5438 
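/* _mm_sqrt_sd: square root of the lower element of b in the low lane; the
 * upper lane is copied from a. */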
5439 SIMDE_FUNCTION_ATTRIBUTES
5440 simde__m128d
5441 simde_mm_sqrt_sd (simde__m128d a, simde__m128d b) {
5442   #if defined(SIMDE_X86_SSE2_NATIVE)
5443     return _mm_sqrt_sd(a, b);
5444   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
5445     return simde_mm_move_sd(a, simde_mm_sqrt_pd(b));
5446   #else
5447     simde__m128d_private
5448       r_,
5449       a_ = simde__m128d_to_private(a),
5450       b_ = simde__m128d_to_private(b);
5451 
5452     #if defined(simde_math_sqrt)
5453       r_.f64[0] = simde_math_sqrt(b_.f64[0]);
5454       r_.f64[1] = a_.f64[1];
5455     #else
5456       HEDLEY_UNREACHABLE();
5457     #endif
5458 
5459     return simde__m128d_from_private(r_);
5460   #endif
5461 }
5462 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5463   #define _mm_sqrt_sd(a, b) simde_mm_sqrt_sd(a, b)
5464 #endif
5465 
5466 SIMDE_FUNCTION_ATTRIBUTES
5467 simde__m128i
5468 simde_mm_srl_epi16 (simde__m128i a, simde__m128i count) {
5469   #if defined(SIMDE_X86_SSE2_NATIVE)
5470     return _mm_srl_epi16(a, count);
5471   #else
5472     simde__m128i_private
5473       r_,
5474       a_ = simde__m128i_to_private(a),
5475       count_ = simde__m128i_to_private(count);
5476 
5477     const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 16 ? 16 : count_.i64[0]));
5478 
5479     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5480       r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
5481     #else
5482       SIMDE_VECTORIZE
5483       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
5484         r_.u16[i] = a_.u16[i] >> cnt;
5485       }
5486     #endif
5487 
5488     return simde__m128i_from_private(r_);
5489   #endif
5490 }
5491 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5492   #define _mm_srl_epi16(a, count) (simde_mm_srl_epi16(a, (count)))
5493 #endif
5494 
5495 SIMDE_FUNCTION_ATTRIBUTES
5496 simde__m128i
5497 simde_mm_srl_epi32 (simde__m128i a, simde__m128i count) {
5498   #if defined(SIMDE_X86_SSE2_NATIVE)
5499     return _mm_srl_epi32(a, count);
5500   #else
5501     simde__m128i_private
5502       r_,
5503       a_ = simde__m128i_to_private(a),
5504       count_ = simde__m128i_to_private(count);
5505 
5506     const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 32 ? 32 : count_.i64[0]));
5507 
5508     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5509       r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt)));
5510     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5511       r_.wasm_v128 = wasm_u32x4_shr(a_.wasm_v128, cnt);
5512     #else
5513       SIMDE_VECTORIZE
5514       for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
5515         r_.u32[i] = a_.u32[i] >> cnt;
5516       }
5517     #endif
5518 
5519     return simde__m128i_from_private(r_);
5520   #endif
5521 }
5522 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5523   #define _mm_srl_epi32(a, count) (simde_mm_srl_epi32(a, (count)))
5524 #endif
5525 
5526 SIMDE_FUNCTION_ATTRIBUTES
5527 simde__m128i
5528 simde_mm_srl_epi64 (simde__m128i a, simde__m128i count) {
5529   #if defined(SIMDE_X86_SSE2_NATIVE)
5530     return _mm_srl_epi64(a, count);
5531   #else
5532     simde__m128i_private
5533       r_,
5534       a_ = simde__m128i_to_private(a),
5535       count_ = simde__m128i_to_private(count);
5536 
5537     const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 64 ? 64 : count_.i64[0]));
5538 
5539     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5540       r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, -cnt)));
5541     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5542       r_.wasm_v128 = wasm_u64x2_shr(a_.wasm_v128, cnt);
5543     #else
5544       #if !defined(SIMDE_BUG_GCC_94488)
5545         SIMDE_VECTORIZE
5546       #endif
5547       for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
5548         r_.u64[i] = a_.u64[i] >> cnt;
5549       }
5550     #endif
5551 
5552     return simde__m128i_from_private(r_);
5553   #endif
5554 }
5555 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5556   #define _mm_srl_epi64(a, count) (simde_mm_srl_epi64(a, (count)))
5557 #endif
5558 
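/* _mm_srai_epi16/_epi32: arithmetic (sign-filling) right shift by an
 * immediate.  Counts at or above the lane width are clamped, so each lane
 * collapses to its sign. */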
5559 SIMDE_FUNCTION_ATTRIBUTES
5560 simde__m128i
5561 simde_mm_srai_epi16 (simde__m128i a, const int imm8)
5562     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
5563   /* MSVC requires a range of (0, 255). */
5564   simde__m128i_private
5565     r_,
5566     a_ = simde__m128i_to_private(a);
5567 
5568   const int cnt = (imm8 & ~15) ? 15 : imm8;
5569 
5570   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5571     r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
5572   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5573     r_.wasm_v128 = wasm_i16x8_shr(a_.wasm_v128, cnt);
5574   #else
5575     SIMDE_VECTORIZE
5576     for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) {
5577       r_.i16[i] = a_.i16[i] >> cnt;
5578     }
5579   #endif
5580 
5581   return simde__m128i_from_private(r_);
5582 }
5583 #if defined(SIMDE_X86_SSE2_NATIVE)
5584   #define simde_mm_srai_epi16(a, imm8) _mm_srai_epi16((a), (imm8))
5585 #endif
5586 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5587   #define _mm_srai_epi16(a, imm8) simde_mm_srai_epi16(a, imm8)
5588 #endif
5589 
5590 SIMDE_FUNCTION_ATTRIBUTES
5591 simde__m128i
5592 simde_mm_srai_epi32 (simde__m128i a, const int imm8)
5593     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
5594   /* MSVC requires a range of (0, 255). */
5595   simde__m128i_private
5596     r_,
5597     a_ = simde__m128i_to_private(a);
5598 
5599   const int cnt = (imm8 & ~31) ? 31 : imm8;
5600 
5601   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5602     r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(-cnt));
5603   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5604     r_.wasm_v128 = wasm_i32x4_shr(a_.wasm_v128, cnt);
5605   #else
5606     SIMDE_VECTORIZE
5607     for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i32[0])) ; i++) {
5608       r_.i32[i] = a_.i32[i] >> cnt;
5609     }
5610   #endif
5611 
5612   return simde__m128i_from_private(r_);
5613 }
5614 #if defined(SIMDE_X86_SSE2_NATIVE)
5615   #define simde_mm_srai_epi32(a, imm8) _mm_srai_epi32((a), (imm8))
5616 #endif
5617 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5618   #define _mm_srai_epi32(a, imm8) simde_mm_srai_epi32(a, imm8)
5619 #endif
5620 
5621 SIMDE_FUNCTION_ATTRIBUTES
5622 simde__m128i
5623 simde_mm_sra_epi16 (simde__m128i a, simde__m128i count) {
5624   #if defined(SIMDE_X86_SSE2_NATIVE)
5625     return _mm_sra_epi16(a, count);
5626   #else
5627     simde__m128i_private
5628       r_,
5629       a_ = simde__m128i_to_private(a),
5630       count_ = simde__m128i_to_private(count);
5631 
5632     const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 15 ? 15 : count_.i64[0]));
5633 
5634     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5635       r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
5636     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5637       r_.wasm_v128 = wasm_i16x8_shr(a_.wasm_v128, cnt);
5638     #else
5639       SIMDE_VECTORIZE
5640       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
5641         r_.i16[i] = a_.i16[i] >> cnt;
5642       }
5643     #endif
5644 
5645     return simde__m128i_from_private(r_);
5646   #endif
5647 }
5648 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5649   #define _mm_sra_epi16(a, count) (simde_mm_sra_epi16(a, count))
5650 #endif
5651 
5652 SIMDE_FUNCTION_ATTRIBUTES
5653 simde__m128i
5654 simde_mm_sra_epi32 (simde__m128i a, simde__m128i count) {
5655   #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_SRA_EPI32)
5656     return _mm_sra_epi32(a, count);
5657   #else
5658     simde__m128i_private
5659       r_,
5660       a_ = simde__m128i_to_private(a),
5661       count_ = simde__m128i_to_private(count);
5662 
5663     const int cnt = count_.u64[0] > 31 ? 31 : HEDLEY_STATIC_CAST(int, count_.u64[0]);
5664 
5665     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5666       r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt)));
5667     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5668       r_.wasm_v128 = wasm_i32x4_shr(a_.wasm_v128, cnt);
5669     #else
5670       SIMDE_VECTORIZE
5671       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
5672         r_.i32[i] = a_.i32[i] >> cnt;
5673       }
5674     #endif
5675 
5676     return simde__m128i_from_private(r_);
5677   #endif
5678 }
5679 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5680   #define _mm_sra_epi32(a, count) (simde_mm_sra_epi32(a, (count)))
5681 #endif
5682 
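/* _mm_slli_epi16/_epi32/_epi64: logical left shift by an immediate count
 * (compile-time constant, unlike the _sll_ forms); shifting by the lane
 * width or more yields zero. */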
5683 SIMDE_FUNCTION_ATTRIBUTES
5684 simde__m128i
5685 simde_mm_slli_epi16 (simde__m128i a, const int imm8)
5686     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
5687   if (HEDLEY_UNLIKELY((imm8 > 15))) {
5688     return simde_mm_setzero_si128();
5689   }
5690 
5691   simde__m128i_private
5692     r_,
5693     a_ = simde__m128i_to_private(a);
5694 
5695   #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
5696     r_.i16 = a_.i16 << (imm8 & 0xff);
5697   #else
5698     const int s = (imm8 > HEDLEY_STATIC_CAST(int, sizeof(r_.i16[0]) * CHAR_BIT) - 1) ? 0 : imm8;
5699     SIMDE_VECTORIZE
5700     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
5701       r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << s);
5702     }
5703   #endif
5704 
5705   return simde__m128i_from_private(r_);
5706 }
5707 #if defined(SIMDE_X86_SSE2_NATIVE)
5708   #define simde_mm_slli_epi16(a, imm8) _mm_slli_epi16(a, imm8)
5709 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5710   #define simde_mm_slli_epi16(a, imm8) \
5711      (__extension__ ({ \
5712         simde__m128i ret; \
5713         if ((imm8) <= 0) { \
5714             ret = a; \
5715         } else if ((imm8) > 15) { \
5716             ret = simde_mm_setzero_si128(); \
5717         } else { \
5718             ret = simde__m128i_from_neon_i16( \
5719                 vshlq_n_s16(simde__m128i_to_neon_i16(a), ((imm8) & 15))); \
5720         } \
5721         ret; \
5722     }))
5723 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5724   #define simde_mm_slli_epi16(a, imm8) \
5725     ((imm8 < 16) ? wasm_i16x8_shl(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i16x8_const(0,0,0,0,0,0,0,0))
5726 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
5727   #define simde_mm_slli_epi16(a, imm8) \
5728     ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sl(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8)))))
5729 #endif
5730 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5731   #define _mm_slli_epi16(a, imm8) simde_mm_slli_epi16(a, imm8)
5732 #endif
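
/* Minimal usage sketch (hypothetical values): imm8 must be a compile-time
 * constant on the native paths, which is why the overrides above are macros.
 *
 *   simde__m128i v = simde_mm_set1_epi16(0x0101);
 *   simde__m128i r = simde_mm_slli_epi16(v, 4);   // each lane is now 0x1010
 *   simde__m128i z = simde_mm_slli_epi16(v, 16);  // counts above 15 give zero
 */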
5733 
5734 SIMDE_FUNCTION_ATTRIBUTES
5735 simde__m128i
5736 simde_mm_slli_epi32 (simde__m128i a, const int imm8)
5737     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
5738   if (HEDLEY_UNLIKELY((imm8 > 31))) {
5739     return simde_mm_setzero_si128();
5740   }
5741   simde__m128i_private
5742     r_,
5743     a_ = simde__m128i_to_private(a);
5744 
5745   #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
5746     r_.i32 = a_.i32 << imm8;
5747   #else
5748     SIMDE_VECTORIZE
5749     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
5750       r_.i32[i] = a_.i32[i] << (imm8 & 0xff);
5751     }
5752   #endif
5753 
5754   return simde__m128i_from_private(r_);
5755 }
5756 #if defined(SIMDE_X86_SSE2_NATIVE)
5757   #define simde_mm_slli_epi32(a, imm8) _mm_slli_epi32(a, imm8)
5758 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5759   #define simde_mm_slli_epi32(a, imm8) \
5760      (__extension__ ({ \
5761        simde__m128i ret; \
5762        if ((imm8) <= 0) { \
5763          ret = a; \
5764        } else if ((imm8) > 31) { \
5765          ret = simde_mm_setzero_si128(); \
5766        } else { \
5767          ret = simde__m128i_from_neon_i32( \
5768            vshlq_n_s32(simde__m128i_to_neon_i32(a), ((imm8) & 31))); \
5769        } \
5770        ret; \
5771     }))
5772 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5773   #define simde_mm_slli_epi32(a, imm8) \
5774     ((imm8 < 32) ? wasm_i32x4_shl(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i32x4_const(0,0,0,0))
5775 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
5776   #define simde_mm_slli_epi32(a, imm8) \
5777      (__extension__ ({ \
5778        simde__m128i ret; \
5779        if ((imm8) <= 0) { \
5780          ret = a; \
5781        } else if ((imm8) > 31) { \
5782          ret = simde_mm_setzero_si128(); \
5783        } else { \
5784          ret = simde__m128i_from_altivec_i32( \
5785            vec_sl(simde__m128i_to_altivec_i32(a), \
5786              vec_splats(HEDLEY_STATIC_CAST(unsigned int, (imm8) & 31)))); \
5787        } \
5788        ret; \
5789      }))
5790 #endif
5791 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5792   #define _mm_slli_epi32(a, imm8) simde_mm_slli_epi32(a, imm8)
5793 #endif
5794 
5795 SIMDE_FUNCTION_ATTRIBUTES
5796 simde__m128i
5797 simde_mm_slli_epi64 (simde__m128i a, const int imm8)
5798     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
5799   if (HEDLEY_UNLIKELY((imm8 > 63))) {
5800     return simde_mm_setzero_si128();
5801   }
5802   simde__m128i_private
5803     r_,
5804     a_ = simde__m128i_to_private(a);
5805 
5806   #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
5807     r_.i64 = a_.i64 << imm8;
5808   #else
5809     SIMDE_VECTORIZE
5810     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
5811       r_.i64[i] = a_.i64[i] << (imm8 & 0xff);
5812     }
5813   #endif
5814 
5815   return simde__m128i_from_private(r_);
5816 }
5817 #if defined(SIMDE_X86_SSE2_NATIVE)
5818   #define simde_mm_slli_epi64(a, imm8) _mm_slli_epi64(a, imm8)
5819 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5820   #define simde_mm_slli_epi64(a, imm8) \
5821      (__extension__ ({ \
5822         simde__m128i ret; \
5823         if ((imm8) <= 0) { \
5824             ret = a; \
5825         } else if ((imm8) > 63) { \
5826             ret = simde_mm_setzero_si128(); \
5827         } else { \
5828             ret = simde__m128i_from_neon_i64( \
5829                 vshlq_n_s64(simde__m128i_to_neon_i64(a), ((imm8) & 63))); \
5830         } \
5831         ret; \
5832     }))
5833 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5834   #define simde_mm_slli_epi64(a, imm8) \
5835     ((imm8 < 64) ? wasm_i64x2_shl(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i64x2_const(0,0))
5836 #endif
5837 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5838   #define _mm_slli_epi64(a, imm8) simde_mm_slli_epi64(a, imm8)
5839 #endif
5840 
5841 SIMDE_FUNCTION_ATTRIBUTES
5842 simde__m128i
5843 simde_mm_srli_epi16 (simde__m128i a, const int imm8)
5844     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
5845   if (HEDLEY_UNLIKELY((imm8 > 15))) {
5846     return simde_mm_setzero_si128();
5847   }
5848   simde__m128i_private
5849     r_,
5850     a_ = simde__m128i_to_private(a);
5851 
5852   #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
5853     r_.u16 = a_.u16 >> imm8;
5854   #else
5855     SIMDE_VECTORIZE
5856     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
5857       r_.u16[i] = a_.u16[i] >> (imm8 & 0xff);
5858     }
5859   #endif
5860 
5861   return simde__m128i_from_private(r_);
5862 }
5863 #if defined(SIMDE_X86_SSE2_NATIVE)
5864   #define simde_mm_srli_epi16(a, imm8) _mm_srli_epi16(a, imm8)
5865 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
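  /* vshrq_n_u16 only accepts an immediate in [1, 16], so the expression below
   * bumps a count of 0 up to 1; that case never actually executes because the
   * (imm8) <= 0 branch returns a unchanged.  The same trick appears in the
   * 32- and 64-bit variants further below. */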
5866   #define simde_mm_srli_epi16(a, imm8) \
5867      (__extension__ ({ \
5868         simde__m128i ret; \
5869         if ((imm8) <= 0) { \
5870             ret = a; \
5871         } else if ((imm8) > 15) { \
5872             ret = simde_mm_setzero_si128(); \
5873         } else { \
5874             ret = simde__m128i_from_neon_u16( \
5875                 vshrq_n_u16(simde__m128i_to_neon_u16(a), (((imm8) & 15) | (((imm8) & 15) == 0)))); \
5876         } \
5877         ret; \
5878     }))
5879 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5880   #define simde_mm_srli_epi16(a, imm8) \
5881     ((imm8 < 16) ? wasm_u16x8_shr(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i16x8_const(0,0,0,0,0,0,0,0))
5882 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
5883   #define simde_mm_srli_epi16(a, imm8) \
5884     ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sr(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8)))))
5885 #endif
5886 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5887   #define _mm_srli_epi16(a, imm8) simde_mm_srli_epi16(a, imm8)
5888 #endif
5889 
5890 SIMDE_FUNCTION_ATTRIBUTES
5891 simde__m128i
5892 simde_mm_srli_epi32 (simde__m128i a, const int imm8)
5893     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
5894   if (HEDLEY_UNLIKELY((imm8 > 31))) {
5895     return simde_mm_setzero_si128();
5896   }
5897   simde__m128i_private
5898     r_,
5899     a_ = simde__m128i_to_private(a);
5900 
5901   #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
5902     r_.u32 = a_.u32 >> (imm8 & 0xff);
5903   #else
5904     SIMDE_VECTORIZE
5905     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
5906       r_.u32[i] = a_.u32[i] >> (imm8 & 0xff);
5907     }
5908   #endif
5909 
5910   return simde__m128i_from_private(r_);
5911 }
5912 #if defined(SIMDE_X86_SSE2_NATIVE)
5913   #define simde_mm_srli_epi32(a, imm8) _mm_srli_epi32(a, imm8)
5914 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5915   #define simde_mm_srli_epi32(a, imm8) \
5916     (__extension__ ({ \
5917         simde__m128i ret; \
5918         if ((imm8) <= 0) { \
5919             ret = a; \
5920         } else if ((imm8) > 31) { \
5921             ret = simde_mm_setzero_si128(); \
5922         } else { \
5923             ret = simde__m128i_from_neon_u32( \
5924               vshrq_n_u32(simde__m128i_to_neon_u32(a), (((imm8) & 31) | (((imm8) & 31) == 0)))); \
5925         } \
5926         ret; \
5927     }))
5928 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5929   #define simde_mm_srli_epi32(a, imm8) \
5930     ((imm8 < 32) ? wasm_u32x4_shr(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i32x4_const(0,0,0,0))
5931 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
5932   #define simde_mm_srli_epi32(a, imm8) \
5933     (__extension__ ({ \
5934         simde__m128i ret; \
5935         if ((imm8) <= 0) { \
5936             ret = a; \
5937         } else if ((imm8) > 31) { \
5938             ret = simde_mm_setzero_si128(); \
5939         } else { \
5940             ret = simde__m128i_from_altivec_i32( \
5941               vec_sr(simde__m128i_to_altivec_i32(a), \
5942                 vec_splats(HEDLEY_STATIC_CAST(unsigned int, (imm8) & 31)))); \
5943         } \
5944         ret; \
5945     }))
5946 #endif
5947 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5948   #define _mm_srli_epi32(a, imm8) simde_mm_srli_epi32(a, imm8)
5949 #endif
5950 
5951 SIMDE_FUNCTION_ATTRIBUTES
5952 simde__m128i
5953 simde_mm_srli_epi64 (simde__m128i a, const int imm8)
5954     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
5955   simde__m128i_private
5956     r_,
5957     a_ = simde__m128i_to_private(a);
5958 
5959   if (HEDLEY_UNLIKELY((imm8 & 63) != imm8))
5960     return simde_mm_setzero_si128();
5961 
5962   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5963     r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(-imm8));
5964   #else
5965     #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_94488)
5966       r_.u64 = a_.u64 >> imm8;
5967     #else
5968       SIMDE_VECTORIZE
5969       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
5970         r_.u64[i] = a_.u64[i] >> imm8;
5971       }
5972     #endif
5973   #endif
5974 
5975   return simde__m128i_from_private(r_);
5976 }
5977 #if defined(SIMDE_X86_SSE2_NATIVE)
5978   #define simde_mm_srli_epi64(a, imm8) _mm_srli_epi64(a, imm8)
5979 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5980   #define simde_mm_srli_epi64(a, imm8) \
5981     (__extension__ ({ \
5982         simde__m128i ret; \
5983         if ((imm8) <= 0) { \
5984             ret = a; \
5985         } else if ((imm8) > 63) { \
5986             ret = simde_mm_setzero_si128(); \
5987         } else { \
5988             ret = simde__m128i_from_neon_u64( \
5989               vshrq_n_u64(simde__m128i_to_neon_u64(a), (((imm8) & 63) | (((imm8) & 63) == 0)))); \
5990         } \
5991         ret; \
5992     }))
5993 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5994   #define simde_mm_srli_epi64(a, imm8) \
5995     ((imm8 < 64) ? wasm_u64x2_shr(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i64x2_const(0,0))
5996 #endif
5997 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5998   #define _mm_srli_epi64(a, imm8) simde_mm_srli_epi64(a, imm8)
5999 #endif
6000 
6001 SIMDE_FUNCTION_ATTRIBUTES
6002 void
6003 simde_mm_store_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) {
6004   #if defined(SIMDE_X86_SSE2_NATIVE)
6005     _mm_store_pd(mem_addr, a);
6006   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6007     vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64);
6008   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6009     vst1q_s64(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), simde__m128d_to_private(a).neon_i64);
6010   #else
6011     simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128d), &a, sizeof(a));
6012   #endif
6013 }
6014 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6015   #define _mm_store_pd(mem_addr, a) simde_mm_store_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
6016 #endif
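
/* Minimal sketch, assuming a hypothetical local buffer: simde_mm_store_pd
 * requires a 16-byte-aligned destination, unlike simde_mm_storeu_pd further
 * below.
 *
 *   SIMDE_ALIGN_TO_16 simde_float64 buf[2];
 *   simde_mm_store_pd(buf, simde_mm_set_pd(SIMDE_FLOAT64_C(2.0),
 *                                          SIMDE_FLOAT64_C(1.0)));
 *   // buf[0] == 1.0 and buf[1] == 2.0: the low element is stored first.
 */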
6017 
6018 SIMDE_FUNCTION_ATTRIBUTES
6019 void
6020 simde_mm_store1_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) {
6021   #if defined(SIMDE_X86_SSE2_NATIVE)
6022     _mm_store1_pd(mem_addr, a);
6023   #else
6024     simde__m128d_private a_ = simde__m128d_to_private(a);
6025 
6026     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6027       vst1q_f64(mem_addr, vdupq_laneq_f64(a_.neon_f64, 0));
6028     #else
6029       mem_addr[0] = a_.f64[0];
6030       mem_addr[1] = a_.f64[0];
6031     #endif
6032   #endif
6033 }
6034 #define simde_mm_store_pd1(mem_addr, a) simde_mm_store1_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
6035 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6036   #define _mm_store1_pd(mem_addr, a) simde_mm_store1_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
6037   #define _mm_store_pd1(mem_addr, a) simde_mm_store_pd1(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
6038 #endif
6039 
6040 SIMDE_FUNCTION_ATTRIBUTES
6041 void
6042 simde_mm_store_sd (simde_float64* mem_addr, simde__m128d a) {
6043   #if defined(SIMDE_X86_SSE2_NATIVE)
6044     _mm_store_sd(mem_addr, a);
6045   #else
6046     simde__m128d_private a_ = simde__m128d_to_private(a);
6047 
6048     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6049       const simde_float64 v = vgetq_lane_f64(a_.neon_f64, 0);
6050       simde_memcpy(mem_addr, &v, sizeof(v));
6051     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6052       const int64_t v = vgetq_lane_s64(a_.neon_i64, 0);
6053       simde_memcpy(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), &v, sizeof(v));
6054     #else
6055       simde_float64 v = a_.f64[0];
6056       simde_memcpy(mem_addr, &v, sizeof(simde_float64));
6057     #endif
6058   #endif
6059 }
6060 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6061   #define _mm_store_sd(mem_addr, a) simde_mm_store_sd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
6062 #endif
6063 
6064 SIMDE_FUNCTION_ATTRIBUTES
6065 void
6066 simde_mm_store_si128 (simde__m128i* mem_addr, simde__m128i a) {
6067   #if defined(SIMDE_X86_SSE2_NATIVE)
6068     _mm_store_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
6069   #else
6070     simde__m128i_private a_ = simde__m128i_to_private(a);
6071 
6072     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6073       vst1q_s32(HEDLEY_REINTERPRET_CAST(int32_t*, mem_addr), a_.neon_i32);
6074     #else
6075       simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128i), &a_, sizeof(a_));
6076     #endif
6077   #endif
6078 }
6079 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6080   #define _mm_store_si128(mem_addr, a) simde_mm_store_si128(mem_addr, a)
6081 #endif
6082 
6083 SIMDE_FUNCTION_ATTRIBUTES
6084 void
6085 simde_mm_storeh_pd (simde_float64* mem_addr, simde__m128d a) {
6086   #if defined(SIMDE_X86_SSE2_NATIVE)
6087     _mm_storeh_pd(mem_addr, a);
6088   #else
6089     simde__m128d_private a_ = simde__m128d_to_private(a);
6090 
6091     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6092       *mem_addr = vgetq_lane_f64(a_.neon_f64, 1);
6093     #else
6094       *mem_addr = a_.f64[1];
6095     #endif
6096   #endif
6097 }
6098 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6099   #define _mm_storeh_pd(mem_addr, a) simde_mm_storeh_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
6100 #endif
6101 
6102 SIMDE_FUNCTION_ATTRIBUTES
6103 void
6104 simde_mm_storel_epi64 (simde__m128i* mem_addr, simde__m128i a) {
6105   #if defined(SIMDE_X86_SSE2_NATIVE)
6106     _mm_storel_epi64(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
6107   #else
6108     simde__m128i_private a_ = simde__m128i_to_private(a);
6109     int64_t tmp;
6110 
6111     /* memcpy to prevent aliasing, tmp because we can't take the
6112      * address of a vector element. */
6113 
6114     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6115       tmp = vgetq_lane_s64(a_.neon_i64, 0);
6116     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
6117       #if defined(SIMDE_BUG_GCC_95227)
6118         (void) a_;
6119       #endif
6120       tmp = vec_extract(a_.altivec_i64, 0);
6121     #else
6122       tmp = a_.i64[0];
6123     #endif
6124 
6125     simde_memcpy(mem_addr, &tmp, sizeof(tmp));
6126   #endif
6127 }
6128 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6129   #define _mm_storel_epi64(mem_addr, a) simde_mm_storel_epi64(mem_addr, a)
6130 #endif
6131 
6132 SIMDE_FUNCTION_ATTRIBUTES
6133 void
6134 simde_mm_storel_pd (simde_float64* mem_addr, simde__m128d a) {
6135   #if defined(SIMDE_X86_SSE2_NATIVE)
6136     _mm_storel_pd(mem_addr, a);
6137   #else
6138     simde__m128d_private a_ = simde__m128d_to_private(a);
6139 
6140     simde_float64 tmp;
6141     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6142       tmp = vgetq_lane_f64(a_.neon_f64, 0);
6143     #else
6144       tmp = a_.f64[0];
6145     #endif
6146     simde_memcpy(mem_addr, &tmp, sizeof(tmp));
6147   #endif
6148 }
6149 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6150   #define _mm_storel_pd(mem_addr, a) simde_mm_storel_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
6151 #endif
6152 
6153 SIMDE_FUNCTION_ATTRIBUTES
6154 void
6155 simde_mm_storer_pd (simde_float64 mem_addr[2], simde__m128d a) {
6156   #if defined(SIMDE_X86_SSE2_NATIVE)
6157     _mm_storer_pd(mem_addr, a);
6158   #else
6159     simde__m128d_private a_ = simde__m128d_to_private(a);
6160 
6161     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6162       vst1q_s64(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), vextq_s64(a_.neon_i64, a_.neon_i64, 1));
6163     #elif defined(SIMDE_SHUFFLE_VECTOR_)
6164       a_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, a_.f64, 1, 0);
6165       simde_mm_store_pd(mem_addr, simde__m128d_from_private(a_));
6166     #else
6167       mem_addr[0] = a_.f64[1];
6168       mem_addr[1] = a_.f64[0];
6169     #endif
6170   #endif
6171 }
6172 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6173   #define _mm_storer_pd(mem_addr, a) simde_mm_storer_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
6174 #endif
6175 
6176 SIMDE_FUNCTION_ATTRIBUTES
6177 void
6178 simde_mm_storeu_pd (simde_float64* mem_addr, simde__m128d a) {
6179   #if defined(SIMDE_X86_SSE2_NATIVE)
6180     _mm_storeu_pd(mem_addr, a);
6181   #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6182     vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64);
6183   #else
6184     simde_memcpy(mem_addr, &a, sizeof(a));
6185   #endif
6186 }
6187 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6188   #define _mm_storeu_pd(mem_addr, a) simde_mm_storeu_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
6189 #endif
6190 
6191 SIMDE_FUNCTION_ATTRIBUTES
6192 void
6193 simde_mm_storeu_si128 (simde__m128i* mem_addr, simde__m128i a) {
6194   #if defined(SIMDE_X86_SSE2_NATIVE)
6195     _mm_storeu_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
6196   #else
6197     simde_memcpy(mem_addr, &a, sizeof(a));
6198   #endif
6199 }
6200 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6201   #define _mm_storeu_si128(mem_addr, a) simde_mm_storeu_si128(mem_addr, a)
6202 #endif
6203 
6204 SIMDE_FUNCTION_ATTRIBUTES
6205 void
6206 simde_mm_storeu_si16 (void* mem_addr, simde__m128i a) {
6207   #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
6208       SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
6209       HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
6210       HEDLEY_INTEL_VERSION_CHECK(20,21,1))
6211     _mm_storeu_si16(mem_addr, a);
6212   #else
6213     int16_t val = simde_x_mm_cvtsi128_si16(a);
6214     simde_memcpy(mem_addr, &val, sizeof(val));
6215   #endif
6216 }
6217 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6218   #define _mm_storeu_si16(mem_addr, a) simde_mm_storeu_si16(mem_addr, a)
6219 #endif
6220 
6221 SIMDE_FUNCTION_ATTRIBUTES
6222 void
6223 simde_mm_storeu_si32 (void* mem_addr, simde__m128i a) {
6224   #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
6225       SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
6226       HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
6227       HEDLEY_INTEL_VERSION_CHECK(20,21,1))
6228     _mm_storeu_si32(mem_addr, a);
6229   #else
6230     int32_t val = simde_mm_cvtsi128_si32(a);
6231     simde_memcpy(mem_addr, &val, sizeof(val));
6232   #endif
6233 }
6234 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6235   #define _mm_storeu_si32(mem_addr, a) simde_mm_storeu_si32(mem_addr, a)
6236 #endif
6237 
6238 SIMDE_FUNCTION_ATTRIBUTES
6239 void
6240 simde_mm_storeu_si64 (void* mem_addr, simde__m128i a) {
6241   #if defined(SIMDE_X86_SSE2_NATIVE) && ( \
6242       SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \
6243       HEDLEY_GCC_VERSION_CHECK(11,0,0) || \
6244       HEDLEY_INTEL_VERSION_CHECK(20,21,1))
6245     _mm_storeu_si64(mem_addr, a);
6246   #else
6247     int64_t val = simde_mm_cvtsi128_si64(a);
6248     simde_memcpy(mem_addr, &val, sizeof(val));
6249   #endif
6250 }
6251 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6252   #define _mm_storeu_si64(mem_addr, a) simde_mm_storeu_si64(mem_addr, a)
6253 #endif
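
/* The three scalar stores above write only the low 16, 32, or 64 bits of the
 * vector.  A sketch with hypothetical values:
 *
 *   uint8_t out[8] = { 0 };
 *   simde_mm_storeu_si32(out, simde_mm_cvtsi32_si128(INT32_C(0x11223344)));
 *   // On a little-endian target out[0..3] are 0x44, 0x33, 0x22, 0x11;
 *   // out[4..7] are left untouched.
 */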
6254 
6255 SIMDE_FUNCTION_ATTRIBUTES
6256 void
6257 simde_mm_stream_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) {
6258   #if defined(SIMDE_X86_SSE2_NATIVE)
6259     _mm_stream_pd(mem_addr, a);
6260   #else
6261     simde_memcpy(mem_addr, &a, sizeof(a));
6262   #endif
6263 }
6264 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6265   #define _mm_stream_pd(mem_addr, a) simde_mm_stream_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
6266 #endif
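
/* Note: outside the native paths the streaming ("non-temporal") stores in
 * this group are ordinary stores; the cache-bypass hint is simply dropped,
 * which is functionally equivalent. */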
6267 
6268 SIMDE_FUNCTION_ATTRIBUTES
6269 void
6270 simde_mm_stream_si128 (simde__m128i* mem_addr, simde__m128i a) {
6271   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
6272     _mm_stream_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
6273   #else
6274     simde_memcpy(mem_addr, &a, sizeof(a));
6275   #endif
6276 }
6277 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6278   #define _mm_stream_si128(mem_addr, a) simde_mm_stream_si128(mem_addr, a)
6279 #endif
6280 
6281 SIMDE_FUNCTION_ATTRIBUTES
6282 void
6283 simde_mm_stream_si32 (int32_t* mem_addr, int32_t a) {
6284   #if defined(SIMDE_X86_SSE2_NATIVE)
6285     _mm_stream_si32(mem_addr, a);
6286   #else
6287     *mem_addr = a;
6288   #endif
6289 }
6290 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6291   #define _mm_stream_si32(mem_addr, a) simde_mm_stream_si32(mem_addr, a)
6292 #endif
6293 
6294 SIMDE_FUNCTION_ATTRIBUTES
6295 void
6296 simde_mm_stream_si64 (int64_t* mem_addr, int64_t a) {
6297   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(HEDLEY_MSVC_VERSION)
6298     _mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(long long int*, int64_t*, mem_addr), a);
6299   #else
6300     *mem_addr = a;
6301   #endif
6302 }
6303 #define simde_mm_stream_si64x(mem_addr, a) simde_mm_stream_si64(mem_addr, a)
6304 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6305   #define _mm_stream_si64(mem_addr, a) simde_mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(int64_t*, __int64*, mem_addr), a)
6306   #define _mm_stream_si64x(mem_addr, a) simde_mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(int64_t*, __int64*, mem_addr), a)
6307 #endif
6308 
6309 SIMDE_FUNCTION_ATTRIBUTES
6310 simde__m128i
6311 simde_mm_sub_epi8 (simde__m128i a, simde__m128i b) {
6312   #if defined(SIMDE_X86_SSE2_NATIVE)
6313     return _mm_sub_epi8(a, b);
6314   #else
6315     simde__m128i_private
6316       r_,
6317       a_ = simde__m128i_to_private(a),
6318       b_ = simde__m128i_to_private(b);
6319 
6320     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6321       r_.neon_i8 = vsubq_s8(a_.neon_i8, b_.neon_i8);
6322     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6323       r_.i8 = a_.i8 - b_.i8;
6324     #else
6325       SIMDE_VECTORIZE
6326       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
6327         r_.i8[i] = a_.i8[i] - b_.i8[i];
6328       }
6329     #endif
6330 
6331     return simde__m128i_from_private(r_);
6332   #endif
6333 }
6334 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6335   #define _mm_sub_epi8(a, b) simde_mm_sub_epi8(a, b)
6336 #endif
6337 
6338 SIMDE_FUNCTION_ATTRIBUTES
6339 simde__m128i
6340 simde_mm_sub_epi16 (simde__m128i a, simde__m128i b) {
6341   #if defined(SIMDE_X86_SSE2_NATIVE)
6342     return _mm_sub_epi16(a, b);
6343   #else
6344     simde__m128i_private
6345       r_,
6346       a_ = simde__m128i_to_private(a),
6347       b_ = simde__m128i_to_private(b);
6348 
6349     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6350       r_.neon_i16 = vsubq_s16(a_.neon_i16, b_.neon_i16);
6351     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6352       r_.i16 = a_.i16 - b_.i16;
6353     #else
6354       SIMDE_VECTORIZE
6355       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
6356         r_.i16[i] = a_.i16[i] - b_.i16[i];
6357       }
6358     #endif
6359 
6360     return simde__m128i_from_private(r_);
6361   #endif
6362 }
6363 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6364   #define _mm_sub_epi16(a, b) simde_mm_sub_epi16(a, b)
6365 #endif
6366 
6367 SIMDE_FUNCTION_ATTRIBUTES
6368 simde__m128i
6369 simde_mm_sub_epi32 (simde__m128i a, simde__m128i b) {
6370   #if defined(SIMDE_X86_SSE2_NATIVE)
6371     return _mm_sub_epi32(a, b);
6372   #else
6373     simde__m128i_private
6374       r_,
6375       a_ = simde__m128i_to_private(a),
6376       b_ = simde__m128i_to_private(b);
6377 
6378     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6379       r_.neon_i32 = vsubq_s32(a_.neon_i32, b_.neon_i32);
6380     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6381       r_.i32 = a_.i32 - b_.i32;
6382     #else
6383       SIMDE_VECTORIZE
6384       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
6385         r_.i32[i] = a_.i32[i] - b_.i32[i];
6386       }
6387     #endif
6388 
6389     return simde__m128i_from_private(r_);
6390   #endif
6391 }
6392 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6393   #define _mm_sub_epi32(a, b) simde_mm_sub_epi32(a, b)
6394 #endif
6395 
6396 SIMDE_FUNCTION_ATTRIBUTES
6397 simde__m128i
6398 simde_mm_sub_epi64 (simde__m128i a, simde__m128i b) {
6399   #if defined(SIMDE_X86_SSE2_NATIVE)
6400     return _mm_sub_epi64(a, b);
6401   #else
6402     simde__m128i_private
6403       r_,
6404       a_ = simde__m128i_to_private(a),
6405       b_ = simde__m128i_to_private(b);
6406 
6407     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6408       r_.neon_i64 = vsubq_s64(a_.neon_i64, b_.neon_i64);
6409     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6410       r_.i64 = a_.i64 - b_.i64;
6411     #else
6412       SIMDE_VECTORIZE
6413       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
6414         r_.i64[i] = a_.i64[i] - b_.i64[i];
6415       }
6416     #endif
6417 
6418     return simde__m128i_from_private(r_);
6419   #endif
6420 }
6421 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6422   #define _mm_sub_epi64(a, b) simde_mm_sub_epi64(a, b)
6423 #endif
6424 
6425 SIMDE_FUNCTION_ATTRIBUTES
6426 simde__m128i
6427 simde_x_mm_sub_epu32 (simde__m128i a, simde__m128i b) {
6428   simde__m128i_private
6429     r_,
6430     a_ = simde__m128i_to_private(a),
6431     b_ = simde__m128i_to_private(b);
6432 
6433   #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6434     r_.u32 = a_.u32 - b_.u32;
6435   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6436     r_.neon_u32 = vsubq_u32(a_.neon_u32, b_.neon_u32);
6437   #else
6438     SIMDE_VECTORIZE
6439     for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
6440       r_.u32[i] = a_.u32[i] - b_.u32[i];
6441     }
6442   #endif
6443 
6444   return simde__m128i_from_private(r_);
6445 }
6446 
6447 SIMDE_FUNCTION_ATTRIBUTES
6448 simde__m128d
6449 simde_mm_sub_pd (simde__m128d a, simde__m128d b) {
6450   #if defined(SIMDE_X86_SSE2_NATIVE)
6451     return _mm_sub_pd(a, b);
6452   #else
6453     simde__m128d_private
6454       r_,
6455       a_ = simde__m128d_to_private(a),
6456       b_ = simde__m128d_to_private(b);
6457 
6458     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6459       r_.f64 = a_.f64 - b_.f64;
6460     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6461       r_.neon_f64 = vsubq_f64(a_.neon_f64, b_.neon_f64);
6462     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
6463       r_.wasm_v128 = wasm_f64x2_sub(a_.wasm_v128, b_.wasm_v128);
6464     #else
6465       SIMDE_VECTORIZE
6466       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
6467         r_.f64[i] = a_.f64[i] - b_.f64[i];
6468       }
6469     #endif
6470 
6471     return simde__m128d_from_private(r_);
6472   #endif
6473 }
6474 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6475   #define _mm_sub_pd(a, b) simde_mm_sub_pd(a, b)
6476 #endif
6477 
6478 SIMDE_FUNCTION_ATTRIBUTES
6479 simde__m128d
6480 simde_mm_sub_sd (simde__m128d a, simde__m128d b) {
6481   #if defined(SIMDE_X86_SSE2_NATIVE)
6482     return _mm_sub_sd(a, b);
6483   #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
6484     return simde_mm_move_sd(a, simde_mm_sub_pd(a, b));
6485   #else
6486     simde__m128d_private
6487       r_,
6488       a_ = simde__m128d_to_private(a),
6489       b_ = simde__m128d_to_private(b);
6490 
6491     r_.f64[0] = a_.f64[0] - b_.f64[0];
6492     r_.f64[1] = a_.f64[1];
6493 
6494     return simde__m128d_from_private(r_);
6495   #endif
6496 }
6497 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6498   #define _mm_sub_sd(a, b) simde_mm_sub_sd(a, b)
6499 #endif
6500 
6501 SIMDE_FUNCTION_ATTRIBUTES
6502 simde__m64
6503 simde_mm_sub_si64 (simde__m64 a, simde__m64 b) {
6504   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
6505     return _mm_sub_si64(a, b);
6506   #else
6507     simde__m64_private
6508       r_,
6509       a_ = simde__m64_to_private(a),
6510       b_ = simde__m64_to_private(b);
6511 
6512     #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6513       r_.i64 = a_.i64 - b_.i64;
6514     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6515       r_.neon_i64 = vsub_s64(a_.neon_i64, b_.neon_i64);
6516     #else
6517       r_.i64[0] = a_.i64[0] - b_.i64[0];
6518     #endif
6519 
6520     return simde__m64_from_private(r_);
6521   #endif
6522 }
6523 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6524   #define _mm_sub_si64(a, b) simde_mm_sub_si64(a, b)
6525 #endif
6526 
6527 SIMDE_FUNCTION_ATTRIBUTES
6528 simde__m128i
6529 simde_mm_subs_epi8 (simde__m128i a, simde__m128i b) {
6530   #if defined(SIMDE_X86_SSE2_NATIVE)
6531     return _mm_subs_epi8(a, b);
6532   #else
6533     simde__m128i_private
6534       r_,
6535       a_ = simde__m128i_to_private(a),
6536       b_ = simde__m128i_to_private(b);
6537 
6538     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6539       r_.neon_i8 = vqsubq_s8(a_.neon_i8, b_.neon_i8);
6540     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
6541       r_.wasm_v128 = wasm_i8x16_sub_saturate(a_.wasm_v128, b_.wasm_v128);
6542     #else
6543       SIMDE_VECTORIZE
6544       for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i8[0])) ; i++) {
6545         if (((b_.i8[i]) > 0 && (a_.i8[i]) < INT8_MIN + (b_.i8[i]))) {
6546           r_.i8[i] = INT8_MIN;
6547         } else if ((b_.i8[i]) < 0 && (a_.i8[i]) > INT8_MAX + (b_.i8[i])) {
6548           r_.i8[i] = INT8_MAX;
6549         } else {
6550           r_.i8[i] = (a_.i8[i]) - (b_.i8[i]);
6551         }
6552       }
6553     #endif
6554 
6555     return simde__m128i_from_private(r_);
6556   #endif
6557 }
6558 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6559   #define _mm_subs_epi8(a, b) simde_mm_subs_epi8(a, b)
6560 #endif
6561 
6562 SIMDE_FUNCTION_ATTRIBUTES
6563 simde__m128i
6564 simde_mm_subs_epi16 (simde__m128i a, simde__m128i b) {
6565   #if defined(SIMDE_X86_SSE2_NATIVE)
6566     return _mm_subs_epi16(a, b);
6567   #else
6568     simde__m128i_private
6569       r_,
6570       a_ = simde__m128i_to_private(a),
6571       b_ = simde__m128i_to_private(b);
6572 
6573     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6574       r_.neon_i16 = vqsubq_s16(a_.neon_i16, b_.neon_i16);
6575     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
6576       r_.wasm_v128 = wasm_i16x8_sub_saturate(a_.wasm_v128, b_.wasm_v128);
6577     #else
6578       SIMDE_VECTORIZE
6579       for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) {
6580         if (((b_.i16[i]) > 0 && (a_.i16[i]) < INT16_MIN + (b_.i16[i]))) {
6581           r_.i16[i] = INT16_MIN;
6582         } else if ((b_.i16[i]) < 0 && (a_.i16[i]) > INT16_MAX + (b_.i16[i])) {
6583           r_.i16[i] = INT16_MAX;
6584         } else {
6585           r_.i16[i] = (a_.i16[i]) - (b_.i16[i]);
6586         }
6587       }
6588     #endif
6589 
6590     return simde__m128i_from_private(r_);
6591   #endif
6592 }
6593 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6594   #define _mm_subs_epi16(a, b) simde_mm_subs_epi16(a, b)
6595 #endif
6596 
6597 SIMDE_FUNCTION_ATTRIBUTES
6598 simde__m128i
6599 simde_mm_subs_epu8 (simde__m128i a, simde__m128i b) {
6600   #if defined(SIMDE_X86_SSE2_NATIVE)
6601     return _mm_subs_epu8(a, b);
6602   #else
6603     simde__m128i_private
6604       r_,
6605       a_ = simde__m128i_to_private(a),
6606       b_ = simde__m128i_to_private(b);
6607 
6608     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6609       r_.neon_u8 = vqsubq_u8(a_.neon_u8, b_.neon_u8);
6610     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
6611       r_.wasm_v128 = wasm_u8x16_sub_saturate(a_.wasm_v128, b_.wasm_v128);
6612     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
6613       r_.altivec_u8 = vec_subs(a_.altivec_u8, b_.altivec_u8);
6614     #else
6615       SIMDE_VECTORIZE
6616       for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i8[0])) ; i++) {
6617         const int32_t x = a_.u8[i] - b_.u8[i];
6618         if (x < 0) {
6619           r_.u8[i] = 0;
6620         } else if (x > UINT8_MAX) {
6621           r_.u8[i] = UINT8_MAX;
6622         } else {
6623           r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x);
6624         }
6625       }
6626     #endif
6627 
6628     return simde__m128i_from_private(r_);
6629   #endif
6630 }
6631 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6632   #define _mm_subs_epu8(a, b) simde_mm_subs_epu8(a, b)
6633 #endif
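
/* Saturating subtraction clamps instead of wrapping.  Hypothetical example:
 *
 *   simde__m128i a = simde_mm_set1_epi8(10);
 *   simde__m128i b = simde_mm_set1_epi8(30);
 *   simde__m128i r = simde_mm_subs_epu8(a, b);  // every lane saturates to 0
 */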
6634 
6635 SIMDE_FUNCTION_ATTRIBUTES
6636 simde__m128i
6637 simde_mm_subs_epu16 (simde__m128i a, simde__m128i b) {
6638   #if defined(SIMDE_X86_SSE2_NATIVE)
6639     return _mm_subs_epu16(a, b);
6640   #else
6641     simde__m128i_private
6642       r_,
6643       a_ = simde__m128i_to_private(a),
6644       b_ = simde__m128i_to_private(b);
6645 
6646     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6647       r_.neon_u16 = vqsubq_u16(a_.neon_u16, b_.neon_u16);
6648     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
6649       r_.wasm_v128 = wasm_u16x8_sub_saturate(a_.wasm_v128, b_.wasm_v128);
6650     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
6651       r_.altivec_u16 = vec_subs(a_.altivec_u16, b_.altivec_u16);
6652     #else
6653       SIMDE_VECTORIZE
6654       for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) {
6655         const int32_t x = a_.u16[i] - b_.u16[i];
6656         if (x < 0) {
6657           r_.u16[i] = 0;
6658         } else if (x > UINT16_MAX) {
6659           r_.u16[i] = UINT16_MAX;
6660         } else {
6661           r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x);
6662         }
6663       }
6664     #endif
6665 
6666     return simde__m128i_from_private(r_);
6667   #endif
6668 }
6669 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6670   #define _mm_subs_epu16(a, b) simde_mm_subs_epu16(a, b)
6671 #endif
6672 
6673 SIMDE_FUNCTION_ATTRIBUTES
6674 int
6675 simde_mm_ucomieq_sd (simde__m128d a, simde__m128d b) {
6676   #if defined(SIMDE_X86_SSE2_NATIVE)
6677     return _mm_ucomieq_sd(a, b);
6678   #else
6679     simde__m128d_private
6680       a_ = simde__m128d_to_private(a),
6681       b_ = simde__m128d_to_private(b);
6682     int r;
6683 
6684     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6685       uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
6686       uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
6687       uint64x2_t a_or_b_nan = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(a_not_nan, b_not_nan))));
6688       uint64x2_t a_eq_b = vceqq_f64(a_.neon_f64, b_.neon_f64);
6689       r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_eq_b), 0) != 0);
6690     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
6691       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) == wasm_f64x2_extract_lane(b_.wasm_v128, 0);
6692     #elif defined(SIMDE_HAVE_FENV_H)
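      /* The ucomi* compares are quiet: they must not raise FE_INVALID for
       * quiet NaNs, while the plain C comparisons used by these fallbacks
       * may.  feholdexcept() clears the flags and fesetenv() restores the
       * saved environment, discarding anything the comparison raised. */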
6693       fenv_t envp;
6694       int x = feholdexcept(&envp);
6695       r =  a_.f64[0] == b_.f64[0];
6696       if (HEDLEY_LIKELY(x == 0))
6697         fesetenv(&envp);
6698     #else
6699       r =  a_.f64[0] == b_.f64[0];
6700     #endif
6701 
6702     return r;
6703   #endif
6704 }
6705 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6706   #define _mm_ucomieq_sd(a, b) simde_mm_ucomieq_sd(a, b)
6707 #endif
6708 
6709 SIMDE_FUNCTION_ATTRIBUTES
6710 int
6711 simde_mm_ucomige_sd (simde__m128d a, simde__m128d b) {
6712   #if defined(SIMDE_X86_SSE2_NATIVE)
6713     return _mm_ucomige_sd(a, b);
6714   #else
6715     simde__m128d_private
6716       a_ = simde__m128d_to_private(a),
6717       b_ = simde__m128d_to_private(b);
6718     int r;
6719 
6720     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6721       uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
6722       uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
6723       uint64x2_t a_and_b_not_nan = vandq_u64(a_not_nan, b_not_nan);
6724       uint64x2_t a_ge_b = vcgeq_f64(a_.neon_f64, b_.neon_f64);
6725       r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_ge_b), 0) != 0);
6726     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
6727       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) >= wasm_f64x2_extract_lane(b_.wasm_v128, 0);
6728     #elif defined(SIMDE_HAVE_FENV_H)
6729       fenv_t envp;
6730       int x = feholdexcept(&envp);
6731       r = a_.f64[0] >= b_.f64[0];
6732       if (HEDLEY_LIKELY(x == 0))
6733         fesetenv(&envp);
6734     #else
6735       r = a_.f64[0] >= b_.f64[0];
6736     #endif
6737 
6738     return r;
6739   #endif
6740 }
6741 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6742   #define _mm_ucomige_sd(a, b) simde_mm_ucomige_sd(a, b)
6743 #endif
6744 
6745 SIMDE_FUNCTION_ATTRIBUTES
6746 int
6747 simde_mm_ucomigt_sd (simde__m128d a, simde__m128d b) {
6748   #if defined(SIMDE_X86_SSE2_NATIVE)
6749     return _mm_ucomigt_sd(a, b);
6750   #else
6751     simde__m128d_private
6752       a_ = simde__m128d_to_private(a),
6753       b_ = simde__m128d_to_private(b);
6754     int r;
6755 
6756     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6757       uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
6758       uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
6759       uint64x2_t a_and_b_not_nan = vandq_u64(a_not_nan, b_not_nan);
6760       uint64x2_t a_gt_b = vcgtq_f64(a_.neon_f64, b_.neon_f64);
6761       r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_gt_b), 0) != 0);
6762     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
6763       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) > wasm_f64x2_extract_lane(b_.wasm_v128, 0);
6764     #elif defined(SIMDE_HAVE_FENV_H)
6765       fenv_t envp;
6766       int x = feholdexcept(&envp);
6767       r = a_.f64[0] > b_.f64[0];
6768       if (HEDLEY_LIKELY(x == 0))
6769         fesetenv(&envp);
6770     #else
6771       r = a_.f64[0] > b_.f64[0];
6772     #endif
6773 
6774     return r;
6775   #endif
6776 }
6777 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6778   #define _mm_ucomigt_sd(a, b) simde_mm_ucomigt_sd(a, b)
6779 #endif
6780 
6781 SIMDE_FUNCTION_ATTRIBUTES
6782 int
6783 simde_mm_ucomile_sd (simde__m128d a, simde__m128d b) {
6784   #if defined(SIMDE_X86_SSE2_NATIVE)
6785     return _mm_ucomile_sd(a, b);
6786   #else
6787     simde__m128d_private
6788       a_ = simde__m128d_to_private(a),
6789       b_ = simde__m128d_to_private(b);
6790     int r;
6791 
6792     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6793       uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
6794       uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
6795       uint64x2_t a_or_b_nan = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(a_not_nan, b_not_nan))));
6796       uint64x2_t a_le_b = vcleq_f64(a_.neon_f64, b_.neon_f64);
6797       r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_le_b), 0) != 0);
6798     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
6799       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) <= wasm_f64x2_extract_lane(b_.wasm_v128, 0);
6800     #elif defined(SIMDE_HAVE_FENV_H)
6801       fenv_t envp;
6802       int x = feholdexcept(&envp);
6803       r = a_.f64[0] <= b_.f64[0];
6804       if (HEDLEY_LIKELY(x == 0))
6805         fesetenv(&envp);
6806     #else
6807       r = a_.f64[0] <= b_.f64[0];
6808     #endif
6809 
6810     return r;
6811   #endif
6812 }
6813 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6814   #define _mm_ucomile_sd(a, b) simde_mm_ucomile_sd(a, b)
6815 #endif
6816 
6817 SIMDE_FUNCTION_ATTRIBUTES
6818 int
6819 simde_mm_ucomilt_sd (simde__m128d a, simde__m128d b) {
6820   #if defined(SIMDE_X86_SSE2_NATIVE)
6821     return _mm_ucomilt_sd(a, b);
6822   #else
6823     simde__m128d_private
6824       a_ = simde__m128d_to_private(a),
6825       b_ = simde__m128d_to_private(b);
6826     int r;
6827 
6828     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6829       uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
6830       uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
6831       uint64x2_t a_or_b_nan = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(a_not_nan, b_not_nan))));
6832       uint64x2_t a_lt_b = vcltq_f64(a_.neon_f64, b_.neon_f64);
6833       r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_lt_b), 0) != 0);
6834     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
6835       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) < wasm_f64x2_extract_lane(b_.wasm_v128, 0);
6836     #elif defined(SIMDE_HAVE_FENV_H)
6837       fenv_t envp;
6838       int x = feholdexcept(&envp);
6839       r = a_.f64[0] < b_.f64[0];
6840       if (HEDLEY_LIKELY(x == 0))
6841         fesetenv(&envp);
6842     #else
6843       r = a_.f64[0] < b_.f64[0];
6844     #endif
6845 
6846     return r;
6847   #endif
6848 }
6849 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6850   #define _mm_ucomilt_sd(a, b) simde_mm_ucomilt_sd(a, b)
6851 #endif
6852 
6853 SIMDE_FUNCTION_ATTRIBUTES
6854 int
6855 simde_mm_ucomineq_sd (simde__m128d a, simde__m128d b) {
6856   #if defined(SIMDE_X86_SSE2_NATIVE)
6857     return _mm_ucomineq_sd(a, b);
6858   #else
6859     simde__m128d_private
6860       a_ = simde__m128d_to_private(a),
6861       b_ = simde__m128d_to_private(b);
6862     int r;
6863 
6864     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6865       uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
6866       uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
6867       uint64x2_t a_and_b_not_nan = vandq_u64(a_not_nan, b_not_nan);
6868       uint64x2_t a_neq_b = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(a_.neon_f64, b_.neon_f64))));
6869       r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_neq_b), 0) != 0);
6870     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
6871       return wasm_f64x2_extract_lane(a_.wasm_v128, 0) != wasm_f64x2_extract_lane(b_.wasm_v128, 0);
6872     #elif defined(SIMDE_HAVE_FENV_H)
6873       fenv_t envp;
6874       int x = feholdexcept(&envp);
6875       r = a_.f64[0] != b_.f64[0];
6876       if (HEDLEY_LIKELY(x == 0))
6877         fesetenv(&envp);
6878     #else
6879       r = a_.f64[0] != b_.f64[0];
6880     #endif
6881 
6882     return r;
6883   #endif
6884 }
6885 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6886   #define _mm_ucomineq_sd(a, b) simde_mm_ucomineq_sd(a, b)
6887 #endif
6888 
6889 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
6890   HEDLEY_DIAGNOSTIC_PUSH
6891   SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
6892 #endif
6893 
6894 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
6895   HEDLEY_DIAGNOSTIC_POP
6896 #endif
6897 
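/* In the portable fallbacks below, both the load fence and the full fence are
 * routed through simde_mm_sfence(), which on non-x86 targets is expected to
 * lower to a full memory barrier, so the ordering guarantee is preserved (if
 * anything, strengthened). */
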
6898 SIMDE_FUNCTION_ATTRIBUTES
6899 void
6900 simde_mm_lfence (void) {
6901   #if defined(SIMDE_X86_SSE2_NATIVE)
6902     _mm_lfence();
6903   #else
6904     simde_mm_sfence();
6905   #endif
6906 }
6907 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6908   #define _mm_lfence() simde_mm_lfence()
6909 #endif
6910 
6911 SIMDE_FUNCTION_ATTRIBUTES
6912 void
6913 simde_mm_mfence (void) {
6914   #if defined(SIMDE_X86_SSE2_NATIVE)
6915     _mm_mfence();
6916   #else
6917     simde_mm_sfence();
6918   #endif
6919 }
6920 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6921   #define _mm_mfence() simde_mm_mfence()
6922 #endif
6923 
6924 SIMDE_FUNCTION_ATTRIBUTES
6925 simde__m128i
6926 simde_mm_unpackhi_epi8 (simde__m128i a, simde__m128i b) {
6927   #if defined(SIMDE_X86_SSE2_NATIVE)
6928     return _mm_unpackhi_epi8(a, b);
6929   #else
6930     simde__m128i_private
6931       r_,
6932       a_ = simde__m128i_to_private(a),
6933       b_ = simde__m128i_to_private(b);
6934 
6935     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6936       r_.neon_i8 = vzip2q_s8(a_.neon_i8, b_.neon_i8);
6937     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6938       int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(a_.neon_i16));
6939       int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(b_.neon_i16));
6940       int8x8x2_t result = vzip_s8(a1, b1);
6941       r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]);
6942     #elif defined(SIMDE_SHUFFLE_VECTOR_)
6943       r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
6944     #else
6945       SIMDE_VECTORIZE
6946       for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i8[0])) / 2) ; i++) {
6947         r_.i8[(i * 2)]     = a_.i8[i + ((sizeof(r_) / sizeof(r_.i8[0])) / 2)];
6948         r_.i8[(i * 2) + 1] = b_.i8[i + ((sizeof(r_) / sizeof(r_.i8[0])) / 2)];
6949       }
6950     #endif
6951 
6952     return simde__m128i_from_private(r_);
6953   #endif
6954 }
6955 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6956   #define _mm_unpackhi_epi8(a, b) simde_mm_unpackhi_epi8(a, b)
6957 #endif
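
/* Interleave sketch (lane labels only, no concrete values): with
 * a = { a0 ... a15 } and b = { b0 ... b15 },
 * simde_mm_unpackhi_epi8(a, b) produces { a8, b8, a9, b9, ..., a15, b15 },
 * i.e. the high halves of a and b interleaved byte by byte.  The
 * *_unpacklo_* functions later in this file do the same with the low halves. */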
6958 
6959 SIMDE_FUNCTION_ATTRIBUTES
6960 simde__m128i
6961 simde_mm_unpackhi_epi16 (simde__m128i a, simde__m128i b) {
6962   #if defined(SIMDE_X86_SSE2_NATIVE)
6963     return _mm_unpackhi_epi16(a, b);
6964   #else
6965     simde__m128i_private
6966       r_,
6967       a_ = simde__m128i_to_private(a),
6968       b_ = simde__m128i_to_private(b);
6969 
6970     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6971       r_.neon_i16 = vzip2q_s16(a_.neon_i16, b_.neon_i16);
6972     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6973       int16x4_t a1 = vget_high_s16(a_.neon_i16);
6974       int16x4_t b1 = vget_high_s16(b_.neon_i16);
6975       int16x4x2_t result = vzip_s16(a1, b1);
6976       r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]);
6977     #elif defined(SIMDE_SHUFFLE_VECTOR_)
6978       r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 4, 12, 5, 13, 6, 14, 7, 15);
6979     #else
6980       SIMDE_VECTORIZE
6981       for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i16[0])) / 2) ; i++) {
6982         r_.i16[(i * 2)]     = a_.i16[i + ((sizeof(r_) / sizeof(r_.i16[0])) / 2)];
6983         r_.i16[(i * 2) + 1] = b_.i16[i + ((sizeof(r_) / sizeof(r_.i16[0])) / 2)];
6984       }
6985     #endif
6986 
6987     return simde__m128i_from_private(r_);
6988   #endif
6989 }
6990 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6991   #define _mm_unpackhi_epi16(a, b) simde_mm_unpackhi_epi16(a, b)
6992 #endif
6993 
6994 SIMDE_FUNCTION_ATTRIBUTES
6995 simde__m128i
6996 simde_mm_unpackhi_epi32 (simde__m128i a, simde__m128i b) {
6997   #if defined(SIMDE_X86_SSE2_NATIVE)
6998     return _mm_unpackhi_epi32(a, b);
6999   #else
7000     simde__m128i_private
7001       r_,
7002       a_ = simde__m128i_to_private(a),
7003       b_ = simde__m128i_to_private(b);
7004 
7005     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7006       r_.neon_i32 = vzip2q_s32(a_.neon_i32, b_.neon_i32);
7007     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7008       int32x2_t a1 = vget_high_s32(a_.neon_i32);
7009       int32x2_t b1 = vget_high_s32(b_.neon_i32);
7010       int32x2x2_t result = vzip_s32(a1, b1);
7011       r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]);
7012     #elif defined(SIMDE_SHUFFLE_VECTOR_)
7013       r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 2, 6, 3, 7);
7014     #else
7015       SIMDE_VECTORIZE
7016       for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i32[0])) / 2) ; i++) {
7017         r_.i32[(i * 2)]     = a_.i32[i + ((sizeof(r_) / sizeof(r_.i32[0])) / 2)];
7018         r_.i32[(i * 2) + 1] = b_.i32[i + ((sizeof(r_) / sizeof(r_.i32[0])) / 2)];
7019       }
7020     #endif
7021 
7022     return simde__m128i_from_private(r_);
7023   #endif
7024 }
7025 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7026   #define _mm_unpackhi_epi32(a, b) simde_mm_unpackhi_epi32(a, b)
7027 #endif
7028 
7029 SIMDE_FUNCTION_ATTRIBUTES
7030 simde__m128i
7031 simde_mm_unpackhi_epi64 (simde__m128i a, simde__m128i b) {
7032   #if defined(SIMDE_X86_SSE2_NATIVE)
7033     return _mm_unpackhi_epi64(a, b);
7034   #else
7035     simde__m128i_private
7036       r_,
7037       a_ = simde__m128i_to_private(a),
7038       b_ = simde__m128i_to_private(b);
7039 
7040     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7041       int64x1_t a_h = vget_high_s64(a_.neon_i64);
7042       int64x1_t b_h = vget_high_s64(b_.neon_i64);
7043       r_.neon_i64 = vcombine_s64(a_h, b_h);
7044     #elif defined(SIMDE_SHUFFLE_VECTOR_)
7045       r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 1, 3);
7046     #else
7047       SIMDE_VECTORIZE
7048       for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i64[0])) / 2) ; i++) {
7049         r_.i64[(i * 2)]     = a_.i64[i + ((sizeof(r_) / sizeof(r_.i64[0])) / 2)];
7050         r_.i64[(i * 2) + 1] = b_.i64[i + ((sizeof(r_) / sizeof(r_.i64[0])) / 2)];
7051       }
7052     #endif
7053 
7054     return simde__m128i_from_private(r_);
7055   #endif
7056 }
7057 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7058   #define _mm_unpackhi_epi64(a, b) simde_mm_unpackhi_epi64(a, b)
7059 #endif
7060 
7061 SIMDE_FUNCTION_ATTRIBUTES
7062 simde__m128d
7063 simde_mm_unpackhi_pd (simde__m128d a, simde__m128d b) {
7064   #if defined(SIMDE_X86_SSE2_NATIVE)
7065     return _mm_unpackhi_pd(a, b);
7066   #else
7067     simde__m128d_private
7068       r_,
7069       a_ = simde__m128d_to_private(a),
7070       b_ = simde__m128d_to_private(b);
7071 
7072     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7073       float64x1_t a_l = vget_high_f64(a_.neon_f64);
7074       float64x1_t b_l = vget_high_f64(b_.neon_f64);
7075       r_.neon_f64 = vcombine_f64(a_l, b_l);
7076     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
7077       r_.wasm_v128 = wasm_v64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3);
7078     #elif defined(SIMDE_SHUFFLE_VECTOR_)
7079       r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 1, 3);
7080     #else
7081       SIMDE_VECTORIZE
7082       for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2) ; i++) {
7083         r_.f64[(i * 2)]     = a_.f64[i + ((sizeof(r_) / sizeof(r_.f64[0])) / 2)];
7084         r_.f64[(i * 2) + 1] = b_.f64[i + ((sizeof(r_) / sizeof(r_.f64[0])) / 2)];
7085       }
7086     #endif
7087 
7088     return simde__m128d_from_private(r_);
7089   #endif
7090 }
7091 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7092   #define _mm_unpackhi_pd(a, b) simde_mm_unpackhi_pd(a, b)
7093 #endif
7094 
7095 SIMDE_FUNCTION_ATTRIBUTES
7096 simde__m128i
7097 simde_mm_unpacklo_epi8 (simde__m128i a, simde__m128i b) {
7098   #if defined(SIMDE_X86_SSE2_NATIVE)
7099     return _mm_unpacklo_epi8(a, b);
7100   #else
7101     simde__m128i_private
7102       r_,
7103       a_ = simde__m128i_to_private(a),
7104       b_ = simde__m128i_to_private(b);
7105 
7106     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7107       r_.neon_i8 = vzip1q_s8(a_.neon_i8, b_.neon_i8);
7108     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7109       int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(a_.neon_i16));
7110       int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(b_.neon_i16));
7111       int8x8x2_t result = vzip_s8(a1, b1);
7112       r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]);
7113     #elif defined(SIMDE_SHUFFLE_VECTOR_)
7114       r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
7115     #else
7116       SIMDE_VECTORIZE
7117       for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i8[0])) / 2) ; i++) {
7118         r_.i8[(i * 2)]     = a_.i8[i];
7119         r_.i8[(i * 2) + 1] = b_.i8[i];
7120       }
7121     #endif
7122 
7123     return simde__m128i_from_private(r_);
7124   #endif
7125 }
7126 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7127   #define _mm_unpacklo_epi8(a, b) simde_mm_unpacklo_epi8(a, b)
7128 #endif
7129 
7130 SIMDE_FUNCTION_ATTRIBUTES
7131 simde__m128i
7132 simde_mm_unpacklo_epi16 (simde__m128i a, simde__m128i b) {
7133   #if defined(SIMDE_X86_SSE2_NATIVE)
7134     return _mm_unpacklo_epi16(a, b);
7135   #else
7136     simde__m128i_private
7137       r_,
7138       a_ = simde__m128i_to_private(a),
7139       b_ = simde__m128i_to_private(b);
7140 
7141     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7142       r_.neon_i16 = vzip1q_s16(a_.neon_i16, b_.neon_i16);
7143     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7144       int16x4_t a1 = vget_low_s16(a_.neon_i16);
7145       int16x4_t b1 = vget_low_s16(b_.neon_i16);
7146       int16x4x2_t result = vzip_s16(a1, b1);
7147       r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]);
7148     #elif defined(SIMDE_SHUFFLE_VECTOR_)
7149       r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 0, 8, 1, 9, 2, 10, 3, 11);
7150     #else
7151       SIMDE_VECTORIZE
7152       for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i16[0])) / 2) ; i++) {
7153         r_.i16[(i * 2)]     = a_.i16[i];
7154         r_.i16[(i * 2) + 1] = b_.i16[i];
7155       }
7156     #endif
7157 
7158     return simde__m128i_from_private(r_);
7159   #endif
7160 }
7161 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7162   #define _mm_unpacklo_epi16(a, b) simde_mm_unpacklo_epi16(a, b)
7163 #endif
7164 
7165 SIMDE_FUNCTION_ATTRIBUTES
7166 simde__m128i
7167 simde_mm_unpacklo_epi32 (simde__m128i a, simde__m128i b) {
7168   #if defined(SIMDE_X86_SSE2_NATIVE)
7169     return _mm_unpacklo_epi32(a, b);
7170   #else
7171     simde__m128i_private
7172       r_,
7173       a_ = simde__m128i_to_private(a),
7174       b_ = simde__m128i_to_private(b);
7175 
7176     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7177       r_.neon_i32 = vzip1q_s32(a_.neon_i32, b_.neon_i32);
7178     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7179       int32x2_t a1 = vget_low_s32(a_.neon_i32);
7180       int32x2_t b1 = vget_low_s32(b_.neon_i32);
7181       int32x2x2_t result = vzip_s32(a1, b1);
7182       r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]);
7183     #elif defined(SIMDE_SHUFFLE_VECTOR_)
7184       r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 0, 4, 1, 5);
7185     #else
7186       SIMDE_VECTORIZE
7187       for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i32[0])) / 2) ; i++) {
7188         r_.i32[(i * 2)]     = a_.i32[i];
7189         r_.i32[(i * 2) + 1] = b_.i32[i];
7190       }
7191     #endif
7192 
7193     return simde__m128i_from_private(r_);
7194   #endif
7195 }
7196 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7197   #define _mm_unpacklo_epi32(a, b) simde_mm_unpacklo_epi32(a, b)
7198 #endif
7199 
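
/* simde_mm_unpacklo_epi64: concatenate the low 64-bit halves of a and b:
 *   r = { a0, b0 } */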
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_unpacklo_epi64 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_unpacklo_epi64(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      int64x1_t a_l = vget_low_s64(a_.neon_i64);
      int64x1_t b_l = vget_low_s64(b_.neon_i64);
      r_.neon_i64 = vcombine_s64(a_l, b_l);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 0, 2);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i64[0])) / 2) ; i++) {
        r_.i64[(i * 2)]     = a_.i64[i];
        r_.i64[(i * 2) + 1] = b_.i64[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_unpacklo_epi64(a, b) simde_mm_unpacklo_epi64(a, b)
#endif
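
/* simde_mm_unpacklo_pd: interleave the lower double-precision elements:
 *   r = { a0, b0 } */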
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_unpacklo_pd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_unpacklo_pd(a, b);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      float64x1_t a_l = vget_low_f64(a_.neon_f64);
      float64x1_t b_l = vget_low_f64(b_.neon_f64);
      r_.neon_f64 = vcombine_f64(a_l, b_l);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 0, 2);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2) ; i++) {
        r_.f64[(i * 2)]     = a_.f64[i];
        r_.f64[(i * 2) + 1] = b_.f64[i];
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_unpacklo_pd(a, b) simde_mm_unpacklo_pd(a, b)
#endif
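
/* simde_x_mm_negate_pd: SIMDe-internal helper (no direct SSE2 intrinsic)
 * that negates both double-precision lanes by flipping their sign bits,
 * e.g. { 1.0, -2.5 } -> { -1.0, 2.5 }. */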
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_x_mm_negate_pd(simde__m128d a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return simde_mm_xor_pd(a, _mm_set1_pd(SIMDE_FLOAT64_C(-0.0)));
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a);

    #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && \
        (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,1,0))
      r_.altivec_f64 = vec_neg(a_.altivec_f64);
    #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vnegq_f64(a_.neon_f64);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_f64x2_neg(a_.wasm_v128);
    #elif defined(SIMDE_VECTOR_NEGATE)
      r_.f64 = -a_.f64;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
        r_.f64[i] = -a_.f64[i];
      }
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
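
/* simde_mm_xor_si128: bitwise XOR of the full 128-bit registers. */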
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_xor_si128 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_xor_si128(a, b);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i32 = veorq_s32(a_.neon_i32, b_.neon_i32);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i32 = vec_xor(a_.altivec_i32, b_.altivec_i32);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32f = a_.i32f ^ b_.i32f;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
        r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_xor_si128(a, b) simde_mm_xor_si128(a, b)
#endif
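
/* simde_x_mm_not_si128: SIMDe-internal helper (no direct SSE2 intrinsic)
 * that computes the bitwise NOT of all 128 bits; with AVX-512VL this is a
 * single ternary-logic instruction (truth table 0x55 == ~a). */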
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_not_si128 (simde__m128i a) {
  #if defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm_ternarylogic_epi32(a, a, a, 0x55);
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a);

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i32 = vmvnq_s32(a_.neon_i32);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_i32 = vec_nor(a_.altivec_i32, a_.altivec_i32);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.wasm_v128 = wasm_v128_not(a_.wasm_v128);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32f = ~a_.i32f;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
        r_.i32f[i] = ~(a_.i32f[i]);
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
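
/* SIMDE_MM_SHUFFLE2: build the 2-bit immediate used by _mm_shuffle_pd.
 * Bit 0 (y) selects the element of a placed in the low lane, bit 1 (x) the
 * element of b placed in the high lane, e.g.
 * SIMDE_MM_SHUFFLE2(0, 1) -> { a1, b0 }. */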
#define SIMDE_MM_SHUFFLE2(x, y) (((x) << 1) | (y))
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _MM_SHUFFLE2(x, y) SIMDE_MM_SHUFFLE2(x, y)
#endif

SIMDE_END_DECLS_

HEDLEY_DIAGNOSTIC_POP

#endif /* !defined(SIMDE_X86_SSE2_H) */