1 /* SPDX-License-Identifier: MIT
2  *
3  * Permission is hereby granted, free of charge, to any person
4  * obtaining a copy of this software and associated documentation
5  * files (the "Software"), to deal in the Software without
6  * restriction, including without limitation the rights to use, copy,
7  * modify, merge, publish, distribute, sublicense, and/or sell copies
8  * of the Software, and to permit persons to whom the Software is
9  * furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be
12  * included in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Copyright:
24  *   2017-2020 Evan Nemerson <evan@nemerson.com>
25  *   2015-2017 John W. Ratcliff <jratcliffscarab@gmail.com>
26  *   2015      Brandon Rowlett <browlett@nvidia.com>
27  *   2015      Ken Fast <kfast@gdeb.com>
28  *   2017      Hasindu Gamaarachchi <hasindu@unsw.edu.au>
29  *   2018      Jeff Daily <jeff.daily@amd.com>
30  */
31 
32 #if !defined(SIMDE_X86_SSE2_H)
33 #define SIMDE_X86_SSE2_H
34 
35 #include "sse.h"
36 
37 HEDLEY_DIAGNOSTIC_PUSH
38 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
39 SIMDE_BEGIN_DECLS_
40 
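/* A simde__m128i is represented internally by the union below: every member
 * aliases the same sixteen bytes, so each function can use whichever view is
 * cheapest on the current target -- GCC-style vector extensions when
 * SIMDE_VECTOR_SUBSCRIPT is available, plain lane arrays otherwise, and the
 * native __m128i, NEON, WASM, or AltiVec types when one of those backends is
 * enabled. */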
41 typedef union {
42 #if defined(SIMDE_VECTOR_SUBSCRIPT)
43   SIMDE_ALIGN(16) int8_t          i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
44   SIMDE_ALIGN(16) int16_t        i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
45   SIMDE_ALIGN(16) int32_t        i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
46   SIMDE_ALIGN(16) int64_t        i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
47   SIMDE_ALIGN(16) uint8_t         u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
48   SIMDE_ALIGN(16) uint16_t       u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
49   SIMDE_ALIGN(16) uint32_t       u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
50   SIMDE_ALIGN(16) uint64_t       u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
51   #if defined(SIMDE_HAVE_INT128_)
52   SIMDE_ALIGN(16) simde_int128  i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
53   SIMDE_ALIGN(16) simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
54   #endif
55   SIMDE_ALIGN(16) simde_float32  f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
56   SIMDE_ALIGN(16) simde_float64  f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
57 
58   SIMDE_ALIGN(16) int_fast32_t  i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
59   SIMDE_ALIGN(16) uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
60 #else
61   SIMDE_ALIGN(16) int8_t         i8[16];
62   SIMDE_ALIGN(16) int16_t        i16[8];
63   SIMDE_ALIGN(16) int32_t        i32[4];
64   SIMDE_ALIGN(16) int64_t        i64[2];
65   SIMDE_ALIGN(16) uint8_t        u8[16];
66   SIMDE_ALIGN(16) uint16_t       u16[8];
67   SIMDE_ALIGN(16) uint32_t       u32[4];
68   SIMDE_ALIGN(16) uint64_t       u64[2];
69   #if defined(SIMDE_HAVE_INT128_)
70   SIMDE_ALIGN(16) simde_int128  i128[1];
71   SIMDE_ALIGN(16) simde_uint128 u128[1];
72   #endif
73   SIMDE_ALIGN(16) simde_float32  f32[4];
74   SIMDE_ALIGN(16) simde_float64  f64[2];
75 
76   SIMDE_ALIGN(16) int_fast32_t  i32f[16 / sizeof(int_fast32_t)];
77   SIMDE_ALIGN(16) uint_fast32_t u32f[16 / sizeof(uint_fast32_t)];
78 #endif
79 
80   SIMDE_ALIGN(16) simde__m64_private m64_private[2];
81   SIMDE_ALIGN(16) simde__m64         m64[2];
82 
83 #if defined(SIMDE_X86_SSE2_NATIVE)
84   SIMDE_ALIGN(16) __m128i        n;
85 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
86   SIMDE_ALIGN(16) int8x16_t      neon_i8;
87   SIMDE_ALIGN(16) int16x8_t      neon_i16;
88   SIMDE_ALIGN(16) int32x4_t      neon_i32;
89   SIMDE_ALIGN(16) int64x2_t      neon_i64;
90   SIMDE_ALIGN(16) uint8x16_t     neon_u8;
91   SIMDE_ALIGN(16) uint16x8_t     neon_u16;
92   SIMDE_ALIGN(16) uint32x4_t     neon_u32;
93   SIMDE_ALIGN(16) uint64x2_t     neon_u64;
94   SIMDE_ALIGN(16) float32x4_t    neon_f32;
95   #if defined(SIMDE_ARCH_AARCH64)
96   SIMDE_ALIGN(16) float64x2_t    neon_f64;
97   #endif
98 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
99   SIMDE_ALIGN(16) v128_t         wasm_v128;
100 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
101   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed char)          altivec_i8;
102   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed short)         altivec_i16;
103   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed int)           altivec_i32;
  #if defined(__INT_FAST32_TYPE__) && defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
105   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(__INT_FAST32_TYPE__)  altivec_i32f;
106   #else
107   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed int)           altivec_i32f;
108   #endif
109   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned char)        altivec_u8;
110   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned short)       altivec_u16;
111   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned int)         altivec_u32;
112   #if defined(__UINT_FAST32_TYPE__) && defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
113   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(__UINT_FAST32_TYPE__) altivec_u32f;
114   #else
115   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned int)         altivec_u32f;
116   #endif
117   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(float)                altivec_f32;
118   #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
119     SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed long long)   altivec_i64;
120     SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64;
121     SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(double)             altivec_f64;
122   #endif
123 #endif
124 } simde__m128i_private;
125 
126 typedef union {
127 #if defined(SIMDE_VECTOR_SUBSCRIPT)
128   SIMDE_ALIGN(16) int8_t          i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
129   SIMDE_ALIGN(16) int16_t        i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
130   SIMDE_ALIGN(16) int32_t        i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
131   SIMDE_ALIGN(16) int64_t        i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
132   SIMDE_ALIGN(16) uint8_t         u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
133   SIMDE_ALIGN(16) uint16_t       u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
134   SIMDE_ALIGN(16) uint32_t       u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
135   SIMDE_ALIGN(16) uint64_t       u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
136   SIMDE_ALIGN(16) simde_float32  f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
137   SIMDE_ALIGN(16) simde_float64  f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
138   SIMDE_ALIGN(16) int_fast32_t  i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
139   SIMDE_ALIGN(16) uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
140 #else
141   SIMDE_ALIGN(16) int8_t         i8[16];
142   SIMDE_ALIGN(16) int16_t        i16[8];
143   SIMDE_ALIGN(16) int32_t        i32[4];
144   SIMDE_ALIGN(16) int64_t        i64[2];
145   SIMDE_ALIGN(16) uint8_t        u8[16];
146   SIMDE_ALIGN(16) uint16_t       u16[8];
147   SIMDE_ALIGN(16) uint32_t       u32[4];
148   SIMDE_ALIGN(16) uint64_t       u64[2];
149   SIMDE_ALIGN(16) simde_float32  f32[4];
150   SIMDE_ALIGN(16) simde_float64  f64[2];
151   SIMDE_ALIGN(16) int_fast32_t  i32f[16 / sizeof(int_fast32_t)];
152   SIMDE_ALIGN(16) uint_fast32_t u32f[16 / sizeof(uint_fast32_t)];
153 #endif
154 
155   SIMDE_ALIGN(16) simde__m64_private m64_private[2];
156   SIMDE_ALIGN(16) simde__m64         m64[2];
157 
158 #if defined(SIMDE_X86_SSE2_NATIVE)
159   SIMDE_ALIGN(16) __m128d        n;
160 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
161   SIMDE_ALIGN(16) int8x16_t      neon_i8;
162   SIMDE_ALIGN(16) int16x8_t      neon_i16;
163   SIMDE_ALIGN(16) int32x4_t      neon_i32;
164   SIMDE_ALIGN(16) int64x2_t      neon_i64;
165   SIMDE_ALIGN(16) uint8x16_t     neon_u8;
166   SIMDE_ALIGN(16) uint16x8_t     neon_u16;
167   SIMDE_ALIGN(16) uint32x4_t     neon_u32;
168   SIMDE_ALIGN(16) uint64x2_t     neon_u64;
169   SIMDE_ALIGN(16) float32x4_t    neon_f32;
170   #if defined(SIMDE_ARCH_AARCH64)
171   SIMDE_ALIGN(16) float64x2_t    neon_f64;
172   #endif
173 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
174   SIMDE_ALIGN(16) v128_t         wasm_v128;
175 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
176   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed char)          altivec_i8;
177   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed short)         altivec_i16;
178   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed int)           altivec_i32;
179   #if defined(__INT_FAST32_TYPE__) && defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
180   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(__INT_FAST32_TYPE__)  altivec_i32f;
181   #else
182   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed int)           altivec_i32f;
183   #endif
184   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned char)        altivec_u8;
185   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned short)       altivec_u16;
186   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned int)         altivec_u32;
187   #if defined(__UINT_FAST32_TYPE__) && defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
188   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(__UINT_FAST32_TYPE__) altivec_u32f;
189   #else
190   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned int)         altivec_u32f;
191   #endif
192   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(float)                altivec_f32;
193   #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
194     SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed long long)   altivec_i64;
195     SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64;
196     SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(double)             altivec_f64;
197   #endif
198 #endif
199 } simde__m128d_private;
200 
201 #if defined(SIMDE_X86_SSE2_NATIVE)
202   typedef __m128i simde__m128i;
203   typedef __m128d simde__m128d;
204 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
205    typedef int64x2_t simde__m128i;
206 #  if defined(SIMDE_ARCH_AARCH64)
207      typedef float64x2_t simde__m128d;
208 #  elif defined(SIMDE_VECTOR_SUBSCRIPT)
209      typedef simde_float64 simde__m128d SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
210 #  else
211      typedef simde__m128d_private simde__m128d;
212 #  endif
213 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
214    typedef v128_t simde__m128i;
215    typedef v128_t simde__m128d;
216 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
217   typedef SIMDE_POWER_ALTIVEC_VECTOR(float) simde__m128i;
218   #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
219      typedef SIMDE_POWER_ALTIVEC_VECTOR(double) simde__m128d;
220   #else
221      typedef simde__m128d_private simde__m128d;
222   #endif
223 #elif defined(SIMDE_VECTOR_SUBSCRIPT)
224   typedef int64_t simde__m128i SIMDE_ALIGN(16) SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
225   typedef simde_float64 simde__m128d SIMDE_ALIGN(16) SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
226 #else
227   typedef simde__m128i_private simde__m128i;
228   typedef simde__m128d_private simde__m128d;
229 #endif
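
/* The public simde__m128i and simde__m128d types above resolve, in order of
 * preference, to the native x86 types, a suitable NEON/WASM/AltiVec vector
 * type, a bare GCC vector-extension type, or the private unions as a last
 * resort.  The static assertions below verify that whichever representation
 * was chosen still has the expected 16-byte size (and, where measurable,
 * alignment). */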
230 
231 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
232   typedef simde__m128i __m128i;
233   typedef simde__m128d __m128d;
234 #endif
235 
236 HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i), "simde__m128i size incorrect");
237 HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i_private), "simde__m128i_private size incorrect");
238 HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d), "simde__m128d size incorrect");
239 HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d_private), "simde__m128d_private size incorrect");
240 #if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF)
241 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128i) == 16, "simde__m128i is not 16-byte aligned");
242 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128i_private) == 16, "simde__m128i_private is not 16-byte aligned");
243 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128d) == 16, "simde__m128d is not 16-byte aligned");
244 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128d_private) == 16, "simde__m128d_private is not 16-byte aligned");
245 #endif
246 
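/* Conversions between the public and private types go through simde_memcpy
 * rather than pointer casts so they stay well-defined under strict aliasing;
 * since both types have the same size, compilers normally optimize the copy
 * away entirely. */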
247 SIMDE_FUNCTION_ATTRIBUTES
248 simde__m128i
simde__m128i_from_private(simde__m128i_private v) {
250   simde__m128i r;
251   simde_memcpy(&r, &v, sizeof(r));
252   return r;
253 }
254 
255 SIMDE_FUNCTION_ATTRIBUTES
256 simde__m128i_private
simde__m128i_to_private(simde__m128i v) {
258   simde__m128i_private r;
259   simde_memcpy(&r, &v, sizeof(r));
260   return r;
261 }
262 
263 SIMDE_FUNCTION_ATTRIBUTES
264 simde__m128d
simde__m128d_from_private(simde__m128d_private v) {
266   simde__m128d r;
267   simde_memcpy(&r, &v, sizeof(r));
268   return r;
269 }
270 
271 SIMDE_FUNCTION_ATTRIBUTES
272 simde__m128d_private
simde__m128d_to_private(simde__m128d v) {
274   simde__m128d_private r;
275   simde_memcpy(&r, &v, sizeof(r));
276   return r;
277 }
278 
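/* SIMDE_X86_GENERATE_CONVERSION_FUNCTION (presumably provided by sse.h or a
 * shared SIMDe header) expands to a from/to pair of helpers such as
 * simde__m128i_from_neon_i8() and simde__m128i_to_neon_i8().  The macro
 * invocations below instantiate those helpers for every NEON and AltiVec
 * representation of __m128i and __m128d that the private unions expose. */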
279 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int8x16_t, neon, i8)
281   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int16x8_t, neon, i16)
282   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int32x4_t, neon, i32)
283   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int64x2_t, neon, i64)
284   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint8x16_t, neon, u8)
285   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint16x8_t, neon, u16)
286   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint32x4_t, neon, u32)
287   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint64x2_t, neon, u64)
288   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, float32x4_t, neon, f32)
289   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
290     SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, float64x2_t, neon, f64)
291   #endif
292 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
293   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed char), altivec, i8)
294   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed short), altivec, i16)
295   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed int), altivec, i32)
296   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8)
297   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16)
298   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), altivec, u32)
299   #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
300     SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64)
301     SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64)
302   #endif
303 #endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
304 
305 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
306   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int8x16_t, neon, i8)
307   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int16x8_t, neon, i16)
308   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int32x4_t, neon, i32)
309   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int64x2_t, neon, i64)
310   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint8x16_t, neon, u8)
311   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint16x8_t, neon, u16)
312   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint32x4_t, neon, u32)
313   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint64x2_t, neon, u64)
314   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, float32x4_t, neon, f32)
315   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
316     SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, float64x2_t, neon, f64)
317   #endif
318 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
319   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed char), altivec, i8)
320   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed short), altivec, i16)
321   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed int), altivec, i32)
322   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8)
323   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16)
324   SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), altivec, u32)
325   #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
326     SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64)
327     SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64)
328     #if defined(SIMDE_BUG_GCC_95782)
329       SIMDE_FUNCTION_ATTRIBUTES
330       SIMDE_POWER_ALTIVEC_VECTOR(double)
331       simde__m128d_to_altivec_f64(simde__m128d value) {
332         simde__m128d_private r_ = simde__m128d_to_private(value);
333         return r_.altivec_f64;
334       }
335 
336       SIMDE_FUNCTION_ATTRIBUTES
337       simde__m128d
338       simde__m128d_from_altivec_f64(SIMDE_POWER_ALTIVEC_VECTOR(double) value) {
339         simde__m128d_private r_;
340         r_.altivec_f64 = value;
341         return simde__m128d_from_private(r_);
342       }
343     #else
344       SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(double), altivec, f64)
345     #endif
346   #endif
347 #endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
348 
349 SIMDE_FUNCTION_ATTRIBUTES
350 simde__m128i
351 simde_mm_add_epi8 (simde__m128i a, simde__m128i b) {
352 #if defined(SIMDE_X86_SSE2_NATIVE)
353   return _mm_add_epi8(a, b);
354 #else
355   simde__m128i_private
356     r_,
357     a_ = simde__m128i_to_private(a),
358     b_ = simde__m128i_to_private(b);
359 
360   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
361     r_.neon_i8 = vaddq_s8(a_.neon_i8, b_.neon_i8);
362   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
363     r_.altivec_i8 = vec_add(a_.altivec_i8, b_.altivec_i8);
364   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
365     r_.i8 = a_.i8 + b_.i8;
366   #else
367     SIMDE_VECTORIZE
368     for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
369       r_.i8[i] = a_.i8[i] + b_.i8[i];
370     }
371   #endif
372 
373   return simde__m128i_from_private(r_);
374 #endif
375 }
376 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
377 #  define _mm_add_epi8(a, b) simde_mm_add_epi8(a, b)
378 #endif
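
/* simde_mm_add_epi8() above illustrates the pattern every wrapper in this
 * file follows: call the real intrinsic when SIMDE_X86_SSE2_NATIVE is
 * defined, otherwise convert the arguments to the private union and take the
 * best available path (NEON, AltiVec, vector extensions, or a scalar loop).
 * A minimal usage sketch, assuming the rest of SIMDe's SSE2 API (e.g.
 * simde_mm_set1_epi8(), defined later in this header, not in this fragment)
 * is available:
 *
 *   simde__m128i a = simde_mm_set1_epi8(1);
 *   simde__m128i b = simde_mm_set1_epi8(2);
 *   simde__m128i r = simde_mm_add_epi8(a, b);   // sixteen lanes holding 3
 *
 * Defining SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES before including this header
 * additionally maps the original _mm_* names onto these wrappers. */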
379 
380 SIMDE_FUNCTION_ATTRIBUTES
381 simde__m128i
simde_mm_add_epi16 (simde__m128i a, simde__m128i b) {
383 #if defined(SIMDE_X86_SSE2_NATIVE)
384   return _mm_add_epi16(a, b);
385 #else
386   simde__m128i_private
387     r_,
388     a_ = simde__m128i_to_private(a),
389     b_ = simde__m128i_to_private(b);
390 
391   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
392     r_.neon_i16 = vaddq_s16(a_.neon_i16, b_.neon_i16);
393   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
394     r_.altivec_i16 = vec_add(a_.altivec_i16, b_.altivec_i16);
395   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
396     r_.i16 = a_.i16 + b_.i16;
397   #else
398     SIMDE_VECTORIZE
399     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
400       r_.i16[i] = a_.i16[i] + b_.i16[i];
401     }
402   #endif
403 
404   return simde__m128i_from_private(r_);
405 #endif
406 }
407 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
408 #  define _mm_add_epi16(a, b) simde_mm_add_epi16(a, b)
409 #endif
410 
411 SIMDE_FUNCTION_ATTRIBUTES
412 simde__m128i
simde_mm_add_epi32 (simde__m128i a, simde__m128i b) {
414 #if defined(SIMDE_X86_SSE2_NATIVE)
415   return _mm_add_epi32(a, b);
416 #else
417   simde__m128i_private
418     r_,
419     a_ = simde__m128i_to_private(a),
420     b_ = simde__m128i_to_private(b);
421 
422   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
423     r_.neon_i32 = vaddq_s32(a_.neon_i32, b_.neon_i32);
424   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
425     r_.altivec_i32 = vec_add(a_.altivec_i32, b_.altivec_i32);
426   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
427     r_.i32 = a_.i32 + b_.i32;
428   #else
429     SIMDE_VECTORIZE
430     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
431       r_.i32[i] = a_.i32[i] + b_.i32[i];
432     }
433   #endif
434 
435   return simde__m128i_from_private(r_);
436 #endif
437 }
438 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
439 #  define _mm_add_epi32(a, b) simde_mm_add_epi32(a, b)
440 #endif
441 
442 SIMDE_FUNCTION_ATTRIBUTES
443 simde__m128i
simde_mm_add_epi64 (simde__m128i a, simde__m128i b) {
445 #if defined(SIMDE_X86_SSE2_NATIVE)
446   return _mm_add_epi64(a, b);
447 #else
448   simde__m128i_private
449     r_,
450     a_ = simde__m128i_to_private(a),
451     b_ = simde__m128i_to_private(b);
452 
453   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
454     r_.neon_i64 = vaddq_s64(a_.neon_i64, b_.neon_i64);
455   #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
456     r_.altivec_i64 = vec_add(a_.altivec_i64, b_.altivec_i64);
457   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
458     r_.i64 = a_.i64 + b_.i64;
459   #else
460     SIMDE_VECTORIZE
461     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
462       r_.i64[i] = a_.i64[i] + b_.i64[i];
463     }
464   #endif
465 
466   return simde__m128i_from_private(r_);
467 #endif
468 }
469 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
470 #  define _mm_add_epi64(a, b) simde_mm_add_epi64(a, b)
471 #endif
472 
473 SIMDE_FUNCTION_ATTRIBUTES
474 simde__m128d
simde_mm_add_pd (simde__m128d a, simde__m128d b) {
476 #if defined(SIMDE_X86_SSE2_NATIVE)
477   return _mm_add_pd(a, b);
478 #else
479   simde__m128d_private
480     r_,
481     a_ = simde__m128d_to_private(a),
482     b_ = simde__m128d_to_private(b);
483 
484 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
485   r_.neon_f64 = vaddq_f64(a_.neon_f64, b_.neon_f64);
486 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
487   r_.wasm_v128 = wasm_f64x2_add(a_.wasm_v128, b_.wasm_v128);
488 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
489   r_.altivec_f64 = vec_add(a_.altivec_f64, b_.altivec_f64);
490 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
491   r_.f64 = a_.f64 + b_.f64;
492 #else
493   SIMDE_VECTORIZE
494   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
495     r_.f64[i] = a_.f64[i] + b_.f64[i];
496   }
497 #endif
498 
499   return simde__m128d_from_private(r_);
500 #endif
501 }
502 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
503 #  define _mm_add_pd(a, b) simde_mm_add_pd(a, b)
504 #endif
505 
506 SIMDE_FUNCTION_ATTRIBUTES
507 simde__m128d
simde_mm_move_sd (simde__m128d a, simde__m128d b) {
509 #if defined(SIMDE_X86_SSE2_NATIVE)
510   return _mm_move_sd(a, b);
511 #else
512   simde__m128d_private
513     r_,
514     a_ = simde__m128d_to_private(a),
515     b_ = simde__m128d_to_private(b);
516 
517 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
518   r_.neon_f64 = vsetq_lane_f64(vgetq_lane_f64(b_.neon_f64, 0), a_.neon_f64, 0);
519 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
520   SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) m = {
521     16, 17, 18, 19, 20, 21, 22, 23,
522      8,  9, 10, 11, 12, 13, 14, 15
523   };
524   r_.altivec_f64 = vec_perm(a_.altivec_f64, b_.altivec_f64, m);
525 #elif defined(SIMDE_SHUFFLE_VECTOR_)
526   r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 2, 1);
527 #else
528   r_.f64[0] = b_.f64[0];
529   r_.f64[1] = a_.f64[1];
530 #endif
531 
532   return simde__m128d_from_private(r_);
533 #endif
534 }
535 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
536 #  define _mm_move_sd(a, b) simde_mm_move_sd(a, b)
537 #endif
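
/* simde_mm_move_sd() returns { b[0], a[1] }: the low double comes from b and
 * the high double is carried over from a.  The *_sd arithmetic wrappers below
 * reuse it (via the SIMDE_ASSUME_VECTORIZATION path) to express "operate on
 * the low lane, keep the high lane" in terms of the full-width operation. */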
538 
539 SIMDE_FUNCTION_ATTRIBUTES
540 simde__m128d
simde_mm_add_sd (simde__m128d a, simde__m128d b) {
542 #if defined(SIMDE_X86_SSE2_NATIVE)
543   return _mm_add_sd(a, b);
544 #else
545   simde__m128d_private
546     r_,
547     a_ = simde__m128d_to_private(a),
548     b_ = simde__m128d_to_private(b);
549 
550   r_.f64[0] = a_.f64[0] + b_.f64[0];
551   r_.f64[1] = a_.f64[1];
552 
#if defined(SIMDE_ASSUME_VECTORIZATION)
  return simde_mm_move_sd(a, simde_mm_add_pd(a, b));
#endif
559 
560   return simde__m128d_from_private(r_);
561 #endif
562 }
563 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
564 #  define _mm_add_sd(a, b) simde_mm_add_sd(a, b)
565 #endif
566 
567 SIMDE_FUNCTION_ATTRIBUTES
568 simde__m64
simde_mm_add_si64 (simde__m64 a, simde__m64 b) {
570 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
571   return _mm_add_si64(a, b);
572 #else
573   simde__m64_private
574     r_,
575     a_ = simde__m64_to_private(a),
576     b_ = simde__m64_to_private(b);
577 
578 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
579   r_.neon_i64 = vadd_s64(a_.neon_i64, b_.neon_i64);
580 #else
581   r_.i64[0] = a_.i64[0] + b_.i64[0];
582 #endif
583 
584   return simde__m64_from_private(r_);
585 #endif
586 }
587 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
588 #  define _mm_add_si64(a, b) simde_mm_add_si64(a, b)
589 #endif
590 
591 SIMDE_FUNCTION_ATTRIBUTES
592 simde__m128i
simde_mm_adds_epi8 (simde__m128i a, simde__m128i b) {
594 #if defined(SIMDE_X86_SSE2_NATIVE)
595   return _mm_adds_epi8(a, b);
596 #else
597   simde__m128i_private
598     r_,
599     a_ = simde__m128i_to_private(a),
600     b_ = simde__m128i_to_private(b);
601 
602 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
603   r_.neon_i8 = vqaddq_s8(a_.neon_i8, b_.neon_i8);
604 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
605   r_.altivec_i8 = vec_adds(a_.altivec_i8, b_.altivec_i8);
606 #else
607   SIMDE_VECTORIZE
608   for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
609     const int32_t tmp =
610       HEDLEY_STATIC_CAST(int16_t, a_.i8[i]) +
611       HEDLEY_STATIC_CAST(int16_t, b_.i8[i]);
612     r_.i8[i] = HEDLEY_STATIC_CAST(int8_t, ((tmp < INT8_MAX) ? ((tmp > INT8_MIN) ? tmp : INT8_MIN) : INT8_MAX));
613   }
614 #endif
615 
616   return simde__m128i_from_private(r_);
617 #endif
618 }
619 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
620 #  define _mm_adds_epi8(a, b) simde_mm_adds_epi8(a, b)
621 #endif
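
/* The adds_* family performs saturating addition.  The portable loops widen
 * each pair of lanes to a larger integer type, add, and clamp to the lane's
 * range, so simde_mm_adds_epi8() on lanes holding 100 and 100 produces 127
 * (INT8_MAX) instead of wrapping around to -56. */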
622 
623 SIMDE_FUNCTION_ATTRIBUTES
624 simde__m128i
simde_mm_adds_epi16 (simde__m128i a, simde__m128i b) {
626 #if defined(SIMDE_X86_SSE2_NATIVE)
627   return _mm_adds_epi16(a, b);
628 #else
629   simde__m128i_private
630     r_,
631     a_ = simde__m128i_to_private(a),
632     b_ = simde__m128i_to_private(b);
633 
634 
635   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
636     r_.neon_i16 = vqaddq_s16(a_.neon_i16, b_.neon_i16);
637   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
638     r_.altivec_i16 = vec_adds(a_.altivec_i16, b_.altivec_i16);
639   #else
640     SIMDE_VECTORIZE
641     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
642       const int32_t tmp =
643         HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) +
644         HEDLEY_STATIC_CAST(int32_t, b_.i16[i]);
645       r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, ((tmp < INT16_MAX) ? ((tmp > INT16_MIN) ? tmp : INT16_MIN) : INT16_MAX));
646     }
647   #endif
648 
649   return simde__m128i_from_private(r_);
650 #endif
651 }
652 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
653 #  define _mm_adds_epi16(a, b) simde_mm_adds_epi16(a, b)
654 #endif
655 
656 SIMDE_FUNCTION_ATTRIBUTES
657 simde__m128i
simde_mm_adds_epu8 (simde__m128i a, simde__m128i b) {
659 #if defined(SIMDE_X86_SSE2_NATIVE)
660   return _mm_adds_epu8(a, b);
661 #else
662   simde__m128i_private
663     r_,
664     a_ = simde__m128i_to_private(a),
665     b_ = simde__m128i_to_private(b);
666 
667   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
668     r_.neon_u8 = vqaddq_u8(a_.neon_u8, b_.neon_u8);
669   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
670     r_.altivec_u8 = vec_adds(a_.altivec_u8, b_.altivec_u8);
671   #else
672     SIMDE_VECTORIZE
673     for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
674       r_.u8[i] = ((UINT8_MAX - a_.u8[i]) > b_.u8[i]) ? (a_.u8[i] + b_.u8[i]) : UINT8_MAX;
675     }
676   #endif
677 
678   return simde__m128i_from_private(r_);
679 #endif
680 }
681 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
682 #  define _mm_adds_epu8(a, b) simde_mm_adds_epu8(a, b)
683 #endif
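
/* For the unsigned variants the fallback avoids widening: the lane is set to
 * the plain sum when (UINTn_MAX - a) > b guarantees it fits, and saturates to
 * UINTn_MAX otherwise (the two cases coincide when the sum is exactly
 * UINTn_MAX). */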
684 
685 SIMDE_FUNCTION_ATTRIBUTES
686 simde__m128i
simde_mm_adds_epu16 (simde__m128i a, simde__m128i b) {
688 #if defined(SIMDE_X86_SSE2_NATIVE)
689   return _mm_adds_epu16(a, b);
690 #else
691   simde__m128i_private
692     r_,
693     a_ = simde__m128i_to_private(a),
694     b_ = simde__m128i_to_private(b);
695 
696   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
697     r_.neon_u16 = vqaddq_u16(a_.neon_u16, b_.neon_u16);
698   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
699     r_.altivec_u16 = vec_adds(a_.altivec_u16, b_.altivec_u16);
700   #else
701     SIMDE_VECTORIZE
702     for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
703       r_.u16[i] = ((UINT16_MAX - a_.u16[i]) > b_.u16[i]) ? (a_.u16[i] + b_.u16[i]) : UINT16_MAX;
704     }
705   #endif
706 
707   return simde__m128i_from_private(r_);
708 #endif
709 }
710 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
711 #  define _mm_adds_epu16(a, b) simde_mm_adds_epu16(a, b)
712 #endif
713 
714 SIMDE_FUNCTION_ATTRIBUTES
715 simde__m128d
simde_mm_and_pd (simde__m128d a, simde__m128d b) {
717 #if defined(SIMDE_X86_SSE2_NATIVE)
718   return _mm_and_pd(a, b);
719 #else
720   simde__m128d_private
721     r_,
722     a_ = simde__m128d_to_private(a),
723     b_ = simde__m128d_to_private(b);
724 
725 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
726   r_.neon_i32 = vandq_s32(a_.neon_i32, b_.neon_i32);
727 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
728   r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128);
729 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
730   r_.altivec_f64 = vec_and(a_.altivec_f64, b_.altivec_f64);
731 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
732   r_.i32f = a_.i32f & b_.i32f;
733 #else
734   SIMDE_VECTORIZE
735   for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
736     r_.i32f[i] = a_.i32f[i] & b_.i32f[i];
737   }
738 #endif
739 
740   return simde__m128d_from_private(r_);
741 #endif
742 }
743 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
744 #  define _mm_and_pd(a, b) simde_mm_and_pd(a, b)
745 #endif
746 
747 SIMDE_FUNCTION_ATTRIBUTES
748 simde__m128i
simde_mm_and_si128 (simde__m128i a, simde__m128i b) {
750 #if defined(SIMDE_X86_SSE2_NATIVE)
751   return _mm_and_si128(a, b);
752 #else
753   simde__m128i_private
754     r_,
755     a_ = simde__m128i_to_private(a),
756     b_ = simde__m128i_to_private(b);
757 
758   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
759     r_.neon_i32 = vandq_s32(b_.neon_i32, a_.neon_i32);
760   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
761     r_.altivec_u32f = vec_and(a_.altivec_u32f, b_.altivec_u32f);
762   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
763     r_.i32f = a_.i32f & b_.i32f;
764   #else
765     SIMDE_VECTORIZE
766     for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
767       r_.i32f[i] = a_.i32f[i] & b_.i32f[i];
768     }
769   #endif
770 
771   return simde__m128i_from_private(r_);
772 #endif
773 }
774 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
775 #  define _mm_and_si128(a, b) simde_mm_and_si128(a, b)
776 #endif
777 
778 SIMDE_FUNCTION_ATTRIBUTES
779 simde__m128d
simde_mm_andnot_pd (simde__m128d a, simde__m128d b) {
781 #if defined(SIMDE_X86_SSE2_NATIVE)
782   return _mm_andnot_pd(a, b);
783 #else
784   simde__m128d_private
785     r_,
786     a_ = simde__m128d_to_private(a),
787     b_ = simde__m128d_to_private(b);
788 
789 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32);
791 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
792   r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128);
793 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  r_.altivec_i32f = vec_andc(b_.altivec_i32f, a_.altivec_i32f);
795 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
796   r_.i32f = ~a_.i32f & b_.i32f;
797 #else
798   SIMDE_VECTORIZE
799   for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
800     r_.u64[i] = ~a_.u64[i] & b_.u64[i];
801   }
802 #endif
803 
804   return simde__m128d_from_private(r_);
805 #endif
806 }
807 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
808 #  define _mm_andnot_pd(a, b) simde_mm_andnot_pd(a, b)
809 #endif
810 
811 SIMDE_FUNCTION_ATTRIBUTES
812 simde__m128i
simde_mm_andnot_si128 (simde__m128i a, simde__m128i b) {
814 #if defined(SIMDE_X86_SSE2_NATIVE)
815   return _mm_andnot_si128(a, b);
816 #else
817   simde__m128i_private
818     r_,
819     a_ = simde__m128i_to_private(a),
820     b_ = simde__m128i_to_private(b);
821 
822   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
823     r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32);
824   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
825     r_.altivec_i32 = vec_andc(b_.altivec_i32, a_.altivec_i32);
826   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
827     r_.i32f = ~a_.i32f & b_.i32f;
828   #else
829     SIMDE_VECTORIZE
830     for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
831       r_.i32f[i] = ~(a_.i32f[i]) & b_.i32f[i];
832     }
833   #endif
834 
835   return simde__m128i_from_private(r_);
836 #endif
837 }
838 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
839 #  define _mm_andnot_si128(a, b) simde_mm_andnot_si128(a, b)
840 #endif
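
/* Note the operand order of the andnot functions: like the Intel intrinsics
 * they compute (~a) & b, i.e. the first argument is the one that gets
 * complemented.  The NEON and AltiVec primitives (vbicq_s32, vec_andc) take
 * the kept operand first, which is why their arguments appear swapped. */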
841 
842 SIMDE_FUNCTION_ATTRIBUTES
843 simde__m128i
simde_mm_avg_epu8 (simde__m128i a, simde__m128i b) {
845 #if defined(SIMDE_X86_SSE2_NATIVE)
846   return _mm_avg_epu8(a, b);
847 #else
848   simde__m128i_private
849     r_,
850     a_ = simde__m128i_to_private(a),
851     b_ = simde__m128i_to_private(b);
852 
853 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
854   r_.neon_u8 = vrhaddq_u8(b_.neon_u8, a_.neon_u8);
855 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
856   r_.altivec_u8 = vec_avg(a_.altivec_u8, b_.altivec_u8);
857 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_)
858   uint16_t wa SIMDE_VECTOR(32);
859   uint16_t wb SIMDE_VECTOR(32);
860   uint16_t wr SIMDE_VECTOR(32);
861   SIMDE_CONVERT_VECTOR_(wa, a_.u8);
862   SIMDE_CONVERT_VECTOR_(wb, b_.u8);
863   wr = (wa + wb + 1) >> 1;
864   SIMDE_CONVERT_VECTOR_(r_.u8, wr);
865 #else
866   SIMDE_VECTORIZE
867   for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
868     r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1;
869   }
870 #endif
871 
872   return simde__m128i_from_private(r_);
873 #endif
874 }
875 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
876 #  define _mm_avg_epu8(a, b) simde_mm_avg_epu8(a, b)
877 #endif
878 
879 SIMDE_FUNCTION_ATTRIBUTES
880 simde__m128i
simde_mm_avg_epu16 (simde__m128i a, simde__m128i b) {
882 #if defined(SIMDE_X86_SSE2_NATIVE)
883   return _mm_avg_epu16(a, b);
884 #else
885   simde__m128i_private
886     r_,
887     a_ = simde__m128i_to_private(a),
888     b_ = simde__m128i_to_private(b);
889 
890 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
891   r_.neon_u16 = vrhaddq_u16(b_.neon_u16, a_.neon_u16);
892 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
893   r_.altivec_u16 = vec_avg(a_.altivec_u16, b_.altivec_u16);
894 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_)
895   uint32_t wa SIMDE_VECTOR(32);
896   uint32_t wb SIMDE_VECTOR(32);
897   uint32_t wr SIMDE_VECTOR(32);
898   SIMDE_CONVERT_VECTOR_(wa, a_.u16);
899   SIMDE_CONVERT_VECTOR_(wb, b_.u16);
900   wr = (wa + wb + 1) >> 1;
901   SIMDE_CONVERT_VECTOR_(r_.u16, wr);
902 #else
903   SIMDE_VECTORIZE
904   for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
905     r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1;
906   }
907 #endif
908 
909   return simde__m128i_from_private(r_);
910 #endif
911 }
912 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
913 #  define _mm_avg_epu16(a, b) simde_mm_avg_epu16(a, b)
914 #endif
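
/* The avg_* functions compute the rounding average (a + b + 1) >> 1, matching
 * PAVGB/PAVGW.  The portable path widens both inputs to the next lane size
 * (via SIMDE_CONVERT_VECTOR_) so that the +1 cannot overflow before the
 * shift. */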
915 
916 SIMDE_FUNCTION_ATTRIBUTES
917 simde__m128i
simde_mm_setzero_si128 (void) {
919   #if defined(SIMDE_X86_SSE2_NATIVE)
920     return _mm_setzero_si128();
921   #else
922     simde__m128i_private r_;
923 
924     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
925       r_.neon_i32 = vdupq_n_s32(0);
926     #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
927       r_.altivec_i32 = vec_splats(HEDLEY_STATIC_CAST(signed int, 0));
928     #elif defined(SIMDE_VECTOR_SUBSCRIPT)
929       r_.i32 = __extension__ (__typeof__(r_.i32)) { 0, 0, 0, 0 };
930     #else
931       SIMDE_VECTORIZE
932       for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
933         r_.i32f[i] = 0;
934       }
935     #endif
936 
937     return simde__m128i_from_private(r_);
938   #endif
939 }
940 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
941 #  define _mm_setzero_si128() (simde_mm_setzero_si128())
942 #endif
943 
944 SIMDE_FUNCTION_ATTRIBUTES
945 simde__m128i
simde_mm_bslli_si128 (simde__m128i a, const int imm8)
947     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
948   simde__m128i_private
949     r_,
950     a_ = simde__m128i_to_private(a);
951 
952   if (HEDLEY_UNLIKELY((imm8 & ~15))) {
953     return simde_mm_setzero_si128();
954   }
955 
956   #if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) && defined(SIMDE_ENDIAN_ORDER)
957     r_.altivec_i8 =
958     #if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
959       vec_slo
960     #else /* SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG */
961       vec_sro
962     #endif
963         (a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, imm8 * 8)));
964   #elif defined(SIMDE_HAVE_INT128_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) && 0
    r_.u128[0] = a_.u128[0] << (imm8 * 8);
966   #else
967     r_ = simde__m128i_to_private(simde_mm_setzero_si128());
968     for (int i = imm8 ; i < HEDLEY_STATIC_CAST(int, sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
969       r_.i8[i] = a_.i8[i - imm8];
970     }
971   #endif
972 
973   return simde__m128i_from_private(r_);
974 }
975 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
976 #  define simde_mm_bslli_si128(a, imm8) _mm_slli_si128(a, imm8)
977 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__)
978 #  define simde_mm_bslli_si128(a, imm8) \
979   simde__m128i_from_neon_i8(((imm8) <= 0) ? simde__m128i_to_neon_i8(a) : (((imm8) > 15) ? (vdupq_n_s8(0)) : (vextq_s8(vdupq_n_s8(0), simde__m128i_to_neon_i8(a), 16 - (imm8)))))
980 #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
981   #define simde_mm_bslli_si128(a, imm8) (__extension__ ({ \
982     const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
983     const simde__m128i_private simde__tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
984     simde__m128i_private simde__tmp_r_; \
985     if (HEDLEY_UNLIKELY(imm8 > 15)) { \
986       simde__tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
987     } else { \
988       simde__tmp_r_.i8 = \
989         SIMDE_SHUFFLE_VECTOR_(8, 16, \
990           simde__tmp_z_.i8, \
991           (simde__tmp_a_).i8, \
992           HEDLEY_STATIC_CAST(int8_t, (16 - imm8) & 31), \
993           HEDLEY_STATIC_CAST(int8_t, (17 - imm8) & 31), \
994           HEDLEY_STATIC_CAST(int8_t, (18 - imm8) & 31), \
995           HEDLEY_STATIC_CAST(int8_t, (19 - imm8) & 31), \
996           HEDLEY_STATIC_CAST(int8_t, (20 - imm8) & 31), \
997           HEDLEY_STATIC_CAST(int8_t, (21 - imm8) & 31), \
998           HEDLEY_STATIC_CAST(int8_t, (22 - imm8) & 31), \
999           HEDLEY_STATIC_CAST(int8_t, (23 - imm8) & 31), \
1000           HEDLEY_STATIC_CAST(int8_t, (24 - imm8) & 31), \
1001           HEDLEY_STATIC_CAST(int8_t, (25 - imm8) & 31), \
1002           HEDLEY_STATIC_CAST(int8_t, (26 - imm8) & 31), \
1003           HEDLEY_STATIC_CAST(int8_t, (27 - imm8) & 31), \
1004           HEDLEY_STATIC_CAST(int8_t, (28 - imm8) & 31), \
1005           HEDLEY_STATIC_CAST(int8_t, (29 - imm8) & 31), \
1006           HEDLEY_STATIC_CAST(int8_t, (30 - imm8) & 31), \
1007           HEDLEY_STATIC_CAST(int8_t, (31 - imm8) & 31)); \
1008     } \
1009     simde__m128i_from_private(simde__tmp_r_); }))
1010 #endif
1011 #define simde_mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
1012 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1013 #  define _mm_bslli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
1014 #  define _mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
1015 #endif
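
/* bslli/bsrli shift the whole 128-bit value by imm8 *bytes* (the slli/srli
 * _si128 names are aliases for the same operation); any imm8 outside 0..15
 * yields zero.  For example, simde_mm_bslli_si128(x, 4) moves every byte of x
 * up by four positions and fills the low four bytes with zero.  The macro
 * forms above require imm8 to be a compile-time constant, mirroring the
 * Intel intrinsics. */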
1016 
1017 SIMDE_FUNCTION_ATTRIBUTES
1018 simde__m128i
simde_mm_bsrli_si128 (simde__m128i a, const int imm8)
1020     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
1021   simde__m128i_private
1022     r_,
1023     a_ = simde__m128i_to_private(a);
1024 
1025   if (HEDLEY_UNLIKELY((imm8 & ~15))) {
1026     return simde_mm_setzero_si128();
1027   }
1028 
1029   #if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) && defined(SIMDE_ENDIAN_ORDER)
1030     r_.altivec_i8 =
1031     #if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
1032       vec_sro
1033     #else /* SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG */
1034       vec_slo
1035     #endif
1036         (a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, imm8 * 8)));
1037   #else
1038     SIMDE_VECTORIZE
1039     for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1040       const int e = HEDLEY_STATIC_CAST(int, i) + imm8;
1041       r_.i8[i] = (e < 16) ? a_.i8[e] : 0;
1042     }
1043   #endif
1044 
1045   return simde__m128i_from_private(r_);
1046 }
1047 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
1048 #  define simde_mm_bsrli_si128(a, imm8) _mm_srli_si128(a, imm8)
1049 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__)
1050 #  define simde_mm_bsrli_si128(a, imm8) \
1051   simde__m128i_from_neon_i8(((imm8 < 0) || (imm8 > 15)) ? vdupq_n_s8(0) : (vextq_s8(simde__m128i_to_private(a).neon_i8, vdupq_n_s8(0), ((imm8 & 15) != 0) ? imm8 : (imm8 & 15))))
1052 #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1053   #define simde_mm_bsrli_si128(a, imm8) (__extension__ ({ \
1054     const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
1055     const simde__m128i_private simde__tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
1056     simde__m128i_private simde__tmp_r_ = simde__m128i_to_private(a); \
1057     if (HEDLEY_UNLIKELY(imm8 > 15)) { \
1058       simde__tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
1059     } else { \
1060       simde__tmp_r_.i8 = \
1061       SIMDE_SHUFFLE_VECTOR_(8, 16, \
1062         simde__tmp_z_.i8, \
1063         (simde__tmp_a_).i8, \
1064         HEDLEY_STATIC_CAST(int8_t, (imm8 + 16) & 31), \
1065         HEDLEY_STATIC_CAST(int8_t, (imm8 + 17) & 31), \
1066         HEDLEY_STATIC_CAST(int8_t, (imm8 + 18) & 31), \
1067         HEDLEY_STATIC_CAST(int8_t, (imm8 + 19) & 31), \
1068         HEDLEY_STATIC_CAST(int8_t, (imm8 + 20) & 31), \
1069         HEDLEY_STATIC_CAST(int8_t, (imm8 + 21) & 31), \
1070         HEDLEY_STATIC_CAST(int8_t, (imm8 + 22) & 31), \
1071         HEDLEY_STATIC_CAST(int8_t, (imm8 + 23) & 31), \
1072         HEDLEY_STATIC_CAST(int8_t, (imm8 + 24) & 31), \
1073         HEDLEY_STATIC_CAST(int8_t, (imm8 + 25) & 31), \
1074         HEDLEY_STATIC_CAST(int8_t, (imm8 + 26) & 31), \
1075         HEDLEY_STATIC_CAST(int8_t, (imm8 + 27) & 31), \
1076         HEDLEY_STATIC_CAST(int8_t, (imm8 + 28) & 31), \
1077         HEDLEY_STATIC_CAST(int8_t, (imm8 + 29) & 31), \
1078         HEDLEY_STATIC_CAST(int8_t, (imm8 + 30) & 31), \
1079         HEDLEY_STATIC_CAST(int8_t, (imm8 + 31) & 31)); \
1080     } \
1081     simde__m128i_from_private(simde__tmp_r_); }))
1082 #endif
1083 #define simde_mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
1084 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1085 #  define _mm_bsrli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
1086 #  define _mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
1087 #endif
1088 
1089 SIMDE_FUNCTION_ATTRIBUTES
1090 void
simde_mm_clflush (void const* p) {
1092 #if defined(SIMDE_X86_SSE2_NATIVE)
1093   _mm_clflush(p);
1094 #else
1095   (void) p;
1096 #endif
1097 }
1098 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_clflush(p) simde_mm_clflush(p)
1100 #endif
1101 
1102 SIMDE_FUNCTION_ATTRIBUTES
1103 int
simde_mm_comieq_sd (simde__m128d a, simde__m128d b) {
1105 #if defined(SIMDE_X86_SSE2_NATIVE)
1106   return _mm_comieq_sd(a, b);
1107 #else
1108   simde__m128d_private
1109     a_ = simde__m128d_to_private(a),
1110     b_ = simde__m128d_to_private(b);
1111 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1112   return !!vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0);
1113 #else
1114   return a_.f64[0] == b_.f64[0];
1115 #endif
1116 #endif
1117 }
1118 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1119 #  define _mm_comieq_sd(a, b) simde_mm_comieq_sd(a, b)
1120 #endif
1121 
1122 SIMDE_FUNCTION_ATTRIBUTES
1123 int
simde_mm_comige_sd (simde__m128d a, simde__m128d b) {
1125 #if defined(SIMDE_X86_SSE2_NATIVE)
1126   return _mm_comige_sd(a, b);
1127 #else
1128   simde__m128d_private
1129     a_ = simde__m128d_to_private(a),
1130     b_ = simde__m128d_to_private(b);
1131 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1132   return !!vgetq_lane_u64(vcgeq_f64(a_.neon_f64, b_.neon_f64), 0);
1133 #else
1134   return a_.f64[0] >= b_.f64[0];
1135 #endif
1136 #endif
1137 }
1138 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1139 #  define _mm_comige_sd(a, b) simde_mm_comige_sd(a, b)
1140 #endif
1141 
1142 SIMDE_FUNCTION_ATTRIBUTES
1143 int
simde_mm_comigt_sd (simde__m128d a, simde__m128d b) {
1145 #if defined(SIMDE_X86_SSE2_NATIVE)
1146   return _mm_comigt_sd(a, b);
1147 #else
1148   simde__m128d_private
1149     a_ = simde__m128d_to_private(a),
1150     b_ = simde__m128d_to_private(b);
1151 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1152   return !!vgetq_lane_u64(vcgtq_f64(a_.neon_f64, b_.neon_f64), 0);
1153 #else
1154   return a_.f64[0] > b_.f64[0];
1155 #endif
1156 #endif
1157 }
1158 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1159 #  define _mm_comigt_sd(a, b) simde_mm_comigt_sd(a, b)
1160 #endif
1161 
1162 SIMDE_FUNCTION_ATTRIBUTES
1163 int
simde_mm_comile_sd (simde__m128d a, simde__m128d b) {
1165 #if defined(SIMDE_X86_SSE2_NATIVE)
1166   return _mm_comile_sd(a, b);
1167 #else
1168   simde__m128d_private
1169     a_ = simde__m128d_to_private(a),
1170     b_ = simde__m128d_to_private(b);
1171 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1172   return !!vgetq_lane_u64(vcleq_f64(a_.neon_f64, b_.neon_f64), 0);
1173 #else
1174   return a_.f64[0] <= b_.f64[0];
1175 #endif
1176 #endif
1177 }
1178 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1179 #  define _mm_comile_sd(a, b) simde_mm_comile_sd(a, b)
1180 #endif
1181 
1182 SIMDE_FUNCTION_ATTRIBUTES
1183 int
simde_mm_comilt_sd (simde__m128d a, simde__m128d b) {
1185 #if defined(SIMDE_X86_SSE2_NATIVE)
1186   return _mm_comilt_sd(a, b);
1187 #else
1188   simde__m128d_private
1189     a_ = simde__m128d_to_private(a),
1190     b_ = simde__m128d_to_private(b);
1191 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1192   return !!vgetq_lane_u64(vcltq_f64(a_.neon_f64, b_.neon_f64), 0);
1193 #else
1194   return a_.f64[0] < b_.f64[0];
1195 #endif
1196 #endif
1197 }
1198 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1199 #  define _mm_comilt_sd(a, b) simde_mm_comilt_sd(a, b)
1200 #endif
1201 
1202 SIMDE_FUNCTION_ATTRIBUTES
1203 int
simde_mm_comineq_sd (simde__m128d a, simde__m128d b) {
1205 #if defined(SIMDE_X86_SSE2_NATIVE)
1206   return _mm_comineq_sd(a, b);
1207 #else
1208   simde__m128d_private
1209     a_ = simde__m128d_to_private(a),
1210     b_ = simde__m128d_to_private(b);
1211 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1212   return !vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0);
1213 #else
1214   return a_.f64[0] != b_.f64[0];
1215 #endif
1216 #endif
1217 }
1218 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1219 #  define _mm_comineq_sd(a, b) simde_mm_comineq_sd(a, b)
1220 #endif
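
/* Unlike the cmp*_sd functions, the comi*_sd family compares only the low
 * double of each argument and returns a plain int (0 or 1) rather than a
 * lane mask; the upper doubles are ignored entirely. */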
1221 
1222 SIMDE_FUNCTION_ATTRIBUTES
1223 simde__m128d
simde_x_mm_copysign_pd(simde__m128d dest, simde__m128d src) {
1225   simde__m128d_private
1226     r_,
1227     dest_ = simde__m128d_to_private(dest),
1228     src_ = simde__m128d_to_private(src);
1229 
1230   #if defined(simde_math_copysign)
1231     SIMDE_VECTORIZE
1232     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1233       r_.f64[i] = simde_math_copysign(dest_.f64[i], src_.f64[i]);
1234     }
1235   #else
1236     simde__m128d sgnbit = simde_mm_xor_pd(simde_mm_set1_pd(SIMDE_FLOAT64_C(0.0)), simde_mm_set1_pd(-SIMDE_FLOAT64_C(0.0)));
1237     return simde_mm_xor_pd(simde_mm_and_pd(sgnbit, src), simde_mm_andnot_pd(sgnbit, dest));
1238   #endif
1239 
1240   return simde__m128d_from_private(r_);
1241 }
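
/* simde_x_mm_copysign_pd() is a SIMDe-internal helper (the x_ infix marks
 * functions that have no direct x86 counterpart): it copies the sign bit of
 * each lane of src onto the corresponding magnitude from dest, either with
 * simde_math_copysign() or, failing that, with a sign mask built from
 * simde_mm_set1_pd(-0.0) and the and/andnot/xor wrappers. */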
1242 
1243 SIMDE_FUNCTION_ATTRIBUTES
1244 simde__m128
simde_mm_castpd_ps (simde__m128d a) {
1246 #if defined(SIMDE_X86_SSE2_NATIVE)
1247   return _mm_castpd_ps(a);
1248 #else
1249   simde__m128 r;
1250   simde_memcpy(&r, &a, sizeof(a));
1251   return r;
1252 #endif
1253 }
1254 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1255 #  define _mm_castpd_ps(a) simde_mm_castpd_ps(a)
1256 #endif
1257 
1258 SIMDE_FUNCTION_ATTRIBUTES
1259 simde__m128i
simde_mm_castpd_si128 (simde__m128d a) {
1261 #if defined(SIMDE_X86_SSE2_NATIVE)
1262   return _mm_castpd_si128(a);
1263 #else
1264   simde__m128i r;
1265   simde_memcpy(&r, &a, sizeof(a));
1266   return r;
1267 #endif
1268 }
1269 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1270 #  define _mm_castpd_si128(a) simde_mm_castpd_si128(a)
1271 #endif
1272 
1273 SIMDE_FUNCTION_ATTRIBUTES
1274 simde__m128d
simde_mm_castps_pd (simde__m128 a) {
1276 #if defined(SIMDE_X86_SSE2_NATIVE)
1277   return _mm_castps_pd(a);
1278 #else
1279   simde__m128d r;
1280   simde_memcpy(&r, &a, sizeof(a));
1281   return r;
1282 #endif
1283 }
1284 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1285 #  define _mm_castps_pd(a) simde_mm_castps_pd(a)
1286 #endif
1287 
1288 SIMDE_FUNCTION_ATTRIBUTES
1289 simde__m128i
simde_mm_castps_si128 (simde__m128 a) {
1291 #if defined(SIMDE_X86_SSE2_NATIVE)
1292   return _mm_castps_si128(a);
1293 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1294   return simde__m128i_from_neon_i32(simde__m128_to_private(a).neon_i32);
1295 #else
1296   simde__m128i r;
1297   simde_memcpy(&r, &a, sizeof(a));
1298   return r;
1299 #endif
1300 }
1301 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1302 #  define _mm_castps_si128(a) simde_mm_castps_si128(a)
1303 #endif
1304 
1305 SIMDE_FUNCTION_ATTRIBUTES
1306 simde__m128d
simde_mm_castsi128_pd (simde__m128i a) {
1308 #if defined(SIMDE_X86_SSE2_NATIVE)
1309   return _mm_castsi128_pd(a);
1310 #else
1311   simde__m128d r;
1312   simde_memcpy(&r, &a, sizeof(a));
1313   return r;
1314 #endif
1315 }
1316 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1317 #  define _mm_castsi128_pd(a) simde_mm_castsi128_pd(a)
1318 #endif
1319 
1320 SIMDE_FUNCTION_ATTRIBUTES
1321 simde__m128
simde_mm_castsi128_ps (simde__m128i a) {
1323 #if defined(SIMDE_X86_SSE2_NATIVE)
1324   return _mm_castsi128_ps(a);
1325 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1326   return HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), a);
1327 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1328   return simde__m128_from_neon_i32(simde__m128i_to_private(a).neon_i32);
1329 #else
1330   simde__m128 r;
1331   simde_memcpy(&r, &a, sizeof(a));
1332   return r;
1333 #endif
1334 }
1335 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1336 #  define _mm_castsi128_ps(a) simde_mm_castsi128_ps(a)
1337 #endif
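
/* The cast*_* functions are pure bit reinterpretations: no lanes are
 * converted, and on native targets they compile away completely.  The
 * portable fallback again relies on simde_memcpy, which optimizers can
 * eliminate. */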
1338 
1339 SIMDE_FUNCTION_ATTRIBUTES
1340 simde__m128i
simde_mm_cmpeq_epi8 (simde__m128i a, simde__m128i b) {
1342 #if defined(SIMDE_X86_SSE2_NATIVE)
1343   return _mm_cmpeq_epi8(a, b);
1344 #else
1345   simde__m128i_private
1346     r_,
1347     a_ = simde__m128i_to_private(a),
1348     b_ = simde__m128i_to_private(b);
1349 
1350   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1351     r_.neon_u8 = vceqq_s8(b_.neon_i8, a_.neon_i8);
1352   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1353     r_.wasm_v128 = wasm_i8x16_eq(a_.wasm_v128, b_.wasm_v128);
1354   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1355     r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmpeq(a_.altivec_i8, b_.altivec_i8));
1356   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1357     r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 == b_.i8));
1358   #else
1359     SIMDE_VECTORIZE
1360     for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1361       r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
1362     }
1363   #endif
1364 
1365   return simde__m128i_from_private(r_);
1366 #endif
1367 }
1368 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1369 #  define _mm_cmpeq_epi8(a, b) simde_mm_cmpeq_epi8(a, b)
1370 #endif
1371 
1372 SIMDE_FUNCTION_ATTRIBUTES
1373 simde__m128i
simde_mm_cmpeq_epi16 (simde__m128i a, simde__m128i b) {
1375 #if defined(SIMDE_X86_SSE2_NATIVE)
1376   return _mm_cmpeq_epi16(a, b);
1377 #else
1378   simde__m128i_private
1379     r_,
1380     a_ = simde__m128i_to_private(a),
1381     b_ = simde__m128i_to_private(b);
1382 
1383   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1384     r_.neon_u16 = vceqq_s16(b_.neon_i16, a_.neon_i16);
1385   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1386     r_.wasm_v128 = wasm_i16x8_eq(a_.wasm_v128, b_.wasm_v128);
1387   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1388     r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmpeq(a_.altivec_i16, b_.altivec_i16));
1389   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1390     r_.i16 = (a_.i16 == b_.i16);
1391   #else
1392     SIMDE_VECTORIZE
1393     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1394       r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
1395     }
1396   #endif
1397 
1398   return simde__m128i_from_private(r_);
1399 #endif
1400 }
1401 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1402 #  define _mm_cmpeq_epi16(a, b) simde_mm_cmpeq_epi16(a, b)
1403 #endif
1404 
1405 SIMDE_FUNCTION_ATTRIBUTES
1406 simde__m128i
simde_mm_cmpeq_epi32 (simde__m128i a, simde__m128i b) {
1408 #if defined(SIMDE_X86_SSE2_NATIVE)
1409   return _mm_cmpeq_epi32(a, b);
1410 #else
1411   simde__m128i_private
1412     r_,
1413     a_ = simde__m128i_to_private(a),
1414     b_ = simde__m128i_to_private(b);
1415 
1416   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1417     r_.neon_u32 = vceqq_s32(b_.neon_i32, a_.neon_i32);
1418   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1419     r_.wasm_v128 = wasm_i32x4_eq(a_.wasm_v128, b_.wasm_v128);
1420   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1421     r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmpeq(a_.altivec_i32, b_.altivec_i32));
1422   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1423     r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), a_.i32 == b_.i32);
1424   #else
1425     SIMDE_VECTORIZE
1426     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1427       r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
1428     }
1429   #endif
1430 
1431   return simde__m128i_from_private(r_);
1432 #endif
1433 }
1434 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1435 #  define _mm_cmpeq_epi32(a, b) simde_mm_cmpeq_epi32(a, b)
1436 #endif
1437 
1438 SIMDE_FUNCTION_ATTRIBUTES
1439 simde__m128d
1440 simde_mm_cmpeq_pd (simde__m128d a, simde__m128d b) {
1441 #if defined(SIMDE_X86_SSE2_NATIVE)
1442   return _mm_cmpeq_pd(a, b);
1443 #else
1444   simde__m128d_private
1445     r_,
1446     a_ = simde__m128d_to_private(a),
1447     b_ = simde__m128d_to_private(b);
1448 
1449   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1450     r_.neon_u64 = vceqq_f64(b_.neon_f64, a_.neon_f64);
1451   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1452     r_.wasm_v128 = wasm_f64x2_eq(a_.wasm_v128, b_.wasm_v128);
1453   #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
1454     r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpeq(a_.altivec_f64, b_.altivec_f64));
1455   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1456     r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 == b_.f64));
1457   #else
1458     SIMDE_VECTORIZE
1459     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1460       r_.u64[i] = (a_.f64[i] == b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1461     }
1462   #endif
1463 
1464   return simde__m128d_from_private(r_);
1465 #endif
1466 }
1467 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1468 #  define _mm_cmpeq_pd(a, b) simde_mm_cmpeq_pd(a, b)
1469 #endif
1470 
1471 SIMDE_FUNCTION_ATTRIBUTES
1472 simde__m128d
1473 simde_mm_cmpeq_sd (simde__m128d a, simde__m128d b) {
1474 #if defined(SIMDE_X86_SSE2_NATIVE)
1475   return _mm_cmpeq_sd(a, b);
1476 #elif defined(SIMDE_ASSUME_VECTORIZATION)
1477   return simde_mm_move_sd(a, simde_mm_cmpeq_pd(a, b));
1478 #else
1479   simde__m128d_private
1480     r_,
1481     a_ = simde__m128d_to_private(a),
1482     b_ = simde__m128d_to_private(b);
1483 
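  /* Like the other *_sd operations, only the low lane is compared; the upper lane is copied from a unchanged. */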
1484   r_.u64[0] = (a_.f64[0] == b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1485   r_.u64[1] = a_.u64[1];
1486 
1487   return simde__m128d_from_private(r_);
1488 #endif
1489 }
1490 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1491 #  define _mm_cmpeq_sd(a, b) simde_mm_cmpeq_sd(a, b)
1492 #endif
1493 
1494 SIMDE_FUNCTION_ATTRIBUTES
1495 simde__m128d
1496 simde_mm_cmpneq_pd (simde__m128d a, simde__m128d b) {
1497 #if defined(SIMDE_X86_SSE2_NATIVE)
1498   return _mm_cmpneq_pd(a, b);
1499 #else
1500   simde__m128d_private
1501     r_,
1502     a_ = simde__m128d_to_private(a),
1503     b_ = simde__m128d_to_private(b);
1504 
1505   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1506     r_.neon_u32 = vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(b_.neon_f64, a_.neon_f64)));
1507   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1508     r_.wasm_v128 = wasm_f64x2_ne(a_.wasm_v128, b_.wasm_v128);
1509   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1510     r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 != b_.f64));
1511   #else
1512     SIMDE_VECTORIZE
1513     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1514       r_.u64[i] = (a_.f64[i] != b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1515     }
1516   #endif
1517 
1518   return simde__m128d_from_private(r_);
1519 #endif
1520 }
1521 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1522 #  define _mm_cmpneq_pd(a, b) simde_mm_cmpneq_pd(a, b)
1523 #endif
1524 
1525 SIMDE_FUNCTION_ATTRIBUTES
1526 simde__m128d
1527 simde_mm_cmpneq_sd (simde__m128d a, simde__m128d b) {
1528 #if defined(SIMDE_X86_SSE2_NATIVE)
1529   return _mm_cmpneq_sd(a, b);
1530 #elif defined(SIMDE_ASSUME_VECTORIZATION)
1531   return simde_mm_move_sd(a, simde_mm_cmpneq_pd(a, b));
1532 #else
1533   simde__m128d_private
1534     r_,
1535     a_ = simde__m128d_to_private(a),
1536     b_ = simde__m128d_to_private(b);
1537 
1538   r_.u64[0] = (a_.f64[0] != b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1539   r_.u64[1] = a_.u64[1];
1540 
1542   return simde__m128d_from_private(r_);
1543 #endif
1544 }
1545 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1546 #  define _mm_cmpneq_sd(a, b) simde_mm_cmpneq_sd(a, b)
1547 #endif
1548 
1549 SIMDE_FUNCTION_ATTRIBUTES
1550 simde__m128i
1551 simde_mm_cmplt_epi8 (simde__m128i a, simde__m128i b) {
1552 #if defined(SIMDE_X86_SSE2_NATIVE)
1553   return _mm_cmplt_epi8(a, b);
1554 #else
1555   simde__m128i_private
1556     r_,
1557     a_ = simde__m128i_to_private(a),
1558     b_ = simde__m128i_to_private(b);
1559 
1560   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1561     r_.neon_u8 = vcltq_s8(a_.neon_i8, b_.neon_i8);
1562   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1563     r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char),vec_cmplt(a_.altivec_i8, b_.altivec_i8));
1564   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1565     r_.wasm_v128 = wasm_i8x16_lt(a_.wasm_v128, b_.wasm_v128);
1566   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1567     r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 < b_.i8));
1568   #else
1569     SIMDE_VECTORIZE
1570     for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1571       r_.i8[i] = (a_.i8[i] < b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
1572     }
1573   #endif
1574 
1575   return simde__m128i_from_private(r_);
1576 #endif
1577 }
1578 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1579 #  define _mm_cmplt_epi8(a, b) simde_mm_cmplt_epi8(a, b)
1580 #endif
1581 
1582 SIMDE_FUNCTION_ATTRIBUTES
1583 simde__m128i
1584 simde_mm_cmplt_epi16 (simde__m128i a, simde__m128i b) {
1585 #if defined(SIMDE_X86_SSE2_NATIVE)
1586   return _mm_cmplt_epi16(a, b);
1587 #else
1588   simde__m128i_private
1589     r_,
1590     a_ = simde__m128i_to_private(a),
1591     b_ = simde__m128i_to_private(b);
1592 
1593   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1594     r_.neon_u16 = vcltq_s16(a_.neon_i16, b_.neon_i16);
1595   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1596     r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmplt(a_.altivec_i16, b_.altivec_i16));
1597   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1598     r_.wasm_v128 = wasm_i16x8_lt(a_.wasm_v128, b_.wasm_v128);
1599   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1600     r_.i16 = HEDLEY_STATIC_CAST(__typeof__(r_.i16), (a_.i16 < b_.i16));
1601   #else
1602     SIMDE_VECTORIZE
1603     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1604       r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
1605     }
1606   #endif
1607 
1608   return simde__m128i_from_private(r_);
1609 #endif
1610 }
1611 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1612 #  define _mm_cmplt_epi16(a, b) simde_mm_cmplt_epi16(a, b)
1613 #endif
1614 
1615 SIMDE_FUNCTION_ATTRIBUTES
1616 simde__m128i
1617 simde_mm_cmplt_epi32 (simde__m128i a, simde__m128i b) {
1618 #if defined(SIMDE_X86_SSE2_NATIVE)
1619   return _mm_cmplt_epi32(a, b);
1620 #else
1621   simde__m128i_private
1622     r_,
1623     a_ = simde__m128i_to_private(a),
1624     b_ = simde__m128i_to_private(b);
1625 
1626   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1627     r_.neon_u32 = vcltq_s32(a_.neon_i32, b_.neon_i32);
1628   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1629     r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmplt(a_.altivec_i32, b_.altivec_i32));
1630   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1631     r_.wasm_v128 = wasm_i32x4_lt(a_.wasm_v128, b_.wasm_v128);
1632   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1633     r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.i32 < b_.i32));
1634   #else
1635     SIMDE_VECTORIZE
1636     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1637       r_.i32[i] = (a_.i32[i] < b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
1638     }
1639   #endif
1640 
1641   return simde__m128i_from_private(r_);
1642 #endif
1643 }
1644 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1645 #  define _mm_cmplt_epi32(a, b) simde_mm_cmplt_epi32(a, b)
1646 #endif
1647 
1648 SIMDE_FUNCTION_ATTRIBUTES
1649 simde__m128d
1650 simde_mm_cmplt_pd (simde__m128d a, simde__m128d b) {
1651 #if defined(SIMDE_X86_SSE2_NATIVE)
1652   return _mm_cmplt_pd(a, b);
1653 #else
1654   simde__m128d_private
1655     r_,
1656     a_ = simde__m128d_to_private(a),
1657     b_ = simde__m128d_to_private(b);
1658 
1659   #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1660     r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 < b_.f64));
1661   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1662     r_.wasm_v128 = wasm_f64x2_lt(a_.wasm_v128, b_.wasm_v128);
1663   #else
1664     SIMDE_VECTORIZE
1665     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1666       r_.u64[i] = (a_.f64[i] < b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1667     }
1668   #endif
1669 
1670   return simde__m128d_from_private(r_);
1671 #endif
1672 }
1673 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1674 #  define _mm_cmplt_pd(a, b) simde_mm_cmplt_pd(a, b)
1675 #endif
1676 
1677 SIMDE_FUNCTION_ATTRIBUTES
1678 simde__m128d
1679 simde_mm_cmplt_sd (simde__m128d a, simde__m128d b) {
1680 #if defined(SIMDE_X86_SSE2_NATIVE)
1681   return _mm_cmplt_sd(a, b);
1682 #elif defined(SIMDE_ASSUME_VECTORIZATION)
1683   return simde_mm_move_sd(a, simde_mm_cmplt_pd(a, b));
1684 #else
1685   simde__m128d_private
1686     r_,
1687     a_ = simde__m128d_to_private(a),
1688     b_ = simde__m128d_to_private(b);
1689 
1690   r_.u64[0] = (a_.f64[0] < b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1691   r_.u64[1] = a_.u64[1];
1692 
1693   return simde__m128d_from_private(r_);
1694 #endif
1695 }
1696 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1697 #  define _mm_cmplt_sd(a, b) simde_mm_cmplt_sd(a, b)
1698 #endif
1699 
1700 SIMDE_FUNCTION_ATTRIBUTES
1701 simde__m128d
1702 simde_mm_cmple_pd (simde__m128d a, simde__m128d b) {
1703 #if defined(SIMDE_X86_SSE2_NATIVE)
1704   return _mm_cmple_pd(a, b);
1705 #else
1706   simde__m128d_private
1707     r_,
1708     a_ = simde__m128d_to_private(a),
1709     b_ = simde__m128d_to_private(b);
1710 
1711   #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1712     r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 <= b_.f64));
1713   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1714     r_.wasm_v128 = wasm_f64x2_le(a_.wasm_v128, b_.wasm_v128);
1715   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1716     r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmple(a_.altivec_f64, b_.altivec_f64));
1717   #else
1718     SIMDE_VECTORIZE
1719     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1720       r_.u64[i] = (a_.f64[i] <= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1721     }
1722   #endif
1723 
1724   return simde__m128d_from_private(r_);
1725 #endif
1726 }
1727 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1728 #  define _mm_cmple_pd(a, b) simde_mm_cmple_pd(a, b)
1729 #endif
1730 
1731 SIMDE_FUNCTION_ATTRIBUTES
1732 simde__m128d
1733 simde_mm_cmple_sd (simde__m128d a, simde__m128d b) {
1734 #if defined(SIMDE_X86_SSE2_NATIVE)
1735   return _mm_cmple_sd(a, b);
1736 #elif defined(SIMDE_ASSUME_VECTORIZATION)
1737   return simde_mm_move_sd(a, simde_mm_cmple_pd(a, b));
1738 #else
1739   simde__m128d_private
1740     r_,
1741     a_ = simde__m128d_to_private(a),
1742     b_ = simde__m128d_to_private(b);
1743 
1744   r_.u64[0] = (a_.f64[0] <= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1745   r_.u64[1] = a_.u64[1];
1746 
1747   return simde__m128d_from_private(r_);
1748 #endif
1749 }
1750 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1751 #  define _mm_cmple_sd(a, b) simde_mm_cmple_sd(a, b)
1752 #endif
1753 
1754 SIMDE_FUNCTION_ATTRIBUTES
1755 simde__m128i
1756 simde_mm_cmpgt_epi8 (simde__m128i a, simde__m128i b) {
1757 #if defined(SIMDE_X86_SSE2_NATIVE)
1758   return _mm_cmpgt_epi8(a, b);
1759 #else
1760   simde__m128i_private
1761     r_,
1762     a_ = simde__m128i_to_private(a),
1763     b_ = simde__m128i_to_private(b);
1764 
1765   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1766     r_.neon_u8 = vcgtq_s8(a_.neon_i8, b_.neon_i8);
1767   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1768     r_.wasm_v128 = wasm_i8x16_gt(a_.wasm_v128, b_.wasm_v128);
1769   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1770     r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmpgt(a_.altivec_i8, b_.altivec_i8));
1771   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1772     r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 > b_.i8));
1773   #else
1774     SIMDE_VECTORIZE
1775     for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1776       r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
1777     }
1778   #endif
1779 
1780   return simde__m128i_from_private(r_);
1781 #endif
1782 }
1783 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1784 #  define _mm_cmpgt_epi8(a, b) simde_mm_cmpgt_epi8(a, b)
1785 #endif
1786 
1787 SIMDE_FUNCTION_ATTRIBUTES
1788 simde__m128i
1789 simde_mm_cmpgt_epi16 (simde__m128i a, simde__m128i b) {
1790 #if defined(SIMDE_X86_SSE2_NATIVE)
1791   return _mm_cmpgt_epi16(a, b);
1792 #else
1793   simde__m128i_private
1794     r_,
1795     a_ = simde__m128i_to_private(a),
1796     b_ = simde__m128i_to_private(b);
1797 
1798   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1799     r_.neon_u16 = vcgtq_s16(a_.neon_i16, b_.neon_i16);
1800   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1801     r_.wasm_v128 = wasm_i16x8_gt(a_.wasm_v128, b_.wasm_v128);
1802   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1803     r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmpgt(a_.altivec_i16, b_.altivec_i16));
1804   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1805     r_.i16 = HEDLEY_STATIC_CAST(__typeof__(r_.i16), (a_.i16 > b_.i16));
1806   #else
1807     SIMDE_VECTORIZE
1808     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1809       r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
1810     }
1811   #endif
1812 
1813   return simde__m128i_from_private(r_);
1814 #endif
1815 }
1816 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1817 #  define _mm_cmpgt_epi16(a, b) simde_mm_cmpgt_epi16(a, b)
1818 #endif
1819 
1820 SIMDE_FUNCTION_ATTRIBUTES
1821 simde__m128i
1822 simde_mm_cmpgt_epi32 (simde__m128i a, simde__m128i b) {
1823 #if defined(SIMDE_X86_SSE2_NATIVE)
1824   return _mm_cmpgt_epi32(a, b);
1825 #else
1826   simde__m128i_private
1827     r_,
1828     a_ = simde__m128i_to_private(a),
1829     b_ = simde__m128i_to_private(b);
1830 
1831   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1832     r_.neon_u32 = vcgtq_s32(a_.neon_i32, b_.neon_i32);
1833   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1834     r_.wasm_v128 = wasm_i32x4_gt(a_.wasm_v128, b_.wasm_v128);
1835   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1836     r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmpgt(a_.altivec_i32, b_.altivec_i32));
1837   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1838     r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.i32 > b_.i32));
1839   #else
1840     SIMDE_VECTORIZE
1841     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1842       r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
1843     }
1844   #endif
1845 
1846   return simde__m128i_from_private(r_);
1847 #endif
1848 }
1849 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1850 #  define _mm_cmpgt_epi32(a, b) simde_mm_cmpgt_epi32(a, b)
1851 #endif
1852 
1853 SIMDE_FUNCTION_ATTRIBUTES
1854 simde__m128d
1855 simde_mm_cmpgt_pd (simde__m128d a, simde__m128d b) {
1856 #if defined(SIMDE_X86_SSE2_NATIVE)
1857   return _mm_cmpgt_pd(a, b);
1858 #else
1859   simde__m128d_private
1860     r_,
1861     a_ = simde__m128d_to_private(a),
1862     b_ = simde__m128d_to_private(b);
1863 
1864   #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1865     r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 > b_.f64));
1866   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1867     r_.wasm_v128 = wasm_f64x2_gt(a_.wasm_v128, b_.wasm_v128);
1868   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1869     r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpgt(a_.altivec_f64, b_.altivec_f64));
1870   #else
1871     SIMDE_VECTORIZE
1872     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1873       r_.u64[i] = (a_.f64[i] > b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1874     }
1875   #endif
1876 
1877   return simde__m128d_from_private(r_);
1878 #endif
1879 }
1880 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1881 #  define _mm_cmpgt_pd(a, b) simde_mm_cmpgt_pd(a, b)
1882 #endif
1883 
1884 SIMDE_FUNCTION_ATTRIBUTES
1885 simde__m128d
1886 simde_mm_cmpgt_sd (simde__m128d a, simde__m128d b) {
1887 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
1888   return _mm_cmpgt_sd(a, b);
1889 #elif defined(SIMDE_ASSUME_VECTORIZATION)
1890   return simde_mm_move_sd(a, simde_mm_cmpgt_pd(a, b));
1891 #else
1892   simde__m128d_private
1893     r_,
1894     a_ = simde__m128d_to_private(a),
1895     b_ = simde__m128d_to_private(b);
1896 
1897   r_.u64[0] = (a_.f64[0] > b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1898   r_.u64[1] = a_.u64[1];
1899 
1900   return simde__m128d_from_private(r_);
1901 #endif
1902 }
1903 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1904 #  define _mm_cmpgt_sd(a, b) simde_mm_cmpgt_sd(a, b)
1905 #endif
1906 
1907 SIMDE_FUNCTION_ATTRIBUTES
1908 simde__m128d
1909 simde_mm_cmpge_pd (simde__m128d a, simde__m128d b) {
1910 #if defined(SIMDE_X86_SSE2_NATIVE)
1911   return _mm_cmpge_pd(a, b);
1912 #else
1913   simde__m128d_private
1914     r_,
1915     a_ = simde__m128d_to_private(a),
1916     b_ = simde__m128d_to_private(b);
1917 
1918   #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1919     r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 >= b_.f64));
1920   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1921     r_.wasm_v128 = wasm_f64x2_ge(a_.wasm_v128, b_.wasm_v128);
1922   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1923     r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpge(a_.altivec_f64, b_.altivec_f64));
1924   #else
1925     SIMDE_VECTORIZE
1926     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1927       r_.u64[i] = (a_.f64[i] >= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1928     }
1929   #endif
1930 
1931   return simde__m128d_from_private(r_);
1932 #endif
1933 }
1934 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1935 #  define _mm_cmpge_pd(a, b) simde_mm_cmpge_pd(a, b)
1936 #endif
1937 
1938 SIMDE_FUNCTION_ATTRIBUTES
1939 simde__m128d
1940 simde_mm_cmpge_sd (simde__m128d a, simde__m128d b) {
1941 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
1942   return _mm_cmpge_sd(a, b);
1943 #elif defined(SIMDE_ASSUME_VECTORIZATION)
1944   return simde_mm_move_sd(a, simde_mm_cmpge_pd(a, b));
1945 #else
1946   simde__m128d_private
1947     r_,
1948     a_ = simde__m128d_to_private(a),
1949     b_ = simde__m128d_to_private(b);
1950 
1951   r_.u64[0] = (a_.f64[0] >= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1952   r_.u64[1] = a_.u64[1];
1953 
1954   return simde__m128d_from_private(r_);
1955 #endif
1956 }
1957 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1958 #  define _mm_cmpge_sd(a, b) simde_mm_cmpge_sd(a, b)
1959 #endif
1960 
1961 SIMDE_FUNCTION_ATTRIBUTES
1962 simde__m128d
1963 simde_mm_cmpnge_pd (simde__m128d a, simde__m128d b) {
1964 #if defined(SIMDE_X86_SSE2_NATIVE)
1965   return _mm_cmpnge_pd(a, b);
1966 #else
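  /* Portable approximation: NOT(a >= b) is expressed as (a < b), which matches the native instruction except when an operand is NaN. */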
1967   return simde_mm_cmplt_pd(a, b);
1968 #endif
1969 }
1970 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1971 #  define _mm_cmpnge_pd(a, b) simde_mm_cmpnge_pd(a, b)
1972 #endif
1973 
1974 SIMDE_FUNCTION_ATTRIBUTES
1975 simde__m128d
1976 simde_mm_cmpnge_sd (simde__m128d a, simde__m128d b) {
1977 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
1978   return _mm_cmpnge_sd(a, b);
1979 #else
1980   return simde_mm_cmplt_sd(a, b);
1981 #endif
1982 }
1983 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1984 #  define _mm_cmpnge_sd(a, b) simde_mm_cmpnge_sd(a, b)
1985 #endif
1986 
1987 SIMDE_FUNCTION_ATTRIBUTES
1988 simde__m128d
1989 simde_mm_cmpnlt_pd (simde__m128d a, simde__m128d b) {
1990 #if defined(SIMDE_X86_SSE2_NATIVE)
1991   return _mm_cmpnlt_pd(a, b);
1992 #else
1993   return simde_mm_cmpge_pd(a, b);
1994 #endif
1995 }
1996 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1997 #  define _mm_cmpnlt_pd(a, b) simde_mm_cmpnlt_pd(a, b)
1998 #endif
1999 
2000 SIMDE_FUNCTION_ATTRIBUTES
2001 simde__m128d
2002 simde_mm_cmpnlt_sd (simde__m128d a, simde__m128d b) {
2003 #if defined(SIMDE_X86_SSE2_NATIVE)
2004   return _mm_cmpnlt_sd(a, b);
2005 #else
2006   return simde_mm_cmpge_sd(a, b);
2007 #endif
2008 }
2009 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2010 #  define _mm_cmpnlt_sd(a, b) simde_mm_cmpnlt_sd(a, b)
2011 #endif
2012 
2013 SIMDE_FUNCTION_ATTRIBUTES
2014 simde__m128d
2015 simde_mm_cmpnle_pd (simde__m128d a, simde__m128d b) {
2016 #if defined(SIMDE_X86_SSE2_NATIVE)
2017   return _mm_cmpnle_pd(a, b);
2018 #else
2019   return simde_mm_cmpgt_pd(a, b);
2020 #endif
2021 }
2022 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2023 #  define _mm_cmpnle_pd(a, b) simde_mm_cmpnle_pd(a, b)
2024 #endif
2025 
2026 SIMDE_FUNCTION_ATTRIBUTES
2027 simde__m128d
2028 simde_mm_cmpnle_sd (simde__m128d a, simde__m128d b) {
2029 #if defined(SIMDE_X86_SSE2_NATIVE)
2030   return _mm_cmpnle_sd(a, b);
2031 #else
2032   return simde_mm_cmpgt_sd(a, b);
2033 #endif
2034 }
2035 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2036 #  define _mm_cmpnle_sd(a, b) simde_mm_cmpnle_sd(a, b)
2037 #endif
2038 
2039 SIMDE_FUNCTION_ATTRIBUTES
2040 simde__m128d
2041 simde_mm_cmpord_pd (simde__m128d a, simde__m128d b) {
2042 #if defined(SIMDE_X86_SSE2_NATIVE)
2043   return _mm_cmpord_pd(a, b);
2044 #else
2045   simde__m128d_private
2046     r_,
2047     a_ = simde__m128d_to_private(a),
2048     b_ = simde__m128d_to_private(b);
2049 
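/* A lane is "ordered" when neither input is NaN, so the mask is all ones only when both inputs are ordinary numbers. */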
2050 #if defined(simde_math_isnan)
2051   SIMDE_VECTORIZE
2052   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2053     r_.u64[i] = (!simde_math_isnan(a_.f64[i]) && !simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0);
2054   }
2055 #else
2056   HEDLEY_UNREACHABLE();
2057 #endif
2058 
2059   return simde__m128d_from_private(r_);
2060 #endif
2061 }
2062 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2063 #  define _mm_cmpord_pd(a, b) simde_mm_cmpord_pd(a, b)
2064 #endif
2065 
2066 SIMDE_FUNCTION_ATTRIBUTES
2067 simde_float64
2068 simde_mm_cvtsd_f64 (simde__m128d a) {
2069 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
2070   return _mm_cvtsd_f64(a);
2071 #else
2072   simde__m128d_private a_ = simde__m128d_to_private(a);
2073   return a_.f64[0];
2074 #endif
2075 }
2076 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2077 #  define _mm_cvtsd_f64(a) simde_mm_cvtsd_f64(a)
2078 #endif
2079 
2080 SIMDE_FUNCTION_ATTRIBUTES
2081 simde__m128d
2082 simde_mm_cmpord_sd (simde__m128d a, simde__m128d b) {
2083 #if defined(SIMDE_X86_SSE2_NATIVE)
2084   return _mm_cmpord_sd(a, b);
2085 #elif defined(SIMDE_ASSUME_VECTORIZATION)
2086   return simde_mm_move_sd(a, simde_mm_cmpord_pd(a, b));
2087 #else
2088   simde__m128d_private
2089     r_,
2090     a_ = simde__m128d_to_private(a),
2091     b_ = simde__m128d_to_private(b);
2092 
2093 #if defined(simde_math_isnan)
2094   r_.u64[0] = (!simde_math_isnan(a_.f64[0]) && !simde_math_isnan(b_.f64[0])) ? ~UINT64_C(0) : UINT64_C(0);
2095   r_.u64[1] = a_.u64[1];
2096 #else
2097   HEDLEY_UNREACHABLE();
2098 #endif
2099 
2100   return simde__m128d_from_private(r_);
2101 #endif
2102 }
2103 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2104 #  define _mm_cmpord_sd(a, b) simde_mm_cmpord_sd(a, b)
2105 #endif
2106 
2107 SIMDE_FUNCTION_ATTRIBUTES
2108 simde__m128d
2109 simde_mm_cmpunord_pd (simde__m128d a, simde__m128d b) {
2110 #if defined(SIMDE_X86_SSE2_NATIVE)
2111   return _mm_cmpunord_pd(a, b);
2112 #else
2113   simde__m128d_private
2114     r_,
2115     a_ = simde__m128d_to_private(a),
2116     b_ = simde__m128d_to_private(b);
2117 
2118 #if defined(simde_math_isnan)
2119   SIMDE_VECTORIZE
2120   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2121     r_.u64[i] = (simde_math_isnan(a_.f64[i]) || simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0);
2122   }
2123 #else
2124   HEDLEY_UNREACHABLE();
2125 #endif
2126 
2127   return simde__m128d_from_private(r_);
2128 #endif
2129 }
2130 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2131 #  define _mm_cmpunord_pd(a, b) simde_mm_cmpunord_pd(a, b)
2132 #endif
2133 
2134 SIMDE_FUNCTION_ATTRIBUTES
2135 simde__m128d
2136 simde_mm_cmpunord_sd (simde__m128d a, simde__m128d b) {
2137 #if defined(SIMDE_X86_SSE2_NATIVE)
2138   return _mm_cmpunord_sd(a, b);
2139 #elif defined(SIMDE_ASSUME_VECTORIZATION)
2140   return simde_mm_move_sd(a, simde_mm_cmpunord_pd(a, b));
2141 #else
2142   simde__m128d_private
2143     r_,
2144     a_ = simde__m128d_to_private(a),
2145     b_ = simde__m128d_to_private(b);
2146 
2147 #if defined(simde_math_isnan)
2148   r_.u64[0] = (simde_math_isnan(a_.f64[0]) || simde_math_isnan(b_.f64[0])) ? ~UINT64_C(0) : UINT64_C(0);
2149   r_.u64[1] = a_.u64[1];
2151 #else
2152   HEDLEY_UNREACHABLE();
2153 #endif
2154 
2155   return simde__m128d_from_private(r_);
2156 #endif
2157 }
2158 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2159 #  define _mm_cmpunord_sd(a, b) simde_mm_cmpunord_sd(a, b)
2160 #endif
2161 
2162 SIMDE_FUNCTION_ATTRIBUTES
2163 simde__m128d
2164 simde_mm_cvtepi32_pd (simde__m128i a) {
2165 #if defined(SIMDE_X86_SSE2_NATIVE)
2166   return _mm_cvtepi32_pd(a);
2167 #else
2168   simde__m128d_private r_;
2169   simde__m128i_private a_ = simde__m128i_to_private(a);
2170 
2171 #if defined(SIMDE_CONVERT_VECTOR_)
2172   SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].i32);
2173 #else
2174   SIMDE_VECTORIZE
2175   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2176     r_.f64[i] = (simde_float64) a_.i32[i];
2177   }
2178 #endif
2179 
2180   return simde__m128d_from_private(r_);
2181 #endif
2182 }
2183 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2184 #  define _mm_cvtepi32_pd(a) simde_mm_cvtepi32_pd(a)
2185 #endif
2186 
2187 SIMDE_FUNCTION_ATTRIBUTES
2188 simde__m128
2189 simde_mm_cvtepi32_ps (simde__m128i a) {
2190 #if defined(SIMDE_X86_SSE2_NATIVE)
2191   return _mm_cvtepi32_ps(a);
2192 #else
2193   simde__m128_private r_;
2194   simde__m128i_private a_ = simde__m128i_to_private(a);
2195 
2196 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2197   r_.neon_f32 = vcvtq_f32_s32(a_.neon_i32);
2198 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
2199   HEDLEY_DIAGNOSTIC_PUSH
2200   #if HEDLEY_HAS_WARNING("-Wc11-extensions")
2201     #pragma clang diagnostic ignored "-Wc11-extensions"
2202   #endif
2203   r_.altivec_f32 = vec_ctf(a_.altivec_i32, 0);
2204   HEDLEY_DIAGNOSTIC_POP
2205 #elif defined(SIMDE_CONVERT_VECTOR_)
2206   SIMDE_CONVERT_VECTOR_(r_.f32, a_.i32);
2207 #else
2208   SIMDE_VECTORIZE
2209   for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
2210     r_.f32[i] = (simde_float32) a_.i32[i];
2211   }
2212 #endif
2213 
2214   return simde__m128_from_private(r_);
2215 #endif
2216 }
2217 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2218 #  define _mm_cvtepi32_ps(a) simde_mm_cvtepi32_ps(a)
2219 #endif
2220 
2221 SIMDE_FUNCTION_ATTRIBUTES
2222 simde__m128i
2223 simde_mm_cvtpd_epi32 (simde__m128d a) {
2224 #if defined(SIMDE_X86_SSE2_NATIVE)
2225   return _mm_cvtpd_epi32(a);
2226 #else
2227   simde__m128i_private r_;
2228   simde__m128d_private a_ = simde__m128d_to_private(a);
2229 
2230   SIMDE_VECTORIZE
2231   for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
2232     r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_nearbyint(a_.f64[i]));
2233   }
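  /* Only two doubles are converted, so the upper two lanes of the result are defined to be zero. */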
2234   simde_memset(&(r_.m64_private[1]), 0, sizeof(r_.m64_private[1]));
2235 
2236   return simde__m128i_from_private(r_);
2237 #endif
2238 }
2239 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2240 #  define _mm_cvtpd_epi32(a) simde_mm_cvtpd_epi32(a)
2241 #endif
2242 
2243 SIMDE_FUNCTION_ATTRIBUTES
2244 simde__m64
2245 simde_mm_cvtpd_pi32 (simde__m128d a) {
2246 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2247   return _mm_cvtpd_pi32(a);
2248 #else
2249   simde__m64_private r_;
2250   simde__m128d_private a_ = simde__m128d_to_private(a);
2251 
2252   SIMDE_VECTORIZE
2253   for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
2254     r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, simde_math_nearbyint(a_.f64[i]));
2255   }
2256 
2257   return simde__m64_from_private(r_);
2258 #endif
2259 }
2260 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2261 #  define _mm_cvtpd_pi32(a) simde_mm_cvtpd_pi32(a)
2262 #endif
2263 
2264 SIMDE_FUNCTION_ATTRIBUTES
2265 simde__m128
2266 simde_mm_cvtpd_ps (simde__m128d a) {
2267 #if defined(SIMDE_X86_SSE2_NATIVE)
2268   return _mm_cvtpd_ps(a);
2269 #else
2270   simde__m128_private r_;
2271   simde__m128d_private a_ = simde__m128d_to_private(a);
2272 
2273 #if defined(SIMDE_CONVERT_VECTOR_)
2274   SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, a_.f64);
2275   r_.m64_private[1] = simde__m64_to_private(simde_mm_setzero_si64());
2276 #else
2277   SIMDE_VECTORIZE
2278   for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
2279     r_.f32[i] = (simde_float32) a_.f64[i];
2280   }
2281   simde_memset(&(r_.m64_private[1]), 0, sizeof(r_.m64_private[1]));
2282 #endif
2283 
2284   return simde__m128_from_private(r_);
2285 #endif
2286 }
2287 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2288 #  define _mm_cvtpd_ps(a) simde_mm_cvtpd_ps(a)
2289 #endif
2290 
2291 SIMDE_FUNCTION_ATTRIBUTES
2292 simde__m128d
2293 simde_mm_cvtpi32_pd (simde__m64 a) {
2294 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2295   return _mm_cvtpi32_pd(a);
2296 #else
2297   simde__m128d_private r_;
2298   simde__m64_private a_ = simde__m64_to_private(a);
2299 
2300 #if defined(SIMDE_CONVERT_VECTOR_)
2301   SIMDE_CONVERT_VECTOR_(r_.f64, a_.i32);
2302 #else
2303   SIMDE_VECTORIZE
2304   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2305     r_.f64[i] = (simde_float64) a_.i32[i];
2306   }
2307 #endif
2308 
2309   return simde__m128d_from_private(r_);
2310 #endif
2311 }
2312 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2313 #  define _mm_cvtpi32_pd(a) simde_mm_cvtpi32_pd(a)
2314 #endif
2315 
2316 SIMDE_FUNCTION_ATTRIBUTES
2317 simde__m128i
2318 simde_mm_cvtps_epi32 (simde__m128 a) {
2319 #if defined(SIMDE_X86_SSE2_NATIVE)
2320   return _mm_cvtps_epi32(a);
2321 #else
2322   simde__m128i_private r_;
2323   simde__m128_private a_ = simde__m128_to_private(a);
2324 
2325 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2326   /* The default rounding mode on SSE is 'round to even', which ARMv7
2327      does not support!  It is supported on ARMv8 however. */
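  /* For example, with ties-to-even 0.5 rounds to 0 and 1.5 rounds to 2, whereas simply adding 0.5 and truncating would give 1 and 2. */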
2328   #if defined(SIMDE_ARCH_AARCH64)
2329     r_.neon_i32 = vcvtnq_s32_f32(a_.neon_f32);
2330   #else
2331     uint32x4_t signmask = vdupq_n_u32(0x80000000);
2332     float32x4_t half = vbslq_f32(signmask, a_.neon_f32, vdupq_n_f32(0.5f)); /* +/- 0.5 */
2333     int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(a_.neon_f32, half)); /* round to integer: [a + 0.5]*/
2334     int32x4_t r_trunc = vcvtq_s32_f32(a_.neon_f32); /* truncate to integer: [a] */
2335     int32x4_t plusone = vshrq_n_s32(vnegq_s32(r_trunc), 31); /* 1 or 0 */
2336     int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
2337     float32x4_t delta = vsubq_f32(a_.neon_f32, vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
2338     uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */
2339     r_.neon_i32 = vbslq_s32(is_delta_half, r_even, r_normal);
2340   #endif
2341 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
2342   r_.altivec_i32 = vec_cts(vec_round(a_.altivec_f32), 0);
2343 #else
2344   SIMDE_VECTORIZE
2345   for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
2346     r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, simde_math_roundf(a_.f32[i]));
2347   }
2348 #endif
2349 
2350   return simde__m128i_from_private(r_);
2351 #endif
2352 }
2353 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2354 #  define _mm_cvtps_epi32(a) simde_mm_cvtps_epi32(a)
2355 #endif
2356 
2357 SIMDE_FUNCTION_ATTRIBUTES
2358 simde__m128d
2359 simde_mm_cvtps_pd (simde__m128 a) {
2360 #if defined(SIMDE_X86_SSE2_NATIVE)
2361   return _mm_cvtps_pd(a);
2362 #else
2363   simde__m128d_private r_;
2364   simde__m128_private a_ = simde__m128_to_private(a);
2365 
2366 #if defined(SIMDE_CONVERT_VECTOR_)
2367   SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].f32);
2368 #else
2369   SIMDE_VECTORIZE
2370   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2371     r_.f64[i] = a_.f32[i];
2372   }
2373 #endif
2374 
2375   return simde__m128d_from_private(r_);
2376 #endif
2377 }
2378 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2379 #  define _mm_cvtps_pd(a) simde_mm_cvtps_pd(a)
2380 #endif
2381 
2382 SIMDE_FUNCTION_ATTRIBUTES
2383 int32_t
2384 simde_mm_cvtsd_si32 (simde__m128d a) {
2385 #if defined(SIMDE_X86_SSE2_NATIVE)
2386   return _mm_cvtsd_si32(a);
2387 #else
2388   simde__m128d_private a_ = simde__m128d_to_private(a);
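  /* Note: simde_math_round rounds halfway cases away from zero, whereas the SSE2 default rounding mode rounds them to even. */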
2389   return SIMDE_CONVERT_FTOI(int32_t, simde_math_round(a_.f64[0]));
2390 #endif
2391 }
2392 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2393 #  define _mm_cvtsd_si32(a) simde_mm_cvtsd_si32(a)
2394 #endif
2395 
2396 SIMDE_FUNCTION_ATTRIBUTES
2397 int64_t
2398 simde_mm_cvtsd_si64 (simde__m128d a) {
2399 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
2400   #if defined(__PGI)
2401     return _mm_cvtsd_si64x(a);
2402   #else
2403     return _mm_cvtsd_si64(a);
2404   #endif
2405 #else
2406   simde__m128d_private a_ = simde__m128d_to_private(a);
2407   return SIMDE_CONVERT_FTOI(int64_t, simde_math_round(a_.f64[0]));
2408 #endif
2409 }
2410 #define simde_mm_cvtsd_si64x(a) simde_mm_cvtsd_si64(a)
2411 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2412 #  define _mm_cvtsd_si64(a) simde_mm_cvtsd_si64(a)
2413 #  define _mm_cvtsd_si64x(a) simde_mm_cvtsd_si64x(a)
2414 #endif
2415 
2416 SIMDE_FUNCTION_ATTRIBUTES
2417 simde__m128
2418 simde_mm_cvtsd_ss (simde__m128 a, simde__m128d b) {
2419 #if defined(SIMDE_X86_SSE2_NATIVE)
2420   return _mm_cvtsd_ss(a, b);
2421 #else
2422   simde__m128_private
2423     r_,
2424     a_ = simde__m128_to_private(a);
2425   simde__m128d_private b_ = simde__m128d_to_private(b);
2426 
2427   r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b_.f64[0]);
2428 
2429   SIMDE_VECTORIZE
2430   for (size_t i = 1 ; i < (sizeof(r_) / sizeof(r_.i32[0])) ; i++) {
2431     r_.i32[i] = a_.i32[i];
2432   }
2433 
2434   return simde__m128_from_private(r_);
2435 #endif
2436 }
2437 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2438 #  define _mm_cvtsd_ss(a, b) simde_mm_cvtsd_ss(a, b)
2439 #endif
2440 
2441 SIMDE_FUNCTION_ATTRIBUTES
2442 int32_t
2443 simde_mm_cvtsi128_si32 (simde__m128i a) {
2444   #if defined(SIMDE_X86_SSE2_NATIVE)
2445     return _mm_cvtsi128_si32(a);
2446   #else
2447     simde__m128i_private
2448       a_ = simde__m128i_to_private(a);
2449 
2450     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2451       return vgetq_lane_s32(a_.neon_i32, 0);
2452     #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
2453       #if defined(SIMDE_BUG_GCC_95227)
2454         (void) a_;
2455       #endif
2456       return vec_extract(a_.altivec_i32, 0);
2457     #else
2458       return a_.i32[0];
2459     #endif
2460   #endif
2461 }
2462 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2463 #  define _mm_cvtsi128_si32(a) simde_mm_cvtsi128_si32(a)
2464 #endif
2465 
2466 SIMDE_FUNCTION_ATTRIBUTES
2467 int64_t
2468 simde_mm_cvtsi128_si64 (simde__m128i a) {
2469 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
2470   #if defined(__PGI)
2471     return _mm_cvtsi128_si64x(a);
2472   #else
2473     return _mm_cvtsi128_si64(a);
2474   #endif
2475 #else
2476   simde__m128i_private a_ = simde__m128i_to_private(a);
2477 #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && !defined(HEDLEY_IBM_VERSION)
2478   return vec_extract(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed long long), a_.i64), 0);
2479 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2480   return vgetq_lane_s64(a_.neon_i64, 0);
2481 #endif
2482   return a_.i64[0];
2483 #endif
2484 }
2485 #define simde_mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64(a)
2486 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2487 #  define _mm_cvtsi128_si64(a) simde_mm_cvtsi128_si64(a)
2488 #  define _mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64x(a)
2489 #endif
2490 
2491 SIMDE_FUNCTION_ATTRIBUTES
2492 simde__m128d
2493 simde_mm_cvtsi32_sd (simde__m128d a, int32_t b) {
2495 #if defined(SIMDE_X86_SSE2_NATIVE)
2496   return _mm_cvtsi32_sd(a, b);
2497 #else
2498   simde__m128d_private r_;
2499   simde__m128d_private a_ = simde__m128d_to_private(a);
2500 
2501 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2502   r_.neon_f64 = vsetq_lane_f64(HEDLEY_STATIC_CAST(float64_t, b), a_.neon_f64, 0);
2503 #else
2504   r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b);
2505   r_.i64[1] = a_.i64[1];
2506 #endif
2507 
2508   return simde__m128d_from_private(r_);
2509 #endif
2510 }
2511 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2512 #  define _mm_cvtsi32_sd(a, b) simde_mm_cvtsi32_sd(a, b)
2513 #endif
2514 
2515 SIMDE_FUNCTION_ATTRIBUTES
2516 simde__m128i
2517 simde_mm_cvtsi32_si128 (int32_t a) {
2518 #if defined(SIMDE_X86_SSE2_NATIVE)
2519   return _mm_cvtsi32_si128(a);
2520 #else
2521   simde__m128i_private r_;
2522 
2523 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2524   r_.neon_i32 = vsetq_lane_s32(a, vdupq_n_s32(0), 0);
2525 #else
2526   r_.i32[0] = a;
2527   r_.i32[1] = 0;
2528   r_.i32[2] = 0;
2529   r_.i32[3] = 0;
2530 #endif
2531 
2532   return simde__m128i_from_private(r_);
2533 #endif
2534 }
2535 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2536 #  define _mm_cvtsi32_si128(a) simde_mm_cvtsi32_si128(a)
2537 #endif
2538 
2539 SIMDE_FUNCTION_ATTRIBUTES
2540 simde__m128d
2541 simde_mm_cvtsi64_sd (simde__m128d a, int64_t b) {
2542 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
2543   #if !defined(__PGI)
2544     return _mm_cvtsi64_sd(a, b);
2545   #else
2546     return _mm_cvtsi64x_sd(a, b);
2547   #endif
2548 #else
2549   simde__m128d_private
2550     r_,
2551     a_ = simde__m128d_to_private(a);
2552 
2553 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2554   r_.neon_f64 = vsetq_lane_f64(HEDLEY_STATIC_CAST(float64_t, b), a_.neon_f64, 0);
2555 #else
2556   r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b);
2557   r_.f64[1] = a_.f64[1];
2558 #endif
2559 
2560   return simde__m128d_from_private(r_);
2561 #endif
2562 }
2563 #define simde_mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64_sd(a, b)
2564 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2565 #  define _mm_cvtsi64_sd(a, b) simde_mm_cvtsi64_sd(a, b)
2566 #  define _mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64x_sd(a, b)
2567 #endif
2568 
2569 SIMDE_FUNCTION_ATTRIBUTES
2570 simde__m128i
2571 simde_mm_cvtsi64_si128 (int64_t a) {
2572 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
2573   #if !defined(__PGI)
2574     return _mm_cvtsi64_si128(a);
2575   #else
2576     return _mm_cvtsi64x_si128(a);
2577   #endif
2578 #else
2579   simde__m128i_private r_;
2580 
2581   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2582     r_.neon_i64 = vsetq_lane_s64(a, vdupq_n_s64(0), 0);
2583   #else
2584     r_.i64[0] = a;
2585     r_.i64[1] = 0;
2586   #endif
2587 
2588   return simde__m128i_from_private(r_);
2589 #endif
2590 }
2591 #define simde_mm_cvtsi64x_si128(a) simde_mm_cvtsi64_si128(a)
2592 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2593 #  define _mm_cvtsi64_si128(a) simde_mm_cvtsi64_si128(a)
2594 #  define _mm_cvtsi64x_si128(a) simde_mm_cvtsi64x_si128(a)
2595 #endif
2596 
2597 SIMDE_FUNCTION_ATTRIBUTES
2598 simde__m128d
2599 simde_mm_cvtss_sd (simde__m128d a, simde__m128 b) {
2600 #if defined(SIMDE_X86_SSE2_NATIVE)
2601   return _mm_cvtss_sd(a, b);
2602 #else
2603   simde__m128d_private
2604     a_ = simde__m128d_to_private(a);
2605   simde__m128_private b_ = simde__m128_to_private(b);
2606 
2607   a_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b_.f32[0]);
2608 
2609   return simde__m128d_from_private(a_);
2610 #endif
2611 }
2612 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2613 #  define _mm_cvtss_sd(a, b) simde_mm_cvtss_sd(a, b)
2614 #endif
2615 
2616 SIMDE_FUNCTION_ATTRIBUTES
2617 simde__m128i
2618 simde_mm_cvttpd_epi32 (simde__m128d a) {
2619 #if defined(SIMDE_X86_SSE2_NATIVE)
2620   return _mm_cvttpd_epi32(a);
2621 #else
2622   simde__m128i_private r_;
2623   simde__m128d_private a_ = simde__m128d_to_private(a);
2624 
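  /* The cvtt* ("truncate") variants always round toward zero, regardless of the current rounding mode. */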
2625   for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
2626     r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f64[i]);
2627   }
2628   simde_memset(&(r_.m64_private[1]), 0, sizeof(r_.m64_private[1]));
2629 
2630   return simde__m128i_from_private(r_);
2631 #endif
2632 }
2633 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2634 #  define _mm_cvttpd_epi32(a) simde_mm_cvttpd_epi32(a)
2635 #endif
2636 
2637 SIMDE_FUNCTION_ATTRIBUTES
2638 simde__m64
2639 simde_mm_cvttpd_pi32 (simde__m128d a) {
2640 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2641   return _mm_cvttpd_pi32(a);
2642 #else
2643   simde__m64_private r_;
2644   simde__m128d_private a_ = simde__m128d_to_private(a);
2645 
2646 #if defined(SIMDE_CONVERT_VECTOR_)
2647   SIMDE_CONVERT_VECTOR_(r_.i32, a_.f64);
2648 #else
2649   for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
2650     r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f64[i]);
2651   }
2652 #endif
2653 
2654   return simde__m64_from_private(r_);
2655 #endif
2656 }
2657 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2658 #  define _mm_cvttpd_pi32(a) simde_mm_cvttpd_pi32(a)
2659 #endif
2660 
2661 SIMDE_FUNCTION_ATTRIBUTES
2662 simde__m128i
2663 simde_mm_cvttps_epi32 (simde__m128 a) {
2664 #if defined(SIMDE_X86_SSE2_NATIVE)
2665   return _mm_cvttps_epi32(a);
2666 #else
2667   simde__m128i_private r_;
2668   simde__m128_private a_ = simde__m128_to_private(a);
2669 
2670 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2671   r_.neon_i32 = vcvtq_s32_f32(a_.neon_f32);
2672 #elif defined(SIMDE_CONVERT_VECTOR_)
2673   SIMDE_CONVERT_VECTOR_(r_.i32, a_.f32);
2674 #else
2675   for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
2676     r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f32[i]);
2677   }
2678 #endif
2679 
2680   return simde__m128i_from_private(r_);
2681 #endif
2682 }
2683 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2684 #  define _mm_cvttps_epi32(a) simde_mm_cvttps_epi32(a)
2685 #endif
2686 
2687 SIMDE_FUNCTION_ATTRIBUTES
2688 int32_t
2689 simde_mm_cvttsd_si32 (simde__m128d a) {
2690 #if defined(SIMDE_X86_SSE2_NATIVE)
2691   return _mm_cvttsd_si32(a);
2692 #else
2693   simde__m128d_private a_ = simde__m128d_to_private(a);
2694   return SIMDE_CONVERT_FTOI(int32_t, a_.f64[0]);
2695 #endif
2696 }
2697 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2698 #  define _mm_cvttsd_si32(a) simde_mm_cvttsd_si32(a)
2699 #endif
2700 
2701 SIMDE_FUNCTION_ATTRIBUTES
2702 int64_t
2703 simde_mm_cvttsd_si64 (simde__m128d a) {
2704 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
2705   #if !defined(__PGI)
2706     return _mm_cvttsd_si64(a);
2707   #else
2708     return _mm_cvttsd_si64x(a);
2709   #endif
2710 #else
2711   simde__m128d_private a_ = simde__m128d_to_private(a);
2712   return SIMDE_CONVERT_FTOI(int64_t, a_.f64[0]);
2713 #endif
2714 }
2715 #define simde_mm_cvttsd_si64x(a) simde_mm_cvttsd_si64(a)
2716 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2717 #  define _mm_cvttsd_si64(a) simde_mm_cvttsd_si64(a)
2718 #  define _mm_cvttsd_si64x(a) simde_mm_cvttsd_si64x(a)
2719 #endif
2720 
2721 SIMDE_FUNCTION_ATTRIBUTES
2722 simde__m128d
2723 simde_mm_div_pd (simde__m128d a, simde__m128d b) {
2724 #if defined(SIMDE_X86_SSE2_NATIVE)
2725   return _mm_div_pd(a, b);
2726 #else
2727   simde__m128d_private
2728     r_,
2729     a_ = simde__m128d_to_private(a),
2730     b_ = simde__m128d_to_private(b);
2731 
2732 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2733   r_.f64 = a_.f64 / b_.f64;
2734 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2735   r_.wasm_v128 = wasm_f64x2_div(a_.wasm_v128, b_.wasm_v128);
2736 #else
2737   SIMDE_VECTORIZE
2738   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2739     r_.f64[i] = a_.f64[i] / b_.f64[i];
2740   }
2741 #endif
2742 
2743   return simde__m128d_from_private(r_);
2744 #endif
2745 }
2746 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2747 #  define _mm_div_pd(a, b) simde_mm_div_pd(a, b)
2748 #endif
2749 
2750 SIMDE_FUNCTION_ATTRIBUTES
2751 simde__m128d
2752 simde_mm_div_sd (simde__m128d a, simde__m128d b) {
2753 #if defined(SIMDE_X86_SSE2_NATIVE)
2754   return _mm_div_sd(a, b);
2755 #elif defined(SIMDE_ASSUME_VECTORIZATION)
2756   return simde_mm_move_sd(a, simde_mm_div_pd(a, b));
2757 #else
2758   simde__m128d_private
2759     r_,
2760     a_ = simde__m128d_to_private(a),
2761     b_ = simde__m128d_to_private(b);
2762 
2763   r_.f64[0] = a_.f64[0] / b_.f64[0];
2764   r_.f64[1] = a_.f64[1];
2765 
2766   return simde__m128d_from_private(r_);
2767 #endif
2768 }
2769 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2770 #  define _mm_div_sd(a, b) simde_mm_div_sd(a, b)
2771 #endif
2772 
2773 SIMDE_FUNCTION_ATTRIBUTES
2774 int32_t
2775 simde_mm_extract_epi16 (simde__m128i a, const int imm8)
2776     SIMDE_REQUIRE_RANGE(imm8, 0, 7)  {
2777   uint16_t r;
2778   simde__m128i_private a_ = simde__m128i_to_private(a);
2779 
2780   #if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
2781     #if defined(SIMDE_BUG_GCC_95227)
2782       (void) a_;
2783       (void) imm8;
2784     #endif
2785     r = HEDLEY_STATIC_CAST(uint16_t, vec_extract(a_.altivec_i16, imm8));
2786   #else
2787     r = a_.u16[imm8 & 7];
2788   #endif
2789 
2790   return HEDLEY_STATIC_CAST(int32_t, r);
2791 }
2792 #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,6,0))
2793 #  define simde_mm_extract_epi16(a, imm8) _mm_extract_epi16(a, imm8)
2794 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
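/* vgetq_lane_s16 sign-extends the 16-bit lane, so mask with 0xffff to reproduce the zero-extension _mm_extract_epi16 performs. */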
2795 #  define simde_mm_extract_epi16(a, imm8) (HEDLEY_STATIC_CAST(int32_t, vgetq_lane_s16(simde__m128i_to_private(a).neon_i16, (imm8))) & (INT32_C(0x0000ffff)))
2796 #endif
2797 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2798 #  define _mm_extract_epi16(a, imm8) simde_mm_extract_epi16(a, imm8)
2799 #endif
2800 
2801 SIMDE_FUNCTION_ATTRIBUTES
2802 simde__m128i
2803 simde_mm_insert_epi16 (simde__m128i a, int16_t i, const int imm8)
2804     SIMDE_REQUIRE_RANGE(imm8, 0, 7)  {
2805   simde__m128i_private a_ = simde__m128i_to_private(a);
2806   a_.i16[imm8 & 7] = i;
2807   return simde__m128i_from_private(a_);
2808 }
2809 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
2810 #  define simde_mm_insert_epi16(a, i, imm8) _mm_insert_epi16((a), (i), (imm8))
2811 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2812 #  define simde_mm_insert_epi16(a, i, imm8) simde__m128i_from_neon_i16(vsetq_lane_s16((i), simde__m128i_to_neon_i16(a), (imm8)))
2813 #endif
2814 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2815 #  define _mm_insert_epi16(a, i, imm8) simde_mm_insert_epi16(a, i, imm8)
2816 #endif
2817 
2818 SIMDE_FUNCTION_ATTRIBUTES
2819 simde__m128d
2820 simde_mm_load_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) {
2821   simde_assert_aligned(16, mem_addr);
2822 
2823   #if defined(SIMDE_X86_SSE2_NATIVE)
2824     return _mm_load_pd(mem_addr);
2825   #else
2826     simde__m128d_private r_;
2827 
2828     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2829       r_.neon_u32 = vld1q_u32(HEDLEY_REINTERPRET_CAST(uint32_t const*, mem_addr));
2830     #else
2831       r_ = *SIMDE_ALIGN_CAST(simde__m128d_private const*, mem_addr);
2832     #endif
2833 
2834     return simde__m128d_from_private(r_);
2835   #endif
2836 }
2837 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2838 #  define _mm_load_pd(mem_addr) simde_mm_load_pd(mem_addr)
2839 #endif
2840 
2841 SIMDE_FUNCTION_ATTRIBUTES
2842 simde__m128d
2843 simde_mm_load_pd1 (simde_float64 const* mem_addr) {
2844 #if defined(SIMDE_X86_SSE2_NATIVE)
2845   return _mm_load1_pd(mem_addr);
2846 #else
2847   simde__m128d_private r_;
2848 
2849   r_.f64[0] = *mem_addr;
2850   r_.f64[1] = *mem_addr;
2851 
2852   return simde__m128d_from_private(r_);
2853 #endif
2854 }
2855 #define simde_mm_load1_pd(mem_addr) simde_mm_load_pd1(mem_addr)
2856 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2857 #  define _mm_load_pd1(mem_addr) simde_mm_load_pd1(mem_addr)
2858 #  define _mm_load1_pd(mem_addr) simde_mm_load1_pd(mem_addr)
2859 #endif
2860 
2861 SIMDE_FUNCTION_ATTRIBUTES
2862 simde__m128d
2863 simde_mm_load_sd (simde_float64 const* mem_addr) {
2864 #if defined(SIMDE_X86_SSE2_NATIVE)
2865   return _mm_load_sd(mem_addr);
2866 #else
2867   simde__m128d_private r_;
2868 
2869 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2870   r_.neon_f64 = vsetq_lane_f64(*mem_addr, vdupq_n_f64(0), 0);
2871 #else
2872   r_.f64[0] = *mem_addr;
2873   r_.u64[1] = UINT64_C(0);
2874 #endif
2875 
2876   return simde__m128d_from_private(r_);
2877 #endif
2878 }
2879 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2880 #  define _mm_load_sd(mem_addr) simde_mm_load_sd(mem_addr)
2881 #endif
2882 
2883 SIMDE_FUNCTION_ATTRIBUTES
2884 simde__m128i
2885 simde_mm_load_si128 (simde__m128i const* mem_addr) {
2886   simde_assert_aligned(16, mem_addr);
2887 
2888   #if defined(SIMDE_X86_SSE2_NATIVE)
2889     return _mm_load_si128(HEDLEY_REINTERPRET_CAST(__m128i const*, mem_addr));
2890   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
2891     simde__m128i_private r_;
2892 
2893     #if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
2894       r_.altivec_i32 = vec_ld(0, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(int) const*, mem_addr));
2895     #else
2896       r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr));
2897     #endif
2898 
2899     return simde__m128i_from_private(r_);
2900   #else
2901     return *mem_addr;
2902   #endif
2903 }
2904 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2905 #  define _mm_load_si128(mem_addr) simde_mm_load_si128(mem_addr)
2906 #endif
2907 
2908 SIMDE_FUNCTION_ATTRIBUTES
2909 simde__m128d
2910 simde_mm_loadh_pd (simde__m128d a, simde_float64 const* mem_addr) {
2911 #if defined(SIMDE_X86_SSE2_NATIVE)
2912   return _mm_loadh_pd(a, mem_addr);
2913 #else
2914   simde__m128d_private
2915     r_,
2916     a_ = simde__m128d_to_private(a);
2917   simde_float64 t;
2918 
2919   simde_memcpy(&t, mem_addr, sizeof(t));
2920   r_.f64[0] = a_.f64[0];
2921   r_.f64[1] = t;
2922 
2923   return simde__m128d_from_private(r_);
2924 #endif
2925 }
2926 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2927 #  define _mm_loadh_pd(a, mem_addr) simde_mm_loadh_pd(a, mem_addr)
2928 #endif
2929 
2930 SIMDE_FUNCTION_ATTRIBUTES
2931 simde__m128i
2932 simde_mm_loadl_epi64 (simde__m128i const* mem_addr) {
2933 #if defined(SIMDE_X86_SSE2_NATIVE)
2934   return _mm_loadl_epi64(mem_addr);
2935 #else
2936   simde__m128i_private r_;
2937 
2938   int64_t value;
2939   simde_memcpy(&value, mem_addr, sizeof(value));
2940 
2941   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2942     r_.neon_i64 = vcombine_s64(vld1_s64(HEDLEY_REINTERPRET_CAST(int64_t const *, mem_addr)), vdup_n_s64(0));
2943   #else
2944     r_.i64[0] = value;
2945     r_.i64[1] = 0;
2946   #endif
2947 
2948   return simde__m128i_from_private(r_);
2949 #endif
2950 }
2951 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2952 #  define _mm_loadl_epi64(mem_addr) simde_mm_loadl_epi64(mem_addr)
2953 #endif
2954 
2955 SIMDE_FUNCTION_ATTRIBUTES
2956 simde__m128d
2957 simde_mm_loadl_pd (simde__m128d a, simde_float64 const* mem_addr) {
2958 #if defined(SIMDE_X86_SSE2_NATIVE)
2959   return _mm_loadl_pd(a, mem_addr);
2960 #else
2961   simde__m128d_private
2962     r_,
2963     a_ = simde__m128d_to_private(a);
2964 
2965   r_.f64[0] = *mem_addr;
2966   r_.u64[1] = a_.u64[1];
2967 
2968   return simde__m128d_from_private(r_);
2969 #endif
2970 }
2971 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2972 #  define _mm_loadl_pd(a, mem_addr) simde_mm_loadl_pd(a, mem_addr)
2973 #endif
2974 
2975 SIMDE_FUNCTION_ATTRIBUTES
2976 simde__m128d
2977 simde_mm_loadr_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) {
2978   simde_assert_aligned(16, mem_addr);
2979 
2980 #if defined(SIMDE_X86_SSE2_NATIVE)
2981   return _mm_loadr_pd(mem_addr);
2982 #else
2983   simde__m128d_private r_;
2984 
2985   r_.f64[0] = mem_addr[1];
2986   r_.f64[1] = mem_addr[0];
2987 
2988   return simde__m128d_from_private(r_);
2989 #endif
2990 }
2991 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2992 #  define _mm_loadr_pd(mem_addr) simde_mm_loadr_pd(mem_addr)
2993 #endif
2994 
2995 SIMDE_FUNCTION_ATTRIBUTES
2996 simde__m128d
2997 simde_mm_loadu_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) {
2998 #if defined(SIMDE_X86_SSE2_NATIVE)
2999   return _mm_loadu_pd(mem_addr);
3000 #else
3001   simde__m128d_private r_;
3002 
3003   simde_memcpy(&r_, mem_addr, sizeof(r_));
3004 
3005   return simde__m128d_from_private(r_);
3006 #endif
3007 }
3008 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3009 #  define _mm_loadu_pd(mem_addr) simde_mm_loadu_pd(mem_addr)
3010 #endif
3011 
3012 SIMDE_FUNCTION_ATTRIBUTES
3013 simde__m128i
3014 simde_x_mm_loadu_epi8(int8_t const* mem_addr) {
3015   #if defined(SIMDE_X86_SSE2_NATIVE)
3016     return _mm_loadu_si128(SIMDE_ALIGN_CAST(simde__m128i const*, mem_addr));
3017   #else
3018     simde__m128i_private r_;
3019 
3020     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3021       r_.neon_i8 = vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr));
3022     #else
3023       simde_memcpy(&r_, mem_addr, sizeof(r_));
3024     #endif
3025 
3026     return simde__m128i_from_private(r_);
3027   #endif
3028 }
3029 
3030 SIMDE_FUNCTION_ATTRIBUTES
3031 simde__m128i
3032 simde_x_mm_loadu_epi16(int16_t const* mem_addr) {
3033   #if defined(SIMDE_X86_SSE2_NATIVE)
3034     return _mm_loadu_si128(SIMDE_ALIGN_CAST(simde__m128i const*, mem_addr));
3035   #else
3036     simde__m128i_private r_;
3037 
3038     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3039       r_.neon_i16 = vld1q_s16(HEDLEY_REINTERPRET_CAST(int16_t const*, mem_addr));
3040     #else
3041       simde_memcpy(&r_, mem_addr, sizeof(r_));
3042     #endif
3043 
3044     return simde__m128i_from_private(r_);
3045   #endif
3046 }
3047 
3048 SIMDE_FUNCTION_ATTRIBUTES
3049 simde__m128i
3050 simde_x_mm_loadu_epi32(int32_t const* mem_addr) {
3051   #if defined(SIMDE_X86_SSE2_NATIVE)
3052     return _mm_loadu_si128(SIMDE_ALIGN_CAST(simde__m128i const*, mem_addr));
3053   #else
3054     simde__m128i_private r_;
3055 
3056     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3057       r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr));
3058     #else
3059       simde_memcpy(&r_, mem_addr, sizeof(r_));
3060     #endif
3061 
3062     return simde__m128i_from_private(r_);
3063   #endif
3064 }
3065 
3066 SIMDE_FUNCTION_ATTRIBUTES
3067 simde__m128i
3068 simde_x_mm_loadu_epi64(int64_t const* mem_addr) {
3069   #if defined(SIMDE_X86_SSE2_NATIVE)
3070     return _mm_loadu_si128(SIMDE_ALIGN_CAST(simde__m128i const*, mem_addr));
3071   #else
3072     simde__m128i_private r_;
3073 
3074     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3075       r_.neon_i64 = vld1q_s64(HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr));
3076     #else
3077       simde_memcpy(&r_, mem_addr, sizeof(r_));
3078     #endif
3079 
3080     return simde__m128i_from_private(r_);
3081   #endif
3082 }
3083 
3084 SIMDE_FUNCTION_ATTRIBUTES
3085 simde__m128i
3086 simde_mm_loadu_si128 (void const* mem_addr) {
3087   #if defined(SIMDE_X86_SSE2_NATIVE)
3088     return _mm_loadu_si128(HEDLEY_STATIC_CAST(__m128i const*, mem_addr));
3089   #else
3090     simde__m128i_private r_;
3091 
3092     #if HEDLEY_GNUC_HAS_ATTRIBUTE(may_alias,3,3,0)
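      /* A packed, may_alias struct lets GCC-compatible compilers emit an
       * unaligned 128-bit load without running into alignment or
       * strict-aliasing undefined behaviour. */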
3093       HEDLEY_DIAGNOSTIC_PUSH
3094       SIMDE_DIAGNOSTIC_DISABLE_PACKED_
3095       struct simde_mm_loadu_si128_s {
3096         __typeof__(r_) v;
3097       } __attribute__((__packed__, __may_alias__));
3098       r_ = HEDLEY_REINTERPRET_CAST(const struct simde_mm_loadu_si128_s *, mem_addr)->v;
3099       HEDLEY_DIAGNOSTIC_POP
3100     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3101       /* Note that this is a lower priority than the struct above since
3102        * clang assumes mem_addr is aligned (since it is a __m128i*). */
3103       r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr));
3104     #else
3105       simde_memcpy(&r_, mem_addr, sizeof(r_));
3106     #endif
3107 
3108     return simde__m128i_from_private(r_);
3109   #endif
3110 }
3111 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3112 #  define _mm_loadu_si128(mem_addr) simde_mm_loadu_si128(mem_addr)
3113 #endif
3114 
3115 SIMDE_FUNCTION_ATTRIBUTES
3116 simde__m128i
3117 simde_mm_madd_epi16 (simde__m128i a, simde__m128i b) {
3118 #if defined(SIMDE_X86_SSE2_NATIVE)
3119   return _mm_madd_epi16(a, b);
3120 #else
3121   simde__m128i_private
3122     r_,
3123     a_ = simde__m128i_to_private(a),
3124     b_ = simde__m128i_to_private(b);
3125 
3126   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
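    /* Widen each 16-bit product to 32 bits (vmull), then add adjacent
     * products pairwise (vpadd), which matches pmaddwd's horizontal add. */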
3127     int32x4_t pl = vmull_s16(vget_low_s16(a_.neon_i16),  vget_low_s16(b_.neon_i16));
3128     int32x4_t ph = vmull_s16(vget_high_s16(a_.neon_i16), vget_high_s16(b_.neon_i16));
3129     int32x2_t rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
3130     int32x2_t rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
3131     r_.neon_i32 = vcombine_s32(rl, rh);
3132   #else
3133     SIMDE_VECTORIZE
3134     for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i += 2) {
3135       r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) + (a_.i16[i + 1] * b_.i16[i + 1]);
3136     }
3137   #endif
3138 
3139   return simde__m128i_from_private(r_);
3140 #endif
3141 }
3142 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3143 #  define _mm_madd_epi16(a, b) simde_mm_madd_epi16(a, b)
3144 #endif
3145 
3146 SIMDE_FUNCTION_ATTRIBUTES
3147 void
3148 simde_mm_maskmoveu_si128 (simde__m128i a, simde__m128i mask, int8_t mem_addr[HEDLEY_ARRAY_PARAM(16)]) {
3149 #if defined(SIMDE_X86_SSE2_NATIVE)
3150   _mm_maskmoveu_si128(a, mask, HEDLEY_REINTERPRET_CAST(char*, mem_addr));
3151 #else
3152   simde__m128i_private
3153     a_ = simde__m128i_to_private(a),
3154     mask_ = simde__m128i_to_private(mask);
3155 
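  /* Store a byte only when the most significant bit of the corresponding
   * mask byte is set, mirroring MASKMOVDQU's byte-granular write mask. */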
3156   for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++) {
3157     if (mask_.u8[i] & 0x80) {
3158       mem_addr[i] = a_.i8[i];
3159     }
3160   }
3161 #endif
3162 }
3163 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3164 #  define _mm_maskmoveu_si128(a, mask, mem_addr) simde_mm_maskmoveu_si128((a), (mask), SIMDE_CHECKED_REINTERPRET_CAST(int8_t*, char*, (mem_addr)))
3165 #endif
3166 
3167 SIMDE_FUNCTION_ATTRIBUTES
3168 int32_t
3169 simde_mm_movemask_epi8 (simde__m128i a) {
3170 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__INTEL_COMPILER)
3171   /* ICC has trouble with _mm_movemask_epi8 at -O2 and above: */
3172   return _mm_movemask_epi8(a);
3173 #else
3174   int32_t r = 0;
3175   simde__m128i_private a_ = simde__m128i_to_private(a);
3176 
3177 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3178     // Use increasingly wide shifts+adds to collect the sign bits
3179     // together.
3180     // Since the widening shifts would be rather confusing to follow in little endian, everything
3181     // will be illustrated in big endian order instead. This has a different result - the bits
3182     // would actually be reversed on a big endian machine.
3183 
3184     // Starting input (only half the elements are shown):
3185     // 89 ff 1d c0 00 10 99 33
3186     uint8x16_t input = a_.neon_u8;
3187 
3188     // Shift out everything but the sign bits with an unsigned shift right.
3189     //
3190     // Bytes of the vector:
3191     // 89 ff 1d c0 00 10 99 33
3192     // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
3193     //  |  |  |  |  |  |  |  |
3194     // 01 01 00 01 00 00 01 00
3195     //
3196     // Bits of first important lane(s):
3197     // 10001001 (89)
3198     // \______
3199     //        |
3200     // 00000001 (01)
3201     uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
3202 
3203     // Merge the even lanes together with a 16-bit unsigned shift right + add.
3204     // 'xx' represents garbage data which will be ignored in the final result.
3205     // In the important bytes, the add functions like a binary OR.
3206     //
3207     // 01 01 00 01 00 00 01 00
3208     //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(high_bits + (high_bits >> 7))
3209     //    \|    \|    \|    \|
3210     // xx 03 xx 01 xx 00 xx 02
3211     //
3212     // 00000001 00000001 (01 01)
3213     //        \_______ |
3214     //                \|
3215     // xxxxxxxx xxxxxx11 (xx 03)
3216     uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
3217 
3218     // Repeat with a wider 32-bit shift + add.
3219     // xx 03 xx 01 xx 00 xx 02
3220     //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >> 14))
3221     //          \|          \|
3222     // xx xx xx 0d xx xx xx 02
3223     //
3224     // 00000011 00000001 (03 01)
3225     //        \\_____ ||
3226     //         '----.\||
3227     // xxxxxxxx xxxx1101 (xx 0d)
3228     uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
3229 
3230     // Last, an even wider 64-bit shift + add to get our result in the low 8 bit lanes.
3231     // xx xx xx 0d xx xx xx 02
3232     //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >> 28))
3233     //                      \|
3234     // xx xx xx xx xx xx xx d2
3235     //
3236     // 00001101 00000010 (0d 02)
3237     //     \   \___ |  |
3238     //      '---.  \|  |
3239     // xxxxxxxx 11010010 (xx d2)
3240     uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
3241 
3242     // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
3243     // xx xx xx xx xx xx xx d2
3244     //                      ||  return paired64[0]
3245     //                      d2
3246     // Note: Little endian would return the correct value 4b (01001011) instead.
3247     r = vgetq_lane_u8(paired64, 0) | (HEDLEY_STATIC_CAST(int32_t, vgetq_lane_u8(paired64, 8)) << 8);
3248 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(HEDLEY_IBM_VERSION) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
3249   static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) perm = { 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0 };
3250   r = HEDLEY_STATIC_CAST(int32_t, vec_extract(vec_vbpermq(a_.altivec_u8, perm), 1));
3251 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(HEDLEY_IBM_VERSION) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG)
3252   static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) perm = { 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0 };
3253   r = HEDLEY_STATIC_CAST(int32_t, vec_extract(vec_vbpermq(a_.altivec_u8, perm), 14));
3254 #else
3255   SIMDE_VECTORIZE_REDUCTION(|:r)
3256   for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) {
3257     r |= (a_.u8[15 - i] >> 7) << (15 - i);
3258   }
3259 #endif
3260 
3261   return r;
3262 #endif
3263 }
3264 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3265 #  define _mm_movemask_epi8(a) simde_mm_movemask_epi8(a)
3266 #endif
3267 
3268 SIMDE_FUNCTION_ATTRIBUTES
3269 int32_t
3270 simde_mm_movemask_pd (simde__m128d a) {
3271 #if defined(SIMDE_X86_SSE2_NATIVE)
3272   return _mm_movemask_pd(a);
3273 #else
3274   int32_t r = 0;
3275   simde__m128d_private a_ = simde__m128d_to_private(a);
3276 
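  /* Collect the sign bit (bit 63) of each double into the low bits of r. */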
3277   SIMDE_VECTORIZE
3278   for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
3279     r |= (a_.u64[i] >> 63) << i;
3280   }
3281 
3282   return r;
3283 #endif
3284 }
3285 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3286 #  define _mm_movemask_pd(a) simde_mm_movemask_pd(a)
3287 #endif
3288 
3289 SIMDE_FUNCTION_ATTRIBUTES
3290 simde__m64
3291 simde_mm_movepi64_pi64 (simde__m128i a) {
3292 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
3293   return _mm_movepi64_pi64(a);
3294 #else
3295   simde__m64_private r_;
3296   simde__m128i_private a_ = simde__m128i_to_private(a);
3297 
3298   r_.i64[0] = a_.i64[0];
3299 
3300   return simde__m64_from_private(r_);
3301 #endif
3302 }
3303 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3304 #  define _mm_movepi64_pi64(a) simde_mm_movepi64_pi64(a)
3305 #endif
3306 
3307 SIMDE_FUNCTION_ATTRIBUTES
3308 simde__m128i
3309 simde_mm_movpi64_epi64 (simde__m64 a) {
3310 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
3311   return _mm_movpi64_epi64(a);
3312 #else
3313   simde__m128i_private r_;
3314   simde__m64_private a_ = simde__m64_to_private(a);
3315 
3316   r_.i64[0] = a_.i64[0];
3317   r_.i64[1] = 0;
3318 
3319   return simde__m128i_from_private(r_);
3320 #endif
3321 }
3322 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3323 #  define _mm_movpi64_epi64(a) simde_mm_movpi64_epi64(a)
3324 #endif
3325 
3326 SIMDE_FUNCTION_ATTRIBUTES
3327 simde__m128i
3328 simde_mm_min_epi16 (simde__m128i a, simde__m128i b) {
3329 #if defined(SIMDE_X86_SSE2_NATIVE)
3330   return _mm_min_epi16(a, b);
3331 #else
3332   simde__m128i_private
3333     r_,
3334     a_ = simde__m128i_to_private(a),
3335     b_ = simde__m128i_to_private(b);
3336 
3337   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3338     r_.neon_i16 = vminq_s16(a_.neon_i16, b_.neon_i16);
3339   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
3340     r_.altivec_i16 = vec_min(a_.altivec_i16, b_.altivec_i16);
3341   #else
3342     SIMDE_VECTORIZE
3343     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
3344       r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i];
3345     }
3346   #endif
3347 
3348   return simde__m128i_from_private(r_);
3349 #endif
3350 }
3351 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3352 #  define _mm_min_epi16(a, b) simde_mm_min_epi16(a, b)
3353 #endif
3354 
3355 SIMDE_FUNCTION_ATTRIBUTES
3356 simde__m128i
3357 simde_mm_min_epu8 (simde__m128i a, simde__m128i b) {
3358 #if defined(SIMDE_X86_SSE2_NATIVE)
3359   return _mm_min_epu8(a, b);
3360 #else
3361   simde__m128i_private
3362     r_,
3363     a_ = simde__m128i_to_private(a),
3364     b_ = simde__m128i_to_private(b);
3365 
3366   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3367     r_.neon_u8 = vminq_u8(a_.neon_u8, b_.neon_u8);
3368   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
3369     r_.altivec_u8 = vec_min(a_.altivec_u8, b_.altivec_u8);
3370   #else
3371     SIMDE_VECTORIZE
3372     for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
3373       r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i];
3374     }
3375   #endif
3376 
3377   return simde__m128i_from_private(r_);
3378 #endif
3379 }
3380 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3381 #  define _mm_min_epu8(a, b) simde_mm_min_epu8(a, b)
3382 #endif
3383 
3384 SIMDE_FUNCTION_ATTRIBUTES
3385 simde__m128d
3386 simde_mm_min_pd (simde__m128d a, simde__m128d b) {
3387 #if defined(SIMDE_X86_SSE2_NATIVE)
3388   return _mm_min_pd(a, b);
3389 #else
3390   simde__m128d_private
3391     r_,
3392     a_ = simde__m128d_to_private(a),
3393     b_ = simde__m128d_to_private(b);
3394 
3395   #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
3396     r_.altivec_f64 = vec_min(a_.altivec_f64, b_.altivec_f64);
3397   #else
3398     SIMDE_VECTORIZE
3399     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
3400       r_.f64[i] = (a_.f64[i] < b_.f64[i]) ? a_.f64[i] : b_.f64[i];
3401     }
3402   #endif
3403 
3404   return simde__m128d_from_private(r_);
3405 #endif
3406 }
3407 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3408 #  define _mm_min_pd(a, b) simde_mm_min_pd(a, b)
3409 #endif
3410 
3411 SIMDE_FUNCTION_ATTRIBUTES
3412 simde__m128d
3413 simde_mm_min_sd (simde__m128d a, simde__m128d b) {
3414 #if defined(SIMDE_X86_SSE2_NATIVE)
3415   return _mm_min_sd(a, b);
3416 #elif defined(SIMDE_ASSUME_VECTORIZATION)
3417   return simde_mm_move_sd(a, simde_mm_min_pd(a, b));
3418 #else
3419   simde__m128d_private
3420     r_,
3421     a_ = simde__m128d_to_private(a),
3422     b_ = simde__m128d_to_private(b);
3423 
3424   r_.f64[0] = (a_.f64[0] < b_.f64[0]) ? a_.f64[0] : b_.f64[0];
3425   r_.f64[1] = a_.f64[1];
3426 
3427   return simde__m128d_from_private(r_);
3428 #endif
3429 }
3430 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3431 #  define _mm_min_sd(a, b) simde_mm_min_sd(a, b)
3432 #endif
3433 
3434 SIMDE_FUNCTION_ATTRIBUTES
3435 simde__m128i
3436 simde_mm_max_epi16 (simde__m128i a, simde__m128i b) {
3437   #if defined(SIMDE_X86_SSE2_NATIVE)
3438     return _mm_max_epi16(a, b);
3439   #else
3440     simde__m128i_private
3441       r_,
3442       a_ = simde__m128i_to_private(a),
3443       b_ = simde__m128i_to_private(b);
3444 
3445     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3446       r_.neon_i16 = vmaxq_s16(a_.neon_i16, b_.neon_i16);
3447     #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
3448       r_.altivec_i16 = vec_max(a_.altivec_i16, b_.altivec_i16);
3449     #else
3450       SIMDE_VECTORIZE
3451       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
3452         r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? a_.i16[i] : b_.i16[i];
3453       }
3454     #endif
3455 
3456     return simde__m128i_from_private(r_);
3457   #endif
3458 }
3459 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3460 #  define _mm_max_epi16(a, b) simde_mm_max_epi16(a, b)
3461 #endif
3462 
3463 SIMDE_FUNCTION_ATTRIBUTES
3464 simde__m128i
3465 simde_mm_max_epu8 (simde__m128i a, simde__m128i b) {
3466   #if defined(SIMDE_X86_SSE2_NATIVE)
3467     return _mm_max_epu8(a, b);
3468   #else
3469     simde__m128i_private
3470       r_,
3471       a_ = simde__m128i_to_private(a),
3472       b_ = simde__m128i_to_private(b);
3473 
3474     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3475       r_.neon_u8 = vmaxq_u8(a_.neon_u8, b_.neon_u8);
3476     #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
3477       r_.altivec_u8 = vec_max(a_.altivec_u8, b_.altivec_u8);
3478     #else
3479       SIMDE_VECTORIZE
3480       for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
3481         r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i];
3482       }
3483     #endif
3484 
3485     return simde__m128i_from_private(r_);
3486   #endif
3487 }
3488 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3489 #  define _mm_max_epu8(a, b) simde_mm_max_epu8(a, b)
3490 #endif
3491 
3492 SIMDE_FUNCTION_ATTRIBUTES
3493 simde__m128d
3494 simde_mm_max_pd (simde__m128d a, simde__m128d b) {
3495   #if defined(SIMDE_X86_SSE2_NATIVE)
3496     return _mm_max_pd(a, b);
3497   #else
3498     simde__m128d_private
3499       r_,
3500       a_ = simde__m128d_to_private(a),
3501       b_ = simde__m128d_to_private(b);
3502 
3503     #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
3504       r_.altivec_f64 = vec_max(a_.altivec_f64, b_.altivec_f64);
3505     #else
3506       SIMDE_VECTORIZE
3507       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
3508         r_.f64[i] = (a_.f64[i] > b_.f64[i]) ? a_.f64[i] : b_.f64[i];
3509       }
3510     #endif
3511 
3512     return simde__m128d_from_private(r_);
3513   #endif
3514 }
3515 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3516 #  define _mm_max_pd(a, b) simde_mm_max_pd(a, b)
3517 #endif
3518 
3519 SIMDE_FUNCTION_ATTRIBUTES
3520 simde__m128d
3521 simde_mm_max_sd (simde__m128d a, simde__m128d b) {
3522 #if defined(SIMDE_X86_SSE2_NATIVE)
3523   return _mm_max_sd(a, b);
3524 #elif defined(SIMDE_ASSUME_VECTORIZATION)
3525   return simde_mm_move_sd(a, simde_mm_max_pd(a, b));
3526 #else
3527   simde__m128d_private
3528     r_,
3529     a_ = simde__m128d_to_private(a),
3530     b_ = simde__m128d_to_private(b);
3531 
3532   r_.f64[0] = (a_.f64[0] > b_.f64[0]) ? a_.f64[0] : b_.f64[0];
3533   r_.f64[1] = a_.f64[1];
3534 
3535   return simde__m128d_from_private(r_);
3536 #endif
3537 }
3538 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3539 #  define _mm_max_sd(a, b) simde_mm_max_sd(a, b)
3540 #endif
3541 
3542 SIMDE_FUNCTION_ATTRIBUTES
3543 simde__m128i
3544 simde_mm_move_epi64 (simde__m128i a) {
3545 #if defined(SIMDE_X86_SSE2_NATIVE)
3546   return _mm_move_epi64(a);
3547 #else
3548   simde__m128i_private
3549     r_,
3550     a_ = simde__m128i_to_private(a);
3551 
3552 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3553   r_.neon_i64 = vsetq_lane_s64(0, a_.neon_i64, 1);
3554 #else
3555   r_.i64[0] = a_.i64[0];
3556   r_.i64[1] = 0;
3557 #endif
3558 
3559   return simde__m128i_from_private(r_);
3560 #endif
3561 }
3562 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3563 #  define _mm_move_epi64(a) simde_mm_move_epi64(a)
3564 #endif
3565 
3566 SIMDE_FUNCTION_ATTRIBUTES
3567 simde__m128i
3568 simde_mm_mul_epu32 (simde__m128i a, simde__m128i b) {
3569 #if defined(SIMDE_X86_SSE2_NATIVE)
3570   return _mm_mul_epu32(a, b);
3571 #else
3572   simde__m128i_private
3573     r_,
3574     a_ = simde__m128i_to_private(a),
3575     b_ = simde__m128i_to_private(b);
3576 
3577   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
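    /* vmovn_u64 keeps the low 32 bits of each 64-bit lane, i.e. the
     * even-indexed 32-bit elements that _mm_mul_epu32 multiplies. */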
3578     uint32x2_t a_lo = vmovn_u64(a_.neon_u64);
3579     uint32x2_t b_lo = vmovn_u64(b_.neon_u64);
3580     r_.neon_u64 = vmull_u32(a_lo, b_lo);
3581   #else
3582     SIMDE_VECTORIZE
3583     for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
3584       r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[i * 2]) * HEDLEY_STATIC_CAST(uint64_t, b_.u32[i * 2]);
3585     }
3586   #endif
3587 
3588   return simde__m128i_from_private(r_);
3589 #endif
3590 }
3591 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3592 #  define _mm_mul_epu32(a, b) simde_mm_mul_epu32(a, b)
3593 #endif
3594 
3595 SIMDE_FUNCTION_ATTRIBUTES
3596 simde__m128i
3597 simde_x_mm_mul_epi64 (simde__m128i a, simde__m128i b) {
3598   simde__m128i_private
3599     r_,
3600     a_ = simde__m128i_to_private(a),
3601     b_ = simde__m128i_to_private(b);
3602 
3603 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3604   r_.i64 = a_.i64 * b_.i64;
3605 #else
3606   SIMDE_VECTORIZE
3607   for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
3608     r_.i64[i] = a_.i64[i] * b_.i64[i];
3609   }
3610 #endif
3611 
3612   return simde__m128i_from_private(r_);
3613 }
3614 
3615 SIMDE_FUNCTION_ATTRIBUTES
3616 simde__m128i
3617 simde_x_mm_mod_epi64 (simde__m128i a, simde__m128i b) {
3618   simde__m128i_private
3619     r_,
3620     a_ = simde__m128i_to_private(a),
3621     b_ = simde__m128i_to_private(b);
3622 
3623 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3624   r_.i64 = a_.i64 % b_.i64;
3625 #else
3626   SIMDE_VECTORIZE
3627   for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
3628     r_.i64[i] = a_.i64[i] % b_.i64[i];
3629   }
3630 #endif
3631 
3632   return simde__m128i_from_private(r_);
3633 }
3634 
3635 SIMDE_FUNCTION_ATTRIBUTES
3636 simde__m128d
3637 simde_mm_mul_pd (simde__m128d a, simde__m128d b) {
3638 #if defined(SIMDE_X86_SSE2_NATIVE)
3639   return _mm_mul_pd(a, b);
3640 #else
3641   simde__m128d_private
3642     r_,
3643     a_ = simde__m128d_to_private(a),
3644     b_ = simde__m128d_to_private(b);
3645 
3646 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3647   r_.f64 = a_.f64 * b_.f64;
3648 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3649   r_.wasm_v128 = wasm_f64x2_mul(a_.wasm_v128, b_.wasm_v128);
3650 #else
3651   SIMDE_VECTORIZE
3652   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
3653     r_.f64[i] = a_.f64[i] * b_.f64[i];
3654   }
3655 #endif
3656 
3657   return simde__m128d_from_private(r_);
3658 #endif
3659 }
3660 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3661 #  define _mm_mul_pd(a, b) simde_mm_mul_pd(a, b)
3662 #endif
3663 
3664 SIMDE_FUNCTION_ATTRIBUTES
3665 simde__m128d
3666 simde_mm_mul_sd (simde__m128d a, simde__m128d b) {
3667 #if defined(SIMDE_X86_SSE2_NATIVE)
3668   return _mm_mul_sd(a, b);
3669 #elif defined(SIMDE_ASSUME_VECTORIZATION)
3670   return simde_mm_move_sd(a, simde_mm_mul_pd(a, b));
3671 #else
3672   simde__m128d_private
3673     r_,
3674     a_ = simde__m128d_to_private(a),
3675     b_ = simde__m128d_to_private(b);
3676 
3677   r_.f64[0] = a_.f64[0] * b_.f64[0];
3678   r_.f64[1] = a_.f64[1];
3679 
3680   return simde__m128d_from_private(r_);
3681 #endif
3682 }
3683 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3684 #  define _mm_mul_sd(a, b) simde_mm_mul_sd(a, b)
3685 #endif
3686 
3687 SIMDE_FUNCTION_ATTRIBUTES
3688 simde__m64
3689 simde_mm_mul_su32 (simde__m64 a, simde__m64 b) {
3690 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
3691   return _mm_mul_su32(a, b);
3692 #else
3693   simde__m64_private
3694     r_,
3695     a_ = simde__m64_to_private(a),
3696     b_ = simde__m64_to_private(b);
3697 
3698   r_.u64[0] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[0]) * HEDLEY_STATIC_CAST(uint64_t, b_.u32[0]);
3699 
3700   return simde__m64_from_private(r_);
3701 #endif
3702 }
3703 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3704 #  define _mm_mul_su32(a, b) simde_mm_mul_su32(a, b)
3705 #endif
3706 
3707 SIMDE_FUNCTION_ATTRIBUTES
3708 simde__m128i
3709 simde_mm_mulhi_epi16 (simde__m128i a, simde__m128i b) {
3710 #if defined(SIMDE_X86_SSE2_NATIVE)
3711   return _mm_mulhi_epi16(a, b);
3712 #else
3713   simde__m128i_private
3714     r_,
3715     a_ = simde__m128i_to_private(a),
3716     b_ = simde__m128i_to_private(b);
3717 
3718   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
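    /* Form the full 32-bit products, then de-interleave with vuzpq_u16 so
     * that val[1] holds the high 16-bit half of each product. */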
3719     int16x4_t a3210 = vget_low_s16(a_.neon_i16);
3720     int16x4_t b3210 = vget_low_s16(b_.neon_i16);
3721     int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
3722     int16x4_t a7654 = vget_high_s16(a_.neon_i16);
3723     int16x4_t b7654 = vget_high_s16(b_.neon_i16);
3724     int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
3725     uint16x8x2_t rv = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
3726     r_.neon_u16 = rv.val[1];
3727   #else
3728     SIMDE_VECTORIZE
3729     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
3730       r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (HEDLEY_STATIC_CAST(uint32_t, HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) >> 16));
3731     }
3732   #endif
3733 
3734   return simde__m128i_from_private(r_);
3735 #endif
3736 }
3737 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3738 #  define _mm_mulhi_epi16(a, b) simde_mm_mulhi_epi16(a, b)
3739 #endif
3740 
3741 SIMDE_FUNCTION_ATTRIBUTES
3742 simde__m128i
3743 simde_mm_mulhi_epu16 (simde__m128i a, simde__m128i b) {
3744 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
3745   return _mm_mulhi_epu16(a, b);
3746 #else
3747   simde__m128i_private
3748     r_,
3749     a_ = simde__m128i_to_private(a),
3750     b_ = simde__m128i_to_private(b);
3751 
3752   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3753     uint16x4_t a3210 = vget_low_u16(a_.neon_u16);
3754     uint16x4_t b3210 = vget_low_u16(b_.neon_u16);
3755     uint32x4_t ab3210 = vmull_u16(a3210, b3210); /* 3333222211110000 */
3756     uint16x4_t a7654 = vget_high_u16(a_.neon_u16);
3757     uint16x4_t b7654 = vget_high_u16(b_.neon_u16);
3758     uint32x4_t ab7654 = vmull_u16(a7654, b7654); /* 7777666655554444 */
3759     uint16x8x2_t neon_r =
3760             vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
3761     r_.neon_u16 = neon_r.val[1];
3762   #else
3763     SIMDE_VECTORIZE
3764     for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
3765       r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]) >> 16);
3766     }
3767   #endif
3768 
3769   return simde__m128i_from_private(r_);
3770 #endif
3771 }
3772 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3773 #  define _mm_mulhi_epu16(a, b) simde_mm_mulhi_epu16(a, b)
3774 #endif
3775 
3776 SIMDE_FUNCTION_ATTRIBUTES
3777 simde__m128i
3778 simde_mm_mullo_epi16 (simde__m128i a, simde__m128i b) {
3779 #if defined(SIMDE_X86_SSE2_NATIVE)
3780   return _mm_mullo_epi16(a, b);
3781 #else
3782   simde__m128i_private
3783     r_,
3784     a_ = simde__m128i_to_private(a),
3785     b_ = simde__m128i_to_private(b);
3786 
3787   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3788     r_.neon_i16 = vmulq_s16(a_.neon_i16, b_.neon_i16);
3789   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
3790     (void) a_;
3791     (void) b_;
3792     r_.altivec_i16 = vec_mul(a_.altivec_i16, b_.altivec_i16);
3793   #else
3794     SIMDE_VECTORIZE
3795     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
3796       r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]));
3797     }
3798   #endif
3799 
3800   return simde__m128i_from_private(r_);
3801 #endif
3802 }
3803 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3804 #  define _mm_mullo_epi16(a, b) simde_mm_mullo_epi16(a, b)
3805 #endif
3806 
3807 SIMDE_FUNCTION_ATTRIBUTES
3808 simde__m128d
3809 simde_mm_or_pd (simde__m128d a, simde__m128d b) {
3810 #if defined(SIMDE_X86_SSE2_NATIVE)
3811   return _mm_or_pd(a, b);
3812 #else
3813   simde__m128d_private
3814     r_,
3815     a_ = simde__m128d_to_private(a),
3816     b_ = simde__m128d_to_private(b);
3817 
3818 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3819   r_.i32f = a_.i32f | b_.i32f;
3820 #else
3821   SIMDE_VECTORIZE
3822   for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
3823     r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
3824   }
3825 #endif
3826 
3827   return simde__m128d_from_private(r_);
3828 #endif
3829 }
3830 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3831 #  define _mm_or_pd(a, b) simde_mm_or_pd(a, b)
3832 #endif
3833 
3834 SIMDE_FUNCTION_ATTRIBUTES
3835 simde__m128i
3836 simde_mm_or_si128 (simde__m128i a, simde__m128i b) {
3837 #if defined(SIMDE_X86_SSE2_NATIVE)
3838   return _mm_or_si128(a, b);
3839 #else
3840   simde__m128i_private
3841     r_,
3842     a_ = simde__m128i_to_private(a),
3843     b_ = simde__m128i_to_private(b);
3844 
3845   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3846     r_.neon_i32 = vorrq_s32(a_.neon_i32, b_.neon_i32);
3847   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
3848     r_.altivec_i32 = vec_or(a_.altivec_i32, b_.altivec_i32);
3849   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3850     r_.i32f = a_.i32f | b_.i32f;
3851   #else
3852     SIMDE_VECTORIZE
3853     for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
3854       r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
3855     }
3856   #endif
3857 
3858   return simde__m128i_from_private(r_);
3859 #endif
3860 }
3861 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3862 #  define _mm_or_si128(a, b) simde_mm_or_si128(a, b)
3863 #endif
3864 
3865 SIMDE_FUNCTION_ATTRIBUTES
3866 simde__m128i
3867 simde_mm_packs_epi16 (simde__m128i a, simde__m128i b) {
3868 #if defined(SIMDE_X86_SSE2_NATIVE)
3869   return _mm_packs_epi16(a, b);
3870 #else
3871   simde__m128i_private
3872     r_,
3873     a_ = simde__m128i_to_private(a),
3874     b_ = simde__m128i_to_private(b);
3875 
3876 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3877   r_.neon_i8 = vcombine_s8(vqmovn_s16(a_.neon_i16), vqmovn_s16(b_.neon_i16));
3878 #else
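  /* Saturate each signed 16-bit element to the int8_t range; elements from
   * a fill the low 8 bytes of the result and elements from b the high 8. */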
3879   SIMDE_VECTORIZE
3880   for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
3881     r_.i8[i]     = (a_.i16[i] > INT8_MAX) ? INT8_MAX : ((a_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, a_.i16[i]));
3882     r_.i8[i + 8] = (b_.i16[i] > INT8_MAX) ? INT8_MAX : ((b_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, b_.i16[i]));
3883   }
3884 #endif
3885 
3886   return simde__m128i_from_private(r_);
3887 #endif
3888 }
3889 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3890 #  define _mm_packs_epi16(a, b) simde_mm_packs_epi16(a, b)
3891 #endif
3892 
3893 SIMDE_FUNCTION_ATTRIBUTES
3894 simde__m128i
3895 simde_mm_packs_epi32 (simde__m128i a, simde__m128i b) {
3896 #if defined(SIMDE_X86_SSE2_NATIVE)
3897   return _mm_packs_epi32(a, b);
3898 #else
3899   simde__m128i_private
3900     r_,
3901     a_ = simde__m128i_to_private(a),
3902     b_ = simde__m128i_to_private(b);
3903 
3904 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3905   r_.neon_i16 = vcombine_s16(vqmovn_s32(a_.neon_i32), vqmovn_s32(b_.neon_i32));
3906 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
3907   r_.altivec_i16 = vec_packs(a_.altivec_i32, b_.altivec_i32);
3908 #else
3909   SIMDE_VECTORIZE
3910   for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
3911     r_.i16[i]     = (a_.i32[i] > INT16_MAX) ? INT16_MAX : ((a_.i32[i] < INT16_MIN) ? INT16_MIN : HEDLEY_STATIC_CAST(int16_t, a_.i32[i]));
3912     r_.i16[i + 4] = (b_.i32[i] > INT16_MAX) ? INT16_MAX : ((b_.i32[i] < INT16_MIN) ? INT16_MIN : HEDLEY_STATIC_CAST(int16_t, b_.i32[i]));
3913   }
3914 #endif
3915 
3916   return simde__m128i_from_private(r_);
3917 #endif
3918 }
3919 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3920 #  define _mm_packs_epi32(a, b) simde_mm_packs_epi32(a, b)
3921 #endif
3922 
3923 SIMDE_FUNCTION_ATTRIBUTES
3924 simde__m128i
3925 simde_mm_packus_epi16 (simde__m128i a, simde__m128i b) {
3926 #if defined(SIMDE_X86_SSE2_NATIVE)
3927   return _mm_packus_epi16(a, b);
3928 #else
3929   simde__m128i_private
3930     r_,
3931     a_ = simde__m128i_to_private(a),
3932     b_ = simde__m128i_to_private(b);
3933 
3934 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3935   r_.neon_u8 = vcombine_u8(vqmovun_s16(a_.neon_i16), vqmovun_s16(b_.neon_i16));
3936 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
3937   r_.altivec_u8 = vec_packsu(a_.altivec_i16, b_.altivec_i16);
3938 #else
3939   SIMDE_VECTORIZE
3940   for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
3941     r_.u8[i]     = (a_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((a_.i16[i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, a_.i16[i]));
3942     r_.u8[i + 8] = (b_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((b_.i16[i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, b_.i16[i]));
3943   }
3944 #endif
3945 
3946   return simde__m128i_from_private(r_);
3947 #endif
3948 }
3949 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3950 #  define _mm_packus_epi16(a, b) simde_mm_packus_epi16(a, b)
3951 #endif
3952 
3953 SIMDE_FUNCTION_ATTRIBUTES
3954 void
3955 simde_mm_pause (void) {
3956 #if defined(SIMDE_X86_SSE2_NATIVE)
3957   _mm_pause();
3958 #endif
3959 }
3960 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3961 #  define _mm_pause() (simde_mm_pause())
3962 #endif
3963 
3964 SIMDE_FUNCTION_ATTRIBUTES
3965 simde__m128i
3966 simde_mm_sad_epu8 (simde__m128i a, simde__m128i b) {
3967 #if defined(SIMDE_X86_SSE2_NATIVE)
3968   return _mm_sad_epu8(a, b);
3969 #else
3970   simde__m128i_private
3971     r_,
3972     a_ = simde__m128i_to_private(a),
3973     b_ = simde__m128i_to_private(b);
3974 
3975   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3976     uint16x8_t t = vpaddlq_u8(vabdq_u8(a_.neon_u8, b_.neon_u8));
3977     uint16_t r0 = t[0] + t[1] + t[2] + t[3];
3978     uint16_t r4 = t[4] + t[5] + t[6] + t[7];
3979     uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
3980     r_.neon_u16 = vsetq_lane_u16(r4, r, 4);
3981   #else
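    /* Each 64-bit result lane holds the sum of absolute differences over
     * the corresponding group of eight unsigned bytes. */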
3982     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
3983       uint16_t tmp = 0;
3984       SIMDE_VECTORIZE_REDUCTION(+:tmp)
3985       for (size_t j = 0 ; j < ((sizeof(r_.u8) / sizeof(r_.u8[0])) / 2) ; j++) {
3986         const size_t e = j + (i * 8);
3987         tmp += (a_.u8[e] > b_.u8[e]) ? (a_.u8[e] - b_.u8[e]) : (b_.u8[e] - a_.u8[e]);
3988       }
3989       r_.i64[i] = tmp;
3990     }
3991   #endif
3992 
3993   return simde__m128i_from_private(r_);
3994 #endif
3995 }
3996 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3997 #  define _mm_sad_epu8(a, b) simde_mm_sad_epu8(a, b)
3998 #endif
3999 
4000 SIMDE_FUNCTION_ATTRIBUTES
4001 simde__m128i
4002 simde_mm_set_epi8 (int8_t e15, int8_t e14, int8_t e13, int8_t e12,
4003        int8_t e11, int8_t e10, int8_t  e9, int8_t  e8,
4004        int8_t  e7, int8_t  e6, int8_t  e5, int8_t  e4,
4005        int8_t  e3, int8_t  e2, int8_t  e1, int8_t  e0) {
4006 
4007   #if defined(SIMDE_X86_SSE2_NATIVE)
4008     return _mm_set_epi8(
4009       e15, e14, e13, e12, e11, e10,  e9,  e8,
4010        e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0);
4011   #else
4012     simde__m128i_private r_;
4013 
4014     #if defined(SIMDE_WASM_SIMD128_NATIVE)
4015       r_.wasm_v128 = wasm_i8x16_make(
4016          e0,  e1,  e2,  e3,  e4,  e5,  e6,  e7,
4017          e8,  e9, e10, e11, e12, e13, e14, e15);
4018     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
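      /* Note the reversed order: _mm_set_epi8 lists arguments from the
       * highest element down, while vld1q_s8 loads element 0 first. */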
4019       SIMDE_ALIGN_AS(16, int8x16_t) int8_t data[16] = {
4020         e0,  e1,  e2,  e3,
4021         e4,  e5,  e6,  e7,
4022         e8,  e9,  e10, e11,
4023         e12, e13, e14, e15};
4024       r_.neon_i8 = vld1q_s8(data);
4025     #else
4026       r_.i8[ 0] =  e0;
4027       r_.i8[ 1] =  e1;
4028       r_.i8[ 2] =  e2;
4029       r_.i8[ 3] =  e3;
4030       r_.i8[ 4] =  e4;
4031       r_.i8[ 5] =  e5;
4032       r_.i8[ 6] =  e6;
4033       r_.i8[ 7] =  e7;
4034       r_.i8[ 8] =  e8;
4035       r_.i8[ 9] =  e9;
4036       r_.i8[10] = e10;
4037       r_.i8[11] = e11;
4038       r_.i8[12] = e12;
4039       r_.i8[13] = e13;
4040       r_.i8[14] = e14;
4041       r_.i8[15] = e15;
4042     #endif
4043 
4044     return simde__m128i_from_private(r_);
4045   #endif
4046 }
4047 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4048 #  define _mm_set_epi8(e15, e14, e13, e12, e11, e10,  e9,  e8,  e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0) simde_mm_set_epi8(e15, e14, e13, e12, e11, e10,  e9,  e8,  e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0)
4049 #endif
4050 
4051 SIMDE_FUNCTION_ATTRIBUTES
4052 simde__m128i
4053 simde_mm_set_epi16 (int16_t e7, int16_t e6, int16_t e5, int16_t e4,
4054         int16_t e3, int16_t e2, int16_t e1, int16_t e0) {
4055   #if defined(SIMDE_X86_SSE2_NATIVE)
4056     return _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
4057   #else
4058     simde__m128i_private r_;
4059 
4060     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4061       SIMDE_ALIGN_AS(16, int16x8_t) int16_t data[8] = { e0, e1, e2, e3, e4, e5, e6, e7 };
4062       r_.neon_i16 = vld1q_s16(data);
4063     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4064       r_.wasm_v128 = wasm_i16x8_make(e0, e1, e2, e3, e4, e5, e6, e7);
4065     #else
4066       r_.i16[0] = e0;
4067       r_.i16[1] = e1;
4068       r_.i16[2] = e2;
4069       r_.i16[3] = e3;
4070       r_.i16[4] = e4;
4071       r_.i16[5] = e5;
4072       r_.i16[6] = e6;
4073       r_.i16[7] = e7;
4074     #endif
4075 
4076     return simde__m128i_from_private(r_);
4077   #endif
4078 }
4079 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4080 #  define _mm_set_epi16(e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0) simde_mm_set_epi16(e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0)
4081 #endif
4082 
4083 SIMDE_FUNCTION_ATTRIBUTES
4084 simde__m128i
4085 simde_mm_set_epi32 (int32_t e3, int32_t e2, int32_t e1, int32_t e0) {
4086   #if defined(SIMDE_X86_SSE2_NATIVE)
4087     return _mm_set_epi32(e3, e2, e1, e0);
4088   #else
4089     simde__m128i_private r_;
4090 
4091     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4092       SIMDE_ALIGN_AS(16, int32x4_t) int32_t data[4] = { e0, e1, e2, e3 };
4093       r_.neon_i32 = vld1q_s32(data);
4094     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4095       r_.wasm_v128 = wasm_i32x4_make(e0, e1, e2, e3);
4096     #else
4097       r_.i32[0] = e0;
4098       r_.i32[1] = e1;
4099       r_.i32[2] = e2;
4100       r_.i32[3] = e3;
4101     #endif
4102 
4103     return simde__m128i_from_private(r_);
4104   #endif
4105 }
4106 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4107 #  define _mm_set_epi32(e3,  e2,  e1,  e0) simde_mm_set_epi32(e3,  e2,  e1,  e0)
4108 #endif
4109 
4110 SIMDE_FUNCTION_ATTRIBUTES
4111 simde__m128i
4112 simde_mm_set_epi64 (simde__m64 e1, simde__m64 e0) {
4113   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
4114     return _mm_set_epi64(e1, e0);
4115   #else
4116     simde__m128i_private r_;
4117 
4118     r_.m64_private[0] = simde__m64_to_private(e0);
4119     r_.m64_private[1] = simde__m64_to_private(e1);
4120 
4121     return simde__m128i_from_private(r_);
4122   #endif
4123 }
4124 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4125 #  define _mm_set_epi64(e1, e0) (simde_mm_set_epi64((e1), (e0)))
4126 #endif
4127 
4128 SIMDE_FUNCTION_ATTRIBUTES
4129 simde__m128i
4130 simde_mm_set_epi64x (int64_t e1, int64_t e0) {
4131 #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0))
4132   return _mm_set_epi64x(e1, e0);
4133 #else
4134   simde__m128i_private r_;
4135 
4136   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4137     SIMDE_ALIGN_AS(16, int64x2_t) int64_t data[2] = {e0, e1};
4138     r_.neon_i64 = vld1q_s64(data);
4139   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4140     r_.wasm_v128 = wasm_i64x2_make(e0, e1);
4141   #else
4142     r_.i64[0] = e0;
4143     r_.i64[1] = e1;
4144   #endif
4145 
4146   return simde__m128i_from_private(r_);
4147 #endif
4148 }
4149 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4150 #  define _mm_set_epi64x(e1, e0) simde_mm_set_epi64x(e1, e0)
4151 #endif
4152 
4153 SIMDE_FUNCTION_ATTRIBUTES
4154 simde__m128i
4155 simde_x_mm_set_epu8 (uint8_t e15, uint8_t e14, uint8_t e13, uint8_t e12,
4156          uint8_t e11, uint8_t e10, uint8_t  e9, uint8_t  e8,
4157          uint8_t  e7, uint8_t  e6, uint8_t  e5, uint8_t  e4,
4158          uint8_t  e3, uint8_t  e2, uint8_t  e1, uint8_t  e0) {
4159   #if defined(SIMDE_X86_SSE2_NATIVE)
4160     return _mm_set_epi8(
4161       HEDLEY_STATIC_CAST(char, e15), HEDLEY_STATIC_CAST(char, e14), HEDLEY_STATIC_CAST(char, e13), HEDLEY_STATIC_CAST(char, e12),
4162       HEDLEY_STATIC_CAST(char, e11), HEDLEY_STATIC_CAST(char, e10), HEDLEY_STATIC_CAST(char,  e9), HEDLEY_STATIC_CAST(char,  e8),
4163       HEDLEY_STATIC_CAST(char,  e7), HEDLEY_STATIC_CAST(char,  e6), HEDLEY_STATIC_CAST(char,  e5), HEDLEY_STATIC_CAST(char,  e4),
4164       HEDLEY_STATIC_CAST(char,  e3), HEDLEY_STATIC_CAST(char,  e2), HEDLEY_STATIC_CAST(char,  e1), HEDLEY_STATIC_CAST(char,  e0));
4165   #else
4166     simde__m128i_private r_;
4167 
4168     r_.u8[ 0] =  e0; r_.u8[ 1] =  e1; r_.u8[ 2] =  e2; r_.u8[ 3] =  e3;
4169     r_.u8[ 4] =  e4; r_.u8[ 5] =  e5; r_.u8[ 6] =  e6; r_.u8[ 7] =  e7;
4170     r_.u8[ 8] =  e8; r_.u8[ 9] =  e9; r_.u8[10] = e10; r_.u8[11] = e11;
4171     r_.u8[12] = e12; r_.u8[13] = e13; r_.u8[14] = e14; r_.u8[15] = e15;
4172 
4173     return simde__m128i_from_private(r_);
4174   #endif
4175 }
4176 
4177 SIMDE_FUNCTION_ATTRIBUTES
4178 simde__m128i
4179 simde_x_mm_set_epu16 (uint16_t e7, uint16_t e6, uint16_t e5, uint16_t e4,
4180           uint16_t e3, uint16_t e2, uint16_t e1, uint16_t e0) {
4181   #if defined(SIMDE_X86_SSE2_NATIVE)
4182     return _mm_set_epi16(
4183       HEDLEY_STATIC_CAST(short,  e7), HEDLEY_STATIC_CAST(short,  e6), HEDLEY_STATIC_CAST(short,  e5), HEDLEY_STATIC_CAST(short,  e4),
4184       HEDLEY_STATIC_CAST(short,  e3), HEDLEY_STATIC_CAST(short,  e2), HEDLEY_STATIC_CAST(short,  e1), HEDLEY_STATIC_CAST(short,  e0));
4185   #else
4186     simde__m128i_private r_;
4187 
4188     r_.u16[0] = e0; r_.u16[1] = e1; r_.u16[2] = e2; r_.u16[3] = e3;
4189     r_.u16[4] = e4; r_.u16[5] = e5; r_.u16[6] = e6; r_.u16[7] = e7;
4190 
4191     return simde__m128i_from_private(r_);
4192   #endif
4193 }
4194 
4195 SIMDE_FUNCTION_ATTRIBUTES
4196 simde__m128i
4197 simde_x_mm_set_epu32 (uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) {
4198   #if defined(SIMDE_X86_SSE2_NATIVE)
4199     return _mm_set_epi32(
4200       HEDLEY_STATIC_CAST(int,  e3), HEDLEY_STATIC_CAST(int,  e2), HEDLEY_STATIC_CAST(int,  e1), HEDLEY_STATIC_CAST(int,  e0));
4201   #else
4202     simde__m128i_private r_;
4203 
4204     r_.u32[0] = e0;
4205     r_.u32[1] = e1;
4206     r_.u32[2] = e2;
4207     r_.u32[3] = e3;
4208 
4209     return simde__m128i_from_private(r_);
4210   #endif
4211 }
4212 
4213 SIMDE_FUNCTION_ATTRIBUTES
4214 simde__m128i
4215 simde_x_mm_set_epu64x (uint64_t e1, uint64_t e0) {
4216   #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0))
4217     return _mm_set_epi64x(HEDLEY_STATIC_CAST(int64_t,  e1), HEDLEY_STATIC_CAST(int64_t,  e0));
4218   #else
4219     simde__m128i_private r_;
4220 
4221     r_.u64[0] = e0;
4222     r_.u64[1] = e1;
4223 
4224     return simde__m128i_from_private(r_);
4225   #endif
4226 }
4227 
4228 SIMDE_FUNCTION_ATTRIBUTES
4229 simde__m128d
4230 simde_mm_set_pd (simde_float64 e1, simde_float64 e0) {
4231 #if defined(SIMDE_X86_SSE2_NATIVE)
4232   return _mm_set_pd(e1, e0);
4233 #else
4234   simde__m128d_private r_;
4235 
4236   #if defined(SIMDE_WASM_SIMD128_NATIVE)
4237     r_.wasm_v128 = wasm_f64x2_make(e0, e1);
4240   #else
4241     r_.f64[0] = e0;
4242     r_.f64[1] = e1;
4243   #endif
4244 
4245   return simde__m128d_from_private(r_);
4246 #endif
4247 }
4248 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4249 #  define _mm_set_pd(e1, e0) simde_mm_set_pd(e1, e0)
4250 #endif
4251 
4252 SIMDE_FUNCTION_ATTRIBUTES
4253 simde__m128d
4254 simde_mm_set_pd1 (simde_float64 a) {
4255 #if defined(SIMDE_X86_SSE2_NATIVE)
4256   return _mm_set1_pd(a);
4257 #else
4258   simde__m128d_private r_;
4259 
4260   r_.f64[0] = a;
4261   r_.f64[1] = a;
4262 
4263   return simde__m128d_from_private(r_);
4264 #endif
4265 }
4266 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4267 #  define _mm_set_pd1(a) simde_mm_set_pd1(a)
4268 #endif
4269 
4270 SIMDE_FUNCTION_ATTRIBUTES
4271 simde__m128d
4272 simde_mm_set_sd (simde_float64 a) {
4273 #if defined(SIMDE_X86_SSE2_NATIVE)
4274   return _mm_set_sd(a);
4275 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4276   return vsetq_lane_f64(a, vdupq_n_f64(SIMDE_FLOAT64_C(0.0)), 0);
4277 #else
4278   return simde_mm_set_pd(SIMDE_FLOAT64_C(0.0), a);
4279 
4280 #endif
4281 }
4282 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4283 #  define _mm_set_sd(a) simde_mm_set_sd(a)
4284 #endif
4285 
4286 SIMDE_FUNCTION_ATTRIBUTES
4287 simde__m128i
4288 simde_mm_set1_epi8 (int8_t a) {
4289 #if defined(SIMDE_X86_SSE2_NATIVE)
4290   return _mm_set1_epi8(a);
4291 #else
4292   simde__m128i_private r_;
4293 
4294   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4295     r_.neon_i8 = vdupq_n_s8(a);
4296   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4297     r_.wasm_v128 = wasm_i8x16_splat(a);
4298   #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4299     r_.altivec_i8 = vec_splats(HEDLEY_STATIC_CAST(signed char, a));
4300   #else
4301     SIMDE_VECTORIZE
4302     for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
4303       r_.i8[i] = a;
4304     }
4305   #endif
4306 
4307   return simde__m128i_from_private(r_);
4308 #endif
4309 }
4310 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4311 #  define _mm_set1_epi8(a) simde_mm_set1_epi8(a)
4312 #endif
4313 
4314 SIMDE_FUNCTION_ATTRIBUTES
4315 simde__m128i
4316 simde_mm_set1_epi16 (int16_t a) {
4317 #if defined(SIMDE_X86_SSE2_NATIVE)
4318   return _mm_set1_epi16(a);
4319 #else
4320   simde__m128i_private r_;
4321 
4322   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4323     r_.neon_i16 = vdupq_n_s16(a);
4324   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4325     r_.wasm_v128 = wasm_i16x8_splat(a);
4326   #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4327     r_.altivec_i16 = vec_splats(HEDLEY_STATIC_CAST(signed short, a));
4328   #else
4329     SIMDE_VECTORIZE
4330     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
4331       r_.i16[i] = a;
4332     }
4333   #endif
4334 
4335   return simde__m128i_from_private(r_);
4336 #endif
4337 }
4338 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4339 #  define _mm_set1_epi16(a) simde_mm_set1_epi16(a)
4340 #endif
4341 
4342 SIMDE_FUNCTION_ATTRIBUTES
4343 simde__m128i
4344 simde_mm_set1_epi32 (int32_t a) {
4345 #if defined(SIMDE_X86_SSE2_NATIVE)
4346   return _mm_set1_epi32(a);
4347 #else
4348   simde__m128i_private r_;
4349 
4350   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4351     r_.neon_i32 = vdupq_n_s32(a);
4352   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4353     r_.wasm_v128 = wasm_i32x4_splat(a);
4354   #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4355     r_.altivec_i32 = vec_splats(HEDLEY_STATIC_CAST(signed int, a));
4356   #else
4357     SIMDE_VECTORIZE
4358     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
4359       r_.i32[i] = a;
4360     }
4361   #endif
4362 
4363   return simde__m128i_from_private(r_);
4364 #endif
4365 }
4366 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4367 #  define _mm_set1_epi32(a) simde_mm_set1_epi32(a)
4368 #endif
4369 
4370 SIMDE_FUNCTION_ATTRIBUTES
4371 simde__m128i
4372 simde_mm_set1_epi64x (int64_t a) {
4373 #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0))
4374   return _mm_set1_epi64x(a);
4375 #else
4376   simde__m128i_private r_;
4377 
4378   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4379     r_.neon_i64 = vmovq_n_s64(a);
4380   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4381     r_.wasm_v128 = wasm_i64x2_splat(a);
4382   #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4383     r_.altivec_i64 = vec_splats(HEDLEY_STATIC_CAST(signed long long, a));
4384   #else
4385     SIMDE_VECTORIZE
4386     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
4387       r_.i64[i] = a;
4388     }
4389   #endif
4390 
4391   return simde__m128i_from_private(r_);
4392 #endif
4393 }
4394 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4395 #  define _mm_set1_epi64x(a) simde_mm_set1_epi64x(a)
4396 #endif
4397 
4398 SIMDE_FUNCTION_ATTRIBUTES
4399 simde__m128i
4400 simde_mm_set1_epi64 (simde__m64 a) {
4401 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
4402   return _mm_set1_epi64(a);
4403 #else
4404   simde__m64_private a_ = simde__m64_to_private(a);
4405   return simde_mm_set1_epi64x(a_.i64[0]);
4406 #endif
4407 }
4408 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4409 #  define _mm_set1_epi64(a) simde_mm_set1_epi64(a)
4410 #endif
4411 
4412 SIMDE_FUNCTION_ATTRIBUTES
4413 simde__m128i
4414 simde_x_mm_set1_epu8 (uint8_t value) {
4415   #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4416     return simde__m128i_from_altivec_u8(vec_splats(HEDLEY_STATIC_CAST(unsigned char, value)));
4417   #else
4418     return simde_mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, value));
4419   #endif
4420 }
4421 
4422 SIMDE_FUNCTION_ATTRIBUTES
4423 simde__m128i
4424 simde_x_mm_set1_epu16 (uint16_t value) {
4425   #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4426     return simde__m128i_from_altivec_u16(vec_splats(HEDLEY_STATIC_CAST(unsigned short, value)));
4427   #else
4428     return simde_mm_set1_epi16(HEDLEY_STATIC_CAST(int16_t, value));
4429   #endif
4430 }
4431 
4432 SIMDE_FUNCTION_ATTRIBUTES
4433 simde__m128i
4434 simde_x_mm_set1_epu32 (uint32_t value) {
4435   #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4436     return simde__m128i_from_altivec_u32(vec_splats(HEDLEY_STATIC_CAST(unsigned int, value)));
4437   #else
4438     return simde_mm_set1_epi32(HEDLEY_STATIC_CAST(int32_t, value));
4439   #endif
4440 }
4441 
4442 SIMDE_FUNCTION_ATTRIBUTES
4443 simde__m128i
4444 simde_x_mm_set1_epu64 (uint64_t value) {
4445   #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4446     return simde__m128i_from_altivec_u64(vec_splats(HEDLEY_STATIC_CAST(unsigned long long, value)));
4447   #else
4448     return simde_mm_set1_epi64x(HEDLEY_STATIC_CAST(int64_t, value));
4449   #endif
4450 }
4451 
4452 SIMDE_FUNCTION_ATTRIBUTES
4453 simde__m128d
4454 simde_mm_set1_pd (simde_float64 a) {
4455 #if defined(SIMDE_X86_SSE2_NATIVE)
4456   return _mm_set1_pd(a);
4457 #else
4458   simde__m128d_private r_;
4459 
4460   #if defined(SIMDE_WASM_SIMD128_NATIVE)
4461     r_.wasm_v128 = wasm_f64x2_splat(a);
4462   #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4463     r_.altivec_f64 = vec_splats(HEDLEY_STATIC_CAST(double, a));
4464   #else
4465     SIMDE_VECTORIZE
4466     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
4467       r_.f64[i] = a;
4468     }
4469   #endif
4470 
4471   return simde__m128d_from_private(r_);
4472 #endif
4473 }
4474 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4475 #  define _mm_set1_pd(a) simde_mm_set1_pd(a)
4476 #endif
4477 
4478 SIMDE_FUNCTION_ATTRIBUTES
4479 simde__m128i
4480 simde_mm_setr_epi8 (int8_t e15, int8_t e14, int8_t e13, int8_t e12,
4481         int8_t e11, int8_t e10, int8_t  e9, int8_t  e8,
4482         int8_t  e7, int8_t  e6, int8_t  e5, int8_t  e4,
4483         int8_t  e3, int8_t  e2, int8_t  e1, int8_t  e0) {
4484 #if defined(SIMDE_X86_SSE2_NATIVE)
4485   return _mm_setr_epi8(
4486     e15, e14, e13, e12, e11, e10,  e9,    e8,
4487      e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0);
4488 #else
4489   return simde_mm_set_epi8(
4490     e0, e1, e2, e3, e4, e5, e6, e7,
4491     e8, e9, e10, e11, e12, e13, e14, e15);
4492 #endif
4493 }
4494 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4495 #  define _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)
4496 #endif
4497 
4498 SIMDE_FUNCTION_ATTRIBUTES
4499 simde__m128i
4500 simde_mm_setr_epi16 (int16_t e7, int16_t e6, int16_t e5, int16_t e4,
4501          int16_t e3, int16_t e2, int16_t e1, int16_t e0) {
4502 #if defined(SIMDE_X86_SSE2_NATIVE)
4503   return _mm_setr_epi16(e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0);
4504 #else
4505   return simde_mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7);
4506 #endif
4507 }
4508 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4509 #  define _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0)
4510 #endif
4511 
4512 SIMDE_FUNCTION_ATTRIBUTES
4513 simde__m128i
4514 simde_mm_setr_epi32 (int32_t e3, int32_t e2, int32_t e1, int32_t e0) {
4515 #if defined(SIMDE_X86_SSE2_NATIVE)
4516   return _mm_setr_epi32(e3, e2, e1, e0);
4517 #else
4518   return simde_mm_set_epi32(e0, e1, e2, e3);
4519 #endif
4520 }
4521 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4522 #  define _mm_setr_epi32(e3, e2, e1, e0) simde_mm_setr_epi32(e3, e2, e1, e0)
4523 #endif
4524 
4525 SIMDE_FUNCTION_ATTRIBUTES
4526 simde__m128i
4527 simde_mm_setr_epi64 (simde__m64 e1, simde__m64 e0) {
4528 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
4529   return _mm_setr_epi64(e1, e0);
4530 #else
4531   return simde_mm_set_epi64(e0, e1);
4532 #endif
4533 }
4534 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4535 #  define _mm_setr_epi64(e1, e0) (simde_mm_setr_epi64((e1), (e0)))
4536 #endif
4537 
4538 SIMDE_FUNCTION_ATTRIBUTES
4539 simde__m128d
4540 simde_mm_setr_pd (simde_float64 e1, simde_float64 e0) {
4541 #if defined(SIMDE_X86_SSE2_NATIVE)
4542   return _mm_setr_pd(e1, e0);
4543 #else
4544   return simde_mm_set_pd(e0, e1);
4545 #endif
4546 }
4547 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4548 #  define _mm_setr_pd(e1, e0) simde_mm_setr_pd(e1, e0)
4549 #endif
4550 
4551 SIMDE_FUNCTION_ATTRIBUTES
4552 simde__m128d
4553 simde_mm_setzero_pd (void) {
4554 #if defined(SIMDE_X86_SSE2_NATIVE)
4555   return _mm_setzero_pd();
4556 #else
4557   return simde_mm_castsi128_pd(simde_mm_setzero_si128());
4558 #endif
4559 }
4560 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4561 #  define _mm_setzero_pd() simde_mm_setzero_pd()
4562 #endif
4563 
4564 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
4565 HEDLEY_DIAGNOSTIC_PUSH
4566 SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
4567 #endif
4568 
4569 SIMDE_FUNCTION_ATTRIBUTES
4570 simde__m128d
4571 simde_mm_undefined_pd (void) {
4572   simde__m128d_private r_;
4573 
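/* If the native intrinsic is unavailable and the uninitialised-variable diagnostic can't be suppressed, fall back to returning zeros instead of leaving r_ uninitialised. */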
4574 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
4575   r_.n = _mm_undefined_pd();
4576 #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
4577   r_ = simde__m128d_to_private(simde_mm_setzero_pd());
4578 #endif
4579 
4580   return simde__m128d_from_private(r_);
4581 }
4582 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4583 #  define _mm_undefined_pd() simde_mm_undefined_pd()
4584 #endif
4585 
4586 SIMDE_FUNCTION_ATTRIBUTES
4587 simde__m128i
4588 simde_mm_undefined_si128 (void) {
4589   simde__m128i_private r_;
4590 
4591 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
4592   r_.n = _mm_undefined_si128();
4593 #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
4594   r_ = simde__m128i_to_private(simde_mm_setzero_si128());
4595 #endif
4596 
4597   return simde__m128i_from_private(r_);
4598 }
4599 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4600 #  define _mm_undefined_si128() (simde_mm_undefined_si128())
4601 #endif
4602 
4603 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
4604 HEDLEY_DIAGNOSTIC_POP
4605 #endif
4606 
4607 SIMDE_FUNCTION_ATTRIBUTES
4608 simde__m128d
4609 simde_x_mm_setone_pd (void) {
4610   return simde_mm_castps_pd(simde_x_mm_setone_ps());
4611 }
4612 
4613 SIMDE_FUNCTION_ATTRIBUTES
4614 simde__m128i
4615 simde_x_mm_setone_si128 (void) {
4616   return simde_mm_castps_si128(simde_x_mm_setone_ps());
4617 }
4618 
4619 SIMDE_FUNCTION_ATTRIBUTES
4620 simde__m128i
4621 simde_mm_shuffle_epi32 (simde__m128i a, const int imm8)
4622     SIMDE_REQUIRE_RANGE(imm8, 0, 255)  {
4623   simde__m128i_private
4624     r_,
4625     a_ = simde__m128i_to_private(a);
4626 
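  /* Each 2-bit field of imm8 selects one of the four 32-bit lanes of a_. */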
4627   for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
4628     r_.i32[i] = a_.i32[(imm8 >> (i * 2)) & 3];
4629   }
4630 
4631   return simde__m128i_from_private(r_);
4632 }
4633 #if defined(SIMDE_X86_SSE2_NATIVE)
4634 #  define simde_mm_shuffle_epi32(a, imm8) _mm_shuffle_epi32((a), (imm8))
4635 #elif defined(SIMDE_SHUFFLE_VECTOR_)
4636 #  define simde_mm_shuffle_epi32(a, imm8) (__extension__ ({ \
4637       const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
4638       simde__m128i_from_private((simde__m128i_private) { .i32 = \
4639         SIMDE_SHUFFLE_VECTOR_(32, 16, \
4640           (simde__tmp_a_).i32, \
4641           (simde__tmp_a_).i32, \
4642           ((imm8)     ) & 3, \
4643           ((imm8) >> 2) & 3, \
4644           ((imm8) >> 4) & 3, \
4645           ((imm8) >> 6) & 3) }); }))
4646 #endif
4647 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4648 #  define _mm_shuffle_epi32(a, imm8) simde_mm_shuffle_epi32(a, imm8)
4649 #endif
4650 
4651 SIMDE_FUNCTION_ATTRIBUTES
4652 simde__m128d
4653 simde_mm_shuffle_pd (simde__m128d a, simde__m128d b, const int imm8)
4654     SIMDE_REQUIRE_RANGE(imm8, 0, 3)  {
4655   simde__m128d_private
4656     r_,
4657     a_ = simde__m128d_to_private(a),
4658     b_ = simde__m128d_to_private(b);
4659 
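  /* Bit 0 of imm8 picks the lane taken from a, bit 1 the lane taken from b. */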
4660   r_.f64[0] = ((imm8 & 1) == 0) ? a_.f64[0] : a_.f64[1];
4661   r_.f64[1] = ((imm8 & 2) == 0) ? b_.f64[0] : b_.f64[1];
4662 
4663   return simde__m128d_from_private(r_);
4664 }
4665 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
4666 #  define simde_mm_shuffle_pd(a, b, imm8) _mm_shuffle_pd((a), (b), (imm8))
4667 #elif defined(SIMDE_SHUFFLE_VECTOR_)
4668 #  define simde_mm_shuffle_pd(a, b, imm8) (__extension__ ({ \
4669       simde__m128d_from_private((simde__m128d_private) { .f64 = \
4670         SIMDE_SHUFFLE_VECTOR_(64, 16, \
4671           simde__m128d_to_private(a).f64, \
4672           simde__m128d_to_private(b).f64, \
4673           (((imm8)     ) & 1), \
4674           (((imm8) >> 1) & 1) + 2) }); }))
4675 #endif
4676 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4677 #  define _mm_shuffle_pd(a, b, imm8) simde_mm_shuffle_pd(a, b, imm8)
4678 #endif
4679 
4680 SIMDE_FUNCTION_ATTRIBUTES
4681 simde__m128i
4682 simde_mm_shufflehi_epi16 (simde__m128i a, const int imm8)
4683     SIMDE_REQUIRE_RANGE(imm8, 0, 255)  {
4684   simde__m128i_private
4685     r_,
4686     a_ = simde__m128i_to_private(a);
4687 
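  /* Copy the low four 16-bit lanes unchanged, then fill the high four from the high half of a_ using successive 2-bit fields of imm8. */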
4688   SIMDE_VECTORIZE
4689   for (size_t i = 0 ; i < ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i++) {
4690     r_.i16[i] = a_.i16[i];
4691   }
4692   for (size_t i = ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
4693     r_.i16[i] = a_.i16[((imm8 >> ((i - 4) * 2)) & 3) + 4];
4694   }
4695 
4696   return simde__m128i_from_private(r_);
4697 }
4698 #if defined(SIMDE_X86_SSE2_NATIVE)
4699 #  define simde_mm_shufflehi_epi16(a, imm8) _mm_shufflehi_epi16((a), (imm8))
4700 #elif defined(SIMDE_SHUFFLE_VECTOR_)
4701 #  define simde_mm_shufflehi_epi16(a, imm8) (__extension__ ({ \
4702       const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
4703       simde__m128i_from_private((simde__m128i_private) { .i16 = \
4704         SIMDE_SHUFFLE_VECTOR_(16, 16, \
4705           (simde__tmp_a_).i16, \
4706           (simde__tmp_a_).i16, \
4707           0, 1, 2, 3, \
4708           (((imm8)     ) & 3) + 4, \
4709           (((imm8) >> 2) & 3) + 4, \
4710           (((imm8) >> 4) & 3) + 4, \
4711           (((imm8) >> 6) & 3) + 4) }); }))
4712 #endif
4713 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4714 #  define _mm_shufflehi_epi16(a, imm8) simde_mm_shufflehi_epi16(a, imm8)
4715 #endif
4716 
4717 SIMDE_FUNCTION_ATTRIBUTES
4718 simde__m128i
4719 simde_mm_shufflelo_epi16 (simde__m128i a, const int imm8)
4720     SIMDE_REQUIRE_RANGE(imm8, 0, 255)  {
4721   simde__m128i_private
4722     r_,
4723     a_ = simde__m128i_to_private(a);
4724 
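  /* Select the low four 16-bit lanes from the low half of a_ using imm8; the high four lanes pass through unchanged. */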
4725   for (size_t i = 0 ; i < ((sizeof(r_.i16) / sizeof(r_.i16[0])) / 2) ; i++) {
4726     r_.i16[i] = a_.i16[((imm8 >> (i * 2)) & 3)];
4727   }
4728   SIMDE_VECTORIZE
4729   for (size_t i = ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
4730     r_.i16[i] = a_.i16[i];
4731   }
4732 
4733   return simde__m128i_from_private(r_);
4734 }
4735 #if defined(SIMDE_X86_SSE2_NATIVE)
4736 #  define simde_mm_shufflelo_epi16(a, imm8) _mm_shufflelo_epi16((a), (imm8))
4737 #elif defined(SIMDE_SHUFFLE_VECTOR_)
4738 #  define simde_mm_shufflelo_epi16(a, imm8) (__extension__ ({ \
4739       const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
4740       simde__m128i_from_private((simde__m128i_private) { .i16 = \
4741         SIMDE_SHUFFLE_VECTOR_(16, 16, \
4742           (simde__tmp_a_).i16, \
4743           (simde__tmp_a_).i16, \
4744           (((imm8)     ) & 3), \
4745           (((imm8) >> 2) & 3), \
4746           (((imm8) >> 4) & 3), \
4747           (((imm8) >> 6) & 3), \
4748           4, 5, 6, 7) }); }))
4749 #endif
4750 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4751 #  define _mm_shufflelo_epi16(a, imm8) simde_mm_shufflelo_epi16(a, imm8)
4752 #endif
4753 
4754 SIMDE_FUNCTION_ATTRIBUTES
4755 simde__m128i
4756 simde_mm_sll_epi16 (simde__m128i a, simde__m128i count) {
4757 #if defined(SIMDE_X86_SSE2_NATIVE)
4758   return _mm_sll_epi16(a, count);
4759 #else
4760   simde__m128i_private
4761     r_,
4762     a_ = simde__m128i_to_private(a),
4763     count_ = simde__m128i_to_private(count);
4764 
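  /* The shift count is taken from the low 64 bits of count; counts over 15 clear every element. */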
4765   if (count_.u64[0] > 15)
4766     return simde_mm_setzero_si128();
4767 
4768   #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
4769     r_.u16 = (a_.u16 << count_.u64[0]);
4770   #else
4771     SIMDE_VECTORIZE
4772     for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
4773       r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (a_.u16[i] << count_.u64[0]));
4774     }
4775   #endif
4776 
4777   return simde__m128i_from_private(r_);
4778 #endif
4779 }
4780 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4781 #  define _mm_sll_epi16(a, count) simde_mm_sll_epi16((a), (count))
4782 #endif
4783 
4784 SIMDE_FUNCTION_ATTRIBUTES
4785 simde__m128i
4786 simde_mm_sll_epi32 (simde__m128i a, simde__m128i count) {
4787 #if defined(SIMDE_X86_SSE2_NATIVE)
4788   return _mm_sll_epi32(a, count);
4789 #else
4790   simde__m128i_private
4791     r_,
4792     a_ = simde__m128i_to_private(a),
4793     count_ = simde__m128i_to_private(count);
4794 
4795   if (count_.u64[0] > 31)
4796     return simde_mm_setzero_si128();
4797 
4798 #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
4799   r_.u32 = (a_.u32 << count_.u64[0]);
4800 #else
4801   SIMDE_VECTORIZE
4802   for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
4803     r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (a_.u32[i] << count_.u64[0]));
4804   }
4805 #endif
4806 
4807   return simde__m128i_from_private(r_);
4808 #endif
4809 }
4810 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4811 #  define _mm_sll_epi32(a, count) (simde_mm_sll_epi32(a, (count)))
4812 #endif
4813 
4814 SIMDE_FUNCTION_ATTRIBUTES
4815 simde__m128i
4816 simde_mm_sll_epi64 (simde__m128i a, simde__m128i count) {
4817 #if defined(SIMDE_X86_SSE2_NATIVE)
4818   return _mm_sll_epi64(a, count);
4819 #else
4820   simde__m128i_private
4821     r_,
4822     a_ = simde__m128i_to_private(a),
4823     count_ = simde__m128i_to_private(count);
4824 
4825   if (count_.u64[0] > 63)
4826     return simde_mm_setzero_si128();
4827 
4828   const int_fast16_t s = HEDLEY_STATIC_CAST(int_fast16_t, count_.u64[0]);
4829   #if !defined(SIMDE_BUG_GCC_94488)
4830     SIMDE_VECTORIZE
4831   #endif
4832   for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
4833     r_.u64[i] = a_.u64[i] << s;
4834   }
4835 
4836   return simde__m128i_from_private(r_);
4837 #endif
4838 }
4839 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4840 #  define _mm_sll_epi64(a, count) (simde_mm_sll_epi64(a, (count)))
4841 #endif
4842 
4843 SIMDE_FUNCTION_ATTRIBUTES
4844 simde__m128d
4845 simde_mm_sqrt_pd (simde__m128d a) {
4846   #if defined(SIMDE_X86_SSE2_NATIVE)
4847     return _mm_sqrt_pd(a);
4848   #else
4849     simde__m128d_private
4850       r_,
4851       a_ = simde__m128d_to_private(a);
4852 
4853     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4854       r_.neon_f64 = vsqrtq_f64(a_.neon_f64);
4855     #elif defined(simde_math_sqrt)
4856       SIMDE_VECTORIZE
4857       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
4858         r_.f64[i] = simde_math_sqrt(a_.f64[i]);
4859       }
4860     #else
4861       HEDLEY_UNREACHABLE();
4862     #endif
4863 
4864     return simde__m128d_from_private(r_);
4865   #endif
4866 }
4867 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4868 #  define _mm_sqrt_pd(a) simde_mm_sqrt_pd(a)
4869 #endif
4870 
4871 SIMDE_FUNCTION_ATTRIBUTES
4872 simde__m128d
4873 simde_mm_sqrt_sd (simde__m128d a, simde__m128d b) {
4874   #if defined(SIMDE_X86_SSE2_NATIVE)
4875     return _mm_sqrt_sd(a, b);
4876   #elif defined(SIMDE_ASSUME_VECTORIZATION)
4877     return simde_mm_move_sd(a, simde_mm_sqrt_pd(b));
4878   #else
4879     simde__m128d_private
4880       r_,
4881       a_ = simde__m128d_to_private(a),
4882       b_ = simde__m128d_to_private(b);
4883 
4884     #if defined(simde_math_sqrt)
4885       r_.f64[0] = simde_math_sqrt(b_.f64[0]);
4886       r_.f64[1] = a_.f64[1];
4887     #else
4888       HEDLEY_UNREACHABLE();
4889     #endif
4890 
4891     return simde__m128d_from_private(r_);
4892   #endif
4893 }
4894 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4895 #  define _mm_sqrt_sd(a, b) simde_mm_sqrt_sd(a, b)
4896 #endif
4897 
4898 SIMDE_FUNCTION_ATTRIBUTES
4899 simde__m128i
4900 simde_mm_srl_epi16 (simde__m128i a, simde__m128i count) {
4901 #if defined(SIMDE_X86_SSE2_NATIVE)
4902   return _mm_srl_epi16(a, count);
4903 #else
4904   simde__m128i_private
4905     r_,
4906     a_ = simde__m128i_to_private(a),
4907     count_ = simde__m128i_to_private(count);
4908 
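  /* Clamping the count to 16 is enough here: the uint16_t operand is promoted to int, so a shift of 16 still yields zero. */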
4909   const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 16 ? 16 : count_.i64[0]));
4910 
4911   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4912     r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
4913   #else
4914     SIMDE_VECTORIZE
4915     for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
4916       r_.u16[i] = a_.u16[i] >> cnt;
4917     }
4918   #endif
4919 
4920   return simde__m128i_from_private(r_);
4921 #endif
4922 }
4923 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4924   #define _mm_srl_epi16(a, count) (simde_mm_srl_epi16(a, (count)))
4925 #endif
4926 
4927 SIMDE_FUNCTION_ATTRIBUTES
4928 simde__m128i
4929 simde_mm_srl_epi32 (simde__m128i a, simde__m128i count) {
4930 #if defined(SIMDE_X86_SSE2_NATIVE)
4931   return _mm_srl_epi32(a, count);
4932 #else
4933   simde__m128i_private
4934     r_,
4935     a_ = simde__m128i_to_private(a),
4936     count_ = simde__m128i_to_private(count);
4937 
4938   const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 32 ? 32 : count_.i64[0]));
4939 
4940   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4941     r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt)));
4942   #else
4943     SIMDE_VECTORIZE
4944     for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
4945       r_.u32[i] = (cnt > 31) ? 0 : (a_.u32[i] >> cnt); /* shifting a 32-bit value by 32 would be undefined in C */
4946     }
4947   #endif
4948 
4949   return simde__m128i_from_private(r_);
4950 #endif
4951 }
4952 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4953 #  define _mm_srl_epi32(a, count) (simde_mm_srl_epi32(a, (count)))
4954 #endif
4955 
4956 SIMDE_FUNCTION_ATTRIBUTES
4957 simde__m128i
4958 simde_mm_srl_epi64 (simde__m128i a, simde__m128i count) {
4959 #if defined(SIMDE_X86_SSE2_NATIVE)
4960   return _mm_srl_epi64(a, count);
4961 #else
4962   simde__m128i_private
4963     r_,
4964     a_ = simde__m128i_to_private(a),
4965     count_ = simde__m128i_to_private(count);
4966 
4967   const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 64 ? 64 : count_.i64[0]));
4968 
4969   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4970     r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, -cnt)));
4971   #else
4972     #if !defined(SIMDE_BUG_GCC_94488)
4973       SIMDE_VECTORIZE
4974     #endif
4975     for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
4976       r_.u64[i] = (cnt > 63) ? 0 : (a_.u64[i] >> cnt); /* shifting a 64-bit value by 64 would be undefined in C */
4977     }
4978   #endif
4979 
4980   return simde__m128i_from_private(r_);
4981 #endif
4982 }
4983 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4984 #  define _mm_srl_epi64(a, count) (simde_mm_srl_epi64(a, (count)))
4985 #endif
4986 
4987 SIMDE_FUNCTION_ATTRIBUTES
4988 simde__m128i
4989 simde_mm_srai_epi16 (simde__m128i a, const int imm8)
4990     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
4991   /* MSVC requires a range of (0, 255). */
4992   simde__m128i_private
4993     r_,
4994     a_ = simde__m128i_to_private(a);
4995 
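  /* Arithmetic right shifts saturate the count at 15; larger counts just replicate the sign bit. */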
4996   const int cnt = (imm8 & ~15) ? 15 : imm8;
4997 
4998   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4999     r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
5000   #else
5001     SIMDE_VECTORIZE
5002     for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) {
5003       r_.i16[i] = a_.i16[i] >> cnt;
5004     }
5005   #endif
5006 
5007   return simde__m128i_from_private(r_);
5008 }
5009 #if defined(SIMDE_X86_SSE2_NATIVE)
5010   #define simde_mm_srai_epi16(a, imm8) _mm_srai_epi16((a), (imm8))
5011 #endif
5012 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5013   #define _mm_srai_epi16(a, imm8) simde_mm_srai_epi16(a, imm8)
5014 #endif
5015 
5016 SIMDE_FUNCTION_ATTRIBUTES
5017 simde__m128i
5018 simde_mm_srai_epi32 (simde__m128i a, const int imm8)
5019     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
5020   /* MSVC requires a range of (0, 255). */
5021   simde__m128i_private
5022     r_,
5023     a_ = simde__m128i_to_private(a);
5024 
5025   const int cnt = (imm8 & ~31) ? 31 : imm8;
5026 
5027   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5028     r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(-cnt));
5029   #else
5030     SIMDE_VECTORIZE
5031     for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i32[0])) ; i++) {
5032       r_.i32[i] = a_.i32[i] >> cnt;
5033     }
5034   #endif
5035 
5036   return simde__m128i_from_private(r_);
5037 }
5038 #if defined(SIMDE_X86_SSE2_NATIVE)
5039   #define simde_mm_srai_epi32(a, imm8) _mm_srai_epi32((a), (imm8))
5040 #endif
5041 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5042   #define _mm_srai_epi32(a, imm8) simde_mm_srai_epi32(a, imm8)
5043 #endif
5044 
5045 SIMDE_FUNCTION_ATTRIBUTES
5046 simde__m128i
5047 simde_mm_sra_epi16 (simde__m128i a, simde__m128i count) {
5048 #if defined(SIMDE_X86_SSE2_NATIVE)
5049   return _mm_sra_epi16(a, count);
5050 #else
5051   simde__m128i_private
5052     r_,
5053     a_ = simde__m128i_to_private(a),
5054     count_ = simde__m128i_to_private(count);
5055 
5056   const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 15 ? 15 : count_.i64[0]));
5057 
5058   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5059     r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
5060   #else
5061     SIMDE_VECTORIZE
5062     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
5063       r_.i16[i] = a_.i16[i] >> cnt;
5064     }
5065   #endif
5066 
5067   return simde__m128i_from_private(r_);
5068 #endif
5069 }
5070 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5071 #  define _mm_sra_epi16(a, count) (simde_mm_sra_epi16(a, count))
5072 #endif
5073 
5074 SIMDE_FUNCTION_ATTRIBUTES
5075 simde__m128i
5076 simde_mm_sra_epi32 (simde__m128i a, simde__m128i count) {
5077 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_SRA_EPI32)
5078   return _mm_sra_epi32(a, count);
5079 #else
5080   simde__m128i_private
5081     r_,
5082     a_ = simde__m128i_to_private(a),
5083     count_ = simde__m128i_to_private(count);
5084 
5085   const int cnt = count_.u64[0] > 31 ? 31 : HEDLEY_STATIC_CAST(int, count_.u64[0]);
5086 
5087   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5088     r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt)));
5089   #else
5090     SIMDE_VECTORIZE
5091     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
5092       r_.i32[i] = a_.i32[i] >> cnt;
5093     }
5094   #endif
5095 
5096   return simde__m128i_from_private(r_);
5097 #endif
5098 }
5099 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5100 #  define _mm_sra_epi32(a, count) (simde_mm_sra_epi32(a, (count)))
5101 #endif
5102 
5103 SIMDE_FUNCTION_ATTRIBUTES
5104 simde__m128i
5105 simde_mm_slli_epi16 (simde__m128i a, const int imm8)
5106     SIMDE_REQUIRE_RANGE(imm8, 0, 255)  {
5107   if (HEDLEY_UNLIKELY((imm8 > 15))) {
5108     return simde_mm_setzero_si128();
5109   }
5110 
5111   simde__m128i_private
5112     r_,
5113     a_ = simde__m128i_to_private(a);
5114 
5115   #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
5116     r_.i16 = a_.i16 << (imm8 & 0xff);
5117   #else
5118     const int s = (imm8 > HEDLEY_STATIC_CAST(int, sizeof(r_.i16[0]) * CHAR_BIT) - 1) ? 0 : imm8;
5119     SIMDE_VECTORIZE
5120     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
5121       r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << s);
5122     }
5123   #endif
5124 
5125   return simde__m128i_from_private(r_);
5126 }
5127 #if defined(SIMDE_X86_SSE2_NATIVE)
5128 #  define simde_mm_slli_epi16(a, imm8) _mm_slli_epi16(a, imm8)
5129 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__)
5130 #  define simde_mm_slli_epi16(a, imm8) \
5131   simde__m128i_from_neon_u16(vshlq_n_u16(simde__m128i_to_neon_u16(a), (imm8)))
5132 // The above is accepted by gcc/g++ 9 with -march=armv8-a; it might work on A32V8 and elsewhere, but that needs testing.
5133 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) // clang rejects a potentially out-of-range imm8 for vshlq_n even though the macro guards against it
5134 #  define simde_mm_slli_epi16(a, imm8) \
5135      ({                                                            \
5136         simde__m128i ret;                                          \
5137         if ((imm8) <= 0) {                                         \
5138             ret = a;                                               \
5139         } else if ((imm8) > 15) {                                  \
5140             ret = simde_mm_setzero_si128();      \
5141         } else {                                                   \
5142             ret = simde__m128i_from_neon_i16(                      \
5143                 vshlq_n_s16(simde__m128i_to_neon_i16(a), (imm8))); \
5144         }                                                          \
5145         ret;                                                       \
5146     })
5147 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
5148   #define simde_mm_slli_epi16(a, imm8) \
5149     ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sl(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8)))))
5150 #endif
5151 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5152 #  define _mm_slli_epi16(a, imm8) simde_mm_slli_epi16(a, imm8)
5153 #endif
5154 
5155 SIMDE_FUNCTION_ATTRIBUTES
5156 simde__m128i
5157 simde_mm_slli_epi32 (simde__m128i a, const int imm8)
5158     SIMDE_REQUIRE_RANGE(imm8, 0, 255)  {
5159   if (HEDLEY_UNLIKELY((imm8 > 31))) {
5160     return simde_mm_setzero_si128();
5161   }
5162   simde__m128i_private
5163     r_,
5164     a_ = simde__m128i_to_private(a);
5165 
5166   #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
5167     r_.i32 = a_.i32 << imm8;
5168   #else
5169     SIMDE_VECTORIZE
5170     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
5171       r_.i32[i] = a_.i32[i] << (imm8 & 0xff);
5172     }
5173   #endif
5174 
5175   return simde__m128i_from_private(r_);
5176 }
5177 #if defined(SIMDE_X86_SSE2_NATIVE)
5178 #  define simde_mm_slli_epi32(a, imm8) _mm_slli_epi32(a, imm8)
5179 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__)
5180 #  define simde_mm_slli_epi32(a, imm8) \
5181   simde__m128i_from_neon_u32(vshlq_n_u32(simde__m128i_to_neon_u32(a), (imm8)))
5182 // The above is accepted by gcc/g++ 9 with -march=armv8-a; it might work on A32V8 and elsewhere, but that needs testing.
5183 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) // clang rejects a potentially out-of-range imm8 for vshlq_n even though the macro guards against it
5184 #  define simde_mm_slli_epi32(a, imm8) \
5185      ({                                                       \
5186        simde__m128i ret;                                      \
5187        if ((imm8) <= 0) {                                     \
5188          ret = a;                                             \
5189        } else if ((imm8) > 31) {                              \
5190          ret = simde_mm_setzero_si128();                      \
5191        } else {                                               \
5192          ret = simde__m128i_from_neon_i32(                    \
5193            vshlq_n_s32(simde__m128i_to_neon_i32(a), (imm8))); \
5194        }                                                      \
5195        ret;                                                   \
5196     })
5197 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
5198   #define simde_mm_slli_epi32(a, imm8) \
5199      ({                                                            \
5200        simde__m128i ret;                                           \
5201        if ((imm8) <= 0) {                                          \
5202          ret = a;                                                  \
5203        } else if ((imm8) > 31) {                                   \
5204          ret = simde_mm_setzero_si128();                           \
5205        } else {                                                    \
5206          ret = simde__m128i_from_altivec_i32(                      \
5207            vec_sl(simde__m128i_to_altivec_i32(a),                  \
5208              vec_splats(HEDLEY_STATIC_CAST(unsigned int, imm8)))); \
5209        }                                                           \
5210        ret;                                                        \
5211      })
5212 #endif
5213 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5214 #  define _mm_slli_epi32(a, imm8) simde_mm_slli_epi32(a, imm8)
5215 #endif
5216 
5217 SIMDE_FUNCTION_ATTRIBUTES
5218 simde__m128i
5219 simde_mm_slli_epi64 (simde__m128i a, const int imm8)
5220     SIMDE_REQUIRE_RANGE(imm8, 0, 255)  {
5221   if (HEDLEY_UNLIKELY((imm8 > 63))) {
5222     return simde_mm_setzero_si128();
5223   }
5224   simde__m128i_private
5225     r_,
5226     a_ = simde__m128i_to_private(a);
5227 
5228 #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
5229   r_.i64 = a_.i64 << imm8;
5230 #else
5231   SIMDE_VECTORIZE
5232   for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
5233     r_.i64[i] = a_.i64[i] << (imm8 & 0xff);
5234   }
5235 #endif
5236 
5237   return simde__m128i_from_private(r_);
5238 }
5239 #if defined(SIMDE_X86_SSE2_NATIVE)
5240 #  define simde_mm_slli_epi64(a, imm8) _mm_slli_epi64(a, imm8)
5241 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__)
5242 #  define simde_mm_slli_epi64(a, imm8) \
5243   simde__m128i_from_neon_u64(vshlq_n_u64(simde__m128i_to_neon_u64(a), (imm8)))
5244 // The above is accepted by gcc/g++ 9 with -march=armv8-a; it might work on A32V8 and elsewhere, but that needs testing.
5245 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) // clang rejects a potentially out-of-range imm8 for vshlq_n even though the macro guards against it
5246 #  define simde_mm_slli_epi64(a, imm8) \
5247      ({                                                            \
5248         simde__m128i ret;                                          \
5249         if ((imm8) <= 0) {                                         \
5250             ret = a;                                               \
5251         } else if ((imm8) > 63) {                                  \
5252             ret = simde_mm_setzero_si128();                        \
5253         } else {                                                   \
5254             ret = simde__m128i_from_neon_i64(                      \
5255                 vshlq_n_s64(simde__m128i_to_neon_i64(a), (imm8))); \
5256         }                                                          \
5257         ret;                                                       \
5258     })
5259 #endif
5260 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5261 #  define _mm_slli_epi64(a, imm8) simde_mm_slli_epi64(a, imm8)
5262 #endif
5263 
5264 SIMDE_FUNCTION_ATTRIBUTES
5265 simde__m128i
5266 simde_mm_srli_epi16 (simde__m128i a, const int imm8)
5267     SIMDE_REQUIRE_RANGE(imm8, 0, 255)  {
5268   if (HEDLEY_UNLIKELY((imm8 > 15))) {
5269     return simde_mm_setzero_si128();
5270   }
5271   simde__m128i_private
5272     r_,
5273     a_ = simde__m128i_to_private(a);
5274 
5275 #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
5276   r_.u16 = a_.u16 >> imm8;
5277 #else
5278   SIMDE_VECTORIZE
5279   for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
5280     r_.u16[i] = a_.u16[i] >> (imm8 & 0xff);
5281   }
5282 #endif
5283 
5284   return simde__m128i_from_private(r_);
5285 }
5286 #if defined(SIMDE_X86_SSE2_NATIVE)
5287 #  define simde_mm_srli_epi16(a, imm8) _mm_srli_epi16(a, imm8)
5288 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__)
5289 #  define simde_mm_srli_epi16(a, imm8) \
5290   simde__m128i_from_neon_u16(vshrq_n_u16(simde__m128i_to_neon_u16(a), imm8))
5291 // The above is accepted by gcc/g++ 9 with -march=armv8-a; it might work on A32V8 and elsewhere, but that needs testing.
5292 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) // clang rejects a potentially out-of-range imm8 for vshrq_n even though the macro guards against it
5293 #  define simde_mm_srli_epi16(a, imm8) \
5294      ({                                                            \
5295         simde__m128i ret;                                          \
5296         if ((imm8) <= 0) {                                         \
5297             ret = a;                                               \
5298         } else if ((imm8) > 15) {                                  \
5299             ret = simde_mm_setzero_si128();                        \
5300         } else {                                                   \
5301             ret = simde__m128i_from_neon_u16(                      \
5302                 vshrq_n_u16(simde__m128i_to_neon_u16(a), (imm8))); \
5303         }                                                          \
5304         ret;                                                       \
5305     })
5306 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
5307   #define simde_mm_srli_epi16(a, imm8) \
5308     ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sr(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8)))))
5309 #endif
5310 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5311 #  define _mm_srli_epi16(a, imm8) simde_mm_srli_epi16(a, imm8)
5312 #endif
5313 
5314 SIMDE_FUNCTION_ATTRIBUTES
5315 simde__m128i
5316 simde_mm_srli_epi32 (simde__m128i a, const int imm8)
5317     SIMDE_REQUIRE_RANGE(imm8, 0, 255)  {
5318   if (HEDLEY_UNLIKELY((imm8 > 31))) {
5319     return simde_mm_setzero_si128();
5320   }
5321   simde__m128i_private
5322     r_,
5323     a_ = simde__m128i_to_private(a);
5324 
5325 #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
5326   r_.u32 = a_.u32 >> (imm8 & 0xff);
5327 #else
5328   SIMDE_VECTORIZE
5329   for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
5330     r_.u32[i] = a_.u32[i] >> (imm8 & 0xff);
5331   }
5332 #endif
5333 
5334   return simde__m128i_from_private(r_);
5335 }
5336 #if defined(SIMDE_X86_SSE2_NATIVE)
5337 #  define simde_mm_srli_epi32(a, imm8) _mm_srli_epi32(a, imm8)
5338 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__)
5339 #  define simde_mm_srli_epi32(a, imm8) \
5340      simde__m128i_from_neon_u32(vshrq_n_u32(simde__m128i_to_neon_u32(a), imm8))
5341 // The above is accepted by gcc/g++ 9 with -march=armv8-a; it might work on A32V8 and elsewhere, but that needs testing.
5342 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) // clang rejects a potentially out-of-range imm8 for vshrq_n even though the macro guards against it
5343 #  define simde_mm_srli_epi32(a, imm8) \
5344     ({                                                           \
5345         simde__m128i ret;                                        \
5346         if ((imm8) <= 0) {                                       \
5347             ret = a;                                             \
5348         } else if ((imm8) > 31) {                                \
5349             ret = simde_mm_setzero_si128();                      \
5350         } else {                                                 \
5351             ret = simde__m128i_from_neon_u32(                    \
5352               vshrq_n_u32(simde__m128i_to_neon_u32(a), (imm8))); \
5353         }                                                        \
5354         ret;                                                     \
5355     })
5356 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
5357 #  define simde_mm_srli_epi32(a, imm8) \
5358     ({                                                                \
5359         simde__m128i ret;                                             \
5360         if ((imm8) <= 0) {                                            \
5361             ret = a;                                                  \
5362         } else if ((imm8) > 31) {                                     \
5363             ret = simde_mm_setzero_si128();                           \
5364         } else {                                                      \
5365             ret = simde__m128i_from_altivec_i32(                      \
5366               vec_sr(simde__m128i_to_altivec_i32(a),                  \
5367                 vec_splats(HEDLEY_STATIC_CAST(unsigned int, imm8)))); \
5368         }                                                             \
5369         ret;                                                          \
5370     })
5371 #endif
5372 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5373 #  define _mm_srli_epi32(a, imm8) simde_mm_srli_epi32(a, imm8)
5374 #endif
5375 
5376 SIMDE_FUNCTION_ATTRIBUTES
5377 simde__m128i
5378 simde_mm_srli_epi64 (simde__m128i a, const int imm8)
5379     SIMDE_REQUIRE_RANGE(imm8, 0, 255)  {
5380   simde__m128i_private
5381     r_,
5382     a_ = simde__m128i_to_private(a);
5383 
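  /* Any count outside 0..63 clears the register. */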
5384   if (HEDLEY_UNLIKELY((imm8 & 63) != imm8))
5385     return simde_mm_setzero_si128();
5386 
5387   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5388     r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(-imm8));
5389   #else
5390     #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_94488)
5391       r_.u64 = a_.u64 >> imm8;
5392     #else
5393       SIMDE_VECTORIZE
5394       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
5395         r_.u64[i] = a_.u64[i] >> imm8;
5396       }
5397     #endif
5398   #endif
5399 
5400   return simde__m128i_from_private(r_);
5401 }
5402 #if defined(SIMDE_X86_SSE2_NATIVE)
5403 #  define simde_mm_srli_epi64(a, imm8) _mm_srli_epi64(a, imm8)
5404 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__)
5405 #  define simde_mm_srli_epi64(a, imm8) \
5406     ((imm8 == 0) ? (a) : (simde__m128i_from_neon_u64(vshrq_n_u64(simde__m128i_to_neon_u64(a), imm8))))
5407 // The above is accepted by gcc/g++ 9 with -march=armv8-a; it might work on A32V8 and elsewhere, but that needs testing.
5408 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) // clang rejects a potentially out-of-range imm8 for vshrq_n even though the macro guards against it
5409 #  define simde_mm_srli_epi64(a, imm8) \
5410     ({                                                           \
5411         simde__m128i ret;                                        \
5412         if ((imm8) <= 0) {                                       \
5413             ret = a;                                             \
5414         } else if ((imm8) > 63) {                                \
5415             ret = simde_mm_setzero_si128();                      \
5416         } else {                                                 \
5417             ret = simde__m128i_from_neon_u64(                    \
5418               vshrq_n_u64(simde__m128i_to_neon_u64(a), (imm8))); \
5419         }                                                        \
5420         ret;                                                     \
5421     })
5422 #endif
5423 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5424 #  define _mm_srli_epi64(a, imm8) simde_mm_srli_epi64(a, imm8)
5425 #endif
5426 
5427 SIMDE_FUNCTION_ATTRIBUTES
5428 void
5429 simde_mm_store_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) {
5430   simde_assert_aligned(16, mem_addr);
5431 
5432 #if defined(SIMDE_X86_SSE2_NATIVE)
5433   _mm_store_pd(mem_addr, a);
5434 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5435   vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64);
5436 #else
5437   simde_memcpy(mem_addr, &a, sizeof(a));
5438 #endif
5439 }
5440 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5441 #  define _mm_store_pd(mem_addr, a) simde_mm_store_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
5442 #endif
5443 
5444 SIMDE_FUNCTION_ATTRIBUTES
5445 void
5446 simde_mm_store1_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) {
5447   simde_assert_aligned(16, mem_addr);
5448 
5449 #if defined(SIMDE_X86_SSE2_NATIVE)
5450   _mm_store1_pd(mem_addr, a);
5451 #else
5452   simde__m128d_private a_ = simde__m128d_to_private(a);
5453 
5454   mem_addr[0] = a_.f64[0];
5455   mem_addr[1] = a_.f64[0];
5456 #endif
5457 }
5458 #define simde_mm_store_pd1(mem_addr, a) simde_mm_store1_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
5459 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5460 #  define _mm_store1_pd(mem_addr, a) simde_mm_store1_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
5461 #  define _mm_store_pd1(mem_addr, a) simde_mm_store_pd1(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
5462 #endif
5463 
5464 SIMDE_FUNCTION_ATTRIBUTES
5465 void
5466 simde_mm_store_sd (simde_float64* mem_addr, simde__m128d a) {
5467 #if defined(SIMDE_X86_SSE2_NATIVE)
5468   _mm_store_sd(mem_addr, a);
5469 #else
5470   simde__m128d_private a_ = simde__m128d_to_private(a);
5471 
5472 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5473   simde_float64 v = vgetq_lane_f64(a_.neon_f64, 0);
5474   simde_memcpy(mem_addr, &v, sizeof(simde_float64));
5475 #else
5476   simde_float64 v = a_.f64[0];
5477   simde_memcpy(mem_addr, &v, sizeof(simde_float64));
5478 #endif
5479 #endif
5480 }
5481 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5482 #  define _mm_store_sd(mem_addr, a) simde_mm_store_sd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
5483 #endif
5484 
5485 SIMDE_FUNCTION_ATTRIBUTES
5486 void
5487 simde_mm_store_si128 (simde__m128i* mem_addr, simde__m128i a) {
5488 #if defined(SIMDE_X86_SSE2_NATIVE)
5489   _mm_store_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
5490 #else
5491   simde__m128i_private a_ = simde__m128i_to_private(a);
5492 
5493   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5494     vst1q_s32(HEDLEY_REINTERPRET_CAST(int32_t*, mem_addr), a_.neon_i32);
5495   #else
5496     simde_memcpy(SIMDE_ASSUME_ALIGNED(16, mem_addr), &a_, sizeof(a_));
5497   #endif
5498 #endif
5499 }
5500 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5501 #  define _mm_store_si128(mem_addr, a) simde_mm_store_si128(mem_addr, a)
5502 #endif
5503 
5504 SIMDE_FUNCTION_ATTRIBUTES
5505 void
5506 simde_mm_storeh_pd (simde_float64* mem_addr, simde__m128d a) {
5507 #if defined(SIMDE_X86_SSE2_NATIVE)
5508   _mm_storeh_pd(mem_addr, a);
5509 #else
5510   simde__m128d_private a_ = simde__m128d_to_private(a);
5511 
5512   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5513     *mem_addr = vgetq_lane_f64(a_.neon_f64, 1);
5514   #else
5515     *mem_addr = a_.f64[1];
5516   #endif
5517 #endif
5518 }
5519 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5520 #  define _mm_storeh_pd(mem_addr, a) simde_mm_storeh_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
5521 #endif
5522 
5523 SIMDE_FUNCTION_ATTRIBUTES
5524 void
5525 simde_mm_storel_epi64 (simde__m128i* mem_addr, simde__m128i a) {
5526   #if defined(SIMDE_X86_SSE2_NATIVE)
5527     _mm_storel_epi64(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
5528   #else
5529     simde__m128i_private a_ = simde__m128i_to_private(a);
5530     int64_t tmp;
5531 
5532     /* memcpy to prevent aliasing, tmp because we can't take the
5533      * address of a vector element. */
5534 
5535     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5536       tmp = vgetq_lane_s64(a_.neon_i64, 0);
5537     #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
5538       #if defined(SIMDE_BUG_GCC_95227)
5539         (void) a_;
5540       #endif
5541       tmp = vec_extract(a_.altivec_i64, 0);
5542     #else
5543       tmp = a_.i64[0];
5544     #endif
5545 
5546     simde_memcpy(mem_addr, &tmp, sizeof(tmp));
5547   #endif
5548 }
5549 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5550 #  define _mm_storel_epi64(mem_addr, a) simde_mm_storel_epi64(mem_addr, a)
5551 #endif
5552 
5553 SIMDE_FUNCTION_ATTRIBUTES
5554 void
5555 simde_mm_storel_pd (simde_float64* mem_addr, simde__m128d a) {
5556 #if defined(SIMDE_X86_SSE2_NATIVE)
5557   _mm_storel_pd(mem_addr, a);
5558 #else
5559   simde__m128d_private a_ = simde__m128d_to_private(a);
5560 
5561   *mem_addr = a_.f64[0];
5562 #endif
5563 }
5564 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5565 #  define _mm_storel_pd(mem_addr, a) simde_mm_storel_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
5566 #endif
5567 
5568 SIMDE_FUNCTION_ATTRIBUTES
5569 void
5570 simde_mm_storer_pd (simde_float64 mem_addr[2], simde__m128d a) {
5571   simde_assert_aligned(16, mem_addr);
5572 
5573 #if defined(SIMDE_X86_SSE2_NATIVE)
5574   _mm_storer_pd(mem_addr, a);
5575 #else
5576   simde__m128d_private a_ = simde__m128d_to_private(a);
5577 
5578   mem_addr[0] = a_.f64[1];
5579   mem_addr[1] = a_.f64[0];
5580 #endif
5581 }
5582 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5583 #  define _mm_storer_pd(mem_addr, a) simde_mm_storer_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
5584 #endif
5585 
5586 SIMDE_FUNCTION_ATTRIBUTES
5587 void
5588 simde_mm_storeu_pd (simde_float64* mem_addr, simde__m128d a) {
5589 #if defined(SIMDE_X86_SSE2_NATIVE)
5590   _mm_storeu_pd(mem_addr, a);
5591 #else
5592   simde_memcpy(mem_addr, &a, sizeof(a));
5593 #endif
5594 }
5595 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5596 #  define _mm_storeu_pd(mem_addr, a) simde_mm_storeu_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
5597 #endif
5598 
5599 SIMDE_FUNCTION_ATTRIBUTES
5600 void
5601 simde_mm_storeu_si128 (simde__m128i* mem_addr, simde__m128i a) {
5602 #if defined(SIMDE_X86_SSE2_NATIVE)
5603   _mm_storeu_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
5604 #else
5605   simde__m128i_private a_ = simde__m128i_to_private(a);
5606 
5607   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5608     vst1q_s32(HEDLEY_REINTERPRET_CAST(int32_t*, mem_addr), a_.neon_i32);
5609   #else
5610     simde_memcpy(mem_addr, &a_, sizeof(a_));
5611   #endif
5612 #endif
5613 }
5614 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5615 #  define _mm_storeu_si128(mem_addr, a) simde_mm_storeu_si128(mem_addr, a)
5616 #endif
5617 
5618 SIMDE_FUNCTION_ATTRIBUTES
5619 void
5620 simde_mm_stream_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) {
5621   simde_assert_aligned(16, mem_addr);
5622 
5623 #if defined(SIMDE_X86_SSE2_NATIVE)
5624   _mm_stream_pd(mem_addr, a);
5625 #else
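  /* Non-temporal stores are only a cache hint, so a plain copy is a valid fallback. */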
5626   simde_memcpy(mem_addr, &a, sizeof(a));
5627 #endif
5628 }
5629 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5630 #  define _mm_stream_pd(mem_addr, a) simde_mm_stream_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
5631 #endif
5632 
5633 SIMDE_FUNCTION_ATTRIBUTES
5634 void
5635 simde_mm_stream_si128 (simde__m128i* mem_addr, simde__m128i a) {
5636   simde_assert_aligned(16, mem_addr);
5637 
5638 #if defined(SIMDE_X86_SSE2_NATIVE)
5639   _mm_stream_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
5640 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5641   vst1q_s64(HEDLEY_REINTERPRET_CAST(int64_t *, mem_addr), simde__m128i_to_neon_i64(a));
5642 #else
5643   simde_memcpy(mem_addr, &a, sizeof(a));
5644 #endif
5645 }
5646 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5647 #  define _mm_stream_si128(mem_addr, a) simde_mm_stream_si128(mem_addr, a)
5648 #endif
5649 
5650 SIMDE_FUNCTION_ATTRIBUTES
5651 void
5652 simde_mm_stream_si32 (int32_t* mem_addr, int32_t a) {
5653 #if defined(SIMDE_X86_SSE2_NATIVE)
5654   _mm_stream_si32(mem_addr, a);
5655 #else
5656   *mem_addr = a;
5657 #endif
5658 }
5659 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5660 #  define _mm_stream_si32(mem_addr, a) simde_mm_stream_si32(mem_addr, a)
5661 #endif
5662 
5663 SIMDE_FUNCTION_ATTRIBUTES
5664 void
5665 simde_mm_stream_si64 (int64_t* mem_addr, int64_t a) {
5666   *mem_addr = a;
5667 }
5668 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5669 #  define _mm_stream_si64(mem_addr, a) simde_mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(int64_t*, __int64*, mem_addr), a)
5670 #endif
5671 
5672 SIMDE_FUNCTION_ATTRIBUTES
5673 simde__m128i
5674 simde_mm_sub_epi8 (simde__m128i a, simde__m128i b) {
5675 #if defined(SIMDE_X86_SSE2_NATIVE)
5676   return _mm_sub_epi8(a, b);
5677 #else
5678   simde__m128i_private
5679     r_,
5680     a_ = simde__m128i_to_private(a),
5681     b_ = simde__m128i_to_private(b);
5682 
5683   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5684     r_.neon_i8 = vsubq_s8(a_.neon_i8, b_.neon_i8);
5685   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
5686     r_.i8 = a_.i8 - b_.i8;
5687   #else
5688     SIMDE_VECTORIZE
5689     for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
5690       r_.i8[i] = a_.i8[i] - b_.i8[i];
5691     }
5692   #endif
5693 
5694   return simde__m128i_from_private(r_);
5695 #endif
5696 }
5697 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5698 #  define _mm_sub_epi8(a, b) simde_mm_sub_epi8(a, b)
5699 #endif
5700 
5701 SIMDE_FUNCTION_ATTRIBUTES
5702 simde__m128i
5703 simde_mm_sub_epi16 (simde__m128i a, simde__m128i b) {
5704 #if defined(SIMDE_X86_SSE2_NATIVE)
5705   return _mm_sub_epi16(a, b);
5706 #else
5707   simde__m128i_private
5708     r_,
5709     a_ = simde__m128i_to_private(a),
5710     b_ = simde__m128i_to_private(b);
5711 
5712   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5713     r_.neon_i16 = vsubq_s16(a_.neon_i16, b_.neon_i16);
5714   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
5715     r_.i16 = a_.i16 - b_.i16;
5716   #else
5717     SIMDE_VECTORIZE
5718     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
5719       r_.i16[i] = a_.i16[i] - b_.i16[i];
5720     }
5721   #endif
5722 
5723   return simde__m128i_from_private(r_);
5724 #endif
5725 }
5726 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5727 #  define _mm_sub_epi16(a, b) simde_mm_sub_epi16(a, b)
5728 #endif
5729 
5730 SIMDE_FUNCTION_ATTRIBUTES
5731 simde__m128i
5732 simde_mm_sub_epi32 (simde__m128i a, simde__m128i b) {
5733 #if defined(SIMDE_X86_SSE2_NATIVE)
5734   return _mm_sub_epi32(a, b);
5735 #else
5736   simde__m128i_private
5737     r_,
5738     a_ = simde__m128i_to_private(a),
5739     b_ = simde__m128i_to_private(b);
5740 
5741   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5742     r_.neon_i32 = vsubq_s32(a_.neon_i32, b_.neon_i32);
5743   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
5744     r_.i32 = a_.i32 - b_.i32;
5745   #else
5746     SIMDE_VECTORIZE
5747     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
5748       r_.i32[i] = a_.i32[i] - b_.i32[i];
5749     }
5750   #endif
5751 
5752   return simde__m128i_from_private(r_);
5753 #endif
5754 }
5755 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5756 #  define _mm_sub_epi32(a, b) simde_mm_sub_epi32(a, b)
5757 #endif
5758 
5759 SIMDE_FUNCTION_ATTRIBUTES
5760 simde__m128i
5761 simde_mm_sub_epi64 (simde__m128i a, simde__m128i b) {
5762 #if defined(SIMDE_X86_SSE2_NATIVE)
5763   return _mm_sub_epi64(a, b);
5764 #else
5765   simde__m128i_private
5766     r_,
5767     a_ = simde__m128i_to_private(a),
5768     b_ = simde__m128i_to_private(b);
5769 
5770   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5771     r_.neon_i64 = vsubq_s64(a_.neon_i64, b_.neon_i64);
5772   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
5773     r_.i64 = a_.i64 - b_.i64;
5774   #else
5775     SIMDE_VECTORIZE
5776     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
5777       r_.i64[i] = a_.i64[i] - b_.i64[i];
5778     }
5779   #endif
5780 
5781   return simde__m128i_from_private(r_);
5782 #endif
5783 }
5784 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5785 #  define _mm_sub_epi64(a, b) simde_mm_sub_epi64(a, b)
5786 #endif
5787 
5788 SIMDE_FUNCTION_ATTRIBUTES
5789 simde__m128i
5790 simde_x_mm_sub_epu32 (simde__m128i a, simde__m128i b) {
5791   simde__m128i_private
5792     r_,
5793     a_ = simde__m128i_to_private(a),
5794     b_ = simde__m128i_to_private(b);
5795 
5796   #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
5797     r_.u32 = a_.u32 - b_.u32;
5798   #else
5799     SIMDE_VECTORIZE
5800     for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
5801       r_.u32[i] = a_.u32[i] - b_.u32[i];
5802     }
5803   #endif
5804 
5805   return simde__m128i_from_private(r_);
5806 }
5807 
5808 SIMDE_FUNCTION_ATTRIBUTES
5809 simde__m128d
5810 simde_mm_sub_pd (simde__m128d a, simde__m128d b) {
5811 #if defined(SIMDE_X86_SSE2_NATIVE)
5812   return _mm_sub_pd(a, b);
5813 #else
5814   simde__m128d_private
5815     r_,
5816     a_ = simde__m128d_to_private(a),
5817     b_ = simde__m128d_to_private(b);
5818 
5819 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
5820   r_.f64 = a_.f64 - b_.f64;
5821 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
5822   r_.wasm_v128 = wasm_f64x2_sub(a_.wasm_v128, b_.wasm_v128);
5823 #else
5824   SIMDE_VECTORIZE
5825   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
5826     r_.f64[i] = a_.f64[i] - b_.f64[i];
5827   }
5828 #endif
5829 
5830   return simde__m128d_from_private(r_);
5831 #endif
5832 }
5833 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5834 #  define _mm_sub_pd(a, b) simde_mm_sub_pd(a, b)
5835 #endif
5836 
5837 SIMDE_FUNCTION_ATTRIBUTES
5838 simde__m128d
5839 simde_mm_sub_sd (simde__m128d a, simde__m128d b) {
5840 #if defined(SIMDE_X86_SSE2_NATIVE)
5841   return _mm_sub_sd(a, b);
5842 #elif defined(SIMDE_ASSUME_VECTORIZATION)
5843   return simde_mm_move_sd(a, simde_mm_sub_pd(a, b));
5844 #else
5845   simde__m128d_private
5846     r_,
5847     a_ = simde__m128d_to_private(a),
5848     b_ = simde__m128d_to_private(b);
5849 
5850   r_.f64[0] = a_.f64[0] - b_.f64[0];
5851   r_.f64[1] = a_.f64[1];
5852 
5853   return simde__m128d_from_private(r_);
5854 #endif
5855 }
5856 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5857 #  define _mm_sub_sd(a, b) simde_mm_sub_sd(a, b)
5858 #endif
5859 
5860 SIMDE_FUNCTION_ATTRIBUTES
5861 simde__m64
5862 simde_mm_sub_si64 (simde__m64 a, simde__m64 b) {
5863 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
5864   return _mm_sub_si64(a, b);
5865 #else
5866   simde__m64_private
5867     r_,
5868     a_ = simde__m64_to_private(a),
5869     b_ = simde__m64_to_private(b);
5870 
5871 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
5872   r_.i64 = a_.i64 - b_.i64;
5873 #else
5874   r_.i64[0] = a_.i64[0] - b_.i64[0];
5875 #endif
5876 
5877   return simde__m64_from_private(r_);
5878 #endif
5879 }
5880 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5881 #  define _mm_sub_si64(a, b) simde_mm_sub_si64(a, b)
5882 #endif
5883 
5884 SIMDE_FUNCTION_ATTRIBUTES
5885 simde__m128i
5886 simde_mm_subs_epi8 (simde__m128i a, simde__m128i b) {
5887 #if defined(SIMDE_X86_SSE2_NATIVE)
5888   return _mm_subs_epi8(a, b);
5889 #else
5890   simde__m128i_private
5891     r_,
5892     a_ = simde__m128i_to_private(a),
5893     b_ = simde__m128i_to_private(b);
5894 
5895 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5896   r_.neon_i8 = vqsubq_s8(a_.neon_i8, b_.neon_i8);
5897 #else
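  /* Portable fallback: clamp to INT8_MIN/INT8_MAX when the subtraction would overflow. */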
5898   SIMDE_VECTORIZE
5899   for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i8[0])) ; i++) {
5900     if (((b_.i8[i]) > 0 && (a_.i8[i]) < INT8_MIN + (b_.i8[i]))) {
5901       r_.i8[i] = INT8_MIN;
5902     } else if ((b_.i8[i]) < 0 && (a_.i8[i]) > INT8_MAX + (b_.i8[i])) {
5903       r_.i8[i] = INT8_MAX;
5904     } else {
5905       r_.i8[i] = (a_.i8[i]) - (b_.i8[i]);
5906     }
5907   }
5908 #endif
5909 
5910   return simde__m128i_from_private(r_);
5911 #endif
5912 }
5913 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5914 #  define _mm_subs_epi8(a, b) simde_mm_subs_epi8(a, b)
5915 #endif
5916 
5917 SIMDE_FUNCTION_ATTRIBUTES
5918 simde__m128i
5919 simde_mm_subs_epi16 (simde__m128i a, simde__m128i b) {
5920 #if defined(SIMDE_X86_SSE2_NATIVE)
5921   return _mm_subs_epi16(a, b);
5922 #else
5923   simde__m128i_private
5924     r_,
5925     a_ = simde__m128i_to_private(a),
5926     b_ = simde__m128i_to_private(b);
5927 
5928   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5929     r_.neon_i16 = vqsubq_s16(a_.neon_i16, b_.neon_i16);
5930   #else
5931     SIMDE_VECTORIZE
5932     for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) {
5933       if (((b_.i16[i]) > 0 && (a_.i16[i]) < INT16_MIN + (b_.i16[i]))) {
5934         r_.i16[i] = INT16_MIN;
5935       } else if ((b_.i16[i]) < 0 && (a_.i16[i]) > INT16_MAX + (b_.i16[i])) {
5936         r_.i16[i] = INT16_MAX;
5937       } else {
5938         r_.i16[i] = (a_.i16[i]) - (b_.i16[i]);
5939       }
5940     }
5941   #endif
5942 
5943   return simde__m128i_from_private(r_);
5944 #endif
5945 }
5946 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5947 #  define _mm_subs_epi16(a, b) simde_mm_subs_epi16(a, b)
5948 #endif
5949 
5950 SIMDE_FUNCTION_ATTRIBUTES
5951 simde__m128i
5952 simde_mm_subs_epu8 (simde__m128i a, simde__m128i b) {
5953 #if defined(SIMDE_X86_SSE2_NATIVE)
5954   return _mm_subs_epu8(a, b);
5955 #else
5956   simde__m128i_private
5957     r_,
5958     a_ = simde__m128i_to_private(a),
5959     b_ = simde__m128i_to_private(b);
5960 
5961   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5962     r_.neon_u8 = vqsubq_u8(a_.neon_u8, b_.neon_u8);
5963   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
5964     r_.altivec_u8 = vec_subs(a_.altivec_u8, b_.altivec_u8);
5965   #else
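    /* Unsigned saturating subtraction: results that would go below zero clamp to 0. */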
5966     SIMDE_VECTORIZE
5967     for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i8[0])) ; i++) {
5968       const int32_t x = a_.u8[i] - b_.u8[i];
5969       if (x < 0) {
5970         r_.u8[i] = 0;
5971       } else if (x > UINT8_MAX) {
5972         r_.u8[i] = UINT8_MAX;
5973       } else {
5974         r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x);
5975       }
5976     }
5977   #endif
5978 
5979   return simde__m128i_from_private(r_);
5980 #endif
5981 }
5982 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5983 #  define _mm_subs_epu8(a, b) simde_mm_subs_epu8(a, b)
5984 #endif
5985 
5986 SIMDE_FUNCTION_ATTRIBUTES
5987 simde__m128i
5988 simde_mm_subs_epu16 (simde__m128i a, simde__m128i b) {
5989 #if defined(SIMDE_X86_SSE2_NATIVE)
5990   return _mm_subs_epu16(a, b);
5991 #else
5992   simde__m128i_private
5993     r_,
5994     a_ = simde__m128i_to_private(a),
5995     b_ = simde__m128i_to_private(b);
5996 
5997   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5998     r_.neon_u16 = vqsubq_u16(a_.neon_u16, b_.neon_u16);
5999   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
6000     r_.altivec_u16 = vec_subs(a_.altivec_u16, b_.altivec_u16);
6001   #else
6002     SIMDE_VECTORIZE
6003     for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) {
6004       const int32_t x = a_.u16[i] - b_.u16[i];
6005       if (x < 0) {
6006         r_.u16[i] = 0;
6007       } else if (x > UINT16_MAX) {
6008         r_.u16[i] = UINT16_MAX;
6009       } else {
6010         r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x);
6011       }
6012     }
6013   #endif
6014 
6015   return simde__m128i_from_private(r_);
6016 #endif
6017 }
6018 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6019 #  define _mm_subs_epu16(a, b) simde_mm_subs_epu16(a, b)
6020 #endif
6021 
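/* The ucomi* family compares the low double of each operand and returns
 * 0 or 1.  Where <fenv.h> is available, the portable path wraps the
 * comparison in feholdexcept()/fesetenv() so that any floating-point
 * exception flags raised by the compare (e.g. with NaN operands) are
 * discarded, approximating the quiet behaviour of the native
 * instruction. */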
6022 SIMDE_FUNCTION_ATTRIBUTES
6023 int
6024 simde_mm_ucomieq_sd (simde__m128d a, simde__m128d b) {
6025 #if defined(SIMDE_X86_SSE2_NATIVE)
6026   return _mm_ucomieq_sd(a, b);
6027 #else
6028   simde__m128d_private
6029     a_ = simde__m128d_to_private(a),
6030     b_ = simde__m128d_to_private(b);
6031   int r;
6032 
6033 #if defined(SIMDE_HAVE_FENV_H)
6034   fenv_t envp;
6035   int x = feholdexcept(&envp);
6036   r =  a_.f64[0] == b_.f64[0];
6037   if (HEDLEY_LIKELY(x == 0))
6038     fesetenv(&envp);
6039 #else
6040   r =  a_.f64[0] == b_.f64[0];
6041 #endif
6042 
6043   return r;
6044 #endif
6045 }
6046 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6047 #  define _mm_ucomieq_sd(a, b) simde_mm_ucomieq_sd(a, b)
6048 #endif
6049 
6050 SIMDE_FUNCTION_ATTRIBUTES
6051 int
6052 simde_mm_ucomige_sd (simde__m128d a, simde__m128d b) {
6053 #if defined(SIMDE_X86_SSE2_NATIVE)
6054   return _mm_ucomige_sd(a, b);
6055 #else
6056   simde__m128d_private
6057     a_ = simde__m128d_to_private(a),
6058     b_ = simde__m128d_to_private(b);
6059   int r;
6060 
6061 #if defined(SIMDE_HAVE_FENV_H)
6062   fenv_t envp;
6063   int x = feholdexcept(&envp);
6064   r = a_.f64[0] >= b_.f64[0];
6065   if (HEDLEY_LIKELY(x == 0))
6066     fesetenv(&envp);
6067 #else
6068   r = a_.f64[0] >= b_.f64[0];
6069 #endif
6070 
6071   return r;
6072 #endif
6073 }
6074 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6075 #  define _mm_ucomige_sd(a, b) simde_mm_ucomige_sd(a, b)
6076 #endif
6077 
6078 SIMDE_FUNCTION_ATTRIBUTES
6079 int
6080 simde_mm_ucomigt_sd (simde__m128d a, simde__m128d b) {
6081 #if defined(SIMDE_X86_SSE2_NATIVE)
6082   return _mm_ucomigt_sd(a, b);
6083 #else
6084   simde__m128d_private
6085     a_ = simde__m128d_to_private(a),
6086     b_ = simde__m128d_to_private(b);
6087   int r;
6088 
6089 #if defined(SIMDE_HAVE_FENV_H)
6090   fenv_t envp;
6091   int x = feholdexcept(&envp);
6092   r = a_.f64[0] > b_.f64[0];
6093   if (HEDLEY_LIKELY(x == 0))
6094     fesetenv(&envp);
6095 #else
6096   r = a_.f64[0] > b_.f64[0];
6097 #endif
6098 
6099   return r;
6100 #endif
6101 }
6102 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6103 #  define _mm_ucomigt_sd(a, b) simde_mm_ucomigt_sd(a, b)
6104 #endif
6105 
6106 SIMDE_FUNCTION_ATTRIBUTES
6107 int
6108 simde_mm_ucomile_sd (simde__m128d a, simde__m128d b) {
6109 #if defined(SIMDE_X86_SSE2_NATIVE)
6110   return _mm_ucomile_sd(a, b);
6111 #else
6112   simde__m128d_private
6113     a_ = simde__m128d_to_private(a),
6114     b_ = simde__m128d_to_private(b);
6115   int r;
6116 
6117 #if defined(SIMDE_HAVE_FENV_H)
6118   fenv_t envp;
6119   int x = feholdexcept(&envp);
6120   r = a_.f64[0] <= b_.f64[0];
6121   if (HEDLEY_LIKELY(x == 0))
6122     fesetenv(&envp);
6123 #else
6124   r = a_.f64[0] <= b_.f64[0];
6125 #endif
6126 
6127   return r;
6128 #endif
6129 }
6130 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6131 #  define _mm_ucomile_sd(a, b) simde_mm_ucomile_sd(a, b)
6132 #endif
6133 
6134 SIMDE_FUNCTION_ATTRIBUTES
6135 int
6136 simde_mm_ucomilt_sd (simde__m128d a, simde__m128d b) {
6137 #if defined(SIMDE_X86_SSE2_NATIVE)
6138   return _mm_ucomilt_sd(a, b);
6139 #else
6140   simde__m128d_private
6141     a_ = simde__m128d_to_private(a),
6142     b_ = simde__m128d_to_private(b);
6143   int r;
6144 
6145 #if defined(SIMDE_HAVE_FENV_H)
6146   fenv_t envp;
6147   int x = feholdexcept(&envp);
6148   r = a_.f64[0] < b_.f64[0];
6149   if (HEDLEY_LIKELY(x == 0))
6150     fesetenv(&envp);
6151 #else
6152   r = a_.f64[0] < b_.f64[0];
6153 #endif
6154 
6155   return r;
6156 #endif
6157 }
6158 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6159 #  define _mm_ucomilt_sd(a, b) simde_mm_ucomilt_sd(a, b)
6160 #endif
6161 
6162 SIMDE_FUNCTION_ATTRIBUTES
6163 int
6164 simde_mm_ucomineq_sd (simde__m128d a, simde__m128d b) {
6165 #if defined(SIMDE_X86_SSE2_NATIVE)
6166   return _mm_ucomineq_sd(a, b);
6167 #else
6168   simde__m128d_private
6169     a_ = simde__m128d_to_private(a),
6170     b_ = simde__m128d_to_private(b);
6171   int r;
6172 
6173 #if defined(SIMDE_HAVE_FENV_H)
6174   fenv_t envp;
6175   int x = feholdexcept(&envp);
6176   r = a_.f64[0] != b_.f64[0];
6177   if (HEDLEY_LIKELY(x == 0))
6178     fesetenv(&envp);
6179 #else
6180   r = a_.f64[0] != b_.f64[0];
6181 #endif
6182 
6183   return r;
6184 #endif
6185 }
6186 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6187 #  define _mm_ucomineq_sd(a, b) simde_mm_ucomineq_sd(a, b)
6188 #endif
6189 
6190 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
6191   HEDLEY_DIAGNOSTIC_PUSH
6192   SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
6193 #endif
6194 
6195 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
6196   HEDLEY_DIAGNOSTIC_POP
6197 #endif
6198 
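/* Load and full memory fences.  Without native SSE2 support these fall
 * back to simde_mm_sfence(), whose portable implementation issues a full
 * memory barrier, which is at least as strong as the ordering either
 * fence requires. */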
6199 SIMDE_FUNCTION_ATTRIBUTES
6200 void
6201 simde_mm_lfence (void) {
6202 #if defined(SIMDE_X86_SSE2_NATIVE)
6203   _mm_lfence();
6204 #else
6205   simde_mm_sfence();
6206 #endif
6207 }
6208 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6209 #  define _mm_lfence() simde_mm_lfence()
6210 #endif
6211 
6212 SIMDE_FUNCTION_ATTRIBUTES
6213 void
6214 simde_mm_mfence (void) {
6215 #if defined(SIMDE_X86_SSE2_NATIVE)
6216   _mm_mfence();
6217 #else
6218   simde_mm_sfence();
6219 #endif
6220 }
6221 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6222 #  define _mm_mfence() simde_mm_mfence()
6223 #endif
6224 
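/* The unpackhi family interleaves the upper halves of the two operands:
 * for 8-bit lanes the result is { a8, b8, a9, b9, ..., a15, b15 }.  On
 * AArch64 this maps directly to vzip2q_*; on ARMv7 the high halves are
 * extracted and zipped manually. */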
6225 SIMDE_FUNCTION_ATTRIBUTES
6226 simde__m128i
6227 simde_mm_unpackhi_epi8 (simde__m128i a, simde__m128i b) {
6228 #if defined(SIMDE_X86_SSE2_NATIVE)
6229   return _mm_unpackhi_epi8(a, b);
6230 #else
6231   simde__m128i_private
6232     r_,
6233     a_ = simde__m128i_to_private(a),
6234     b_ = simde__m128i_to_private(b);
6235 
6236 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6237   r_.neon_i8 = vzip2q_s8(a_.neon_i8, b_.neon_i8);
6238 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6239   int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(a_.neon_i16));
6240   int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(b_.neon_i16));
6241   int8x8x2_t result = vzip_s8(a1, b1);
6242   r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]);
6243 #elif defined(SIMDE_SHUFFLE_VECTOR_)
6244   r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
6245 #else
6246   SIMDE_VECTORIZE
6247   for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i8[0])) / 2) ; i++) {
6248     r_.i8[(i * 2)]     = a_.i8[i + ((sizeof(r_) / sizeof(r_.i8[0])) / 2)];
6249     r_.i8[(i * 2) + 1] = b_.i8[i + ((sizeof(r_) / sizeof(r_.i8[0])) / 2)];
6250   }
6251 #endif
6252 
6253   return simde__m128i_from_private(r_);
6254 #endif
6255 }
6256 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6257 #  define _mm_unpackhi_epi8(a, b) simde_mm_unpackhi_epi8(a, b)
6258 #endif
6259 
6260 SIMDE_FUNCTION_ATTRIBUTES
6261 simde__m128i
6262 simde_mm_unpackhi_epi16 (simde__m128i a, simde__m128i b) {
6263 #if defined(SIMDE_X86_SSE2_NATIVE)
6264   return _mm_unpackhi_epi16(a, b);
6265 #else
6266   simde__m128i_private
6267     r_,
6268     a_ = simde__m128i_to_private(a),
6269     b_ = simde__m128i_to_private(b);
6270 
6271 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6272   r_.neon_i16 = vzip2q_s16(a_.neon_i16, b_.neon_i16);
6273 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6274   int16x4_t a1 = vget_high_s16(a_.neon_i16);
6275   int16x4_t b1 = vget_high_s16(b_.neon_i16);
6276   int16x4x2_t result = vzip_s16(a1, b1);
6277   r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]);
6278 #elif defined(SIMDE_SHUFFLE_VECTOR_)
6279   r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 4, 12, 5, 13, 6, 14, 7, 15);
6280 #else
6281   SIMDE_VECTORIZE
6282   for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i16[0])) / 2) ; i++) {
6283     r_.i16[(i * 2)]     = a_.i16[i + ((sizeof(r_) / sizeof(r_.i16[0])) / 2)];
6284     r_.i16[(i * 2) + 1] = b_.i16[i + ((sizeof(r_) / sizeof(r_.i16[0])) / 2)];
6285   }
6286 #endif
6287 
6288   return simde__m128i_from_private(r_);
6289 #endif
6290 }
6291 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6292 #  define _mm_unpackhi_epi16(a, b) simde_mm_unpackhi_epi16(a, b)
6293 #endif
6294 
6295 SIMDE_FUNCTION_ATTRIBUTES
6296 simde__m128i
6297 simde_mm_unpackhi_epi32 (simde__m128i a, simde__m128i b) {
6298 #if defined(SIMDE_X86_SSE2_NATIVE)
6299   return _mm_unpackhi_epi32(a, b);
6300 #else
6301   simde__m128i_private
6302     r_,
6303     a_ = simde__m128i_to_private(a),
6304     b_ = simde__m128i_to_private(b);
6305 
6306 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6307   r_.neon_i32 = vzip2q_s32(a_.neon_i32, b_.neon_i32);
6308 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6309   int32x2_t a1 = vget_high_s32(a_.neon_i32);
6310   int32x2_t b1 = vget_high_s32(b_.neon_i32);
6311   int32x2x2_t result = vzip_s32(a1, b1);
6312   r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]);
6313 #elif defined(SIMDE_SHUFFLE_VECTOR_)
6314   r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 2, 6, 3, 7);
6315 #else
6316   SIMDE_VECTORIZE
6317   for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i32[0])) / 2) ; i++) {
6318     r_.i32[(i * 2)]     = a_.i32[i + ((sizeof(r_) / sizeof(r_.i32[0])) / 2)];
6319     r_.i32[(i * 2) + 1] = b_.i32[i + ((sizeof(r_) / sizeof(r_.i32[0])) / 2)];
6320   }
6321 #endif
6322 
6323   return simde__m128i_from_private(r_);
6324 #endif
6325 }
6326 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6327 #  define _mm_unpackhi_epi32(a, b) simde_mm_unpackhi_epi32(a, b)
6328 #endif
6329 
6330 SIMDE_FUNCTION_ATTRIBUTES
6331 simde__m128i
6332 simde_mm_unpackhi_epi64 (simde__m128i a, simde__m128i b) {
6333 #if defined(SIMDE_X86_SSE2_NATIVE)
6334   return _mm_unpackhi_epi64(a, b);
6335 #else
6336   simde__m128i_private
6337     r_,
6338     a_ = simde__m128i_to_private(a),
6339     b_ = simde__m128i_to_private(b);
6340 
6341 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6342   int64x1_t a_h = vget_high_s64(a_.neon_i64);
6343   int64x1_t b_h = vget_high_s64(b_.neon_i64);
6344   r_.neon_i64 = vcombine_s64(a_h, b_h);
6345 #elif defined(SIMDE_SHUFFLE_VECTOR_)
6346   r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 1, 3);
6347 #else
6348   SIMDE_VECTORIZE
6349   for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i64[0])) / 2) ; i++) {
6350     r_.i64[(i * 2)]     = a_.i64[i + ((sizeof(r_) / sizeof(r_.i64[0])) / 2)];
6351     r_.i64[(i * 2) + 1] = b_.i64[i + ((sizeof(r_) / sizeof(r_.i64[0])) / 2)];
6352   }
6353 #endif
6354 
6355   return simde__m128i_from_private(r_);
6356 #endif
6357 }
6358 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6359 #  define _mm_unpackhi_epi64(a, b) simde_mm_unpackhi_epi64(a, b)
6360 #endif
6361 
6362 SIMDE_FUNCTION_ATTRIBUTES
6363 simde__m128d
6364 simde_mm_unpackhi_pd (simde__m128d a, simde__m128d b) {
6365 #if defined(SIMDE_X86_SSE2_NATIVE)
6366   return _mm_unpackhi_pd(a, b);
6367 #else
6368   simde__m128d_private
6369     r_,
6370     a_ = simde__m128d_to_private(a),
6371     b_ = simde__m128d_to_private(b);
6372 
6373 #if defined(SIMDE_SHUFFLE_VECTOR_)
6374   r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 1, 3);
6375 #else
6376   SIMDE_VECTORIZE
6377   for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2) ; i++) {
6378     r_.f64[(i * 2)]     = a_.f64[i + ((sizeof(r_) / sizeof(r_.f64[0])) / 2)];
6379     r_.f64[(i * 2) + 1] = b_.f64[i + ((sizeof(r_) / sizeof(r_.f64[0])) / 2)];
6380   }
6381 #endif
6382 
6383   return simde__m128d_from_private(r_);
6384 #endif
6385 }
6386 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6387 #  define _mm_unpackhi_pd(a, b) simde_mm_unpackhi_pd(a, b)
6388 #endif
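/* The unpacklo family is the mirror image, interleaving the lower halves:
 * for 8-bit lanes the result is { a0, b0, a1, b1, ..., a7, b7 }. */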
6389 
6390 SIMDE_FUNCTION_ATTRIBUTES
6391 simde__m128i
6392 simde_mm_unpacklo_epi8 (simde__m128i a, simde__m128i b) {
6393 #if defined(SIMDE_X86_SSE2_NATIVE)
6394   return _mm_unpacklo_epi8(a, b);
6395 #else
6396   simde__m128i_private
6397     r_,
6398     a_ = simde__m128i_to_private(a),
6399     b_ = simde__m128i_to_private(b);
6400 
6401 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6402   r_.neon_i8 = vzip1q_s8(a_.neon_i8, b_.neon_i8);
6403 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6404   int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(a_.neon_i16));
6405   int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(b_.neon_i16));
6406   int8x8x2_t result = vzip_s8(a1, b1);
6407   r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]);
6408 #elif defined(SIMDE_SHUFFLE_VECTOR_)
6409   r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
6410 #else
6411   SIMDE_VECTORIZE
6412   for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i8[0])) / 2) ; i++) {
6413     r_.i8[(i * 2)]     = a_.i8[i];
6414     r_.i8[(i * 2) + 1] = b_.i8[i];
6415   }
6416 #endif
6417 
6418   return simde__m128i_from_private(r_);
6419 #endif
6420 }
6421 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6422 #  define _mm_unpacklo_epi8(a, b) simde_mm_unpacklo_epi8(a, b)
6423 #endif
6424 
6425 SIMDE_FUNCTION_ATTRIBUTES
6426 simde__m128i
6427 simde_mm_unpacklo_epi16 (simde__m128i a, simde__m128i b) {
6428 #if defined(SIMDE_X86_SSE2_NATIVE)
6429   return _mm_unpacklo_epi16(a, b);
6430 #else
6431   simde__m128i_private
6432     r_,
6433     a_ = simde__m128i_to_private(a),
6434     b_ = simde__m128i_to_private(b);
6435 
6436 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6437   r_.neon_i16 = vzip1q_s16(a_.neon_i16, b_.neon_i16);
6438 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6439   int16x4_t a1 = vget_low_s16(a_.neon_i16);
6440   int16x4_t b1 = vget_low_s16(b_.neon_i16);
6441   int16x4x2_t result = vzip_s16(a1, b1);
6442   r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]);
6443 #elif defined(SIMDE_SHUFFLE_VECTOR_)
6444   r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 0, 8, 1, 9, 2, 10, 3, 11);
6445 #else
6446   SIMDE_VECTORIZE
6447   for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i16[0])) / 2) ; i++) {
6448     r_.i16[(i * 2)]     = a_.i16[i];
6449     r_.i16[(i * 2) + 1] = b_.i16[i];
6450   }
6451 #endif
6452 
6453   return simde__m128i_from_private(r_);
6454 #endif
6455 }
6456 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6457 #  define _mm_unpacklo_epi16(a, b) simde_mm_unpacklo_epi16(a, b)
6458 #endif
6459 
6460 SIMDE_FUNCTION_ATTRIBUTES
6461 simde__m128i
6462 simde_mm_unpacklo_epi32 (simde__m128i a, simde__m128i b) {
6463 #if defined(SIMDE_X86_SSE2_NATIVE)
6464   return _mm_unpacklo_epi32(a, b);
6465 #else
6466   simde__m128i_private
6467     r_,
6468     a_ = simde__m128i_to_private(a),
6469     b_ = simde__m128i_to_private(b);
6470 
6471 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6472   r_.neon_i32 = vzip1q_s32(a_.neon_i32, b_.neon_i32);
6473 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6474   int32x2_t a1 = vget_low_s32(a_.neon_i32);
6475   int32x2_t b1 = vget_low_s32(b_.neon_i32);
6476   int32x2x2_t result = vzip_s32(a1, b1);
6477   r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]);
6478 #elif defined(SIMDE_SHUFFLE_VECTOR_)
6479   r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 0, 4, 1, 5);
6480 #else
6481   SIMDE_VECTORIZE
6482   for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i32[0])) / 2) ; i++) {
6483     r_.i32[(i * 2)]     = a_.i32[i];
6484     r_.i32[(i * 2) + 1] = b_.i32[i];
6485   }
6486 #endif
6487 
6488   return simde__m128i_from_private(r_);
6489 #endif
6490 }
6491 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6492 #  define _mm_unpacklo_epi32(a, b) simde_mm_unpacklo_epi32(a, b)
6493 #endif
6494 
6495 SIMDE_FUNCTION_ATTRIBUTES
6496 simde__m128i
6497 simde_mm_unpacklo_epi64 (simde__m128i a, simde__m128i b) {
6498 #if defined(SIMDE_X86_SSE2_NATIVE)
6499   return _mm_unpacklo_epi64(a, b);
6500 #else
6501   simde__m128i_private
6502     r_,
6503     a_ = simde__m128i_to_private(a),
6504     b_ = simde__m128i_to_private(b);
6505 
6506 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6507   int64x1_t a_l = vget_low_s64(a_.neon_i64);
6508   int64x1_t b_l = vget_low_s64(b_.neon_i64);
6509   r_.neon_i64 = vcombine_s64(a_l, b_l);
6510 #elif defined(SIMDE_SHUFFLE_VECTOR_)
6511   r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 0, 2);
6512 #else
6513   SIMDE_VECTORIZE
6514   for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i64[0])) / 2) ; i++) {
6515     r_.i64[(i * 2)]     = a_.i64[i];
6516     r_.i64[(i * 2) + 1] = b_.i64[i];
6517   }
6518 #endif
6519 
6520   return simde__m128i_from_private(r_);
6521 #endif
6522 }
6523 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6524 #  define _mm_unpacklo_epi64(a, b) simde_mm_unpacklo_epi64(a, b)
6525 #endif
6526 
6527 SIMDE_FUNCTION_ATTRIBUTES
6528 simde__m128d
6529 simde_mm_unpacklo_pd (simde__m128d a, simde__m128d b) {
6530 #if defined(SIMDE_X86_SSE2_NATIVE)
6531   return _mm_unpacklo_pd(a, b);
6532 #else
6533   simde__m128d_private
6534     r_,
6535     a_ = simde__m128d_to_private(a),
6536     b_ = simde__m128d_to_private(b);
6537 
6538 #if defined(SIMDE_SHUFFLE_VECTOR_)
6539   r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 0, 2);
6540 #else
6541   SIMDE_VECTORIZE
6542   for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2) ; i++) {
6543     r_.f64[(i * 2)]     = a_.f64[i];
6544     r_.f64[(i * 2) + 1] = b_.f64[i];
6545   }
6546 #endif
6547 
6548   return simde__m128d_from_private(r_);
6549 #endif
6550 }
6551 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6552 #  define _mm_unpacklo_pd(a, b) simde_mm_unpacklo_pd(a, b)
6553 #endif
6554 
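/* Bitwise XOR of two __m128d values.  C has no XOR operator for
 * floating-point types, so the portable path reinterprets the vectors
 * through the integer i32f view and XORs those; only the resulting bit
 * pattern matters here. */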
6555 SIMDE_FUNCTION_ATTRIBUTES
6556 simde__m128d
6557 simde_mm_xor_pd (simde__m128d a, simde__m128d b) {
6558 #if defined(SIMDE_X86_SSE2_NATIVE)
6559   return _mm_xor_pd(a, b);
6560 #else
6561   simde__m128d_private
6562     r_,
6563     a_ = simde__m128d_to_private(a),
6564     b_ = simde__m128d_to_private(b);
6565 
6566 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6567   r_.i32f = a_.i32f ^ b_.i32f;
6568 #else
6569   SIMDE_VECTORIZE
6570   for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
6571     r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i];
6572   }
6573 #endif
6574 
6575   return simde__m128d_from_private(r_);
6576 #endif
6577 }
6578 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6579 #  define _mm_xor_pd(a, b) simde_mm_xor_pd(a, b)
6580 #endif
6581 
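/* Functions with the simde_x_ prefix (such as the negation and NOT
 * helpers below) are SIMDe-specific extensions with no corresponding
 * Intel intrinsic, so no _mm_* alias is provided for them. */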
6582 SIMDE_FUNCTION_ATTRIBUTES
6583 simde__m128d
6584 simde_x_mm_negate_pd(simde__m128d a) {
6585   #if defined(SIMDE_X86_SSE2_NATIVE)
6586     return simde_mm_xor_pd(a, simde_mm_set1_pd(SIMDE_FLOAT64_C(-0.0)));
6587   #else
6588     simde__m128d_private
6589       r_,
6590       a_ = simde__m128d_to_private(a);
6591 
6592     #if defined(SIMDE_POWER_ALTIVEC_P9_NATIVE)
6593       r_.altivec_f64 = vec_neg(a_.altivec_f64);
6594     #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6595       r_.neon_f64 = vnegq_f64(a_.neon_f64);
6596     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
6597       r_.wasm_v128 = wasm_f64x2_neg(a_.wasm_v128);
6598     #elif defined(SIMDE_VECTOR_OPS)
6599       r_.f64 = -a_.f64;
6600     #else
6601       SIMDE_VECTORIZE
6602       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
6603         r_.f64[i] = -a_.f64[i];
6604       }
6605     #endif
6606 
6607     return simde__m128d_from_private(r_);
6608   #endif
6609 }
6610 
6611 SIMDE_FUNCTION_ATTRIBUTES
6612 simde__m128i
6613 simde_mm_xor_si128 (simde__m128i a, simde__m128i b) {
6614 #if defined(SIMDE_X86_SSE2_NATIVE)
6615   return _mm_xor_si128(a, b);
6616 #else
6617   simde__m128i_private
6618     r_,
6619     a_ = simde__m128i_to_private(a),
6620     b_ = simde__m128i_to_private(b);
6621 
6622   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6623     r_.neon_i32 = veorq_s32(a_.neon_i32, b_.neon_i32);
6624   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
6625     r_.altivec_i32 = vec_xor(a_.altivec_i32, b_.altivec_i32);
6626   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6627     r_.i32f = a_.i32f ^ b_.i32f;
6628   #else
6629     SIMDE_VECTORIZE
6630     for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
6631       r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i];
6632     }
6633   #endif
6634 
6635   return simde__m128i_from_private(r_);
6636 #endif
6637 }
6638 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6639 #  define _mm_xor_si128(a, b) simde_mm_xor_si128(a, b)
6640 #endif
6641 
6642 SIMDE_FUNCTION_ATTRIBUTES
6643 simde__m128i
6644 simde_x_mm_not_si128 (simde__m128i a) {
6645   simde__m128i_private
6646     r_,
6647     a_ = simde__m128i_to_private(a);
6648 
6649 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6650   r_.neon_i32 = vmvnq_s32(a_.neon_i32);
6651 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6652   r_.i32f = ~(a_.i32f);
6653 #else
6654   SIMDE_VECTORIZE
6655   for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
6656     r_.i32f[i] = ~(a_.i32f[i]);
6657   }
6658 #endif
6659 
6660   return simde__m128i_from_private(r_);
6661 }
6662 
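/* SIMDE_MM_SHUFFLE2 packs two one-bit lane selectors into the immediate
 * expected by simde_mm_shuffle_pd: y selects which element of the first
 * operand becomes the low result lane, x selects which element of the
 * second operand becomes the high result lane. */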
6663 #define SIMDE_MM_SHUFFLE2(x, y) (((x) << 1) | (y))
6664 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6665 #  define _MM_SHUFFLE2(x, y) SIMDE_MM_SHUFFLE2(x, y)
6666 #endif
6667 
6668 SIMDE_END_DECLS_
6669 
6670 HEDLEY_DIAGNOSTIC_POP
6671 
6672 #endif /* !defined(SIMDE_X86_SSE2_H) */
6673