/* SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy,
 * modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Copyright:
 *   2017-2020 Evan Nemerson <evan@nemerson.com>
 *   2015-2017 John W. Ratcliff <jratcliffscarab@gmail.com>
 *   2015      Brandon Rowlett <browlett@nvidia.com>
 *   2015      Ken Fast <kfast@gdeb.com>
 *   2017      Hasindu Gamaarachchi <hasindu@unsw.edu.au>
 *   2018      Jeff Daily <jeff.daily@amd.com>
 */

#if !defined(SIMDE_X86_SSE2_H)
#define SIMDE_X86_SSE2_H

#include "sse.h"

HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_

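/* Private view of a 128-bit integer register.  The union overlays portable
 * element arrays (or GCC-style vector types, when SIMDE_VECTOR_SUBSCRIPT is
 * available) with the native register type of whichever backend is compiled
 * in (SSE2, NEON, WASM SIMD128, or AltiVec), so the portable fallbacks and
 * the native fast paths can operate on the same bits.  simde__m128d_private
 * below is the analogous view for a register holding two doubles. */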
typedef union {
#if defined(SIMDE_VECTOR_SUBSCRIPT)
  SIMDE_ALIGN(16) int8_t          i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) int16_t        i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) int32_t        i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) int64_t        i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) uint8_t         u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) uint16_t       u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) uint32_t       u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) uint64_t       u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  #if defined(SIMDE_HAVE_INT128_)
  SIMDE_ALIGN(16) simde_int128  i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  #endif
  SIMDE_ALIGN(16) simde_float32  f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) simde_float64  f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;

  SIMDE_ALIGN(16) int_fast32_t  i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
#else
  SIMDE_ALIGN(16) int8_t         i8[16];
  SIMDE_ALIGN(16) int16_t        i16[8];
  SIMDE_ALIGN(16) int32_t        i32[4];
  SIMDE_ALIGN(16) int64_t        i64[2];
  SIMDE_ALIGN(16) uint8_t        u8[16];
  SIMDE_ALIGN(16) uint16_t       u16[8];
  SIMDE_ALIGN(16) uint32_t       u32[4];
  SIMDE_ALIGN(16) uint64_t       u64[2];
  #if defined(SIMDE_HAVE_INT128_)
  SIMDE_ALIGN(16) simde_int128  i128[1];
  SIMDE_ALIGN(16) simde_uint128 u128[1];
  #endif
  SIMDE_ALIGN(16) simde_float32  f32[4];
  SIMDE_ALIGN(16) simde_float64  f64[2];

  SIMDE_ALIGN(16) int_fast32_t  i32f[16 / sizeof(int_fast32_t)];
  SIMDE_ALIGN(16) uint_fast32_t u32f[16 / sizeof(uint_fast32_t)];
#endif

  SIMDE_ALIGN(16) simde__m64_private m64_private[2];
  SIMDE_ALIGN(16) simde__m64         m64[2];

#if defined(SIMDE_X86_SSE2_NATIVE)
  SIMDE_ALIGN(16) __m128i        n;
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  SIMDE_ALIGN(16) int8x16_t      neon_i8;
  SIMDE_ALIGN(16) int16x8_t      neon_i16;
  SIMDE_ALIGN(16) int32x4_t      neon_i32;
  SIMDE_ALIGN(16) int64x2_t      neon_i64;
  SIMDE_ALIGN(16) uint8x16_t     neon_u8;
  SIMDE_ALIGN(16) uint16x8_t     neon_u16;
  SIMDE_ALIGN(16) uint32x4_t     neon_u32;
  SIMDE_ALIGN(16) uint64x2_t     neon_u64;
  SIMDE_ALIGN(16) float32x4_t    neon_f32;
  #if defined(SIMDE_ARCH_AARCH64)
  SIMDE_ALIGN(16) float64x2_t    neon_f64;
  #endif
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  SIMDE_ALIGN(16) v128_t         wasm_v128;
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed char)          altivec_i8;
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed short)         altivec_i16;
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed int)           altivec_i32;
  #if defined(__INT_FAST32_TYPE__)
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(__INT_FAST32_TYPE__)  altivec_i32f;
  #else
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed int)           altivec_i32f;
  #endif
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed long long)     altivec_i64;
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned char)        altivec_u8;
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned short)       altivec_u16;
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned int)         altivec_u32;
  #if defined(__UINT_FAST32_TYPE__)
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(__UINT_FAST32_TYPE__) altivec_u32f;
  #else
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned int)         altivec_u32f;
  #endif
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long)   altivec_u64;
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(float)                altivec_f32;
  #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
    SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(double)             altivec_f64;
  #endif
#endif
} simde__m128i_private;

typedef union {
#if defined(SIMDE_VECTOR_SUBSCRIPT)
  SIMDE_ALIGN(16) int8_t          i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) int16_t        i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) int32_t        i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) int64_t        i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) uint8_t         u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) uint16_t       u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) uint32_t       u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) uint64_t       u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) simde_float32  f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) simde_float64  f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) int_fast32_t  i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  SIMDE_ALIGN(16) uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
#else
  SIMDE_ALIGN(16) int8_t         i8[16];
  SIMDE_ALIGN(16) int16_t        i16[8];
  SIMDE_ALIGN(16) int32_t        i32[4];
  SIMDE_ALIGN(16) int64_t        i64[2];
  SIMDE_ALIGN(16) uint8_t        u8[16];
  SIMDE_ALIGN(16) uint16_t       u16[8];
  SIMDE_ALIGN(16) uint32_t       u32[4];
  SIMDE_ALIGN(16) uint64_t       u64[2];
  SIMDE_ALIGN(16) simde_float32  f32[4];
  SIMDE_ALIGN(16) simde_float64  f64[2];
  SIMDE_ALIGN(16) int_fast32_t  i32f[16 / sizeof(int_fast32_t)];
  SIMDE_ALIGN(16) uint_fast32_t u32f[16 / sizeof(uint_fast32_t)];
#endif

  SIMDE_ALIGN(16) simde__m64_private m64_private[2];
  SIMDE_ALIGN(16) simde__m64         m64[2];

#if defined(SIMDE_X86_SSE2_NATIVE)
  SIMDE_ALIGN(16) __m128d        n;
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  SIMDE_ALIGN(16) int8x16_t      neon_i8;
  SIMDE_ALIGN(16) int16x8_t      neon_i16;
  SIMDE_ALIGN(16) int32x4_t      neon_i32;
  SIMDE_ALIGN(16) int64x2_t      neon_i64;
  SIMDE_ALIGN(16) uint8x16_t     neon_u8;
  SIMDE_ALIGN(16) uint16x8_t     neon_u16;
  SIMDE_ALIGN(16) uint32x4_t     neon_u32;
  SIMDE_ALIGN(16) uint64x2_t     neon_u64;
  SIMDE_ALIGN(16) float32x4_t    neon_f32;
  #if defined(SIMDE_ARCH_AARCH64)
  SIMDE_ALIGN(16) float64x2_t    neon_f64;
  #endif
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  SIMDE_ALIGN(16) v128_t         wasm_v128;
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed char)          altivec_i8;
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed short)         altivec_i16;
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed int)           altivec_i32;
  #if defined(__INT_FAST32_TYPE__)
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(__INT_FAST32_TYPE__)  altivec_i32f;
  #else
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed int)           altivec_i32f;
  #endif
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed long long)     altivec_i64;
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned char)        altivec_u8;
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned short)       altivec_u16;
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned int)         altivec_u32;
  #if defined(__UINT_FAST32_TYPE__)
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(__UINT_FAST32_TYPE__) altivec_u32f;
  #else
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned int)         altivec_u32f;
  #endif
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long)   altivec_u64;
  SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(float)                altivec_f32;
  #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
    SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(double)             altivec_f64;
  #endif
#endif
} simde__m128d_private;

#if defined(SIMDE_X86_SSE2_NATIVE)
  typedef __m128i simde__m128i;
  typedef __m128d simde__m128d;
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
   typedef int64x2_t simde__m128i;
#  if defined(SIMDE_ARCH_AARCH64)
     typedef float64x2_t simde__m128d;
#  elif defined(SIMDE_VECTOR_SUBSCRIPT)
     typedef simde_float64 simde__m128d SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
#  else
     typedef simde__m128d_private simde__m128d;
#  endif
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
   typedef v128_t simde__m128i;
   typedef v128_t simde__m128d;
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  typedef SIMDE_POWER_ALTIVEC_VECTOR(float) simde__m128i;
  #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
     typedef SIMDE_POWER_ALTIVEC_VECTOR(double) simde__m128d;
  #else
     typedef simde__m128d_private simde__m128d;
  #endif
#elif defined(SIMDE_VECTOR_SUBSCRIPT)
  typedef int64_t simde__m128i SIMDE_ALIGN(16) SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  typedef simde_float64 simde__m128d SIMDE_ALIGN(16) SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
#else
  typedef simde__m128i_private simde__m128i;
  typedef simde__m128d_private simde__m128d;
#endif

#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  typedef simde__m128i __m128i;
  typedef simde__m128d __m128d;
#endif

HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i), "simde__m128i size incorrect");
HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i_private), "simde__m128i_private size incorrect");
HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d), "simde__m128d size incorrect");
HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d_private), "simde__m128d_private size incorrect");
#if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF)
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128i) == 16, "simde__m128i is not 16-byte aligned");
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128i_private) == 16, "simde__m128i_private is not 16-byte aligned");
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128d) == 16, "simde__m128d is not 16-byte aligned");
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128d_private) == 16, "simde__m128d_private is not 16-byte aligned");
#endif

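/* Round-trip helpers between the public types and the private unions.
 * simde_memcpy() is used rather than a pointer cast so the reinterpretation
 * is well-defined under strict aliasing; compilers optimize the copy away. */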
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde__m128i_from_private(simde__m128i_private v) {
  simde__m128i r;
  simde_memcpy(&r, &v, sizeof(r));
  return r;
}

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i_private
simde__m128i_to_private(simde__m128i v) {
  simde__m128i_private r;
  simde_memcpy(&r, &v, sizeof(r));
  return r;
}

SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde__m128d_from_private(simde__m128d_private v) {
  simde__m128d r;
  simde_memcpy(&r, &v, sizeof(r));
  return r;
}

SIMDE_FUNCTION_ATTRIBUTES
simde__m128d_private
simde__m128d_to_private(simde__m128d v) {
  simde__m128d_private r;
  simde_memcpy(&r, &v, sizeof(r));
  return r;
}

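/* On NEON and AltiVec, SIMDE_X86_GENERATE_CONVERSION_FUNCTION expands to a
 * pair of helpers per native type (e.g. simde__m128i_from_neon_i8() and
 * simde__m128i_to_neon_i8()) built on the from/to-private functions above;
 * the macro implementations later in this file rely on them. */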
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int8x16_t, neon, i8)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int16x8_t, neon, i16)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int32x4_t, neon, i32)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int64x2_t, neon, i64)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint8x16_t, neon, u8)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint16x8_t, neon, u16)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint32x4_t, neon, u32)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint64x2_t, neon, u64)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, float32x4_t, neon, f32)
  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, float64x2_t, neon, f64)
  #endif
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed char), altivec, i8)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed short), altivec, i16)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed int), altivec, i32)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), altivec, u32)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64)
  #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64)
  #endif
#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int8x16_t, neon, i8)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int16x8_t, neon, i16)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int32x4_t, neon, i32)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int64x2_t, neon, i64)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint8x16_t, neon, u8)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint16x8_t, neon, u16)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint32x4_t, neon, u32)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint64x2_t, neon, u64)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, float32x4_t, neon, f32)
  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, float64x2_t, neon, f64)
  #endif
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed char), altivec, i8)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed short), altivec, i16)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed int), altivec, i32)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), altivec, u32)
  SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64)
  #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64)
  #endif

  #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
    #if defined(SIMDE_BUG_GCC_95782)
      SIMDE_FUNCTION_ATTRIBUTES
      SIMDE_POWER_ALTIVEC_VECTOR(double)
      simde__m128d_to_altivec_f64(simde__m128d value) {
        simde__m128d_private r_ = simde__m128d_to_private(value);
        return r_.altivec_f64;
      }

      SIMDE_FUNCTION_ATTRIBUTES
      simde__m128d
      simde__m128d_from_altivec_f64(SIMDE_POWER_ALTIVEC_VECTOR(double) value) {
        simde__m128d_private r_;
        r_.altivec_f64 = value;
        return simde__m128d_from_private(r_);
      }
    #else
      SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(double), altivec, f64)
    #endif
  #endif
#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */

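/* The operations below all follow the same dispatch pattern: call the native
 * x86 intrinsic when SIMDE_X86_SSE2_NATIVE is defined, otherwise convert to
 * the private view and take the best available fallback, in order: NEON,
 * AltiVec or WASM SIMD128 where an implementation exists, GCC vector
 * extension operators, and finally a scalar loop annotated with
 * SIMDE_VECTORIZE as a hint to the autovectorizer. */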
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_add_epi8 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_add_epi8(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i8 = vaddq_s8(a_.neon_i8, b_.neon_i8);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_i8 = vec_add(a_.altivec_i8, b_.altivec_i8);
  #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.i8 = a_.i8 + b_.i8;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
      r_.i8[i] = a_.i8[i] + b_.i8[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_add_epi8(a, b) simde_mm_add_epi8(a, b)
#endif
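
/* Illustrative usage sketch (the lane values here are hypothetical):
 *
 *   simde__m128i a   = simde_mm_set1_epi8(100);
 *   simde__m128i b   = simde_mm_set1_epi8(28);
 *   simde__m128i sum = simde_mm_add_epi8(a, b);
 *
 * Each lane holds 100 + 28 = 128, which wraps to -128 in int8_t; contrast
 * with simde_mm_adds_epi8() further down, which saturates to 127 instead. */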

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_add_epi16 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_add_epi16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i16 = vaddq_s16(a_.neon_i16, b_.neon_i16);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_i16 = vec_add(a_.altivec_i16, b_.altivec_i16);
  #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.i16 = a_.i16 + b_.i16;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
      r_.i16[i] = a_.i16[i] + b_.i16[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_add_epi16(a, b) simde_mm_add_epi16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_add_epi32 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_add_epi32(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i32 = vaddq_s32(a_.neon_i32, b_.neon_i32);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_i32 = vec_add(a_.altivec_i32, b_.altivec_i32);
  #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.i32 = a_.i32 + b_.i32;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.i32[i] = a_.i32[i] + b_.i32[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_add_epi32(a, b) simde_mm_add_epi32(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_add_epi64 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_add_epi64(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i64 = vaddq_s64(a_.neon_i64, b_.neon_i64);
  #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
    r_.altivec_i64 = vec_add(a_.altivec_i64, b_.altivec_i64);
  #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.i64 = a_.i64 + b_.i64;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
      r_.i64[i] = a_.i64[i] + b_.i64[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_add_epi64(a, b) simde_mm_add_epi64(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_add_pd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_add_pd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  r_.neon_f64 = vaddq_f64(a_.neon_f64, b_.neon_f64);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  r_.wasm_v128 = wasm_f64x2_add(a_.wasm_v128, b_.wasm_v128);
#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
  r_.altivec_f64 = vec_add(a_.altivec_f64, b_.altivec_f64);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  r_.f64 = a_.f64 + b_.f64;
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
    r_.f64[i] = a_.f64[i] + b_.f64[i];
  }
#endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_add_pd(a, b) simde_mm_add_pd(a, b)
#endif

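/* simde_mm_move_sd(a, b) returns { b[0], a[1] }.  It is defined ahead of the
 * scalar (*_sd) operations because, when SIMDE_ASSUME_VECTORIZATION is set,
 * those are implemented as the packed operation followed by this merge. */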
SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_move_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_move_sd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  r_.neon_f64 = vsetq_lane_f64(vgetq_lane_f64(b_.neon_f64, 0), a_.neon_f64, 0);
#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
  SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) m = {
    16, 17, 18, 19, 20, 21, 22, 23,
     8,  9, 10, 11, 12, 13, 14, 15
  };
  r_.altivec_f64 = vec_perm(a_.altivec_f64, b_.altivec_f64, m);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
  r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 2, 1);
#else
  r_.f64[0] = b_.f64[0];
  r_.f64[1] = a_.f64[1];
#endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_move_sd(a, b) simde_mm_move_sd(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_add_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_add_sd(a, b);
#elif defined(SIMDE_ASSUME_VECTORIZATION)
  return simde_mm_move_sd(a, simde_mm_add_pd(a, b));
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  r_.f64[0] = a_.f64[0] + b_.f64[0];
  r_.f64[1] = a_.f64[1];

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_add_sd(a, b) simde_mm_add_sd(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m64
simde_mm_add_si64 (simde__m64 a, simde__m64 b) {
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  return _mm_add_si64(a, b);
#else
  simde__m64_private
    r_,
    a_ = simde__m64_to_private(a),
    b_ = simde__m64_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_i64 = vadd_s64(a_.neon_i64, b_.neon_i64);
#else
  r_.i64[0] = a_.i64[0] + b_.i64[0];
#endif

  return simde__m64_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_add_si64(a, b) simde_mm_add_si64(a, b)
#endif

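/* The adds/addus family performs saturating addition: out-of-range results
 * are clamped to the representable range instead of wrapping.  For example
 * (values hypothetical), simde_mm_adds_epi8 on lanes holding 100 and 100
 * yields 127 (INT8_MAX), and simde_mm_adds_epu8 on 200 and 100 yields 255. */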
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_adds_epi8 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_adds_epi8(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_i8 = vqaddq_s8(a_.neon_i8, b_.neon_i8);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  r_.altivec_i8 = vec_adds(a_.altivec_i8, b_.altivec_i8);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
    const int32_t tmp =
      HEDLEY_STATIC_CAST(int16_t, a_.i8[i]) +
      HEDLEY_STATIC_CAST(int16_t, b_.i8[i]);
    r_.i8[i] = HEDLEY_STATIC_CAST(int8_t, ((tmp < INT8_MAX) ? ((tmp > INT8_MIN) ? tmp : INT8_MIN) : INT8_MAX));
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_adds_epi8(a, b) simde_mm_adds_epi8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_adds_epi16 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_adds_epi16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i16 = vqaddq_s16(a_.neon_i16, b_.neon_i16);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_i16 = vec_adds(a_.altivec_i16, b_.altivec_i16);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
      const int32_t tmp =
        HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) +
        HEDLEY_STATIC_CAST(int32_t, b_.i16[i]);
      r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, ((tmp < INT16_MAX) ? ((tmp > INT16_MIN) ? tmp : INT16_MIN) : INT16_MAX));
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_adds_epi16(a, b) simde_mm_adds_epi16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_adds_epu8 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_adds_epu8(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_u8 = vqaddq_u8(a_.neon_u8, b_.neon_u8);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_u8 = vec_adds(a_.altivec_u8, b_.altivec_u8);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
      r_.u8[i] = ((UINT8_MAX - a_.u8[i]) > b_.u8[i]) ? (a_.u8[i] + b_.u8[i]) : UINT8_MAX;
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_adds_epu8(a, b) simde_mm_adds_epu8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_adds_epu16 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_adds_epu16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_u16 = vqaddq_u16(a_.neon_u16, b_.neon_u16);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_u16 = vec_adds(a_.altivec_u16, b_.altivec_u16);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
      r_.u16[i] = ((UINT16_MAX - a_.u16[i]) > b_.u16[i]) ? (a_.u16[i] + b_.u16[i]) : UINT16_MAX;
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_adds_epu16(a, b) simde_mm_adds_epu16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_and_pd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_and_pd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_i32 = vandq_s32(a_.neon_i32, b_.neon_i32);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128);
#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
  r_.altivec_f64 = vec_and(a_.altivec_f64, b_.altivec_f64);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  r_.i32f = a_.i32f & b_.i32f;
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
    r_.i32f[i] = a_.i32f[i] & b_.i32f[i];
  }
#endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_and_pd(a, b) simde_mm_and_pd(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_and_si128 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_and_si128(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i32 = vandq_s32(b_.neon_i32, a_.neon_i32);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_u32f = vec_and(a_.altivec_u32f, b_.altivec_u32f);
  #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.i32f = a_.i32f & b_.i32f;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
      r_.i32f[i] = a_.i32f[i] & b_.i32f[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_and_si128(a, b) simde_mm_and_si128(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_andnot_pd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_andnot_pd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32); /* (~a) & b */
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  r_.altivec_i32f = vec_andc(b_.altivec_i32f, a_.altivec_i32f);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  r_.i32f = ~a_.i32f & b_.i32f;
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
    r_.u64[i] = ~a_.u64[i] & b_.u64[i];
  }
#endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_andnot_pd(a, b) simde_mm_andnot_pd(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_andnot_si128 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_andnot_si128(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_i32 = vec_andc(b_.altivec_i32, a_.altivec_i32);
  #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.i32f = ~a_.i32f & b_.i32f;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
      r_.i32f[i] = ~(a_.i32f[i]) & b_.i32f[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_andnot_si128(a, b) simde_mm_andnot_si128(a, b)
#endif

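/* Rounded unsigned average: each lane becomes (a + b + 1) >> 1, computed in
 * a wider type so the intermediate sum cannot overflow.  The vector-extension
 * path below widens u8 lanes to u16 (and u16 to u32 in avg_epu16) explicitly;
 * NEON's vrhaddq_* and AltiVec's vec_avg provide the same rounding natively. */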
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_avg_epu8 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_avg_epu8(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_u8 = vrhaddq_u8(b_.neon_u8, a_.neon_u8);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  r_.altivec_u8 = vec_avg(a_.altivec_u8, b_.altivec_u8);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_)
  uint16_t wa SIMDE_VECTOR(32);
  uint16_t wb SIMDE_VECTOR(32);
  uint16_t wr SIMDE_VECTOR(32);
  SIMDE_CONVERT_VECTOR_(wa, a_.u8);
  SIMDE_CONVERT_VECTOR_(wb, b_.u8);
  wr = (wa + wb + 1) >> 1;
  SIMDE_CONVERT_VECTOR_(r_.u8, wr);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
    r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1;
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_avg_epu8(a, b) simde_mm_avg_epu8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_avg_epu16 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_avg_epu16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_u16 = vrhaddq_u16(b_.neon_u16, a_.neon_u16);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  r_.altivec_u16 = vec_avg(a_.altivec_u16, b_.altivec_u16);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_)
  uint32_t wa SIMDE_VECTOR(32);
  uint32_t wb SIMDE_VECTOR(32);
  uint32_t wr SIMDE_VECTOR(32);
  SIMDE_CONVERT_VECTOR_(wa, a_.u16);
  SIMDE_CONVERT_VECTOR_(wb, b_.u16);
  wr = (wa + wb + 1) >> 1;
  SIMDE_CONVERT_VECTOR_(r_.u16, wr);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
    r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1;
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_avg_epu16(a, b) simde_mm_avg_epu16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_setzero_si128 (void) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_setzero_si128();
  #else
    simde__m128i_private r_;

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_i32 = vdupq_n_s32(0);
    #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
      r_.altivec_i32 = vec_splats(HEDLEY_STATIC_CAST(signed int, 0));
    #elif defined(SIMDE_VECTOR_SUBSCRIPT)
      r_.i32 = __extension__ (__typeof__(r_.i32)) { 0, 0, 0, 0 };
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
        r_.i32f[i] = 0;
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_setzero_si128() (simde_mm_setzero_si128())
#endif

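/* Byte-wise shifts of the full 128-bit register.  imm8 must be a
 * compile-time constant; shifts of 16 bytes or more produce zero, which is
 * why the function form tests (imm8 & ~15) and the macro forms clamp or
 * branch before delegating to vextq_s8, vec_sld/vec_sro, or
 * SIMDE_SHUFFLE_VECTOR_. */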
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_bslli_si128 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  if (HEDLEY_UNLIKELY((imm8 & ~15))) {
    return simde_mm_setzero_si128();
  }

  #if defined(SIMDE_HAVE_INT128_) && defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) && 0 /* intentionally disabled */
    r_.u128[0] = a_.u128[0] << (imm8 * 8);
  #else
    r_ = simde__m128i_to_private(simde_mm_setzero_si128());
    for (int i = imm8 ; i < HEDLEY_STATIC_CAST(int, sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
      r_.i8[i] = a_.i8[i - imm8];
    }
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
#  define simde_mm_bslli_si128(a, imm8) _mm_slli_si128(a, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__)
#  define simde_mm_bslli_si128(a, imm8) \
  simde__m128i_from_neon_i8(((imm8) <= 0) ? simde__m128i_to_neon_i8(a) : (((imm8) > 15) ? (vdupq_n_s8(0)) : (vextq_s8(vdupq_n_s8(0), simde__m128i_to_neon_i8(a), 16 - (imm8)))))
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  #define simde_mm_bslli_si128(a, imm8) \
    (__extension__ ({ \
      SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) simde_mm_bslli_si128_z_ = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \
      simde__m128i_from_altivec_u8((imm8 < 16) ? vec_sld(simde__m128i_to_altivec_u8(a), simde_mm_bslli_si128_z_, imm8 & 15) : simde_mm_bslli_si128_z_); \
    }))
#elif defined(SIMDE_SHUFFLE_VECTOR_)
  #define simde_mm_bslli_si128(a, imm8) (__extension__ ({ \
    const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
    const simde__m128i_private simde__tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
    simde__m128i_private simde__tmp_r_; \
    if (HEDLEY_UNLIKELY(imm8 > 15)) { \
      simde__tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
    } else { \
      simde__tmp_r_.i8 = \
        SIMDE_SHUFFLE_VECTOR_(8, 16, \
          simde__tmp_z_.i8, \
          (simde__tmp_a_).i8, \
          HEDLEY_STATIC_CAST(int8_t, (16 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (17 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (18 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (19 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (20 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (21 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (22 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (23 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (24 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (25 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (26 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (27 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (28 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (29 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (30 - imm8) & 31), \
          HEDLEY_STATIC_CAST(int8_t, (31 - imm8) & 31)); \
    } \
    simde__m128i_from_private(simde__tmp_r_); }))
#endif
#define simde_mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_bslli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
#  define _mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_bsrli_si128 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)  {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
    const int e = HEDLEY_STATIC_CAST(int, i) + imm8;
    r_.i8[i] = (e < 16) ? a_.i8[e] : 0;
  }

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
#  define simde_mm_bsrli_si128(a, imm8) _mm_srli_si128(a, imm8)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__)
#  define simde_mm_bsrli_si128(a, imm8) \
  simde__m128i_from_neon_i8(((imm8 < 0) || (imm8 > 15)) ? vdupq_n_s8(0) : (vextq_s8(simde__m128i_to_private(a).neon_i8, vdupq_n_s8(0), ((imm8 & 15) != 0) ? imm8 : (imm8 & 15))))
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
  #define simde_mm_bsrli_si128(a, imm8) \
    (__extension__ ({ \
      SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) simde_mm_bsrli_si128_z_ = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \
      simde__m128i_from_altivec_u8((imm8 < 16) ? vec_sro(simde__m128i_to_altivec_u8(a), vec_splats(HEDLEY_STATIC_CAST(unsigned char, imm8 * 8))) : simde_mm_bsrli_si128_z_); \
    }))
#elif defined(SIMDE_SHUFFLE_VECTOR_)
  #define simde_mm_bsrli_si128(a, imm8) (__extension__ ({ \
    const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
    const simde__m128i_private simde__tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
    simde__m128i_private simde__tmp_r_; \
    if (HEDLEY_UNLIKELY(imm8 > 15)) { \
      simde__tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \
    } else { \
      simde__tmp_r_.i8 = \
      SIMDE_SHUFFLE_VECTOR_(8, 16, \
        simde__tmp_z_.i8, \
        (simde__tmp_a_).i8, \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 16) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 17) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 18) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 19) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 20) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 21) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 22) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 23) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 24) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 25) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 26) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 27) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 28) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 29) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 30) & 31), \
        HEDLEY_STATIC_CAST(int8_t, (imm8 + 31) & 31)); \
    } \
    simde__m128i_from_private(simde__tmp_r_); }))
#endif
#define simde_mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_bsrli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
#  define _mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
#endif

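/* Cache-line flush: there is no portable equivalent of CLFLUSH, so without
 * native SSE2 support the fallback simply ignores the pointer. */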
SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_clflush (void const* p) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_clflush(p);
#else
  (void) p;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_clflush(p) simde_mm_clflush(p)
#endif

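/* The comi* family compares only the low double of each operand and returns
 * an int (1 if the predicate holds, 0 otherwise), mirroring COMISD.  For
 * example (hypothetical values), simde_mm_comigt_sd(simde_mm_set1_pd(2.0),
 * simde_mm_set1_pd(1.0)) returns 1. */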
SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_comieq_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_comieq_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  return !!vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0);
#else
  return a_.f64[0] == b_.f64[0];
#endif
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_comieq_sd(a, b) simde_mm_comieq_sd(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_comige_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_comige_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  return !!vgetq_lane_u64(vcgeq_f64(a_.neon_f64, b_.neon_f64), 0);
#else
  return a_.f64[0] >= b_.f64[0];
#endif
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_comige_sd(a, b) simde_mm_comige_sd(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_comigt_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_comigt_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  return !!vgetq_lane_u64(vcgtq_f64(a_.neon_f64, b_.neon_f64), 0);
#else
  return a_.f64[0] > b_.f64[0];
#endif
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_comigt_sd(a, b) simde_mm_comigt_sd(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_comile_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_comile_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  return !!vgetq_lane_u64(vcleq_f64(a_.neon_f64, b_.neon_f64), 0);
#else
  return a_.f64[0] <= b_.f64[0];
#endif
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_comile_sd(a, b) simde_mm_comile_sd(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_comilt_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_comilt_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  return !!vgetq_lane_u64(vcltq_f64(a_.neon_f64, b_.neon_f64), 0);
#else
  return a_.f64[0] < b_.f64[0];
#endif
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_comilt_sd(a, b) simde_mm_comilt_sd(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_comineq_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_comineq_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  return !vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0);
#else
  return a_.f64[0] != b_.f64[0];
#endif
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_comineq_sd(a, b) simde_mm_comineq_sd(a, b)
#endif

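/* The cast functions reinterpret bits between vector types without any value
 * conversion, analogous to a C++ reinterpret_cast.  The portable fallback is
 * a simde_memcpy(), which compilers reduce to a no-op register move. */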
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_castpd_ps (simde__m128d a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_castpd_ps(a);
#else
  simde__m128 r;
  simde_memcpy(&r, &a, sizeof(a));
  return r;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_castpd_ps(a) simde_mm_castpd_ps(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_castpd_si128 (simde__m128d a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_castpd_si128(a);
#else
  simde__m128i r;
  simde_memcpy(&r, &a, sizeof(a));
  return r;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_castpd_si128(a) simde_mm_castpd_si128(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_castps_pd (simde__m128 a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_castps_pd(a);
#else
  simde__m128d r;
  simde_memcpy(&r, &a, sizeof(a));
  return r;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_castps_pd(a) simde_mm_castps_pd(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_castps_si128 (simde__m128 a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_castps_si128(a);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  return simde__m128i_from_neon_i32(simde__m128_to_private(a).neon_i32);
#else
  simde__m128i r;
  simde_memcpy(&r, &a, sizeof(a));
  return r;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_castps_si128(a) simde_mm_castps_si128(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_castsi128_pd (simde__m128i a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_castsi128_pd(a);
#else
  simde__m128d r;
  simde_memcpy(&r, &a, sizeof(a));
  return r;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_castsi128_pd(a) simde_mm_castsi128_pd(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_castsi128_ps (simde__m128i a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_castsi128_ps(a);
#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
  return HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), a);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  return simde__m128_from_neon_i32(simde__m128i_to_private(a).neon_i32);
#else
  simde__m128 r;
  simde_memcpy(&r, &a, sizeof(a));
  return r;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_castsi128_ps(a) simde_mm_castsi128_ps(a)
#endif

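/* Integer comparisons produce a full-width mask in each lane: all ones
 * (e.g. ~INT8_C(0)) where the predicate holds and all zeros elsewhere,
 * suitable as input to simde_mm_and_si128() / simde_mm_andnot_si128(). */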
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmpeq_epi8 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmpeq_epi8(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_u8 = vceqq_s8(b_.neon_i8, a_.neon_i8);
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_i8x16_eq(a_.wasm_v128, b_.wasm_v128);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmpeq(a_.altivec_i8, b_.altivec_i8));
  #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 == b_.i8));
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
      r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_cmpeq_epi8(a, b) simde_mm_cmpeq_epi8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmpeq_epi16 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmpeq_epi16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_u16 = vceqq_s16(b_.neon_i16, a_.neon_i16);
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_i16x8_eq(a_.wasm_v128, b_.wasm_v128);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmpeq(a_.altivec_i16, b_.altivec_i16));
  #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.i16 = (a_.i16 == b_.i16);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
      r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_cmpeq_epi16(a, b) simde_mm_cmpeq_epi16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cmpeq_epi32 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmpeq_epi32(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_u32 = vceqq_s32(b_.neon_i32, a_.neon_i32);
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    r_.wasm_v128 = wasm_i32x4_eq(a_.wasm_v128, b_.wasm_v128);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmpeq(a_.altivec_i32, b_.altivec_i32));
  #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), a_.i32 == b_.i32);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_cmpeq_epi32(a, b) simde_mm_cmpeq_epi32(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_cmpeq_pd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_cmpeq_pd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    r_.neon_u64 = vceqq_f64(a_.neon_f64, b_.neon_f64);
1423   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1424     r_.wasm_v128 = wasm_f64x2_eq(a_.wasm_v128, b_.wasm_v128);
1425   #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
1426     r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpeq(a_.altivec_f64, b_.altivec_f64));
1427   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1428     r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 == b_.f64));
1429   #else
1430     SIMDE_VECTORIZE
1431     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1432       r_.u64[i] = (a_.f64[i] == b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1433     }
1434   #endif
1435 
1436   return simde__m128d_from_private(r_);
1437 #endif
1438 }
1439 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1440 #  define _mm_cmpeq_pd(a, b) simde_mm_cmpeq_pd(a, b)
1441 #endif
1442 
1443 SIMDE_FUNCTION_ATTRIBUTES
1444 simde__m128d
1445 simde_mm_cmpeq_sd (simde__m128d a, simde__m128d b) {
1446 #if defined(SIMDE_X86_SSE2_NATIVE)
1447   return _mm_cmpeq_sd(a, b);
1448 #elif defined(SIMDE_ASSUME_VECTORIZATION)
1449   return simde_mm_move_sd(a, simde_mm_cmpeq_pd(a, b));
1450 #else
1451   simde__m128d_private
1452     r_,
1453     a_ = simde__m128d_to_private(a),
1454     b_ = simde__m128d_to_private(b);
1455 
1456   r_.u64[0] = (a_.f64[0] == b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1457   r_.u64[1] = a_.u64[1];
1458 
1459   return simde__m128d_from_private(r_);
1460 #endif
1461 }
1462 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1463 #  define _mm_cmpeq_sd(a, b) simde_mm_cmpeq_sd(a, b)
1464 #endif
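/* The *_sd comparison forms here and below follow the scalar semantics of
 * the native intrinsics: only lane 0 holds a comparison mask, while lane 1
 * is passed through from `a` unchanged. */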
1465 
1466 SIMDE_FUNCTION_ATTRIBUTES
1467 simde__m128d
1468 simde_mm_cmpneq_pd (simde__m128d a, simde__m128d b) {
1469 #if defined(SIMDE_X86_SSE2_NATIVE)
1470   return _mm_cmpneq_pd(a, b);
1471 #else
1472   simde__m128d_private
1473     r_,
1474     a_ = simde__m128d_to_private(a),
1475     b_ = simde__m128d_to_private(b);
1476 
1477   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1478     r_.neon_u32 = vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(a_.neon_f64, b_.neon_f64)));
1479   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1480     r_.wasm_v128 = wasm_f64x2_ne(a_.wasm_v128, b_.wasm_v128);
1481   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1482     r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 != b_.f64));
1483   #else
1484     SIMDE_VECTORIZE
1485     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1486       r_.u64[i] = (a_.f64[i] != b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1487     }
1488   #endif
1489 
1490   return simde__m128d_from_private(r_);
1491 #endif
1492 }
1493 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1494 #  define _mm_cmpneq_pd(a, b) simde_mm_cmpneq_pd(a, b)
1495 #endif
1496 
1497 SIMDE_FUNCTION_ATTRIBUTES
1498 simde__m128d
1499 simde_mm_cmpneq_sd (simde__m128d a, simde__m128d b) {
1500 #if defined(SIMDE_X86_SSE2_NATIVE)
1501   return _mm_cmpneq_sd(a, b);
1502 #elif defined(SIMDE_ASSUME_VECTORIZATION)
1503   return simde_mm_move_sd(a, simde_mm_cmpneq_pd(a, b));
1504 #else
1505   simde__m128d_private
1506     r_,
1507     a_ = simde__m128d_to_private(a),
1508     b_ = simde__m128d_to_private(b);
1509 
1510   r_.u64[0] = (a_.f64[0] != b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1511   r_.u64[1] = a_.u64[1];
1512 
1514   return simde__m128d_from_private(r_);
1515 #endif
1516 }
1517 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1518 #  define _mm_cmpneq_sd(a, b) simde_mm_cmpneq_sd(a, b)
1519 #endif
1520 
1521 SIMDE_FUNCTION_ATTRIBUTES
1522 simde__m128i
1523 simde_mm_cmplt_epi8 (simde__m128i a, simde__m128i b) {
1524 #if defined(SIMDE_X86_SSE2_NATIVE)
1525   return _mm_cmplt_epi8(a, b);
1526 #else
1527   simde__m128i_private
1528     r_,
1529     a_ = simde__m128i_to_private(a),
1530     b_ = simde__m128i_to_private(b);
1531 
1532   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1533     r_.neon_u8 = vcltq_s8(a_.neon_i8, b_.neon_i8);
1534   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1535     r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmplt(a_.altivec_i8, b_.altivec_i8));
1536   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1537     r_.wasm_v128 = wasm_i8x16_lt(a_.wasm_v128, b_.wasm_v128);
1538   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1539     r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 < b_.i8));
1540   #else
1541     SIMDE_VECTORIZE
1542     for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1543       r_.i8[i] = (a_.i8[i] < b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
1544     }
1545   #endif
1546 
1547   return simde__m128i_from_private(r_);
1548 #endif
1549 }
1550 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1551 #  define _mm_cmplt_epi8(a, b) simde_mm_cmplt_epi8(a, b)
1552 #endif
1553 
1554 SIMDE_FUNCTION_ATTRIBUTES
1555 simde__m128i
1556 simde_mm_cmplt_epi16 (simde__m128i a, simde__m128i b) {
1557 #if defined(SIMDE_X86_SSE2_NATIVE)
1558   return _mm_cmplt_epi16(a, b);
1559 #else
1560   simde__m128i_private
1561     r_,
1562     a_ = simde__m128i_to_private(a),
1563     b_ = simde__m128i_to_private(b);
1564 
1565   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1566     r_.neon_u16 = vcltq_s16(a_.neon_i16, b_.neon_i16);
1567   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1568     r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmplt(a_.altivec_i16, b_.altivec_i16));
1569   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1570     r_.wasm_v128 = wasm_i16x8_lt(a_.wasm_v128, b_.wasm_v128);
1571   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1572     r_.i16 = HEDLEY_STATIC_CAST(__typeof__(r_.i16), (a_.i16 < b_.i16));
1573   #else
1574     SIMDE_VECTORIZE
1575     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1576       r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
1577     }
1578   #endif
1579 
1580   return simde__m128i_from_private(r_);
1581 #endif
1582 }
1583 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1584 #  define _mm_cmplt_epi16(a, b) simde_mm_cmplt_epi16(a, b)
1585 #endif
1586 
1587 SIMDE_FUNCTION_ATTRIBUTES
1588 simde__m128i
1589 simde_mm_cmplt_epi32 (simde__m128i a, simde__m128i b) {
1590 #if defined(SIMDE_X86_SSE2_NATIVE)
1591   return _mm_cmplt_epi32(a, b);
1592 #else
1593   simde__m128i_private
1594     r_,
1595     a_ = simde__m128i_to_private(a),
1596     b_ = simde__m128i_to_private(b);
1597 
1598   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1599     r_.neon_u32 = vcltq_s32(a_.neon_i32, b_.neon_i32);
1600   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1601     r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmplt(a_.altivec_i32, b_.altivec_i32));
1602   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1603     r_.wasm_v128 = wasm_i32x4_lt(a_.wasm_v128, b_.wasm_v128);
1604   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1605     r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.i32 < b_.i32));
1606   #else
1607     SIMDE_VECTORIZE
1608     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1609       r_.i32[i] = (a_.i32[i] < b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
1610     }
1611   #endif
1612 
1613   return simde__m128i_from_private(r_);
1614 #endif
1615 }
1616 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1617 #  define _mm_cmplt_epi32(a, b) simde_mm_cmplt_epi32(a, b)
1618 #endif
1619 
1620 SIMDE_FUNCTION_ATTRIBUTES
1621 simde__m128d
1622 simde_mm_cmplt_pd (simde__m128d a, simde__m128d b) {
1623 #if defined(SIMDE_X86_SSE2_NATIVE)
1624   return _mm_cmplt_pd(a, b);
1625 #else
1626   simde__m128d_private
1627     r_,
1628     a_ = simde__m128d_to_private(a),
1629     b_ = simde__m128d_to_private(b);
1630 
1631   #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1632     r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 < b_.f64));
1633   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1634     r_.wasm_v128 = wasm_f64x2_lt(a_.wasm_v128, b_.wasm_v128);
1635   #else
1636     SIMDE_VECTORIZE
1637     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1638       r_.u64[i] = (a_.f64[i] < b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1639     }
1640   #endif
1641 
1642   return simde__m128d_from_private(r_);
1643 #endif
1644 }
1645 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1646 #  define _mm_cmplt_pd(a, b) simde_mm_cmplt_pd(a, b)
1647 #endif
1648 
1649 SIMDE_FUNCTION_ATTRIBUTES
1650 simde__m128d
1651 simde_mm_cmplt_sd (simde__m128d a, simde__m128d b) {
1652 #if defined(SIMDE_X86_SSE2_NATIVE)
1653   return _mm_cmplt_sd(a, b);
1654 #elif defined(SIMDE_ASSUME_VECTORIZATION)
1655   return simde_mm_move_sd(a, simde_mm_cmplt_pd(a, b));
1656 #else
1657   simde__m128d_private
1658     r_,
1659     a_ = simde__m128d_to_private(a),
1660     b_ = simde__m128d_to_private(b);
1661 
1662   r_.u64[0] = (a_.f64[0] < b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1663   r_.u64[1] = a_.u64[1];
1664 
1665   return simde__m128d_from_private(r_);
1666 #endif
1667 }
1668 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1669 #  define _mm_cmplt_sd(a, b) simde_mm_cmplt_sd(a, b)
1670 #endif
1671 
1672 SIMDE_FUNCTION_ATTRIBUTES
1673 simde__m128d
1674 simde_mm_cmple_pd (simde__m128d a, simde__m128d b) {
1675 #if defined(SIMDE_X86_SSE2_NATIVE)
1676   return _mm_cmple_pd(a, b);
1677 #else
1678   simde__m128d_private
1679     r_,
1680     a_ = simde__m128d_to_private(a),
1681     b_ = simde__m128d_to_private(b);
1682 
1683   #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1684     r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 <= b_.f64));
1685   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1686     r_.wasm_v128 = wasm_f64x2_le(a_.wasm_v128, b_.wasm_v128);
1687   #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
1688     r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmple(a_.altivec_f64, b_.altivec_f64));
1689   #else
1690     SIMDE_VECTORIZE
1691     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1692       r_.u64[i] = (a_.f64[i] <= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1693     }
1694   #endif
1695 
1696   return simde__m128d_from_private(r_);
1697 #endif
1698 }
1699 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1700 #  define _mm_cmple_pd(a, b) simde_mm_cmple_pd(a, b)
1701 #endif
1702 
1703 SIMDE_FUNCTION_ATTRIBUTES
1704 simde__m128d
1705 simde_mm_cmple_sd (simde__m128d a, simde__m128d b) {
1706 #if defined(SIMDE_X86_SSE2_NATIVE)
1707   return _mm_cmple_sd(a, b);
1708 #elif defined(SIMDE_ASSUME_VECTORIZATION)
1709   return simde_mm_move_sd(a, simde_mm_cmple_pd(a, b));
1710 #else
1711   simde__m128d_private
1712     r_,
1713     a_ = simde__m128d_to_private(a),
1714     b_ = simde__m128d_to_private(b);
1715 
1716   r_.u64[0] = (a_.f64[0] <= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1717   r_.u64[1] = a_.u64[1];
1718 
1719   return simde__m128d_from_private(r_);
1720 #endif
1721 }
1722 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1723 #  define _mm_cmple_sd(a, b) simde_mm_cmple_sd(a, b)
1724 #endif
1725 
1726 SIMDE_FUNCTION_ATTRIBUTES
1727 simde__m128i
1728 simde_mm_cmpgt_epi8 (simde__m128i a, simde__m128i b) {
1729 #if defined(SIMDE_X86_SSE2_NATIVE)
1730   return _mm_cmpgt_epi8(a, b);
1731 #else
1732   simde__m128i_private
1733     r_,
1734     a_ = simde__m128i_to_private(a),
1735     b_ = simde__m128i_to_private(b);
1736 
1737   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1738     r_.neon_u8 = vcgtq_s8(a_.neon_i8, b_.neon_i8);
1739   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1740     r_.wasm_v128 = wasm_i8x16_gt(a_.wasm_v128, b_.wasm_v128);
1741   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1742     r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmpgt(a_.altivec_i8, b_.altivec_i8));
1743   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1744     r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 > b_.i8));
1745   #else
1746     SIMDE_VECTORIZE
1747     for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
1748       r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
1749     }
1750   #endif
1751 
1752   return simde__m128i_from_private(r_);
1753 #endif
1754 }
1755 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1756 #  define _mm_cmpgt_epi8(a, b) simde_mm_cmpgt_epi8(a, b)
1757 #endif
1758 
1759 SIMDE_FUNCTION_ATTRIBUTES
1760 simde__m128i
1761 simde_mm_cmpgt_epi16 (simde__m128i a, simde__m128i b) {
1762 #if defined(SIMDE_X86_SSE2_NATIVE)
1763   return _mm_cmpgt_epi16(a, b);
1764 #else
1765   simde__m128i_private
1766     r_,
1767     a_ = simde__m128i_to_private(a),
1768     b_ = simde__m128i_to_private(b);
1769 
1770   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1771     r_.neon_u16 = vcgtq_s16(a_.neon_i16, b_.neon_i16);
1772   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1773     r_.wasm_v128 = wasm_i16x8_gt(a_.wasm_v128, b_.wasm_v128);
1774   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1775     r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmpgt(a_.altivec_i16, b_.altivec_i16));
1776   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1777     r_.i16 = HEDLEY_STATIC_CAST(__typeof__(r_.i16), (a_.i16 > b_.i16));
1778   #else
1779     SIMDE_VECTORIZE
1780     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
1781       r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
1782     }
1783   #endif
1784 
1785   return simde__m128i_from_private(r_);
1786 #endif
1787 }
1788 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1789 #  define _mm_cmpgt_epi16(a, b) simde_mm_cmpgt_epi16(a, b)
1790 #endif
1791 
1792 SIMDE_FUNCTION_ATTRIBUTES
1793 simde__m128i
1794 simde_mm_cmpgt_epi32 (simde__m128i a, simde__m128i b) {
1795 #if defined(SIMDE_X86_SSE2_NATIVE)
1796   return _mm_cmpgt_epi32(a, b);
1797 #else
1798   simde__m128i_private
1799     r_,
1800     a_ = simde__m128i_to_private(a),
1801     b_ = simde__m128i_to_private(b);
1802 
1803   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1804     r_.neon_u32 = vcgtq_s32(a_.neon_i32, b_.neon_i32);
1805   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1806     r_.wasm_v128 = wasm_i32x4_gt(a_.wasm_v128, b_.wasm_v128);
1807   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1808     r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmpgt(a_.altivec_i32, b_.altivec_i32));
1809   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1810     r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.i32 > b_.i32));
1811   #else
1812     SIMDE_VECTORIZE
1813     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
1814       r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
1815     }
1816   #endif
1817 
1818   return simde__m128i_from_private(r_);
1819 #endif
1820 }
1821 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1822 #  define _mm_cmpgt_epi32(a, b) simde_mm_cmpgt_epi32(a, b)
1823 #endif
1824 
1825 SIMDE_FUNCTION_ATTRIBUTES
1826 simde__m128d
1827 simde_mm_cmpgt_pd (simde__m128d a, simde__m128d b) {
1828 #if defined(SIMDE_X86_SSE2_NATIVE)
1829   return _mm_cmpgt_pd(a, b);
1830 #else
1831   simde__m128d_private
1832     r_,
1833     a_ = simde__m128d_to_private(a),
1834     b_ = simde__m128d_to_private(b);
1835 
1836   #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1837     r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 > b_.f64));
1838   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1839     r_.wasm_v128 = wasm_f64x2_gt(a_.wasm_v128, b_.wasm_v128);
1840   #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
1841     r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpgt(a_.altivec_f64, b_.altivec_f64));
1842   #else
1843     SIMDE_VECTORIZE
1844     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1845       r_.u64[i] = (a_.f64[i] > b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1846     }
1847   #endif
1848 
1849   return simde__m128d_from_private(r_);
1850 #endif
1851 }
1852 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1853 #  define _mm_cmpgt_pd(a, b) simde_mm_cmpgt_pd(a, b)
1854 #endif
1855 
1856 SIMDE_FUNCTION_ATTRIBUTES
1857 simde__m128d
1858 simde_mm_cmpgt_sd (simde__m128d a, simde__m128d b) {
1859 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
1860   return _mm_cmpgt_sd(a, b);
1861 #elif defined(SIMDE_ASSUME_VECTORIZATION)
1862   return simde_mm_move_sd(a, simde_mm_cmpgt_pd(a, b));
1863 #else
1864   simde__m128d_private
1865     r_,
1866     a_ = simde__m128d_to_private(a),
1867     b_ = simde__m128d_to_private(b);
1868 
1869   r_.u64[0] = (a_.f64[0] > b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1870   r_.u64[1] = a_.u64[1];
1871 
1872   return simde__m128d_from_private(r_);
1873 #endif
1874 }
1875 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1876 #  define _mm_cmpgt_sd(a, b) simde_mm_cmpgt_sd(a, b)
1877 #endif
1878 
1879 SIMDE_FUNCTION_ATTRIBUTES
1880 simde__m128d
1881 simde_mm_cmpge_pd (simde__m128d a, simde__m128d b) {
1882 #if defined(SIMDE_X86_SSE2_NATIVE)
1883   return _mm_cmpge_pd(a, b);
1884 #else
1885   simde__m128d_private
1886     r_,
1887     a_ = simde__m128d_to_private(a),
1888     b_ = simde__m128d_to_private(b);
1889 
1890   #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1891     r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 >= b_.f64));
1892   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
1893     r_.wasm_v128 = wasm_f64x2_ge(a_.wasm_v128, b_.wasm_v128);
1894   #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
1895     r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpge(a_.altivec_f64, b_.altivec_f64));
1896   #else
1897     SIMDE_VECTORIZE
1898     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
1899       r_.u64[i] = (a_.f64[i] >= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1900     }
1901   #endif
1902 
1903   return simde__m128d_from_private(r_);
1904 #endif
1905 }
1906 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1907 #  define _mm_cmpge_pd(a, b) simde_mm_cmpge_pd(a, b)
1908 #endif
1909 
1910 SIMDE_FUNCTION_ATTRIBUTES
1911 simde__m128d
1912 simde_mm_cmpge_sd (simde__m128d a, simde__m128d b) {
1913 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
1914   return _mm_cmpge_sd(a, b);
1915 #elif defined(SIMDE_ASSUME_VECTORIZATION)
1916   return simde_mm_move_sd(a, simde_mm_cmpge_pd(a, b));
1917 #else
1918   simde__m128d_private
1919     r_,
1920     a_ = simde__m128d_to_private(a),
1921     b_ = simde__m128d_to_private(b);
1922 
1923   r_.u64[0] = (a_.f64[0] >= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1924   r_.u64[1] = a_.u64[1];
1925 
1926   return simde__m128d_from_private(r_);
1927 #endif
1928 }
1929 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1930 #  define _mm_cmpge_sd(a, b) simde_mm_cmpge_sd(a, b)
1931 #endif
1932 
1933 SIMDE_FUNCTION_ATTRIBUTES
1934 simde__m128d
1935 simde_mm_cmpnge_pd (simde__m128d a, simde__m128d b) {
1936 #if defined(SIMDE_X86_SSE2_NATIVE)
1937   return _mm_cmpnge_pd(a, b);
1938 #else
1939   return simde_mm_cmplt_pd(a, b);
1940 #endif
1941 }
1942 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1943 #  define _mm_cmpnge_pd(a, b) simde_mm_cmpnge_pd(a, b)
1944 #endif
1945 
1946 SIMDE_FUNCTION_ATTRIBUTES
1947 simde__m128d
1948 simde_mm_cmpnge_sd (simde__m128d a, simde__m128d b) {
1949 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
1950   return _mm_cmpnge_sd(a, b);
1951 #else
1952   return simde_mm_cmplt_sd(a, b);
1953 #endif
1954 }
1955 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1956 #  define _mm_cmpnge_sd(a, b) simde_mm_cmpnge_sd(a, b)
1957 #endif
1958 
1959 SIMDE_FUNCTION_ATTRIBUTES
1960 simde__m128d
1961 simde_mm_cmpnlt_pd (simde__m128d a, simde__m128d b) {
1962 #if defined(SIMDE_X86_SSE2_NATIVE)
1963   return _mm_cmpnlt_pd(a, b);
1964 #else
1965   return simde_mm_cmpge_pd(a, b);
1966 #endif
1967 }
1968 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1969 #  define _mm_cmpnlt_pd(a, b) simde_mm_cmpnlt_pd(a, b)
1970 #endif
1971 
1972 SIMDE_FUNCTION_ATTRIBUTES
1973 simde__m128d
1974 simde_mm_cmpnlt_sd (simde__m128d a, simde__m128d b) {
1975 #if defined(SIMDE_X86_SSE2_NATIVE)
1976   return _mm_cmpnlt_sd(a, b);
1977 #else
1978   return simde_mm_cmpge_sd(a, b);
1979 #endif
1980 }
1981 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1982 #  define _mm_cmpnlt_sd(a, b) simde_mm_cmpnlt_sd(a, b)
1983 #endif
1984 
1985 SIMDE_FUNCTION_ATTRIBUTES
1986 simde__m128d
1987 simde_mm_cmpnle_pd (simde__m128d a, simde__m128d b) {
1988 #if defined(SIMDE_X86_SSE2_NATIVE)
1989   return _mm_cmpnle_pd(a, b);
1990 #else
1991   return simde_mm_cmpgt_pd(a, b);
1992 #endif
1993 }
1994 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1995 #  define _mm_cmpnle_pd(a, b) simde_mm_cmpnle_pd(a, b)
1996 #endif
1997 
1998 SIMDE_FUNCTION_ATTRIBUTES
1999 simde__m128d
2000 simde_mm_cmpnle_sd (simde__m128d a, simde__m128d b) {
2001 #if defined(SIMDE_X86_SSE2_NATIVE)
2002   return _mm_cmpnle_sd(a, b);
2003 #else
2004   return simde_mm_cmpgt_sd(a, b);
2005 #endif
2006 }
2007 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2008 #  define _mm_cmpnle_sd(a, b) simde_mm_cmpnle_sd(a, b)
2009 #endif
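/* A caveat on the cmpn* fallbacks above: they implement the NOT-forms via
 * the complementary predicate (cmpnge -> cmplt, cmpnlt -> cmpge,
 * cmpnle -> cmpgt).  For ordinary numbers the results match, but the native
 * CMPPD/CMPSD NOT-predicates return true for unordered (NaN) inputs while
 * the complements return false, so results can differ from SSE2 hardware
 * when NaNs are involved. */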
2010 
2011 SIMDE_FUNCTION_ATTRIBUTES
2012 simde__m128d
2013 simde_mm_cmpord_pd (simde__m128d a, simde__m128d b) {
2014 #if defined(SIMDE_X86_SSE2_NATIVE)
2015   return _mm_cmpord_pd(a, b);
2016 #else
2017   simde__m128d_private
2018     r_,
2019     a_ = simde__m128d_to_private(a),
2020     b_ = simde__m128d_to_private(b);
2021 
2022 #if defined(simde_math_isnan)
2023   SIMDE_VECTORIZE
2024   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2025     r_.u64[i] = (!simde_math_isnan(a_.f64[i]) && !simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0);
2026   }
2027 #else
2028   HEDLEY_UNREACHABLE();
2029 #endif
2030 
2031   return simde__m128d_from_private(r_);
2032 #endif
2033 }
2034 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2035 #  define _mm_cmpord_pd(a, b) simde_mm_cmpord_pd(a, b)
2036 #endif
2037 
2038 SIMDE_FUNCTION_ATTRIBUTES
2039 simde_float64
2040 simde_mm_cvtsd_f64 (simde__m128d a) {
2041 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
2042   return _mm_cvtsd_f64(a);
2043 #else
2044   simde__m128d_private a_ = simde__m128d_to_private(a);
2045   return a_.f64[0];
2046 #endif
2047 }
2048 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2049 #  define _mm_cvtsd_f64(a) simde_mm_cvtsd_f64(a)
2050 #endif
2051 
2052 SIMDE_FUNCTION_ATTRIBUTES
2053 simde__m128d
2054 simde_mm_cmpord_sd (simde__m128d a, simde__m128d b) {
2055 #if defined(SIMDE_X86_SSE2_NATIVE)
2056   return _mm_cmpord_sd(a, b);
2057 #elif defined(SIMDE_ASSUME_VECTORIZATION)
2058   return simde_mm_move_sd(a, simde_mm_cmpord_pd(a, b));
2059 #else
2060   simde__m128d_private
2061     r_,
2062     a_ = simde__m128d_to_private(a),
2063     b_ = simde__m128d_to_private(b);
2064 
2065 #if defined(simde_math_isnan)
2066   r_.u64[0] = (!simde_math_isnan(a_.f64[0]) && !simde_math_isnan(b_.f64[0])) ? ~UINT64_C(0) : UINT64_C(0);
2067   r_.u64[1] = a_.u64[1];
2068 #else
2069   HEDLEY_UNREACHABLE();
2070 #endif
2071 
2072   return simde__m128d_from_private(r_);
2073 #endif
2074 }
2075 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2076 #  define _mm_cmpord_sd(a, b) simde_mm_cmpord_sd(a, b)
2077 #endif
2078 
2079 SIMDE_FUNCTION_ATTRIBUTES
2080 simde__m128d
2081 simde_mm_cmpunord_pd (simde__m128d a, simde__m128d b) {
2082 #if defined(SIMDE_X86_SSE2_NATIVE)
2083   return _mm_cmpunord_pd(a, b);
2084 #else
2085   simde__m128d_private
2086     r_,
2087     a_ = simde__m128d_to_private(a),
2088     b_ = simde__m128d_to_private(b);
2089 
2090 #if defined(simde_math_isnan)
2091   SIMDE_VECTORIZE
2092   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2093     r_.u64[i] = (simde_math_isnan(a_.f64[i]) || simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0);
2094   }
2095 #else
2096   HEDLEY_UNREACHABLE();
2097 #endif
2098 
2099   return simde__m128d_from_private(r_);
2100 #endif
2101 }
2102 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2103 #  define _mm_cmpunord_pd(a, b) simde_mm_cmpunord_pd(a, b)
2104 #endif
2105 
2106 SIMDE_FUNCTION_ATTRIBUTES
2107 simde__m128d
2108 simde_mm_cmpunord_sd (simde__m128d a, simde__m128d b) {
2109 #if defined(SIMDE_X86_SSE2_NATIVE)
2110   return _mm_cmpunord_sd(a, b);
2111 #elif defined(SIMDE_ASSUME_VECTORIZATION)
2112   return simde_mm_move_sd(a, simde_mm_cmpunord_pd(a, b));
2113 #else
2114   simde__m128d_private
2115     r_,
2116     a_ = simde__m128d_to_private(a),
2117     b_ = simde__m128d_to_private(b);
2118 
2119 #if defined(simde_math_isnan)
2120   r_.u64[0] = (simde_math_isnan(a_.f64[0]) || simde_math_isnan(b_.f64[0])) ? ~UINT64_C(0) : UINT64_C(0);
2121   r_.u64[1] = a_.u64[1];
2123 #else
2124   HEDLEY_UNREACHABLE();
2125 #endif
2126 
2127   return simde__m128d_from_private(r_);
2128 #endif
2129 }
2130 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2131 #  define _mm_cmpunord_sd(a, b) simde_mm_cmpunord_sd(a, b)
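/* cmpord/cmpunord reduce to NaN tests: a lane is "ordered" when neither
 * input is NaN, "unordered" when at least one is.  Illustrative values:
 * with a = {1.0, NAN} and b = {2.0, 3.0}, cmpord_pd yields {~0, 0} and
 * cmpunord_pd yields {0, ~0}. */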
2132 #endif
2133 
2134 SIMDE_FUNCTION_ATTRIBUTES
2135 simde__m128d
2136 simde_mm_cvtepi32_pd (simde__m128i a) {
2137 #if defined(SIMDE_X86_SSE2_NATIVE)
2138   return _mm_cvtepi32_pd(a);
2139 #else
2140   simde__m128d_private r_;
2141   simde__m128i_private a_ = simde__m128i_to_private(a);
2142 
2143 #if defined(SIMDE_CONVERT_VECTOR_)
2144   SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].i32);
2145 #else
2146   SIMDE_VECTORIZE
2147   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2148     r_.f64[i] = (simde_float64) a_.i32[i];
2149   }
2150 #endif
2151 
2152   return simde__m128d_from_private(r_);
2153 #endif
2154 }
2155 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2156 #  define _mm_cvtepi32_pd(a) simde_mm_cvtepi32_pd(a)
2157 #endif
2158 
2159 SIMDE_FUNCTION_ATTRIBUTES
2160 simde__m128
2161 simde_mm_cvtepi32_ps (simde__m128i a) {
2162 #if defined(SIMDE_X86_SSE2_NATIVE)
2163   return _mm_cvtepi32_ps(a);
2164 #else
2165   simde__m128_private r_;
2166   simde__m128i_private a_ = simde__m128i_to_private(a);
2167 
2168 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2169   r_.neon_f32 = vcvtq_f32_s32(a_.neon_i32);
2170 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
2171   HEDLEY_DIAGNOSTIC_PUSH
2172   #if HEDLEY_HAS_WARNING("-Wc11-extensions")
2173     #pragma clang diagnostic ignored "-Wc11-extensions"
2174   #endif
2175   r_.altivec_f32 = vec_ctf(a_.altivec_i32, 0);
2176   HEDLEY_DIAGNOSTIC_POP
2177 #elif defined(SIMDE_CONVERT_VECTOR_)
2178   SIMDE_CONVERT_VECTOR_(r_.f32, a_.i32);
2179 #else
2180   SIMDE_VECTORIZE
2181   for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
2182     r_.f32[i] = (simde_float32) a_.i32[i];
2183   }
2184 #endif
2185 
2186   return simde__m128_from_private(r_);
2187 #endif
2188 }
2189 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2190 #  define _mm_cvtepi32_ps(a) simde_mm_cvtepi32_ps(a)
2191 #endif
2192 
2193 SIMDE_FUNCTION_ATTRIBUTES
2194 simde__m128i
2195 simde_mm_cvtpd_epi32 (simde__m128d a) {
2196 #if defined(SIMDE_X86_SSE2_NATIVE)
2197   return _mm_cvtpd_epi32(a);
2198 #else
2199   simde__m128i_private r_;
2200   simde__m128d_private a_ = simde__m128d_to_private(a);
2201 
2202   SIMDE_VECTORIZE
2203   for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
2204     r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_nearbyint(a_.f64[i]));
2205   }
2206   simde_memset(&(r_.m64_private[1]), 0, sizeof(r_.m64_private[1]));
2207 
2208   return simde__m128i_from_private(r_);
2209 #endif
2210 }
2211 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2212 #  define _mm_cvtpd_epi32(a) simde_mm_cvtpd_epi32(a)
2213 #endif
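/* Rounding note: the cvt* conversions here round to nearest, ties to even,
 * under the default FP environment (hence simde_math_nearbyint), while the
 * cvtt* variants further down always truncate toward zero.  For example,
 * 2.5 becomes 2 either way, but 2.7 becomes 3 via cvtpd_epi32 and 2 via
 * cvttpd_epi32. */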
2214 
2215 SIMDE_FUNCTION_ATTRIBUTES
2216 simde__m64
2217 simde_mm_cvtpd_pi32 (simde__m128d a) {
2218 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2219   return _mm_cvtpd_pi32(a);
2220 #else
2221   simde__m64_private r_;
2222   simde__m128d_private a_ = simde__m128d_to_private(a);
2223 
2224   SIMDE_VECTORIZE
2225   for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
2226     r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, simde_math_nearbyint(a_.f64[i]));
2227   }
2228 
2229   return simde__m64_from_private(r_);
2230 #endif
2231 }
2232 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2233 #  define _mm_cvtpd_pi32(a) simde_mm_cvtpd_pi32(a)
2234 #endif
2235 
2236 SIMDE_FUNCTION_ATTRIBUTES
2237 simde__m128
2238 simde_mm_cvtpd_ps (simde__m128d a) {
2239 #if defined(SIMDE_X86_SSE2_NATIVE)
2240   return _mm_cvtpd_ps(a);
2241 #else
2242   simde__m128_private r_;
2243   simde__m128d_private a_ = simde__m128d_to_private(a);
2244 
2245 #if defined(SIMDE_CONVERT_VECTOR_)
2246   SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, a_.f64);
2247   r_.m64_private[1] = simde__m64_to_private(simde_mm_setzero_si64());
2248 #else
2249   SIMDE_VECTORIZE
2250   for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
2251     r_.f32[i] = (simde_float32) a_.f64[i];
2252   }
2253   simde_memset(&(r_.m64_private[1]), 0, sizeof(r_.m64_private[1]));
2254 #endif
2255 
2256   return simde__m128_from_private(r_);
2257 #endif
2258 }
2259 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2260 #  define _mm_cvtpd_ps(a) simde_mm_cvtpd_ps(a)
2261 #endif
2262 
2263 SIMDE_FUNCTION_ATTRIBUTES
2264 simde__m128d
2265 simde_mm_cvtpi32_pd (simde__m64 a) {
2266 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2267   return _mm_cvtpi32_pd(a);
2268 #else
2269   simde__m128d_private r_;
2270   simde__m64_private a_ = simde__m64_to_private(a);
2271 
2272 #if defined(SIMDE_CONVERT_VECTOR_)
2273   SIMDE_CONVERT_VECTOR_(r_.f64, a_.i32);
2274 #else
2275   SIMDE_VECTORIZE
2276   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2277     r_.f64[i] = (simde_float64) a_.i32[i];
2278   }
2279 #endif
2280 
2281   return simde__m128d_from_private(r_);
2282 #endif
2283 }
2284 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2285 #  define _mm_cvtpi32_pd(a) simde_mm_cvtpi32_pd(a)
2286 #endif
2287 
2288 SIMDE_FUNCTION_ATTRIBUTES
2289 simde__m128i
2290 simde_mm_cvtps_epi32 (simde__m128 a) {
2291 #if defined(SIMDE_X86_SSE2_NATIVE)
2292   return _mm_cvtps_epi32(a);
2293 #else
2294   simde__m128i_private r_;
2295   simde__m128_private a_ = simde__m128_to_private(a);
2296 
2297 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2298   /* The default rounding mode on SSE is 'round to nearest even', which
2299      ARMv7 NEON does not support.  It is supported on ARMv8, however. */
2300   #if defined(SIMDE_ARCH_AARCH64)
2301     r_.neon_i32 = vcvtnq_s32_f32(a_.neon_f32);
2302   #else
2303     uint32x4_t signmask = vdupq_n_u32(0x80000000);
2304     float32x4_t half = vbslq_f32(signmask, a_.neon_f32, vdupq_n_f32(0.5f)); /* +/- 0.5 */
2305     int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(a_.neon_f32, half)); /* round to integer: [a + 0.5]*/
2306     int32x4_t r_trunc = vcvtq_s32_f32(a_.neon_f32); /* truncate to integer: [a] */
2307     int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 if r_trunc > 0, else 0 */
2308     int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
2309     float32x4_t delta = vsubq_f32(a_.neon_f32, vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
2310     uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */
2311     r_.neon_i32 = vbslq_s32(is_delta_half, r_even, r_normal);
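    /* Worked example for the selection above (values illustrative): for
       a = 2.5, r_trunc = 2 and plusone = 1, so r_even = (2 + 1) & ~1 = 2
       while r_normal = trunc(2.5 + 0.5) = 3; delta == 0.5, so the
       round-to-nearest-even result r_even = 2 is chosen.  For a = 2.7,
       delta != 0.5 and r_normal = 3 is used. */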
2312   #endif
2313 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
2314   r_.altivec_i32 = vec_cts(vec_round(a_.altivec_f32), 0);
2315 #else
2316   SIMDE_VECTORIZE
2317   for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
2318     r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, simde_math_nearbyintf(a_.f32[i]));
2319   }
2320 #endif
2321 
2322   return simde__m128i_from_private(r_);
2323 #endif
2324 }
2325 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2326 #  define _mm_cvtps_epi32(a) simde_mm_cvtps_epi32(a)
2327 #endif
2328 
2329 SIMDE_FUNCTION_ATTRIBUTES
2330 simde__m128d
2331 simde_mm_cvtps_pd (simde__m128 a) {
2332 #if defined(SIMDE_X86_SSE2_NATIVE)
2333   return _mm_cvtps_pd(a);
2334 #else
2335   simde__m128d_private r_;
2336   simde__m128_private a_ = simde__m128_to_private(a);
2337 
2338 #if defined(SIMDE_CONVERT_VECTOR_)
2339   SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].f32);
2340 #else
2341   SIMDE_VECTORIZE
2342   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2343     r_.f64[i] = a_.f32[i];
2344   }
2345 #endif
2346 
2347   return simde__m128d_from_private(r_);
2348 #endif
2349 }
2350 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2351 #  define _mm_cvtps_pd(a) simde_mm_cvtps_pd(a)
2352 #endif
2353 
2354 SIMDE_FUNCTION_ATTRIBUTES
2355 int32_t
2356 simde_mm_cvtsd_si32 (simde__m128d a) {
2357 #if defined(SIMDE_X86_SSE2_NATIVE)
2358   return _mm_cvtsd_si32(a);
2359 #else
2360   simde__m128d_private a_ = simde__m128d_to_private(a);
2361   return SIMDE_CONVERT_FTOI(int32_t, simde_math_nearbyint(a_.f64[0]));
2362 #endif
2363 }
2364 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2365 #  define _mm_cvtsd_si32(a) simde_mm_cvtsd_si32(a)
2366 #endif
2367 
2368 SIMDE_FUNCTION_ATTRIBUTES
2369 int64_t
2370 simde_mm_cvtsd_si64 (simde__m128d a) {
2371 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
2372   #if defined(__PGI)
2373     return _mm_cvtsd_si64x(a);
2374   #else
2375     return _mm_cvtsd_si64(a);
2376   #endif
2377 #else
2378   simde__m128d_private a_ = simde__m128d_to_private(a);
2379   return SIMDE_CONVERT_FTOI(int64_t, simde_math_nearbyint(a_.f64[0]));
2380 #endif
2381 }
2382 #define simde_mm_cvtsd_si64x(a) simde_mm_cvtsd_si64(a)
2383 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2384 #  define _mm_cvtsd_si64(a) simde_mm_cvtsd_si64(a)
2385 #  define _mm_cvtsd_si64x(a) simde_mm_cvtsd_si64x(a)
2386 #endif
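/* The *_si64x spellings are alternate names for the same 64-bit conversions
 * (used by some compilers, e.g. PGI); they are provided as aliases rather
 * than reimplemented. */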
2387 
2388 SIMDE_FUNCTION_ATTRIBUTES
2389 simde__m128
2390 simde_mm_cvtsd_ss (simde__m128 a, simde__m128d b) {
2391 #if defined(SIMDE_X86_SSE2_NATIVE)
2392   return _mm_cvtsd_ss(a, b);
2393 #else
2394   simde__m128_private
2395     r_,
2396     a_ = simde__m128_to_private(a);
2397   simde__m128d_private b_ = simde__m128d_to_private(b);
2398 
2399   r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b_.f64[0]);
2400 
2401   SIMDE_VECTORIZE
2402   for (size_t i = 1 ; i < (sizeof(r_) / sizeof(r_.i32[0])) ; i++) {
2403     r_.i32[i] = a_.i32[i];
2404   }
2405 
2406   return simde__m128_from_private(r_);
2407 #endif
2408 }
2409 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2410 #  define _mm_cvtsd_ss(a, b) simde_mm_cvtsd_ss(a, b)
2411 #endif
2412 
2413 SIMDE_FUNCTION_ATTRIBUTES
2414 int32_t
2415 simde_mm_cvtsi128_si32 (simde__m128i a) {
2416   #if defined(SIMDE_X86_SSE2_NATIVE)
2417     return _mm_cvtsi128_si32(a);
2418   #else
2419     simde__m128i_private
2420       a_ = simde__m128i_to_private(a);
2421 
2422     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2423       return vgetq_lane_s32(a_.neon_i32, 0);
2424     #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
2425       #if defined(SIMDE_BUG_GCC_95227)
2426         (void) a_;
2427       #endif
2428       return vec_extract(a_.altivec_i32, 0);
2429     #else
2430       return a_.i32[0];
2431     #endif
2432   #endif
2433 }
2434 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2435 #  define _mm_cvtsi128_si32(a) simde_mm_cvtsi128_si32(a)
2436 #endif
2437 
2438 SIMDE_FUNCTION_ATTRIBUTES
2439 int64_t
2440 simde_mm_cvtsi128_si64 (simde__m128i a) {
2441 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
2442   #if defined(__PGI)
2443     return _mm_cvtsi128_si64x(a);
2444   #else
2445     return _mm_cvtsi128_si64(a);
2446   #endif
2447 #else
2448   simde__m128i_private a_ = simde__m128i_to_private(a);
2449 #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && !defined(HEDLEY_IBM_VERSION)
2450   return vec_extract(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed long long), a_.i64), 0);
2451 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2452   return vgetq_lane_s64(a_.neon_i64, 0);
2453 #endif
2454   return a_.i64[0];
2455 #endif
2456 }
2457 #define simde_mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64(a)
2458 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2459 #  define _mm_cvtsi128_si64(a) simde_mm_cvtsi128_si64(a)
2460 #  define _mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64x(a)
2461 #endif
2462 
2463 SIMDE_FUNCTION_ATTRIBUTES
2464 simde__m128d
2465 simde_mm_cvtsi32_sd (simde__m128d a, int32_t b) {
2467 #if defined(SIMDE_X86_SSE2_NATIVE)
2468   return _mm_cvtsi32_sd(a, b);
2469 #else
2470   simde__m128d_private r_;
2471   simde__m128d_private a_ = simde__m128d_to_private(a);
2472 
2473 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2474   r_.neon_f64 = vsetq_lane_f64(HEDLEY_STATIC_CAST(float64_t, b), a_.neon_f64, 0);
2475 #else
2476   r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b);
2477   r_.i64[1] = a_.i64[1];
2478 #endif
2479 
2480   return simde__m128d_from_private(r_);
2481 #endif
2482 }
2483 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2484 #  define _mm_cvtsi32_sd(a, b) simde_mm_cvtsi32_sd(a, b)
2485 #endif
2486 
2487 SIMDE_FUNCTION_ATTRIBUTES
2488 simde__m128i
2489 simde_mm_cvtsi32_si128 (int32_t a) {
2490 #if defined(SIMDE_X86_SSE2_NATIVE)
2491   return _mm_cvtsi32_si128(a);
2492 #else
2493   simde__m128i_private r_;
2494 
2495 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2496   r_.neon_i32 = vsetq_lane_s32(a, vdupq_n_s32(0), 0);
2497 #else
2498   r_.i32[0] = a;
2499   r_.i32[1] = 0;
2500   r_.i32[2] = 0;
2501   r_.i32[3] = 0;
2502 #endif
2503 
2504   return simde__m128i_from_private(r_);
2505 #endif
2506 }
2507 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2508 #  define _mm_cvtsi32_si128(a) simde_mm_cvtsi32_si128(a)
2509 #endif
2510 
2511 SIMDE_FUNCTION_ATTRIBUTES
2512 simde__m128d
2513 simde_mm_cvtsi64_sd (simde__m128d a, int64_t b) {
2514 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
2515   #if !defined(__PGI)
2516     return _mm_cvtsi64_sd(a, b);
2517   #else
2518     return _mm_cvtsi64x_sd(a, b);
2519   #endif
2520 #else
2521   simde__m128d_private
2522     r_,
2523     a_ = simde__m128d_to_private(a);
2524 
2525 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2526   r_.neon_f64 = vsetq_lane_f64(HEDLEY_STATIC_CAST(float64_t, b), a_.neon_f64, 0);
2527 #else
2528   r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b);
2529   r_.f64[1] = a_.f64[1];
2530 #endif
2531 
2532   return simde__m128d_from_private(r_);
2533 #endif
2534 }
2535 #define simde_mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64_sd(a, b)
2536 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2537 #  define _mm_cvtsi64_sd(a, b) simde_mm_cvtsi64_sd(a, b)
2538 #  define _mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64x_sd(a, b)
2539 #endif
2540 
2541 SIMDE_FUNCTION_ATTRIBUTES
2542 simde__m128i
2543 simde_mm_cvtsi64_si128 (int64_t a) {
2544 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
2545   #if !defined(__PGI)
2546     return _mm_cvtsi64_si128(a);
2547   #else
2548     return _mm_cvtsi64x_si128(a);
2549   #endif
2550 #else
2551   simde__m128i_private r_;
2552 
2553   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2554     r_.neon_i64 = vsetq_lane_s64(a, vdupq_n_s64(0), 0);
2555   #else
2556     r_.i64[0] = a;
2557     r_.i64[1] = 0;
2558   #endif
2559 
2560   return simde__m128i_from_private(r_);
2561 #endif
2562 }
2563 #define simde_mm_cvtsi64x_si128(a) simde_mm_cvtsi64_si128(a)
2564 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2565 #  define _mm_cvtsi64_si128(a) simde_mm_cvtsi64_si128(a)
2566 #  define _mm_cvtsi64x_si128(a) simde_mm_cvtsi64x_si128(a)
2567 #endif
2568 
2569 SIMDE_FUNCTION_ATTRIBUTES
2570 simde__m128d
2571 simde_mm_cvtss_sd (simde__m128d a, simde__m128 b) {
2572 #if defined(SIMDE_X86_SSE2_NATIVE)
2573   return _mm_cvtss_sd(a, b);
2574 #else
2575   simde__m128d_private
2576     a_ = simde__m128d_to_private(a);
2577   simde__m128_private b_ = simde__m128_to_private(b);
2578 
2579   a_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b_.f32[0]);
2580 
2581   return simde__m128d_from_private(a_);
2582 #endif
2583 }
2584 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2585 #  define _mm_cvtss_sd(a, b) simde_mm_cvtss_sd(a, b)
2586 #endif
2587 
2588 SIMDE_FUNCTION_ATTRIBUTES
2589 simde__m128i
2590 simde_mm_cvttpd_epi32 (simde__m128d a) {
2591 #if defined(SIMDE_X86_SSE2_NATIVE)
2592   return _mm_cvttpd_epi32(a);
2593 #else
2594   simde__m128i_private r_;
2595   simde__m128d_private a_ = simde__m128d_to_private(a);
2596 
2597   for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
2598     r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f64[i]);
2599   }
2600   simde_memset(&(r_.m64_private[1]), 0, sizeof(r_.m64_private[1]));
2601 
2602   return simde__m128i_from_private(r_);
2603 #endif
2604 }
2605 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2606 #  define _mm_cvttpd_epi32(a) simde_mm_cvttpd_epi32(a)
2607 #endif
2608 
2609 SIMDE_FUNCTION_ATTRIBUTES
2610 simde__m64
2611 simde_mm_cvttpd_pi32 (simde__m128d a) {
2612 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2613   return _mm_cvttpd_pi32(a);
2614 #else
2615   simde__m64_private r_;
2616   simde__m128d_private a_ = simde__m128d_to_private(a);
2617 
2618 #if defined(SIMDE_CONVERT_VECTOR_)
2619   SIMDE_CONVERT_VECTOR_(r_.i32, a_.f64);
2620 #else
2621   for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
2622     r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f64[i]);
2623   }
2624 #endif
2625 
2626   return simde__m64_from_private(r_);
2627 #endif
2628 }
2629 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2630 #  define _mm_cvttpd_pi32(a) simde_mm_cvttpd_pi32(a)
2631 #endif
2632 
2633 SIMDE_FUNCTION_ATTRIBUTES
2634 simde__m128i
2635 simde_mm_cvttps_epi32 (simde__m128 a) {
2636 #if defined(SIMDE_X86_SSE2_NATIVE)
2637   return _mm_cvttps_epi32(a);
2638 #else
2639   simde__m128i_private r_;
2640   simde__m128_private a_ = simde__m128_to_private(a);
2641 
2642 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2643   r_.neon_i32 = vcvtq_s32_f32(a_.neon_f32);
2644 #elif defined(SIMDE_CONVERT_VECTOR_)
2645   SIMDE_CONVERT_VECTOR_(r_.i32, a_.f32);
2646 #else
2647   for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
2648     r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f32[i]);
2649   }
2650 #endif
2651 
2652   return simde__m128i_from_private(r_);
2653 #endif
2654 }
2655 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2656 #  define _mm_cvttps_epi32(a) simde_mm_cvttps_epi32(a)
2657 #endif
2658 
2659 SIMDE_FUNCTION_ATTRIBUTES
2660 int32_t
2661 simde_mm_cvttsd_si32 (simde__m128d a) {
2662 #if defined(SIMDE_X86_SSE2_NATIVE)
2663   return _mm_cvttsd_si32(a);
2664 #else
2665   simde__m128d_private a_ = simde__m128d_to_private(a);
2666   return SIMDE_CONVERT_FTOI(int32_t, a_.f64[0]);
2667 #endif
2668 }
2669 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2670 #  define _mm_cvttsd_si32(a) simde_mm_cvttsd_si32(a)
2671 #endif
2672 
2673 SIMDE_FUNCTION_ATTRIBUTES
2674 int64_t
2675 simde_mm_cvttsd_si64 (simde__m128d a) {
2676 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
2677   #if !defined(__PGI)
2678     return _mm_cvttsd_si64(a);
2679   #else
2680     return _mm_cvttsd_si64x(a);
2681   #endif
2682 #else
2683   simde__m128d_private a_ = simde__m128d_to_private(a);
2684   return SIMDE_CONVERT_FTOI(int64_t, a_.f64[0]);
2685 #endif
2686 }
2687 #define simde_mm_cvttsd_si64x(a) simde_mm_cvttsd_si64(a)
2688 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2689 #  define _mm_cvttsd_si64(a) simde_mm_cvttsd_si64(a)
2690 #  define _mm_cvttsd_si64x(a) simde_mm_cvttsd_si64x(a)
2691 #endif
2692 
2693 SIMDE_FUNCTION_ATTRIBUTES
2694 simde__m128d
2695 simde_mm_div_pd (simde__m128d a, simde__m128d b) {
2696 #if defined(SIMDE_X86_SSE2_NATIVE)
2697   return _mm_div_pd(a, b);
2698 #else
2699   simde__m128d_private
2700     r_,
2701     a_ = simde__m128d_to_private(a),
2702     b_ = simde__m128d_to_private(b);
2703 
2704 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2705   r_.f64 = a_.f64 / b_.f64;
2706 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
2707   r_.wasm_v128 = wasm_f64x2_div(a_.wasm_v128, b_.wasm_v128);
2708 #else
2709   SIMDE_VECTORIZE
2710   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
2711     r_.f64[i] = a_.f64[i] / b_.f64[i];
2712   }
2713 #endif
2714 
2715   return simde__m128d_from_private(r_);
2716 #endif
2717 }
2718 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2719 #  define _mm_div_pd(a, b) simde_mm_div_pd(a, b)
2720 #endif
2721 
2722 SIMDE_FUNCTION_ATTRIBUTES
2723 simde__m128d
2724 simde_mm_div_sd (simde__m128d a, simde__m128d b) {
2725 #if defined(SIMDE_X86_SSE2_NATIVE)
2726   return _mm_div_sd(a, b);
2727 #elif defined(SIMDE_ASSUME_VECTORIZATION)
2728   return simde_mm_move_sd(a, simde_mm_div_pd(a, b));
2729 #else
2730   simde__m128d_private
2731     r_,
2732     a_ = simde__m128d_to_private(a),
2733     b_ = simde__m128d_to_private(b);
2734 
2735   r_.f64[0] = a_.f64[0] / b_.f64[0];
2736   r_.f64[1] = a_.f64[1];
2737 
2738   return simde__m128d_from_private(r_);
2739 #endif
2740 }
2741 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2742 #  define _mm_div_sd(a, b) simde_mm_div_sd(a, b)
2743 #endif
2744 
2745 SIMDE_FUNCTION_ATTRIBUTES
2746 int32_t
2747 simde_mm_extract_epi16 (simde__m128i a, const int imm8)
2748     SIMDE_REQUIRE_RANGE(imm8, 0, 7)  {
2749   uint16_t r;
2750   simde__m128i_private a_ = simde__m128i_to_private(a);
2751 
2752   #if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
2753     #if defined(SIMDE_BUG_GCC_95227)
2754       (void) a_;
2755       (void) imm8;
2756     #endif
2757     r = HEDLEY_STATIC_CAST(uint16_t, vec_extract(a_.altivec_i16, imm8));
2758   #else
2759     r = a_.u16[imm8 & 7];
2760   #endif
2761 
2762   return  HEDLEY_STATIC_CAST(int32_t, r);
2763 }
2764 #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,6,0))
2765 #  define simde_mm_extract_epi16(a, imm8) _mm_extract_epi16(a, imm8)
2766 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2767 #  define simde_mm_extract_epi16(a, imm8) (HEDLEY_STATIC_CAST(int32_t, vgetq_lane_s16(simde__m128i_to_private(a).neon_i16, (imm8))) & (INT32_C(0x0000ffff)))
2768 #endif
2769 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2770 #  define _mm_extract_epi16(a, imm8) simde_mm_extract_epi16(a, imm8)
2771 #endif
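/* _mm_extract_epi16 zero-extends the selected 16-bit lane into the 32-bit
 * result, which is why the lane is read as uint16_t (or masked with 0xffff
 * on the NEON path) before being widened to int32_t. */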
2772 
2773 SIMDE_FUNCTION_ATTRIBUTES
2774 simde__m128i
2775 simde_mm_insert_epi16 (simde__m128i a, int16_t i, const int imm8)
2776     SIMDE_REQUIRE_RANGE(imm8, 0, 7)  {
2777   simde__m128i_private a_ = simde__m128i_to_private(a);
2778   a_.i16[imm8 & 7] = i;
2779   return simde__m128i_from_private(a_);
2780 }
2781 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
2782 #  define simde_mm_insert_epi16(a, i, imm8) _mm_insert_epi16((a), (i), (imm8))
2783 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2784 #  define simde_mm_insert_epi16(a, i, imm8) simde__m128i_from_neon_i16(vsetq_lane_s16((i), simde__m128i_to_neon_i16(a), (imm8)))
2785 #endif
2786 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2787 #  define _mm_insert_epi16(a, i, imm8) simde_mm_insert_epi16(a, i, imm8)
2788 #endif
2789 
2790 SIMDE_FUNCTION_ATTRIBUTES
2791 simde__m128d
2792 simde_mm_load_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) {
2793   simde_assert_aligned(16, mem_addr);
2794 
2795   #if defined(SIMDE_X86_SSE2_NATIVE)
2796     return _mm_load_pd(mem_addr);
2797   #else
2798     simde__m128d_private r_;
2799 
2800     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2801       r_.neon_u32 = vld1q_u32(HEDLEY_REINTERPRET_CAST(uint32_t const*, mem_addr));
2802     #else
2803       r_ = *SIMDE_ALIGN_CAST(simde__m128d_private const*, mem_addr);
2804     #endif
2805 
2806     return simde__m128d_from_private(r_);
2807   #endif
2808 }
2809 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2810 #  define _mm_load_pd(mem_addr) simde_mm_load_pd(mem_addr)
2811 #endif
2812 
2813 SIMDE_FUNCTION_ATTRIBUTES
2814 simde__m128d
2815 simde_mm_load_pd1 (simde_float64 const* mem_addr) {
2816 #if defined(SIMDE_X86_SSE2_NATIVE)
2817   return _mm_load1_pd(mem_addr);
2818 #else
2819   simde__m128d_private r_;
2820 
2821   r_.f64[0] = *mem_addr;
2822   r_.f64[1] = *mem_addr;
2823 
2824   return simde__m128d_from_private(r_);
2825 #endif
2826 }
2827 #define simde_mm_load1_pd(mem_addr) simde_mm_load_pd1(mem_addr)
2828 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2829 #  define _mm_load_pd1(mem_addr) simde_mm_load_pd1(mem_addr)
2830 #  define _mm_load1_pd(mem_addr) simde_mm_load1_pd(mem_addr)
2831 #endif
2832 
2833 SIMDE_FUNCTION_ATTRIBUTES
2834 simde__m128d
2835 simde_mm_load_sd (simde_float64 const* mem_addr) {
2836 #if defined(SIMDE_X86_SSE2_NATIVE)
2837   return _mm_load_sd(mem_addr);
2838 #else
2839   simde__m128d_private r_;
2840 
2841 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2842   r_.neon_f64 = vsetq_lane_f64(*mem_addr, vdupq_n_f64(0), 0);
2843 #else
2844   r_.f64[0] = *mem_addr;
2845   r_.u64[1] = UINT64_C(0);
2846 #endif
2847 
2848   return simde__m128d_from_private(r_);
2849 #endif
2850 }
2851 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2852 #  define _mm_load_sd(mem_addr) simde_mm_load_sd(mem_addr)
2853 #endif
2854 
2855 SIMDE_FUNCTION_ATTRIBUTES
2856 simde__m128i
2857 simde_mm_load_si128 (simde__m128i const* mem_addr) {
2858   simde_assert_aligned(16, mem_addr);
2859 
2860   #if defined(SIMDE_X86_SSE2_NATIVE)
2861     return _mm_load_si128(HEDLEY_REINTERPRET_CAST(__m128i const*, mem_addr));
2862   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
2863     simde__m128i_private r_;
2864 
2865     #if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
2866       r_.altivec_i32 = vec_ld(0, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(int) const*, mem_addr));
2867     #else
2868       r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr));
2869     #endif
2870 
2871     return simde__m128i_from_private(r_);
2872   #else
2873     return *mem_addr;
2874   #endif
2875 }
2876 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2877 #  define _mm_load_si128(mem_addr) simde_mm_load_si128(mem_addr)
2878 #endif
2879 
2880 SIMDE_FUNCTION_ATTRIBUTES
2881 simde__m128d
2882 simde_mm_loadh_pd (simde__m128d a, simde_float64 const* mem_addr) {
2883 #if defined(SIMDE_X86_SSE2_NATIVE)
2884   return _mm_loadh_pd(a, mem_addr);
2885 #else
2886   simde__m128d_private
2887     r_,
2888     a_ = simde__m128d_to_private(a);
2889   simde_float64 t;
2890 
2891   simde_memcpy(&t, mem_addr, sizeof(t));
2892   r_.f64[0] = a_.f64[0];
2893   r_.f64[1] = t;
2894 
2895   return simde__m128d_from_private(r_);
2896 #endif
2897 }
2898 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2899 #  define _mm_loadh_pd(a, mem_addr) simde_mm_loadh_pd(a, mem_addr)
2900 #endif
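/* The temporary plus simde_memcpy above is deliberate: mem_addr need not be
 * suitably aligned for a direct simde_float64 load, and copying through a
 * temporary performs the unaligned read without undefined behavior. */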
2901 
2902 SIMDE_FUNCTION_ATTRIBUTES
2903 simde__m128i
2904 simde_mm_loadl_epi64 (simde__m128i const* mem_addr) {
2905 #if defined(SIMDE_X86_SSE2_NATIVE)
2906   return _mm_loadl_epi64(mem_addr);
2907 #else
2908   simde__m128i_private r_;
2909 
2910   int64_t value;
2911   simde_memcpy(&value, mem_addr, sizeof(value));
2912 
2913   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2914     r_.neon_i64 = vcombine_s64(vld1_s64(HEDLEY_REINTERPRET_CAST(int64_t const *, mem_addr)), vdup_n_s64(0));
2915   #else
2916     r_.i64[0] = value;
2917     r_.i64[1] = 0;
2918   #endif
2919 
2920   return simde__m128i_from_private(r_);
2921 #endif
2922 }
2923 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2924 #  define _mm_loadl_epi64(mem_addr) simde_mm_loadl_epi64(mem_addr)
2925 #endif
2926 
2927 SIMDE_FUNCTION_ATTRIBUTES
2928 simde__m128d
2929 simde_mm_loadl_pd (simde__m128d a, simde_float64 const* mem_addr) {
2930 #if defined(SIMDE_X86_SSE2_NATIVE)
2931   return _mm_loadl_pd(a, mem_addr);
2932 #else
2933   simde__m128d_private
2934     r_,
2935     a_ = simde__m128d_to_private(a);
2936 
2937   r_.f64[0] = *mem_addr;
2938   r_.u64[1] = a_.u64[1];
2939 
2940   return simde__m128d_from_private(r_);
2941 #endif
2942 }
2943 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2944 #  define _mm_loadl_pd(a, mem_addr) simde_mm_loadl_pd(a, mem_addr)
2945 #endif
2946 
2947 SIMDE_FUNCTION_ATTRIBUTES
2948 simde__m128d
2949 simde_mm_loadr_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) {
2950   simde_assert_aligned(16, mem_addr);
2951 
2952 #if defined(SIMDE_X86_SSE2_NATIVE)
2953   return _mm_loadr_pd(mem_addr);
2954 #else
2955   simde__m128d_private r_;
2956 
2957   r_.f64[0] = mem_addr[1];
2958   r_.f64[1] = mem_addr[0];
2959 
2960   return simde__m128d_from_private(r_);
2961 #endif
2962 }
2963 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2964 #  define _mm_loadr_pd(mem_addr) simde_mm_loadr_pd(mem_addr)
2965 #endif
2966 
2967 SIMDE_FUNCTION_ATTRIBUTES
2968 simde__m128d
2969 simde_mm_loadu_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) {
2970 #if defined(SIMDE_X86_SSE2_NATIVE)
2971   return _mm_loadu_pd(mem_addr);
2972 #else
2973   simde__m128d_private r_;
2974 
2975   simde_memcpy(&r_, mem_addr, sizeof(r_));
2976 
2977   return simde__m128d_from_private(r_);
2978 #endif
2979 }
2980 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2981 #  define _mm_loadu_pd(mem_addr) simde_mm_loadu_pd(mem_addr)
2982 #endif
2983 
2984 SIMDE_FUNCTION_ATTRIBUTES
2985 simde__m128i
2986 simde_x_mm_loadu_epi8(int8_t const* mem_addr) {
2987   #if defined(SIMDE_X86_SSE2_NATIVE)
2988     return _mm_loadu_si128(SIMDE_ALIGN_CAST(simde__m128i const*, mem_addr));
2989   #else
2990     simde__m128i_private r_;
2991 
2992     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2993       r_.neon_i8 = vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr));
2994     #else
2995       simde_memcpy(&r_, mem_addr, sizeof(r_));
2996     #endif
2997 
2998     return simde__m128i_from_private(r_);
2999   #endif
3000 }
3001 
3002 SIMDE_FUNCTION_ATTRIBUTES
3003 simde__m128i
3004 simde_x_mm_loadu_epi16(int16_t const* mem_addr) {
3005   #if defined(SIMDE_X86_SSE2_NATIVE)
3006     return _mm_loadu_si128(SIMDE_ALIGN_CAST(simde__m128i const*, mem_addr));
3007   #else
3008     simde__m128i_private r_;
3009 
3010     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3011       r_.neon_i16 = vld1q_s16(HEDLEY_REINTERPRET_CAST(int16_t const*, mem_addr));
3012     #else
3013       simde_memcpy(&r_, mem_addr, sizeof(r_));
3014     #endif
3015 
3016     return simde__m128i_from_private(r_);
3017   #endif
3018 }
3019 
3020 SIMDE_FUNCTION_ATTRIBUTES
3021 simde__m128i
3022 simde_x_mm_loadu_epi32(int32_t const* mem_addr) {
3023   #if defined(SIMDE_X86_SSE2_NATIVE)
3024     return _mm_loadu_si128(SIMDE_ALIGN_CAST(simde__m128i const*, mem_addr));
3025   #else
3026     simde__m128i_private r_;
3027 
3028     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3029       r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr));
3030     #else
3031       simde_memcpy(&r_, mem_addr, sizeof(r_));
3032     #endif
3033 
3034     return simde__m128i_from_private(r_);
3035   #endif
3036 }
3037 
3038 SIMDE_FUNCTION_ATTRIBUTES
3039 simde__m128i
3040 simde_x_mm_loadu_epi64(int64_t const* mem_addr) {
3041   #if defined(SIMDE_X86_SSE2_NATIVE)
3042     return _mm_loadu_si128(SIMDE_ALIGN_CAST(simde__m128i const*, mem_addr));
3043   #else
3044     simde__m128i_private r_;
3045 
3046     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3047       r_.neon_i64 = vld1q_s64(HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr));
3048     #else
3049       simde_memcpy(&r_, mem_addr, sizeof(r_));
3050     #endif
3051 
3052     return simde__m128i_from_private(r_);
3053   #endif
3054 }
3055 
3056 SIMDE_FUNCTION_ATTRIBUTES
3057 simde__m128i
3058 simde_mm_loadu_si128 (void const* mem_addr) {
3059   #if defined(SIMDE_X86_SSE2_NATIVE)
3060     return _mm_loadu_si128(HEDLEY_STATIC_CAST(__m128i const*, mem_addr));
3061   #else
3062     simde__m128i_private r_;
3063 
3064     #if HEDLEY_GNUC_HAS_ATTRIBUTE(may_alias,3,3,0)
3065       HEDLEY_DIAGNOSTIC_PUSH
3066       SIMDE_DIAGNOSTIC_DISABLE_PACKED_
3067       struct simde_mm_loadu_si128_s {
3068         __typeof__(r_) v;
3069       } __attribute__((__packed__, __may_alias__));
3070       r_ = HEDLEY_REINTERPRET_CAST(const struct simde_mm_loadu_si128_s *, mem_addr)->v;
3071       HEDLEY_DIAGNOSTIC_POP
3072     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3073       /* Note that this is a lower priority than the struct above since
3074        * clang assumes mem_addr is aligned (since it is a __m128i*). */
3075       r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr));
3076     #else
3077       simde_memcpy(&r_, mem_addr, sizeof(r_));
3078     #endif
3079 
3080     return simde__m128i_from_private(r_);
3081   #endif
3082 }
3083 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3084 #  define _mm_loadu_si128(mem_addr) simde_mm_loadu_si128(mem_addr)
3085 #endif
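
/* Usage sketch (illustrative; buf and v are hypothetical names): unlike
 * an aligned load, this is safe for any address, e.g. a load from an
 * odd offset into a byte buffer:
 *
 *   uint8_t buf[32] = { 0 };
 *   simde__m128i v = simde_mm_loadu_si128(buf + 1);  // no 16-byte alignment required
 */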
3086 
3087 SIMDE_FUNCTION_ATTRIBUTES
3088 simde__m128i
3089 simde_mm_madd_epi16 (simde__m128i a, simde__m128i b) {
3090 #if defined(SIMDE_X86_SSE2_NATIVE)
3091   return _mm_madd_epi16(a, b);
3092 #else
3093   simde__m128i_private
3094     r_,
3095     a_ = simde__m128i_to_private(a),
3096     b_ = simde__m128i_to_private(b);
3097 
3098   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3099     int32x4_t pl = vmull_s16(vget_low_s16(a_.neon_i16),  vget_low_s16(b_.neon_i16));
3100     int32x4_t ph = vmull_s16(vget_high_s16(a_.neon_i16), vget_high_s16(b_.neon_i16));
3101     int32x2_t rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
3102     int32x2_t rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
3103     r_.neon_i32 = vcombine_s32(rl, rh);
3104   #else
3105     SIMDE_VECTORIZE
3106     for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i += 2) {
3107       r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) + (a_.i16[i + 1] * b_.i16[i + 1]);
3108     }
3109   #endif
3110 
3111   return simde__m128i_from_private(r_);
3112 #endif
3113 }
3114 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3115 #  define _mm_madd_epi16(a, b) simde_mm_madd_epi16(a, b)
3116 #endif
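
/* Worked example (illustrative): each 32-bit result lane is the sum of
 * two adjacent signed 16-bit products. With a.i16 = { 1, 2, 3, 4, ... }
 * and b.i16 = { 10, 20, 30, 40, ... }:
 *   r.i32[0] = 1*10 + 2*20 =  50
 *   r.i32[1] = 3*30 + 4*40 = 250 */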
3117 
3118 SIMDE_FUNCTION_ATTRIBUTES
3119 void
3120 simde_mm_maskmoveu_si128 (simde__m128i a, simde__m128i mask, int8_t mem_addr[HEDLEY_ARRAY_PARAM(16)]) {
3121 #if defined(SIMDE_X86_SSE2_NATIVE)
3122   _mm_maskmoveu_si128(a, mask, HEDLEY_REINTERPRET_CAST(char*, mem_addr));
3123 #else
3124   simde__m128i_private
3125     a_ = simde__m128i_to_private(a),
3126     mask_ = simde__m128i_to_private(mask);
3127 
3128   for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++) {
3129     if (mask_.u8[i] & 0x80) {
3130       mem_addr[i] = a_.i8[i];
3131     }
3132   }
3133 #endif
3134 }
3135 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3136 #  define _mm_maskmoveu_si128(a, mask, mem_addr) simde_mm_maskmoveu_si128((a), (mask), SIMDE_CHECKED_REINTERPRET_CAST(int8_t*, char*, (mem_addr)))
3137 #endif
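
/* Worked example (illustrative): only bytes whose mask lane has its high
 * bit set are stored; the rest of mem_addr is left untouched. With
 * mask.u8 = { 0x80, 0x00, 0xFF, ... }, bytes 0 and 2 of a are written
 * and byte 1 is skipped. */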
3138 
3139 SIMDE_FUNCTION_ATTRIBUTES
3140 int32_t
3141 simde_mm_movemask_epi8 (simde__m128i a) {
3142 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__INTEL_COMPILER)
3143   /* ICC has trouble with _mm_movemask_epi8 at -O2 and above: */
3144   return _mm_movemask_epi8(a);
3145 #else
3146   int32_t r = 0;
3147   simde__m128i_private a_ = simde__m128i_to_private(a);
3148 
3149 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3150     // Use increasingly wide shifts+adds to collect the sign bits
3151     // together.
3152     // Since the widening shifts would be rather confusing to follow in little endian, everything
3153     // will be illustrated in big endian order instead. This has a different result - the bits
3154     // would actually be reversed on a big endian machine.
3155 
3156     // Starting input (only half the elements are shown):
3157     // 89 ff 1d c0 00 10 99 33
3158     uint8x16_t input = a_.neon_u8;
3159 
3160     // Shift out everything but the sign bits with an unsigned shift right.
3161     //
3162     // Bytes of the vector:
3163     // 89 ff 1d c0 00 10 99 33
3164     // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
3165     //  |  |  |  |  |  |  |  |
3166     // 01 01 00 01 00 00 01 00
3167     //
3168     // Bits of first important lane(s):
3169     // 10001001 (89)
3170     // \______
3171     //        |
3172     // 00000001 (01)
3173     uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
3174 
3175     // Merge the even lanes together with a 16-bit unsigned shift right + add.
3176     // 'xx' represents garbage data which will be ignored in the final result.
3177     // In the important bytes, the add functions like a binary OR.
3178     //
3179     // 01 01 00 01 00 00 01 00
3180     //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
3181     //    \|    \|    \|    \|
3182     // xx 03 xx 01 xx 00 xx 02
3183     //
3184     // 00000001 00000001 (01 01)
3185     //        \_______ |
3186     //                \|
3187     // xxxxxxxx xxxxxx11 (xx 03)
3188     uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
3189 
3190     // Repeat with a wider 32-bit shift + add.
3191     // xx 03 xx 01 xx 00 xx 02
3192     //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >> 14))
3193     //          \|          \|
3194     // xx xx xx 0d xx xx xx 02
3195     //
3196     // 00000011 00000001 (03 01)
3197     //        \\_____ ||
3198     //         '----.\||
3199     // xxxxxxxx xxxx1101 (xx 0d)
3200     uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
3201 
3202     // Last, an even wider 64-bit shift + add to get our result in the low 8 bit lanes.
3203     // xx xx xx 0d xx xx xx 02
3204     //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >> 28))
3205     //                      \|
3206     // xx xx xx xx xx xx xx d2
3207     //
3208     // 00001101 00000010 (0d 02)
3209     //     \   \___ |  |
3210     //      '---.  \|  |
3211     // xxxxxxxx 11010010 (xx d2)
3212     uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
3213 
3214     // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
3215     // xx xx xx xx xx xx xx d2
3216     //                      ||  return paired64[0]
3217     //                      d2
3218     // Note: Little endian would return the correct value 4b (01001011) instead.
3219     r = vgetq_lane_u8(paired64, 0) | (HEDLEY_STATIC_CAST(int32_t, vgetq_lane_u8(paired64, 8)) << 8);
3220 #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(HEDLEY_IBM_VERSION)
3221   static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) perm = { 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0 };
3222   r = HEDLEY_STATIC_CAST(int32_t, vec_extract(vec_vbpermq(a_.altivec_u8, perm), 1));
3223 #else
3224   SIMDE_VECTORIZE_REDUCTION(|:r)
3225   for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) {
3226     r |= (a_.u8[15 - i] >> 7) << (15 - i);
3227   }
3228 #endif
3229 
3230   return r;
3231 #endif
3232 }
3233 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3234 #  define _mm_movemask_epi8(a) simde_mm_movemask_epi8(a)
3235 #endif
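
/* Worked example (illustrative): bit i of the result is the sign bit of
 * byte i. With a.u8 = { 0x80, 0x01, 0xFF, 0x00, 0, ... } the result is
 * 0b0101 = 5, since only bytes 0 and 2 have their high bit set. */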
3236 
3237 SIMDE_FUNCTION_ATTRIBUTES
3238 int32_t
3239 simde_mm_movemask_pd (simde__m128d a) {
3240 #if defined(SIMDE_X86_SSE2_NATIVE)
3241   return _mm_movemask_pd(a);
3242 #else
3243   int32_t r = 0;
3244   simde__m128d_private a_ = simde__m128d_to_private(a);
3245 
3246   SIMDE_VECTORIZE
3247   for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
3248     r |= (a_.u64[i] >> 63) << i;
3249   }
3250 
3251   return r;
3252 #endif
3253 }
3254 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3255 #  define _mm_movemask_pd(a) simde_mm_movemask_pd(a)
3256 #endif
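
/* Worked example (illustrative): with a = { -1.0, 2.0 } the result is 1,
 * since only the low lane (bit 0) has its sign bit set. */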
3257 
3258 SIMDE_FUNCTION_ATTRIBUTES
3259 simde__m64
3260 simde_mm_movepi64_pi64 (simde__m128i a) {
3261 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
3262   return _mm_movepi64_pi64(a);
3263 #else
3264   simde__m64_private r_;
3265   simde__m128i_private a_ = simde__m128i_to_private(a);
3266 
3267   r_.i64[0] = a_.i64[0];
3268 
3269   return simde__m64_from_private(r_);
3270 #endif
3271 }
3272 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3273 #  define _mm_movepi64_pi64(a) simde_mm_movepi64_pi64(a)
3274 #endif
3275 
3276 SIMDE_FUNCTION_ATTRIBUTES
3277 simde__m128i
3278 simde_mm_movpi64_epi64 (simde__m64 a) {
3279 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
3280   return _mm_movpi64_epi64(a);
3281 #else
3282   simde__m128i_private r_;
3283   simde__m64_private a_ = simde__m64_to_private(a);
3284 
3285   r_.i64[0] = a_.i64[0];
3286   r_.i64[1] = 0;
3287 
3288   return simde__m128i_from_private(r_);
3289 #endif
3290 }
3291 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3292 #  define _mm_movpi64_epi64(a) simde_mm_movpi64_epi64(a)
3293 #endif
3294 
3295 SIMDE_FUNCTION_ATTRIBUTES
3296 simde__m128i
3297 simde_mm_min_epi16 (simde__m128i a, simde__m128i b) {
3298 #if defined(SIMDE_X86_SSE2_NATIVE)
3299   return _mm_min_epi16(a, b);
3300 #else
3301   simde__m128i_private
3302     r_,
3303     a_ = simde__m128i_to_private(a),
3304     b_ = simde__m128i_to_private(b);
3305 
3306   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3307     r_.neon_i16 = vminq_s16(a_.neon_i16, b_.neon_i16);
3308   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
3309     r_.altivec_i16 = vec_min(a_.altivec_i16, b_.altivec_i16);
3310   #else
3311     SIMDE_VECTORIZE
3312     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
3313       r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i];
3314     }
3315   #endif
3316 
3317   return simde__m128i_from_private(r_);
3318 #endif
3319 }
3320 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3321 #  define _mm_min_epi16(a, b) simde_mm_min_epi16(a, b)
3322 #endif
3323 
3324 SIMDE_FUNCTION_ATTRIBUTES
3325 simde__m128i
3326 simde_mm_min_epu8 (simde__m128i a, simde__m128i b) {
3327 #if defined(SIMDE_X86_SSE2_NATIVE)
3328   return _mm_min_epu8(a, b);
3329 #else
3330   simde__m128i_private
3331     r_,
3332     a_ = simde__m128i_to_private(a),
3333     b_ = simde__m128i_to_private(b);
3334 
3335   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3336     r_.neon_u8 = vminq_u8(a_.neon_u8, b_.neon_u8);
3337   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
3338     r_.altivec_u8 = vec_min(a_.altivec_u8, b_.altivec_u8);
3339   #else
3340     SIMDE_VECTORIZE
3341     for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
3342       r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i];
3343     }
3344   #endif
3345 
3346   return simde__m128i_from_private(r_);
3347 #endif
3348 }
3349 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3350 #  define _mm_min_epu8(a, b) simde_mm_min_epu8(a, b)
3351 #endif
3352 
3353 SIMDE_FUNCTION_ATTRIBUTES
3354 simde__m128d
3355 simde_mm_min_pd (simde__m128d a, simde__m128d b) {
3356 #if defined(SIMDE_X86_SSE2_NATIVE)
3357   return _mm_min_pd(a, b);
3358 #else
3359   simde__m128d_private
3360     r_,
3361     a_ = simde__m128d_to_private(a),
3362     b_ = simde__m128d_to_private(b);
3363 
3364   #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
3365     r_.altivec_f64 = vec_min(a_.altivec_f64, b_.altivec_f64);
3366   #else
3367     SIMDE_VECTORIZE
3368     for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
3369       r_.f64[i] = (a_.f64[i] < b_.f64[i]) ? a_.f64[i] : b_.f64[i];
3370     }
3371   #endif
3372 
3373   return simde__m128d_from_private(r_);
3374 #endif
3375 }
3376 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3377 #  define _mm_min_pd(a, b) simde_mm_min_pd(a, b)
3378 #endif
3379 
3380 SIMDE_FUNCTION_ATTRIBUTES
3381 simde__m128d
3382 simde_mm_min_sd (simde__m128d a, simde__m128d b) {
3383 #if defined(SIMDE_X86_SSE2_NATIVE)
3384   return _mm_min_sd(a, b);
3385 #elif defined(SIMDE_ASSUME_VECTORIZATION)
3386   return simde_mm_move_sd(a, simde_mm_min_pd(a, b));
3387 #else
3388   simde__m128d_private
3389     r_,
3390     a_ = simde__m128d_to_private(a),
3391     b_ = simde__m128d_to_private(b);
3392 
3393   r_.f64[0] = (a_.f64[0] < b_.f64[0]) ? a_.f64[0] : b_.f64[0];
3394   r_.f64[1] = a_.f64[1];
3395 
3396   return simde__m128d_from_private(r_);
3397 #endif
3398 }
3399 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3400 #  define _mm_min_sd(a, b) simde_mm_min_sd(a, b)
3401 #endif
3402 
3403 SIMDE_FUNCTION_ATTRIBUTES
3404 simde__m128i
3405 simde_mm_max_epi16 (simde__m128i a, simde__m128i b) {
3406   #if defined(SIMDE_X86_SSE2_NATIVE)
3407     return _mm_max_epi16(a, b);
3408   #else
3409     simde__m128i_private
3410       r_,
3411       a_ = simde__m128i_to_private(a),
3412       b_ = simde__m128i_to_private(b);
3413 
3414     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3415       r_.neon_i16 = vmaxq_s16(a_.neon_i16, b_.neon_i16);
3416     #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
3417       r_.altivec_i16 = vec_max(a_.altivec_i16, b_.altivec_i16);
3418     #else
3419       SIMDE_VECTORIZE
3420       for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
3421         r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? a_.i16[i] : b_.i16[i];
3422       }
3423     #endif
3424 
3425     return simde__m128i_from_private(r_);
3426   #endif
3427 }
3428 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3429 #  define _mm_max_epi16(a, b) simde_mm_max_epi16(a, b)
3430 #endif
3431 
3432 SIMDE_FUNCTION_ATTRIBUTES
3433 simde__m128i
3434 simde_mm_max_epu8 (simde__m128i a, simde__m128i b) {
3435   #if defined(SIMDE_X86_SSE2_NATIVE)
3436     return _mm_max_epu8(a, b);
3437   #else
3438     simde__m128i_private
3439       r_,
3440       a_ = simde__m128i_to_private(a),
3441       b_ = simde__m128i_to_private(b);
3442 
3443     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3444       r_.neon_u8 = vmaxq_u8(a_.neon_u8, b_.neon_u8);
3445     #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
3446       r_.altivec_u8 = vec_max(a_.altivec_u8, b_.altivec_u8);
3447     #else
3448       SIMDE_VECTORIZE
3449       for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
3450         r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i];
3451       }
3452     #endif
3453 
3454     return simde__m128i_from_private(r_);
3455   #endif
3456 }
3457 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3458 #  define _mm_max_epu8(a, b) simde_mm_max_epu8(a, b)
3459 #endif
3460 
3461 SIMDE_FUNCTION_ATTRIBUTES
3462 simde__m128d
3463 simde_mm_max_pd (simde__m128d a, simde__m128d b) {
3464   #if defined(SIMDE_X86_SSE2_NATIVE)
3465     return _mm_max_pd(a, b);
3466   #else
3467     simde__m128d_private
3468       r_,
3469       a_ = simde__m128d_to_private(a),
3470       b_ = simde__m128d_to_private(b);
3471 
3472     #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
3473       r_.altivec_f64 = vec_max(a_.altivec_f64, b_.altivec_f64);
3474     #else
3475       SIMDE_VECTORIZE
3476       for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
3477         r_.f64[i] = (a_.f64[i] > b_.f64[i]) ? a_.f64[i] : b_.f64[i];
3478       }
3479     #endif
3480 
3481     return simde__m128d_from_private(r_);
3482   #endif
3483 }
3484 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3485 #  define _mm_max_pd(a, b) simde_mm_max_pd(a, b)
3486 #endif
3487 
3488 SIMDE_FUNCTION_ATTRIBUTES
3489 simde__m128d
3490 simde_mm_max_sd (simde__m128d a, simde__m128d b) {
3491 #if defined(SIMDE_X86_SSE2_NATIVE)
3492   return _mm_max_sd(a, b);
3493 #elif defined(SIMDE_ASSUME_VECTORIZATION)
3494   return simde_mm_move_sd(a, simde_mm_max_pd(a, b));
3495 #else
3496   simde__m128d_private
3497     r_,
3498     a_ = simde__m128d_to_private(a),
3499     b_ = simde__m128d_to_private(b);
3500 
3501   r_.f64[0] = (a_.f64[0] > b_.f64[0]) ? a_.f64[0] : b_.f64[0];
3502   r_.f64[1] = a_.f64[1];
3503 
3504   return simde__m128d_from_private(r_);
3505 #endif
3506 }
3507 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3508 #  define _mm_max_sd(a, b) simde_mm_max_sd(a, b)
3509 #endif
3510 
3511 SIMDE_FUNCTION_ATTRIBUTES
3512 simde__m128i
3513 simde_mm_move_epi64 (simde__m128i a) {
3514 #if defined(SIMDE_X86_SSE2_NATIVE)
3515   return _mm_move_epi64(a);
3516 #else
3517   simde__m128i_private
3518     r_,
3519     a_ = simde__m128i_to_private(a);
3520 
3521 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3522   r_.neon_i64 = vsetq_lane_s64(0, a_.neon_i64, 1);
3523 #else
3524   r_.i64[0] = a_.i64[0];
3525   r_.i64[1] = 0;
3526 #endif
3527 
3528   return simde__m128i_from_private(r_);
3529 #endif
3530 }
3531 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3532 #  define _mm_move_epi64(a) simde_mm_move_epi64(a)
3533 #endif
3534 
3535 SIMDE_FUNCTION_ATTRIBUTES
3536 simde__m128i
3537 simde_mm_mul_epu32 (simde__m128i a, simde__m128i b) {
3538 #if defined(SIMDE_X86_SSE2_NATIVE)
3539   return _mm_mul_epu32(a, b);
3540 #else
3541   simde__m128i_private
3542     r_,
3543     a_ = simde__m128i_to_private(a),
3544     b_ = simde__m128i_to_private(b);
3545 
3546   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3547     uint32x2_t a_lo = vmovn_u64(a_.neon_u64);
3548     uint32x2_t b_lo = vmovn_u64(b_.neon_u64);
3549     r_.neon_u64 = vmull_u32(a_lo, b_lo);
3550   #else
3551     SIMDE_VECTORIZE
3552     for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
3553       r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[i * 2]) * HEDLEY_STATIC_CAST(uint64_t, b_.u32[i * 2]);
3554     }
3555   #endif
3556 
3557   return simde__m128i_from_private(r_);
3558 #endif
3559 }
3560 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3561 #  define _mm_mul_epu32(a, b) simde_mm_mul_epu32(a, b)
3562 #endif
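
/* Worked example (illustrative): only the even-indexed 32-bit lanes are
 * multiplied, each producing a full 64-bit product. With
 * a.u32 = { 2, 9, 3, 9 } and b.u32 = { 5, 9, 7, 9 }:
 *   r.u64[0] = 2 * 5 = 10
 *   r.u64[1] = 3 * 7 = 21
 * (lanes 1 and 3 are ignored). */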
3563 
3564 SIMDE_FUNCTION_ATTRIBUTES
3565 simde__m128i
3566 simde_x_mm_mul_epi64 (simde__m128i a, simde__m128i b) {
3567   simde__m128i_private
3568     r_,
3569     a_ = simde__m128i_to_private(a),
3570     b_ = simde__m128i_to_private(b);
3571 
3572 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3573   r_.i64 = a_.i64 * b_.i64;
3574 #else
3575   SIMDE_VECTORIZE
3576   for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
3577     r_.i64[i] = a_.i64[i] * b_.i64[i];
3578   }
3579 #endif
3580 
3581   return simde__m128i_from_private(r_);
3582 }
3583 
3584 SIMDE_FUNCTION_ATTRIBUTES
3585 simde__m128i
3586 simde_x_mm_mod_epi64 (simde__m128i a, simde__m128i b) {
3587   simde__m128i_private
3588     r_,
3589     a_ = simde__m128i_to_private(a),
3590     b_ = simde__m128i_to_private(b);
3591 
3592 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3593   r_.i64 = a_.i64 % b_.i64;
3594 #else
3595   SIMDE_VECTORIZE
3596   for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
3597     r_.i64[i] = a_.i64[i] % b_.i64[i];
3598   }
3599 #endif
3600 
3601   return simde__m128i_from_private(r_);
3602 }
3603 
3604 SIMDE_FUNCTION_ATTRIBUTES
3605 simde__m128d
3606 simde_mm_mul_pd (simde__m128d a, simde__m128d b) {
3607 #if defined(SIMDE_X86_SSE2_NATIVE)
3608   return _mm_mul_pd(a, b);
3609 #else
3610   simde__m128d_private
3611     r_,
3612     a_ = simde__m128d_to_private(a),
3613     b_ = simde__m128d_to_private(b);
3614 
3615 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3616   r_.f64 = a_.f64 * b_.f64;
3617 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
3618   r_.wasm_v128 = wasm_f64x2_mul(a_.wasm_v128, b_.wasm_v128);
3619 #else
3620   SIMDE_VECTORIZE
3621   for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
3622     r_.f64[i] = a_.f64[i] * b_.f64[i];
3623   }
3624 #endif
3625 
3626   return simde__m128d_from_private(r_);
3627 #endif
3628 }
3629 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3630 #  define _mm_mul_pd(a, b) simde_mm_mul_pd(a, b)
3631 #endif
3632 
3633 SIMDE_FUNCTION_ATTRIBUTES
3634 simde__m128d
3635 simde_mm_mul_sd (simde__m128d a, simde__m128d b) {
3636 #if defined(SIMDE_X86_SSE2_NATIVE)
3637   return _mm_mul_sd(a, b);
3638 #elif defined(SIMDE_ASSUME_VECTORIZATION)
3639   return simde_mm_move_sd(a, simde_mm_mul_pd(a, b));
3640 #else
3641   simde__m128d_private
3642     r_,
3643     a_ = simde__m128d_to_private(a),
3644     b_ = simde__m128d_to_private(b);
3645 
3646   r_.f64[0] = a_.f64[0] * b_.f64[0];
3647   r_.f64[1] = a_.f64[1];
3648 
3649   return simde__m128d_from_private(r_);
3650 #endif
3651 }
3652 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3653 #  define _mm_mul_sd(a, b) simde_mm_mul_sd(a, b)
3654 #endif
3655 
3656 SIMDE_FUNCTION_ATTRIBUTES
3657 simde__m64
3658 simde_mm_mul_su32 (simde__m64 a, simde__m64 b) {
3659 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
3660   return _mm_mul_su32(a, b);
3661 #else
3662   simde__m64_private
3663     r_,
3664     a_ = simde__m64_to_private(a),
3665     b_ = simde__m64_to_private(b);
3666 
3667   r_.u64[0] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[0]) * HEDLEY_STATIC_CAST(uint64_t, b_.u32[0]);
3668 
3669   return simde__m64_from_private(r_);
3670 #endif
3671 }
3672 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3673 #  define _mm_mul_su32(a, b) simde_mm_mul_su32(a, b)
3674 #endif
3675 
3676 SIMDE_FUNCTION_ATTRIBUTES
3677 simde__m128i
3678 simde_mm_mulhi_epi16 (simde__m128i a, simde__m128i b) {
3679 #if defined(SIMDE_X86_SSE2_NATIVE)
3680   return _mm_mulhi_epi16(a, b);
3681 #else
3682   simde__m128i_private
3683     r_,
3684     a_ = simde__m128i_to_private(a),
3685     b_ = simde__m128i_to_private(b);
3686 
3687   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3688     int16x4_t a3210 = vget_low_s16(a_.neon_i16);
3689     int16x4_t b3210 = vget_low_s16(b_.neon_i16);
3690     int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
3691     int16x4_t a7654 = vget_high_s16(a_.neon_i16);
3692     int16x4_t b7654 = vget_high_s16(b_.neon_i16);
3693     int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
3694     uint16x8x2_t rv = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
3695     r_.neon_u16 = rv.val[1];
3696   #else
3697     SIMDE_VECTORIZE
3698     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
3699       r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (HEDLEY_STATIC_CAST(uint32_t, HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) >> 16));
3700     }
3701   #endif
3702 
3703   return simde__m128i_from_private(r_);
3704 #endif
3705 }
3706 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3707 #  define _mm_mulhi_epi16(a, b) simde_mm_mulhi_epi16(a, b)
3708 #endif
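
/* Worked example (illustrative): each lane keeps the high 16 bits of the
 * signed 32-bit product. With a.i16[0] = 16384 (0x4000) and b.i16[0] = 4,
 * the product is 65536 (0x10000), so r.i16[0] = 0x10000 >> 16 = 1. */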
3709 
3710 SIMDE_FUNCTION_ATTRIBUTES
3711 simde__m128i
3712 simde_mm_mulhi_epu16 (simde__m128i a, simde__m128i b) {
3713 #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
3714   return _mm_mulhi_epu16(a, b);
3715 #else
3716   simde__m128i_private
3717     r_,
3718     a_ = simde__m128i_to_private(a),
3719     b_ = simde__m128i_to_private(b);
3720 
3721   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3722     uint16x4_t a3210 = vget_low_u16(a_.neon_u16);
3723     uint16x4_t b3210 = vget_low_u16(b_.neon_u16);
3724     uint32x4_t ab3210 = vmull_u16(a3210, b3210); /* 3333222211110000 */
3725     uint16x4_t a7654 = vget_high_u16(a_.neon_u16);
3726     uint16x4_t b7654 = vget_high_u16(b_.neon_u16);
3727     uint32x4_t ab7654 = vmull_u16(a7654, b7654); /* 7777666655554444 */
3728     uint16x8x2_t neon_r =
3729             vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
3730     r_.neon_u16 = neon_r.val[1];
3731   #else
3732     SIMDE_VECTORIZE
3733     for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
3734       r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]) >> 16);
3735     }
3736   #endif
3737 
3738   return simde__m128i_from_private(r_);
3739 #endif
3740 }
3741 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3742 #  define _mm_mulhi_epu16(a, b) simde_mm_mulhi_epu16(a, b)
3743 #endif
3744 
3745 SIMDE_FUNCTION_ATTRIBUTES
3746 simde__m128i
3747 simde_mm_mullo_epi16 (simde__m128i a, simde__m128i b) {
3748 #if defined(SIMDE_X86_SSE2_NATIVE)
3749   return _mm_mullo_epi16(a, b);
3750 #else
3751   simde__m128i_private
3752     r_,
3753     a_ = simde__m128i_to_private(a),
3754     b_ = simde__m128i_to_private(b);
3755 
3756   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3757     r_.neon_i16 = vmulq_s16(a_.neon_i16, b_.neon_i16);
3758   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
3761     r_.altivec_i16 = vec_mul(a_.altivec_i16, b_.altivec_i16);
3762   #else
3763     SIMDE_VECTORIZE
3764     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
3765       r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]));
3766     }
3767   #endif
3768 
3769   return simde__m128i_from_private(r_);
3770 #endif
3771 }
3772 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3773 #  define _mm_mullo_epi16(a, b) simde_mm_mullo_epi16(a, b)
3774 #endif
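
/* Worked example (illustrative): each lane keeps the low 16 bits of the
 * product, i.e. the multiplication wraps modulo 2^16. With
 * a.i16[0] = 300 and b.i16[0] = 300, the product 90000 (0x15F90)
 * truncates to 0x5F90 = 24464. */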
3775 
3776 SIMDE_FUNCTION_ATTRIBUTES
3777 simde__m128d
3778 simde_mm_or_pd (simde__m128d a, simde__m128d b) {
3779 #if defined(SIMDE_X86_SSE2_NATIVE)
3780   return _mm_or_pd(a, b);
3781 #else
3782   simde__m128d_private
3783     r_,
3784     a_ = simde__m128d_to_private(a),
3785     b_ = simde__m128d_to_private(b);
3786 
3787 #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3788   r_.i32f = a_.i32f | b_.i32f;
3789 #else
3790   SIMDE_VECTORIZE
3791   for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
3792     r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
3793   }
3794 #endif
3795 
3796   return simde__m128d_from_private(r_);
3797 #endif
3798 }
3799 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3800 #  define _mm_or_pd(a, b) simde_mm_or_pd(a, b)
3801 #endif
3802 
3803 SIMDE_FUNCTION_ATTRIBUTES
3804 simde__m128i
3805 simde_mm_or_si128 (simde__m128i a, simde__m128i b) {
3806 #if defined(SIMDE_X86_SSE2_NATIVE)
3807   return _mm_or_si128(a, b);
3808 #else
3809   simde__m128i_private
3810     r_,
3811     a_ = simde__m128i_to_private(a),
3812     b_ = simde__m128i_to_private(b);
3813 
3814   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3815     r_.neon_i32 = vorrq_s32(a_.neon_i32, b_.neon_i32);
3816   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
3817     r_.altivec_i32 = vec_or(a_.altivec_i32, b_.altivec_i32);
3818   #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3819     r_.i32f = a_.i32f | b_.i32f;
3820   #else
3821     SIMDE_VECTORIZE
3822     for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
3823       r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
3824     }
3825   #endif
3826 
3827   return simde__m128i_from_private(r_);
3828 #endif
3829 }
3830 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3831 #  define _mm_or_si128(a, b) simde_mm_or_si128(a, b)
3832 #endif
3833 
3834 SIMDE_FUNCTION_ATTRIBUTES
3835 simde__m128i
3836 simde_mm_packs_epi16 (simde__m128i a, simde__m128i b) {
3837 #if defined(SIMDE_X86_SSE2_NATIVE)
3838   return _mm_packs_epi16(a, b);
3839 #else
3840   simde__m128i_private
3841     r_,
3842     a_ = simde__m128i_to_private(a),
3843     b_ = simde__m128i_to_private(b);
3844 
3845 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3846   r_.neon_i8 = vcombine_s8(vqmovn_s16(a_.neon_i16), vqmovn_s16(b_.neon_i16));
3847 #else
3848   SIMDE_VECTORIZE
3849   for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
3850     r_.i8[i]     = (a_.i16[i] > INT8_MAX) ? INT8_MAX : ((a_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, a_.i16[i]));
3851     r_.i8[i + 8] = (b_.i16[i] > INT8_MAX) ? INT8_MAX : ((b_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, b_.i16[i]));
3852   }
3853 #endif
3854 
3855   return simde__m128i_from_private(r_);
3856 #endif
3857 }
3858 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3859 #  define _mm_packs_epi16(a, b) simde_mm_packs_epi16(a, b)
3860 #endif
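
/* Worked example (illustrative): lanes are narrowed with signed
 * saturation: an input of 1000 saturates to INT8_MAX (127), -1000
 * saturates to INT8_MIN (-128), and 42 passes through unchanged. Lanes
 * from a fill r.i8[0..7], lanes from b fill r.i8[8..15]. */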
3861 
3862 SIMDE_FUNCTION_ATTRIBUTES
3863 simde__m128i
3864 simde_mm_packs_epi32 (simde__m128i a, simde__m128i b) {
3865 #if defined(SIMDE_X86_SSE2_NATIVE)
3866   return _mm_packs_epi32(a, b);
3867 #else
3868   simde__m128i_private
3869     r_,
3870     a_ = simde__m128i_to_private(a),
3871     b_ = simde__m128i_to_private(b);
3872 
3873 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3874   r_.neon_i16 = vcombine_s16(vqmovn_s32(a_.neon_i32), vqmovn_s32(b_.neon_i32));
3875 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
3876   r_.altivec_i16 = vec_packs(a_.altivec_i32, b_.altivec_i32);
3877 #else
3878   SIMDE_VECTORIZE
3879   for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
3880     r_.i16[i]     = (a_.i32[i] > INT16_MAX) ? INT16_MAX : ((a_.i32[i] < INT16_MIN) ? INT16_MIN : HEDLEY_STATIC_CAST(int16_t, a_.i32[i]));
3881     r_.i16[i + 4] = (b_.i32[i] > INT16_MAX) ? INT16_MAX : ((b_.i32[i] < INT16_MIN) ? INT16_MIN : HEDLEY_STATIC_CAST(int16_t, b_.i32[i]));
3882   }
3883 #endif
3884 
3885   return simde__m128i_from_private(r_);
3886 #endif
3887 }
3888 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3889 #  define _mm_packs_epi32(a, b) simde_mm_packs_epi32(a, b)
3890 #endif
3891 
3892 SIMDE_FUNCTION_ATTRIBUTES
3893 simde__m128i
3894 simde_mm_packus_epi16 (simde__m128i a, simde__m128i b) {
3895 #if defined(SIMDE_X86_SSE2_NATIVE)
3896   return _mm_packus_epi16(a, b);
3897 #else
3898   simde__m128i_private
3899     r_,
3900     a_ = simde__m128i_to_private(a),
3901     b_ = simde__m128i_to_private(b);
3902 
3903 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3904   r_.neon_u8 = vcombine_u8(vqmovun_s16(a_.neon_i16), vqmovun_s16(b_.neon_i16));
3905 #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
3906   r_.altivec_u8 = vec_packsu(a_.altivec_i16, b_.altivec_i16);
3907 #else
3908   SIMDE_VECTORIZE
3909   for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
3910     r_.u8[i]     = (a_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((a_.i16[i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, a_.i16[i]));
3911     r_.u8[i + 8] = (b_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((b_.i16[i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, b_.i16[i]));
3912   }
3913 #endif
3914 
3915   return simde__m128i_from_private(r_);
3916 #endif
3917 }
3918 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3919 #  define _mm_packus_epi16(a, b) simde_mm_packus_epi16(a, b)
3920 #endif
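
/* Worked example (illustrative): lanes are narrowed with unsigned
 * saturation: an input of 300 saturates to UINT8_MAX (255), any negative
 * input saturates to 0, and 42 passes through unchanged. */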
3921 
3922 SIMDE_FUNCTION_ATTRIBUTES
3923 void
3924 simde_mm_pause (void) {
3925 #if defined(SIMDE_X86_SSE2_NATIVE)
3926   _mm_pause();
3927 #endif
3928 }
3929 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3930 #  define _mm_pause() (simde_mm_pause())
3931 #endif
3932 
3933 SIMDE_FUNCTION_ATTRIBUTES
3934 simde__m128i
3935 simde_mm_sad_epu8 (simde__m128i a, simde__m128i b) {
3936 #if defined(SIMDE_X86_SSE2_NATIVE)
3937   return _mm_sad_epu8(a, b);
3938 #else
3939   simde__m128i_private
3940     r_,
3941     a_ = simde__m128i_to_private(a),
3942     b_ = simde__m128i_to_private(b);
3943 
3944   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3945     uint16x8_t t = vpaddlq_u8(vabdq_u8(a_.neon_u8, b_.neon_u8));
3946     uint16_t r0 = t[0] + t[1] + t[2] + t[3];
3947     uint16_t r4 = t[4] + t[5] + t[6] + t[7];
3948     uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
3949     r_.neon_u16 = vsetq_lane_u16(r4, r, 4);
3950   #else
3951     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
3952       uint16_t tmp = 0;
3953       SIMDE_VECTORIZE_REDUCTION(+:tmp)
3954       for (size_t j = 0 ; j < ((sizeof(r_.u8) / sizeof(r_.u8[0])) / 2) ; j++) {
3955         const size_t e = j + (i * 8);
3956         tmp += (a_.u8[e] > b_.u8[e]) ? (a_.u8[e] - b_.u8[e]) : (b_.u8[e] - a_.u8[e]);
3957       }
3958       r_.i64[i] = tmp;
3959     }
3960   #endif
3961 
3962   return simde__m128i_from_private(r_);
3963 #endif
3964 }
3965 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3966 #  define _mm_sad_epu8(a, b) simde_mm_sad_epu8(a, b)
3967 #endif
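
/* Worked example (illustrative): each 64-bit lane receives the sum of
 * absolute byte differences over its 8-byte half. With
 * a.u8 = { 10, 0, ... } and b.u8 = { 7, 5, ... } (remaining bytes equal),
 * r.i64[0] = |10 - 7| + |0 - 5| = 8. */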
3968 
3969 SIMDE_FUNCTION_ATTRIBUTES
3970 simde__m128i
3971 simde_mm_set_epi8 (int8_t e15, int8_t e14, int8_t e13, int8_t e12,
3972        int8_t e11, int8_t e10, int8_t  e9, int8_t  e8,
3973        int8_t  e7, int8_t  e6, int8_t  e5, int8_t  e4,
3974        int8_t  e3, int8_t  e2, int8_t  e1, int8_t  e0) {
3975 
3976   #if defined(SIMDE_X86_SSE2_NATIVE)
3977     return _mm_set_epi8(
3978       e15, e14, e13, e12, e11, e10,  e9,  e8,
3979        e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0);
3980   #else
3981     simde__m128i_private r_;
3982 
3983     #if defined(SIMDE_WASM_SIMD128_NATIVE)
3984       r_.wasm_v128 = wasm_i8x16_make(
3985          e0,  e1,  e2,  e3,  e4,  e5,  e6,  e7,
3986          e8,  e9, e10, e11, e12, e13, e14, e15);
3987     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3988       SIMDE_ALIGN_AS(16, int8x16_t) int8_t data[16] = {
3989         e0,  e1,  e2,  e3,
3990         e4,  e5,  e6,  e7,
3991         e8,  e9,  e10, e11,
3992         e12, e13, e14, e15};
3993       r_.neon_i8 = vld1q_s8(data);
3994     #else
3995       r_.i8[ 0] =  e0;
3996       r_.i8[ 1] =  e1;
3997       r_.i8[ 2] =  e2;
3998       r_.i8[ 3] =  e3;
3999       r_.i8[ 4] =  e4;
4000       r_.i8[ 5] =  e5;
4001       r_.i8[ 6] =  e6;
4002       r_.i8[ 7] =  e7;
4003       r_.i8[ 8] =  e8;
4004       r_.i8[ 9] =  e9;
4005       r_.i8[10] = e10;
4006       r_.i8[11] = e11;
4007       r_.i8[12] = e12;
4008       r_.i8[13] = e13;
4009       r_.i8[14] = e14;
4010       r_.i8[15] = e15;
4011     #endif
4012 
4013     return simde__m128i_from_private(r_);
4014   #endif
4015 }
4016 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4017 #  define _mm_set_epi8(e15, e14, e13, e12, e11, e10,  e9,  e8,  e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0) simde_mm_set_epi8(e15, e14, e13, e12, e11, e10,  e9,  e8,  e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0)
4018 #endif
4019 
4020 SIMDE_FUNCTION_ATTRIBUTES
4021 simde__m128i
4022 simde_mm_set_epi16 (int16_t e7, int16_t e6, int16_t e5, int16_t e4,
4023         int16_t e3, int16_t e2, int16_t e1, int16_t e0) {
4024   #if defined(SIMDE_X86_SSE2_NATIVE)
4025     return _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
4026   #else
4027     simde__m128i_private r_;
4028 
4029     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4030       SIMDE_ALIGN_AS(16, int16x8_t) int16_t data[8] = { e0, e1, e2, e3, e4, e5, e6, e7 };
4031       r_.neon_i16 = vld1q_s16(data);
4032     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4033       r_.wasm_v128 = wasm_i16x8_make(e0, e1, e2, e3, e4, e5, e6, e7);
4034     #else
4035       r_.i16[0] = e0;
4036       r_.i16[1] = e1;
4037       r_.i16[2] = e2;
4038       r_.i16[3] = e3;
4039       r_.i16[4] = e4;
4040       r_.i16[5] = e5;
4041       r_.i16[6] = e6;
4042       r_.i16[7] = e7;
4043     #endif
4044 
4045     return simde__m128i_from_private(r_);
4046   #endif
4047 }
4048 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4049 #  define _mm_set_epi16(e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0) simde_mm_set_epi16(e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0)
4050 #endif
4051 
4052 SIMDE_FUNCTION_ATTRIBUTES
4053 simde__m128i
4054 simde_mm_set_epi32 (int32_t e3, int32_t e2, int32_t e1, int32_t e0) {
4055   #if defined(SIMDE_X86_SSE2_NATIVE)
4056     return _mm_set_epi32(e3, e2, e1, e0);
4057   #else
4058     simde__m128i_private r_;
4059 
4060     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4061       SIMDE_ALIGN_AS(16, int32x4_t) int32_t data[4] = { e0, e1, e2, e3 };
4062       r_.neon_i32 = vld1q_s32(data);
4063     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4064       r_.wasm_v128 = wasm_i32x4_make(e0, e1, e2, e3);
4065     #else
4066       r_.i32[0] = e0;
4067       r_.i32[1] = e1;
4068       r_.i32[2] = e2;
4069       r_.i32[3] = e3;
4070     #endif
4071 
4072     return simde__m128i_from_private(r_);
4073   #endif
4074 }
4075 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4076 #  define _mm_set_epi32(e3,  e2,  e1,  e0) simde_mm_set_epi32(e3,  e2,  e1,  e0)
4077 #endif
4078 
4079 SIMDE_FUNCTION_ATTRIBUTES
4080 simde__m128i
4081 simde_mm_set_epi64 (simde__m64 e1, simde__m64 e0) {
4082   #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
4083     return _mm_set_epi64(e1, e0);
4084   #else
4085     simde__m128i_private r_;
4086 
4087     r_.m64_private[0] = simde__m64_to_private(e0);
4088     r_.m64_private[1] = simde__m64_to_private(e1);
4089 
4090     return simde__m128i_from_private(r_);
4091   #endif
4092 }
4093 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4094 #  define _mm_set_epi64(e1, e0) (simde_mm_set_epi64((e1), (e0)))
4095 #endif
4096 
4097 SIMDE_FUNCTION_ATTRIBUTES
4098 simde__m128i
4099 simde_mm_set_epi64x (int64_t e1, int64_t e0) {
4100 #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0))
4101   return _mm_set_epi64x(e1, e0);
4102 #else
4103   simde__m128i_private r_;
4104 
4105   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4106     SIMDE_ALIGN_AS(16, int64x2_t) int64_t data[2] = {e0, e1};
4107     r_.neon_i64 = vld1q_s64(data);
4108   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4109     r_.wasm_v128 = wasm_i64x2_make(e0, e1);
4110   #else
4111     r_.i64[0] = e0;
4112     r_.i64[1] = e1;
4113   #endif
4114 
4115   return simde__m128i_from_private(r_);
4116 #endif
4117 }
4118 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4119 #  define _mm_set_epi64x(e1, e0) simde_mm_set_epi64x(e1, e0)
4120 #endif
4121 
4122 SIMDE_FUNCTION_ATTRIBUTES
4123 simde__m128i
4124 simde_x_mm_set_epu8 (uint8_t e15, uint8_t e14, uint8_t e13, uint8_t e12,
4125          uint8_t e11, uint8_t e10, uint8_t  e9, uint8_t  e8,
4126          uint8_t  e7, uint8_t  e6, uint8_t  e5, uint8_t  e4,
4127          uint8_t  e3, uint8_t  e2, uint8_t  e1, uint8_t  e0) {
4128   #if defined(SIMDE_X86_SSE2_NATIVE)
4129     return _mm_set_epi8(
4130       HEDLEY_STATIC_CAST(char, e15), HEDLEY_STATIC_CAST(char, e14), HEDLEY_STATIC_CAST(char, e13), HEDLEY_STATIC_CAST(char, e12),
4131       HEDLEY_STATIC_CAST(char, e11), HEDLEY_STATIC_CAST(char, e10), HEDLEY_STATIC_CAST(char,  e9), HEDLEY_STATIC_CAST(char,  e8),
4132       HEDLEY_STATIC_CAST(char,  e7), HEDLEY_STATIC_CAST(char,  e6), HEDLEY_STATIC_CAST(char,  e5), HEDLEY_STATIC_CAST(char,  e4),
4133       HEDLEY_STATIC_CAST(char,  e3), HEDLEY_STATIC_CAST(char,  e2), HEDLEY_STATIC_CAST(char,  e1), HEDLEY_STATIC_CAST(char,  e0));
4134   #else
4135     simde__m128i_private r_;
4136 
4137     r_.u8[ 0] =  e0; r_.u8[ 1] =  e1; r_.u8[ 2] =  e2; r_.u8[ 3] =  e3;
4138     r_.u8[ 4] =  e4; r_.u8[ 5] =  e5; r_.u8[ 6] =  e6; r_.u8[ 7] =  e7;
4139     r_.u8[ 8] =  e8; r_.u8[ 9] =  e9; r_.u8[10] = e10; r_.u8[11] = e11;
4140     r_.u8[12] = e12; r_.u8[13] = e13; r_.u8[14] = e14; r_.u8[15] = e15;
4141 
4142     return simde__m128i_from_private(r_);
4143   #endif
4144 }
4145 
4146 SIMDE_FUNCTION_ATTRIBUTES
4147 simde__m128i
4148 simde_x_mm_set_epu16 (uint16_t e7, uint16_t e6, uint16_t e5, uint16_t e4,
4149           uint16_t e3, uint16_t e2, uint16_t e1, uint16_t e0) {
4150   #if defined(SIMDE_X86_SSE2_NATIVE)
4151     return _mm_set_epi16(
4152       HEDLEY_STATIC_CAST(short,  e7), HEDLEY_STATIC_CAST(short,  e6), HEDLEY_STATIC_CAST(short,  e5), HEDLEY_STATIC_CAST(short,  e4),
4153       HEDLEY_STATIC_CAST(short,  e3), HEDLEY_STATIC_CAST(short,  e2), HEDLEY_STATIC_CAST(short,  e1), HEDLEY_STATIC_CAST(short,  e0));
4154   #else
4155     simde__m128i_private r_;
4156 
4157     r_.u16[0] = e0; r_.u16[1] = e1; r_.u16[2] = e2; r_.u16[3] = e3;
4158     r_.u16[4] = e4; r_.u16[5] = e5; r_.u16[6] = e6; r_.u16[7] = e7;
4159 
4160     return simde__m128i_from_private(r_);
4161   #endif
4162 }
4163 
4164 SIMDE_FUNCTION_ATTRIBUTES
4165 simde__m128i
4166 simde_x_mm_set_epu32 (uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) {
4167   #if defined(SIMDE_X86_SSE2_NATIVE)
4168     return _mm_set_epi32(
4169       HEDLEY_STATIC_CAST(int,  e3), HEDLEY_STATIC_CAST(int,  e2), HEDLEY_STATIC_CAST(int,  e1), HEDLEY_STATIC_CAST(int,  e0));
4170   #else
4171     simde__m128i_private r_;
4172 
4173     r_.u32[0] = e0;
4174     r_.u32[1] = e1;
4175     r_.u32[2] = e2;
4176     r_.u32[3] = e3;
4177 
4178     return simde__m128i_from_private(r_);
4179   #endif
4180 }
4181 
4182 SIMDE_FUNCTION_ATTRIBUTES
4183 simde__m128i
4184 simde_x_mm_set_epu64x (uint64_t e1, uint64_t e0) {
4185   #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0))
4186     return _mm_set_epi64x(HEDLEY_STATIC_CAST(int64_t,  e1), HEDLEY_STATIC_CAST(int64_t,  e0));
4187   #else
4188     simde__m128i_private r_;
4189 
4190     r_.u64[0] = e0;
4191     r_.u64[1] = e1;
4192 
4193     return simde__m128i_from_private(r_);
4194   #endif
4195 }
4196 
4197 SIMDE_FUNCTION_ATTRIBUTES
4198 simde__m128d
4199 simde_mm_set_pd (simde_float64 e1, simde_float64 e0) {
4200 #if defined(SIMDE_X86_SSE2_NATIVE)
4201   return _mm_set_pd(e1, e0);
4202 #else
4203   simde__m128d_private r_;
4204 
4205   #if defined(SIMDE_WASM_SIMD128_NATIVE)
4206     r_.wasm_v128 = wasm_f64x2_make(e0, e1);
4209   #else
4210     r_.f64[0] = e0;
4211     r_.f64[1] = e1;
4212   #endif
4213 
4214   return simde__m128d_from_private(r_);
4215 #endif
4216 }
4217 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4218 #  define _mm_set_pd(e1, e0) simde_mm_set_pd(e1, e0)
4219 #endif
4220 
4221 SIMDE_FUNCTION_ATTRIBUTES
4222 simde__m128d
4223 simde_mm_set_pd1 (simde_float64 a) {
4224 #if defined(SIMDE_X86_SSE2_NATIVE)
4225   return _mm_set1_pd(a);
4226 #else
4227   simde__m128d_private r_;
4228 
4229   r_.f64[0] = a;
4230   r_.f64[1] = a;
4231 
4232   return simde__m128d_from_private(r_);
4233 #endif
4234 }
4235 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4236 #  define _mm_set_pd1(a) simde_mm_set_pd1(a)
4237 #endif
4238 
4239 SIMDE_FUNCTION_ATTRIBUTES
4240 simde__m128d
4241 simde_mm_set_sd (simde_float64 a) {
4242 #if defined(SIMDE_X86_SSE2_NATIVE)
4243   return _mm_set_sd(a);
4244 #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4245   return vsetq_lane_f64(a, vdupq_n_f64(SIMDE_FLOAT64_C(0.0)), 0);
4246 #else
4247   return simde_mm_set_pd(SIMDE_FLOAT64_C(0.0), a);
4248 
4249 #endif
4250 }
4251 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4252 #  define _mm_set_sd(a) simde_mm_set_sd(a)
4253 #endif
4254 
4255 SIMDE_FUNCTION_ATTRIBUTES
4256 simde__m128i
4257 simde_mm_set1_epi8 (int8_t a) {
4258 #if defined(SIMDE_X86_SSE2_NATIVE)
4259   return _mm_set1_epi8(a);
4260 #else
4261   simde__m128i_private r_;
4262 
4263   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4264     r_.neon_i8 = vdupq_n_s8(a);
4265   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4266     r_.wasm_v128 = wasm_i8x16_splat(a);
4267   #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4268     r_.altivec_i8 = vec_splats(HEDLEY_STATIC_CAST(signed char, a));
4269   #else
4270     SIMDE_VECTORIZE
4271     for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
4272       r_.i8[i] = a;
4273     }
4274   #endif
4275 
4276   return simde__m128i_from_private(r_);
4277 #endif
4278 }
4279 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4280 #  define _mm_set1_epi8(a) simde_mm_set1_epi8(a)
4281 #endif
4282 
4283 SIMDE_FUNCTION_ATTRIBUTES
4284 simde__m128i
4285 simde_mm_set1_epi16 (int16_t a) {
4286 #if defined(SIMDE_X86_SSE2_NATIVE)
4287   return _mm_set1_epi16(a);
4288 #else
4289   simde__m128i_private r_;
4290 
4291   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4292     r_.neon_i16 = vdupq_n_s16(a);
4293   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4294     r_.wasm_v128 = wasm_i16x8_splat(a);
4295   #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4296     r_.altivec_i16 = vec_splats(HEDLEY_STATIC_CAST(signed short, a));
4297   #else
4298     SIMDE_VECTORIZE
4299     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
4300       r_.i16[i] = a;
4301     }
4302   #endif
4303 
4304   return simde__m128i_from_private(r_);
4305 #endif
4306 }
4307 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4308 #  define _mm_set1_epi16(a) simde_mm_set1_epi16(a)
4309 #endif
4310 
4311 SIMDE_FUNCTION_ATTRIBUTES
4312 simde__m128i
4313 simde_mm_set1_epi32 (int32_t a) {
4314 #if defined(SIMDE_X86_SSE2_NATIVE)
4315   return _mm_set1_epi32(a);
4316 #else
4317   simde__m128i_private r_;
4318 
4319   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4320     r_.neon_i32 = vdupq_n_s32(a);
4321   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4322     r_.wasm_v128 = wasm_i32x4_splat(a);
4323   #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4324     r_.altivec_i32 = vec_splats(HEDLEY_STATIC_CAST(signed int, a));
4325   #else
4326     SIMDE_VECTORIZE
4327     for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
4328       r_.i32[i] = a;
4329     }
4330   #endif
4331 
4332   return simde__m128i_from_private(r_);
4333 #endif
4334 }
4335 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4336 #  define _mm_set1_epi32(a) simde_mm_set1_epi32(a)
4337 #endif
4338 
4339 SIMDE_FUNCTION_ATTRIBUTES
4340 simde__m128i
4341 simde_mm_set1_epi64x (int64_t a) {
4342 #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0))
4343   return _mm_set1_epi64x(a);
4344 #else
4345   simde__m128i_private r_;
4346 
4347   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4348     r_.neon_i64 = vmovq_n_s64(a);
4349   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
4350     r_.wasm_v128 = wasm_i64x2_splat(a);
4351   #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4352     r_.altivec_i64 = vec_splats(HEDLEY_STATIC_CAST(signed long long, a));
4353   #else
4354     SIMDE_VECTORIZE
4355     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
4356       r_.i64[i] = a;
4357     }
4358   #endif
4359 
4360   return simde__m128i_from_private(r_);
4361 #endif
4362 }
4363 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4364 #  define _mm_set1_epi64x(a) simde_mm_set1_epi64x(a)
4365 #endif
4366 
4367 SIMDE_FUNCTION_ATTRIBUTES
4368 simde__m128i
4369 simde_mm_set1_epi64 (simde__m64 a) {
4370 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
4371   return _mm_set1_epi64(a);
4372 #else
4373   simde__m64_private a_ = simde__m64_to_private(a);
4374   return simde_mm_set1_epi64x(a_.i64[0]);
4375 #endif
4376 }
4377 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4378 #  define _mm_set1_epi64(a) simde_mm_set1_epi64(a)
4379 #endif
4380 
4381 SIMDE_FUNCTION_ATTRIBUTES
4382 simde__m128i
4383 simde_x_mm_set1_epu8 (uint8_t value) {
4384   #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4385     return simde__m128i_from_altivec_u8(vec_splats(HEDLEY_STATIC_CAST(unsigned char, value)));
4386   #else
4387     return simde_mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, value));
4388   #endif
4389 }
4390 
4391 SIMDE_FUNCTION_ATTRIBUTES
4392 simde__m128i
4393 simde_x_mm_set1_epu16 (uint16_t value) {
4394   #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4395     return simde__m128i_from_altivec_u16(vec_splats(HEDLEY_STATIC_CAST(unsigned short, value)));
4396   #else
4397     return simde_mm_set1_epi16(HEDLEY_STATIC_CAST(int16_t, value));
4398   #endif
4399 }
4400 
4401 SIMDE_FUNCTION_ATTRIBUTES
4402 simde__m128i
4403 simde_x_mm_set1_epu32 (uint32_t value) {
4404   #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4405     return simde__m128i_from_altivec_u32(vec_splats(HEDLEY_STATIC_CAST(unsigned int, value)));
4406   #else
4407     return simde_mm_set1_epi32(HEDLEY_STATIC_CAST(int32_t, value));
4408   #endif
4409 }
4410 
4411 SIMDE_FUNCTION_ATTRIBUTES
4412 simde__m128i
4413 simde_x_mm_set1_epu64 (uint64_t value) {
4414   #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4415     return simde__m128i_from_altivec_u64(vec_splats(HEDLEY_STATIC_CAST(unsigned long long, value)));
4416   #else
4417     return simde_mm_set1_epi64x(HEDLEY_STATIC_CAST(int64_t, value));
4418   #endif
4419 }
4420 
4421 SIMDE_FUNCTION_ATTRIBUTES
4422 simde__m128d
4423 simde_mm_set1_pd (simde_float64 a) {
4424 #if defined(SIMDE_X86_SSE2_NATIVE)
4425   return _mm_set1_pd(a);
4426 #else
4427   simde__m128d_private r_;
4428 
4429   #if defined(SIMDE_WASM_SIMD128_NATIVE)
4430     r_.wasm_v128 = wasm_f64x2_splat(a);
4431   #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4432     r_.altivec_f64 = vec_splats(HEDLEY_STATIC_CAST(double, a));
4433   #else
4434     SIMDE_VECTORIZE
4435     for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
4436       r_.f64[i] = a;
4437     }
4438   #endif
4439 
4440   return simde__m128d_from_private(r_);
4441 #endif
4442 }
4443 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4444 #  define _mm_set1_pd(a) simde_mm_set1_pd(a)
4445 #endif
4446 
4447 SIMDE_FUNCTION_ATTRIBUTES
4448 simde__m128i
4449 simde_mm_setr_epi8 (int8_t e15, int8_t e14, int8_t e13, int8_t e12,
4450         int8_t e11, int8_t e10, int8_t  e9, int8_t  e8,
4451         int8_t  e7, int8_t  e6, int8_t  e5, int8_t  e4,
4452         int8_t  e3, int8_t  e2, int8_t  e1, int8_t  e0) {
4453 #if defined(SIMDE_X86_SSE2_NATIVE)
4454   return _mm_setr_epi8(
4455     e15, e14, e13, e12, e11, e10,  e9,  e8,
4456      e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0);
4457 #else
4458   return simde_mm_set_epi8(
4459     e0, e1, e2, e3, e4, e5, e6, e7,
4460     e8, e9, e10, e11, e12, e13, e14, e15);
4461 #endif
4462 }
4463 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4464 #  define _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)
4465 #endif
4466 
4467 SIMDE_FUNCTION_ATTRIBUTES
4468 simde__m128i
4469 simde_mm_setr_epi16 (int16_t e7, int16_t e6, int16_t e5, int16_t e4,
4470          int16_t e3, int16_t e2, int16_t e1, int16_t e0) {
4471 #if defined(SIMDE_X86_SSE2_NATIVE)
4472   return _mm_setr_epi16(e7,  e6,  e5,  e4,  e3,  e2,  e1,  e0);
4473 #else
4474   return simde_mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7);
4475 #endif
4476 }
4477 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4478 #  define _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0)
4479 #endif
4480 
4481 SIMDE_FUNCTION_ATTRIBUTES
4482 simde__m128i
4483 simde_mm_setr_epi32 (int32_t e3, int32_t e2, int32_t e1, int32_t e0) {
4484 #if defined(SIMDE_X86_SSE2_NATIVE)
4485   return _mm_setr_epi32(e3, e2, e1, e0);
4486 #else
4487   return simde_mm_set_epi32(e0, e1, e2, e3);
4488 #endif
4489 }
4490 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4491 #  define _mm_setr_epi32(e3, e2, e1, e0) simde_mm_setr_epi32(e3, e2, e1, e0)
4492 #endif
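
/* Note on ordering (illustrative): the setr variants take arguments in
 * memory order while the set variants take them highest-lane first, so
 * these two calls produce the same vector:
 *
 *   simde_mm_setr_epi32(0, 1, 2, 3);
 *   simde_mm_set_epi32 (3, 2, 1, 0);
 */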
4493 
4494 SIMDE_FUNCTION_ATTRIBUTES
4495 simde__m128i
4496 simde_mm_setr_epi64 (simde__m64 e1, simde__m64 e0) {
4497 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
4498   return _mm_setr_epi64(e1, e0);
4499 #else
4500   return simde_mm_set_epi64(e0, e1);
4501 #endif
4502 }
4503 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4504 #  define _mm_setr_epi64(e1, e0) (simde_mm_setr_epi64((e1), (e0)))
4505 #endif
4506 
4507 SIMDE_FUNCTION_ATTRIBUTES
4508 simde__m128d
4509 simde_mm_setr_pd (simde_float64 e1, simde_float64 e0) {
4510 #if defined(SIMDE_X86_SSE2_NATIVE)
4511   return _mm_setr_pd(e1, e0);
4512 #else
4513   return simde_mm_set_pd(e0, e1);
4514 #endif
4515 }
4516 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4517 #  define _mm_setr_pd(e1, e0) simde_mm_setr_pd(e1, e0)
4518 #endif
4519 
4520 SIMDE_FUNCTION_ATTRIBUTES
4521 simde__m128d
4522 simde_mm_setzero_pd (void) {
4523 #if defined(SIMDE_X86_SSE2_NATIVE)
4524   return _mm_setzero_pd();
4525 #else
4526   return simde_mm_castsi128_pd(simde_mm_setzero_si128());
4527 #endif
4528 }
4529 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4530 #  define _mm_setzero_pd() simde_mm_setzero_pd()
4531 #endif
4532 
4533 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
4534 HEDLEY_DIAGNOSTIC_PUSH
4535 SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
4536 #endif
4537 
4538 SIMDE_FUNCTION_ATTRIBUTES
4539 simde__m128d
4540 simde_mm_undefined_pd (void) {
4541   simde__m128d_private r_;
4542 
4543 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
4544   r_.n = _mm_undefined_pd();
4545 #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
4546   r_ = simde__m128d_to_private(simde_mm_setzero_pd());
4547 #endif
4548 
4549   return simde__m128d_from_private(r_);
4550 }
4551 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4552 #  define _mm_undefined_pd() simde_mm_undefined_pd()
4553 #endif
4554 
4555 SIMDE_FUNCTION_ATTRIBUTES
4556 simde__m128i
simde_mm_undefined_si128(void)4557 simde_mm_undefined_si128 (void) {
4558   simde__m128i_private r_;
4559 
4560 #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
4561   r_.n = _mm_undefined_si128();
4562 #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
4563   r_ = simde__m128i_to_private(simde_mm_setzero_si128());
4564 #endif
4565 
4566   return simde__m128i_from_private(r_);
4567 }
4568 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4569 #  define _mm_undefined_si128() (simde_mm_undefined_si128())
4570 #endif
4571 
4572 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
4573 HEDLEY_DIAGNOSTIC_POP
4574 #endif
4575 
4576 SIMDE_FUNCTION_ATTRIBUTES
4577 simde__m128d
simde_x_mm_setone_pd(void)4578 simde_x_mm_setone_pd (void) {
4579   return simde_mm_castps_pd(simde_x_mm_setone_ps());
4580 }
4581 
4582 SIMDE_FUNCTION_ATTRIBUTES
4583 simde__m128i
simde_x_mm_setone_si128(void)4584 simde_x_mm_setone_si128 (void) {
4585   return simde_mm_castps_si128(simde_x_mm_setone_ps());
4586 }
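
/* A minimal usage sketch (the simde_x_* functions are SIMDe extensions, not
 * part of Intel's API): simde_x_mm_setone_si128() returns a vector with every
 * bit set, which makes a convenient all-ones mask.  For instance,
 *
 *   simde__m128i not_a = simde_mm_xor_si128(a, simde_x_mm_setone_si128());
 *
 * computes the bitwise NOT of a. */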

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_shuffle_epi32 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
    r_.i32[i] = a_.i32[(imm8 >> (i * 2)) & 3];
  }

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#  define simde_mm_shuffle_epi32(a, imm8) _mm_shuffle_epi32((a), (imm8))
#elif defined(SIMDE_SHUFFLE_VECTOR_)
#  define simde_mm_shuffle_epi32(a, imm8) (__extension__ ({ \
      const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
      simde__m128i_from_private((simde__m128i_private) { .i32 = \
        SIMDE_SHUFFLE_VECTOR_(32, 16, \
          (simde__tmp_a_).i32, \
          (simde__tmp_a_).i32, \
          ((imm8)     ) & 3, \
          ((imm8) >> 2) & 3, \
          ((imm8) >> 4) & 3, \
          ((imm8) >> 6) & 3) }); }))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_shuffle_epi32(a, imm8) simde_mm_shuffle_epi32(a, imm8)
#endif
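
/* Illustrative example: imm8 packs four 2-bit source-lane indices, one per
 * destination lane (bits 1:0 choose lane 0, bits 3:2 lane 1, and so on), so
 *
 *   simde_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3))
 *
 * reverses the four 32-bit lanes (assuming the usual _MM_SHUFFLE macro, which
 * packs its arguments high lane first). */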

SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_shuffle_pd (simde__m128d a, simde__m128d b, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 3) {
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  r_.f64[0] = ((imm8 & 1) == 0) ? a_.f64[0] : a_.f64[1];
  r_.f64[1] = ((imm8 & 2) == 0) ? b_.f64[0] : b_.f64[1];

  return simde__m128d_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
#  define simde_mm_shuffle_pd(a, b, imm8) _mm_shuffle_pd((a), (b), (imm8))
#elif defined(SIMDE_SHUFFLE_VECTOR_)
#  define simde_mm_shuffle_pd(a, b, imm8) (__extension__ ({ \
      simde__m128d_from_private((simde__m128d_private) { .f64 = \
        SIMDE_SHUFFLE_VECTOR_(64, 16, \
          simde__m128d_to_private(a).f64, \
          simde__m128d_to_private(b).f64, \
          (((imm8)     ) & 1), \
          (((imm8) >> 1) & 1) + 2) }); }))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_shuffle_pd(a, b, imm8) simde_mm_shuffle_pd(a, b, imm8)
#endif
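
/* Illustrative note: bit 0 of imm8 selects which half of a becomes the low
 * result lane and bit 1 selects which half of b becomes the high lane, so
 * simde_mm_shuffle_pd(a, b, 1) yields { a[1], b[0] }. */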

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_shufflehi_epi16 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i++) {
    r_.i16[i] = a_.i16[i];
  }
  for (size_t i = ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
    r_.i16[i] = a_.i16[((imm8 >> ((i - 4) * 2)) & 3) + 4];
  }

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#  define simde_mm_shufflehi_epi16(a, imm8) _mm_shufflehi_epi16((a), (imm8))
#elif defined(SIMDE_SHUFFLE_VECTOR_)
#  define simde_mm_shufflehi_epi16(a, imm8) (__extension__ ({ \
      const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
      simde__m128i_from_private((simde__m128i_private) { .i16 = \
        SIMDE_SHUFFLE_VECTOR_(16, 16, \
          (simde__tmp_a_).i16, \
          (simde__tmp_a_).i16, \
          0, 1, 2, 3, \
          (((imm8)     ) & 3) + 4, \
          (((imm8) >> 2) & 3) + 4, \
          (((imm8) >> 4) & 3) + 4, \
          (((imm8) >> 6) & 3) + 4) }); }))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_shufflehi_epi16(a, imm8) simde_mm_shufflehi_epi16(a, imm8)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_shufflelo_epi16 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  for (size_t i = 0 ; i < ((sizeof(r_.i16) / sizeof(r_.i16[0])) / 2) ; i++) {
    r_.i16[i] = a_.i16[((imm8 >> (i * 2)) & 3)];
  }
  SIMDE_VECTORIZE
  for (size_t i = ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
    r_.i16[i] = a_.i16[i];
  }

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#  define simde_mm_shufflelo_epi16(a, imm8) _mm_shufflelo_epi16((a), (imm8))
#elif defined(SIMDE_SHUFFLE_VECTOR_)
#  define simde_mm_shufflelo_epi16(a, imm8) (__extension__ ({ \
      const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \
      simde__m128i_from_private((simde__m128i_private) { .i16 = \
        SIMDE_SHUFFLE_VECTOR_(16, 16, \
          (simde__tmp_a_).i16, \
          (simde__tmp_a_).i16, \
          (((imm8)     ) & 3), \
          (((imm8) >> 2) & 3), \
          (((imm8) >> 4) & 3), \
          (((imm8) >> 6) & 3), \
          4, 5, 6, 7) }); }))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_shufflelo_epi16(a, imm8) simde_mm_shufflelo_epi16(a, imm8)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sll_epi16 (simde__m128i a, simde__m128i count) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_sll_epi16(a, count);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    count_ = simde__m128i_to_private(count);

  if (count_.u64[0] > 15)
    return simde_mm_setzero_si128();

  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
    r_.u16 = (a_.u16 << count_.u64[0]);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
      r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (a_.u16[i] << count_.u64[0]));
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_sll_epi16(a, count) simde_mm_sll_epi16((a), (count))
#endif
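
/* Usage note (illustrative): unlike the slli immediate forms later in this
 * file, the sll, srl and sra register forms read the shift amount from the
 * low 64 bits of count; for the logical shifts any amount of at least the
 * element width zeroes the result, while the arithmetic shifts clamp the
 * count.  For example,
 *
 *   simde_mm_sll_epi16(a, simde_mm_cvtsi32_si128(3));
 *
 * shifts every 16-bit lane of a left by 3. */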

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sll_epi32 (simde__m128i a, simde__m128i count) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_sll_epi32(a, count);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    count_ = simde__m128i_to_private(count);

  if (count_.u64[0] > 31)
    return simde_mm_setzero_si128();

#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  r_.u32 = (a_.u32 << count_.u64[0]);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
    r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (a_.u32[i] << count_.u64[0]));
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_sll_epi32(a, count) (simde_mm_sll_epi32(a, (count)))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sll_epi64 (simde__m128i a, simde__m128i count) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_sll_epi64(a, count);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    count_ = simde__m128i_to_private(count);

  if (count_.u64[0] > 63)
    return simde_mm_setzero_si128();

  const int_fast16_t s = HEDLEY_STATIC_CAST(int_fast16_t, count_.u64[0]);
  #if !defined(SIMDE_BUG_GCC_94488)
    SIMDE_VECTORIZE
  #endif
  for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
    r_.u64[i] = a_.u64[i] << s;
  }

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_sll_epi64(a, count) (simde_mm_sll_epi64(a, (count)))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_sqrt_pd (simde__m128d a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_sqrt_pd(a);
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a);

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      r_.neon_f64 = vsqrtq_f64(a_.neon_f64);
    #elif defined(simde_math_sqrt)
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
        r_.f64[i] = simde_math_sqrt(a_.f64[i]);
      }
    #else
      HEDLEY_UNREACHABLE();
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_sqrt_pd(a) simde_mm_sqrt_pd(a)
#endif
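
/* Illustrative example: simde_mm_sqrt_pd(simde_mm_set_pd(9.0, 4.0)) yields
 * { 2.0, 3.0 } in low-to-high lane order, since set_pd takes the high lane
 * first. */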

SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_sqrt_sd (simde__m128d a, simde__m128d b) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_sqrt_sd(a, b);
  #elif defined(SIMDE_ASSUME_VECTORIZATION)
    return simde_mm_move_sd(a, simde_mm_sqrt_pd(b));
  #else
    simde__m128d_private
      r_,
      a_ = simde__m128d_to_private(a),
      b_ = simde__m128d_to_private(b);

    #if defined(simde_math_sqrt)
      r_.f64[0] = simde_math_sqrt(b_.f64[0]);
      r_.f64[1] = a_.f64[1];
    #else
      HEDLEY_UNREACHABLE();
    #endif

    return simde__m128d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_sqrt_sd(a, b) simde_mm_sqrt_sd(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srl_epi16 (simde__m128i a, simde__m128i count) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_srl_epi16(a, count);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    count_ = simde__m128i_to_private(count);

  /* Shifting by the element width or more yields zero; return early so the
   * scalar shift below always has a valid (in-range, non-negative) count. */
  if (count_.u64[0] > 15)
    return simde_mm_setzero_si128();

  const int cnt = HEDLEY_STATIC_CAST(int, count_.u64[0]);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
      r_.u16[i] = a_.u16[i] >> cnt;
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_srl_epi16(a, count) (simde_mm_srl_epi16(a, (count)))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srl_epi32 (simde__m128i a, simde__m128i count) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_srl_epi32(a, count);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    count_ = simde__m128i_to_private(count);

  /* Shifting a uint32_t by 32 is undefined in C; the instruction yields
   * zero, so return that directly for out-of-range counts. */
  if (count_.u64[0] > 31)
    return simde_mm_setzero_si128();

  const int cnt = HEDLEY_STATIC_CAST(int, count_.u64[0]);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt)));
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
      r_.u32[i] = a_.u32[i] >> cnt;
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_srl_epi32(a, count) (simde_mm_srl_epi32(a, (count)))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srl_epi64 (simde__m128i a, simde__m128i count) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_srl_epi64(a, count);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    count_ = simde__m128i_to_private(count);

  /* Shifting a uint64_t by 64 is undefined in C; the instruction yields
   * zero, so return that directly for out-of-range counts. */
  if (count_.u64[0] > 63)
    return simde_mm_setzero_si128();

  const int cnt = HEDLEY_STATIC_CAST(int, count_.u64[0]);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, -cnt)));
  #else
    #if !defined(SIMDE_BUG_GCC_94488)
      SIMDE_VECTORIZE
    #endif
    for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
      r_.u64[i] = a_.u64[i] >> cnt;
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_srl_epi64(a, count) (simde_mm_srl_epi64(a, (count)))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srai_epi16 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  /* MSVC requires a range of (0, 255). */
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  const int cnt = (imm8 & ~15) ? 15 : imm8;

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) {
      r_.i16[i] = a_.i16[i] >> cnt;
    }
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
  #define simde_mm_srai_epi16(a, imm8) _mm_srai_epi16((a), (imm8))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_srai_epi16(a, imm8) simde_mm_srai_epi16(a, imm8)
#endif
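
/* Behavioural note (illustrative): because the count is clamped to 15, an
 * arithmetic right shift by 16 or more fills each lane with copies of its
 * sign bit, so simde_mm_srai_epi16(a, 16) yields 0 for non-negative lanes
 * and -1 for negative ones, matching the x86 behaviour. */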

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srai_epi32 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  /* MSVC requires a range of (0, 255). */
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  const int cnt = (imm8 & ~31) ? 31 : imm8;

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(-cnt));
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i32[0])) ; i++) {
      r_.i32[i] = a_.i32[i] >> cnt;
    }
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
  #define simde_mm_srai_epi32(a, imm8) _mm_srai_epi32((a), (imm8))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_srai_epi32(a, imm8) simde_mm_srai_epi32(a, imm8)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sra_epi16 (simde__m128i a, simde__m128i count) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_sra_epi16(a, count);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    count_ = simde__m128i_to_private(count);

  /* Treat the count as unsigned, as the instruction does; this also keeps
   * the scalar shift below in range. */
  const int cnt = count_.u64[0] > 15 ? 15 : HEDLEY_STATIC_CAST(int, count_.u64[0]);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
      r_.i16[i] = a_.i16[i] >> cnt;
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_sra_epi16(a, count) (simde_mm_sra_epi16(a, count))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sra_epi32 (simde__m128i a, simde__m128i count) {
#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_SRA_EPI32)
  return _mm_sra_epi32(a, count);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    count_ = simde__m128i_to_private(count);

  const int cnt = count_.u64[0] > 31 ? 31 : HEDLEY_STATIC_CAST(int, count_.u64[0]);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt)));
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.i32[i] = a_.i32[i] >> cnt;
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_sra_epi32(a, count) (simde_mm_sra_epi32(a, (count)))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_slli_epi16 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  if (HEDLEY_UNLIKELY((imm8 > 15))) {
    return simde_mm_setzero_si128();
  }

  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
    r_.i16 = a_.i16 << (imm8 & 0xff);
  #else
    const int s = (imm8 > HEDLEY_STATIC_CAST(int, sizeof(r_.i16[0]) * CHAR_BIT) - 1) ? 0 : imm8;
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
      r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << s);
    }
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#  define simde_mm_slli_epi16(a, imm8) _mm_slli_epi16(a, imm8)
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__)
#  define simde_mm_slli_epi16(a, imm8) \
  simde__m128i_from_neon_u16(vshlq_n_u16(simde__m128i_to_neon_u16(a), (imm8)))
// The above is accepted by gcc/g++ 9 with -march=armv8-a; it might work on A32V8 and elsewhere, but needs testing.
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) // clang rejects a potentially out-of-range imm8 even though the branches below handle it
#  define simde_mm_slli_epi16(a, imm8) \
     ({                                                            \
        simde__m128i ret;                                          \
        if ((imm8) <= 0) {                                         \
            ret = a;                                               \
        } else if ((imm8) > 15) {                                  \
            ret = simde_mm_setzero_si128();                        \
        } else {                                                   \
            ret = simde__m128i_from_neon_i16(                      \
                vshlq_n_s16(simde__m128i_to_neon_i16(a), (imm8))); \
        }                                                          \
        ret;                                                       \
    })
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
  #define simde_mm_slli_epi16(a, imm8) \
    ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sl(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8)))))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_slli_epi16(a, imm8) simde_mm_slli_epi16(a, imm8)
#endif
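
/* Usage note (illustrative): x86 defines out-of-range immediate shifts to
 * clear the register, and the portable fallback above mirrors that, so
 * simde_mm_slli_epi16(a, 16) yields all zeros. */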

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_slli_epi32 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  if (HEDLEY_UNLIKELY((imm8 > 31))) {
    return simde_mm_setzero_si128();
  }
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
    r_.i32 = a_.i32 << imm8;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.i32[i] = a_.i32[i] << (imm8 & 0xff);
    }
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#  define simde_mm_slli_epi32(a, imm8) _mm_slli_epi32(a, imm8)
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__)
#  define simde_mm_slli_epi32(a, imm8) \
  simde__m128i_from_neon_u32(vshlq_n_u32(simde__m128i_to_neon_u32(a), (imm8)))
// The above is accepted by gcc/g++ 9 with -march=armv8-a; it might work on A32V8 and elsewhere, but needs testing.
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) // clang rejects a potentially out-of-range imm8 even though the branches below handle it
#  define simde_mm_slli_epi32(a, imm8) \
     ({                                                       \
       simde__m128i ret;                                      \
       if ((imm8) <= 0) {                                     \
         ret = a;                                             \
       } else if ((imm8) > 31) {                              \
         ret = simde_mm_setzero_si128();                      \
       } else {                                               \
         ret = simde__m128i_from_neon_i32(                    \
           vshlq_n_s32(simde__m128i_to_neon_i32(a), (imm8))); \
       }                                                      \
       ret;                                                   \
    })
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
  #define simde_mm_slli_epi32(a, imm8) \
     ({                                                            \
       simde__m128i ret;                                           \
       if ((imm8) <= 0) {                                          \
         ret = a;                                                  \
       } else if ((imm8) > 31) {                                   \
         ret = simde_mm_setzero_si128();                           \
       } else {                                                    \
         ret = simde__m128i_from_altivec_i32(                      \
           vec_sl(simde__m128i_to_altivec_i32(a),                  \
             vec_splats(HEDLEY_STATIC_CAST(unsigned int, imm8)))); \
       }                                                           \
       ret;                                                        \
     })
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_slli_epi32(a, imm8) simde_mm_slli_epi32(a, imm8)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_slli_epi64 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  if (HEDLEY_UNLIKELY((imm8 > 63))) {
    return simde_mm_setzero_si128();
  }
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  r_.i64 = a_.i64 << imm8;
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
    r_.i64[i] = a_.i64[i] << (imm8 & 0xff);
  }
#endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#  define simde_mm_slli_epi64(a, imm8) _mm_slli_epi64(a, imm8)
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__)
#  define simde_mm_slli_epi64(a, imm8) \
  simde__m128i_from_neon_u64(vshlq_n_u64(simde__m128i_to_neon_u64(a), (imm8)))
// The above is accepted by gcc/g++ 9 with -march=armv8-a; it might work on A32V8 and elsewhere, but needs testing.
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) // clang rejects a potentially out-of-range imm8 even though the branches below handle it
#  define simde_mm_slli_epi64(a, imm8) \
     ({                                                            \
        simde__m128i ret;                                          \
        if ((imm8) <= 0) {                                         \
            ret = a;                                               \
        } else if ((imm8) > 63) {                                  \
            ret = simde_mm_setzero_si128();                        \
        } else {                                                   \
            ret = simde__m128i_from_neon_i64(                      \
                vshlq_n_s64(simde__m128i_to_neon_i64(a), (imm8))); \
        }                                                          \
        ret;                                                       \
    })
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_slli_epi64(a, imm8) simde_mm_slli_epi64(a, imm8)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srli_epi16 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  if (HEDLEY_UNLIKELY((imm8 > 15))) {
    return simde_mm_setzero_si128();
  }
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  r_.u16 = a_.u16 >> imm8;
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
    r_.u16[i] = a_.u16[i] >> (imm8 & 0xff);
  }
#endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#  define simde_mm_srli_epi16(a, imm8) _mm_srli_epi16(a, imm8)
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__)
#  define simde_mm_srli_epi16(a, imm8) \
  simde__m128i_from_neon_u16(vshrq_n_u16(simde__m128i_to_neon_u16(a), imm8))
// The above is accepted by gcc/g++ 9 with -march=armv8-a; it might work on A32V8 and elsewhere, but needs testing.
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) // clang rejects a potentially out-of-range imm8 even though the branches below handle it
#  define simde_mm_srli_epi16(a, imm8) \
     ({                                                            \
        simde__m128i ret;                                          \
        if ((imm8) <= 0) {                                         \
            ret = a;                                               \
        } else if ((imm8) > 15) {                                  \
            ret = simde_mm_setzero_si128();                        \
        } else {                                                   \
            ret = simde__m128i_from_neon_u16(                      \
                vshrq_n_u16(simde__m128i_to_neon_u16(a), (imm8))); \
        }                                                          \
        ret;                                                       \
    })
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
  #define simde_mm_srli_epi16(a, imm8) \
    ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sr(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8)))))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_srli_epi16(a, imm8) simde_mm_srli_epi16(a, imm8)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srli_epi32 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  if (HEDLEY_UNLIKELY((imm8 > 31))) {
    return simde_mm_setzero_si128();
  }
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  r_.u32 = a_.u32 >> (imm8 & 0xff);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
    r_.u32[i] = a_.u32[i] >> (imm8 & 0xff);
  }
#endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#  define simde_mm_srli_epi32(a, imm8) _mm_srli_epi32(a, imm8)
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__)
#  define simde_mm_srli_epi32(a, imm8) \
     simde__m128i_from_neon_u32(vshrq_n_u32(simde__m128i_to_neon_u32(a), imm8))
// The above is accepted by gcc/g++ 9 with -march=armv8-a; it might work on A32V8 and elsewhere, but needs testing.
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) // clang rejects a potentially out-of-range imm8 even though the branches below handle it
#  define simde_mm_srli_epi32(a, imm8) \
    ({                                                           \
        simde__m128i ret;                                        \
        if ((imm8) <= 0) {                                       \
            ret = a;                                             \
        } else if ((imm8) > 31) {                                \
            ret = simde_mm_setzero_si128();                      \
        } else {                                                 \
            ret = simde__m128i_from_neon_u32(                    \
              vshrq_n_u32(simde__m128i_to_neon_u32(a), (imm8))); \
        }                                                        \
        ret;                                                     \
    })
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
#  define simde_mm_srli_epi32(a, imm8) \
    ({                                                                \
        simde__m128i ret;                                             \
        if ((imm8) <= 0) {                                            \
            ret = a;                                                  \
        } else if ((imm8) > 31) {                                     \
            ret = simde_mm_setzero_si128();                           \
        } else {                                                      \
            ret = simde__m128i_from_altivec_i32(                      \
              vec_sr(simde__m128i_to_altivec_i32(a),                  \
                vec_splats(HEDLEY_STATIC_CAST(unsigned int, imm8)))); \
        }                                                             \
        ret;                                                          \
    })
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_srli_epi32(a, imm8) simde_mm_srli_epi32(a, imm8)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_srli_epi64 (simde__m128i a, const int imm8)
    SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

  if (HEDLEY_UNLIKELY((imm8 & 63) != imm8))
    return simde_mm_setzero_si128();

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(-imm8));
  #else
    #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_94488)
      r_.u64 = a_.u64 >> imm8;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
        r_.u64[i] = a_.u64[i] >> imm8;
      }
    #endif
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
#  define simde_mm_srli_epi64(a, imm8) _mm_srli_epi64(a, imm8)
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__)
#  define simde_mm_srli_epi64(a, imm8) \
    ((imm8 == 0) ? (a) : (simde__m128i_from_neon_u64(vshrq_n_u64(simde__m128i_to_neon_u64(a), imm8))))
// The above is accepted by gcc/g++ 9 with -march=armv8-a; it might work on A32V8 and elsewhere, but needs testing.
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) // clang rejects a potentially out-of-range imm8 even though the branches below handle it
#  define simde_mm_srli_epi64(a, imm8) \
    ({                                                           \
        simde__m128i ret;                                        \
        if ((imm8) <= 0) {                                       \
            ret = a;                                             \
        } else if ((imm8) > 63) {                                \
            ret = simde_mm_setzero_si128();                      \
        } else {                                                 \
            ret = simde__m128i_from_neon_u64(                    \
              vshrq_n_u64(simde__m128i_to_neon_u64(a), (imm8))); \
        }                                                        \
        ret;                                                     \
    })
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_srli_epi64(a, imm8) simde_mm_srli_epi64(a, imm8)
#endif

SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_store_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) {
  simde_assert_aligned(16, mem_addr);

#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_store_pd(mem_addr, a);
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64);
#else
  simde_memcpy(mem_addr, &a, sizeof(a));
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_store_pd(mem_addr, a) simde_mm_store_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif
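
/* A minimal usage sketch, assuming a caller-provided aligned buffer:
 *
 *   SIMDE_ALIGN(16) simde_float64 buf[2];
 *   simde_mm_store_pd(buf, simde_mm_set_pd(1.0, 2.0));
 *   // buf[0] == 2.0, buf[1] == 1.0
 *
 * mem_addr must be 16-byte aligned here; simde_mm_storeu_pd (below) handles
 * unaligned destinations. */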

SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_store1_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) {
  simde_assert_aligned(16, mem_addr);

#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_store1_pd(mem_addr, a);
#else
  simde__m128d_private a_ = simde__m128d_to_private(a);

  mem_addr[0] = a_.f64[0];
  mem_addr[1] = a_.f64[0];
#endif
}
#define simde_mm_store_pd1(mem_addr, a) simde_mm_store1_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_store1_pd(mem_addr, a) simde_mm_store1_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#  define _mm_store_pd1(mem_addr, a) simde_mm_store_pd1(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_store_sd (simde_float64* mem_addr, simde__m128d a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_store_sd(mem_addr, a);
#else
  simde__m128d_private a_ = simde__m128d_to_private(a);

#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  simde_float64 v = vgetq_lane_f64(a_.neon_f64, 0);
  simde_memcpy(mem_addr, &v, sizeof(simde_float64));
#else
  simde_float64 v = a_.f64[0];
  simde_memcpy(mem_addr, &v, sizeof(simde_float64));
#endif
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_store_sd(mem_addr, a) simde_mm_store_sd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_store_si128 (simde__m128i* mem_addr, simde__m128i a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_store_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
#else
  simde__m128i_private a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    vst1q_s32(HEDLEY_REINTERPRET_CAST(int32_t*, mem_addr), a_.neon_i32);
  #else
    simde_memcpy(SIMDE_ASSUME_ALIGNED(16, mem_addr), &a_, sizeof(a_));
  #endif
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_store_si128(mem_addr, a) simde_mm_store_si128(mem_addr, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storeh_pd (simde_float64* mem_addr, simde__m128d a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_storeh_pd(mem_addr, a);
#else
  simde__m128d_private a_ = simde__m128d_to_private(a);

  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    *mem_addr = vgetq_lane_f64(a_.neon_f64, 1);
  #else
    *mem_addr = a_.f64[1];
  #endif
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_storeh_pd(mem_addr, a) simde_mm_storeh_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storel_epi64 (simde__m128i* mem_addr, simde__m128i a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    _mm_storel_epi64(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
  #else
    simde__m128i_private a_ = simde__m128i_to_private(a);
    int64_t tmp;

    /* memcpy to prevent aliasing, tmp because we can't take the
     * address of a vector element. */

    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      tmp = vgetq_lane_s64(a_.neon_i64, 0);
    #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
      #if defined(SIMDE_BUG_GCC_95227)
        (void) a_;
      #endif
      tmp = vec_extract(a_.altivec_i64, 0);
    #else
      tmp = a_.i64[0];
    #endif

    simde_memcpy(mem_addr, &tmp, sizeof(tmp));
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_storel_epi64(mem_addr, a) simde_mm_storel_epi64(mem_addr, a)
#endif
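
/* Illustrative note: storel_epi64 writes only the low 64 bits of a, so the
 * destination needs just 8 bytes of storage and no particular alignment; the
 * upper half of a is ignored. */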

SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storel_pd (simde_float64* mem_addr, simde__m128d a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_storel_pd(mem_addr, a);
#else
  simde__m128d_private a_ = simde__m128d_to_private(a);

  *mem_addr = a_.f64[0];
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_storel_pd(mem_addr, a) simde_mm_storel_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storer_pd (simde_float64 mem_addr[2], simde__m128d a) {
  simde_assert_aligned(16, mem_addr);

#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_storer_pd(mem_addr, a);
#else
  simde__m128d_private a_ = simde__m128d_to_private(a);

  mem_addr[0] = a_.f64[1];
  mem_addr[1] = a_.f64[0];
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_storer_pd(mem_addr, a) simde_mm_storer_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storeu_pd (simde_float64* mem_addr, simde__m128d a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_storeu_pd(mem_addr, a);
#else
  simde_memcpy(mem_addr, &a, sizeof(a));
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_storeu_pd(mem_addr, a) simde_mm_storeu_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_storeu_si128 (simde__m128i* mem_addr, simde__m128i a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_storeu_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
#else
  simde__m128i_private a_ = simde__m128i_to_private(a);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    vst1q_s32(HEDLEY_REINTERPRET_CAST(int32_t*, mem_addr), a_.neon_i32);
  #else
    simde_memcpy(mem_addr, &a_, sizeof(a_));
  #endif
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_storeu_si128(mem_addr, a) simde_mm_storeu_si128(mem_addr, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_stream_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) {
  simde_assert_aligned(16, mem_addr);

#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_stream_pd(mem_addr, a);
#else
  simde_memcpy(mem_addr, &a, sizeof(a));
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_stream_pd(mem_addr, a) simde_mm_stream_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_stream_si128 (simde__m128i* mem_addr, simde__m128i a) {
  simde_assert_aligned(16, mem_addr);

#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_stream_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  vst1q_s64(HEDLEY_REINTERPRET_CAST(int64_t *, mem_addr), simde__m128i_to_neon_i64(a));
#else
  simde_memcpy(mem_addr, &a, sizeof(a));
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_stream_si128(mem_addr, a) simde_mm_stream_si128(mem_addr, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_stream_si32 (int32_t* mem_addr, int32_t a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_stream_si32(mem_addr, a);
#else
  *mem_addr = a;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_stream_si32(mem_addr, a) simde_mm_stream_si32(mem_addr, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_stream_si64 (int64_t* mem_addr, int64_t a) {
  *mem_addr = a;
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_stream_si64(mem_addr, a) simde_mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(int64_t*, __int64*, mem_addr), a)
#endif

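/* Portability note: on targets without a native non-temporal store, the
 * stream functions above degrade to ordinary stores; the cache hint is lost,
 * but the results in memory (and the alignment requirements) are unchanged. */
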
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sub_epi8 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_sub_epi8(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i8 = vsubq_s8(a_.neon_i8, b_.neon_i8);
  #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.i8 = a_.i8 - b_.i8;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
      r_.i8[i] = a_.i8[i] - b_.i8[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_sub_epi8(a, b) simde_mm_sub_epi8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sub_epi16 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_sub_epi16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i16 = vsubq_s16(a_.neon_i16, b_.neon_i16);
  #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.i16 = a_.i16 - b_.i16;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
      r_.i16[i] = a_.i16[i] - b_.i16[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_sub_epi16(a, b) simde_mm_sub_epi16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sub_epi32 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_sub_epi32(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i32 = vsubq_s32(a_.neon_i32, b_.neon_i32);
  #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.i32 = a_.i32 - b_.i32;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.i32[i] = a_.i32[i] - b_.i32[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_sub_epi32(a, b) simde_mm_sub_epi32(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_sub_epi64 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_sub_epi64(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i64 = vsubq_s64(a_.neon_i64, b_.neon_i64);
  #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.i64 = a_.i64 - b_.i64;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
      r_.i64[i] = a_.i64[i] - b_.i64[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_sub_epi64(a, b) simde_mm_sub_epi64(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_sub_epu32 (simde__m128i a, simde__m128i b) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.u32 = a_.u32 - b_.u32;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
      r_.u32[i] = a_.u32[i] - b_.u32[i];
    }
  #endif

  return simde__m128i_from_private(r_);
}
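
/* Illustrative note: like the other integer subtractions here, this SIMDe
 * extension wraps on underflow, so subtracting 1 from an all-zero vector
 * leaves UINT32_MAX in every lane; the saturating subs forms below clamp
 * instead. */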

SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_sub_pd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_sub_pd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  r_.f64 = a_.f64 - b_.f64;
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
  r_.wasm_v128 = wasm_f64x2_sub(a_.wasm_v128, b_.wasm_v128);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
    r_.f64[i] = a_.f64[i] - b_.f64[i];
  }
#endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_sub_pd(a, b) simde_mm_sub_pd(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_sub_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_sub_sd(a, b);
#elif defined(SIMDE_ASSUME_VECTORIZATION)
  return simde_mm_move_sd(a, simde_mm_sub_pd(a, b));
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

  r_.f64[0] = a_.f64[0] - b_.f64[0];
  r_.f64[1] = a_.f64[1];

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_sub_sd(a, b) simde_mm_sub_sd(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m64
simde_mm_sub_si64 (simde__m64 a, simde__m64 b) {
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  return _mm_sub_si64(a, b);
#else
  simde__m64_private
    r_,
    a_ = simde__m64_to_private(a),
    b_ = simde__m64_to_private(b);

#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  r_.i64 = a_.i64 - b_.i64;
#else
  r_.i64[0] = a_.i64[0] - b_.i64[0];
#endif

  return simde__m64_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_sub_si64(a, b) simde_mm_sub_si64(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_subs_epi8 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_subs_epi8(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_i8 = vqsubq_s8(a_.neon_i8, b_.neon_i8);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i8[0])) ; i++) {
    if (((b_.i8[i]) > 0 && (a_.i8[i]) < INT8_MIN + (b_.i8[i]))) {
      r_.i8[i] = INT8_MIN;
    } else if ((b_.i8[i]) < 0 && (a_.i8[i]) > INT8_MAX + (b_.i8[i])) {
      r_.i8[i] = INT8_MAX;
    } else {
      r_.i8[i] = (a_.i8[i]) - (b_.i8[i]);
    }
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_subs_epi8(a, b) simde_mm_subs_epi8(a, b)
#endif
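
/* Worked example (illustrative): if a lane of a holds INT8_MIN (-128) and the
 * matching lane of b holds 1, the (b > 0 && a < INT8_MIN + b) test above
 * fires and the result saturates to INT8_MIN rather than wrapping to 127. */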
5885 
5886 SIMDE_FUNCTION_ATTRIBUTES
5887 simde__m128i
simde_mm_subs_epi16(simde__m128i a,simde__m128i b)5888 simde_mm_subs_epi16 (simde__m128i a, simde__m128i b) {
5889 #if defined(SIMDE_X86_SSE2_NATIVE)
5890   return _mm_subs_epi16(a, b);
5891 #else
5892   simde__m128i_private
5893     r_,
5894     a_ = simde__m128i_to_private(a),
5895     b_ = simde__m128i_to_private(b);
5896 
5897   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5898     r_.neon_i16 = vqsubq_s16(a_.neon_i16, b_.neon_i16);
5899   #else
5900     SIMDE_VECTORIZE
5901     for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) {
5902       if (((b_.i16[i]) > 0 && (a_.i16[i]) < INT16_MIN + (b_.i16[i]))) {
5903         r_.i16[i] = INT16_MIN;
5904       } else if ((b_.i16[i]) < 0 && (a_.i16[i]) > INT16_MAX + (b_.i16[i])) {
5905         r_.i16[i] = INT16_MAX;
5906       } else {
5907         r_.i16[i] = (a_.i16[i]) - (b_.i16[i]);
5908       }
5909     }
5910   #endif
5911 
5912   return simde__m128i_from_private(r_);
5913 #endif
5914 }
5915 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5916 #  define _mm_subs_epi16(a, b) simde_mm_subs_epi16(a, b)
5917 #endif
5918 
5919 SIMDE_FUNCTION_ATTRIBUTES
5920 simde__m128i
simde_mm_subs_epu8(simde__m128i a,simde__m128i b)5921 simde_mm_subs_epu8 (simde__m128i a, simde__m128i b) {
5922 #if defined(SIMDE_X86_SSE2_NATIVE)
5923   return _mm_subs_epu8(a, b);
5924 #else
5925   simde__m128i_private
5926     r_,
5927     a_ = simde__m128i_to_private(a),
5928     b_ = simde__m128i_to_private(b);
5929 
5930   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5931     r_.neon_u8 = vqsubq_u8(a_.neon_u8, b_.neon_u8);
5932   #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
5933     r_.altivec_u8 = vec_subs(a_.altivec_u8, b_.altivec_u8);
5934   #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.u8[0])) ; i++) {
      const int32_t x = a_.u8[i] - b_.u8[i];
      if (x < 0) {
        r_.u8[i] = 0;
      } else if (x > UINT8_MAX) {
        r_.u8[i] = UINT8_MAX;
      } else {
        r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x);
      }
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_subs_epu8(a, b) simde_mm_subs_epu8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_subs_epu16 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_subs_epu16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_u16 = vqsubq_u16(a_.neon_u16, b_.neon_u16);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_u16 = vec_subs(a_.altivec_u16, b_.altivec_u16);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.u16[0])) ; i++) {
      const int32_t x = a_.u16[i] - b_.u16[i];
      if (x < 0) {
        r_.u16[i] = 0;
      } else if (x > UINT16_MAX) {
        r_.u16[i] = UINT16_MAX;
      } else {
        r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x);
      }
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_subs_epu16(a, b) simde_mm_subs_epu16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomieq_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_ucomieq_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
  int r;

#if defined(SIMDE_HAVE_FENV_H)
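  /* The ucomi* family performs "quiet" comparisons that must not
   * leave floating-point exception flags behind.  feholdexcept()
   * saves the environment and clears the status flags; after the
   * comparison, fesetenv() restores the saved state, discarding
   * anything the compare may have raised.  The five variants that
   * follow use the same pattern. */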
  fenv_t envp;
  int x = feholdexcept(&envp);
  r = a_.f64[0] == b_.f64[0];
  if (HEDLEY_LIKELY(x == 0))
    fesetenv(&envp);
#else
  r = a_.f64[0] == b_.f64[0];
#endif

  return r;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_ucomieq_sd(a, b) simde_mm_ucomieq_sd(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomige_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_ucomige_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
  int r;

#if defined(SIMDE_HAVE_FENV_H)
  fenv_t envp;
  int x = feholdexcept(&envp);
  r = a_.f64[0] >= b_.f64[0];
  if (HEDLEY_LIKELY(x == 0))
    fesetenv(&envp);
#else
  r = a_.f64[0] >= b_.f64[0];
#endif

  return r;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_ucomige_sd(a, b) simde_mm_ucomige_sd(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomigt_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_ucomigt_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
  int r;

#if defined(SIMDE_HAVE_FENV_H)
  fenv_t envp;
  int x = feholdexcept(&envp);
  r = a_.f64[0] > b_.f64[0];
  if (HEDLEY_LIKELY(x == 0))
    fesetenv(&envp);
#else
  r = a_.f64[0] > b_.f64[0];
#endif

  return r;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_ucomigt_sd(a, b) simde_mm_ucomigt_sd(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomile_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_ucomile_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
  int r;

#if defined(SIMDE_HAVE_FENV_H)
  fenv_t envp;
  int x = feholdexcept(&envp);
  r = a_.f64[0] <= b_.f64[0];
  if (HEDLEY_LIKELY(x == 0))
    fesetenv(&envp);
#else
  r = a_.f64[0] <= b_.f64[0];
#endif

  return r;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_ucomile_sd(a, b) simde_mm_ucomile_sd(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomilt_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_ucomilt_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
  int r;

#if defined(SIMDE_HAVE_FENV_H)
  fenv_t envp;
  int x = feholdexcept(&envp);
  r = a_.f64[0] < b_.f64[0];
  if (HEDLEY_LIKELY(x == 0))
    fesetenv(&envp);
#else
  r = a_.f64[0] < b_.f64[0];
#endif

  return r;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_ucomilt_sd(a, b) simde_mm_ucomilt_sd(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
int
simde_mm_ucomineq_sd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_ucomineq_sd(a, b);
#else
  simde__m128d_private
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);
  int r;

#if defined(SIMDE_HAVE_FENV_H)
  fenv_t envp;
  int x = feholdexcept(&envp);
  r = a_.f64[0] != b_.f64[0];
  if (HEDLEY_LIKELY(x == 0))
    fesetenv(&envp);
#else
  r = a_.f64[0] != b_.f64[0];
#endif

  return r;
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_ucomineq_sd(a, b) simde_mm_ucomineq_sd(a, b)
#endif

#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
  HEDLEY_DIAGNOSTIC_PUSH
  SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
#endif

#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
  HEDLEY_DIAGNOSTIC_POP
#endif

SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_lfence (void) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_lfence();
#else
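  /* There is no portable load-only fence; fall back to
   * simde_mm_sfence(), which SIMDe implements as a full memory
   * barrier, so the ordering is at least as strong as a native
   * lfence.  simde_mm_mfence() below does the same. */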
  simde_mm_sfence();
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_lfence() simde_mm_lfence()
#endif

SIMDE_FUNCTION_ATTRIBUTES
void
simde_mm_mfence (void) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  _mm_mfence();
#else
  simde_mm_sfence();
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_mfence() simde_mm_mfence()
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_unpackhi_epi8 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_unpackhi_epi8(a, b);
#else
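  /* Interleaves the high halves of the two inputs:
   *   r = { a8, b8, a9, b9, ..., a15, b15 }
   * The epi16/epi32/epi64 variants below follow the same pattern with
   * wider lanes. */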
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  r_.neon_i8 = vzip2q_s8(a_.neon_i8, b_.neon_i8);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(a_.neon_i16));
  int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(b_.neon_i16));
  int8x8x2_t result = vzip_s8(a1, b1);
  r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
  r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i8[0])) / 2) ; i++) {
    r_.i8[(i * 2)]     = a_.i8[i + ((sizeof(r_) / sizeof(r_.i8[0])) / 2)];
    r_.i8[(i * 2) + 1] = b_.i8[i + ((sizeof(r_) / sizeof(r_.i8[0])) / 2)];
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_unpackhi_epi8(a, b) simde_mm_unpackhi_epi8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_unpackhi_epi16 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_unpackhi_epi16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  r_.neon_i16 = vzip2q_s16(a_.neon_i16, b_.neon_i16);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  int16x4_t a1 = vget_high_s16(a_.neon_i16);
  int16x4_t b1 = vget_high_s16(b_.neon_i16);
  int16x4x2_t result = vzip_s16(a1, b1);
  r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
  r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 4, 12, 5, 13, 6, 14, 7, 15);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i16[0])) / 2) ; i++) {
    r_.i16[(i * 2)]     = a_.i16[i + ((sizeof(r_) / sizeof(r_.i16[0])) / 2)];
    r_.i16[(i * 2) + 1] = b_.i16[i + ((sizeof(r_) / sizeof(r_.i16[0])) / 2)];
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_unpackhi_epi16(a, b) simde_mm_unpackhi_epi16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_unpackhi_epi32 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_unpackhi_epi32(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  r_.neon_i32 = vzip2q_s32(a_.neon_i32, b_.neon_i32);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  int32x2_t a1 = vget_high_s32(a_.neon_i32);
  int32x2_t b1 = vget_high_s32(b_.neon_i32);
  int32x2x2_t result = vzip_s32(a1, b1);
  r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
  r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 2, 6, 3, 7);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i32[0])) / 2) ; i++) {
    r_.i32[(i * 2)]     = a_.i32[i + ((sizeof(r_) / sizeof(r_.i32[0])) / 2)];
    r_.i32[(i * 2) + 1] = b_.i32[i + ((sizeof(r_) / sizeof(r_.i32[0])) / 2)];
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_unpackhi_epi32(a, b) simde_mm_unpackhi_epi32(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_unpackhi_epi64 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_unpackhi_epi64(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
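  /* vzip does not support 64-bit lanes on ARMv7; with only two lanes,
   * combining the two high halves directly gives the interleaved
   * result. */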
  int64x1_t a_h = vget_high_s64(a_.neon_i64);
  int64x1_t b_h = vget_high_s64(b_.neon_i64);
  r_.neon_i64 = vcombine_s64(a_h, b_h);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
  r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 1, 3);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i64[0])) / 2) ; i++) {
    r_.i64[(i * 2)]     = a_.i64[i + ((sizeof(r_) / sizeof(r_.i64[0])) / 2)];
    r_.i64[(i * 2) + 1] = b_.i64[i + ((sizeof(r_) / sizeof(r_.i64[0])) / 2)];
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_unpackhi_epi64(a, b) simde_mm_unpackhi_epi64(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_unpackhi_pd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_unpackhi_pd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

#if defined(SIMDE_SHUFFLE_VECTOR_)
  r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 1, 3);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2) ; i++) {
    r_.f64[(i * 2)]     = a_.f64[i + ((sizeof(r_) / sizeof(r_.f64[0])) / 2)];
    r_.f64[(i * 2) + 1] = b_.f64[i + ((sizeof(r_) / sizeof(r_.f64[0])) / 2)];
  }
#endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_unpackhi_pd(a, b) simde_mm_unpackhi_pd(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_unpacklo_epi8 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_unpacklo_epi8(a, b);
#else
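  /* Interleaves the low halves of the two inputs:
   *   r = { a0, b0, a1, b1, ..., a7, b7 }
   * The wider unpacklo variants below follow the same pattern. */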
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  r_.neon_i8 = vzip1q_s8(a_.neon_i8, b_.neon_i8);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(a_.neon_i16));
  int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(b_.neon_i16));
  int8x8x2_t result = vzip_s8(a1, b1);
  r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
  r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i8[0])) / 2) ; i++) {
    r_.i8[(i * 2)]     = a_.i8[i];
    r_.i8[(i * 2) + 1] = b_.i8[i];
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_unpacklo_epi8(a, b) simde_mm_unpacklo_epi8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_unpacklo_epi16 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_unpacklo_epi16(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  r_.neon_i16 = vzip1q_s16(a_.neon_i16, b_.neon_i16);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  int16x4_t a1 = vget_low_s16(a_.neon_i16);
  int16x4_t b1 = vget_low_s16(b_.neon_i16);
  int16x4x2_t result = vzip_s16(a1, b1);
  r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
  r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 0, 8, 1, 9, 2, 10, 3, 11);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i16[0])) / 2) ; i++) {
    r_.i16[(i * 2)]     = a_.i16[i];
    r_.i16[(i * 2) + 1] = b_.i16[i];
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_unpacklo_epi16(a, b) simde_mm_unpacklo_epi16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_unpacklo_epi32 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_unpacklo_epi32(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  r_.neon_i32 = vzip1q_s32(a_.neon_i32, b_.neon_i32);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  int32x2_t a1 = vget_low_s32(a_.neon_i32);
  int32x2_t b1 = vget_low_s32(b_.neon_i32);
  int32x2x2_t result = vzip_s32(a1, b1);
  r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
  r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 0, 4, 1, 5);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i32[0])) / 2) ; i++) {
    r_.i32[(i * 2)]     = a_.i32[i];
    r_.i32[(i * 2) + 1] = b_.i32[i];
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_unpacklo_epi32(a, b) simde_mm_unpacklo_epi32(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_unpacklo_epi64 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_unpacklo_epi64(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  int64x1_t a_l = vget_low_s64(a_.neon_i64);
  int64x1_t b_l = vget_low_s64(b_.neon_i64);
  r_.neon_i64 = vcombine_s64(a_l, b_l);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
  r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 0, 2);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i64[0])) / 2) ; i++) {
    r_.i64[(i * 2)]     = a_.i64[i];
    r_.i64[(i * 2) + 1] = b_.i64[i];
  }
#endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_unpacklo_epi64(a, b) simde_mm_unpacklo_epi64(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_unpacklo_pd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_unpacklo_pd(a, b);
#else
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

#if defined(SIMDE_SHUFFLE_VECTOR_)
  r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 0, 2);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2) ; i++) {
    r_.f64[(i * 2)]     = a_.f64[i];
    r_.f64[(i * 2) + 1] = b_.f64[i];
  }
#endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_unpacklo_pd(a, b) simde_mm_unpacklo_pd(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_xor_pd (simde__m128d a, simde__m128d b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_xor_pd(a, b);
#else
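  /* There is no floating-point XOR, so operate on the raw bits via
   * the integer view; this is the usual idiom for flipping or
   * clearing the sign bits of packed doubles. */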
  simde__m128d_private
    r_,
    a_ = simde__m128d_to_private(a),
    b_ = simde__m128d_to_private(b);

#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  r_.i32f = a_.i32f ^ b_.i32f;
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
    r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i];
  }
#endif

  return simde__m128d_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_xor_pd(a, b) simde_mm_xor_pd(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_xor_si128 (simde__m128i a, simde__m128i b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
  return _mm_xor_si128(a, b);
#else
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    r_.neon_i32 = veorq_s32(a_.neon_i32, b_.neon_i32);
  #elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
    r_.altivec_i32 = vec_xor(a_.altivec_i32, b_.altivec_i32);
  #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.i32f = a_.i32f ^ b_.i32f;
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
      r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i];
    }
  #endif

  return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _mm_xor_si128(a, b) simde_mm_xor_si128(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_not_si128 (simde__m128i a) {
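  /* The x_ prefix marks a SIMDe extension: SSE2 has no _mm_not_si128.
   * This computes the bitwise complement of a, so there is no native
   * alias for it below. */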
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  r_.neon_i32 = vmvnq_s32(a_.neon_i32);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  r_.i32f = ~(a_.i32f);
#else
  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
    r_.i32f[i] = ~(a_.i32f[i]);
  }
#endif

  return simde__m128i_from_private(r_);
}

#define SIMDE_MM_SHUFFLE2(x, y) (((x) << 1) | (y))
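/* SIMDE_MM_SHUFFLE2 packs two one-bit lane selectors into the
 * immediate expected by simde_mm_shuffle_pd(): bit 0 (y) picks the
 * element of the first operand for the low lane, and bit 1 (x) picks
 * the element of the second operand for the high lane.  For example,
 * SIMDE_MM_SHUFFLE2(1, 0) == 2. */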
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#  define _MM_SHUFFLE2(x, y) SIMDE_MM_SHUFFLE2(x, y)
#endif

SIMDE_END_DECLS_

HEDLEY_DIAGNOSTIC_POP

#endif /* !defined(SIMDE_X86_SSE2_H) */